diff --git a/packages/mediacenter/LibreELEC-settings/package.mk b/packages/mediacenter/LibreELEC-settings/package.mk
index bc41f502dc..b13c7d9537 100644
--- a/packages/mediacenter/LibreELEC-settings/package.mk
+++ b/packages/mediacenter/LibreELEC-settings/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="LibreELEC-settings"
-PKG_VERSION="ca96ddd"
+PKG_VERSION="463a99b"
 PKG_ARCH="any"
 PKG_LICENSE="prop."
 PKG_SITE="https://libreelec.tv"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk
index 7a7d2b8c7d..48c556becb 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.2sf"
-PKG_VERSION="05fa941"
+PKG_VERSION="c9de26d"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk
index 6553771742..e8c3ec66dd 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.dumb"
-PKG_VERSION="6c15ef8"
+PKG_VERSION="54fba3d"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.fluidsynth/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.fluidsynth/package.mk
index 635219b69a..790031232a 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.fluidsynth/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.fluidsynth/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.fluidsynth"
-PKG_VERSION="67fd270"
+PKG_VERSION="e0f1809"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.gme/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.gme/package.mk
index 50b427edc3..a8869dea91 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.gme/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.gme/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.gme"
-PKG_VERSION="4f8c49a"
+PKG_VERSION="8328bf2"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk
index 01672584be..ad4fca57bc 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.gsf"
-PKG_VERSION="122ff46"
+PKG_VERSION="acf4998"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk
index 6fa3cf4d59..171eb826cc 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.modplug"
-PKG_VERSION="ae0b214"
+PKG_VERSION="950682e"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk
index 287742f463..1a89833f4e 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.ncsf"
-PKG_VERSION="8835a04"
+PKG_VERSION="f914839"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.nosefart/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.nosefart/package.mk
index a863abb86c..abc1b28a72 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.nosefart/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.nosefart/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.nosefart"
-PKG_VERSION="1d3de76"
+PKG_VERSION="1a9f949"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk
index 8b0c74fc01..4ecea39638 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.openmpt"
-PKG_VERSION="ceaffa1"
+PKG_VERSION="fbcbfda"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.organya/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.organya/package.mk
index 7fa1de2a89..949ebf33ed 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.organya/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.organya/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.organya"
-PKG_VERSION="8573890"
+PKG_VERSION="bacd0ab"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk
index 5f9499dc00..e56eef901b 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.qsf"
-PKG_VERSION="5edc117"
+PKG_VERSION="e581a67"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk
index 8c3e46e370..d02c09d184 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.sidplay"
-PKG_VERSION="3e8a22e"
+PKG_VERSION="4083bc5"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.snesapu/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.snesapu/package.mk
index e5c6355d13..2ed9a187d1 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.snesapu/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.snesapu/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.snesapu"
-PKG_VERSION="84b7104"
+PKG_VERSION="b151c13"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk
index 9db3a5b9f4..24b2492662 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.ssf"
-PKG_VERSION="b12c97d"
+PKG_VERSION="62750ac"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.stsound/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.stsound/package.mk
index 4ca18b373d..5c936c07f7 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.stsound/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.stsound/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.stsound"
-PKG_VERSION="a306bf6"
+PKG_VERSION="89ed4f3"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk
index a36019f2b0..9b49a85c98 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.timidity"
-PKG_VERSION="20823d2"
+PKG_VERSION="8bd7092"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk
index 051388a700..000968ae18 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.vgmstream"
-PKG_VERSION="a7c6153"
+PKG_VERSION="de21bab"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk b/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk
index b3f9d7628f..b6311b92ff 100644
--- a/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="imagedecoder.raw"
-PKG_VERSION="e7e2c2d"
+PKG_VERSION="37ef22e"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
index 376066d34c..520a823a7b 100644
--- a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="inputstream.adaptive"
-PKG_VERSION="f23ba39"
+PKG_VERSION="7bde41f"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.kodi.tv"
 PKG_URL="https://github.com/liberty-developer/inputstream.adaptive/archive/$PKG_VERSION.tar.gz"
diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk
index c32210797f..8d127b6ca2 100644
--- a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="inputstream.rtmp"
-PKG_VERSION="6c1af46"
+PKG_VERSION="1052cd3"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.kodi.tv"
 PKG_URL="https://github.com/notspiff/inputstream.rtmp/archive/$PKG_VERSION.tar.gz"
diff --git a/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk b/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk
index bd41ada7a4..54b056a9fd 100644
--- a/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="peripheral.joystick"
-PKG_VERSION="3c7ea59"
+PKG_VERSION="07aa1fe"
 PKG_REV="0"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk
index 46f59751a4..81437f0690 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.argustv"
-PKG_VERSION="7135b27"
+PKG_VERSION="8f89814"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk
index bd09eabe29..4246a7373a 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.demo"
-PKG_VERSION="1606b61"
+PKG_VERSION="978f428"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk
index 9ba09b8ddf..f636a14cf2 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.dvblink"
-PKG_VERSION="2634f6f"
+PKG_VERSION="b7d887c"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk
index 19145a8d20..8dc3656030 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.dvbviewer"
-PKG_VERSION="13c6e5e"
+PKG_VERSION="d099cfa"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk
index d412e2e83e..769c12fc71 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.filmon"
-PKG_VERSION="e026519"
+PKG_VERSION="0f1d34d"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk
index 40821e7945..1e830288ab 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.hdhomerun"
-PKG_VERSION="98cb8d4"
+PKG_VERSION="ab91169"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
index 6e4e7e9b0d..760637cdc7 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.hts"
-PKG_VERSION="2993f43"
+PKG_VERSION="3911c7f"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk
index d5c01284f2..cededa128e 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.iptvsimple"
-PKG_VERSION="53d63cc"
+PKG_VERSION="f83990a"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk
index bd074c7eeb..bb4c31a819 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.mediaportal.tvserver"
-PKG_VERSION="d4dad61"
+PKG_VERSION="367b128"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
index e0069b66b8..e1d141cee4 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.nextpvr"
-PKG_VERSION="bb21826"
+PKG_VERSION="e6ece9f"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk
index 81b84d221f..dfa32a2a30 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.njoy"
-PKG_VERSION="1ce9aba"
+PKG_VERSION="296f558"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk
index cea1924c41..3adfdff286 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.pctv"
-PKG_VERSION="e3b2b84"
+PKG_VERSION="eab5f85"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk
index 861e7b75e5..f4d25e2c91 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.stalker"
-PKG_VERSION="0466af9"
+PKG_VERSION="62b7908"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk
index f262f26244..08879bb604 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.vbox"
-PKG_VERSION="d61c501"
+PKG_VERSION="6001735"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
index 3150befcd5..61de07c987 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.vdr.vnsi"
-PKG_VERSION="9ede401"
+PKG_VERSION="b7c3f3b"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
index e93797d356..55a9b0d0aa 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.vuplus"
-PKG_VERSION="d7fdd1e"
+PKG_VERSION="78df030"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
index c91c248235..0e57b13566 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.wmc"
-PKG_VERSION="5aa3b1c"
+PKG_VERSION="27a88ca"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk
index 3a81c78ebb..1cd361fbf2 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="screensaver.asteroids"
-PKG_VERSION="111881d"
+PKG_VERSION="5d6fd4e"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk
index 469c31c557..3e15639ce8 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="screensaver.asterwave"
-PKG_VERSION="2c82b03"
+PKG_VERSION="8e6428c"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk
index 897a07266a..e0c1b43078 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="screensaver.biogenesis"
-PKG_VERSION="8cf0d12"
+PKG_VERSION="8d1ef04"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk
index 8f7e7820e4..dac8c78fff 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="screensaver.cpblobs"
-PKG_VERSION="585c25b"
+PKG_VERSION="1922717"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk
index 9d4f937209..451aae9115 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="screensaver.greynetic"
-PKG_VERSION="2c103d0"
+PKG_VERSION="e4dc6eb"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk
index 9654b7c08f..5bb332bcbc 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="screensaver.matrixtrails"
-PKG_VERSION="84ca058"
+PKG_VERSION="b5a245f"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk
index d871d059e6..cf80fcc26d 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="screensaver.pingpong"
-PKG_VERSION="88c7fed"
+PKG_VERSION="21ae78d"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk
index 8310d5e589..c402795381 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="screensaver.pyro"
-PKG_VERSION="91a863a"
+PKG_VERSION="1477bd4"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk
index 5689a8d44a..ceb1f6801b 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="screensaver.shadertoy"
-PKG_VERSION="f576d4b"
+PKG_VERSION="434f5ce"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk
index 42489b22e2..6f990f313c 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="screensaver.stars"
-PKG_VERSION="8ff5ad1"
+PKG_VERSION="6c62026"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk b/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk
index cc064de4e8..0515ed5dd3 100644
--- a/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="screensavers.rsxs"
-PKG_VERSION="b68a652"
+PKG_VERSION="579ec13"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk b/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk
new file mode 100644
index 0000000000..1c39495fa7
--- /dev/null
+++ b/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk
@@ -0,0 +1,46 @@
+################################################################################
+#      This file is part of LibreELEC - http://www.libreelec.tv
+#      Copyright (C) 2017-present Team LibreELEC
+#
+#  LibreELEC is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  LibreELEC is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with LibreELEC.  If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+
+PKG_NAME="vfs.rar"
+PKG_VERSION="26800eb"
+PKG_REV="1"
+PKG_ARCH="any"
+PKG_LICENSE="GPL"
+PKG_SITE="http://www.kodi.tv"
+PKG_URL="https://github.com/notspiff/vfs.rar/archive/$PKG_VERSION.tar.gz"
+PKG_DEPENDS_TARGET="toolchain kodi-platform"
+PKG_SECTION=""
+PKG_SHORTDESC="vfs.rar"
+PKG_LONGDESC="vfs.rar"
+PKG_AUTORECONF="no"
+
+PKG_IS_ADDON="yes"
+PKG_ADDON_TYPE="kodi.vfs"
+
+# Stage the add-on into $ADDON_BUILD/$PKG_ADDON_ID: copy everything found under
+# .install_pkg/usr/share/$MEDIACENTER/addons/$PKG_NAME, then copy the library
+# named by the library_linux attribute of the staged addon.xml next to it.
+addon() {
+  mkdir -p "$ADDON_BUILD/$PKG_ADDON_ID/"
+  # The glob stays outside the quotes so it still expands to the directory's contents.
+  cp -R "$PKG_BUILD/.install_pkg/usr/share/$MEDIACENTER/addons/$PKG_NAME"/* "$ADDON_BUILD/$PKG_ADDON_ID/"
+
+  ADDONSO=$(xmlstarlet sel -t -v "/addon/extension/@library_linux" "$ADDON_BUILD/$PKG_ADDON_ID/addon.xml")
+  # -L makes cp follow symbolic links, so the file a link points at is what gets copied.
+  cp -L "$PKG_BUILD/.install_pkg/usr/lib/$MEDIACENTER/addons/$PKG_NAME/$ADDONSO" "$ADDON_BUILD/$PKG_ADDON_ID/"
+}
diff --git a/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk
index 9b48bfcef5..e93cc0e120 100644
--- a/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="visualization.fishbmc"
-PKG_VERSION="9704420"
+PKG_VERSION="611e9a9"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk
index 4f222578ef..0935f7b344 100644
--- a/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="visualization.goom"
-PKG_VERSION="745d8c9"
+PKG_VERSION="6bfc884"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk
index e7acb8a2f6..872c7b672f 100644
--- a/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="visualization.projectm"
-PKG_VERSION="5450aa2"
+PKG_VERSION="5bb3897"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk
index f534521dfa..1c877680fc 100644
--- a/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="visualization.shadertoy"
-PKG_VERSION="6db9a48"
+PKG_VERSION="ae677ac"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk
index 83f470dd26..5badd52b0b 100644
--- a/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="visualization.spectrum"
-PKG_VERSION="73c8786"
+PKG_VERSION="d7d9c14"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk
index 0715bb3055..d12ed3bc5f 100644
--- a/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="visualization.waveform"
-PKG_VERSION="ede2fd6"
+PKG_VERSION="2a71ba0"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk
index 894d78f3a7..8d27e5f2ac 100644
--- a/packages/mediacenter/kodi/package.mk
+++ b/packages/mediacenter/kodi/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="kodi"
-PKG_VERSION="91a9066"
+PKG_VERSION="61e98fd"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.kodi.tv"
diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk
index 0ee857cdfc..b169d807ac 100644
--- a/packages/multimedia/ffmpeg/package.mk
+++ b/packages/multimedia/ffmpeg/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="ffmpeg"
-# Current branch is: release/3.1-xbmc
-PKG_VERSION="33c167d"
+# Current branch is: release/3.3-kodi
+PKG_VERSION="eb0819c"
 PKG_ARCH="any"
 PKG_LICENSE="LGPLv2.1+"
 PKG_SITE="https://ffmpeg.org"
@@ -159,7 +159,6 @@ configure_target() {
               --disable-dxva2 \
               --enable-runtime-cpudetect \
               $FFMPEG_TABLES \
-              --disable-memalign-hack \
               --disable-encoders \
               --enable-encoder=ac3 \
               --enable-encoder=aac \
@@ -188,7 +187,6 @@ configure_target() {
               --disable-libopencore-amrwb \
               --disable-libopencv \
               --disable-libdc1394 \
-              --disable-libfaac \
               --disable-libfreetype \
               --disable-libgsm \
               --disable-libmp3lame \
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch
index 3623a64577..f9b7f1bd34 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch
@@ -1,7 +1,7 @@
-diff --git a/.gitignore b/.gitignore
+diff --git b/.gitignore a/.gitignore
 index 524fb73..305632b 100644
---- a/.gitignore
-+++ b/.gitignore
+--- b/.gitignore
++++ a/.gitignore
 @@ -23,6 +23,7 @@
  .\#*
  /.config
@@ -10,10 +10,81 @@ index 524fb73..305632b 100644
  /ffmpeg
  /ffplay
  /ffprobe
-diff --git a/ffmpeg.c b/ffmpeg.c
-index 9ffd833..7a86d7e 100644
---- a/ffmpeg.c
-+++ b/ffmpeg.c
+diff --git b/Changelog a/Changelog
+index 6f023a9..ad53c9d 100644
+--- b/Changelog
++++ a/Changelog
+@@ -1,7 +1,7 @@
+ Entries are sorted chronologically from oldest to youngest within each release,
+ releases are sorted from youngest to oldest.
+ 
+-version 3.3:
++version <next>:
+ - CrystalHD decoder moved to new decode API
+ - add internal ebur128 library, remove external libebur128 dependency
+ - Pro-MPEG CoP #3-R2 FEC protocol
+@@ -22,7 +22,6 @@ version 3.3:
+ - threshold filter
+ - midequalizer filter
+ - Optimal Huffman tables for (M)JPEG encoding
+-- VAAPI-accelerated MPEG-2 and VP8 encoding
+ - FM Screen Capture Codec decoder
+ - native Opus encoder
+ - ScreenPressor decoder
+@@ -33,7 +32,6 @@ version 3.3:
+ - Removed the legacy X11 screen grabber, use XCB instead
+ - MPEG-7 Video Signature filter
+ - Removed asyncts filter (use af_aresample instead)
+-- Intel QSV-accelerated VP8 video decoding
+ 
+ 
+ version 3.2:
+@@ -121,6 +119,7 @@ version 3.1:
+ - libutvideo wrapper removed
+ - YUY2 Lossless Codec decoder
+ - VideoToolbox H.264 encoder
++- VAAPI-accelerated MPEG-2 and VP8 encoding
+ 
+ 
+ version 3.0:
+diff --git b/RELEASE_NOTES a/RELEASE_NOTES
+new file mode 100644
+index 0000000..c3ec010
+--- /dev/null
++++ a/RELEASE_NOTES
+@@ -0,0 +1,15 @@
++
++              ┌────────────────────────────────────────┐
++              │ RELEASE NOTES for FFmpeg 3.2 "Hypatia" │
++              └────────────────────────────────────────┘
++
++   The FFmpeg Project proudly presents FFmpeg 3.2 "Hypatia", about 4
++   months after the release of FFmpeg 3.1.
++
++   A complete Changelog is available at the root of the project, and the
++   complete Git history on http://source.ffmpeg.org.
++
++   We hope you will like this release as much as we enjoyed working on it, and
++   as usual, if you have any questions about it, or any FFmpeg related topic,
++   feel free to join us on the #ffmpeg IRC channel (on irc.freenode.net) or ask
++   on the mailing-lists.
+diff --git b/doc/Doxyfile a/doc/Doxyfile
+index 0891899..8f855f8 100644
+--- b/doc/Doxyfile
++++ a/doc/Doxyfile
+@@ -38,7 +38,7 @@ PROJECT_NAME           = FFmpeg
+ # could be handy for archiving the generated documentation or if some version
+ # control system is used.
+ 
+-PROJECT_NUMBER         =
++PROJECT_NUMBER         = 3.2
+ 
+ # Using the PROJECT_BRIEF tag one can provide an optional one line description
+ # for a project that appears at the top of each page and should give viewer a
+diff --git b/ffmpeg.c a/ffmpeg.c
+index 11faf0d..494c23d 100644
+--- b/ffmpeg.c
++++ a/ffmpeg.c
 @@ -23,6 +23,11 @@
   * multimedia converter based on the FFmpeg libraries
   */
@@ -26,7 +97,7 @@ index 9ffd833..7a86d7e 100644
  #include "config.h"
  #include <ctype.h>
  #include <string.h>
-@@ -66,6 +71,25 @@
+@@ -68,6 +73,25 @@
  # include "libavfilter/buffersrc.h"
  # include "libavfilter/buffersink.h"
  
@@ -52,7 +123,7 @@ index 9ffd833..7a86d7e 100644
  #if HAVE_SYS_RESOURCE_H
  #include <sys/time.h>
  #include <sys/types.h>
-@@ -158,6 +182,169 @@ static int restore_tty;
+@@ -164,6 +188,174 @@ static int restore_tty;
  static void free_input_threads(void);
  #endif
  
@@ -168,11 +239,16 @@ index 9ffd833..7a86d7e 100644
 +#ifdef RPI_ZERO_COPY
 +{
 +    const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1);
++    if (fr_buf == NULL) {
++        mmal_buffer_header_release(buf);
++        return;
++    }
 +
 +    buf->user_data = fr_buf;
 +    buf->data = av_rpi_zc_vc_handle(fr_buf);
-+    buf->alloc_size =
-+        buf->length = av_rpi_zc_numbytes(fr_buf);
++    buf->offset = av_rpi_zc_offset(fr_buf);
++    buf->length = av_rpi_zc_length(fr_buf);
++    buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
 +
 +    ++rpi_display_count;
 +}
@@ -222,7 +298,7 @@ index 9ffd833..7a86d7e 100644
  /* sub2video hack:
     Convert subtitles to video with alpha to insert them in filter graphs.
     This is a temporary solution until libavfilter gets real subtitles support.
-@@ -540,6 +727,11 @@ static void ffmpeg_cleanup(int ret)
+@@ -575,6 +767,11 @@ static void ffmpeg_cleanup(int ret)
          avformat_close_input(&input_files[i]->ctx);
          av_freep(&input_files[i]);
      }
@@ -234,9 +310,9 @@ index 9ffd833..7a86d7e 100644
      for (i = 0; i < nb_input_streams; i++) {
          InputStream *ist = input_streams[i];
  
-@@ -551,6 +743,9 @@ static void ffmpeg_cleanup(int ret)
-         av_freep(&ist->filters);
+@@ -587,6 +784,9 @@ static void ffmpeg_cleanup(int ret)
          av_freep(&ist->hwaccel_device);
+         av_freep(&ist->dts_buffer);
  
 +#ifdef RPI_ZERO_COPY
 +        av_rpi_zc_uninit(ist->dec_ctx);
@@ -244,7 +320,7 @@ index 9ffd833..7a86d7e 100644
          avcodec_free_context(&ist->dec_ctx);
  
          av_freep(&input_streams[i]);
-@@ -581,6 +776,7 @@ static void ffmpeg_cleanup(int ret)
+@@ -617,6 +817,7 @@ static void ffmpeg_cleanup(int ret)
      }
      term_exit();
      ffmpeg_exited = 1;
@@ -252,7 +328,7 @@ index 9ffd833..7a86d7e 100644
  }
  
  void remove_avoptions(AVDictionary **a, AVDictionary *b)
-@@ -944,6 +1140,15 @@ static void do_video_out(AVFormatContext *s,
+@@ -1050,6 +1251,15 @@ static void do_video_out(OutputFile *of,
      if (ost->source_index >= 0)
          ist = input_streams[ost->source_index];
  
@@ -265,10 +341,10 @@ index 9ffd833..7a86d7e 100644
 +    }
 +#endif
 +
-     if (filter->inputs[0]->frame_rate.num > 0 &&
-         filter->inputs[0]->frame_rate.den > 0)
-         duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
-@@ -2549,6 +2754,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
+     frame_rate = av_buffersink_get_frame_rate(filter);
+     if (frame_rate.num > 0 && frame_rate.den > 0)
+         duration = 1/(av_q2d(frame_rate) * av_q2d(enc->time_base));
+@@ -2873,6 +3083,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
          ist->dec_ctx->opaque                = ist;
          ist->dec_ctx->get_format            = get_format;
          ist->dec_ctx->get_buffer2           = get_buffer;
@@ -281,10 +357,10 @@ index 9ffd833..7a86d7e 100644
          ist->dec_ctx->thread_safe_callbacks = 1;
  
          av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-index fd0d1f0..40d22d2 100644
---- a/libavcodec/Makefile
-+++ b/libavcodec/Makefile
+diff --git b/libavcodec/Makefile a/libavcodec/Makefile
+index 0dd0c7b..d2eb014 100644
+--- b/libavcodec/Makefile
++++ a/libavcodec/Makefile
 @@ -5,6 +5,11 @@ NAME = avcodec
  HEADERS = avcodec.h                                                     \
            avdct.h                                                       \
@@ -297,7 +373,7 @@ index fd0d1f0..40d22d2 100644
            d3d11va.h                                                     \
            dirac.h                                                       \
            dv_profile.h                                                  \
-@@ -43,6 +48,10 @@ OBJS = allcodecs.o                                                      \
+@@ -47,6 +52,10 @@ OBJS = allcodecs.o                                                      \
         resample.o                                                       \
         resample2.o                                                      \
         utils.o                                                          \
@@ -308,23 +384,37 @@ index fd0d1f0..40d22d2 100644
         vorbis_parser.o                                                  \
         xiph.o                                                           \
  
-@@ -1078,3 +1087,11 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
+@@ -973,8 +982,7 @@ OBJS-$(CONFIG_AAC_ADTSTOASC_BSF)          += aac_adtstoasc_bsf.o aacadtsdec.o \
+ OBJS-$(CONFIG_CHOMP_BSF)                  += chomp_bsf.o
+ OBJS-$(CONFIG_DUMP_EXTRADATA_BSF)         += dump_extradata_bsf.o
+ OBJS-$(CONFIG_DCA_CORE_BSF)               += dca_core_bsf.o
+-OBJS-$(CONFIG_EXTRACT_EXTRADATA_BSF)      += extract_extradata_bsf.o    \
+-                                             h2645_parse.o
++OBJS-$(CONFIG_EXTRACT_EXTRADATA_BSF)      += extract_extradata_bsf.o
+ OBJS-$(CONFIG_H264_MP4TOANNEXB_BSF)       += h264_mp4toannexb_bsf.o
+ OBJS-$(CONFIG_HEVC_MP4TOANNEXB_BSF)       += hevc_mp4toannexb_bsf.o
+ OBJS-$(CONFIG_IMX_DUMP_HEADER_BSF)        += imx_dump_header_bsf.o
+@@ -1103,3 +1111,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
  $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
  $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
  endif
 +
++QASM := $(SUBDIR)../pi-util/qasm.py
++
++ifneq ("$(wildcard $(QASM))","")
 +$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
-+	python $(SUBDIR)../pi-util/qasm.py -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
++	python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
 +
 +$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
-+	python $(SUBDIR)../pi-util/qasm.py -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
++	python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
++endif
 +
 +$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h
-diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
-index 54efaad..02a89c3 100644
---- a/libavcodec/allcodecs.c
-+++ b/libavcodec/allcodecs.c
-@@ -667,6 +667,7 @@ void avcodec_register_all(void)
+diff --git b/libavcodec/allcodecs.c a/libavcodec/allcodecs.c
+index 4df4772..ca05158 100644
+--- b/libavcodec/allcodecs.c
++++ a/libavcodec/allcodecs.c
+@@ -696,6 +696,7 @@ static void register_all(void)
      REGISTER_PARSER(H261,               h261);
      REGISTER_PARSER(H263,               h263);
      REGISTER_PARSER(H264,               h264);
@@ -332,11 +422,11 @@ index 54efaad..02a89c3 100644
      REGISTER_PARSER(HEVC,               hevc);
      REGISTER_PARSER(MJPEG,              mjpeg);
      REGISTER_PARSER(MLP,                mlp);
-diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
-index a4ceca7..1354c14 100644
---- a/libavcodec/arm/Makefile
-+++ b/libavcodec/arm/Makefile
-@@ -132,8 +132,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
+diff --git b/libavcodec/arm/Makefile a/libavcodec/arm/Makefile
+index 1eeac54..f96f93b 100644
+--- b/libavcodec/arm/Makefile
++++ a/libavcodec/arm/Makefile
+@@ -135,8 +135,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
  NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o
  NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
                                            arm/hevcdsp_deblock_neon.o    \
@@ -348,10 +438,10 @@ index a4ceca7..1354c14 100644
  NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
  NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
                                            arm/rv40dsp_neon.o
-diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
+diff --git b/libavcodec/arm/cabac.h a/libavcodec/arm/cabac.h
 index fdbf86b..0a3980a 100644
---- a/libavcodec/arm/cabac.h
-+++ b/libavcodec/arm/cabac.h
+--- b/libavcodec/arm/cabac.h
++++ a/libavcodec/arm/cabac.h
 @@ -26,13 +26,34 @@
  #include "libavutil/internal.h"
  #include "libavcodec/cabac.h"
@@ -530,11 +620,11 @@ index fdbf86b..0a3980a 100644
  #endif /* HAVE_ARMV6T2_INLINE */
  
  #endif /* AVCODEC_ARM_CABAC_H */
-diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h
+diff --git b/libavcodec/arm/hevc_cabac.h a/libavcodec/arm/hevc_cabac.h
 new file mode 100644
 index 0000000..31d3c59
 --- /dev/null
-+++ b/libavcodec/arm/hevc_cabac.h
++++ a/libavcodec/arm/hevc_cabac.h
 @@ -0,0 +1,491 @@
 +/*
 + * This file is part of FFmpeg.
@@ -1027,10 +1117,10 @@ index 0000000..31d3c59
 +#endif /* HAVE_ARMV6T2_INLINE */
 +
 +#endif /* AVCODEC_ARM_HEVC_CABAC_H */
-diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
+diff --git b/libavcodec/arm/hevcdsp_deblock_neon.S a/libavcodec/arm/hevcdsp_deblock_neon.S
 index 166bddb..a088cc3 100644
---- a/libavcodec/arm/hevcdsp_deblock_neon.S
-+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
+--- b/libavcodec/arm/hevcdsp_deblock_neon.S
++++ a/libavcodec/arm/hevcdsp_deblock_neon.S
 @@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
          vst1.8   {d4}, [r0]
          bx       lr
@@ -1159,11 +1249,11 @@ index 166bddb..a088cc3 100644
 +90:     mov         a3, #1
 +        b           11b
 +endfunc
-diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
+diff --git b/libavcodec/arm/hevcdsp_epel_neon.S a/libavcodec/arm/hevcdsp_epel_neon.S
 new file mode 100644
 index 0000000..00eab9e
 --- /dev/null
-+++ b/libavcodec/arm/hevcdsp_epel_neon.S
++++ a/libavcodec/arm/hevcdsp_epel_neon.S
 @@ -0,0 +1,337 @@
 +/*
 + * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
@@ -1502,10 +1592,10 @@ index 0000000..00eab9e
 +       .byte 4, 28, 46, 6
 +       .byte 2, 16, 54, 4
 +       .byte 2, 10, 58, 2
-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-index 5591807..49c70dd 100644
---- a/libavcodec/arm/hevcdsp_init_neon.c
-+++ b/libavcodec/arm/hevcdsp_init_neon.c
+diff --git b/libavcodec/arm/hevcdsp_init_neon.c a/libavcodec/arm/hevcdsp_init_neon.c
+index 1a3912c..5c72e1d 100644
+--- b/libavcodec/arm/hevcdsp_init_neon.c
++++ a/libavcodec/arm/hevcdsp_init_neon.c
 @@ -22,6 +22,8 @@
  #include "libavutil/arm/cpu.h"
  #include "libavcodec/hevcdsp.h"
@@ -1515,9 +1605,9 @@ index 5591807..49c70dd 100644
  
  void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
  void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-@@ -43,6 +45,21 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
- void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
+@@ -43,6 +45,21 @@ void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
+ void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                        ptrdiff_t stride);
  
 +void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
 +void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
@@ -1687,9 +1777,9 @@ index 5591807..49c70dd 100644
  {
      if (bit_depth == 8) {
 @@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-         c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
-         c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
-         c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
+         c->add_residual[2]             = ff_hevc_add_residual_16x16_neon_8;
+         c->add_residual[3]             = ff_hevc_add_residual_32x32_neon_8;
+         c->transform_4x4_luma          = ff_hevc_transform_luma_4x4_neon_8;
 +        for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
 +          c->sao_band_filter[x]        = ff_hevc_sao_band_neon_wrapper;
 +          c->sao_edge_filter[x]        = ff_hevc_sao_edge_neon_wrapper;
@@ -1729,11 +1819,11 @@ index 5591807..49c70dd 100644
 +    assert(offsetof(MvField, pred_flag) == 10);
 +    c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
  }
-diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
+diff --git b/libavcodec/arm/hevcdsp_sao_neon.S a/libavcodec/arm/hevcdsp_sao_neon.S
 new file mode 100644
 index 0000000..9c7808d
 --- /dev/null
-+++ b/libavcodec/arm/hevcdsp_sao_neon.S
++++ a/libavcodec/arm/hevcdsp_sao_neon.S
 @@ -0,0 +1,510 @@
 +/*
 + * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
@@ -2245,28 +2335,28 @@ index 0000000..9c7808d
 +        bx      lr
 +endfunc
 +
-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
-index 39713ed..25eb52b 100644
---- a/libavcodec/avcodec.h
-+++ b/libavcodec/avcodec.h
-@@ -410,6 +410,8 @@ enum AVCodecID {
-     AV_CODEC_ID_SHEERVIDEO,
-     AV_CODEC_ID_YLC,
+diff --git b/libavcodec/avcodec.h a/libavcodec/avcodec.h
+index d780477..5807e1b 100644
+--- b/libavcodec/avcodec.h
++++ a/libavcodec/avcodec.h
+@@ -443,6 +443,8 @@ enum AVCodecID {
+     AV_CODEC_ID_XPM,
+     AV_CODEC_ID_AV1,
  
 +    AV_CODEC_ID_H264_MVC,
 +
      /* various PCM "codecs" */
      AV_CODEC_ID_FIRST_AUDIO = 0x10000,     ///< A dummy id pointing at the start of audio codecs
      AV_CODEC_ID_PCM_S16LE = 0x10000,
-@@ -2850,6 +2852,7 @@ typedef struct AVCodecContext {
- #define FF_BUG_DC_CLIP          4096
+@@ -2925,6 +2927,7 @@ typedef struct AVCodecContext {
  #define FF_BUG_MS               8192 ///< Work around various bugs in Microsoft's broken decoders.
  #define FF_BUG_TRUNCATED       16384
-+#define FF_BUG_GMC_UNSUPPORTED 32768
+ #define FF_BUG_IEDGE           32768
++#define FF_BUG_GMC_UNSUPPORTED (1 << 16)
  
      /**
       * strictly follow the standard (MPEG-4, ...).
-@@ -3195,6 +3198,9 @@ typedef struct AVCodecContext {
+@@ -3276,6 +3279,9 @@ typedef struct AVCodecContext {
  #define FF_PROFILE_H264_HIGH_444_PREDICTIVE  244
  #define FF_PROFILE_H264_HIGH_444_INTRA       (244|FF_PROFILE_H264_INTRA)
  #define FF_PROFILE_H264_CAVLC_444            44
@@ -2276,23 +2366,25 @@ index 39713ed..25eb52b 100644
  
  #define FF_PROFILE_VC1_SIMPLE   0
  #define FF_PROFILE_VC1_MAIN     1
-@@ -3505,6 +3511,12 @@ typedef struct AVCodecContext {
- #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
+@@ -3586,7 +3592,13 @@ typedef struct AVCodecContext {
  #endif
  
-+    /**
+     /**
+-     * Audio only. The amount of padding (in samples) appended by the encoder to
 +     * Opaque pointer for use by replacement get_buffer2 code
 +     *
 +     * @author jc (08/02/2016)
 +     */
 +    void * get_buffer_context;
- } AVCodecContext;
- 
- AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
-diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
++
++    /* Audio only. The amount of padding (in samples) appended by the encoder to
+      * the end of the audio. I.e. this number of decoded samples must be
+      * discarded by the caller from the end of the stream to get the original
+      * audio without any trailing padding.
+diff --git b/libavcodec/cabac.h a/libavcodec/cabac.h
 index 1bf1c62..ccfa991 100644
---- a/libavcodec/cabac.h
-+++ b/libavcodec/cabac.h
+--- b/libavcodec/cabac.h
++++ a/libavcodec/cabac.h
 @@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
  typedef struct CABACContext{
      int low;
@@ -2309,15 +2401,50 @@ index 1bf1c62..ccfa991 100644
      const uint8_t *bytestream_start;
      const uint8_t *bytestream;
      const uint8_t *bytestream_end;
-diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
-index 9d94b72..535ebf0 100644
---- a/libavcodec/codec_desc.c
-+++ b/libavcodec/codec_desc.c
-@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
-         .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
-         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+diff --git b/libavcodec/codec_desc.c a/libavcodec/codec_desc.c
+index 9711019..9f99a2c 100644
+--- b/libavcodec/codec_desc.c
++++ a/libavcodec/codec_desc.c
+@@ -1622,6 +1622,48 @@ static const AVCodecDescriptor codec_descriptors[] = {
+         .props     = AV_CODEC_PROP_LOSSLESS,
+         .mime_types= MT("image/png"),
      },
 +    {
++        .id        = AV_CODEC_ID_CFHD,
++        .type      = AVMEDIA_TYPE_VIDEO,
++        .name      = "cfhd",
++        .long_name = NULL_IF_CONFIG_SMALL("Cineform HD"),
++        .props     = AV_CODEC_PROP_LOSSY,
++    },
++    {
++        .id        = AV_CODEC_ID_TRUEMOTION2RT,
++        .type      = AVMEDIA_TYPE_VIDEO,
++        .name      = "truemotion2rt",
++        .long_name = NULL_IF_CONFIG_SMALL("Duck TrueMotion 2.0 Real Time"),
++        .props     = AV_CODEC_PROP_LOSSY,
++    },
++    {
++        .id        = AV_CODEC_ID_MAGICYUV,
++        .type      = AVMEDIA_TYPE_VIDEO,
++        .name      = "magicyuv",
++        .long_name = NULL_IF_CONFIG_SMALL("MagicYUV Lossless Video"),
++        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
++    },
++    {
++        .id        = AV_CODEC_ID_SHEERVIDEO,
++        .type      = AVMEDIA_TYPE_VIDEO,
++        .name      = "sheervideo",
++        .long_name = NULL_IF_CONFIG_SMALL("BitJazz SheerVideo"),
++        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
++    },
++    {
++        .id        = AV_CODEC_ID_YLC,
++        .type      = AVMEDIA_TYPE_VIDEO,
++        .name      = "ylc",
++        .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
++        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
++    },
++    {
 +        .id        = AV_CODEC_ID_H264_MVC,
 +        .type      = AVMEDIA_TYPE_VIDEO,
 +        .name      = "h264_mvc",
@@ -2327,50 +2454,103 @@ index 9d94b72..535ebf0 100644
  
      /* various PCM "codecs" */
      {
-diff --git a/libavcodec/h264.h b/libavcodec/h264.h
-index efe3555..16358aa 100644
---- a/libavcodec/h264.h
-+++ b/libavcodec/h264.h
-@@ -126,7 +126,9 @@ enum {
-     NAL_END_STREAM      = 11,
-     NAL_FILLER_DATA     = 12,
-     NAL_SPS_EXT         = 13,
-+    NAL_SPS_SUBSET      = 15,
-     NAL_AUXILIARY_SLICE = 19,
-+    NAL_SLICE_EXT       = 20,
-     NAL_FF_IGNORE       = 0xff0f001,
+diff --git b/libavcodec/dvdsubdec.c a/libavcodec/dvdsubdec.c
+index 4e9c058..22ce728 100644
+--- b/libavcodec/dvdsubdec.c
++++ a/libavcodec/dvdsubdec.c
+@@ -189,12 +189,12 @@ static void guess_palette(DVDSubContext* ctx,
+                 r = (((subtitle_color >> 16) & 0xff) * level) >> 8;
+                 g = (((subtitle_color >> 8) & 0xff) * level) >> 8;
+                 b = (((subtitle_color >> 0) & 0xff) * level) >> 8;
+-                rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17U) << 24);
++                rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17) << 24);
+                 color_used[colormap[i]] = (i + 1);
+                 j++;
+             } else {
+                 rgba_palette[i] = (rgba_palette[color_used[colormap[i]] - 1] & 0x00ffffff) |
+-                                    ((alpha[i] * 17U) << 24);
++                                    ((alpha[i] * 17) << 24);
+             }
+         }
+     }
+diff --git b/libavcodec/h264.h a/libavcodec/h264.h
+index 86df5eb..22c4f1d 100644
+--- b/libavcodec/h264.h
++++ a/libavcodec/h264.h
+@@ -41,7 +41,9 @@ enum {
+     H264_NAL_END_STREAM      = 11,
+     H264_NAL_FILLER_DATA     = 12,
+     H264_NAL_SPS_EXT         = 13,
++    H264_NAL_SPS_SUBSET      = 15,
+     H264_NAL_AUXILIARY_SLICE = 19,
++    H264_NAL_SLICE_EXT       = 20,
  };
  
-diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
-index ce4bab2..b9b0c78 100644
---- a/libavcodec/h264_parser.c
-+++ b/libavcodec/h264_parser.c
-@@ -58,6 +58,8 @@ typedef struct H264ParseContext {
+ #endif /* AVCODEC_H264_H */
+diff --git b/libavcodec/h264_parse.c a/libavcodec/h264_parse.c
+index ea202e7..0c87319 100644
+--- b/libavcodec/h264_parse.c
++++ a/libavcodec/h264_parse.c
+@@ -59,9 +59,6 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps,
+             if (luma_weight_flag) {
+                 pwt->luma_weight[i][list][0] = get_se_golomb(gb);
+                 pwt->luma_weight[i][list][1] = get_se_golomb(gb);
+-                if ((int8_t)pwt->luma_weight[i][list][0] != pwt->luma_weight[i][list][0] ||
+-                    (int8_t)pwt->luma_weight[i][list][1] != pwt->luma_weight[i][list][1])
+-                    goto out_range_weight;
+                 if (pwt->luma_weight[i][list][0] != luma_def ||
+                     pwt->luma_weight[i][list][1] != 0) {
+                     pwt->use_weight             = 1;
+@@ -79,9 +76,6 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps,
+                     for (j = 0; j < 2; j++) {
+                         pwt->chroma_weight[i][list][j][0] = get_se_golomb(gb);
+                         pwt->chroma_weight[i][list][j][1] = get_se_golomb(gb);
+-                        if ((int8_t)pwt->chroma_weight[i][list][j][0] != pwt->chroma_weight[i][list][j][0] ||
+-                            (int8_t)pwt->chroma_weight[i][list][j][1] != pwt->chroma_weight[i][list][j][1])
+-                            goto out_range_weight;
+                         if (pwt->chroma_weight[i][list][j][0] != chroma_def ||
+                             pwt->chroma_weight[i][list][j][1] != 0) {
+                             pwt->use_weight_chroma        = 1;
+@@ -110,9 +104,6 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps,
+     }
+     pwt->use_weight = pwt->use_weight || pwt->use_weight_chroma;
+     return 0;
+-out_range_weight:
+-    avpriv_request_sample(logctx, "Out of range weight\n");
+-    return AVERROR_INVALIDDATA;
+ }
+ 
+ /**
+diff --git b/libavcodec/h264_parser.c a/libavcodec/h264_parser.c
+index bc35a61..055828c 100644
+--- b/libavcodec/h264_parser.c
++++ a/libavcodec/h264_parser.c
+@@ -60,6 +60,8 @@ typedef struct H264ParseContext {
      uint8_t parse_history[6];
      int parse_history_count;
      int parse_last_mb;
 +    int is_mvc;
 +    int slice_ext;
+     int64_t reference_dts;
+     int last_frame_num, last_picture_structure;
  } H264ParseContext;
- 
- 
-@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
+@@ -109,24 +111,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
          } else if (state <= 5) {
              int nalu_type = buf[i] & 0x1F;
-             if (nalu_type == NAL_SEI || nalu_type == NAL_SPS ||
--                nalu_type == NAL_PPS || nalu_type == NAL_AUD) {
-+                nalu_type == NAL_PPS || nalu_type == NAL_AUD ||
-+                nalu_type == NAL_SPS_SUBSET) {
+             if (nalu_type == H264_NAL_SEI || nalu_type == H264_NAL_SPS ||
+-                nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD) {
++                nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD ||
++                nalu_type == H264_NAL_SPS_SUBSET) {
                  if (pc->frame_start_found) {
                      i++;
                      goto found;
                  }
-             } else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA ||
--                       nalu_type == NAL_IDR_SLICE) {
-+                       nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) {
+             } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA ||
+-                       nalu_type == H264_NAL_IDR_SLICE) {
++                       nalu_type == H264_NAL_IDR_SLICE || (p->is_mvc && nalu_type == H264_NAL_SLICE_EXT)) {
                  state += 8;
 +
-+                p->slice_ext = (nalu_type == NAL_SLICE_EXT);
++                p->slice_ext = (nalu_type == H264_NAL_SLICE_EXT);
                  continue;
              }
              state = 7;
@@ -2386,7 +2566,7 @@ index ce4bab2..b9b0c78 100644
                  p->parse_history_count = 0;
                  mb= get_ue_golomb_long(&gb);
                  p->parse_last_mb = mb;
-@@ -145,7 +150,7 @@ found:
+@@ -149,7 +154,7 @@ found:
      pc->frame_start_found = 0;
      if (p->is_avc)
          return next_avc;
@@ -2395,7 +2575,7 @@ index ce4bab2..b9b0c78 100644
  }
  
  static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
-@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s,
+@@ -594,7 +599,8 @@ static int h264_parse(AVCodecParserContext *s,
          }
      }
  
@@ -2405,16 +2585,16 @@ index ce4bab2..b9b0c78 100644
  
      if (avctx->framerate.num)
          avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
-@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx,
+@@ -651,7 +657,7 @@ static int h264_split(AVCodecContext *avctx,
          if ((state & 0xFFFFFF00) != 0x100)
              break;
          nalu_type = state & 0x1F;
--        if (nalu_type == NAL_SPS) {
-+        if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) {
+-        if (nalu_type == H264_NAL_SPS) {
++        if (nalu_type == H264_NAL_SPS || nalu_type == H264_NAL_SPS_SUBSET) {
              has_sps = 1;
-         } else if (nalu_type == NAL_PPS)
+         } else if (nalu_type == H264_NAL_PPS)
              has_pps = 1;
-@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = {
+@@ -703,3 +709,23 @@ AVCodecParser ff_h264_parser = {
      .parser_close   = h264_close,
      .split          = h264_split,
  };
@@ -2438,2373 +2618,69 @@ index ce4bab2..b9b0c78 100644
 +    .parser_close   = h264_close,
 +    .split          = h264_split,
 +};
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index b478065..88dd40b 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -41,8 +41,186 @@
- #include "hevc.h"
- #include "profiles.h"
- 
-+#ifdef RPI
-+  #include "rpi_qpu.h"
-+  #include "rpi_user_vcsm.h"
-+  // Move Inter prediction into separate pass
-+  #define RPI_INTER
-+
-+  #ifdef RPI_INTER_QPU
-+    // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
-+    #define RPI_MULTI_MAILBOX
-+  #endif
-+
-+  // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
-+  // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
-+
-+  // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*)
-+  //#define RPI_SIMULATE_QPUS
-+  #ifdef RPI_WORKER
-+    #include "pthread.h"
-+  #endif
-+
-+  static void rpi_execute_dblk_cmds(HEVCContext *s);
-+  static void rpi_execute_transform(HEVCContext *s);
-+  static void rpi_launch_vpu_qpu(HEVCContext *s);
-+  static void rpi_execute_pred_cmds(HEVCContext *s);
-+  static void rpi_execute_inter_cmds(HEVCContext *s);
-+  static void rpi_begin(HEVCContext *s);
-+  static void flush_frame(HEVCContext *s,AVFrame *frame);
-+  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
-+
-+#endif
-+
-+// #define DISABLE_MC
-+
-+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
-+
-+#ifndef av_mod_uintp2
-+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
-+{
-+    return a & ((1 << p) - 1);
-+}
-+#   define av_mod_uintp2   av_mod_uintp2_c
-+#endif
-+
- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
- 
-+
-+#ifdef RPI_INTER_QPU
-+
-+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
-+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
-+// For each block of 64*64 the smallest block size is 8x4
-+// We also need an extra command for the setup information
-+
-+#define RPI_CHROMA_COMMAND_WORDS 12
-+#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
-+// The QPU code for UV blocks only works up to a block width of 8
-+#define RPI_CHROMA_BLOCK_WIDTH 8
-+
-+#define RPI_LUMA_COMMAND_WORDS 10
-+#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
-+
-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+
-+// TODO Chroma only needs 4 taps
-+
-+// Actual filter goes -ve, +ve, +ve, -ve using these values
-+static const uint32_t rpi_filter_coefs[8][1] = {
-+        { ENCODE_COEFFS(   0,  64,   0,   0) },
-+        { ENCODE_COEFFS(  2,  58,  10,  2) },
-+        { ENCODE_COEFFS(  4,  54,  16,  2) },
-+        { ENCODE_COEFFS(  6,  46,  28,  4) },
-+        { ENCODE_COEFFS(  4,  36,  36,  4) },
-+        { ENCODE_COEFFS(  4,  28,  46,  6) },
-+        { ENCODE_COEFFS(  2,  16,  54,  4) },
-+        { ENCODE_COEFFS(  2,  10,  58,  2) }
-+};
-+
-+#endif
-+
-+
-+#ifdef RPI_WORKER
-+
-+//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
-+//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
-+
-+#define LOG_ENTER
-+#define LOG_EXIT
-+
-+// Call this when we have completed pass0 and wish to trigger pass1 for the current job
-+static void worker_submit_job(HEVCContext *s)
-+{
-+  LOG_ENTER
-+  pthread_mutex_lock(&s->worker_mutex);
-+  s->worker_tail++;
-+  s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+  pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
-+  pthread_mutex_unlock(&s->worker_mutex);
-+  LOG_EXIT
-+}
-+
-+// Call this to say we have completed pass1
-+static void worker_complete_job(HEVCContext *s)
-+{
-+  LOG_ENTER
-+  pthread_mutex_lock(&s->worker_mutex);
-+  s->worker_head++;
-+  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
-+  pthread_mutex_unlock(&s->worker_mutex);
-+  LOG_EXIT
-+}
-+
-+// Call this to wait for all jobs to have completed at the end of a frame
-+static void worker_wait(HEVCContext *s)
-+{
-+  LOG_ENTER
-+  pthread_mutex_lock(&s->worker_mutex);
-+  while( s->worker_head !=s->worker_tail)
-+  {
-+    pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
-+  }
-+  pthread_mutex_unlock(&s->worker_mutex);
-+  LOG_EXIT
-+}
-+
-+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
-+// available to receive the next job.
-+static void worker_pass0_ready(HEVCContext *s)
-+{
-+  LOG_ENTER
-+    pthread_mutex_lock(&s->worker_mutex);
-+    // tail is number of submitted jobs
-+    // head is number of completed jobs
-+    // tail-head is number of outstanding jobs in the queue
-+    // we need to ensure there is at least 1 space left for us to use
-+    while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
-+    {
-+      // Wait until another job is completed
-+      pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
-+    }
-+    pthread_mutex_unlock(&s->worker_mutex);
-+  LOG_EXIT
-+}
-+
-+static void *worker_start(void *arg)
-+{
-+  HEVCContext *s = (HEVCContext *)arg;
-+  while(1) {
-+    pthread_mutex_lock(&s->worker_mutex);
-+
-+    while( !s->kill_worker && s->worker_tail - s->worker_head <= 0)
-+    {
-+      pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
-+    }
-+    pthread_mutex_unlock(&s->worker_mutex);
-+
-+    if (s->kill_worker) {
-+      break;
-+    }
-+    LOG_ENTER
-+    // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+    rpi_launch_vpu_qpu(s);
-+    // Perform inter prediction
-+    rpi_execute_inter_cmds(s);
-+    // Wait for transform completion
-+    vpu_wait(s->vpu_id);
-+
-+    // Perform intra prediction and residual reconstruction
-+    rpi_execute_pred_cmds(s);
-+    // Perform deblocking for CTBs in this row
-+    rpi_execute_dblk_cmds(s);
-+
-+    worker_complete_job(s);
-+    LOG_EXIT
-+  }
-+  return NULL;
-+}
-+
-+#endif
-+
- /**
-  * NOTE: Each function hls_foo correspond to the function foo in the
-  * specification (HLS stands for High Level Syntax).
-@@ -55,6 +233,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
- /* free everything allocated  by pic_arrays_init() */
- static void pic_arrays_free(HEVCContext *s)
- {
-+#ifdef RPI
-+    int job;
-+    for(job=0;job<RPI_MAX_JOBS;job++) {
-+      if (s->coeffs_buf_arm[job][0]) {
-+        gpu_free(&s->coeffs_buf_default[job]);
-+        s->coeffs_buf_arm[job][0] = 0;
-+      }
-+      if (s->coeffs_buf_arm[job][2]) {
-+        gpu_free(&s->coeffs_buf_accelerated[job]);
-+        s->coeffs_buf_arm[job][2] = 0;
-+      }
-+    }
-+#endif
-+#ifdef RPI_DEBLOCK_VPU
-+    {
-+        int i;
-+        for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) {
-+            struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
-+
-+            if (dvq->vpu_cmds_arm) {
-+                gpu_free(&dvq->deblock_vpu_gmem);
-+              dvq->vpu_cmds_arm = 0;
-+            }
-+        }
-+    }
-+#endif
-     av_freep(&s->sao);
-     av_freep(&s->deblock);
- 
-@@ -91,6 +295,87 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-     int ctb_count        = sps->ctb_width * sps->ctb_height;
-     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
- 
-+#ifdef RPI
-+    int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-+    int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
-+    int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
-+    int coefs_per_row = coefs_per_luma + coefs_per_chroma;
-+    int job;
-+
-+    av_assert0(sps);
-+    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
-+    s->ctu_per_y_chan = s->max_ctu_count / 12;
-+    s->ctu_per_uv_chan = s->max_ctu_count / 8;
-+    for(job=0;job<RPI_MAX_JOBS;job++) {
-+      printf("Allocated %d\n",coefs_per_row);
-+      for(job=0;job<RPI_MAX_JOBS;job++) {
-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
-+        s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
-+        if (!s->coeffs_buf_arm[job][0])
-+            goto fail;
-+        gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
-+        s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
-+        s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
-+        if (!s->coeffs_buf_arm[job][2])
-+            goto fail;
-+        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
-+        s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
-+      }
-+    }
-+#endif
-+#ifdef RPI_DEBLOCK_VPU
-+    {
-+        int i;
-+        s->enable_rpi_deblock = !sps->sao_enabled;
-+        s->setup_width = (sps->width+15) / 16;
-+        s->setup_height = (sps->height+15) / 16;
-+        s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
-+        s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
-+
-+        for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i)
-+        {
-+            struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
-+            const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15;
-+            const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15;
-+            const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15;
-+            const unsigned int total_size =- cmd_size + y_size + uv_size;
-+            int p_vc;
-+            uint8_t * p_arm;
-+ #if RPI_VPU_DEBLOCK_CACHED
-+            gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
-+ #else
-+            gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
-+ #endif
-+            p_vc = dvq->deblock_vpu_gmem.vc;
-+            p_arm = dvq->deblock_vpu_gmem.arm;
-+
-+            // Zap all
-+            memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes);
-+
-+            // Subdivide
-+            dvq->vpu_cmds_arm = (void*)p_arm;
-+            dvq->vpu_cmds_vc = p_vc;
-+
-+            p_arm += cmd_size;
-+            p_vc += cmd_size;
-+
-+            dvq->y_setup_arm = (void*)p_arm;
-+            dvq->y_setup_vc = (void*)p_vc;
-+
-+            p_arm += y_size;
-+            p_vc += y_size;
-+
-+            dvq->uv_setup_arm = (void*)p_arm;
-+            dvq->uv_setup_vc = (void*)p_vc;
-+
-+            dvq->cmd_id = -1;
-+        }
-+
-+        s->dvq_n = 0;
-+        s->dvq = s->dvq_ents + s->dvq_n;
-+    }
-+#endif
-+
-     s->bs_width  = (width  >> 2) + 1;
-     s->bs_height = (height >> 2) + 1;
- 
-@@ -137,6 +422,29 @@ fail:
-     return AVERROR(ENOMEM);
- }
- 
-+static void default_pred_weight_table(HEVCContext * const s)
-+{
-+  unsigned int i;
-+  s->sh.luma_log2_weight_denom = 0;
-+  s->sh.chroma_log2_weight_denom = 0;
-+  for (i = 0; i < s->sh.nb_refs[L0]; i++) {
-+      s->sh.luma_weight_l0[i] = 1;
-+      s->sh.luma_offset_l0[i] = 0;
-+      s->sh.chroma_weight_l0[i][0] = 1;
-+      s->sh.chroma_offset_l0[i][0] = 0;
-+      s->sh.chroma_weight_l0[i][1] = 1;
-+      s->sh.chroma_offset_l0[i][1] = 0;
-+  }
-+  for (i = 0; i < s->sh.nb_refs[L1]; i++) {
-+      s->sh.luma_weight_l1[i] = 1;
-+      s->sh.luma_offset_l1[i] = 0;
-+      s->sh.chroma_weight_l1[i][0] = 1;
-+      s->sh.chroma_offset_l1[i][0] = 0;
-+      s->sh.chroma_weight_l1[i][1] = 1;
-+      s->sh.chroma_offset_l1[i][1] = 0;
-+  }
-+}
-+
- static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
- {
-     int i = 0;
-@@ -674,6 +982,11 @@ static int hls_slice_header(HEVCContext *s)
-                 (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) {
-                 pred_weight_table(s, gb);
-             }
-+            else
-+            {
-+              // Give us unit weights
-+              default_pred_weight_table(s);
-+            }
- 
-             sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
-             if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
-@@ -931,6 +1244,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
-     return 0;
- }
- 
-+#ifdef RPI
-+static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
-+{
-+    if (s->enable_rpi) {
-+        HEVCLocalContext *lc = s->HEVClc;
-+        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
-+        cmd->type = RPI_PRED_INTRA;
-+        cmd->size = log2_trafo_size;
-+        cmd->c_idx = c_idx;
-+        cmd->x = x0;
-+        cmd->y = y0;
-+        cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
-+        cmd->mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
-+    } else {
-+        s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
-+    }
-+}
-+#endif
-+
- static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-                               int xBase, int yBase, int cb_xBase, int cb_yBase,
-                               int log2_cb_size, int log2_trafo_size,
-@@ -943,8 +1275,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-     if (lc->cu.pred_mode == MODE_INTRA) {
-         int trafo_size = 1 << log2_trafo_size;
-         ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
--
-+#ifdef RPI
-+        rpi_intra_pred(s, log2_trafo_size, x0, y0, 0);
-+#else
-         s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
-+#endif
+diff --git b/libavcodec/h264_slice.c a/libavcodec/h264_slice.c
+index 44a0b9f..fa1e9ae 100644
+--- b/libavcodec/h264_slice.c
++++ a/libavcodec/h264_slice.c
+@@ -1778,12 +1778,9 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl,
      }
+     if ((pps->weighted_pred && sl->slice_type_nos == AV_PICTURE_TYPE_P) ||
+         (pps->weighted_bipred_idc == 1 &&
+-         sl->slice_type_nos == AV_PICTURE_TYPE_B)) {
+-        ret = ff_h264_pred_weight_table(&sl->gb, sps, sl->ref_count,
++         sl->slice_type_nos == AV_PICTURE_TYPE_B))
++        ff_h264_pred_weight_table(&sl->gb, sps, sl->ref_count,
+                                   sl->slice_type_nos, &sl->pwt, h->avctx);
+-        if (ret < 0)
+-            return ret;
+-    }
  
-     if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
-@@ -1030,7 +1365,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
-                 if (lc->cu.pred_mode == MODE_INTRA) {
-                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
-+#ifdef RPI
-+                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
-+#else
-                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
-+#endif
-                 }
-                 if (cbf_cb[i])
-                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
-@@ -1059,7 +1398,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
-                 if (lc->cu.pred_mode == MODE_INTRA) {
-                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
-+#ifdef RPI
-+                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
-+#else
-                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
-+#endif
-                 }
-                 if (cbf_cr[i])
-                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
-@@ -1088,7 +1431,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-                 if (lc->cu.pred_mode == MODE_INTRA) {
-                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
-                                                     trafo_size_h, trafo_size_v);
-+#ifdef RPI
-+                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
-+#else
-                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
-+#endif
-                 }
-                 if (cbf_cb[i])
-                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
-@@ -1098,7 +1445,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-                 if (lc->cu.pred_mode == MODE_INTRA) {
-                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
-                                                 trafo_size_h, trafo_size_v);
-+#ifdef RPI
-+                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
-+#else
-                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
-+#endif
-                 }
-                 if (cbf_cr[i])
-                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
-@@ -1110,26 +1461,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-             int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
-             int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
-             ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
-+#ifdef RPI
-+            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1);
-+            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2);
-+#else
-             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
-             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
-+#endif
-             if (s->ps.sps->chroma_format_idc == 2) {
-                 ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
-                                                 trafo_size_h, trafo_size_v);
-+#ifdef RPI
-+                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
-+                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
-+#else
-                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
-                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
-+#endif
-             }
-         } else if (blk_idx == 3) {
-             int trafo_size_h = 1 << (log2_trafo_size + 1);
-             int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
-             ff_hevc_set_neighbour_available(s, xBase, yBase,
-                                             trafo_size_h, trafo_size_v);
-+#ifdef RPI
-+            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1);
-+            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2);
-+#else
-             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
-             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
-+#endif
-             if (s->ps.sps->chroma_format_idc == 2) {
-                 ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
-                                                 trafo_size_h, trafo_size_v);
-+#ifdef RPI
-+                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
-+                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
-+#else
-                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
-                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
-+#endif
-             }
-         }
-     }
-@@ -1332,6 +1703,93 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
-  * @param luma_offset additive offset applied to the luma prediction value
-  */
- 
-+#ifdef RPI_INTER
-+#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
-+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
-+                        int block_w, int block_h, int luma_weight, int luma_offset)
-+{
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-+    cmd->cmd = RPI_CMD_LUMA_UNI;
-+    cmd->dst = dst;
-+    cmd->dststride = dststride;
-+    cmd->src = ref->data[0];
-+    cmd->srcstride = ref->linesize[0];
-+    cmd->mv = *mv;
-+    cmd->x_off = x_off;
-+    cmd->y_off = y_off;
-+    cmd->block_w = block_w;
-+    cmd->block_h = block_h;
-+    cmd->weight = luma_weight;
-+    cmd->offset = luma_offset;
-+}
-+
-+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
-+                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
-+{
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-+    cmd->cmd = RPI_CMD_LUMA_BI;
-+    cmd->dst = dst;
-+    cmd->dststride = dststride;
-+    cmd->src = ref0->data[0];
-+    cmd->srcstride = ref0->linesize[0];
-+    cmd->mv = *mv0;
-+    cmd->x_off = x_off;
-+    cmd->y_off = y_off;
-+    cmd->block_w = block_w;
-+    cmd->block_h = block_h;
-+    cmd->src1 = ref1->data[0];
-+    cmd->srcstride1 = ref1->linesize[0];
-+    cmd->mv1 = *mv1;
-+    cmd->ref_idx[0] = current_mv->ref_idx[0];
-+    cmd->ref_idx[1] = current_mv->ref_idx[1];
-+}
-+
-+static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-+                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
-+                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
-+{
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-+    cmd->cmd = RPI_CMD_CHROMA_UNI;
-+    cmd->dst = dst0;
-+    cmd->dststride = dststride;
-+    cmd->src = src0;
-+    cmd->srcstride = srcstride;
-+    cmd->mv = current_mv->mv[reflist];
-+    cmd->x_off = x_off;
-+    cmd->y_off = y_off;
-+    cmd->block_w = block_w;
-+    cmd->block_h = block_h;
-+    cmd->weight = chroma_weight;
-+    cmd->offset = chroma_offset;
-+}
-+
-+static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
-+                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
-+{
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-+    cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
-+    cmd->dst = dst0;
-+    cmd->dststride = dststride;
-+    cmd->src = ref0->data[cidx+1];
-+    cmd->srcstride = ref0->linesize[cidx+1];
-+    cmd->mv = current_mv->mv[0];
-+    cmd->mv1 = current_mv->mv[1];
-+    cmd->x_off = x_off;
-+    cmd->y_off = y_off;
-+    cmd->block_w = block_w;
-+    cmd->block_h = block_h;
-+    cmd->src1 = ref1->data[cidx+1];
-+    cmd->srcstride1 = ref1->linesize[cidx+1];
-+    cmd->ref_idx[0] = current_mv->ref_idx[0];
-+    cmd->ref_idx[1] = current_mv->ref_idx[1];
-+}
-+
-+#else
-+#define RPI_REDIRECT(fn) fn
-+#endif
-+
- static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
-                         int block_w, int block_h, int luma_weight, int luma_offset)
-@@ -1347,6 +1805,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-     int idx              = ff_hevc_pel_weight[block_w];
- 
-+#ifdef DISABLE_MC
-+    return;
-+#endif
-+
-     x_off += mv->x >> 2;
-     y_off += mv->y >> 2;
-     src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
-@@ -1393,7 +1855,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-  * @param mv1 motion vector1 (relative to block position) to get pixel data from
-  * @param current_mv current motion vector structure
-  */
-- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
-                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
- {
-@@ -1417,6 +1879,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-     uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
-     uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
- 
-+#ifdef DISABLE_MC
-+    return;
-+#endif
-+
-     if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
-         x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
-         y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
-@@ -1502,6 +1968,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-     intptr_t _mx         = mx << (1 - hshift);
-     intptr_t _my         = my << (1 - vshift);
- 
-+#ifdef DISABLE_MC
-+    return;
-+#endif
-+
-     x_off += mv->x >> (2 + hshift);
-     y_off += mv->y >> (2 + vshift);
-     src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
-@@ -1566,6 +2036,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
-     int hshift = s->ps.sps->hshift[1];
-     int vshift = s->ps.sps->vshift[1];
- 
-+#ifdef DISABLE_MC
-+    return;
-+#endif
-+
-     intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
-     intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
-     intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
-@@ -1693,14 +2167,14 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
-     }
- }
- 
--static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
--                                int nPbW, int nPbH,
--                                int log2_cb_size, int partIdx, int idx)
-+static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
-+                                const int nPbW, const int nPbH,
-+                                const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
- {
- #define POS(c_idx, x, y)                                                              \
-     &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
-                            (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
--    HEVCLocalContext *lc = s->HEVClc;
-+    HEVCLocalContext * const lc = s->HEVClc;
-     int merge_idx = 0;
-     struct MvField current_mv = {{{ 0 }}};
- 
-@@ -1718,8 +2192,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-     int y_cb             = y0 >> log2_min_cb_size;
-     int x_pu, y_pu;
-     int i, j;
--
--    int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
-+    const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
- 
-     if (!skip_flag)
-         lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
-@@ -1763,16 +2236,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- 
--        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
-+#ifdef RPI_LUMA_QPU
-+        if (s->enable_rpi) {
-+            const Mv * const mv    = &current_mv.mv[0];
-+            const unsigned int mx          = mv->x & 3;
-+            const unsigned int my          = mv->y & 3;
-+            const unsigned int my_mx       = (my<<8) | mx;
-+            const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx;
-+            const int x1_m3 = x0 + (mv->x >> 2) - 3;
-+            const int y1_m3 = y0 + (mv->y >> 2) - 3;
-+            const uint32_t src_vc_address_y = get_vc_address_y(ref0->frame);
-+            uint32_t * y = s->curr_y_mvs;
-+
-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+              const uint32_t src_yx_hi = ((y1_m3 + start_y) << 16);
-+
-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                  const int bw = nPbW-start_x;
-+                  const int bh = nPbH-start_y;
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + 8 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
-+                  *y++ = my2_mx2_my_mx;
-+                  *y++ = s->sh.luma_weight_l0[current_mv.ref_idx[0]];
-+                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] * 2 + 1;
-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                }
-+            }
-+            s->curr_y_mvs = y;
-+        } else
-+#endif
-+        {
-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
-                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
-                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
-                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
-+        }
- 
-         if (s->ps.sps->chroma_format_idc) {
--            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
-+#ifdef RPI_INTER_QPU
-+          if (s->enable_rpi) {
-+                int hshift           = s->ps.sps->hshift[1];
-+                int vshift           = s->ps.sps->vshift[1];
-+                const Mv *mv         = &current_mv.mv[0];
-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-+                intptr_t _mx         = mx << (1 - hshift);
-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-+
-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
-+
-+                uint32_t *u = s->curr_u_mvs;
-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                      int bw = nPbW_c-start_x;
-+                      int bh = nPbH_c-start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                      *u++ = rpi_filter_coefs[_mx][0];
-+                      *u++ = rpi_filter_coefs[_my][0];
-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] * 2 + 1,
-+                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]);
-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] * 2 + 1,
-+                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]);
-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                    }
-+                }
-+                s->curr_u_mvs = u;
-+                return;
-+            }
-+#endif
-+            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
-                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
--            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
-+            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
-                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
-         }
-@@ -1782,17 +2328,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- 
--        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
-+#ifdef RPI_LUMA_QPU
-+        if (s->enable_rpi) {
-+            const int reflist = 1;
-+            const Mv *mv    = &current_mv.mv[reflist];
-+            int mx          = mv->x & 3;
-+            int my          = mv->y & 3;
-+            int my_mx = (my<<8) + mx;
-+            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-+            int x1 = x0 + (mv->x >> 2);
-+            int y1 = y0 + (mv->y >> 2);
-+            uint32_t *y = s->curr_y_mvs;
-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                  int bw = nPbW-start_x;
-+                  int bh = nPbH-start_y;
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
-+                  *y++ = my2_mx2_my_mx;
-+                  *y++ = s->sh.luma_weight_l1[current_mv.ref_idx[reflist]];
-+                  *y++ = s->sh.luma_offset_l1[current_mv.ref_idx[reflist]] * 2 + 1;
-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                }
-+            }
-+            s->curr_y_mvs = y;
-+        } else
-+#endif
-+
-+        {
-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
-                     &current_mv.mv[1], x0, y0, nPbW, nPbH,
-                     s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
-+        }
- 
-         if (s->ps.sps->chroma_format_idc) {
--            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
-+#ifdef RPI_INTER_QPU
-+            if (s->enable_rpi) {
-+                const int reflist = 1;
-+                const int hshift           = s->ps.sps->hshift[1];
-+                const int vshift           = s->ps.sps->vshift[1];
-+                const Mv * const mv        = &current_mv.mv[reflist];
-+                const intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-+                const intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-+                const intptr_t _mx         = mx << (1 - hshift);
-+                const intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-+
-+                const int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                const int y1_c = y0_c + (mv->y >> (2 + hshift));
-+
-+                uint32_t * u = s->curr_u_mvs;
-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                      const int bw = nPbW_c-start_x;
-+                      const int bh = nPbH_c-start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                      *u++ = rpi_filter_coefs[_mx][0];
-+                      *u++ = rpi_filter_coefs[_my][0];
-+                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][0] * 2 + 1,
-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][0]);
-+                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][1] * 2 + 1,
-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][1]);
-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                    }
-+                }
-+                s->curr_u_mvs = u;
-+                return;
-+            }
-+#endif
-+            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
-                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
- 
--            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
-+            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
-                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
-         }
-@@ -1802,15 +2420,118 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- 
--        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
-+#ifdef RPI_LUMA_QPU
-+        if (s->enable_rpi && 0) {
-+            const Mv *mv    = &current_mv.mv[0];
-+            int mx          = mv->x & 3;
-+            int my          = mv->y & 3;
-+            int my_mx = (my<<8) + mx;
-+            const Mv *mv2    = &current_mv.mv[1];
-+            int mx2          = mv2->x & 3;
-+            int my2          = mv2->y & 3;
-+            int my2_mx2 = (my2<<8) + mx2;
-+            int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
-+            int x1 = x0 + (mv->x >> 2);
-+            int y1 = y0 + (mv->y >> 2);
-+            int x2 = x0 + (mv2->x >> 2);
-+            int y2 = y0 + (mv2->y >> 2);
-+            uint32_t *y = s->curr_y_mvs;
-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+              for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-+                  int bw = nPbW-start_x;
-+                  int bh = nPbH-start_y;
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
-+                  *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16);
-+                  *y++ = my2_mx2_my_mx;
-+
-+                  *y++ = PACK2(s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-+                               s->sh.luma_weight_l0[current_mv.ref_idx[0]]);
-+                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] +
-+                         s->sh.luma_offset_l1[current_mv.ref_idx[1]] + 1;
-+
-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
-+                }
-+            }
-+            s->curr_y_mvs = y;
-+        } else
-+#endif
-+        {
-+            RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
-                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
-                    ref1->frame, &current_mv.mv[1], &current_mv);
-+        }
- 
-         if (s->ps.sps->chroma_format_idc) {
--            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
-+#ifdef RPI_INTER_QPU
-+          if (s->enable_rpi) {
-+                int hshift           = s->ps.sps->hshift[1];
-+                int vshift           = s->ps.sps->vshift[1];
-+                const Mv *mv         = &current_mv.mv[0];
-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-+                intptr_t _mx         = mx << (1 - hshift);
-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
-+
-+                const Mv *mv2         = &current_mv.mv[1];
-+                intptr_t mx2          = av_mod_uintp2(mv2->x, 2 + hshift);
-+                intptr_t my2          = av_mod_uintp2(mv2->y, 2 + vshift);
-+                intptr_t _mx2         = mx2 << (1 - hshift);
-+                intptr_t _my2         = my2 << (1 - vshift); // Fractional part of motion vector
-+
-+                int x2_c = x0_c + (mv2->x >> (2 + hshift));
-+                int y2_c = y0_c + (mv2->y >> (2 + hshift));
-+
-+
-+                uint32_t *u = s->curr_u_mvs;
-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                      int bw = nPbW_c-start_x;
-+                      int bh = nPbH_c-start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                      *u++ = rpi_filter_coefs[_mx][0];
-+                      *u++ = rpi_filter_coefs[_my][0];
-+                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]; // Weight L0 U
-+                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]; // Weight L0 V
-+                      *u++ = 0;  // Intermediate results are not written back in first pass of B filtering
-+                      *u++ = 0;
-+
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                      *u++ = rpi_filter_coefs[_mx2][0];
-+                      *u++ = rpi_filter_coefs[_my2][0];
-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] +
-+                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0] + 1,
-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0]);
-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] +
-+                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1] + 1,
-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1]);
-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                    }
-+                }
-+                s->curr_u_mvs = u;
-+                return;
-+            }
-+#endif
-+            RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
-                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
- 
--            chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
-+            RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
-                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
-         }
-     }
-@@ -2304,6 +3025,734 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
-     lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
- }
- 
-+#ifdef RPI
-+static void rpi_execute_dblk_cmds(HEVCContext *s)
-+{
-+    int n;
-+    int job = s->pass1_job;
-+    int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
-+    int (*p)[2] = s->dblk_cmds[job];
-+    for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
-+        ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
-+    }
-+    s->num_dblk_cmds[job] = 0;
-+}
-+
-+static void rpi_execute_transform(HEVCContext *s)
-+{
-+    int i=2;
-+    int job = s->pass1_job;
-+    /*int j;
-+    int16_t *coeffs = s->coeffs_buf_arm[job][i];
-+    for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
-+        s->hevcdsp.idct[4-2](coeffs, 16);
-+    }
-+    i=3;
-+    coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
-+    for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
-+        s->hevcdsp.idct[5-2](coeffs, 32);
-+    }*/
-+
-+    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-+    s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
-+                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
-+                               s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
-+    //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-+    //gpu_cache_flush(&s->coeffs_buf_accelerated);
-+    //vpu_wait(s->vpu_id);
-+
-+    for(i=0;i<4;i++)
-+        s->num_coeffs[job][i] = 0;
-+}
-+
-+static void rpi_execute_pred_cmds(HEVCContext *s)
-+{
-+  int i;
-+  int job = s->pass1_job;
-+  HEVCPredCmd *cmd = s->univ_pred_cmds[job];
-+#ifdef RPI_WORKER
-+  HEVCLocalContextIntra *lc = &s->HEVClcIntra;
-+#else
-+  HEVCLocalContext *lc = s->HEVClc;
-+#endif
-+
-+  for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
-+      //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
-+      if (cmd->type == RPI_PRED_INTRA) {
-+          lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
-+          lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
-+          lc->na.cand_left         = (cmd->na >> 3) & 1;
-+          lc->na.cand_up_left      = (cmd->na >> 2) & 1;
-+          lc->na.cand_up           = (cmd->na >> 1) & 1;
-+          lc->na.cand_up_right     = (cmd->na >> 0) & 1;
-+          s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
-+      } else {
-+#ifdef RPI_PRECLEAR
-+          int trafo_size = 1 << cmd->size;
-+#endif
-+          s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
-+#ifdef RPI_PRECLEAR
-+          memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
-+#endif
-+      }
-+  }
-+  s->num_pred_cmds[job] = 0;
-+}
-+
-+static void rpi_execute_inter_cmds(HEVCContext *s)
-+{
-+    int job = s->pass1_job;
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[job];
-+    int n,cidx;
-+    AVFrame myref;
-+    AVFrame myref1;
-+    struct MvField mymv;
-+    if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
-+        printf("Overflow inter_cmds\n");
-+        exit(-1);
-+    }
-+    for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
-+        switch(cmd->cmd) {
-+        case RPI_CMD_LUMA_UNI:
-+            myref.data[0] = cmd->src;
-+            myref.linesize[0] = cmd->srcstride;
-+            luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
-+            break;
-+        case RPI_CMD_LUMA_BI:
-+            myref.data[0] = cmd->src;
-+            myref.linesize[0] = cmd->srcstride;
-+            myref1.data[0] = cmd->src1;
-+            myref1.linesize[0] = cmd->srcstride1;
-+            mymv.ref_idx[0] = cmd->ref_idx[0];
-+            mymv.ref_idx[1] = cmd->ref_idx[1];
-+            luma_mc_bi(s, cmd->dst, cmd->dststride,
-+                       &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h,
-+                       &myref1, &cmd->mv1, &mymv);
-+            break;
-+        case RPI_CMD_CHROMA_UNI:
-+            mymv.mv[0] = cmd->mv;
-+            chroma_mc_uni(s, cmd->dst,
-+                          cmd->dststride, cmd->src, cmd->srcstride, 0,
-+                          cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset);
-+            break;
-+        case RPI_CMD_CHROMA_BI:
-+        case RPI_CMD_CHROMA_BI+1:
-+            cidx = cmd->cmd - RPI_CMD_CHROMA_BI;
-+            myref.data[cidx+1] = cmd->src;
-+            myref.linesize[cidx+1] = cmd->srcstride;
-+            myref1.data[cidx+1] = cmd->src1;
-+            myref1.linesize[cidx+1] = cmd->srcstride1;
-+            mymv.ref_idx[0] = cmd->ref_idx[0];
-+            mymv.ref_idx[1] = cmd->ref_idx[1];
-+            mymv.mv[0] = cmd->mv;
-+            mymv.mv[1] = cmd->mv1;
-+            chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1,
-+                         cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx);
-+            break;
-+        }
-+    }
-+    s->num_mv_cmds[job] = 0;
-+}
-+
-+static void rpi_do_all_passes(HEVCContext *s)
-+{
-+    // Kick off QPUs and VPUs
-+    rpi_launch_vpu_qpu(s);
-+    // Perform luma inter prediction
-+    rpi_execute_inter_cmds(s);
-+    // Wait for transform completion
-+    vpu_wait(s->vpu_id);
-+    // Perform intra prediction and residual reconstruction
-+    rpi_execute_pred_cmds(s);
-+    // Perform deblocking for CTBs in this row
-+    rpi_execute_dblk_cmds(s);
-+    // Prepare next batch
-+    rpi_begin(s);
-+}
-+
-+#endif
-+
-+#ifdef RPI
-+static void rpi_begin(HEVCContext *s)
-+{
-+    int job = s->pass0_job;
-+    int i;
-+#ifdef RPI_INTER_QPU
-+    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
-+    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
-+
-+    for(i=0;i<8;i++) {
-+        s->u_mvs[job][i] = s->mvs_base[job][i];
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = pic_width;
-+        *s->u_mvs[job][i]++ = pic_height;
-+        *s->u_mvs[job][i]++ = s->frame->linesize[1];
-+        *s->u_mvs[job][i]++ = s->frame->linesize[2];
-+        *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-+    }
-+    s->curr_u_mvs = s->u_mvs[job][0];
-+#endif
-+
-+#ifdef RPI_LUMA_QPU
-+    for(i=0;i<12;i++) {
-+        // This needs to have a generally similar structure to the
-+        // actual filter code as various pipelined bits need to land correctly
-+        // when inserted by the filter requests
-+        s->y_mvs[job][i] = s->y_mvs_base[job][i];
-+        *s->y_mvs[job][i]++ = 0; // y_x
-+        *s->y_mvs[job][i]++ = 0; // ref_y_base
-+        *s->y_mvs[job][i]++ = 0; // y2_x2
-+        *s->y_mvs[job][i]++ = 0; // ref_y2_base
-+        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
-+        *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6;  // weight demon + 6
-+        *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block
-+        *s->y_mvs[job][i]++ = 0; // Next kernel
-+    }
-+    s->curr_y_mvs = s->y_mvs[job][0];
-+#endif
-+    s->ctu_count = 0;
-+}
-+#endif
-+
-+#ifdef RPI_SIMULATE_QPUS
-+
-+static int32_t clipx(int x,int FRAME_WIDTH)
-+{
-+	if (x<=0) return 0;
-+	if (x>=FRAME_WIDTH) return FRAME_WIDTH-1;
-+	return x;
-+}
-+
-+static int32_t clipy(int y,int FRAME_HEIGHT)
-+{
-+	if (y<=0) return 0;
-+	if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1;
-+	return y;
-+}
-+
-+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
-+{
-+   int32_t vsum = 0;
-+   int x, y;
-+
-+   for (y = 0; y < 8; y++) {
-+      int32_t hsum = 0;
-+
-+      for (x = 0; x < 8; x++)
-+         hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
-+
-+      vsum += lumaFilter[my][y]*hsum;
-+   }
-+   vsum >>= 6;
-+   vsum = (((vsum*weight)+round)>>denom)+offset;
-+
-+   return av_clip_uint8( vsum );
-+}*/
-+
-+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
-+{
-+  int32_t vsum = 0;
-+  int x, y;
-+  int chromaFilterH[4];
-+  int chromaFilterV[4];
-+  int i;
-+  int offset_after = offset_weight>>16;
-+  int weight = (offset_weight<<16)>>16;
-+  for(i=0;i<4;i++) {
-+    chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24;
-+    chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
-+  }
-+
-+   for (y = 0; y < 4; y++) {
-+      int32_t hsum = 0;
-+
-+      for (x = 0; x < 4; x++)
-+         hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
-+
-+      vsum += chromaFilterV[y]*hsum;
-+   }
-+   vsum >>= 6;
-+   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
-+
-+   return vsum;
-+}
-+
-+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} };
-+
-+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
-+{
-+  int32_t vsum = 0;
-+  int x, y;
-+  int i;
-+  int offset_after = offset_weight>>16;
-+  int weight = (offset_weight<<16)>>16;
-+
-+   for (y = 0; y < 8; y++) {
-+      int32_t hsum = 0;
-+
-+      for (x = 0; x < 8; x++)
-+         hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
-+
-+      vsum += lumaFilter[(my_mx>>8)&3][y]*hsum;
-+   }
-+   vsum >>= 6;
-+   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
-+
-+   return vsum;
-+}
-+
-+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx)
-+{
-+  //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
-+  int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
-+  int pitch = frame->linesize[cIdx];
-+  uint32_t base = cIdx == 0 ? get_vc_address_y(frame) :
-+    cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
-+  if (p>=base && p<base+pitch*pic_height) {
-+    return frame->data[cIdx] + (p-base);
-+  }
-+  return NULL;
-+}
-+
-+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
-+{
-+  SliceHeader *sh   = &s->sh;
-+  uint8_t *arm = test_frame(s,p,s->frame,cIdx);
-+  int i;
-+  if (arm) return arm;
-+  if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
-+  {
-+    for(i=0;i<sh->nb_refs[L0];i++) {
-+      arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
-+      if (arm) return arm;
-+    }
-+  }
-+  if (sh->slice_type == B_SLICE) {
-+    for(i=0;i<sh->nb_refs[L1];i++) {
-+      arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
-+      if (arm) return arm;
-+    }
-+  }
-+  printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
-+  exit(-1);
-+  return NULL;
-+}
-+
-+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
-+{
-+  uint32_t next_kernel;
-+  uint32_t x0;
-+  uint32_t y0;
-+  uint8_t *ref_u_base;
-+  uint8_t *ref_v_base;
-+  uint32_t frame_width = p[5];
-+  uint32_t frame_height = p[6];
-+  uint32_t pitch = p[7];
-+  uint32_t dst_pitch = p[8];
-+  int32_t offset_before = p[9];
-+  int32_t denom = p[10];
-+  uint32_t vpm_id = p[11];
-+  uint32_t tmp_u_dst[256];
-+  uint32_t tmp_v_dst[256];
-+  while(1) {
-+    p += 12;
-+    next_kernel = p[0-12];
-+    x0 = p[1-12];
-+    y0 = p[2-12];
-+    if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) {
-+      int x,y;
-+      uint32_t width_height = p[5];
-+      uint32_t hcoeffs = p[6];
-+      uint32_t vcoeffs = p[7];
-+      uint32_t offset_weight_u = p[8];
-+      uint32_t offset_weight_v = p[9];
-+      uint8_t *this_u_dst;
-+      uint8_t *this_v_dst;
-+      uint32_t width = width_height >> 16;
-+      uint32_t height = (width_height << 16) >> 16;
-+      ref_u_base = compute_arm_addr(s,p[3-12],1);
-+      ref_v_base = compute_arm_addr(s,p[4-12],2);
-+      if (next_kernel!=s->mc_filter_uv_b0)
-+      {
-+        this_u_dst = compute_arm_addr(s,p[10],1);
-+        this_v_dst = compute_arm_addr(s,p[11],2);
-+      }
-+      for (y=0; y<height; ++y) {
-+        for (x=0; x<width; ++x) {
-+          if (next_kernel==s->mc_filter_uv) {
-+            int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
-+            int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
-+            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-+            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-+          } else if (next_kernel==s->mc_filter_uv_b0) {
-+            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
-+            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
-+            tmp_u_dst[x+y*16] = refa;
-+            tmp_v_dst[x+y*16] = refb;
-+          } else {
-+            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
-+            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
-+            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-+            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-+          }
-+        }
-+      }
-+    } else {
-+      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
-+      break;
-+    }
-+  }
-+}
-+
-+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
-+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
-+{
-+  uint32_t next_kernel;
-+  int y_x,y2_x2;
-+  int x0;
-+  int y0;
-+  int x2;
-+  int y2;
-+  uint32_t *p0 = p;
-+  uint8_t *ref_y_base;
-+  uint8_t *ref_y2_base;
-+  uint32_t frame_width_height = p[4];
-+  uint32_t frame_width = frame_width_height>>16;
-+  uint32_t frame_height = (frame_width_height<<16)>>16;
-+  uint32_t pitch = p[5];
-+  uint32_t dst_pitch = p[6];
-+  int offset_shift = p[7];
-+  int32_t offset_before = offset_shift>>16;
-+  int32_t denom = (offset_shift<<16)>>16;
-+  while(1) {
-+    p += 9;
-+    next_kernel = p[8-9];
-+    y_x = p[0-9];
-+    x0 = (y_x<<16)>>16;
-+    y0 = y_x>>16;
-+    y2_x2 = p[2-9];
-+    x2 = (y2_x2<<16)>>16;
-+    y2 = y2_x2>>16;
-+
-+    if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) {
-+      // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-+      int x,y;
-+      uint32_t width_height = p[4];
-+      uint32_t my2_mx2_my_mx = p[5];
-+      uint32_t offset_weight = p[6];
-+      uint8_t *this_dst = compute_arm_addr(s,p[7],0);
-+      uint32_t width = width_height >> 16;
-+      uint32_t height = (width_height << 16) >> 16;
-+      uint8_t *dst_base = s->frame->data[0];
-+      ref_y_base = compute_arm_addr(s,p[1-9],0);
-+      ref_y2_base = compute_arm_addr(s,p[3-9],0);
-+      for (y=0; y<height; ++y) {
-+        for (x=0; x<width; ++x) {
-+          if (next_kernel==s->mc_filter) {
-+            int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
-+            refa = av_clip_uint8(refa);
-+            this_dst[x+y*dst_pitch] = refa;
-+          }
-+          else {
-+            int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
-+            int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
-+            this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-+          }
-+        }
-+      }
-+    } else {
-+      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
-+      break;
-+    }
-+  }
-+}
-+
-+static void rpi_simulate_inter_qpu(HEVCContext *s)
-+{
-+  // First run the transform as normal
-+  int i;
-+  rpi_execute_transform(s);
-+  for(i=0;i<8;i++)
-+  {
-+    rpi_simulate_inter_chroma(s,s->mvs_base[i]);
-+  }
-+  for(i=0;i<12;i++)
-+  {
-+    rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
-+  }
-+}
-+
-+#endif
-+
-+#ifdef RPI_INTER_QPU
-+
-+static void rpi_launch_vpu_qpu(HEVCContext *s)
-+{
-+    int k;
-+    int job = s->pass1_job;
-+    int i;
-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
-+#ifdef RPI_LUMA_QPU
-+    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
-+#endif
-+    if (s->sh.slice_type == I_SLICE) {
-+#ifdef RPI_MULTI_MAILBOX
-+      rpi_execute_transform(s);
-+      return;
-+#endif
-+    }
-+    for(k=0;k<8;k++) {
-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
-+        av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
-+    }
-+
-+    s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+
-+#ifdef RPI_LUMA_QPU
-+    for(k=0;k<12;k++) {
-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
-+        s->y_mvs[job][k][-1] = qpu_get_fn(QPU_MC_EXIT); // Add exit command (Final uniform)
-+        av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
-+    }
-+    s->y_mvs[job][12-1][-1] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+#endif
-+
-+#ifdef RPI_SIMULATE_QPUS
-+    rpi_simulate_inter_qpu(s);
-+    return;
-+#endif
-+
-+#ifdef RPI_MULTI_MAILBOX
-+#ifdef RPI_CACHE_UNIF_MVS
-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
-+#else
-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
-+#endif
-+
-+#if 1
-+    {
-+        unsigned int i;
-+        uint32_t * p;
-+        uint32_t code = qpu_get_fn(QPU_MC_SETUP_UV);
-+        uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
-+        uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS];
-+
-+        for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
-+            *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm));
-+            *p++ = code;
-+        }
-+
-+        code = qpu_get_fn(QPU_MC_SETUP);
-+        for (p = mail_y, i = 0; i != QPU_N_Y; ++i) {
-+            *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm));
-+            *p++ = code;
-+        }
-+
-+        s->vpu_id = vpu_qpu_post_code2(vpu_get_fn(),
-+            vpu_get_constants(),
-+            s->coeffs_buf_vc[job][2],
-+            s->num_coeffs[job][2] >> 8,
-+            s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
-+            s->num_coeffs[job][3] >> 10,
-+            0,
-+            // QPU job 1
-+            QPU_N_UV,
-+            mail_uv,
-+            // QPU job 2
-+            QPU_N_Y,
-+            mail_y
-+            );
-+    }
-+
-+#else
-+    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
-+                                                                      s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
-+                                   qpu_get_fn(QPU_MC_SETUP_UV),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+#ifdef RPI_LUMA_QPU
-+                                   qpu_get_fn(QPU_MC_SETUP),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
-+#else
-+                                   0,
-+                                   0,0,0,0,
-+                                   0,0,0,0,
-+                                   0,0,0,0
-+#endif
-+                                 );
-+#endif
-+    for(i=0;i<4;i++)
-+        s->num_coeffs[job][i] = 0;
-+#else
-+#error Code rotted here
-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
-+      );
-+#endif
-+
-+
-+}
-+#else
-+
-+#ifdef RPI
-+static void rpi_launch_vpu_qpu(HEVCContext *s)
-+{
-+  rpi_execute_transform(s);
-+}
-+#endif
-+
-+#endif
-+
-+#ifdef RPI
-+
-+#ifndef RPI_FAST_CACHEFLUSH
-+#error RPI_FAST_CACHEFLUSH is broken
-+static void flush_buffer(AVBufferRef *bref) {
-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+    gpu_cache_flush(p);
-+}
-+#endif
-+
-+static void flush_frame(HEVCContext *s,AVFrame *frame)
-+{
-+#ifdef RPI_FAST_CACHEFLUSH
-+    struct vcsm_user_clean_invalid_s iocache = {};
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
-+    int n = s->ps.sps->height;
-+    int curr_y = 0;
-+    int curr_uv = 0;
-+    int n_uv = n >> s->ps.sps->vshift[1];
-+    int sz,base;
-+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+    base = s->frame->linesize[1] * curr_uv;
-+    iocache.s[0].handle = p.vcsm_handle;
-+    iocache.s[0].cmd = 3; // clean+invalidate
-+    iocache.s[0].addr = (int)(p.arm) + base;
-+    iocache.s[0].size  = sz;
-+    p = get_gpu_mem_ptr_v(s->frame);
-+    iocache.s[1].handle = p.vcsm_handle;
-+    iocache.s[1].cmd = 3; // clean+invalidate
-+    iocache.s[1].addr = (int)(p.arm) + base;
-+    iocache.s[1].size  = sz;
-+    p = get_gpu_mem_ptr_y(s->frame);
-+    sz = s->frame->linesize[0] * (n-curr_y);
-+    base = s->frame->linesize[0] * curr_y;
-+    iocache.s[2].handle = p.vcsm_handle;
-+    iocache.s[2].cmd = 3; // clean+invalidate
-+    iocache.s[2].addr = (int)(p.arm) + base;
-+    iocache.s[2].size  = sz;
-+    vcsm_clean_invalid( &iocache );
-+#else
-+    flush_buffer(frame->buf[0]);
-+    flush_buffer(frame->buf[1]);
-+    flush_buffer(frame->buf[2]);
-+#endif
-+}
-+
-+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
-+{
-+#ifdef RPI_FAST_CACHEFLUSH
-+    struct vcsm_user_clean_invalid_s iocache = {};
-+    int n;
-+    int curr_y;
-+    int curr_uv;
-+    int n_uv;
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
-+    int sz,base;
-+    int (*d)[2] = s->dblk_cmds[job];
-+    int low=(*d)[1];
-+    int high=(*d)[1];
-+    for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
-+        int y = (*d)[1];
-+        low=FFMIN(low,y);
-+        high=FFMAX(high,y);
-+    }
-+    curr_y = low;
-+    n = high+(1 << s->ps.sps->log2_ctb_size);
-+    curr_uv = curr_y >> s->ps.sps->vshift[1];
-+    n_uv = n >> s->ps.sps->vshift[1];
-+
-+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+    base = s->frame->linesize[1] * curr_uv;
-+    iocache.s[0].handle = p.vcsm_handle;
-+    iocache.s[0].cmd = 3; // clean+invalidate
-+    iocache.s[0].addr = (int)(p.arm) + base;
-+    iocache.s[0].size  = sz;
-+    p = get_gpu_mem_ptr_v(s->frame);
-+    iocache.s[1].handle = p.vcsm_handle;
-+    iocache.s[1].cmd = 3; // clean+invalidate
-+    iocache.s[1].addr = (int)(p.arm) + base;
-+    iocache.s[1].size  = sz;
-+    p = get_gpu_mem_ptr_y(s->frame);
-+    sz = s->frame->linesize[0] * (n-curr_y);
-+    base = s->frame->linesize[0] * curr_y;
-+    iocache.s[2].handle = p.vcsm_handle;
-+    iocache.s[2].cmd = 3; // clean+invalidate
-+    iocache.s[2].addr = (int)(p.arm) + base;
-+    iocache.s[2].size  = sz;
-+
-+    iocache.s[3].handle = p0->vcsm_handle;
-+    iocache.s[3].cmd = 3; // clean+invalidate
-+    iocache.s[3].addr = (int) p0->arm;
-+    iocache.s[3].size  = p0->numbytes;
-+    if (p1) {
-+      iocache.s[4].handle = p1->vcsm_handle;
-+      iocache.s[4].cmd = 3; // clean+invalidate
-+      iocache.s[4].addr = (int) p1->arm;
-+      iocache.s[4].size  = p1->numbytes;
-+    }
-+    if (p2) {
-+      iocache.s[5].handle = p2->vcsm_handle;
-+      iocache.s[5].cmd = 3; // clean+invalidate
-+      iocache.s[5].addr = (int) p2->arm;
-+      iocache.s[5].size  = p2->numbytes;
-+    }
-+    vcsm_clean_invalid( &iocache );
-+#else
-+    flush_buffer(frame->buf[0]);
-+    flush_buffer(frame->buf[1]);
-+    flush_buffer(frame->buf[2]);
-+    gpu_cache_flush3(p0, p1, p2);
-+#endif
-+}
-+
-+#endif
-+
- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- {
-     HEVCContext *s  = avctxt->priv_data;
-@@ -2313,6 +3762,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-     int y_ctb       = 0;
-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
- 
-+#ifdef RPI
-+    s->enable_rpi = s->ps.sps->bit_depth == 8
-+                    && !s->ps.pps->cross_component_prediction_enabled_flag;
-+
-+    if (!s->enable_rpi) {
-+      if (s->ps.pps->cross_component_prediction_enabled_flag)
-+        printf("Cross component\n");
-+    }
-+#endif
-+    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
-+
-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-         return AVERROR_INVALIDDATA;
-@@ -2326,6 +3786,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-         }
-     }
- 
-+#ifdef RPI_WORKER
-+    s->pass0_job = 0;
-+    s->pass1_job = 0;
-+#endif
-+#ifdef RPI
-+    rpi_begin(s);
-+#endif
-+
-     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
-         int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
- 
-@@ -2341,7 +3809,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-         s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
-         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
- 
-+#ifdef RPI_INTER_QPU
-+        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
-+#endif
-+#ifdef RPI_LUMA_QPU
-+        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
-+#endif
-+
-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+
-+#ifdef RPI_INTER_QPU
-+        s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
-+#endif
-+#ifdef RPI_LUMA_QPU
-+        s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
-+#endif
-+
-+#ifdef RPI
-+        if (s->enable_rpi) {
-+          //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
-+          //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
-+          //av_assert0(s->pass0_job<RPI_MAX_JOBS);
-+          //av_assert0(s->pass0_job>=0);
-+          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
-+          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
-+          s->ctu_count++;
-+          //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
-+
-+          if ( s->ctu_count >= s->max_ctu_count ) {
-+#ifdef RPI_WORKER
-+            if (s->used_for_ref) {
-+              // Split work load onto separate threads so we make as rapid progress as possible with this frame
-+              // Pass on this job to worker thread
-+              worker_submit_job(s);
-+              // Make sure we have space to prepare the next job
-+              worker_pass0_ready(s);
-+
-+              // Prepare the next batch of commands
-+              rpi_begin(s);
-+            } else {
-+              // Non-ref frame so do it all on this thread
-+              rpi_do_all_passes(s);
-+            }
-+#else
-+            rpi_do_all_passes(s);
-+#endif
-+          }
-+
-+        }
-+#endif
-+
-+
-         if (more_data < 0) {
-             s->tab_slice_address[ctb_addr_rs] = -1;
-             return more_data;
-@@ -2350,9 +3868,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- 
-         ctb_addr_ts++;
-         ff_hevc_save_states(s, ctb_addr_ts);
-+#ifdef RPI
-+        if (s->enable_rpi)
-+            continue;
-+#endif
-         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
-     }
- 
-+#ifdef RPI
-+
-+#ifdef RPI_WORKER
-+    // Wait for the worker to finish all its jobs
-+    if (s->enable_rpi) {
-+        worker_wait(s);
-+    }
-+#endif
-+
-+    // Finish off any half-completed rows
-+    if (s->enable_rpi && s->ctu_count) {
-+        rpi_do_all_passes(s);
-+    }
-+
-+#endif
-+
-     if (x_ctb + ctb_size >= s->ps.sps->width &&
-         y_ctb + ctb_size >= s->ps.sps->height)
-         ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
-@@ -2387,6 +3925,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
-     s = s1->sList[self_id];
-     lc = s->HEVClc;
- 
-+#ifdef RPI
-+    s->enable_rpi = 0;
-+    //printf("Wavefront\n");
-+#endif
-+
-     if(ctb_row) {
-         ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
- 
-@@ -2767,6 +4310,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
-         if (ret < 0)
-             return ret;
- 
-+        s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
-+                        s->nal_unit_type == NAL_TSA_N   ||
-+                        s->nal_unit_type == NAL_STSA_N  ||
-+                        s->nal_unit_type == NAL_RADL_N  ||
-+                        s->nal_unit_type == NAL_RASL_N);
-+
-+        if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
-+            s->is_decoded = 0;
-+            break;
-+        }
-         if (s->max_ra == INT_MAX) {
-             if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
-                 s->max_ra = s->poc;
-@@ -2891,9 +4444,17 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
-     }
- 
- fail:
--    if (s->ref && s->threads_type == FF_THREAD_FRAME)
-+    if (s->ref && s->threads_type == FF_THREAD_FRAME) {
-+#ifdef RPI_INTER_QPU
-+        ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
-+#endif
-         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
--
-+    } else if (s->ref) {
-+#ifdef RPI_INTER_QPU
-+      // When running single threaded we need to flush the whole frame
-+      flush_frame(s,s->frame);
-+#endif
-+    }
-     return ret;
- }
- 
-@@ -3064,6 +4625,41 @@ fail:
-     return AVERROR(ENOMEM);
- }
- 
-+#ifdef RPI_WORKER
-+static av_cold void hevc_init_worker(HEVCContext *s)
-+{
-+    int err;
-+    pthread_cond_init(&s->worker_cond_head, NULL);
-+    pthread_cond_init(&s->worker_cond_tail, NULL);
-+    pthread_mutex_init(&s->worker_mutex, NULL);
-+
-+    s->worker_tail=0;
-+    s->worker_head=0;
-+    s->kill_worker=0;
-+    err = pthread_create(&s->worker_thread, NULL, worker_start, s);
-+    if (err) {
-+        printf("Failed to create worker thread\n");
-+        exit(-1);
-+    }
-+}
-+
-+static av_cold void hevc_exit_worker(HEVCContext *s)
-+{
-+    void *res;
-+    s->kill_worker=1;
-+    pthread_cond_broadcast(&s->worker_cond_tail);
-+    pthread_join(s->worker_thread, &res);
-+
-+    pthread_cond_destroy(&s->worker_cond_head);
-+    pthread_cond_destroy(&s->worker_cond_tail);
-+    pthread_mutex_destroy(&s->worker_mutex);
-+
-+    s->worker_tail=0;
-+    s->worker_head=0;
-+    s->kill_worker=0;
-+}
-+#endif
-+
- static av_cold int hevc_decode_free(AVCodecContext *avctx)
- {
-     HEVCContext       *s = avctx->priv_data;
-@@ -3075,6 +4671,32 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
- 
-     av_freep(&s->cabac_state);
- 
-+#ifdef RPI
-+
-+#ifdef RPI_WORKER
-+    hevc_exit_worker(s);
-+#endif
-+
-+    for(i=0;i<RPI_MAX_JOBS;i++) {
-+      av_freep(&s->unif_mv_cmds[i]);
-+      av_freep(&s->univ_pred_cmds[i]);
-+
-+#ifdef RPI_INTER_QPU
-+      if (s->unif_mvs[i]) {
-+        gpu_free( &s->unif_mvs_ptr[i] );
-+        s->unif_mvs[i] = 0;
-+      }
-+#endif
-+#ifdef RPI_LUMA_QPU
-+      if (s->y_unif_mvs[i]) {
-+        gpu_free( &s->y_unif_mvs_ptr[i] );
-+        s->y_unif_mvs[i] = 0;
-+      }
-+#endif
-+    }
-+
-+#endif
-+
-     for (i = 0; i < 3; i++) {
-         av_freep(&s->sao_pixel_buffer_h[i]);
-         av_freep(&s->sao_pixel_buffer_v[i]);
-@@ -3116,10 +4738,23 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-     return 0;
- }
- 
-+#ifdef RPI
-+#ifdef RPI_PRECLEAR
-+static av_cold void memclear16(int16_t *p, int n)
-+{
-+  vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
-+  //int i;
-+  //for(i=0;i<n;i++)
-+  //  p[i] = 0;
-+}
-+#endif
-+#endif
-+
- static av_cold int hevc_init_context(AVCodecContext *avctx)
- {
-     HEVCContext *s = avctx->priv_data;
-     int i;
-+    int job;
- 
-     s->avctx = avctx;
- 
-@@ -3129,6 +4764,78 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     s->HEVClcList[0] = s->HEVClc;
-     s->sList[0] = s;
- 
-+#ifdef RPI
-+    for(job=0;job<RPI_MAX_JOBS;job++) {
-+        s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
-+        if (!s->unif_mv_cmds[job])
-+            goto fail;
-+        s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-+        if (!s->univ_pred_cmds[job])
-+            goto fail;
-+    }
-+
-+#ifdef RPI_INTER_QPU
-+    // We divide the image into blocks 256 wide and 64 high
-+    // We support up to 2048 widths
-+    // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
-+    // Also add space for the startup command for each stream.
-+
-+    {
-+        int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
-+        uint32_t *p;
-+		for(job=0;job<RPI_MAX_JOBS;job++) {
-+#ifdef RPI_CACHE_UNIF_MVS
-+          gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
-+#else
-+          gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
-+#endif
-+          s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
-+
-+          // Set up initial locations for uniform streams
-+          p = s->unif_mvs[job];
-+          for(i = 0; i < 8; i++) {
-+            s->mvs_base[job][i] = p;
-+            p += uv_commands_per_qpu;
-+          }
-+        }
-+        s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
-+        s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
-+        s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
-+    }
-+
-+#endif
-+#ifdef RPI_LUMA_QPU
-+    for(job=0;job<RPI_MAX_JOBS;job++)
-+    {
-+        int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
-+        uint32_t *p;
-+#ifdef RPI_CACHE_UNIF_MVS
-+        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
-+#else
-+        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
-+#endif
-+        s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
-+
-+        // Set up initial locations for uniform streams
-+        p = s->y_unif_mvs[job];
-+        for(i = 0; i < 12; i++) {
-+            s->y_mvs_base[job][i] = p;
-+            p += y_commands_per_qpu;
-+        }
-+    }
-+    s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
-+    s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
-+#endif
-+    //gpu_malloc_uncached(2048*64,&s->dummy);
-+
-+    s->enable_rpi = 0;
-+
-+#ifdef RPI_WORKER
-+    hevc_init_worker(s);
-+#endif
-+
-+#endif
-+
-     s->cabac_state = av_malloc(HEVC_CONTEXTS);
-     if (!s->cabac_state)
-         goto fail;
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index be91010..6b03ea8 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -23,6 +23,9 @@
+     sl->explicit_ref_marking = 0;
+     if (nal->ref_idc) {
+diff --git b/libavcodec/hevc.h a/libavcodec/hevc.h
+index de77d2a..494ca48 100644
+--- b/libavcodec/hevc.h
++++ a/libavcodec/hevc.h
+@@ -21,6 +21,34 @@
  #ifndef AVCODEC_HEVC_H
  #define AVCODEC_HEVC_H
  
 +// define RPI to split the CABAC/prediction/transform into separate stages
-+#include "config.h"
++#ifndef RPI
 +
- #include "libavutil/buffer.h"
- #include "libavutil/md5.h"
- 
-@@ -37,6 +40,29 @@
- #include "thread.h"
- #include "videodsp.h"
- 
-+// define RPI to split the CABAC/prediction/transform into separate stages
-+#ifdef RPI
++  #define RPI_INTER          0
++
++#else
 +
 +  #include "rpi_qpu.h"
-+  // Define RPI_INTER_QPU to use QPU for chroma inter prediction
-+  #define RPI_INTER_QPU
++  #define RPI_INTER          1          // 0 use ARM for UV inter-pred, 1 use QPU
 +
-+  #ifdef RPI_INTER_QPU
-+    // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
-+    #define RPI_LUMA_QPU
-+  #endif
-+
-+  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
-+  #define RPI_MAX_JOBS 2
 +  // Define RPI_WORKER to launch a worker thread for pixel processing tasks
 +  #define RPI_WORKER
++  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
++  // This has no effect unless RPI_WORKER is defined
++  // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as
++  // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one
++  // free for the foreground to fill in.
++  #define RPI_MAX_JOBS 2
++
 +  // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
++  // As it stands there is something mildly broken in VPU deblock - looks mostly OK
++  // but reliably fails some conformance tests (e.g. DBLK_A/B/C_)
++  // With VPU luma & chroma pred it is much the same speed to deblock on the ARM
 +//  #define RPI_DEBLOCK_VPU
 +
++  #define RPI_VPU_DEBLOCK_CACHED 1
 +#endif
 +
-+#define RPI_VPU_DEBLOCK_CACHED 1
-+
- #define MAX_DPB_SIZE 16 // A.4.1
- #define MAX_REFS 16
- 
-@@ -660,17 +686,6 @@ typedef struct CodingUnit {
-     uint8_t cu_transquant_bypass_flag;
- } CodingUnit;
- 
--typedef struct Mv {
--    int16_t x;  ///< horizontal component of motion vector
--    int16_t y;  ///< vertical component of motion vector
--} Mv;
--
--typedef struct MvField {
--    DECLARE_ALIGNED(4, Mv, mv)[2];
--    int8_t ref_idx[2];
--    int8_t pred_flag;
--} MvField;
--
- typedef struct NeighbourAvailable {
-     int cand_bottom_left;
-     int cand_left;
-@@ -747,7 +762,17 @@ typedef struct HEVCFrame {
-     uint8_t flags;
- } HEVCFrame;
- 
-+#ifdef RPI_WORKER
-+typedef struct HEVCLocalContextIntra {
-+    TransformUnit tu;
-+    NeighbourAvailable na;
-+} HEVCLocalContextIntra;
-+#endif
-+
- typedef struct HEVCLocalContext {
-+    TransformUnit tu;
-+    NeighbourAvailable na;  // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra
-+
-     uint8_t cabac_state[HEVC_CONTEXTS];
- 
-     uint8_t stat_coeff[4];
-@@ -762,7 +787,6 @@ typedef struct HEVCLocalContext {
- 
-     int qPy_pred;
- 
--    TransformUnit tu;
- 
-     uint8_t ctb_left_flag;
-     uint8_t ctb_up_flag;
-@@ -779,7 +803,6 @@ typedef struct HEVCLocalContext {
-     int ct_depth;
-     CodingUnit cu;
-     PredictionUnit pu;
--    NeighbourAvailable na;
- 
- #define BOUNDARY_LEFT_SLICE     (1 << 0)
- #define BOUNDARY_LEFT_TILE      (1 << 1)
-@@ -790,6 +813,80 @@ typedef struct HEVCLocalContext {
-     int boundary_flags;
- } HEVCLocalContext;
- 
-+
-+#ifdef RPI
-+
-+// The processing is done in chunks
-+// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma)
-+// This is a distance of 1536 pixels across the screen
-+// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
-+// but allocate more memory and increase the latency before data in the next frame can be processed
-+#define RPI_NUM_CHUNKS 1
-+
-+// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
-+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
-+
-+// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
-+#define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
-+// Each block can have an intra prediction and a transform_add command
-+#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
-+// Worst case is 16x16 CTUs
-+#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
-+
-+#define RPI_CMD_LUMA_UNI 0
-+#define RPI_CMD_CHROMA_UNI 1
-+#define RPI_CMD_LUMA_BI 2
-+#define RPI_CMD_CHROMA_BI 3
-+#define RPI_CMD_V_BI 4
-+
-+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
-+// #define RPI_PRECLEAR
-+
-+// Command for inter prediction
-+typedef struct HEVCMvCmd {
-+    int cmd;
-+    uint8_t *dst;
-+    ptrdiff_t dststride;
-+    uint8_t *src;
-+    ptrdiff_t srcstride;
-+    Mv mv;
-+    int x_off;
-+    int y_off;
-+    int block_w;
-+    int block_h;
-+    int weight;
-+    int offset;
-+    uint8_t *src1;
-+    ptrdiff_t srcstride1;
-+    Mv mv1;
-+    int8_t ref_idx[2];
-+} HEVCMvCmd;
-+
-+
-+// Command for intra prediction and transform_add of predictions to coefficients
-+#define RPI_PRED_TRANSFORM_ADD 0
-+#define RPI_PRED_INTRA 1
-+typedef struct HEVCPredCmd {
-+    uint8_t size;
-+    uint8_t type;
-+    uint8_t na;
-+    uint8_t c_idx;
-+    union {
-+        uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
-+        uint32_t x;   // RPI_PRED_INTRA
-+    };
-+    union {
-+        int16_t *buf; // RPI_PRED_TRANSFORM_ADD
-+        uint32_t y;   // RPI_PRED_INTRA
-+    };
-+    union {
-+        enum IntraPredMode mode; // RPI_PRED_TRANSFORM_ADD
-+        uint32_t stride;         // RPI_PRED_INTRA
-+    };
-+} HEVCPredCmd;
-+
-+#endif
-+
- typedef struct HEVCContext {
-     const AVClass *c;  // needed by private avoptions
-     AVCodecContext *avctx;
-@@ -798,13 +895,107 @@ typedef struct HEVCContext {
- 
-     HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
-     HEVCLocalContext    *HEVClc;
--
-+#ifdef RPI_WORKER
-+    HEVCLocalContextIntra HEVClcIntra;
-+#endif
-     uint8_t             threads_type;
-     uint8_t             threads_number;
- 
-     int                 width;
-     int                 height;
- 
-+    int used_for_ref;
-+
-+#ifdef RPI
-+    int enable_rpi;
-+    HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
-+    HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
-+    int buf_width;
-+    GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
-+    GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS];
-+    int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4];
-+    unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
-+    int num_coeffs[RPI_MAX_JOBS][4];
-+    int num_xfm_cmds[RPI_MAX_JOBS];
-+    int num_mv_cmds[RPI_MAX_JOBS];
-+    int num_pred_cmds[RPI_MAX_JOBS];
-+    int num_dblk_cmds[RPI_MAX_JOBS];
-+    int vpu_id;
-+    int pass0_job; // Pass0 does coefficient decode
-+    int pass1_job; // Pass1 does pixel processing
-+    int ctu_count; // Number of CTUs done in pass0 so far
-+    int max_ctu_count; // Number of CTUs when we trigger a round of processing
-+    int ctu_per_y_chan; // Number of CTUs per luma QPU
-+    int ctu_per_uv_chan; // Number of CTUs per chroma QPU
-+#ifdef RPI_INTER_QPU
-+    GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
-+    uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-+
-+    // _base pointers are to the start of the row
-+    uint32_t *mvs_base[RPI_MAX_JOBS][8];
-+    // these pointers are to the next free space
-+    uint32_t *u_mvs[RPI_MAX_JOBS][8];
-+    uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
-+    // Function pointers
-+    uint32_t mc_filter_uv;
-+    uint32_t mc_filter_uv_b0;
-+    uint32_t mc_filter_uv_b;
-+#endif
-+#ifdef RPI_LUMA_QPU
-+    GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
-+    uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-+    uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
-+    uint32_t *y_mvs[RPI_MAX_JOBS][12];
-+    uint32_t *curr_y_mvs; // Current uniform stream for luma
-+    // Function pointers
-+    uint32_t mc_filter;
-+    uint32_t mc_filter_b;
-+#endif
-+
-+#ifdef RPI_WORKER
-+    pthread_t worker_thread;
-+    pthread_cond_t worker_cond_head;
-+    pthread_cond_t worker_cond_tail;
-+    pthread_mutex_t worker_mutex;
-+
-+    int worker_tail; // Contains the number of posted jobs
-+    int worker_head; // Contains the number of completed jobs
-+    int kill_worker; // set to 1 to terminate the worker
-+#endif
-+
-+#define RPI_DEBLOCK_VPU_Q_COUNT 2
-+
-+#ifdef RPI_DEBLOCK_VPU
-+    int enable_rpi_deblock;
-+
-+    int uv_setup_width;
-+    int uv_setup_height;
-+    int setup_width; // Number of 16x16 blocks across the image
-+    int setup_height; // Number of 16x16 blocks down the image
-+
-+    struct dblk_vpu_q_s
-+    {
-+        GPU_MEM_PTR_T deblock_vpu_gmem;
-+
-+        uint8_t (*y_setup_arm)[2][2][2][4];
-+        uint8_t (*y_setup_vc)[2][2][2][4];
-+
-+        uint8_t (*uv_setup_arm)[2][2][2][4];  // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned
-+        uint8_t (*uv_setup_vc)[2][2][2][4];
-+
-+        int (*vpu_cmds_arm)[6]; // r0-r5 for each command
-+        int vpu_cmds_vc;
-+
-+        int cmd_id;
-+    } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
-+
-+    struct dblk_vpu_q_s * dvq;
-+    unsigned int dvq_n;
-+
-+#endif
-+
-+#endif
-+
-     uint8_t *cabac_state;
- 
-     /** 1 if the independent slice segment header was successfully parsed */
-@@ -922,6 +1113,9 @@ typedef struct HEVCContext {
-     uint32_t max_mastering_luminance;
-     uint32_t min_mastering_luminance;
- 
-+#ifdef RPI
-+    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
-+#endif
- } HEVCContext;
- 
- int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
-@@ -1048,6 +1242,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-                                  int log2_trafo_size, enum ScanType scan_idx,
-                                  int c_idx);
- 
-+#ifdef RPI_INTER_QPU
-+extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
-+#endif
-+
- void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
- 
- 
-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-index 05b2821..e2f1f4e 100644
---- a/libavcodec/hevc_cabac.c
-+++ b/libavcodec/hevc_cabac.c
-@@ -21,14 +21,72 @@
+ /**
+  * Table 7-3: NAL unit type codes
+  */
+diff --git b/libavcodec/hevc_cabac.c a/libavcodec/hevc_cabac.c
+index e27c54e..1dbbb16 100644
+--- b/libavcodec/hevc_cabac.c
++++ a/libavcodec/hevc_cabac.c
+@@ -21,6 +21,8 @@
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  
@@ -4813,10 +2689,10 @@ index 05b2821..e2f1f4e 100644
  #include "libavutil/attributes.h"
  #include "libavutil/common.h"
  
--#include "cabac_functions.h"
+@@ -29,8 +31,64 @@
  #include "hevc.h"
-+#include "cabac_functions.h"
-+
+ #include "hevcdec.h"
+ 
 +// BY22 is probably faster than simple bypass if the processor has
 +// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
 +// x86 has fast int divide
@@ -4836,7 +2712,7 @@ index 05b2821..e2f1f4e 100644
 +#if ARCH_ARM
 +#include "arm/hevc_cabac.h"
 +#endif
- 
++
  #define CABAC_MAX_BIN 31
  
 +
@@ -4878,7 +2754,7 @@ index 05b2821..e2f1f4e 100644
  /**
   * number of bin by SyntaxElement.
   */
-@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
+@@ -447,6 +505,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
      { 28, 36, 43, 49, 54, 58, 61, 63, },
  };
  
@@ -5090,7 +2966,7 @@ index 05b2821..e2f1f4e 100644
  void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
  {
      if (s->ps.pps->entropy_coding_sync_enabled_flag &&
-@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
+@@ -865,19 +1128,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
      return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
  }
  
@@ -5116,7 +2992,7 @@ index 05b2821..e2f1f4e 100644
  }
  
  int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
-@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
+@@ -893,14 +1156,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
      return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
  }
  
@@ -5133,7 +3009,7 @@ index 05b2821..e2f1f4e 100644
          ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
          ctx_shift = (log2_size + 1) >> 2;
      } else {
-@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
+@@ -931,22 +1194,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
      return value;
  }
  
@@ -5159,7 +3035,7 @@ index 05b2821..e2f1f4e 100644
  {
      return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
  }
-@@ -966,90 +1223,366 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
+@@ -968,90 +1225,337 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
      return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
  }
  
@@ -5488,12 +3364,12 @@ index 05b2821..e2f1f4e 100644
      int vshift = s->ps.sps->vshift[c_idx];
      uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
                                            ((x0 >> hshift) << s->ps.sps->pixel_shift)];
-+#ifdef RPI
-+    //***** transform_skip_flag decoded later!
-+    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4;
-+#endif
-     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+-    int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
 -    uint8_t significant_coeff_group_flag[8][8] = {{0}};
++#ifdef RPI
++    int use_vpu;
++#endif
++    int16_t *coeffs;
 +    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
      int explicit_rdpcm_flag = 0;
      int explicit_rdpcm_dir_flag;
@@ -5508,38 +3384,11 @@ index 05b2821..e2f1f4e 100644
      int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
                                           lc->tu.intra_pred_mode_c;
  
+-    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
 +    int prev_sig = 0;
 +    const int c_idx_nz = (c_idx != 0);
 +
 +    int may_hide_sign;
-+
-+#ifdef RPI
-+    if (s->enable_rpi) {
-+        int n = trafo_size * trafo_size;
-+        if (use_vpu) {
-+            // We support size 4 and size 5.
-+            // Size 4 grows from the front  (Coeffs_buf_arm[2] points to start of buf)
-+            // Size 5 grows from the back   (Coeffs_buf_arm[3] points to end of buf)
-+            // num_coeffs is indexed by log2_trafo_size-2
-+            if (log2_trafo_size == 4)
-+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
-+            else
-+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
-+            s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
-+        } else {
-+            coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
-+            s->num_coeffs[s->pass0_job][0] += n;
-+        }
-+    }
-+    // We now do the memset after transform_add while we know the data is cached.
-+    #ifdef RPI_PRECLEAR
-+    #else
-+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+    #endif
-+#else
-     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+#endif
-+
 +
  
      // Derive QP for dequant
@@ -5549,7 +3398,7 @@ index 05b2821..e2f1f4e 100644
          static const uint8_t rem6[51 + 4 * 6 + 1] = {
              0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
              3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
-@@ -1065,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+@@ -1067,9 +1571,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
          };
          int qp_y = lc->qp_y;
  
@@ -5570,7 +3419,7 @@ index 05b2821..e2f1f4e 100644
          }
  
          if (c_idx == 0) {
-@@ -1100,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+@@ -1102,39 +1616,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
              qp += s->ps.sps->qp_bd_offset;
          }
  
@@ -5641,6 +3490,9 @@ index 05b2821..e2f1f4e 100644
 +        may_hide_sign = 0;
      }
  
++
++
++
      if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
 -        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
 -        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
@@ -5658,7 +3510,7 @@ index 05b2821..e2f1f4e 100644
                                             &last_significant_coeff_x, &last_significant_coeff_y);
  
      if (last_significant_coeff_x > 3) {
-@@ -1160,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+@@ -1162,119 +1713,133 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
          int last_x_c = last_significant_coeff_x & 3;
          int last_y_c = last_significant_coeff_y & 3;
  
@@ -5715,14 +3567,40 @@ index 05b2821..e2f1f4e 100644
 -    for (i = num_last_subset; i >= 0; i--) {
 -        int n, m;
 -        int x_cg, y_cg, x_c, y_c, pos;
--        int implicit_non_zero_coeff = 0;
++    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
++
++    scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
++
++    {
++        const unsigned int ccount = 1 << (log2_trafo_size * 2);
++#ifdef RPI
++        use_vpu = 0;
++        if (s->enable_rpi) {
++            use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4;
++            coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
++#ifndef RPI_PRECLEAR
++            // Without RPI_PRECLEAR clear here; with it the memset is done after transform_add while the data is known to be cached.
++            memset(coeffs, 0, ccount * sizeof(int16_t));
++#endif
++        }
++        else
++#endif
++        {
++            coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
++            memset(coeffs, 0, ccount * sizeof(int16_t));
++        }
++    }
++
++    i = num_last_subset;
++    do {
+         int implicit_non_zero_coeff = 0;
 -        int64_t trans_coeff_level;
 -        int prev_sig = 0;
 -        int offset = i << 4;
 -        int rice_init = 0;
-+    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
++        int n_end;
  
--        uint8_t significant_coeff_flag_idx[16];
+         uint8_t significant_coeff_flag_idx[16];
 -        uint8_t nb_significant_coeff_flag = 0;
 -
 -        x_cg = scan_x_cg[i];
@@ -5734,8 +3612,7 @@ index 05b2821..e2f1f4e 100644
 -                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
 -            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
 -                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
-+    scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
- 
+-
 -            significant_coeff_group_flag[x_cg][y_cg] =
 -                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
 -            implicit_non_zero_coeff = 1;
@@ -5744,13 +3621,8 @@ index 05b2821..e2f1f4e 100644
 -            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
 -             (x_cg == 0 && y_cg == 0));
 -        }
-+    i = num_last_subset;
-+    do {
-+        int implicit_non_zero_coeff = 0;
-+        int n_end;
- 
+-
 -        last_scan_pos = num_coeff - offset - 1;
-+        uint8_t significant_coeff_flag_idx[16];
 +        unsigned int nb_significant_coeff_flag = 0;
  
          if (i == num_last_subset) {
@@ -5836,7 +3708,7 @@ index 05b2821..e2f1f4e 100644
                          if (log2_trafo_size == 3) {
                              scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
                          } else {
-@@ -1286,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+@@ -1288,34 +1853,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
                      }
                  }
              }
@@ -5885,7 +3757,7 @@ index 05b2821..e2f1f4e 100644
                      significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
                      nb_significant_coeff_flag++;
                  }
-@@ -1323,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+@@ -1325,141 +1886,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
              }
          }
  
@@ -6188,7 +4060,7 @@ index 05b2821..e2f1f4e 100644
  
      if (lc->cu.cu_transquant_bypass_flag) {
          if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
-@@ -1467,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+@@ -1469,7 +2074,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
              s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
          }
      } else {
@@ -6197,20 +4069,9 @@ index 05b2821..e2f1f4e 100644
              int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
                        log2_trafo_size == 2 &&
                        lc->cu.pred_mode == MODE_INTRA;
-@@ -1475,7 +2086,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-                 for (i = 0; i < 8; i++)
-                     FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
-             }
--
-             s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
- 
-             if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
-@@ -1486,8 +2096,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-                 s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
-             }
+@@ -1490,6 +2095,24 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
          } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
--            s->hevcdsp.idct_4x4_luma(coeffs);
-+           s->hevcdsp.idct_4x4_luma(coeffs);
+             s->hevcdsp.transform_4x4_luma(coeffs);
          } else {
 +#ifdef RPI
 +            if (!use_vpu) {
@@ -6232,16 +4093,16 @@ index 05b2821..e2f1f4e 100644
 +#else
              int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
              if (max_xy == 0)
-                 s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
-@@ -1501,6 +2129,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                 s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs);
+@@ -1503,6 +2126,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
                      col_limit = FFMIN(24, col_limit);
-                 s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+                 s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit);
              }
 +#endif
          }
      }
      if (lc->tu.cross_pf) {
-@@ -1510,6 +2139,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+@@ -1512,6 +2136,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
              coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
          }
      }
@@ -6250,19 +4111,19 @@ index 05b2821..e2f1f4e 100644
 +        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
 +        cmd->type = RPI_PRED_TRANSFORM_ADD;
 +        cmd->size = log2_trafo_size;
-+        cmd->buf = coeffs;
-+        cmd->dst = dst;
-+        cmd->stride = stride;
++        cmd->ta.buf = coeffs;
++        cmd->ta.dst = dst;
++        cmd->ta.stride = stride;
 +        return;
 +    }
 +#endif
-     s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+     s->hevcdsp.add_residual[log2_trafo_size-2](dst, coeffs, stride);
  }
  
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index 1f33b0c..55a0315 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
+diff --git b/libavcodec/hevc_filter.c a/libavcodec/hevc_filter.c
+index 14e7c8d..e4ffd87 100644
+--- b/libavcodec/hevc_filter.c
++++ a/libavcodec/hevc_filter.c
 @@ -22,6 +22,12 @@
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
@@ -6276,19 +4137,18 @@ index 1f33b0c..55a0315 100644
  #include "libavutil/common.h"
  #include "libavutil/internal.h"
  
-@@ -31,6 +37,11 @@
+@@ -30,6 +36,10 @@
  
  #include "bit_depth_template.c"
  
 +#ifdef RPI
-+#include "rpi_user_vcsm.h"
 +#include "rpi_qpu.h"
 +#endif
 +
  #define LUMA 0
  #define CB 1
  #define CR 2
-@@ -273,6 +284,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
+@@ -272,6 +282,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
      edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
      edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
  
@@ -6299,7 +4159,7 @@ index 1f33b0c..55a0315 100644
      if (restore) {
          if (!edges[0]) {
              left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
-@@ -496,6 +511,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+@@ -495,6 +509,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                  s->ps.sps->pcm.loop_filter_disable_flag) ||
                 s->ps.pps->transquant_bypass_enable_flag;
  
@@ -6315,7 +4175,7 @@ index 1f33b0c..55a0315 100644
      if (x0) {
          left_tc_offset   = s->deblock[ctb - 1].tc_offset;
          left_beta_offset = s->deblock[ctb - 1].beta_offset;
-@@ -539,6 +563,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+@@ -538,6 +561,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                                                           s->frame->linesize[LUMA],
                                                           beta, tc, no_p, no_q);
                  } else
@@ -6335,7 +4195,7 @@ index 1f33b0c..55a0315 100644
                      s->hevcdsp.hevc_v_loop_filter_luma(src,
                                                         s->frame->linesize[LUMA],
                                                         beta, tc, no_p, no_q);
-@@ -571,6 +608,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+@@ -570,6 +606,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                                                           s->frame->linesize[LUMA],
                                                           beta, tc, no_p, no_q);
                  } else
@@ -6355,7 +4215,7 @@ index 1f33b0c..55a0315 100644
                      s->hevcdsp.hevc_h_loop_filter_luma(src,
                                                         s->frame->linesize[LUMA],
                                                         beta, tc, no_p, no_q);
-@@ -605,9 +655,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+@@ -604,9 +653,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                                                                     s->frame->linesize[chroma],
                                                                     c_tc, no_p, no_q);
                          } else
@@ -6379,7 +4239,7 @@ index 1f33b0c..55a0315 100644
                      }
                  }
  
-@@ -638,6 +702,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+@@ -637,6 +700,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                                                                     s->frame->linesize[chroma],
                                                                     c_tc, no_p, no_q);
                          } else
@@ -6399,7 +4259,7 @@ index 1f33b0c..55a0315 100644
                              s->hevcdsp.hevc_h_loop_filter_chroma(src,
                                                                   s->frame->linesize[chroma],
                                                                   c_tc, no_p, no_q);
-@@ -648,69 +725,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+@@ -647,69 +723,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
      }
  }
  
@@ -6469,7 +4329,7 @@ index 1f33b0c..55a0315 100644
  
  void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
                                             int log2_trafo_size)
-@@ -721,10 +735,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+@@ -720,10 +733,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
      int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
      int min_pu_width     = s->ps.sps->min_pu_width;
      int min_tu_width     = s->ps.sps->min_tb_width;
@@ -6494,7 +4354,7 @@ index 1f33b0c..55a0315 100644
  
      boundary_upper = y0 > 0 && !(y0 & 7);
      if (boundary_upper &&
-@@ -736,34 +761,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+@@ -735,34 +759,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
            (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
          boundary_upper = 0;
  
@@ -6571,7 +4431,7 @@ index 1f33b0c..55a0315 100644
      boundary_left = x0 > 0 && !(x0 & 7);
      if (boundary_left &&
          ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
-@@ -774,64 +821,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+@@ -773,64 +819,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
            (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
          boundary_left = 0;
  
@@ -6586,9 +4446,7 @@ index 1f33b0c..55a0315 100644
 -        int xq_pu =  x0      >> log2_min_pu_size;
 -        int xp_tu = (x0 - 1) >> log2_min_tu_size;
 -        int xq_tu =  x0      >> log2_min_tu_size;
-+                               rpl;
-+        MvField *left = curr - 1;
- 
+-
 -            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
 -                int y_pu      = (y0 + i) >> log2_min_pu_size;
 -                int y_tu      = (y0 + i) >> log2_min_tu_size;
@@ -6606,10 +4464,7 @@ index 1f33b0c..55a0315 100644
 -                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
 -            }
 -    }
-+        if (is_intra) {
-+            for (j = 0; j < (1 << log2_trafo_size); j += 4)
-+                bs[j * s->bs_width >> 2] = 2;
- 
+-
 -    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
 -        RefPicList *rpl = s->ref->refPicList;
 -
@@ -6617,12 +4472,17 @@ index 1f33b0c..55a0315 100644
 -        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
 -            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
 -            int yq_pu = (y0 + j)     >> log2_min_pu_size;
--
++                               rpl;
++        MvField *left = curr - 1;
+ 
 -            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
 -                int x_pu = (x0 + i) >> log2_min_pu_size;
 -                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
 -                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
--
++        if (is_intra) {
++            for (j = 0; j < (1 << log2_trafo_size); j += 4)
++                bs[j * s->bs_width >> 2] = 2;
+ 
 -                bs = boundary_strength(s, curr, top, rpl);
 -                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
 +        } else {
@@ -6674,137 +4534,42 @@ index 1f33b0c..55a0315 100644
          }
      }
  }
-@@ -840,11 +877,196 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+@@ -839,11 +875,104 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
  #undef CB
  #undef CR
  
-+#if !defined(RPI_FAST_CACHEFLUSH)
-+#if defined(RPI_LUMA_QPU) || defined(RPI_DEBLOCK_VPU)
-+static void flush_buffer_y(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame);
-+    gpu_cache_flush(&p);
-+}
-+
-+static void flush_buffer_u(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame);
-+    gpu_cache_flush(&p);
-+}
-+
-+static void flush_buffer_v(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame);
-+    gpu_cache_flush(&p);
-+}
-+#endif
-+#endif
-+
-+
 +#ifdef RPI_DEBLOCK_VPU
-+#error Not fixed yet
-+
 +// ff_hevc_flush_buffer_lines
 +// flushes and invalidates all pixel rows in [start,end-1]
 +static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
 +{
-+#ifdef RPI_FAST_CACHEFLUSH
-+        struct vcsm_user_clean_invalid_s iocache = {};
-+        int curr_y = start;
-+        int n = end;
-+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
-+        int n_uv = n >> s->ps.sps->vshift[1];
-+        int sz,base;
-+        GPU_MEM_PTR_T p;
-+        if (curr_uv < 0) curr_uv = 0;
-+        if (n_uv<=curr_uv) { return; }
-+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+        base = s->frame->linesize[1] * curr_uv;
-+        if (flush_chroma) {
-+          p = get_gpu_mem_ptr_u(s->frame);
-+          iocache.s[0].handle = p.vcsm_handle;
-+          iocache.s[0].cmd = 3; // clean+invalidate
-+          iocache.s[0].addr = (int)p.arm + base;
-+          iocache.s[0].size  = sz;
-+          p = get_gpu_mem_ptr_v(s->frame);
-+          iocache.s[1].handle = p.vcsm_handle;
-+          iocache.s[1].cmd = 3; // clean+invalidate
-+          iocache.s[1].addr = (int)p.arm + base;
-+          iocache.s[1].size  = sz;
-+        }
-+        if (flush_luma) {
-+          p = get_gpu_mem_ptr_y(s->frame);
-+          sz = s->frame->linesize[0] * (n-curr_y);
-+          base = s->frame->linesize[0] * curr_y;
-+          iocache.s[2].handle = p.vcsm_handle;
-+          iocache.s[2].cmd = 3; // clean+invalidate
-+          iocache.s[2].addr = (int)p.arm + base;
-+          iocache.s[2].size  = sz;
-+        }
-+        vcsm_clean_invalid( &iocache );
-+#else
-+        if (flush_chroma) {
-+          flush_buffer_u(s->frame);
-+          flush_buffer_v(s->frame);
-+        }
-+        if (flush_luma) {
-+          flush_buffer_y(s->frame);
-+        }
-+#endif
++    rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
++    rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++      start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma);
++    rpi_cache_flush_finish(rfe);
 +}
 +#endif
 +
-+#ifdef RPI_INTER_QPU
-+void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
++#if RPI_INTER
++
++// Flush some lines of a reference frame
++void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n)
 +{
 +    if (s->enable_rpi && s->used_for_ref) {
-+      // TODO make this use ff_hevc_flush_buffer_lines
-+#ifdef RPI_FAST_CACHEFLUSH
-+        struct vcsm_user_clean_invalid_s iocache = {};
-+        int curr_y = ((int *)f->progress->data)[0];
-+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
-+        int n_uv = n >> s->ps.sps->vshift[1];
-+        int sz,base;
-+        GPU_MEM_PTR_T p;
-+        if (curr_uv < 0) curr_uv = 0;
-+        if (n_uv<=curr_uv) { return; }
-+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+        base = s->frame->linesize[1] * curr_uv;
-+        p = get_gpu_mem_ptr_u(s->frame);
-+        iocache.s[0].handle = p.vcsm_handle;
-+        iocache.s[0].cmd = 3; // clean+invalidate
-+        iocache.s[0].addr = (int)p.arm + base;
-+        iocache.s[0].size  = sz;
-+        p = get_gpu_mem_ptr_v(s->frame);
-+        iocache.s[1].handle = p.vcsm_handle;
-+        iocache.s[1].cmd = 3; // clean+invalidate
-+        iocache.s[1].addr = (int)p.arm + base;
-+        iocache.s[1].size  = sz;
++        const int d0 = ((int *)f->progress->data)[0];
++        const unsigned int curr_y = d0 == -1 ? 0 : d0;  // Progress is -1 initially; treat that as row 0
 +
-+#ifdef RPI_LUMA_QPU
-+        p = get_gpu_mem_ptr_y(s->frame);
-+        sz = s->frame->linesize[0] * (n-curr_y);
-+        base = s->frame->linesize[0] * curr_y;
-+        iocache.s[2].handle = p.vcsm_handle;
-+        iocache.s[2].cmd = 3; // clean+invalidate
-+        iocache.s[2].addr = (int)p.arm + base;
-+        iocache.s[2].size  = sz;
-+#endif
-+        vcsm_clean_invalid( &iocache );
-+#else
-+        flush_buffer_u(s->frame);
-+        flush_buffer_v(s->frame);
-+#ifdef RPI_LUMA_QPU
-+        flush_buffer_y(s->frame);
-+#endif
-+
-+#endif
-+        //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-+        //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-+        //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
++        if (curr_y < (unsigned int)f->f->height) {
++            rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
++            rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++              curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1);
++            rpi_cache_flush_finish(rfe);
++        }
 +    }
 +}
 +#endif
 +
 +#ifdef RPI_DEBLOCK_VPU
-+#error XXX
 +/* rpi_deblock deblocks an entire row of ctbs using the VPU */
 +static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
 +{
@@ -6833,16 +4598,19 @@ index 1f33b0c..55a0315 100644
 +  s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
 +  s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
 +  s->dvq->vpu_cmds_arm[2][5] = 4;
++
 +  // Call VPU
-+  s->dvq->cmd_id = vpu_post_code2( vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5, 0); // 5 means to do all the commands
++  {
++      const vpu_qpu_job_h vqj = vpu_qpu_job_new();
++      vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5);  // 5 means to do all the commands
++      vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id);
++      vpu_qpu_job_finish(vqj);
++  }
 +
 +  s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1);
 +  s->dvq = s->dvq_ents + s->dvq_n;
 +
-+  if (s->dvq->cmd_id != -1) {
-+      vpu_wait(s->dvq->cmd_id);
-+      s->dvq->cmd_id = -1;
-+  }
++  vpu_qpu_wait(&s->dvq->cmd_id);
 +}
 +
 +#endif
@@ -6871,14 +4639,14 @@ index 1f33b0c..55a0315 100644
      if (s->ps.sps->sao_enabled) {
          int y_end = y >= s->ps.sps->height - ctb_size;
          if (y && x)
-@@ -853,16 +1075,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+@@ -852,16 +981,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
              sao_filter_CTB(s, x - ctb_size, y);
          if (y && x_end) {
              sao_filter_CTB(s, x, y - ctb_size);
 -            if (s->threads_type & FF_THREAD_FRAME )
-+            if (s->threads_type & FF_THREAD_FRAME ) {
-+#ifdef RPI_INTER_QPU
-+                ff_hevc_flush_buffer(s,&s->ref->tf, y);
++            if (s->threads_type == FF_THREAD_FRAME ) {
++#if RPI_INTER
++                rpi_flush_ref_frame_progress(s,&s->ref->tf, y);
 +#endif
                  ff_thread_report_progress(&s->ref->tf, y, 0);
 +            }
@@ -6886,14 +4654,15 @@ index 1f33b0c..55a0315 100644
          if (x_end && y_end) {
              sao_filter_CTB(s, x , y);
 -            if (s->threads_type & FF_THREAD_FRAME )
-+            if (s->threads_type & FF_THREAD_FRAME ) {
-+#ifdef RPI_INTER_QPU
-+                ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
++            if (s->threads_type == FF_THREAD_FRAME ) {
++#if RPI_INTER
++                rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size);
 +#endif
                  ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
 +            }
-+        }
-+    } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+         }
+-    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
++    } else if (s->threads_type == FF_THREAD_FRAME && x_end) {
 +        //int newh = y + ctb_size - 4;
 +        //int currh = s->ref->tf.progress->data[0];
 +        //if (((y + ctb_size)&63)==0)
@@ -6904,15 +4673,14 @@ index 1f33b0c..55a0315 100644
 +            ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
 +          }
 +        } else {
-+#ifdef RPI_INTER_QPU
-+          ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
++#if RPI_INTER
++          rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
 +#endif
 +          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-         }
--    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
++        }
 +#else
-+#ifdef RPI_INTER_QPU
-+        ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
++#if RPI_INTER
++        rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
 +        // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
 +#endif
          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
@@ -6921,11 +4689,11 @@ index 1f33b0c..55a0315 100644
  }
  
  void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
-diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
-index 83f2ec2..6882a8d 100644
---- a/libavcodec/hevc_ps.c
-+++ b/libavcodec/hevc_ps.c
-@@ -989,6 +989,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
+diff --git b/libavcodec/hevc_ps.c a/libavcodec/hevc_ps.c
+index acd55cc..0a465d4 100644
+--- b/libavcodec/hevc_ps.c
++++ a/libavcodec/hevc_ps.c
+@@ -1001,6 +1001,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
      sps->amp_enabled_flag = get_bits1(gb);
      sps->sao_enabled      = get_bits1(gb);
  
@@ -6934,10 +4702,2640 @@ index 83f2ec2..6882a8d 100644
      sps->pcm_enabled_flag = get_bits1(gb);
      if (sps->pcm_enabled_flag) {
          sps->pcm.bit_depth   = get_bits(gb, 4) + 1;
-diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
-index 9d773d9..a6534a9 100644
---- a/libavcodec/hevcdsp.c
-+++ b/libavcodec/hevcdsp.c
+diff --git b/libavcodec/hevcdec.c a/libavcodec/hevcdec.c
+index ef21595..b36e840 100644
+--- b/libavcodec/hevcdec.c
++++ a/libavcodec/hevcdec.c
+@@ -42,8 +42,233 @@
+ #include "hevcdec.h"
+ #include "profiles.h"
+ 
++#ifdef RPI
++  #include "rpi_qpu.h"
++  #include "rpi_shader.h"
++
++  // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
++  #define RPI_CACHE_UNIF_MVS  1
++
++  // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*)
++  //#define RPI_SIMULATE_QPUS
++  #ifdef RPI_WORKER
++    #include "pthread.h"
++  #endif
++
++  #include "libavutil/atomic.h"
++
++  static void worker_core(HEVCContext * const s);
++#endif
++
++// #define DISABLE_MC
++
++
++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
++
++#ifndef av_mod_uintp2
++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
++{
++    return a & ((1 << p) - 1);
++}
++#   define av_mod_uintp2   av_mod_uintp2_c
++#endif
++
++#define Y_B_ONLY 1
++
+ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+ 
++
++#if RPI_INTER
++
++// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
++// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
++// For each block of 64*64 the smallest block size is 8x4
++// We also need an extra command for the setup information
++
++#define RPI_CHROMA_COMMAND_WORDS 11
++#define UV_COMMANDS_PER_QPU ((1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
++// The QPU code for UV blocks only works up to a block width of 8
++#define RPI_CHROMA_BLOCK_WIDTH 8
++
++typedef struct qpu_mc_pred_c_s {
++    uint32_t next_fn;
++    int16_t next_src_y;
++    int16_t next_src_x;
++    uint32_t next_src_base_u;
++    uint32_t next_src_base_v;
++    union {
++        struct {
++            uint16_t h;
++            uint16_t w;
++            uint32_t coeffs_x;
++            uint32_t coeffs_y;
++            uint32_t wo_u;
++            uint32_t wo_v;
++            uint32_t dst_addr_u;
++            uint32_t dst_addr_v;
++        } p;
++        struct {
++            uint16_t h;
++            uint16_t w;
++            uint32_t coeffs_x;
++            uint32_t coeffs_y;
++            uint32_t weight_u;
++            uint32_t weight_v;
++            uint32_t dummy0;
++            uint32_t dummy1;
++        } b0;
++        struct {
++            uint32_t dummy0;
++            uint32_t coeffs_x;
++            uint32_t coeffs_y;
++            uint32_t wo_u;
++            uint32_t wo_v;
++            uint32_t dst_addr_u;
++            uint32_t dst_addr_v;
++        } b1;
++        struct {
++            uint32_t pic_w;
++            uint32_t pic_h;
++            uint32_t src_stride;
++            uint32_t dst_stride;
++            uint32_t wdenom;
++            uint32_t dummy0;
++            uint32_t dummy1;
++        } s;
++    };
++} qpu_mc_pred_c_t;
++
++
++static const char static_assert_qpu_mc_pred[sizeof(qpu_mc_pred_c_t) != RPI_CHROMA_COMMAND_WORDS * 4 ? -1 : 1] = {0};
++
++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
++
++// TODO Chroma only needs 4 taps
++
++// Actual filter goes -ve, +ve, +ve, -ve using these values
++static const uint32_t rpi_filter_coefs[8] = {
++        ENCODE_COEFFS(  0,  64,   0,  0),
++        ENCODE_COEFFS(  2,  58,  10,  2),
++        ENCODE_COEFFS(  4,  54,  16,  2),
++        ENCODE_COEFFS(  6,  46,  28,  4),
++        ENCODE_COEFFS(  4,  36,  36,  4),
++        ENCODE_COEFFS(  4,  28,  46,  6),
++        ENCODE_COEFFS(  2,  16,  54,  4),
++        ENCODE_COEFFS(  2,  10,  58,  2)
++};
++
++#define RPI_LUMA_COMMAND_WORDS 10
++#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
++#endif
++
++
++#ifdef RPI_WORKER
++
++typedef struct worker_global_env_s
++{
++    volatile int arm_load;
++    pthread_mutex_t lock;
++
++    unsigned int arm_y;
++    unsigned int arm_c;
++    unsigned int gpu_y;
++    unsigned int gpu_c;
++} worker_global_env_t;
++
++static worker_global_env_t worker_global_env =
++{
++    .lock = PTHREAD_MUTEX_INITIALIZER
++};
++
++
++//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
++//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
++
++#define LOG_ENTER
++#define LOG_EXIT
++
++// Call this when we have completed pass0 and wish to trigger pass1 for the current job
++static void worker_submit_job(HEVCContext *s)
++{
++  LOG_ENTER
++  pthread_mutex_lock(&s->worker_mutex);
++  s->worker_tail++;
++  s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
++  pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
++  pthread_mutex_unlock(&s->worker_mutex);
++  LOG_EXIT
++}
++
++// Call this to say we have completed pass1
++static void worker_complete_job(HEVCContext *s)
++{
++  LOG_ENTER
++  pthread_mutex_lock(&s->worker_mutex);
++  s->worker_head++;
++  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
++  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
++  pthread_mutex_unlock(&s->worker_mutex);
++  LOG_EXIT
++}
++
++// Call this to wait for all jobs to have completed at the end of a frame
++static void worker_wait(HEVCContext *s)
++{
++  LOG_ENTER
++  pthread_mutex_lock(&s->worker_mutex);
++  while( s->worker_head !=s->worker_tail)
++  {
++    pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
++  }
++  pthread_mutex_unlock(&s->worker_mutex);
++  LOG_EXIT
++}
++
++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
++// available to receive the next job.
++static void worker_pass0_ready(HEVCContext *s)
++{
++  LOG_ENTER
++    pthread_mutex_lock(&s->worker_mutex);
++    // tail is number of submitted jobs
++    // head is number of completed jobs
++    // tail-head is number of outstanding jobs in the queue
++    // we need to ensure there is at least 1 space left for us to use
++    while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
++    {
++      // Wait until another job is completed
++      pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
++    }
++    pthread_mutex_unlock(&s->worker_mutex);
++  LOG_EXIT
++}
++
++static void *worker_start(void *arg)
++{
++  HEVCContext *s = (HEVCContext *)arg;
++  while(1) {
++    pthread_mutex_lock(&s->worker_mutex);
++
++    while( !s->kill_worker && s->worker_tail - s->worker_head <= 0)
++    {
++      pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
++    }
++    pthread_mutex_unlock(&s->worker_mutex);
++
++    if (s->kill_worker) {
++      break;
++    }
++    LOG_ENTER
++    worker_core(s);
++
++    worker_complete_job(s);
++    LOG_EXIT
++  }
++  return NULL;
++}
++
++#endif
++
+ /**
+  * NOTE: Each function hls_foo correspond to the function foo in the
+  * specification (HLS stands for High Level Syntax).
+@@ -56,6 +281,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+ /* free everything allocated  by pic_arrays_init() */
+ static void pic_arrays_free(HEVCContext *s)
+ {
++#ifdef RPI
++    int job;
++    for(job=0;job<RPI_MAX_JOBS;job++) {
++      if (s->coeffs_buf_arm[job][0]) {
++        gpu_free(&s->coeffs_buf_default[job]);
++        s->coeffs_buf_arm[job][0] = 0;
++      }
++      if (s->coeffs_buf_arm[job][2]) {
++        gpu_free(&s->coeffs_buf_accelerated[job]);
++        s->coeffs_buf_arm[job][2] = 0;
++      }
++    }
++#endif
++#ifdef RPI_DEBLOCK_VPU
++    {
++        int i;
++        for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) {
++            struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
++
++            if (dvq->vpu_cmds_arm) {
++                gpu_free(&dvq->deblock_vpu_gmem);
++              dvq->vpu_cmds_arm = 0;
++            }
++        }
++    }
++#endif
+     av_freep(&s->sao);
+     av_freep(&s->deblock);
+ 
+@@ -92,6 +343,88 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+     int ctb_count        = sps->ctb_width * sps->ctb_height;
+     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
+ 
++#ifdef RPI
++    const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
++    const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS;
++    const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
++    const int coefs_per_row = coefs_per_luma + coefs_per_chroma;
++    int job;
++
++    av_assert0(sps);
++//    s->max_ctu_count = sps->ctb_width;
++//    printf("CTB with=%d\n", sps->ctb_width);
++//    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
++    s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width);
++    s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y;
++    s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV;
++
++    for(job=0;job<RPI_MAX_JOBS;job++) {
++      for(job=0;job<RPI_MAX_JOBS;job++) {
++        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
++        s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
++        if (!s->coeffs_buf_arm[job][0])
++            goto fail;
++        gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
++        s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
++        s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
++        if (!s->coeffs_buf_arm[job][2])
++            goto fail;
++        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
++        s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
++      }
++    }
++#endif
++#ifdef RPI_DEBLOCK_VPU
++    {
++        int i;
++        s->enable_rpi_deblock = !sps->sao_enabled;
++        s->setup_width = (sps->width+15) / 16;
++        s->setup_height = (sps->height+15) / 16;
++        s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
++        s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
++
++        for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i)
++        {
++            struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
++            const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15;
++            const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15;
++            const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15;
++            const unsigned int total_size = cmd_size + y_size + uv_size;  // must cover all three regions carved out below
++            int p_vc;
++            uint8_t * p_arm;
++ #if RPI_VPU_DEBLOCK_CACHED
++            gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
++ #else
++            gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
++ #endif
++            p_vc = dvq->deblock_vpu_gmem.vc;
++            p_arm = dvq->deblock_vpu_gmem.arm;
++
++            // Zap all: clear numbytes bytes of the buffer before it is carved up
++            memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes);
++
++            // Subdivide: VPU cmds, then Y setup, then UV setup (each size rounded up to 16 bytes)
++            dvq->vpu_cmds_arm = (void*)p_arm;
++            dvq->vpu_cmds_vc = p_vc;
++
++            p_arm += cmd_size;
++            p_vc += cmd_size;
++
++            dvq->y_setup_arm = (void*)p_arm;
++            dvq->y_setup_vc = (void*)p_vc;
++
++            p_arm += y_size;
++            p_vc += y_size;
++
++            dvq->uv_setup_arm = (void*)p_arm;
++            dvq->uv_setup_vc = (void*)p_vc;
++        }
++
++        s->dvq_n = 0;
++        s->dvq = s->dvq_ents + s->dvq_n;
++    }
++#endif
++
+     s->bs_width  = (width  >> 2) + 1;
+     s->bs_height = (height >> 2) + 1;
+ 
+@@ -138,6 +471,29 @@ fail:
+     return AVERROR(ENOMEM);
+ }
+ 
++static void default_pred_weight_table(HEVCContext * const s)
++{
++  unsigned int i;
++  s->sh.luma_log2_weight_denom = 0;
++  s->sh.chroma_log2_weight_denom = 0;
++  for (i = 0; i < s->sh.nb_refs[L0]; i++) {
++      s->sh.luma_weight_l0[i] = 1;
++      s->sh.luma_offset_l0[i] = 0;
++      s->sh.chroma_weight_l0[i][0] = 1;
++      s->sh.chroma_offset_l0[i][0] = 0;
++      s->sh.chroma_weight_l0[i][1] = 1;
++      s->sh.chroma_offset_l0[i][1] = 0;
++  }
++  for (i = 0; i < s->sh.nb_refs[L1]; i++) {
++      s->sh.luma_weight_l1[i] = 1;
++      s->sh.luma_offset_l1[i] = 0;
++      s->sh.chroma_weight_l1[i][0] = 1;
++      s->sh.chroma_offset_l1[i][0] = 0;
++      s->sh.chroma_weight_l1[i][1] = 1;
++      s->sh.chroma_offset_l1[i][1] = 0;
++  }
++}
++
+ static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
+ {
+     int i = 0;
+@@ -678,6 +1034,11 @@ static int hls_slice_header(HEVCContext *s)
+                 (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) {
+                 pred_weight_table(s, gb);
+             }
++            else
++            {
++              // Give us unit weights
++              default_pred_weight_table(s);
++            }
+ 
+             sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
+             if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
+@@ -933,6 +1294,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
+     return 0;
+ }
+ 
++#ifdef RPI
++static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
++{
++    if (s->enable_rpi) {
++        HEVCLocalContext *lc = s->HEVClc;
++        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
++        cmd->type = RPI_PRED_INTRA;
++        cmd->size = log2_trafo_size;
++        cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
++        cmd->c_idx = c_idx;
++        cmd->i_pred.x = x0;
++        cmd->i_pred.y = y0;
++        cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
++    } else {
++        s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
++    }
++}
++#endif
++
+ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+                               int xBase, int yBase, int cb_xBase, int cb_yBase,
+                               int log2_cb_size, int log2_trafo_size,
+@@ -945,8 +1325,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+     if (lc->cu.pred_mode == MODE_INTRA) {
+         int trafo_size = 1 << log2_trafo_size;
+         ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
+-
++#ifdef RPI
++        rpi_intra_pred(s, log2_trafo_size, x0, y0, 0);
++#else
+         s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
++#endif
+     }
+ 
+     if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
+@@ -1032,7 +1415,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                 if (lc->cu.pred_mode == MODE_INTRA) {
+                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
++#ifdef RPI
++                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
++#else
+                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
++#endif
+                 }
+                 if (cbf_cb[i])
+                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+@@ -1061,7 +1448,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                 if (lc->cu.pred_mode == MODE_INTRA) {
+                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
++#ifdef RPI
++                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
++#else
+                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
++#endif
+                 }
+                 if (cbf_cr[i])
+                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+@@ -1090,7 +1481,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+                 if (lc->cu.pred_mode == MODE_INTRA) {
+                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+                                                     trafo_size_h, trafo_size_v);
++#ifdef RPI
++                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
++#else
+                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
++#endif
+                 }
+                 if (cbf_cb[i])
+                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+@@ -1100,7 +1495,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+                 if (lc->cu.pred_mode == MODE_INTRA) {
+                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+                                                 trafo_size_h, trafo_size_v);
++#ifdef RPI
++                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
++#else
+                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
++#endif
+                 }
+                 if (cbf_cr[i])
+                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+@@ -1112,26 +1511,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+             int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
+             int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
+             ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
++#ifdef RPI
++            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1);
++            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2);
++#else
+             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
+             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
++#endif
+             if (s->ps.sps->chroma_format_idc == 2) {
+                 ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
+                                                 trafo_size_h, trafo_size_v);
++#ifdef RPI
++                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
++                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
++#else
+                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
+                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
++#endif
+             }
+         } else if (blk_idx == 3) {
+             int trafo_size_h = 1 << (log2_trafo_size + 1);
+             int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
+             ff_hevc_set_neighbour_available(s, xBase, yBase,
+                                             trafo_size_h, trafo_size_v);
++#ifdef RPI
++            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1);
++            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2);
++#else
+             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
+             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
++#endif
+             if (s->ps.sps->chroma_format_idc == 2) {
+                 ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
+                                                 trafo_size_h, trafo_size_v);
++#ifdef RPI
++                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
++                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
++#else
+                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
+                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
++#endif
+             }
+         }
+     }
+@@ -1277,33 +1696,23 @@ do {
+     return 0;
+ }
+ 
+-static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
++
++static int pcm_extract(HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
+ {
+-    HEVCLocalContext *lc = s->HEVClc;
+     GetBitContext gb;
+-    int cb_size   = 1 << log2_cb_size;
+     ptrdiff_t stride0 = s->frame->linesize[0];
+     ptrdiff_t stride1 = s->frame->linesize[1];
+     ptrdiff_t stride2 = s->frame->linesize[2];
+     uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)];
+     uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+     uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
+-
+-    int length         = cb_size * cb_size * s->ps.sps->pcm.bit_depth +
+-                         (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) +
+-                          ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) *
+-                          s->ps.sps->pcm.bit_depth_chroma;
+-    const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3);
+     int ret;
+ 
+-    if (!s->sh.disable_deblocking_filter_flag)
+-        ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size);
+-
+     ret = init_get_bits(&gb, pcm, length);
+     if (ret < 0)
+         return ret;
+ 
+-    s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size,     &gb, s->ps.sps->pcm.bit_depth);
++    s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
+     if (s->ps.sps->chroma_format_idc) {
+         s->hevcdsp.put_pcm(dst1, stride1,
+                            cb_size >> s->ps.sps->hshift[1],
+@@ -1318,6 +1727,59 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+     return 0;
+ }
+ 
++#ifdef RPI
++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n)
++{
++    int16_t * const coeffs = (buf_no != 3) ?
++        s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] :
++        s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n;
++    s->num_coeffs[s->pass0_job][buf_no] += n;
++    return coeffs;
++}
++#endif
++
++// x * 2^(y*2)
++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
++{
++    return x << (y * 2);
++}
++
++static int hls_pcm_sample(HEVCContext * const s, const int x0, const int y0, unsigned int log2_cb_size)
++{
++    // Length in bits
++    const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
++        xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[1]) +
++        xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[2]);
++
++    const uint8_t * const pcm = skip_bytes(&s->HEVClc->cc, (length + 7) >> 3);
++
++    if (!s->sh.disable_deblocking_filter_flag)
++        ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size);
++
++#ifdef RPI
++    if (s->enable_rpi) {
++        // Copy coeffs
++        const int blen = (length + 7) >> 3;
++        int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, (blen + 1) >> 1);
++        memcpy(coeffs, pcm, blen);
++
++        // Add command
++        {
++            HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
++            cmd->type = RPI_PRED_I_PCM;
++            cmd->size = log2_cb_size;
++            cmd->i_pcm.src = coeffs;
++            cmd->i_pcm.x = x0;
++            cmd->i_pcm.y = y0;
++            cmd->i_pcm.src_len = length;
++        }
++        return 0;
++    }
++#endif
++
++    return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size);
++}
++
+ /**
+  * 8.5.3.2.2.1 Luma sample unidirectional interpolation process
+  *
+@@ -1334,6 +1796,91 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+  * @param luma_offset additive offset applied to the luma prediction value
+  */
+ 
++#if RPI_INTER
++static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
++                        int block_w, int block_h, int luma_weight, int luma_offset)
++{
++    HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++;
++    cmd->cmd = RPI_CMD_LUMA_UNI;
++    cmd->dst = dst;
++    cmd->dststride = dststride;
++    cmd->src = ref->data[0];
++    cmd->srcstride = ref->linesize[0];
++    cmd->mv = *mv;
++    cmd->x_off = x_off;
++    cmd->y_off = y_off;
++    cmd->block_w = block_w;
++    cmd->block_h = block_h;
++    cmd->weight = luma_weight;
++    cmd->offset = luma_offset;
++}
++
++static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
++                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1,
++                       const struct MvField * const current_mv)
++{
++    HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++;
++    cmd->cmd = RPI_CMD_LUMA_BI;
++    cmd->dst = dst;
++    cmd->dststride = dststride;
++    cmd->src = ref0->data[0];
++    cmd->srcstride = ref0->linesize[0];
++    cmd->mv = *mv0;
++    cmd->x_off = x_off;
++    cmd->y_off = y_off;
++    cmd->block_w = block_w;
++    cmd->block_h = block_h;
++    cmd->src1 = ref1->data[0];
++    cmd->srcstride1 = ref1->linesize[0];
++    cmd->mv1 = *mv1;
++    cmd->ref_idx[0] = current_mv->ref_idx[0];
++    cmd->ref_idx[1] = current_mv->ref_idx[1];
++}
++
++static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
++                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride,
++                          int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset)
++{
++    HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++;
++    cmd->cmd = RPI_CMD_CHROMA_UNI;
++    cmd->dst = dst0;
++    cmd->dststride = dststride;
++    cmd->src = src0;
++    cmd->srcstride = srcstride;
++    cmd->mv = *mv;
++    cmd->x_off = x_off;
++    cmd->y_off = y_off;
++    cmd->block_w = block_w;
++    cmd->block_h = block_h;
++    cmd->weight = chroma_weight;
++    cmd->offset = chroma_offset;
++}
++
++static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
++                         int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx)
++{
++    HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++;
++    cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
++    cmd->dst = dst0;
++    cmd->dststride = dststride;
++    cmd->src = ref0->data[cidx+1];
++    cmd->srcstride = ref0->linesize[cidx+1];
++    cmd->mv = current_mv->mv[0];
++    cmd->mv1 = current_mv->mv[1];
++    cmd->x_off = x_off;
++    cmd->y_off = y_off;
++    cmd->block_w = block_w;
++    cmd->block_h = block_h;
++    cmd->src1 = ref1->data[cidx+1];
++    cmd->srcstride1 = ref1->linesize[cidx+1];
++    cmd->ref_idx[0] = current_mv->ref_idx[0];
++    cmd->ref_idx[1] = current_mv->ref_idx[1];
++}
++
++#endif
++
+ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
+                         int block_w, int block_h, int luma_weight, int luma_offset)
+@@ -1349,6 +1896,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                            (s->sh.slice_type == HEVC_SLICE_B && s->ps.pps->weighted_bipred_flag);
+     int idx              = ff_hevc_pel_weight[block_w];
+ 
++#ifdef DISABLE_MC
++    return;
++#endif
++
+     x_off += mv->x >> 2;
+     y_off += mv->y >> 2;
+     src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
+@@ -1395,7 +1946,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+  * @param mv1 motion vector1 (relative to block position) to get pixel data from
+  * @param current_mv current motion vector structure
+  */
+- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+ {
+@@ -1419,6 +1970,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+     uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
+     uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+ 
++#ifdef DISABLE_MC
++    return;
++#endif
++
+     if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
+         x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+         y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
+@@ -1504,6 +2059,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+     intptr_t _mx         = mx << (1 - hshift);
+     intptr_t _my         = my << (1 - vshift);
+ 
++#ifdef DISABLE_MC
++    return;
++#endif
++
+     x_off += mv->x >> (2 + hshift);
+     y_off += mv->y >> (2 + vshift);
+     src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
+@@ -1568,6 +2127,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
+     int hshift = s->ps.sps->hshift[1];
+     int vshift = s->ps.sps->vshift[1];
+ 
++#ifdef DISABLE_MC
++    return;
++#endif
++
+     intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
+     intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
+     intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
+@@ -1695,14 +2258,312 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
+     }
+ }
+ 
+-static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                                int nPbW, int nPbH,
+-                                int log2_cb_size, int partIdx, int idx)
++
++#if RPI_INTER
++static void
++rpi_pred_y(HEVCContext *const s, const int x0, const int y0,
++           const int nPbW, const int nPbH,
++           const Mv *const mv,
++           const int weight_mul,
++           const int weight_offset,
++           AVFrame *const src_frame)
++{
++    const unsigned int y_off = x0 + y0 * s->frame->linesize[0]; // offset of (x0, y0) within plane 0
++    // Uni-pred luma: pass the block to rpi_luma_mc_uni(), then append commands for it at s->curr_y_mvs
++    rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame,
++                    mv, x0, y0, nPbW, nPbH,
++                    weight_mul, weight_offset);
++
++    {
++        const unsigned int mx          = mv->x & 3; // low two bits of each MV component
++        const unsigned int my          = mv->y & 3;
++        const unsigned int my_mx       = (my << 8) | mx;
++        const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx; // same my/mx pair in both halves
++        const int x1_m3 = x0 + (mv->x >> 2) - 3; // block origin + (MV >> 2), less 3
++        const int y1_m3 = y0 + (mv->y >> 2) - 3;
++        const uint32_t src_vc_address_y = get_vc_address_y(src_frame);
++        uint32_t *y = s->curr_y_mvs; // write cursor; stored back once all tiles are emitted
++        uint32_t dst_base = get_vc_address_y(s->frame) + y_off;
++        const uint32_t wo_0 = PACK2(weight_offset * 2 + 1, weight_mul);
++
++        // Potentially we could change the assembly code to support taller sizes in one go
++        for (int start_y = 0; start_y < nPbH; start_y += 16, dst_base += s->frame->linesize[0] * 16) {
++            const uint32_t src_yx_y = y1_m3 + start_y;
++            int start_x = 0;
++            const int bh = FFMIN(nPbH - start_y, 16);
++            uint32_t *const py = y - RPI_LUMA_COMMAND_WORDS;      // previous command
++            uint32_t *const ppy = y - RPI_LUMA_COMMAND_WORDS * 2; // command before that
++
++            // As Y-pred operates on two independent 8-wide src blocks we can merge
++            // this pred with the previous one if the previous one is 8 pel wide,
++            // the same height as the current block, immediately to the left of our
++            // current dest block and mono-pred.
++            //
++            // In the init (1st) block w/h is pic width height so given
++            // that no pic will ever be 8 pixels wide the first test here
++            // should fail if this is the first pred (i.e. after that test
++            // ppy is valid)
++            if (py[4] == ((8 << 16) | bh) && py[8] + 8 == dst_base && ppy[9] == s->qpu_filter) {
++                const int bw = FFMIN(nPbW, 8);
++
++                ppy[2] = PACK2(src_yx_y, x1_m3);
++                ppy[3] = src_vc_address_y;
++                py[4] += bw << 16;
++                py[5] = PACK2(my2_mx2_my_mx, py[5]);
++                // py[6] stays the same
++                py[7] = wo_0;
++
++                start_x = bw; // the first bw columns now belong to the previous command
++            }
++
++            for (; start_x < nPbW; start_x += 16) {
++                const int bw = FFMIN(nPbW - start_x, 16);
++                y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(src_yx_y, x1_m3 + start_x); // source words land RPI_LUMA_COMMAND_WORDS behind y
++                y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
++                y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(src_yx_y, x1_m3 + 8 + start_x); // second source starts 8 pels to the right
++                y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
++                *y++ = PACK2(bw, bh);
++                *y++ = my2_mx2_my_mx;
++                *y++ = wo_0;
++                *y++ = wo_0;
++                *y++ = dst_base + start_x;
++                y++[-RPI_LUMA_COMMAND_WORDS] = s->qpu_filter;
++            }
++        }
++        s->curr_y_mvs = y;
++    }
++}
++
++static void
++rpi_pred_y_b(HEVCContext * const s,
++           const int x0, const int y0,
++           const int nPbW, const int nPbH,
++           const struct MvField *const mv_field,
++           AVFrame *const src_frame,
++           AVFrame *const src_frame2)
++{
++    const unsigned int y_off = x0 + y0 * s->frame->linesize[0]; // offset of (x0, y0) within plane 0
++    const Mv * const mv  = mv_field->mv + 0; // vector paired with src_frame
++    const Mv * const mv2 = mv_field->mv + 1; // vector paired with src_frame2
++    // Bi-pred luma: pass the block to rpi_luma_mc_bi(), then (unless Y_B_ONLY) append commands at s->curr_y_mvs
++    rpi_luma_mc_bi(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame,
++           mv, x0, y0, nPbW, nPbH,
++           src_frame2, mv2, mv_field);
++#if !Y_B_ONLY
++    {
++        const unsigned int mx          = mv->x & 3; // low two bits of each MV component
++        const unsigned int my          = mv->y & 3;
++        const unsigned int my_mx = (my<<8) | mx;
++        const unsigned int mx2          = mv2->x & 3;
++        const unsigned int my2          = mv2->y & 3;
++        const unsigned int my2_mx2 = (my2<<8) | mx2;
++        const uint32_t     my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; // second vector in the high half, first in the low half
++        const int x1 = x0 + (mv->x >> 2) - 3; // block origin + (MV >> 2), less 3
++        const int y1 = y0 + (mv->y >> 2) - 3;
++        const int x2 = x0 + (mv2->x >> 2) - 3;
++        const int y2 = y0 + (mv2->y >> 2) - 3;
++        const unsigned int ref_idx0 = mv_field->ref_idx[0];
++        const unsigned int ref_idx1 = mv_field->ref_idx[1];
++        const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] +
++                     s->sh.luma_offset_l1[ref_idx1] + 1; // both list offsets summed, plus 1
++        const uint32_t wo_0 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]);
++        const uint32_t wo_1 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]);
++
++        uint32_t * y = s->curr_y_mvs; // write cursor; stored back once all tiles are emitted
++        uint32_t dst = get_vc_address_y(s->frame) + y_off;
++
++        for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
++          for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
++              int bw = nPbW-start_x; // remaining width/height; clamped to 8 / 16 when packed below
++              int bh = nPbH-start_y;
++              y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(y1 + start_y, x1 + start_x); // source words land RPI_LUMA_COMMAND_WORDS behind y
++              y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(src_frame);
++              y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(y2 + start_y, x2 + start_x);
++              y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(src_frame2);
++              *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16);
++              *y++ = my2_mx2_my_mx;
++
++              *y++ = wo_0;
++              *y++ = wo_1;
++
++              *y++ = dst + start_x;
++              y++[-RPI_LUMA_COMMAND_WORDS] = s->qpu_filter_b;
++          }
++          dst += s->frame->linesize[0] * 16; // advance destination by one 16-row band
++        }
++        s->curr_y_mvs = y;
++    }
++#endif
++}
++
++
++static void
++rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c,
++  const int nPbW_c, const int nPbH_c,
++  const Mv * const mv,
++  const int16_t * const c_weights,
++  const int16_t * const c_offsets,
++  AVFrame * const src_frame)
++{
++    // Uni-pred chroma: pass U and V to rpi_chroma_mc_uni(), then append commands at s->curr_u_mvs
++    const unsigned int c_off = x0_c + y0_c * s->frame->linesize[1]; // offset of (x0_c, y0_c); shared by planes 1 and 2
++    av_assert0(s->frame->linesize[1] == s->frame->linesize[2]);
++
++    rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1],
++                x0_c, y0_c, nPbW_c, nPbH_c, mv,
++                c_weights[0], c_offsets[0]); // index 0 of weights/offsets goes with plane 1 (U)
++
++    rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2],
++                x0_c, y0_c, nPbW_c, nPbH_c, mv,
++                c_weights[1], c_offsets[1]); // index 1 goes with plane 2 (V)
++
++    {
++        const int hshift           = s->ps.sps->hshift[1];
++        const int vshift           = s->ps.sps->vshift[1];
++
++        const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
++        const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1; // vertical component uses vshift, matching y_coeffs below
++        const uint32_t src_base_u = get_vc_address_u(src_frame);
++        const uint32_t src_base_v = get_vc_address_v(src_frame);
++        const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)];
++        const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)];
++        const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]);
++        const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]);
++        uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off;
++        uint32_t dst_base_v = get_vc_address_v(s->frame) + c_off;
++
++        qpu_mc_pred_c_t * u = (qpu_mc_pred_c_t *)s->curr_u_mvs; // write cursor; stored back once all tiles are emitted
++
++        for(int start_y=0;start_y < nPbH_c;start_y+=16)
++        {
++            const int bh = FFMIN(nPbH_c-start_y, 16);
++            // We are allowed 3/4 powers of two as well as powers of 2
++            av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2);
++
++            for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH, ++u)
++            {
++                const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
++                u[-1].next_fn  = s->qpu_filter_uv; // kernel and source fields are stored in the preceding record
++                u[-1].next_src_x = x1_c + start_x;
++                u[-1].next_src_y = y1_c + start_y;
++                u[-1].next_src_base_u = src_base_u;
++                u[-1].next_src_base_v = src_base_v;
++                u[0].p.h = bh;
++                u[0].p.w = bw;
++                u[0].p.coeffs_x = x_coeffs;
++                u[0].p.coeffs_y = y_coeffs;
++                u[0].p.wo_u = wo_u;
++                u[0].p.wo_v = wo_v;
++                u[0].p.dst_addr_u = dst_base_u + start_x;
++                u[0].p.dst_addr_v = dst_base_v + start_x;
++            }
++
++            dst_base_u += s->frame->linesize[1] * 16;
++            dst_base_v += s->frame->linesize[2] * 16;
++        }
++        s->curr_u_mvs = (uint32_t *)u;
++    }
++  return;
++}
++
++static void
++rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c,
++  const int nPbW_c, const int nPbH_c,
++  const struct MvField * const mv_field,
++  const int16_t * const c_weights,
++  const int16_t * const c_offsets,
++  const int16_t * const c_weights2,
++  const int16_t * const c_offsets2,
++  AVFrame * const src_frame,
++  AVFrame * const src_frame2)
++{
++    const unsigned int c_off = x0_c + y0_c * s->frame->linesize[1]; // offset of (x0_c, y0_c); shared by planes 1 and 2
++    av_assert0(s->frame->linesize[1] == s->frame->linesize[2]);
++    // Bi-pred chroma: pass U and V to rpi_chroma_mc_bi(), then append two records per tile at s->curr_u_mvs
++    rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2,
++                 x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 0);
++
++    rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame, src_frame2,
++                 x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1);
++
++    {
++        const int hshift = s->ps.sps->hshift[1];
++        const int vshift = s->ps.sps->vshift[1];
++        const Mv * const mv = mv_field->mv + 0;  // vector paired with src_frame
++        const Mv * const mv2 = mv_field->mv + 1; // vector paired with src_frame2
++
++        const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift);
++        const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift);
++        const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
++        const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
++        const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
++        const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1; // vertical component uses vshift, matching my above
++
++        const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift);
++        const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift);
++        const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
++        const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
++
++        const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1;
++        const int y2_c = y0_c + (mv2->y >> (2 + vshift)) - 1; // vertical component uses vshift, matching my2 above
++
++        uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off;
++        uint32_t dst_base_v = get_vc_address_v(s->frame) + c_off;
++        qpu_mc_pred_c_t * u = (qpu_mc_pred_c_t *)s->curr_u_mvs; // write cursor; stored back once all tiles are emitted
++
++        for (int start_y = 0; start_y < nPbH_c; start_y += 16) {
++          for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH, u += 2) {
++              int bw = nPbW_c-start_x; // remaining width/height; clamped when stored in b0 below
++              int bh = nPbH_c-start_y;
++              u[-1].next_fn = s->qpu_filter_uv_b0; // In fact ignored
++              u[-1].next_src_x = x1_c + start_x;
++              u[-1].next_src_y = y1_c + start_y;
++              u[-1].next_src_base_u = get_vc_address_u(src_frame);
++              u[-1].next_src_base_v = get_vc_address_v(src_frame);
++
++              u[0].next_fn = s->qpu_filter_uv_b;
++              u[0].next_src_x = x2_c + start_x;
++              u[0].next_src_y = y2_c + start_y;
++              u[0].next_src_base_u = get_vc_address_u(src_frame2);
++              u[0].next_src_base_v = get_vc_address_v(src_frame2);
++
++              u[0].b0.h = (bh<16 ? bh : 16);
++              u[0].b0.w = (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH);
++              u[0].b0.coeffs_x = coefs0_x;
++              u[0].b0.coeffs_y = coefs0_y;
++              u[0].b0.weight_u = c_weights[0]; // Weight L0 U
++              u[0].b0.weight_v = c_weights[1]; // Weight L0 V
++              u[0].b0.dummy0 = 0;  // Intermediate results are not written back in first pass of B filtering
++              u[0].b0.dummy1 = 0;
++
++              u[1].b1.dummy0 = 0;  // w,h inherited from b0
++              u[1].b1.coeffs_x = coefs1_x;
++              u[1].b1.coeffs_y = coefs1_y;
++              u[1].b1.wo_u = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]);
++              u[1].b1.wo_v = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]);
++              u[1].b1.dst_addr_u = dst_base_u + start_x;
++              u[1].b1.dst_addr_v = dst_base_v + start_x;
++          }
++
++          dst_base_u += s->frame->linesize[1] * 16;
++          dst_base_v += s->frame->linesize[2] * 16;
++        }
++
++        s->curr_u_mvs = (uint32_t *)u;
++    }
++}
++#endif
++
++
++
++static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
++                                const int nPbW, const int nPbH,
++                                const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
+ {
+ #define POS(c_idx, x, y)                                                              \
+     &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
+                            (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
+-    HEVCLocalContext *lc = s->HEVClc;
++    HEVCLocalContext * const lc = s->HEVClc;
+     int merge_idx = 0;
+     struct MvField current_mv = {{{ 0 }}};
+ 
+@@ -1720,8 +2581,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+     int y_cb             = y0 >> log2_min_cb_size;
+     int x_pu, y_pu;
+     int i, j;
+-
+-    int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
++    const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
+ 
+     if (!skip_flag)
+         lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
+@@ -1765,12 +2625,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+ 
+-        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
++#if RPI_INTER
++        if (s->enable_rpi) {
++            rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 0,
++              s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
++              ref0->frame);
++        } else
++#endif
++        {
++            luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
+                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
+                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
++        }
+ 
+         if (s->ps.sps->chroma_format_idc) {
++#if RPI_INTER
++            if (s->enable_rpi) {
++                rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0,
++                  s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
++                  ref0->frame);
++                return;
++            }
++#endif
+             chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
+@@ -1784,12 +2661,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+ 
+-        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
++#if RPI_INTER
++        if (s->enable_rpi) {
++            rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 1,
++              s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
++              ref1->frame);
++        } else
++#endif
++        {
++            luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+                     &current_mv.mv[1], x0, y0, nPbW, nPbH,
+                     s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
++        }
+ 
+         if (s->ps.sps->chroma_format_idc) {
++#if RPI_INTER
++            if (s->enable_rpi) {
++                rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1,
++                  s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
++                  ref1->frame);
++                return;
++            }
++#endif
+             chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
+@@ -1804,11 +2698,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+ 
+-        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
++#if RPI_INTER
++        if (s->enable_rpi) {
++            rpi_pred_y_b(s, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
++        } else
++#endif
++        {
++            luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
+                    ref1->frame, &current_mv.mv[1], &current_mv);
++        }
+ 
+         if (s->ps.sps->chroma_format_idc) {
++#if RPI_INTER
++          if (s->enable_rpi) {
++              rpi_pred_c_b(s, x0_c, y0_c, nPbW_c, nPbH_c,
++                           &current_mv,
++                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
++                           s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
++                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
++                           s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
++                           ref0->frame,
++                           ref1->frame);
++                return;
++            }
++#endif
+             chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
+ 
+@@ -2083,7 +2997,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
+                 intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
+                 ret = hls_pcm_sample(s, x0, y0, log2_cb_size);
+                 if (s->ps.sps->pcm.loop_filter_disable_flag)
++                {
+                     set_deblocking_bypass(s, x0, y0, log2_cb_size);
++                }
+ 
+                 if (ret < 0)
+                     return ret;
+@@ -2306,6 +3222,741 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+     lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
+ }
+ 
++#ifdef RPI
++static void rpi_execute_dblk_cmds(HEVCContext *s) // run the queued filter calls for job s->pass1_job
++{
++    int n;
++    int job = s->pass1_job;
++    int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
++    int (*p)[2] = s->dblk_cmds[job]; // each entry is a pair passed as the 2nd and 3rd arguments below
++    for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
++        ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
++    }
++    s->num_dblk_cmds[job] = 0; // queue is empty again
++}
++
++#if 0
++static void rpi_execute_transform(HEVCContext *s) // compiled out by the enclosing #if 0
++{
++    int i=2;
++    int job = s->pass1_job;
++    /*int j;
++    int16_t *coeffs = s->coeffs_buf_arm[job][i];
++    for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
++        s->hevcdsp.idct[4-2](coeffs, 16);
++    }
++    i=3;
++    coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
++    for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
++        s->hevcdsp.idct[5-2](coeffs, 32);
++    }*/
++
++    rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++    s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
++                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
++                               s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]); // >> 8 and >> 10: presumably counts of 16*16 and 32*32 blocks, as in the loops commented out above
++    //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
++    //gpu_cache_flush(&s->coeffs_buf_accelerated);
++    //vpu_wait(s->vpu_id);
++
++    for(i=0;i<4;i++)
++        s->num_coeffs[job][i] = 0; // reset all four coefficient counts for this job
++}
++#endif
++
++// I-pred, transform_and_add for all block types done here
++// All ARM
++static void rpi_execute_pred_cmds(HEVCContext * const s)
++{
++  int i;
++  int job = s->pass1_job;
++  const HEVCPredCmd *cmd = s->univ_pred_cmds[job];
++#ifdef RPI_WORKER
++  HEVCLocalContextIntra *lc = &s->HEVClcIntra;
++#else
++  HEVCLocalContext *lc = s->HEVClc;
++#endif
++  // Replay the s->num_pred_cmds[job] queued commands in order, then reset the count
++  for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
++      //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
++
++      switch (cmd->type)
++      {
++          case RPI_PRED_INTRA:
++              lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode;
++              lc->na.cand_bottom_left  = (cmd->na >> 4) & 1; // cmd->na carries the five candidate flags, one per bit
++              lc->na.cand_left         = (cmd->na >> 3) & 1;
++              lc->na.cand_up_left      = (cmd->na >> 2) & 1;
++              lc->na.cand_up           = (cmd->na >> 1) & 1;
++              lc->na.cand_up_right     = (cmd->na >> 0) & 1;
++              s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
++              break;
++
++          case RPI_PRED_TRANSFORM_ADD:
++              s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++#ifdef RPI_PRECLEAR
++              memset((int16_t *)cmd->ta.buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache
++#endif
++              break;
++
++          case RPI_PRED_I_PCM:
++              pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
++              break;
++
++          default:
++              av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
++              abort(); // an unknown command type is treated as fatal
++      }
++  }
++  s->num_pred_cmds[job] = 0;
++}
++
++// Do any inter-pred that we want to do in software
++// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here
++// All ARM
++static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only)
++{
++    unsigned int cidx;
++    AVFrame myref;         // stack frames: only the data[]/linesize[] entries assigned below are set
++    AVFrame myref1;
++    struct MvField mymv;   // likewise only the fields assigned in each case are set
++
++    for(; n>0 ; n--, cmd++) { // walk the n commands starting at cmd
++        switch(cmd->cmd) {
++        case RPI_CMD_LUMA_UNI:
++            if (b_only) // uni commands are skipped when b_only is non-zero
++                break;
++            myref.data[0] = cmd->src;
++            myref.linesize[0] = cmd->srcstride;
++            luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
++            break;
++        case RPI_CMD_LUMA_BI:
++            myref.data[0] = cmd->src;
++            myref.linesize[0] = cmd->srcstride;
++            myref1.data[0] = cmd->src1;
++            myref1.linesize[0] = cmd->srcstride1;
++            mymv.ref_idx[0] = cmd->ref_idx[0];
++            mymv.ref_idx[1] = cmd->ref_idx[1];
++            luma_mc_bi(s, cmd->dst, cmd->dststride,
++                       &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h,
++                       &myref1, &cmd->mv1, &mymv);
++            break;
++        case RPI_CMD_CHROMA_UNI:
++            if (b_only) // uni commands are skipped when b_only is non-zero
++                break;
++            mymv.mv[0] = cmd->mv;
++            chroma_mc_uni(s, cmd->dst,
++                          cmd->dststride, cmd->src, cmd->srcstride, 0,
++                          cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset);
++            break;
++        case RPI_CMD_CHROMA_BI:
++        case RPI_CMD_CHROMA_BI+1:
++            cidx = cmd->cmd - RPI_CMD_CHROMA_BI; // 0 or 1; selects plane cidx+1 below
++            myref.data[cidx+1] = cmd->src;
++            myref.linesize[cidx+1] = cmd->srcstride;
++            myref1.data[cidx+1] = cmd->src1;
++            myref1.linesize[cidx+1] = cmd->srcstride1;
++            mymv.ref_idx[0] = cmd->ref_idx[0];
++            mymv.ref_idx[1] = cmd->ref_idx[1];
++            mymv.mv[0] = cmd->mv;
++            mymv.mv[1] = cmd->mv1;
++            chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1,
++                         cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx);
++            break;
++        } // no default: any other cmd value is ignored
++    }
++}
++
++static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only)
++{
++    const int job = s->pass1_job;
++    // Run the queued luma then chroma commands in software where required; both counts are reset regardless
++    if (!qpu_luma || luma_b_only)
++        do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma); // qpu_luma is the b_only argument: non-zero here only if luma_b_only is set
++    s->num_mv_cmds_y[job] = 0;
++    if (!qpu_chroma || chroma_b_only)
++        do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma); // same convention for chroma
++    s->num_mv_cmds_c[job] = 0;
++}
++
++#endif
++
++#ifdef RPI
++// Set initial uniform job values & zero ctu_count
++static void rpi_begin(HEVCContext *s)
++{
++#if RPI_INTER
++    int job = s->pass0_job;
++    int i;
++
++    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];  // picture size scaled by the plane-1 shifts
++    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
++
++    for(i=0; i < QPU_N_UV;i++) {
++        qpu_mc_pred_c_t * const u = (qpu_mc_pred_c_t *)s->mvs_base[job][i]; // first record of list i holds the setup values
++
++        u->next_fn = 0;
++        u->next_src_x = 0;
++        u->next_src_y = 0;
++        u->next_src_base_u = 0;
++        u->next_src_base_v = 0;
++        u->s.pic_w = pic_width;
++        u->s.pic_h = pic_height;
++        u->s.src_stride = s->frame->linesize[1];
++        u->s.dst_stride = s->frame->linesize[1];
++        u->s.wdenom = s->sh.chroma_log2_weight_denom + 6;
++        u->s.dummy0 = 0;
++        u->s.dummy1 = 0;
++
++        s->u_mvs[job][i] = (uint32_t *)(u + 1); // list cursor starts just after the setup record
++    }
++    s->curr_u_mvs = s->u_mvs[job][0];
++
++    for(i=0;i < QPU_N_Y;i++) {
++        // This needs to have a generally similar structure to the
++        // actual filter code as various pipelined bits need to land correctly
++        // when inserted by the filter requests
++        s->y_mvs[job][i] = s->y_mvs_base[job][i];
++        *s->y_mvs[job][i]++ = 0; // y_x
++        *s->y_mvs[job][i]++ = 0; // ref_y_base
++        *s->y_mvs[job][i]++ = 0; // y2_x2
++        *s->y_mvs[job][i]++ = 0; // ref_y2_base
++        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
++        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
++        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
++        *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6;  // weight denom + 6
++        *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block
++        *s->y_mvs[job][i]++ = 0; // Next kernel
++    }
++    s->curr_y_mvs = s->y_mvs[job][0];
++#endif
++    s->ctu_count = 0;
++}
++#endif
++
++#ifdef RPI_SIMULATE_QPUS
++#error Rotted
++
++static int32_t clipx(int x,int FRAME_WIDTH) // clamp x to [0, FRAME_WIDTH-1]
++{
++	if (x<=0) return 0;
++	if (x>=FRAME_WIDTH) return FRAME_WIDTH-1;
++	return x;
++}
++
++static int32_t clipy(int y,int FRAME_HEIGHT) // clamp y to [0, FRAME_HEIGHT-1]
++{
++	if (y<=0) return 0;
++	if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1;
++	return y;
++}
++
++/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
++{
++   int32_t vsum = 0;
++   int x, y;
++
++   for (y = 0; y < 8; y++) {
++      int32_t hsum = 0;
++
++      for (x = 0; x < 8; x++)
++         hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
++
++      vsum += lumaFilter[my][y]*hsum;
++   }
++   vsum >>= 6;
++   vsum = (((vsum*weight)+round)>>denom)+offset;
++
++   return av_clip_uint8( vsum );
++}*/
++
++static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
++{
++  int32_t vsum = 0;
++  int x, y;
++  int chromaFilterH[4];
++  int chromaFilterV[4];
++  int i;
++  int offset_after = offset_weight>>16;  // high 16 bits: offset added after the shift
++  int weight = (offset_weight<<16)>>16;  // low 16 bits; shift pair is meant to sign-extend. NOTE(review): left-shifting a signed int this way can be UB
++  for(i=0;i<4;i++) {
++    chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24; // unpack byte i of each packed word as a signed tap
++    chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
++  }
++
++   for (y = 0; y < 4; y++) { // 4x4 taps, sample coordinates clamped to the picture
++      int32_t hsum = 0;
++
++      for (x = 0; x < 4; x++)
++         hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
++
++      vsum += chromaFilterV[y]*hsum;
++   }
++   vsum >>= 6;
++   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
++
++   return vsum; // not clipped here; callers in this file apply av_clip_uint8
++}
++
++int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} }; // 8 taps per row, row chosen by a 2-bit index
++
++static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
++{
++  int32_t vsum = 0;
++  int x, y;
++  int i;                                 // unused in this function
++  int offset_after = offset_weight>>16;  // high 16 bits: offset added after the shift
++  int weight = (offset_weight<<16)>>16;  // low 16 bits; shift pair is meant to sign-extend. NOTE(review): left-shifting a signed int this way can be UB
++
++   for (y = 0; y < 8; y++) { // 8x8 taps, sample coordinates clamped to the picture
++      int32_t hsum = 0;
++
++      for (x = 0; x < 8; x++)
++         hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; // horizontal row index: bits 0-1 of my_mx
++
++      vsum += lumaFilter[(my_mx>>8)&3][y]*hsum; // vertical row index: bits 8-9 of my_mx
++   }
++   vsum >>= 6;
++   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
++
++   return vsum; // not clipped here
++}
++
++static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx) // map address p to a pointer inside frame's plane cIdx, or NULL
++{
++  //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
++  int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
++  int pitch = frame->linesize[cIdx];
++  uint32_t base = cIdx == 0 ? get_vc_address_y(frame) :
++    cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
++  if (p>=base && p<base+pitch*pic_height) { // p falls within pitch * pic_height bytes of the plane base
++    return frame->data[cIdx] + (p-base);
++  }
++  return NULL;
++}
++
++static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx) // find which known frame contains address p; exits the process if none does
++{
++  SliceHeader *sh   = &s->sh;
++  uint8_t *arm = test_frame(s,p,s->frame,cIdx); // try the current frame first
++  int i;
++  if (arm) return arm;
++  if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
++  {
++    for(i=0;i<sh->nb_refs[L0];i++) { // then every reference in list 0
++      arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
++      if (arm) return arm;
++    }
++  }
++  if (sh->slice_type == B_SLICE) {
++    for(i=0;i<sh->nb_refs[L1];i++) { // then every reference in list 1
++      arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
++      if (arm) return arm;
++    }
++  }
++  printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
++  exit(-1);
++  return NULL; // not reached: exit() is called above
++}
++
++static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p) // software walk of one chroma command list; sits under "#error Rotted"
++{
++  uint32_t next_kernel;
++  uint32_t x0;
++  uint32_t y0;
++  uint8_t *ref_u_base;
++  uint8_t *ref_v_base;
++  uint32_t frame_width = p[5];   // words 5..11 of the first record are read as setup values
++  uint32_t frame_height = p[6];
++  uint32_t pitch = p[7];
++  uint32_t dst_pitch = p[8];
++  int32_t offset_before = p[9];
++  int32_t denom = p[10];
++  uint32_t vpm_id = p[11];       // read but not used below
++  uint32_t tmp_u_dst[256];       // intermediate results of the b0 pass, indexed x + y*16
++  uint32_t tmp_v_dst[256];
++  while(1) {
++    p += 12;                     // step to the next 12-word record
++    next_kernel = p[0-12];       // kernel and source fields are read from the previous record
++    x0 = p[1-12];
++    y0 = p[2-12];
++    if (next_kernel==s->qpu_filter_uv || next_kernel==s->qpu_filter_uv_b0 || next_kernel==s->qpu_filter_uv_b) {
++      int x,y;
++      uint32_t width_height = p[5];
++      uint32_t hcoeffs = p[6];
++      uint32_t vcoeffs = p[7];
++      uint32_t offset_weight_u = p[8];
++      uint32_t offset_weight_v = p[9];
++      uint8_t *this_u_dst;       // left unset for the b0 kernel, which does not write to the frame
++      uint8_t *this_v_dst;
++      uint32_t width = width_height >> 16;
++      uint32_t height = (width_height << 16) >> 16;
++      ref_u_base = compute_arm_addr(s,p[3-12],1);
++      ref_v_base = compute_arm_addr(s,p[4-12],2);
++      if (next_kernel!=s->qpu_filter_uv_b0)
++      {
++        this_u_dst = compute_arm_addr(s,p[10],1);
++        this_v_dst = compute_arm_addr(s,p[11],2);
++      }
++      for (y=0; y<height; ++y) {
++        for (x=0; x<width; ++x) {
++          if (next_kernel==s->qpu_filter_uv) {            // uni-pred: filter, weight, clip, store
++            int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
++            int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
++            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
++            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
++          } else if (next_kernel==s->qpu_filter_uv_b0) {  // first bi-pred pass: keep the unclipped value in tmp_*_dst
++            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
++            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
++            tmp_u_dst[x+y*16] = refa;
++            tmp_v_dst[x+y*16] = refb;
++          } else {                                        // second bi-pred pass: add the stored value + 64, shift by 7, clip, store
++            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
++            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
++            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
++            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
++          }
++        }
++      }
++    } else {
++      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) ); // only an exit kernel may end the list
++      break;
++    }
++  }
++}
++
++// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
++static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
++{  // Software walk of the luma command words at p; chan is not used in the body
++  uint32_t next_kernel;
++  int y_x,y2_x2;
++  int x0;
++  int y0;
++  int x2;
++  int y2;
++  uint32_t *p0 = p;  // not used below
++  uint8_t *ref_y_base;
++  uint8_t *ref_y2_base;
++  uint32_t frame_width_height = p[4];
++  uint32_t frame_width = frame_width_height>>16;
++  uint32_t frame_height = (frame_width_height<<16)>>16;
++  uint32_t pitch = p[5];
++  uint32_t dst_pitch = p[6];
++  int offset_shift = p[7];
++  int32_t offset_before = offset_shift>>16;
++  int32_t denom = (offset_shift<<16)>>16;
++  while(1) {
++    p += 9;  // step over one 9-word command; p[k-9] below reads the command just passed
++    next_kernel = p[8-9];
++    y_x = p[0-9];
++    x0 = (y_x<<16)>>16;  // low 16 bits, sign-extended
++    y0 = y_x>>16;  // high 16 bits
++    y2_x2 = p[2-9];
++    x2 = (y2_x2<<16)>>16;
++    y2 = y2_x2>>16;
++
++    if (next_kernel==s->qpu_filter || next_kernel==s->qpu_filter_b) {
++      // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++      int x,y;
++      uint32_t width_height = p[4];
++      uint32_t my2_mx2_my_mx = p[5];
++      uint32_t offset_weight = p[6];
++      uint8_t *this_dst = compute_arm_addr(s,p[7],0);
++      uint32_t width = width_height >> 16;
++      uint32_t height = (width_height << 16) >> 16;
++      uint8_t *dst_base = s->frame->data[0];  // not used below
++      ref_y_base = compute_arm_addr(s,p[1-9],0);
++      ref_y2_base = compute_arm_addr(s,p[3-9],0);
++      for (y=0; y<height; ++y) {
++        for (x=0; x<width; ++x) {
++          if (next_kernel==s->qpu_filter) {
++            int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
++            refa = av_clip_uint8(refa);
++            this_dst[x+y*dst_pitch] = refa;
++          }
++          else {
++            int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
++            int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
++            this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
++          }
++        }
++      }
++    } else {
++      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
++      break;
++    }
++  }
++}
++
++static void rpi_simulate_inter_qpu(HEVCContext *s)
++{
++  // First run the transform as normal
++  int i;
++  rpi_execute_transform(s);
++  for(i=0;i<8;i++)  // stream count is hard-coded here rather than taken from QPU_N_UV
++  {
++    rpi_simulate_inter_chroma(s,s->mvs_base[i]);  // NOTE(review): mvs_base is [RPI_MAX_JOBS][QPU_N_UV] in hevcdec.h, so [i] picks a job row - confirm
++  }
++  for(i=0;i<12;i++)  // stream count is hard-coded here rather than taken from QPU_N_Y
++  {
++    rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);  // NOTE(review): same indexing question as the chroma call
++  }
++}
++
++#endif
++
++
++#if RPI_INTER
++static unsigned int mc_terminate_y(HEVCContext * const s, const int job)
++{  // Writes an exit command into each luma queue of this job; returns tc, which the caller tests against 0
++    unsigned int i;
++    const uint32_t exit_fn = qpu_fn(mc_exit);
++    const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12);  // written to the last queue only
++    const uint32_t dummy_texture = qpu_fn(mc_setup_uv);
++    unsigned int tc = 0;
++
++    // Add final commands to Q
++    for(i = 0; i != QPU_N_Y; ++i) {
++        uint32_t * const pu = s->y_mvs[job][i] - RPI_LUMA_COMMAND_WORDS;  // command slot just before the queue's next-free pointer
++        const int cmd_count = pu - s->y_mvs_base[job][i];  // offset of that slot from the queue base, in uint32_t words
++        tc += cmd_count;
++
++        av_assert0(cmd_count < Y_COMMANDS_PER_QPU - 1);
++
++        // We use this code as a dummy texture - safe?
++        pu[0] = 0; // x,y
++        pu[1] = dummy_texture;
++        pu[2] = 0;
++        pu[3] = dummy_texture;
++        pu[RPI_LUMA_COMMAND_WORDS - 1] = (i != QPU_N_Y - 1) ? exit_fn : exit_fn2;  // Actual fn ptr
++    }
++
++    return tc;
++}
++
++static unsigned int mc_terminate_uv(HEVCContext * const s, const int job)
++{  // Writes an exit record into each chroma queue of this job; returns tc, which the caller tests against 0
++    unsigned int i;
++    const uint32_t exit_fn = qpu_fn(mc_exit_c);
++#if QPU_N_UV == 8
++    const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit8c);
++#elif QPU_N_UV == 12
++    const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12c);
++#else
++#error Need appropriate exit code
++#endif
++    const uint32_t dummy_texture = qpu_fn(mc_setup_uv);
++    unsigned int tc = 0;
++
++    // Add final commands to Q
++    for(i = 0; i != QPU_N_UV; ++i) {
++        qpu_mc_pred_c_t * const pu = (qpu_mc_pred_c_t *)s->u_mvs[job][i] - 1;  // record just before the queue's next-free pointer
++        const int cmd_count = (uint32_t *)pu - s->mvs_base[job][i];  // offset of that record from the queue base, in uint32_t words
++        tc += cmd_count;
++
++        pu->next_fn = (i != QPU_N_UV - 1) ? exit_fn : exit_fn2;  // Actual fn ptr
++        // Need to set the src to something that can be (pointlessly) prefetched
++        pu->next_src_x = 0;
++        pu->next_src_y = 0;
++        // We use this code as a dummy texture - safe?
++        pu->next_src_base_u = dummy_texture;
++        pu->next_src_base_v = dummy_texture;
++    }
++
++    return tc;
++}
++#endif
++
++#ifdef RPI
++
++
++static void flush_frame(HEVCContext *s,AVFrame *frame)
++{  // Builds and runs a one-entry cache flush (WB_INVALIDATE mode) for frame; s is not used
++  rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
++  rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++  rpi_cache_flush_finish(rfe);
++}
++
++
++// Core execution tasks for job s->pass1_job: queue VPU/QPU work, submit it, then run the inter, pred and deblock passes here
++static void worker_core(HEVCContext * const s)
++{
++    worker_global_env_t * const wg = &worker_global_env;
++    int arm_cost = 0;
++//    vpu_qpu_wait_h sync_c;
++    vpu_qpu_wait_h sync_y;
++    int qpu_luma = 0;
++    int qpu_chroma = 0;
++    int gpu_load;
++    int arm_load;
++    static const int arm_const_cost = 2;
++
++//    static int z = 0;
++
++    const int job = s->pass1_job;
++    unsigned int flush_start = 0;
++    unsigned int flush_count = 0;
++
++    const vpu_qpu_job_h vqj = vpu_qpu_job_new();
++    rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
++
++    if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) {
++        vpu_qpu_job_add_vpu(vqj,
++            vpu_get_fn(),
++            vpu_get_constants(),
++            s->coeffs_buf_vc[job][2],
++            s->num_coeffs[job][2] >> 8,
++            s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
++            s->num_coeffs[job][3] >> 10,
++            0);
++
++        rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++    }
++
++
++#if RPI_INTER
++    pthread_mutex_lock(&wg->lock);
++
++//    ++z;
++    gpu_load = vpu_qpu_current_load();
++    arm_load = avpriv_atomic_int_get(&wg->arm_load);
++#if !Y_B_ONLY
++    qpu_luma =  gpu_load + 2 < arm_load;
++    qpu_chroma = gpu_load < arm_load + 8;
++#elif 1
++    qpu_luma =  gpu_load < arm_load + 2;
++    qpu_chroma = gpu_load < arm_load + 8;
++#else
++    qpu_chroma = 1;
++    qpu_luma = 1;
++#endif
++
++    arm_cost = !qpu_chroma * 2 + !qpu_luma * 3;  // 2 if chroma stays on the ARM, plus 3 if luma does
++    avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost);
++
++    wg->gpu_c += qpu_chroma;
++    wg->gpu_y += qpu_luma;
++    wg->arm_c += !qpu_chroma;
++    wg->arm_y += !qpu_luma;
++
++
++//    if ((z & 511) == 0) {
++//        printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d    \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y);
++//    }
++
++
++    {
++        int (*d)[2] = s->dblk_cmds[job];
++        unsigned int high=(*d)[1];
++        int n;
++        // Scan this job's deblock list for the lowest and highest CTB y
++        flush_start = high;
++        for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
++            unsigned int y = (*d)[1];
++            flush_start = FFMIN(flush_start, y);
++            high=FFMAX(high,y);
++        }
++        // Avoid flushing past end of frame
++        flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start;
++    }
++
++    if (qpu_chroma && mc_terminate_uv(s, job) != 0)
++    {
++        uint32_t * const unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
++        const uint32_t code = qpu_fn(mc_setup_uv);
++        uint32_t * p;
++        unsigned int i;
++        uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
++
++        for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
++            *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm));
++            *p++ = code;
++        }
++
++        vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv);
++
++#if RPI_CACHE_UNIF_MVS
++        rpi_cache_flush_add_gm_ptr(rfe, s->unif_mvs_ptr + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++#endif
++        rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++          flush_start, flush_count, s->ps.sps->vshift[1], 0, 1);
++    }
++
++// We can take a sync here and try to locally overlap QPU processing with ARM
++// but testing showed a slightly negative benefit with noticeable extra complexity
++//    vpu_qpu_job_add_sync_this(vqj, &sync_c);
++
++    if (qpu_luma && mc_terminate_y(s, job) != 0)
++    {
++        uint32_t * const y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
++        const uint32_t code = qpu_fn(mc_setup);
++        uint32_t * p;
++        unsigned int i;
++        uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS];
++
++        for (p = mail_y, i = 0; i != QPU_N_Y; ++i) {
++            *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm));
++            *p++ = code;
++        }
++
++        vpu_qpu_job_add_qpu(vqj, QPU_N_Y, 4, mail_y);
++
++#if RPI_CACHE_UNIF_MVS
++        rpi_cache_flush_add_gm_ptr(rfe, s->y_unif_mvs_ptr + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++#endif
++        rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++          flush_start, flush_count, s->ps.sps->vshift[1], 1, 0);
++    }
++
++    pthread_mutex_unlock(&wg->lock);
++
++#endif
++
++    vpu_qpu_job_add_sync_this(vqj, &sync_y);
++
++    // Having accumulated some commands - do them
++    rpi_cache_flush_finish(rfe);
++    vpu_qpu_job_finish(vqj);
++
++    memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job]));  //???? Surely we haven't done the smaller
++
++#if Y_B_ONLY
++    if (qpu_luma)
++        vpu_qpu_wait(&sync_y);
++#endif
++    // Perform inter prediction
++    rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0);
++
++    // Wait for transform completion
++
++    // Perform intra prediction and residual reconstruction
++    avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost);
++#if Y_B_ONLY
++    if (!qpu_luma)
++        vpu_qpu_wait(&sync_y);
++#else
++    vpu_qpu_wait(&sync_y);
++#endif
++    rpi_execute_pred_cmds(s);
++
++    // Perform deblocking for CTBs in this row
++    rpi_execute_dblk_cmds(s);
++
++    avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost);
++}
++
++static void rpi_do_all_passes(HEVCContext *s)
++{  // Runs the current job to completion on the calling thread, then resets for the next one
++    // Do the various passes - common with the worker code
++    worker_core(s);
++    // Prepare next batch
++    rpi_begin(s);
++}
++
++
++
++#endif
++
+ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+ {
+     HEVCContext *s  = avctxt->priv_data;
+@@ -2315,6 +3966,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+     int y_ctb       = 0;
+     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+ 
++#ifdef RPI
++    s->enable_rpi = s->ps.sps->bit_depth == 8
++                    && !s->ps.pps->cross_component_prediction_enabled_flag;
++
++    if (!s->enable_rpi) {
++      if (s->ps.pps->cross_component_prediction_enabled_flag)
++        printf("Cross component\n");
++    }
++#endif
++    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L0],s->sh.nb_refs[L1]);
++
+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+         return AVERROR_INVALIDDATA;
+@@ -2328,6 +3990,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+         }
+     }
+ 
++#ifdef RPI_WORKER
++    s->pass0_job = 0;
++    s->pass1_job = 0;
++#endif
++#ifdef RPI
++    rpi_begin(s);
++#endif
++
+     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
+         int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+ 
+@@ -2335,6 +4005,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+         y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
+         hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts);
+ 
++
+         ff_hevc_cabac_init(s, ctb_addr_ts);
+ 
+         hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size);
+@@ -2343,7 +4014,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+         s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
+         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
+ 
++#if RPI_INTER
++        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % QPU_N_UV];
++        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % QPU_N_Y];
++#endif
++
+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
++
++#ifdef RPI
++#if RPI_INTER
++        s->u_mvs[s->pass0_job][s->ctu_count % QPU_N_UV]= s->curr_u_mvs;
++        s->y_mvs[s->pass0_job][s->ctu_count % QPU_N_Y] = s->curr_y_mvs;
++#endif
++
++        if (s->enable_rpi) {
++          //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
++          //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
++          //av_assert0(s->pass0_job<RPI_MAX_JOBS);
++          //av_assert0(s->pass0_job>=0);
++          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
++          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
++          s->ctu_count++;
++
++          if ( s->ctu_count >= s->max_ctu_count ) {
++#ifdef RPI_WORKER
++            if (s->used_for_ref)
++            {
++//              printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb);
++
++//                worker_wait(s);
++              // Split work load onto separate threads so we make as rapid progress as possible with this frame
++              // Pass on this job to worker thread
++              worker_submit_job(s);
++
++              // Make sure we have space to prepare the next job
++              worker_pass0_ready(s);
++
++              // Prepare the next batch of commands
++              rpi_begin(s);
++            } else {
++              // Non-ref frame so do it all on this thread
++              rpi_do_all_passes(s);
++            }
++#else
++            rpi_do_all_passes(s);
++#endif
++          }
++
++        }
++#endif
++
++
+         if (more_data < 0) {
+             s->tab_slice_address[ctb_addr_rs] = -1;
+             return more_data;
+@@ -2352,9 +4073,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+ 
+         ctb_addr_ts++;
+         ff_hevc_save_states(s, ctb_addr_ts);
++#ifdef RPI
++        if (s->enable_rpi)
++            continue;
++#endif
+         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
+     }
+ 
++#ifdef RPI
++
++#ifdef RPI_WORKER
++    // Wait for the worker to finish all its jobs
++    if (s->enable_rpi) {
++        worker_wait(s);
++    }
++#endif
++
++    // Finish off any half-completed rows
++    if (s->enable_rpi && s->ctu_count) {
++        rpi_do_all_passes(s);
++    }
++
++#endif
++
+     if (x_ctb + ctb_size >= s->ps.sps->width &&
+         y_ctb + ctb_size >= s->ps.sps->height)
+         ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
+@@ -2389,6 +4130,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
+     s = s1->sList[self_id];
+     lc = s->HEVClc;
+ 
++#ifdef RPI
++    s->enable_rpi = 0;
++    //printf("Wavefront\n");
++#endif
++
+     if(ctb_row) {
+         ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
+ 
+@@ -2771,6 +4517,20 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
+         if (ret < 0)
+             return ret;
+ 
++        // The definition of _N unit types is "non-reference for other frames
++        // with the same temporal_id" so they may/will be ref frames for pics
++        // with a higher temporal_id.
++        s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
++            !(s->nal_unit_type == HEVC_NAL_TRAIL_N ||
++                        s->nal_unit_type == HEVC_NAL_TSA_N   ||
++                        s->nal_unit_type == HEVC_NAL_STSA_N  ||
++                        s->nal_unit_type == HEVC_NAL_RADL_N  ||
++                        s->nal_unit_type == HEVC_NAL_RASL_N);
++
++        if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
++            s->is_decoded = 0;
++            break;
++        }
+         if (s->max_ra == INT_MAX) {
+             if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) {
+                 s->max_ra = s->poc;
+@@ -2894,10 +4654,18 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
+         }
+     }
+ 
+-fail:
+-    if (s->ref && s->threads_type == FF_THREAD_FRAME)
++fail:  // Also success path
++    if (s->ref && s->threads_type == FF_THREAD_FRAME) {
++#if RPI_INTER
++        rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height);
++#endif
+         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+-
++    } else if (s->ref) {
++#if RPI_INTER
++      // When running single threaded we need to flush the whole frame
++      flush_frame(s,s->frame);
++#endif
++    }
+     return ret;
+ }
+ 
+@@ -3150,6 +4918,41 @@ fail:
+     return AVERROR(ENOMEM);
+ }
+ 
++#ifdef RPI_WORKER
++static av_cold void hevc_init_worker(HEVCContext *s)
++{  // Creates the worker's condvars and mutex, clears its counters, then starts worker_start(s)
++    int err;
++    pthread_cond_init(&s->worker_cond_head, NULL);
++    pthread_cond_init(&s->worker_cond_tail, NULL);
++    pthread_mutex_init(&s->worker_mutex, NULL);
++    // Clear the job counters and the kill flag before the thread is started
++    s->worker_tail=0;
++    s->worker_head=0;
++    s->kill_worker=0;
++    err = pthread_create(&s->worker_thread, NULL, worker_start, s);
++    if (err) {
++        printf("Failed to create worker thread\n");
++        exit(-1);  // NOTE(review): terminates the whole process; the void return leaves no way to report failure
++    }
++}
++
++static av_cold void hevc_exit_worker(HEVCContext *s)
++{  // Asks the worker thread to stop, joins it, then destroys its condvars and mutex
++    pthread_mutex_lock(&s->worker_mutex);  // flag + signal under the lock so the wakeup cannot be lost (assumes the worker waits with worker_mutex held)
++    s->kill_worker=1;
++    pthread_cond_broadcast(&s->worker_cond_tail);
++    pthread_mutex_unlock(&s->worker_mutex);
++    pthread_join(s->worker_thread, NULL);  // the thread's return value is not needed
++
++    pthread_cond_destroy(&s->worker_cond_head);
++    pthread_cond_destroy(&s->worker_cond_tail);
++    pthread_mutex_destroy(&s->worker_mutex);
++    s->worker_tail=0;
++    s->worker_head=0;
++    s->kill_worker=0;
++}
++#endif
++
+ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+ {
+     HEVCContext       *s = avctx->priv_data;
+@@ -3161,6 +4964,33 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+ 
+     av_freep(&s->cabac_state);
+ 
++#ifdef RPI
++
++#ifdef RPI_WORKER
++    hevc_exit_worker(s);
++#endif
++
++    for(i=0;i<RPI_MAX_JOBS;i++) {
++      av_freep(&s->unif_mv_cmds_y[i]);
++      av_freep(&s->unif_mv_cmds_c[i]);
++      av_freep(&s->univ_pred_cmds[i]);
++
++#if RPI_INTER
++      if (s->unif_mvs[i]) {
++        gpu_free( &s->unif_mvs_ptr[i] );
++        s->unif_mvs[i] = 0;
++      }
++      if (s->y_unif_mvs[i]) {
++        gpu_free( &s->y_unif_mvs_ptr[i] );
++        s->y_unif_mvs[i] = 0;
++      }
++#endif
++    }
++
++    vpu_qpu_term();
++
++#endif
++
+     for (i = 0; i < 3; i++) {
+         av_freep(&s->sao_pixel_buffer_h[i]);
+         av_freep(&s->sao_pixel_buffer_v[i]);
+@@ -3202,10 +5032,25 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+     return 0;
+ }
+ 
++#ifdef RPI
++#ifdef RPI_PRECLEAR
++static av_cold void memclear16(int16_t *p, int n)
++{  // Hands the clear of n int16_t values at p to VPU code; only built when RPI_PRECLEAR is defined
++  vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
++  // Intended effect, per the ARM loop this call stands in for:
++  //   for (i = 0; i < n; i++)
++  //     p[i] = 0;
++}
++#endif
++#endif
++
+ static av_cold int hevc_init_context(AVCodecContext *avctx)
+ {
+     HEVCContext *s = avctx->priv_data;
+     int i;
++#ifdef RPI
++    unsigned int job;
++#endif
+ 
+     s->avctx = avctx;
+ 
+@@ -3215,6 +5060,82 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+     s->HEVClcList[0] = s->HEVClc;
+     s->sList[0] = s;
+ 
++#ifdef RPI
++    // Whilst FFmpegs init fn is only called once the close fn is called as
++    // many times as we have threads (init_thread_copy is called for the
++    // threads).  So to match init & term put the init here where it will be
++    // called by both init & copy
++    if (vpu_qpu_init() != 0)
++        goto fail;
++
++    for(job = 0; job < RPI_MAX_JOBS; job++) {
++        s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y);
++        if (!s->unif_mv_cmds_y[job])
++            goto fail;
++        s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C);
++        if (!s->unif_mv_cmds_c[job])
++            goto fail;
++        s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
++        if (!s->univ_pred_cmds[job])
++            goto fail;
++    }
++
++#if RPI_INTER
++    // We divide the image into blocks 256 wide and 64 high
++    // We support up to 2048 widths
++    // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
++    // Also add space for the startup command for each stream.
++
++    for (job = 0; job < RPI_MAX_JOBS; job++) {
++        uint32_t *p;
++#if RPI_CACHE_UNIF_MVS
++        gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
++#else
++        gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
++#endif
++        s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
++
++        // Set up initial locations for uniform streams
++        p = s->unif_mvs[job];
++        for(i = 0; i < QPU_N_UV; i++) {
++            s->mvs_base[job][i] = p;
++            p += UV_COMMANDS_PER_QPU;
++        }
++    }
++    s->qpu_filter_uv = qpu_fn(mc_filter_uv);
++    s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0);
++    s->qpu_filter_uv_b = qpu_fn(mc_filter_uv_b);
++
++    for (job=0; job < RPI_MAX_JOBS; job++)
++    {
++        uint32_t *p;
++#if RPI_CACHE_UNIF_MVS
++        gpu_malloc_cached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
++#else
++        gpu_malloc_uncached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
++#endif
++        s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
++
++        // Set up initial locations for uniform streams
++        p = s->y_unif_mvs[job];
++        for(i = 0; i < QPU_N_Y; i++) {
++            s->y_mvs_base[job][i] = p;
++            p += Y_COMMANDS_PER_QPU;
++        }
++    }
++    s->qpu_filter = qpu_fn(mc_filter);
++    s->qpu_filter_b = qpu_fn(mc_filter_b);
++#endif
++    //gpu_malloc_uncached(2048*64,&s->dummy);
++
++    s->enable_rpi = 0;
++
++#ifdef RPI_WORKER
++    hevc_init_worker(s);
++#endif
++
++#endif
++
+     s->cabac_state = av_malloc(HEVC_CONTEXTS);
+     if (!s->cabac_state)
+         goto fail;
+@@ -3357,9 +5278,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
+     }
+ 
+     if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
+-            s->threads_type = FF_THREAD_FRAME;
+-        else
+-            s->threads_type = FF_THREAD_SLICE;
++        s->threads_type = FF_THREAD_FRAME;
++    else
++        s->threads_type = FF_THREAD_SLICE;
+ 
+     return 0;
+ }
+@@ -3418,6 +5339,8 @@ AVCodec ff_hevc_decoder = {
+     .update_thread_context = hevc_update_thread_context,
+     .init_thread_copy      = hevc_init_thread_copy,
+     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
++//                             0,
++//                             AV_CODEC_CAP_FRAME_THREADS,
+                              AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS,
+     .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE,
+     .profiles              = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
+diff --git b/libavcodec/hevcdec.h a/libavcodec/hevcdec.h
+index 0c78812..e068936 100644
+--- b/libavcodec/hevcdec.h
++++ a/libavcodec/hevcdec.h
+@@ -334,17 +334,6 @@ typedef struct CodingUnit {
+     uint8_t cu_transquant_bypass_flag;
+ } CodingUnit;
+ 
+-typedef struct Mv {
+-    int16_t x;  ///< horizontal component of motion vector
+-    int16_t y;  ///< vertical component of motion vector
+-} Mv;
+-
+-typedef struct MvField {
+-    DECLARE_ALIGNED(4, Mv, mv)[2];
+-    int8_t ref_idx[2];
+-    int8_t pred_flag;
+-} MvField;
+-
+ typedef struct NeighbourAvailable {
+     int cand_bottom_left;
+     int cand_left;
+@@ -421,7 +410,17 @@ typedef struct HEVCFrame {
+     uint8_t flags;
+ } HEVCFrame;
+ 
++#ifdef RPI_WORKER
++typedef struct HEVCLocalContextIntra {  // tu, na must stay in the same order as the first members of HEVCLocalContext
++    TransformUnit tu;
++    NeighbourAvailable na;
++} HEVCLocalContextIntra;
++#endif
++
+ typedef struct HEVCLocalContext {
++    TransformUnit tu;  // Moved to start to match HEVCLocalContextIntra (yuk!)
++    NeighbourAvailable na;
++
+     uint8_t cabac_state[HEVC_CONTEXTS];
+ 
+     uint8_t stat_coeff[4];
+@@ -436,8 +435,6 @@ typedef struct HEVCLocalContext {
+ 
+     int qPy_pred;
+ 
+-    TransformUnit tu;
+-
+     uint8_t ctb_left_flag;
+     uint8_t ctb_up_flag;
+     uint8_t ctb_up_right_flag;
+@@ -453,7 +450,6 @@ typedef struct HEVCLocalContext {
+     int ct_depth;
+     CodingUnit cu;
+     PredictionUnit pu;
+-    NeighbourAvailable na;
+ 
+ #define BOUNDARY_LEFT_SLICE     (1 << 0)
+ #define BOUNDARY_LEFT_TILE      (1 << 1)
+@@ -464,6 +460,89 @@ typedef struct HEVCLocalContext {
+     int boundary_flags;
+ } HEVCLocalContext;
+ 
++#ifdef RPI
++
++// The processing is done in chunks
++// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma)
++// This is a distance of 1536 pixels across the screen (NOTE(review): RPI_CHUNK_SIZE below is 12, i.e. 768 pixels - confirm)
++// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
++// but allocate more memory and increase the latency before data in the next frame can be processed
++#define RPI_NUM_CHUNKS 4
++#define RPI_CHUNK_SIZE 12
++
++// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE)
++
++// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
++#define RPI_MAX_MV_CMDS_Y   (2*16*1*(RPI_MAX_WIDTH/4))
++#define RPI_MAX_MV_CMDS_C   (2*16*2*(RPI_MAX_WIDTH/4))
++// Each block can have an intra prediction and a transform_add command
++#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
++// Worst case is 16x16 CTUs
++#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
++
++#define RPI_CMD_LUMA_UNI 0
++#define RPI_CMD_CHROMA_UNI 1
++#define RPI_CMD_LUMA_BI 2
++#define RPI_CMD_CHROMA_BI 3
++#define RPI_CMD_V_BI 4
++
++// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
++// #define RPI_PRECLEAR
++
++// Command for inter prediction
++typedef struct HEVCMvCmd {
++    uint8_t cmd;  // presumably one of the RPI_CMD_* codes defined above - confirm
++    uint8_t block_w;
++    uint8_t block_h;
++    int8_t ref_idx[2];
++    uint16_t dststride;
++    uint16_t srcstride;
++    uint16_t srcstride1;  // stride that pairs with src1, going by the name - confirm
++    int16_t weight;
++    int16_t offset;
++    int16_t x_off;
++    int16_t y_off;
++    uint8_t *src;
++    uint8_t *src1;  // second source; presumably only meaningful for the *_BI commands - confirm
++    uint8_t *dst;
++    Mv mv;
++    Mv mv1;
++} HEVCMvCmd;
++
++
++// Command for intra prediction and transform_add of predictions to coefficients
++#define RPI_PRED_TRANSFORM_ADD 0
++#define RPI_PRED_INTRA 1
++#define RPI_PRED_I_PCM 2
++
++typedef struct HEVCPredCmd {
++    uint8_t type;  // one of the RPI_PRED_* codes above; the union members are labelled to match
++    uint8_t size;  // log2 "size" used by all variants
++    uint8_t na;    // i_pred - but left here as they pack well
++    uint8_t c_idx; // i_pred
++    union {  // anonymous, so members are reached directly as cmd.ta / cmd.i_pred / cmd.i_pcm
++        struct {  // TRANSFORM_ADD
++            uint8_t * dst;
++            const int16_t * buf;
++            uint32_t stride;
++        } ta;
++        struct {  // INTRA
++            uint16_t x;
++            uint16_t y;
++            enum IntraPredMode mode;
++        } i_pred;
++        struct {  // I_PCM
++            uint16_t x;
++            uint16_t y;
++            const void * src;
++            uint32_t src_len;
++        } i_pcm;
++    };
++} HEVCPredCmd;
++
++#endif
++
+ typedef struct HEVCContext {
+     const AVClass *c;  // needed by private avoptions
+     AVCodecContext *avctx;
+@@ -472,6 +551,9 @@ typedef struct HEVCContext {
+ 
+     HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
+     HEVCLocalContext    *HEVClc;
++#ifdef RPI_WORKER
++    HEVCLocalContextIntra HEVClcIntra;
++#endif
+ 
+     uint8_t             threads_type;
+     uint8_t             threads_number;
+@@ -479,6 +561,98 @@ typedef struct HEVCContext {
+     int                 width;
+     int                 height;
+ 
++    int used_for_ref;  // rpi
++#ifdef RPI
++    int enable_rpi;
++    HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS];
++    HEVCMvCmd *unif_mv_cmds_c[RPI_MAX_JOBS];
++    HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
++    int buf_width;
++    GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
++    GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS];
++    int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4];
++    unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
++    int num_coeffs[RPI_MAX_JOBS][4];
++    int num_xfm_cmds[RPI_MAX_JOBS];
++    int num_mv_cmds_y[RPI_MAX_JOBS];
++    int num_mv_cmds_c[RPI_MAX_JOBS];
++    int num_pred_cmds[RPI_MAX_JOBS];
++    int num_dblk_cmds[RPI_MAX_JOBS];
++    int vpu_id;
++    int pass0_job; // Pass0 does coefficient decode
++    int pass1_job; // Pass1 does pixel processing
++    int ctu_count; // Number of CTUs done in pass0 so far
++    int max_ctu_count; // Number of CTUs when we trigger a round of processing
++    int ctu_per_y_chan; // Number of CTUs per luma QPU
++    int ctu_per_uv_chan; // Number of CTUs per chroma QPU
++
++#if RPI_INTER
++    GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
++    uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
++
++    // _base pointers are to the start of the row
++    uint32_t *mvs_base[RPI_MAX_JOBS][QPU_N_UV];
++    // these pointers are to the next free space
++    uint32_t *u_mvs[RPI_MAX_JOBS][QPU_N_UV];
++    uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
++    // Function pointers
++    uint32_t qpu_filter_uv;
++    uint32_t qpu_filter_uv_b0;
++    uint32_t qpu_filter_uv_b;
++
++    GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
++    uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
++    uint32_t *y_mvs_base[RPI_MAX_JOBS][QPU_N_Y];
++    uint32_t *y_mvs[RPI_MAX_JOBS][QPU_N_Y];
++    uint32_t *curr_y_mvs; // Current uniform stream for luma
++    // Function pointers
++    uint32_t qpu_filter;
++    uint32_t qpu_filter_b;
++#endif
++
++#ifdef RPI_WORKER
++    pthread_t worker_thread;
++    pthread_cond_t worker_cond_head;
++    pthread_cond_t worker_cond_tail;
++    pthread_mutex_t worker_mutex;
++
++    int worker_tail; // Contains the number of posted jobs
++    int worker_head; // Contains the number of completed jobs
++    int kill_worker; // set to 1 to terminate the worker
++#endif
++
++#define RPI_DEBLOCK_VPU_Q_COUNT 2
++
++#ifdef RPI_DEBLOCK_VPU
++    int enable_rpi_deblock;
++
++    int uv_setup_width;
++    int uv_setup_height;
++    int setup_width; // Number of 16x16 blocks across the image
++    int setup_height; // Number of 16x16 blocks down the image
++
++    struct dblk_vpu_q_s
++    {
++        GPU_MEM_PTR_T deblock_vpu_gmem;
++
++        uint8_t (*y_setup_arm)[2][2][2][4];
++        uint8_t (*y_setup_vc)[2][2][2][4];
++
++        uint8_t (*uv_setup_arm)[2][2][2][4];  // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned
++        uint8_t (*uv_setup_vc)[2][2][2][4];
++
++        int (*vpu_cmds_arm)[6]; // r0-r5 for each command
++        int vpu_cmds_vc;
++
++        vpu_qpu_wait_h cmd_id;
++    } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
++
++    struct dblk_vpu_q_s * dvq;
++    unsigned int dvq_n;
++
++#endif
++#endif
++
+     uint8_t *cabac_state;
+ 
+     /** 1 if the independent slice segment header was successfully parsed */
+@@ -596,6 +770,9 @@ typedef struct HEVCContext {
+     uint32_t max_mastering_luminance;
+     uint32_t min_mastering_luminance;
+ 
++#ifdef RPI
++    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
++#endif
+ } HEVCContext;
+ 
+ int ff_hevc_decode_nal_sei(HEVCContext *s);
+@@ -703,6 +880,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+ 
+ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
+ 
++#if RPI_INTER
++extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n);
++#endif
++
++
+ /**
+  * Reset SEI values that are stored on the Context.
+  * e.g. Caption data that was extracted during NAL
+@@ -716,4 +898,8 @@ extern const uint8_t ff_hevc_qpel_extra_before[4];
+ extern const uint8_t ff_hevc_qpel_extra_after[4];
+ extern const uint8_t ff_hevc_qpel_extra[4];
+ 
++#ifdef RPI
++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n);
++#endif
++
+ #endif /* AVCODEC_HEVCDEC_H */
+diff --git b/libavcodec/hevcdsp.c a/libavcodec/hevcdsp.c
+index 23e923f..a985f02 100644
+--- b/libavcodec/hevcdsp.c
++++ a/libavcodec/hevcdsp.c
 @@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
  #include "hevcdsp_template.c"
  #undef BIT_DEPTH
@@ -7068,10 +7466,10 @@ index 9d773d9..a6534a9 100644
      if (ARCH_X86)
          ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
      if (ARCH_ARM)
-diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
-index 9f1f6dd..e221e54 100644
---- a/libavcodec/hevcdsp.h
-+++ b/libavcodec/hevcdsp.h
+diff --git b/libavcodec/hevcdsp.h a/libavcodec/hevcdsp.h
+index eefb3cd..a41aa09 100644
+--- b/libavcodec/hevcdsp.h
++++ a/libavcodec/hevcdsp.h
 @@ -42,6 +42,17 @@ typedef struct SAOParams {
      uint8_t type_idx[3];    ///< sao_type_idx
  } SAOParams;
@@ -7100,10 +7498,10 @@ index 9f1f6dd..e221e54 100644
  } HEVCDSPContext;
  
  void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
+diff --git b/libavcodec/hevcpred_template.c a/libavcodec/hevcpred_template.c
 index 6ae87cc..28d2653 100644
---- a/libavcodec/hevcpred_template.c
-+++ b/libavcodec/hevcpred_template.c
+--- b/libavcodec/hevcpred_template.c
++++ a/libavcodec/hevcpred_template.c
 @@ -20,6 +20,8 @@
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
@@ -7137,10 +7535,43 @@ index 6ae87cc..28d2653 100644
      if (s->ps.pps->constrained_intra_pred_flag == 1) {
          int size_in_luma_pu_v = PU(size_in_luma_v);
          int size_in_luma_pu_h = PU(size_in_luma_h);
-diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
-index 099a8c5..bdff2d2 100644
---- a/libavcodec/mmaldec.c
-+++ b/libavcodec/mmaldec.c
+diff --git b/libavcodec/mjpegenc_common.c a/libavcodec/mjpegenc_common.c
+index 6d9c982..83a9e95 100644
+--- b/libavcodec/mjpegenc_common.c
++++ a/libavcodec/mjpegenc_common.c
+@@ -91,17 +91,13 @@ static void jpeg_table_header(AVCodecContext *avctx, PutBitContext *p,
+ {
+     int i, j, size;
+     uint8_t *ptr;
+-    MpegEncContext *s = NULL;
+-
+-    /* Since avctx->priv_data will point to LJpegEncContext in this case */
+-    if (avctx->codec_id != AV_CODEC_ID_LJPEG)
+-        s = avctx->priv_data;
++    MpegEncContext *s = avctx->priv_data;
+ 
+     if (avctx->codec_id != AV_CODEC_ID_LJPEG) {
+         int matrix_count = 1 + !!memcmp(luma_intra_matrix,
+                                         chroma_intra_matrix,
+                                         sizeof(luma_intra_matrix[0]) * 64);
+-    if (s && s->force_duplicated_matrix)
++    if (s->force_duplicated_matrix)
+         matrix_count = 2;
+     /* quant matrixes */
+     put_marker(p, DQT);
+@@ -138,7 +134,7 @@ static void jpeg_table_header(AVCodecContext *avctx, PutBitContext *p,
+ 
+     // Only MJPEG can have a variable Huffman variable. All other
+     // formats use the default Huffman table.
+-    if (s && s->huffman == HUFFMAN_TABLE_OPTIMAL) {
++    if (s->out_format == FMT_MJPEG && s->huffman == HUFFMAN_TABLE_OPTIMAL) {
+         size += put_huffman_table(p, 0, 0, s->mjpeg_ctx->bits_dc_luminance,
+                                   s->mjpeg_ctx->val_dc_luminance);
+         size += put_huffman_table(p, 0, 1, s->mjpeg_ctx->bits_dc_chrominance,
+diff --git b/libavcodec/mmaldec.c a/libavcodec/mmaldec.c
+index 81fcebc..7858478 100644
+--- b/libavcodec/mmaldec.c
++++ a/libavcodec/mmaldec.c
 @@ -24,6 +24,9 @@
   * MMAL Video Decoder
   */
@@ -7156,14 +7587,14 @@ index 099a8c5..bdff2d2 100644
  #include <interface/mmal/util/mmal_default_components.h>
  #include <interface/mmal/vc/mmal_vc_api.h>
 +#pragma GCC diagnostic pop
+ #include <stdatomic.h>
  
  #include "avcodec.h"
- #include "internal.h"
-diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
-index 3adf28d..2f9195f 100644
---- a/libavcodec/mpeg4videodec.c
-+++ b/libavcodec/mpeg4videodec.c
-@@ -2205,6 +2205,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
+diff --git b/libavcodec/mpeg4videodec.c a/libavcodec/mpeg4videodec.c
+index 791a07b..502c21f 100644
+--- b/libavcodec/mpeg4videodec.c
++++ a/libavcodec/mpeg4videodec.c
+@@ -2249,6 +2249,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
  
          if (ctx->divx_version >= 0)
              s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
@@ -7173,7 +7604,7 @@ index 3adf28d..2f9195f 100644
      }
  
      if (s->workaround_bugs & FF_BUG_STD_QPEL) {
-@@ -2229,6 +2232,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
+@@ -2273,6 +2276,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
                 s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
                 ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
  
@@ -7181,11 +7612,25 @@ index 3adf28d..2f9195f 100644
      if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
          s->codec_id == AV_CODEC_ID_MPEG4 &&
          avctx->idct_algo == FF_IDCT_AUTO) {
-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+diff --git b/libavcodec/mpegvideo_enc.c a/libavcodec/mpegvideo_enc.c
+index 882cf09..71a858f 100644
+--- b/libavcodec/mpegvideo_enc.c
++++ a/libavcodec/mpegvideo_enc.c
+@@ -399,9 +399,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
+         return AVERROR(EINVAL);
+     }
+ 
+-    if (s->huffman && avctx->codec_id == AV_CODEC_ID_AMV)
+-        s->huffman = 0;
+-
+     if (s->intra_dc_precision > (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO ? 3 : 0)) {
+         av_log(avctx, AV_LOG_ERROR, "intra dc precision too large\n");
+         return AVERROR(EINVAL);
+diff --git b/libavcodec/rpi_hevc_transform.h a/libavcodec/rpi_hevc_transform.h
 new file mode 100644
 index 0000000..4309f1c
 --- /dev/null
-+++ b/libavcodec/rpi_hevc_transform.h
++++ a/libavcodec/rpi_hevc_transform.h
 @@ -0,0 +1,3070 @@
 +unsigned char rpi_hevc_transform [] = {
 +21,
@@ -10257,11 +10702,11 @@ index 0000000..4309f1c
 +33,
 +3,
 +};
-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+diff --git b/libavcodec/rpi_hevc_transform.s a/libavcodec/rpi_hevc_transform.s
 new file mode 100644
 index 0000000..5543093
 --- /dev/null
-+++ b/libavcodec/rpi_hevc_transform.s
++++ a/libavcodec/rpi_hevc_transform.s
 @@ -0,0 +1,917 @@
 +# ******************************************************************************
 +# Argon Design Ltd.
@@ -11180,12 +11625,12 @@ index 0000000..5543093
 +  bgt loop_cmds
 +
 +  pop r6-r7, pc
-diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
+diff --git b/libavcodec/rpi_mailbox.c a/libavcodec/rpi_mailbox.c
 new file mode 100644
-index 0000000..3904efc
+index 0000000..8d8a20d
 --- /dev/null
-+++ b/libavcodec/rpi_mailbox.c
-@@ -0,0 +1,340 @@
++++ a/libavcodec/rpi_mailbox.c
+@@ -0,0 +1,118 @@
 +/*
 +Copyright (c) 2012, Broadcom Europe Ltd.
 +All rights reserved.
@@ -11220,7 +11665,6 @@ index 0000000..3904efc
 +#include <unistd.h>
 +#include <assert.h>
 +#include <stdint.h>
-+#include <sys/mman.h>
 +#include <sys/ioctl.h>
 +
 +#include <linux/ioctl.h>
@@ -11231,75 +11675,6 @@ index 0000000..3904efc
 +
 +#include "rpi_mailbox.h"
 +
-+#define PAGE_SIZE (4*1024)
-+
-+// Shared memory will not be cached in ARM cache
-+void *mapmem_shared(unsigned base, unsigned size)
-+{
-+   int mem_fd;
-+   unsigned offset = base % PAGE_SIZE;
-+   base = base - offset;
-+   /* open /dev/mem */
-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
-+      return NULL;
-+   }
-+   void *mem = mmap(
-+      0,
-+      size,
-+      PROT_READ|PROT_WRITE,
-+      MAP_SHARED/*|MAP_FIXED*/,
-+      mem_fd,
-+      base);
-+#ifdef DEBUG
-+   printf("base=0x%x, mem=%p\n", base, mem);
-+#endif
-+   if (mem == MAP_FAILED) {
-+      printf("mmap error %d\n", (int)mem);
-+      return NULL;
-+   }
-+   close(mem_fd);
-+   return (char *)mem + offset;
-+}
-+
-+// Unshared memory will be faster as lives in ARM cache, but requires cache flushing
-+void *mapmem_private(unsigned base, unsigned size)
-+{
-+   int mem_fd;
-+   unsigned offset = base % PAGE_SIZE;
-+   base = base - offset;
-+   /* open /dev/mem */
-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
-+      return NULL;
-+   }
-+   void *mem = mmap(
-+      0,
-+      size,
-+      PROT_READ|PROT_WRITE,
-+      MAP_PRIVATE/*|MAP_FIXED*/,
-+      mem_fd,
-+      base);
-+#ifdef DEBUG
-+   printf("base=0x%x, mem=%p\n", base, mem);
-+#endif
-+   if (mem == MAP_FAILED) {
-+      printf("mmap error %d\n", (int)mem);
-+      return NULL;
-+   }
-+   close(mem_fd);
-+   return (char *)mem + offset;
-+}
-+
-+void unmapmem(void *addr, unsigned size)
-+{
-+   int s = munmap(addr, size);
-+   if (s != 0) {
-+      printf("munmap error %d\n", s);
-+      exit (-1);
-+   }
-+}
-+
 +/*
 + * use ioctl to send mbox property message
 + */
@@ -11320,47 +11695,7 @@ index 0000000..3904efc
 +   return ret_val;
 +}
 +
-+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000c; // (the tag id)
-+   p[i++] = 12; // (size of the buffer)
-+   p[i++] = 12; // (size of the data)
-+   p[i++] = size; // (num bytes? or pages?)
-+   p[i++] = align; // (alignment)
-+   p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned mem_free(int file_desc, unsigned handle)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000f; // (the tag id)
-+   p[i++] = 4; // (size of the buffer)
-+   p[i++] = 4; // (size of the data)
-+   p[i++] = handle;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned mem_lock(int file_desc, unsigned handle)
++unsigned mbox_mem_lock(int file_desc, unsigned handle)
 +{
 +   int i=0;
 +   unsigned p[32];
@@ -11379,7 +11714,7 @@ index 0000000..3904efc
 +   return p[5];
 +}
 +
-+unsigned mem_unlock(int file_desc, unsigned handle)
++unsigned mbox_mem_unlock(int file_desc, unsigned handle)
 +{
 +   int i=0;
 +   unsigned p[32];
@@ -11398,118 +11733,6 @@ index 0000000..3904efc
 +   return p[5];
 +}
 +
-+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x30010; // (the tag id)
-+   p[i++] = 28; // (size of the buffer)
-+   p[i++] = 28; // (size of the data)
-+   p[i++] = code;
-+   p[i++] = r0;
-+   p[i++] = r1;
-+   p[i++] = r2;
-+   p[i++] = r3;
-+   p[i++] = r4;
-+   p[i++] = r5;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned qpu_enable(int file_desc, unsigned enable)
-+{
-+   int i=0;
-+   unsigned p[32];
-+
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x30012; // (the tag id)
-+   p[i++] = 4; // (size of the buffer)
-+   p[i++] = 4; // (size of the data)
-+   p[i++] = enable;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
-+   int i=0;
-+   unsigned p[32];
-+
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+   p[i++] = 0x30011; // (the tag id)
-+   p[i++] = 16; // (size of the buffer)
-+   p[i++] = 16; // (size of the data)
-+   p[i++] = num_qpus;
-+   p[i++] = control;
-+   p[i++] = noflush;
-+   p[i++] = timeout; // ms
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+void execute_multi(int file_desc,
-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
-+   int i=0;
-+   unsigned p[32];
-+
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+   p[i++] = 0x30018; // (the tag id)
-+   p[i++] = 88; // (size of the buffer)
-+   p[i++] = 88; // (size of the data)
-+
-+   p[i++] = num_qpus;
-+   p[i++] = control;
-+   p[i++] = noflush;
-+   p[i++] = timeout; // ms
-+
-+   p[i++] = num_qpus_2;
-+   p[i++] = control_2;
-+   p[i++] = noflush_2;
-+   p[i++] = timeout_2; // ms
-+
-+   p[i++] = code;
-+   p[i++] = r0;
-+   p[i++] = r1;
-+   p[i++] = r2;
-+   p[i++] = r3;
-+   p[i++] = r4;
-+   p[i++] = r5;
-+
-+   p[i++] = code_2;
-+   p[i++] = r0_2;
-+   p[i++] = r1_2;
-+   p[i++] = r2_2;
-+   p[i++] = r3_2;
-+   p[i++] = r4_2;
-+   p[i++] = r5_2;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return;
-+}
 +
 +int mbox_open() {
 +   int file_desc;
@@ -11526,55 +11749,29 @@ index 0000000..3904efc
 +void mbox_close(int file_desc) {
 +  close(file_desc);
 +}
-diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
+diff --git b/libavcodec/rpi_mailbox.h a/libavcodec/rpi_mailbox.h
 new file mode 100644
-index 0000000..5898102
+index 0000000..b51303b
 --- /dev/null
-+++ b/libavcodec/rpi_mailbox.h
-@@ -0,0 +1,25 @@
++++ a/libavcodec/rpi_mailbox.h
+@@ -0,0 +1,10 @@
 +#ifndef RPI_MAILBOX_H
 +#define RPI_MAILBOX_H
 +
 +extern int mbox_open(void);
 +extern void mbox_close(int file_desc);
 +
-+extern unsigned get_version(int file_desc);
-+extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
-+extern unsigned mem_free(int file_desc, unsigned handle);
-+extern unsigned mem_lock(int file_desc, unsigned handle);
-+extern unsigned mem_unlock(int file_desc, unsigned handle);
-+extern void *mapmem_shared(unsigned base, unsigned size);
-+extern void *mapmem_private(unsigned base, unsigned size);
-+extern void unmapmem(void *addr, unsigned size);
-+
-+extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
-+extern void execute_multi(int file_desc,
-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
-+extern unsigned qpu_enable(int file_desc, unsigned enable);
++extern unsigned mbox_mem_lock(int file_desc, unsigned handle);
++extern unsigned mbox_mem_unlock(int file_desc, unsigned handle);
 +
 +#endif
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+diff --git b/libavcodec/rpi_qpu.c a/libavcodec/rpi_qpu.c
 new file mode 100644
-index 0000000..365f4a6
+index 0000000..be58458
 --- /dev/null
-+++ b/libavcodec/rpi_qpu.c
-@@ -0,0 +1,993 @@
++++ a/libavcodec/rpi_qpu.c
+@@ -0,0 +1,827 @@
 +#ifdef RPI
-+// Use vchiq service for submitting jobs
-+#define GPUSERVICE
-+
-+// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-+// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
-+//#define RPI_TIME_TOTAL_QPU
-+// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-+//#define RPI_TIME_TOTAL_VPU
-+// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
-+#define RPI_TIME_TOTAL_POSTED
-+
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
@@ -11592,22 +11789,23 @@ index 0000000..365f4a6
 +#include "rpi_shader.h"
 +#include "rpi_hevc_transform.h"
 +
-+#include "rpi_user_vcsm.h"
-+#ifdef GPUSERVICE
 +#pragma GCC diagnostic push
 +// Many many redundant decls in the header files
 +#pragma GCC diagnostic ignored "-Wredundant-decls"
 +#include "interface/vmcs_host/vc_vchi_gpuserv.h"
 +#pragma GCC diagnostic pop
-+#endif
 +
-+// QPU profile flags
-+#define NO_FLUSH 1
-+#define CLEAR_PROFILE 2
-+#define OUTPUT_COUNTS 4
++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
++#define RPI_TRACE_TIME_VPU_QPU_WAIT     0
 +
-+#define FLAGS_FOR_PROFILING (NO_FLUSH)
++// QPU "noflush" flags
++// a mixture of flushing & profiling
 +
++#define QPU_FLAGS_NO_FLUSH_VPU          1       // If unset VPU cache will be flushed
++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2       // Clear & Enable detailed QPU profiling registers
++#define QPU_FLAGS_PROF_OUTPUT_COUNTS    4       // Print the results
++#define QPU_FLAGS_OUTPUT_QPU_TIMES      8       // Print QPU times - independent of the profiling
++#define QPU_FLAGS_NO_FLUSH_QPU          16      // If unset flush QPU caches & TMUs (uniforms always flushed)
 +
 +// On Pi2 there is no way to access the VPU L2 cache
 +// GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
@@ -11664,65 +11862,212 @@ index 0000000..365f4a6
 +{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
 +};
 +
++// Code/constants on GPU
 +struct GPU
 +{
 +  unsigned int qpu_code[QPU_CODE_SIZE];
 +  unsigned int vpu_code[VPU_CODE_SIZE];
 +  short transMatrix2even[16*16*2];
-+  int open_count; // Number of allocated video buffers
-+  int      mb; // Mailbox handle
-+  int      vc; // Address in GPU memory
-+  int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
-+  int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
 +};
 +
++
++#define WAIT_COUNT_MAX 16
++
++typedef struct trace_time_one_s
++{
++  int count;
++  int64_t start[WAIT_COUNT_MAX];
++  int64_t total[WAIT_COUNT_MAX];
++} trace_time_one_t;
++
++typedef struct trace_time_wait_s
++{
++  unsigned int jcount;
++  int64_t start0;
++  int64_t last_update;
++  trace_time_one_t active;
++  trace_time_one_t wait;
++} trace_time_wait_t;
++
++typedef struct vq_wait_s
++{
++  sem_t sem;
++  unsigned int cost;
++  struct vq_wait_s * next;
++} vq_wait_t;
++
++#define VQ_WAIT_POOL_SIZE 16
++typedef struct vq_wait_pool_s
++{
++  vq_wait_t * head;
++  vq_wait_t pool[VQ_WAIT_POOL_SIZE];
++} vq_wait_pool_t;
++
++static void vq_wait_pool_init(vq_wait_pool_t * const pool);
++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
++
++typedef struct gpu_env_s
++{
++  int open_count;
++  int init_count;
++  int mb;
++  unsigned int current_load;
++  GPU_MEM_PTR_T code_gm_ptr;
++  vq_wait_pool_t wait_pool;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  trace_time_wait_t ttw;
++#endif
++} gpu_env_t;
++
 +// Stop more than one thread trying to allocate memory or use the processing resources at once
 +static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+static volatile struct GPU* gpu = NULL;
-+static GPU_MEM_PTR_T gpu_mem_ptr;
++static gpu_env_t * gpu = NULL;
 +
-+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
-+static unsigned int Microseconds(void) {
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++
++static int64_t ns_time(void)
++{
 +    struct timespec ts;
-+    unsigned int x;
-+    static unsigned int base = 0;
-+    clock_gettime(CLOCK_REALTIME, &ts);
-+    x = ts.tv_sec*1000000 + ts.tv_nsec/1000;
-+    if (base==0) base=x;
-+    return x-base;
++    clock_gettime(CLOCK_MONOTONIC, &ts);
++    return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
 +}
++
++
++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
++
++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
++#define T_ARG(t) T_SEC(t), T_MS(t)
++#define T_FMT "%u.%03u"
++
++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
++{
++  // Update totals for levels that are still pending
++  for (int i = 0; i < tto->count; ++i) {
++    tto->total[i] += now - tto->start[i];
++    tto->start[i] = now;
++  }
++
++  printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
++         prefix,
++         T_ARG(now - start0 - tto->total[0]),
++         T_ARG(tto->total[0]),
++         T_ARG(tto->total[1]),
++         T_ARG(tto->total[2]),
++         T_ARG(tto->total[3]));
++}
++
++
++static void tto_start(trace_time_one_t * const tto, const int64_t now)
++{
++  av_assert0(tto->count < WAIT_COUNT_MAX);
++  tto->start[tto->count++] = now;
++}
++
++static void tto_end(trace_time_one_t * const tto, const int64_t now)
++{
++  const int n = --tto->count;
++  av_assert0(n >= 0);
++  tto->total[n] += now - tto->start[n];
++}
++
++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
++{
++  printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
++  tto_print(&ttw->active, now, ttw->start0, "Active");
++  tto_print(&ttw->wait,   now, ttw->start0, "  Wait");
++}
++
 +#endif
 +
-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
-+static void gpu_free_internal(GPU_MEM_PTR_T *p);
++// GPU memory alloc fns (internal)
++
++// GPU_MEM_PTR_T alloc fns
++static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++  p->numbytes = numbytes;
++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
++  av_assert0(p->vcsm_handle);
++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++  av_assert0(p->vc_handle);
++  p->arm = vcsm_lock(p->vcsm_handle);
++  av_assert0(p->arm);
++  p->vc = mbox_mem_lock(mb, p->vc_handle);
++  av_assert0(p->vc);
++  return 0;
++}
++
++static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++  p->numbytes = numbytes;
++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++  av_assert0(p->vcsm_handle);
++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++  av_assert0(p->vc_handle);
++  p->arm = vcsm_lock(p->vcsm_handle);
++  av_assert0(p->arm);
++  p->vc = mbox_mem_lock(mb, p->vc_handle);
++  av_assert0(p->vc);
++  return 0;
++}
++
++static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
++  mbox_mem_unlock(mb, p->vc_handle);
++  vcsm_unlock_ptr(p->arm);
++  vcsm_free(p->vcsm_handle);
++  memset(p, 0, sizeof(*p));  // Ensure we crash hard if we try and use this again
++}
++
++
++// GPU init, free, lock, unlock
++
++static void gpu_term(void)
++{
++  gpu_env_t * const ge = gpu;
++
++  // We have to hope that everything has terminated...
++  gpu = NULL;
++
++  vc_gpuserv_deinit();
++
++  gpu_free_internal(ge->mb, &ge->code_gm_ptr);
++
++  vcsm_exit();
++
++  mbox_close(ge->mb);
++
++  vq_wait_pool_deinit(&ge->wait_pool);
++
++  free(ge);
++}
++
 +
 +// Connect to QPU, returns 0 on success.
-+static int gpu_init(volatile struct GPU **gpu) {
-+  int mb = mbox_open();
-+  int vc;
++static int gpu_init(gpu_env_t ** const gpu) {
 +  volatile struct GPU* ptr;
-+	if (mb < 0)
-+		return -1;
-+#ifndef RPI_ASYNC
-+	if (qpu_enable(mb, 1)) return -2;
-+#endif
++  gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
++  *gpu = NULL;
++
++  if (ge == NULL)
++    return -1;
++
++  if ((ge->mb = mbox_open()) < 0)
++    return -1;
++
++  vq_wait_pool_init(&ge->wait_pool);
++
 +  vcsm_init();
-+  vc_gpuserv_init();
-+  gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
-+  ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
-+  memset((void*)ptr, 0, sizeof *ptr);
-+  vc = gpu_mem_ptr.vc;
 +
-+  ptr->mb = mb;
-+  ptr->vc = vc;
++  gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr);
++  ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
 +
-+  printf("GPU allocated at 0x%x\n",vc);
-+
-+  *gpu = ptr;
++  // Zero everything so we have zeros between the code bits
++  memset((void *)ptr, 0, sizeof(*ptr));
 +
 +  // Now copy over the QPU code into GPU memory
 +  {
-+    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
++    int num_bytes = (char *)mc_end - (char *)rpi_shader;
 +    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
 +    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
 +  }
@@ -11735,106 +12080,56 @@ index 0000000..365f4a6
 +  // And the transform coefficients
 +  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
 +
-+#ifdef RPI_ASYNC
-+  {
-+    int err;
-+    vpu_async_tail = 0;
-+    vpu_async_head = 0;
-+    err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
-+    //printf("Created thread\n");
-+    if (err) {
-+        av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
-+        return -4;
-+    }
-+
-+    {
-+      struct sched_param param = {0};
-+      int policy = 0;
-+
-+      if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
-+      {
-+        av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
-+      }
-+      else
-+      {
-+        av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
-+            policy,
-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
-+            param.sched_priority);
-+
-+        policy = SCHED_FIFO;
-+        param.sched_priority = sched_get_priority_max(SCHED_FIFO);
-+
-+        av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
-+            policy,
-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
-+            param.sched_priority);
-+
-+        if (pthread_setschedparam(vpu_thread, policy, &param) != 0)
-+        {
-+          av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
-+        }
-+        else
-+        {
-+          if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
-+          {
-+            av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
-+          }
-+          else
-+          {
-+            av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
-+                policy,
-+                policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
-+                param.sched_priority);
-+          }
-+        }
-+      }
-+
-+    }
-+
-+  }
-+#endif
-+
++  *gpu = ge;
 +  return 0;
 +}
 +
-+// Returns 1 if the gpu is currently idle
-+static int gpu_idle(void)
-+{
-+  int ret = pthread_mutex_trylock(&gpu_mutex);
-+  if (ret==0) {
-+    pthread_mutex_unlock(&gpu_mutex);
-+    return 1;
-+  }
-+  return 0;
-+}
 +
-+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-+static void gpu_lock(void) {
-+  pthread_mutex_lock(&gpu_mutex);
-+
-+  if (gpu==NULL) {
-+    gpu_init(&gpu);
-+  }
-+}
 +
 +static void gpu_unlock(void) {
 +  pthread_mutex_unlock(&gpu_mutex);
 +}
 +
-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
-+  p->numbytes = numbytes;
-+  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+  av_assert0(p->vcsm_handle);
-+  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+  av_assert0(p->vc_handle);
-+  p->arm = vcsm_lock(p->vcsm_handle);
-+  av_assert0(p->arm);
-+  p->vc = mem_lock(mb, p->vc_handle);
-+  av_assert0(p->vc);
-+  return 0;
++// Take gpu_mutex for exclusive GPU access. The GPU must already be initialised (asserted); gpu_lock_ref() is the variant that initialises on demand.
++static gpu_env_t * gpu_lock(void) {
++  pthread_mutex_lock(&gpu_mutex);
++
++  av_assert0(gpu != NULL);
++  return gpu;
 +}
 +
++static gpu_env_t * gpu_lock_ref(void)
++{
++  pthread_mutex_lock(&gpu_mutex);
++
++  if (gpu == NULL) {
++    int rv = gpu_init(&gpu);
++    if (rv != 0) {
++      gpu_unlock();
++      return NULL;
++    }
++  }
++
++  ++gpu->open_count;
++  return gpu;
++}
++
++static void gpu_unlock_unref(gpu_env_t * const ge)
++{
++  if (--ge->open_count == 0)
++    gpu_term();
++
++  gpu_unlock();
++}
++
++static inline gpu_env_t * gpu_ptr(void)
++{
++  av_assert0(gpu != NULL);
++  return gpu;
++}
++
++// Public gpu fns
++
 +// Allocate memory on GPU
 +// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
 +// Returns 0 on success.
@@ -11843,731 +12138,476 @@ index 0000000..365f4a6
 +int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
 +{
 +  int r;
-+  gpu_lock();
-+  r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
-+  gpu->open_count++;
++  gpu_env_t * const ge = gpu_lock_ref();
++  if (ge == NULL)
++    return -1;
++  r = gpu_malloc_uncached_internal(ge->mb, numbytes, p);
 +  gpu_unlock();
 +  return r;
 +}
 +
-+int gpu_get_mailbox(void)
-+{
-+  av_assert0(gpu);
-+  return gpu->mb;
-+}
-+
-+// Call this to clean and invalidate a region of memory
-+void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
-+{
-+#ifdef RPI_FAST_CACHEFLUSH
-+    struct vcsm_user_clean_invalid_s iocache = {};
-+    iocache.s[0].handle = p->vcsm_handle;
-+    iocache.s[0].cmd = 3; // clean+invalidate
-+    iocache.s[0].addr = (int) p->arm;
-+    iocache.s[0].size  = p->numbytes;
-+    vcsm_clean_invalid( &iocache );
-+#else
-+    void *tmp = vcsm_lock(p->vcsm_handle);
-+    vcsm_unlock_ptr(tmp);
-+#endif
-+}
-+
-+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
-+{
-+#ifdef RPI_FAST_CACHEFLUSH
-+    struct vcsm_user_clean_invalid_s iocache = {};
-+    iocache.s[0].handle = p0->vcsm_handle;
-+    iocache.s[0].cmd = 3; // clean+invalidate
-+    iocache.s[0].addr = (int) p0->arm;
-+    iocache.s[0].size  = p0->numbytes;
-+    iocache.s[1].handle = p1->vcsm_handle;
-+    iocache.s[1].cmd = 3; // clean+invalidate
-+    iocache.s[1].addr = (int) p1->arm;
-+    iocache.s[1].size  = p1->numbytes;
-+    iocache.s[2].handle = p2->vcsm_handle;
-+    iocache.s[2].cmd = 3; // clean+invalidate
-+    iocache.s[2].addr = (int) p2->arm;
-+    iocache.s[2].size  = p2->numbytes;
-+    vcsm_clean_invalid( &iocache );
-+#else
-+    void *tmp;
-+    tmp = vcsm_lock(p0->vcsm_handle);
-+    vcsm_unlock_ptr(tmp);
-+    tmp = vcsm_lock(p1->vcsm_handle);
-+    vcsm_unlock_ptr(tmp);
-+    tmp = vcsm_lock(p2->vcsm_handle);
-+    vcsm_unlock_ptr(tmp);
-+#endif
-+}
-+
-+static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-+  p->numbytes = numbytes;
-+  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
-+  av_assert0(p->vcsm_handle);
-+  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+  av_assert0(p->vc_handle);
-+  p->arm = vcsm_lock(p->vcsm_handle);
-+  av_assert0(p->arm);
-+  p->vc = mem_lock(gpu->mb, p->vc_handle);
-+  av_assert0(p->vc);
-+  return 0;
-+}
-+
 +// This allocates data that will be
 +//    Cached in ARM L2
 +//    Uncached in VPU L2
 +int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
 +{
 +  int r;
-+  gpu_lock();
-+  r = gpu_malloc_cached_internal(numbytes, p);
-+  gpu->open_count++;
++  gpu_env_t * const ge = gpu_lock_ref();
++  if (ge == NULL)
++    return -1;
++  r = gpu_malloc_cached_internal(ge->mb, numbytes, p);
 +  gpu_unlock();
 +  return r;
 +}
 +
-+static void gpu_term(void)
-+{
-+  int mb;
-+
-+  if (gpu==NULL)
-+    return;
-+  mb = gpu->mb;
-+
-+  // ??? Tear down anything needed for gpuexecute
-+
-+  qpu_enable(mb, 0);
-+  gpu_free_internal(&gpu_mem_ptr);
-+
-+  vc_gpuserv_deinit();
-+  vcsm_exit();
-+
-+  mbox_close(mb);
-+  gpu = NULL;
-+}
-+
-+void gpu_free_internal(GPU_MEM_PTR_T *p) {
-+  int mb = gpu->mb;
-+  mem_unlock(mb,p->vc_handle);
-+  vcsm_unlock_ptr(p->arm);
-+  vcsm_free(p->vcsm_handle);
-+}
-+
-+void gpu_free(GPU_MEM_PTR_T *p) {
-+  gpu_lock();
-+
-+  gpu_free_internal(p);
-+
-+  gpu->open_count--;
-+  if (gpu->open_count==0) {
-+      printf("Closing GPU\n");
-+      gpu_term();
-+      gpu = NULL;
-+  }
-+  gpu_unlock();
++void gpu_free(GPU_MEM_PTR_T * const p) {
++  gpu_env_t * const ge = gpu_lock();
++  gpu_free_internal(ge->mb, p);
++  gpu_unlock_unref(ge);
 +}
 +
 +unsigned int vpu_get_fn(void) {
 +  // Make sure that the gpu is initialized
-+  if (gpu==NULL) {
-+    printf("Preparing gpu\n");
-+    gpu_lock();
-+    gpu_unlock();
-+  }
-+  return gpu->vc + offsetof(struct GPU,vpu_code);
++  av_assert0(gpu != NULL);
++  return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code);
 +}
 +
 +unsigned int vpu_get_constants(void) {
-+  if (gpu==NULL) {
-+    gpu_lock();
++  av_assert0(gpu != NULL);
++  return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even);
++}
++
++int gpu_get_mailbox(void)  // returns the mb field of the global GPU environment
++{
++  av_assert0(gpu);  // no lazy init here: the environment must already exist
++  return gpu->mb;
++}
++
++// ----------------------------------------------------------------------------
++//
++// Cache flush functions
++
++
++// Create an empty cache-flush batch; entries are queued with the
++// rpi_cache_flush_add_*() calls. Returns NULL if the allocation fails.
++rpi_cache_flush_env_t * rpi_cache_flush_init()
++{
++    // calloc zero-fills, so the entry count rfe->n starts at 0
++    rpi_cache_flush_env_t * const rfe = calloc(1, sizeof(*rfe));
++    return rfe;
++}
++
++// Throw away a flush batch without issuing it; passing NULL is harmless.
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
++{
++    free(rfe);  // free(NULL) is defined as a no-op, so no guard is needed
++}
++
++// Issue every queued cache operation with one vcsm_clean_invalid() call, then free rfe.
++// Returns 0 if nothing was queued or the call succeeded, else vcsm_clean_invalid()'s result.
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
++{
++    const int rc = (rfe->n == 0) ? 0 : vcsm_clean_invalid(&rfe->a);
++    const int err = errno;  // snapshot now: free() below is allowed to modify errno
++
++    free(rfe);
++    if (rc != 0)
++        av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", err);
++    return rc;
++}
++
++// Queue cache operation <mode> covering the whole of <gm>; NULL or zero-length blocks are ignored.
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
++{
++    // Deal with empty pointer trivially - before the capacity assert, so a no-op add cannot abort on a full list
++    if (gm == NULL || gm->numbytes == 0)
++        return;
++
++    av_assert0(rfe->n < sizeof(rfe->a.s) / sizeof(rfe->a.s[0]));
++    rfe->a.s[rfe->n].cmd = mode;
++    rfe->a.s[rfe->n].handle = gm->vcsm_handle;
++    rfe->a.s[rfe->n].addr = (unsigned int)gm->arm;
++    rfe->a.s[rfe->n].size = gm->numbytes;
++    ++rfe->n;
++}
++
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++  const unsigned int offset, const unsigned int size)  // byte range [offset, offset + size) within gm
++{
++    // Nothing to queue for a missing block or an empty range
++    if (gm == NULL || size == 0)
++        return;
++
++    av_assert0(rfe->n < sizeof(rfe->a.s) / sizeof(rfe->a.s[0]));  // room left in the request array
++    av_assert0(offset <= gm->numbytes);  // checked on their own first so that a wrapped
++    av_assert0(size <= gm->numbytes);    // offset + size cannot slip through the next test
++    av_assert0(offset + size <= gm->numbytes);
++
++    rfe->a.s[rfe->n].cmd = mode;
++    rfe->a.s[rfe->n].handle = gm->vcsm_handle;
++    rfe->a.s[rfe->n].addr = (unsigned int)gm->arm + offset;  // ARM-side address of the range start
++    rfe->a.s[rfe->n].size = size;
++    ++rfe->n;
++}
++
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
++{  // Queue cache operation <mode> for every GPU buffer backing <frame>
++#if !RPI_ONE_BUF
++#error Fixme! (NIF)
++#endif
++  if (gpu_is_buf1(frame)) {  // one GPU allocation behind the frame: queue it whole
++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
++  }
++  else
++  {  // otherwise queue the three per-plane allocations (indices 0, 1, 2)
++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
++  }
++}
++
++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
++  const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma)
++{  // Queue cache operation <mode> for n rows starting at start_line, for luma and/or chroma as selected
++  const unsigned int y_offset = frame->linesize[0] * start_line;  // byte offset of the first luma row
++  const unsigned int y_size = frame->linesize[0] * n;             // byte length of n luma rows
++  // Round UV up/down to get everything
++  const unsigned int uv_rnd = (1U << uv_shift) >> 1;  // added to the end row before shifting it down
++  const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift);  // start row rounds down
++  const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - uv_offset;
++
++  // As all unsigned they will also reject -ve
++  // Test individually as well as added to reject overflow
++  av_assert0(start_line <= (unsigned int)frame->height);
++  av_assert0(n <= (unsigned int)frame->height);
++  av_assert0(start_line + n <= (unsigned int)frame->height);
++
++  if (gpu_is_buf1(frame)) {  // single allocation: each range is offset by its plane's position inside gm
++    const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
++    if (do_luma) {
++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
++    }
++    if (do_chroma) {  // both chroma planes use the same row offset and size (linesize[1])
++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
++    }
++  }
++  else
++  {  // per-plane allocations: offsets are taken from the start of each plane's own block
++    if (do_luma) {
++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
++    }
++    if (do_chroma) {
++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
++    }
++  }
++}
++
++// Apply cache operation <mode> to the whole of one GPU memory block; logs and skips if no batch can be allocated
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode) {
++  rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
++  if (rfe == NULL) { av_log(NULL, AV_LOG_ERROR, "rpi_cache_flush_init failed\n"); return; }  // init returns NULL on OOM
++  rpi_cache_flush_add_gm_ptr(rfe, p, mode);
++  rpi_cache_flush_finish(rfe);
++}
++
++
++// ----------------------------------------------------------------------------
++
++
++// Wait abstractions - mostly so we can easily add profile code
++static void vq_wait_pool_init(vq_wait_pool_t * const wp)
++{
++  // Chain the slots into a free list: slot k -> slot k+1, last slot -> NULL; semaphores start at 0
++  unsigned int k;
++  wp->head = &wp->pool[0];
++  for (k = 0; k < VQ_WAIT_POOL_SIZE; k++) {
++    sem_init(&wp->pool[k].sem, 0, 0);
++    wp->pool[k].next = (k + 1 < VQ_WAIT_POOL_SIZE) ? &wp->pool[k + 1] : NULL;
++  }
++}
++
++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
++{
++  unsigned int k;
++  wp->head = NULL;  // empty the free list before destroying its members
++  for (k = 0; k < VQ_WAIT_POOL_SIZE; k++) {
++    wp->pool[k].next = NULL;
++    sem_destroy(&wp->pool[k].sem);
++  }
++}
++
++
++// Take a wait object off the pool's free list (and a GPU reference), adding <cost> to the current load
++static vq_wait_t * vq_wait_new(const unsigned int cost)
++{
++  gpu_env_t * const ge = gpu_lock_ref();
++  vq_wait_t * const wait = (ge == NULL) ? NULL : ge->wait_pool.head;
++  av_assert0(wait != NULL);  // GPU init failed or pool exhausted: abort cleanly rather than dereference NULL
++  ge->wait_pool.head = wait->next;
++  ge->current_load += cost;
++  wait->cost = cost;
++  wait->next = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  tto_start(&ge->ttw.active, ns_time());
++#endif
++  gpu_unlock();
++  return wait;
++}
++
++static void vq_wait_delete(vq_wait_t * const wait)
++{  // Return a wait object to the pool and drop one GPU reference
++  gpu_env_t * const ge = gpu_lock();
++  wait->next = ge->wait_pool.head;  // push back on the front of the free list
++  ge->wait_pool.head = wait;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  {
++    trace_time_wait_t * const ttw = &ge->ttw;
++    const int64_t now = ns_time();
++    ++ttw->jcount;  // one more job accounted for
++    tto_end(&ttw->wait, now);
++
++    if (ttw->start0 == 0)  // first job seen: latch the stats start time
++    {
++      ttw->start0 = ttw->active.start[0];
++      ttw->last_update = ttw->start0;
++    }
++    if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)  // print at most once per period
++    {
++      ttw->last_update += WAIT_TIME_PRINT_PERIOD;
++      ttw_print(ttw, now);
++    }
++  }
++#endif
++  gpu_unlock_unref(ge);  // drops one open_count reference and unlocks
++}
++
++static void vq_wait_wait(vq_wait_t * const wait)
++{  // Block until wait->sem is posted
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  {
++      const int64_t now = ns_time();
++      gpu_env_t * const ge = gpu_lock();  // lock is held only to record the wait start time
++      tto_start(&ge->ttw.wait, now);
++      gpu_unlock();
++  }
++#endif
++
++  while (sem_wait(&wait->sem) == -1 && errno == EINTR)
++    /* interrupted by a signal: retry */;
++}
++
++static void vq_wait_post(vq_wait_t * const wait)
++{
++#if !RPI_TRACE_TIME_VPU_QPU_WAIT
++  if (wait->cost != 0)
++#endif
++  {
++    gpu_env_t *const ge = gpu_lock();
++    ge->current_load -= wait->cost;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    tto_end(&ge->ttw.active, ns_time());
++#endif
 +    gpu_unlock();
 +  }
-+  return gpu->vc + offsetof(struct GPU,transMatrix2even);
++
++  sem_post(&wait->sem);
 +}
 +
-+#ifdef GPUSERVICE
-+static void callback(void *cookie)
++
++
++// Header comments were wrong for these two
++#define VPU_QPU_MASK_QPU  1
++#define VPU_QPU_MASK_VPU  2
++
++#define VPU_QPU_JOB_MAX 4
++struct vpu_qpu_job_env_s
 +{
-+  sem_post((sem_t *)cookie);
++  unsigned int n;
++  unsigned int mask;
++  unsigned int cost;
++  struct gpu_job_s j[VPU_QPU_JOB_MAX];
++};
++
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
++
++vpu_qpu_job_env_t * vpu_qpu_job_new(void)
++{
++  vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
++  return vqj;
 +}
-+#endif
 +
-+
-+static volatile uint32_t post_done = 0;
-+static volatile uint32_t post_qed = 0;
-+
-+static void post_code2_cb(void * v)
++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
 +{
-+  uint32_t n = (uint32_t)v;
-+  if ((int32_t)(n - post_done) > 0) {
-+    post_done = n;
++  memset(vqj, 0, sizeof(*vqj));
++  free(vqj);
++}
++
++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)  // claim the next slot of vqj->j
++{
++  struct gpu_job_s * const j = vqj->j + vqj->n++;
++  av_assert0(vqj->n <= VPU_QPU_JOB_MAX);  // n is already bumped, so this rejects a slot index >= VPU_QPU_JOB_MAX
++  return j;
++}
++
++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
++{
++  if (vpu_code != 0) {
++    struct gpu_job_s *const j = new_job(vqj);
++    vqj->mask |= VPU_QPU_MASK_VPU;
++
++    j->command = EXECUTE_VPU;
++    j->u.v.q[0] = vpu_code;
++    j->u.v.q[1] = r0;
++    j->u.v.q[2] = r1;
++    j->u.v.q[3] = r2;
++    j->u.v.q[4] = r3;
++    j->u.v.q[5] = r4;
++    j->u.v.q[6] = r5;
 +  }
 +}
 +
-+
-+// Post a command to the queue
-+// Returns an id which we can use to wait for completion
-+int vpu_post_code2(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
++// flags are QPU_FLAGS_xxx
++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail)
 +{
-+  struct gpu_job_s j[1] = {
-+    {
-+      .command = EXECUTE_VPU,
-+      .u.v.q = {code, r0, r1, r2, r3, r4, r5},
-+      .callback.func = post_code2_cb
-+    }
-+  };
-+  uint32_t id;
++  if (n != 0) {
++    struct gpu_job_s *const j = new_job(vqj);
++    vqj->mask |= VPU_QPU_MASK_QPU;
++    vqj->cost += cost;
 +
-+  j[0].callback.cookie = (void *)(id = ++post_qed);
-+
-+  av_assert0(vc_gpuserv_execute_code(1, j) == 0);
-+
-+  return id;
++    j->command = EXECUTE_QPU;
++    j->u.q.jobs = n;
++    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
++    j->u.q.timeout = 5000;
++    memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++  }
 +}
 +
-+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+    int qpu0_n, const uint32_t * qpu0_mail,
-+    int qpu1_n, const uint32_t * qpu1_mail)
++// Convert callback to sem post
++static void vpu_qpu_job_callback_wait(void * v)
 +{
-+#if 1
-+  sem_t sync0;
-+  struct gpu_job_s j[4];
++  vq_wait_post(v);
++}
 +
-+  sem_init(&sync0, 0, 0);
++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
++{
++  vq_wait_t * wait;
 +
-+  j[0].command = EXECUTE_VPU;
-+  j[0].u.v.q[0] = vpu_code;
-+  j[0].u.v.q[1] = r0;
-+  j[0].u.v.q[2] = r1;
-+  j[0].u.v.q[3] = r2;
-+  j[0].u.v.q[4] = r3;
-+  j[0].u.v.q[5] = r4;
-+  j[0].u.v.q[6] = r5;
-+  j[0].callback.func = 0;
-+  j[0].callback.cookie = NULL;
++  if (vqj->mask == 0) {
++    *wait_h = NULL;
++    return;
++  }
 +
-+  j[1].command = EXECUTE_QPU;
-+  j[1].u.q.jobs = qpu1_n;
-+  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
-+  j[1].u.q.timeout = 5000;
-+  j[1].callback.func = 0;
-+  j[1].callback.cookie = NULL;
++  // We are going to want a sync object
++  wait = vq_wait_new(vqj->cost);
 +
-+  j[2].command = EXECUTE_QPU;
-+  j[2].u.q.jobs = qpu0_n;
-+  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+  j[2].u.q.noflush = 1;
-+  j[2].u.q.timeout = 5000;
-+  j[2].callback.func = 0;
-+  j[2].callback.cookie = NULL;
++  // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++  // If we only posted one thing or only QPU jobs
++  if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
++  {
++    struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
++    av_assert0(j->callback.func == 0);
 +
-+  j[3].command = EXECUTE_SYNC;
-+  j[3].u.s.mask = 3;
-+  j[3].callback.func = callback;
-+  j[3].callback.cookie = (void *)&sync0;
++    j->callback.func = vpu_qpu_job_callback_wait;
++    j->callback.cookie = wait;
++  }
++  else
++  {
++    struct gpu_job_s *const j = new_job(vqj);
 +
-+  av_assert0(vc_gpuserv_execute_code(4, j) == 0);
++    j->command = EXECUTE_SYNC;
++    j->u.s.mask = vqj->mask;
++    j->callback.func = vpu_qpu_job_callback_wait;
++    j->callback.cookie = wait;
++  }
 +
-+  sem_wait(&sync0);
-+#else
++  vqj->cost = 0;
++  vqj->mask = 0;
++  *wait_h = wait;
++}
 +
-+  sem_t sync0, sync2;
-+  struct gpu_job_s j[3];
++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)  // submit the queued jobs; vqj itself is left untouched
++{
++  return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);  // an empty queue is a successful no-op
++}
 +
-+  sem_init(&sync0, 0, 0);
-+  sem_init(&sync2, 0, 0);
++// Submit the queued jobs and release vqj in one call.
++// Returns vpu_qpu_job_start()'s result; vqj must not be used afterwards.
++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
++{
++  const int start_rc = vpu_qpu_job_start(vqj);
++  vpu_qpu_job_delete(vqj);
++  return start_rc;
++}
 +
-+  j[0].command = EXECUTE_VPU;
-+  j[0].u.v.q[0] = vpu_code;
-+  j[0].u.v.q[1] = r0;
-+  j[0].u.v.q[2] = r1;
-+  j[0].u.v.q[3] = r2;
-+  j[0].u.v.q[4] = r3;
-+  j[0].u.v.q[5] = r4;
-+  j[0].u.v.q[6] = r5;
-+  j[0].callback.func = callback;
-+  j[0].callback.cookie = (void *)&sync0;
++unsigned int vpu_qpu_current_load(void)  // total cost of waits created but not yet posted
++{
++  return gpu_ptr()->current_load;  // read without taking gpu_mutex; asserts if the GPU env does not exist
++}
 +
-+  j[1].command = EXECUTE_QPU;
-+  j[1].u.q.jobs = qpu1_n;
-+  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
-+  j[1].u.q.timeout = 5000;
-+  j[1].callback.func = 0;
-+  j[1].callback.cookie = NULL;
++// Block until the wait object behind *wait_h has been posted, then return it to
++// the pool and clear the handle. A NULL wait_h or a NULL *wait_h is a no-op.
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
++{
++  vq_wait_t * const pending = (wait_h == NULL) ? NULL : *wait_h;
++  if (pending == NULL)
++    return;
++
++  *wait_h = NULL;  // handle is cleared before blocking
++  vq_wait_wait(pending);
++  vq_wait_delete(pending);
++}
 +
-+  j[2].command = EXECUTE_QPU;
-+  j[2].u.q.jobs = qpu0_n;
-+  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+  j[2].u.q.noflush = 1;
-+  j[2].u.q.timeout = 5000;
-+  j[2].callback.func = callback;
-+  j[2].callback.cookie = (void *)&sync2;
++int vpu_qpu_init()
++{
++  gpu_env_t * const ge = gpu_lock_ref();
++  if (ge == NULL)
++    return -1;
 +
-+  av_assert0(vc_gpuserv_execute_code(3, j) == 0);
-+
-+  sem_wait(&sync0);
-+  sem_wait(&sync2);
-+#endif
++  if (ge->init_count++ == 0)
++  {
++    vc_gpuserv_init();
++  }
 +
++  gpu_unlock();
 +  return 0;
 +}
 +
-+
-+// Wait for completion of the given command
-+void vpu_wait(int id)
++void vpu_qpu_term()
 +{
-+  if (id == 0) {
-+#if 0
-+    sem_t sync0;
-+    struct gpu_job_s j[1] =
-+    {
-+      {
-+        .command = EXECUTE_SYNC,
-+        .u.s.mask = 3,
-+        .callback.func = callback,
-+        .callback.cookie = (void *)&sync0
-+      }
-+    };
++  gpu_env_t * const ge = gpu_lock();
 +
-+    sem_init(&sync0, 0, 0);
++  if (--ge->init_count == 0) {
++    vc_gpuserv_deinit();
 +
-+    av_assert0(vc_gpuserv_execute_code(1, j) == 0);
-+
-+    sem_wait(&sync0);
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    ttw_print(&ge->ttw, ns_time());
 +#endif
 +  }
-+  else {
-+    while ((int32_t)(post_done - (uint32_t)id) < 0) {
-+      usleep(1000);
-+    }
-+  }
++
++  gpu_unlock_unref(ge);
 +}
 +
-+
-+unsigned int qpu_get_fn(int num) {
-+    // Make sure that the gpu is initialized
-+    unsigned int *fn;
-+    if (gpu==NULL) {
-+      printf("Preparing gpu\n");
-+      gpu_lock();
-+      gpu_unlock();
-+    }
-+    switch(num) {
-+    case QPU_MC_SETUP:
-+      fn = mc_setup;
-+      break;
-+    case QPU_MC_FILTER:
-+      fn = mc_filter;
-+      break;
-+    case QPU_MC_EXIT:
-+      fn = mc_exit;
-+      break;
-+    case QPU_MC_INTERRUPT_EXIT12:
-+      fn = mc_interrupt_exit12;
-+      break;
-+    case QPU_MC_FILTER_B:
-+      fn = mc_filter_b;
-+      break;
-+    //case QPU_MC_FILTER_HONLY:
-+    //  fn = mc_filter_honly;
-+    //  break;
-+    case QPU_MC_SETUP_UV:
-+      fn = mc_setup_uv;
-+      break;
-+    case QPU_MC_FILTER_UV:
-+      fn = mc_filter_uv;
-+      break;
-+    case QPU_MC_FILTER_UV_B0:
-+      fn = mc_filter_uv_b0;
-+      break;
-+    case QPU_MC_FILTER_UV_B:
-+      fn = mc_filter_uv_b;
-+      break;
-+    case QPU_MC_INTERRUPT_EXIT8:
-+      fn = mc_interrupt_exit8;
-+      break;
-+    case QPU_MC_END:
-+      fn = mc_end;
-+      break;
-+    default:
-+      printf("Unknown function\n");
-+      exit(-1);
-+    }
-+    return gpu->vc + 4*(int)(fn-rpi_shader);
-+    //return code[num] + gpu->vc;
-+}
-+
-+#if 0
-+typedef unsigned int uint32_t;
-+
-+typedef struct mvs_s {
-+    GPU_MEM_PTR_T unif_mvs_ptr;
-+    uint32_t *unif_mvs; // Base of memory for motion vector commands
-+
-+    // _base pointers are to the start of the row
-+    uint32_t *mvs_base[8];
-+    // these pointers are to the next free space
-+    uint32_t *u_mvs[8];
-+
-+} HEVCContext;
-+
-+#define RPI_CHROMA_COMMAND_WORDS 12
-+
-+static void rpi_inter_clear(HEVCContext *s)
++uint32_t qpu_fn(const int * const mc_fn)
 +{
-+    int i;
-+    for(i=0;i<8;i++) {
-+        s->u_mvs[i] = s->mvs_base[i];
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 128;  // w
-+        *s->u_mvs[i]++ = 128;  // h
-+        *s->u_mvs[i]++ = 128;  // stride u
-+        *s->u_mvs[i]++ = 128;  // stride v
-+        s->u_mvs[i] += 3;  // Padding words
-+    }
++  return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code);
 +}
 +
-+static void rpi_execute_inter_qpu(HEVCContext *s)
-+{
-+    int k;
-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-+
-+    for(k=0;k<8;k++) {
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); //  dummy location for V
-+    }
-+
-+    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+
-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-+      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-+      );
-+}
-+
-+void rpi_test_qpu(void)
-+{
-+    HEVCContext mvs;
-+    HEVCContext *s = &mvs;
-+    int i;
-+    int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
-+    uint32_t *p;
-+    printf("Allocate memory\n");
-+    gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-+    s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
-+
-+    // Set up initial locations for uniform streams
-+    p = s->unif_mvs;
-+    for(i = 0; i < 8; i++) {
-+        s->mvs_base[i] = p;
-+        p += uv_commands_per_qpu;
-+    }
-+    // Now run a simple program that should just quit immediately after a single texture fetch
-+    rpi_inter_clear(s);
-+    for(i=0;i<4;i++) {
-+      printf("Launch QPUs\n");
-+      rpi_execute_inter_qpu(s);
-+      printf("Done\n");
-+    }
-+    printf("Free memory\n");
-+    gpu_free(&s->unif_mvs_ptr);
-+    return;
-+}
-+#endif
-+
-+#if 0
-+
-+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
-+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
-+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-+
-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24);
-+
-+static uint8_t av_clip_uint8(int32_t a)
-+{
-+    if (a&(~255)) return (-a)>>31;
-+    else          return a;
-+}
-+
-+static int32_t filter8(const uint8_t *data, int pitch)
-+{
-+   int32_t vsum = 0;
-+   int x, y;
-+
-+   for (y = 0; y < 8; y++) {
-+      int32_t hsum = 0;
-+
-+      for (x = 0; x < 8; x++)
-+         hsum += hcoeffs[x]*data[x + y * pitch];
-+
-+      vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning
-+   }
-+
-+   return av_clip_uint8( (vsum + 64) >> 7);
-+}
-+
-+// Note regression changes coefficients so is not thread safe
-+//#define REGRESSION
-+#ifdef REGRESSION
-+#define CMAX 100
-+#else
-+#define CMAX 2
-+#endif
-+#define YMAX 16
-+
-+int rpi_test_shader(void)
-+{
-+   int i, c;
-+
-+   uint32_t *unifs;
-+
-+   uint8_t *in_buffer;
-+   uint8_t *out_buffer[2];
-+
-+   GPU_MEM_PTR_T unifs_ptr;
-+   GPU_MEM_PTR_T in_buffer_ptr;
-+   GPU_MEM_PTR_T out_buffer_ptr[2];
-+
-+   // Addresses in GPU memory of filter programs
-+   uint32_t mc_setup = 0;
-+   uint32_t mc_filter = 0;
-+   uint32_t mc_exit = 0;
-+
-+   int pitch = 0x500;
-+
-+   if (gpu==NULL) {
-+      gpu_lock();
-+      gpu_unlock();
-+   }
-+
-+   printf("This needs to change to reflect new assembler\n");
-+   // Use table to compute locations of program start points
-+   mc_setup = code[0] + gpu->vc;
-+   mc_filter = code[1] + gpu->vc;
-+   mc_exit = code[2] + gpu->vc;
-+
-+   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
-+      return -2;
-+   }
-+   unifs = (uint32_t*)unifs_ptr.arm;
-+
-+   if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
-+      return -3;
-+   }
-+   in_buffer = (uint8_t*)in_buffer_ptr.arm;
-+
-+   if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
-+      return -4;
-+   }
-+   out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
-+   out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
-+
-+   for (c = 0; c < CMAX; c++) {
-+      int xo[] = {rand()&31, rand()&31};
-+
-+#ifdef REGRESSION
-+      for (i = 0; i < 8; i++) {
-+         hcoeffs[i] = (int8_t)rand();
-+         vcoeffs[i] = (int8_t)rand();
-+         if (hcoeffs[i]==-128)
-+           hcoeffs[i]++;
-+         if (vcoeffs[i]==-128)
-+           vcoeffs[i]++;
-+      }
-+#endif
-+
-+      for (i = 0; i < 64*23; i++) {
-+         //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
-+         in_buffer[i] = rand();
-+      }
-+
-+      // Clear output array
-+      {
-+        int b;
-+        for(b=0;b<2;b++) {
-+          for(i=0;i<16*16;i++) {
-+            out_buffer[b][i] = 3;
-+          }
-+        }
-+      }
-+
-+      unifs[0] = mc_filter;
-+      unifs[1] = in_buffer_ptr.vc+xo[0]+16;
-+      unifs[2] = 64; // src pitch
-+      unifs[3] = pitch; // dst pitch
-+      unifs[4] = 0; // Padding
-+      unifs[5] = 0;
-+      unifs[6] = 0;
-+      unifs[7 ] = mc_filter;
-+      unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
-+      unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-+      unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-+      unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-+      unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-+      unifs[13] = out_buffer_ptr[0].vc;
-+      unifs[14] = mc_exit;
-+      unifs[15] = in_buffer_ptr.vc+xo[1]+16;        // dummy
-+      unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-+      unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-+      unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-+      unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-+      unifs[20] = out_buffer_ptr[1].vc;
-+
-+      printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
-+
-+      // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
-+
-+      //qpu_run_shader(mc_setup, unifs_ptr.vc);
-+      //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
-+      rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]);
-+      rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
-+
-+      if (1)
-+      {
-+         int x, y, b;
-+         int bad = 0;
-+
-+         for (b=0; b<2; ++b)
-+            for (y=0; y<YMAX; ++y)
-+               for (x=0; x<16; ++x) {
-+                  int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
-+
-+                  if (out_buffer[b][x+y*pitch] != ref) {
-+                      bad = 1;
-+//                     printf("%d, %d, %d, %d\n", c, b, x, y);
-+                  }
-+#ifndef REGRESSION
-+                  //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
-+#endif
-+               }
-+          if (bad)
-+            printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
-+          else
-+            printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
-+      }
-+      //printf("%d\n", simpenrose_get_qpu_tick_count());
-+   }
-+
-+   gpu_free(&out_buffer_ptr[0]);
-+   gpu_free(&out_buffer_ptr[1]);
-+   gpu_free(&in_buffer_ptr);
-+   gpu_free(&unifs_ptr);
-+
-+   return 0;
-+}
-+
-+void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
-+{
-+  int x,y;
-+  for (y=0; y<16; ++y) {
-+    for (x=0; x<16; ++x) {
-+       dst[x+y*dst_pitch] = filter8(in_buffer+x+y*src_pitch, src_pitch);
-+    }
-+  }
-+}
-+
-+void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst)
-+{
-+   uint32_t *unifs;
-+
-+   GPU_MEM_PTR_T unifs_ptr;
-+   //uint8_t *out_buffer;
-+   //GPU_MEM_PTR_T out_buffer_ptr;
-+
-+   // Addresses in GPU memory of filter programs
-+   uint32_t mc_setup = 0;
-+   uint32_t mc_filter = 0;
-+   uint32_t mc_exit = 0;
-+   //int x,y;
-+
-+   if (gpu==NULL) {
-+      gpu_lock();
-+      gpu_unlock();
-+   }
-+
-+   // Use table to compute locations of program start points
-+   mc_setup = code[0] + gpu->vc;
-+   mc_filter = code[1] + gpu->vc;
-+   mc_exit = code[2] + gpu->vc;
-+
-+   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
-+      return;
-+   }
-+   //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
-+   //out_buffer = (uint8_t*)out_buffer_ptr.arm;
-+
-+   /*for (y=0; y<16; ++y) {
-+      for (x=0; x<16; ++x) {
-+         out_buffer[x+y*dst_pitch] = 7;
-+      }
-+    }*/
-+
-+   unifs = (uint32_t*)unifs_ptr.arm;
-+
-+    unifs[0] = mc_filter;
-+    unifs[1] = (int)in_buffer_vc;
-+    unifs[2] = src_pitch; // src pitch
-+    unifs[3] = dst_pitch; // dst pitch
-+    unifs[4] = 0; // Padding
-+    unifs[5] = 0;
-+    unifs[6] = 0;
-+    unifs[7 ] = mc_exit;
-+    unifs[8 ] = (int)in_buffer_vc;
-+    unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-+    unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-+    unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-+    unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-+    unifs[13] = (int)dst_vc;
-+    //unifs[13] = (int)out_buffer_ptr.vc;
-+
-+    //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
-+
-+    qpu_run_shader(mc_setup, unifs_ptr.vc);
-+
-+    /*for (y=0; y<16; ++y) {
-+      for (x=0; x<16; ++x) {
-+         dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
-+      }
-+    }*/
-+
-+    gpu_free(&unifs_ptr);
-+    //gpu_free(&out_buffer_ptr);
-+}
-+
-+
-+
-+#endif
-+
 +#endif // RPI
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+diff --git b/libavcodec/rpi_qpu.h a/libavcodec/rpi_qpu.h
 new file mode 100644
-index 0000000..c6cdb2b
+index 0000000..bcde316
 --- /dev/null
-+++ b/libavcodec/rpi_qpu.h
-@@ -0,0 +1,176 @@
++++ a/libavcodec/rpi_qpu.h
+@@ -0,0 +1,204 @@
 +#ifndef RPI_QPU_H
 +#define RPI_QPU_H
 +
-+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
-+// *** N.B. Code has rotted & crashes if this is unset (before this set of changes)
-+#define RPI_FAST_CACHEFLUSH
++#include <interface/vcsm/user-vcsm.h>
 +
 +#define RPI_ONE_BUF 1
 +
@@ -12582,9 +12622,7 @@ index 0000000..c6cdb2b
 +// General GPU functions
 +extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
 +extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-+extern void gpu_free(GPU_MEM_PTR_T *p);
-+extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
-+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
++extern void gpu_free(GPU_MEM_PTR_T * const p);
 +
 +#include "libavutil/frame.h"
 +#if !RPI_ONE_BUF
@@ -12627,29 +12665,31 @@ index 0000000..c6cdb2b
 +    return av_buffer_get_opaque(frame->buf[0]);
 +}
 +
-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
 +{
 +    return av_buffer_pool_opaque(frame->buf[n]);
 +}
 +
++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
++{
++    const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
++    return gm->vc + (frame->data[n] - gm->arm);
++}
++
 +
 +static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+    return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc;
++    return get_vc_address3(frame, 0);
 +}
 +
 +static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+    return gpu_is_buf1(frame) ?
-+        gpu_buf1_gmem(frame)->vc + frame->data[1] - frame->data[0] :
-+        gpu_buf3_gmem(frame, 1)->vc;
++    return get_vc_address3(frame, 1);
 +}
 +
 +static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+    return gpu_is_buf1(frame) ?
-+        gpu_buf1_gmem(frame)->vc + frame->data[2] - frame->data[0] :
-+        gpu_buf3_gmem(frame, 2)->vc;
++    return get_vc_address3(frame, 2);
 +}
 +
-+
++#if 0
 +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
 +    if (gpu_is_buf1(frame))
 +    {
@@ -12686,30 +12726,44 @@ index 0000000..c6cdb2b
 +    else
 +        return *gpu_buf3_gmem(frame, 2);
 +}
-+
 +#endif
++#endif
++
++// Cache flush stuff
++
++typedef struct rpi_flush_envss {
++    unsigned int n;
++    struct vcsm_user_clean_invalid_s a;
++} rpi_cache_flush_env_t;
++
++rpi_cache_flush_env_t * rpi_cache_flush_init(void);
++// Free env without flushing
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & free the env
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
++
++typedef enum
++{
++    RPI_CACHE_FLUSH_MODE_INVALIDATE     = 1,
++    RPI_CACHE_FLUSH_MODE_WRITEBACK      = 2,
++    RPI_CACHE_FLUSH_MODE_WB_INVALIDATE  = 3
++} rpi_cache_flush_mode_t;
++
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
++  const unsigned int offset, const unsigned int size);
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
++  const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma);
++
++// init, add, finish for one gm ptr
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
 +
 +
 +// QPU specific functions
-+extern void rpi_test_qpu(void);
++uint32_t qpu_fn(const int * const mc_fn);
 +
-+enum {
-+  QPU_MC_SETUP,
-+  QPU_MC_FILTER,
-+  QPU_MC_EXIT,
-+  QPU_MC_INTERRUPT_EXIT12,
-+  QPU_MC_FILTER_B,
-+  QPU_MC_FILTER_HONLY,
-+  QPU_MC_SETUP_UV,
-+  QPU_MC_FILTER_UV,
-+  QPU_MC_FILTER_UV_B0,
-+  QPU_MC_FILTER_UV_B,
-+  QPU_MC_INTERRUPT_EXIT8,
-+  QPU_MC_END
-+  };
-+extern unsigned int qpu_get_fn(int num);
-+
-+#define QPU_N_UV   8
++#define QPU_N_UV   12
 +#define QPU_N_Y    12
 +#define QPU_N_MAX  16
 +
@@ -12718,16 +12772,32 @@ index 0000000..c6cdb2b
 +#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS)
 +#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t))
 +
++struct vpu_qpu_wait_s;  // NOTE(review): tag differs from vq_wait_s used in the typedef below - confirm which is intended
++typedef struct vq_wait_s * vpu_qpu_wait_h;  // Handle type; passed by address to vpu_qpu_job_add_sync_this() and vpu_qpu_wait()
++
 +// VPU specific functions
++
++struct vpu_qpu_job_env_s;
++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
++
++vpu_qpu_job_h vpu_qpu_job_new(void);
++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail);
++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
++
++
 +extern unsigned int vpu_get_fn(void);
 +extern unsigned int vpu_get_constants(void);
-+//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
-+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+    int qpu0_n, const uint32_t * qpu0_mail,
-+    int qpu1_n, const uint32_t * qpu1_mail);
 +
-+extern void vpu_wait( int id);
++// Waits for the previous post_code to complete and will null out *wait_h after use
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
++unsigned int vpu_qpu_current_load(void);
++int vpu_qpu_init(void);
++void vpu_qpu_term(void);
 +
 +// Simple test of shader code
 +extern int rpi_test_shader(void);
@@ -12738,12 +12808,12 @@ index 0000000..c6cdb2b
 +extern int gpu_get_mailbox(void);
 +
 +#endif
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+diff --git b/libavcodec/rpi_shader.c a/libavcodec/rpi_shader.c
 new file mode 100644
-index 0000000..06fb166
+index 0000000..627cda9
 --- /dev/null
-+++ b/libavcodec/rpi_shader.c
-@@ -0,0 +1,629 @@
++++ a/libavcodec/rpi_shader.c
+@@ -0,0 +1,624 @@
 +#include "rpi_shader.h"
 +
 +#ifdef _MSC_VER
@@ -12768,642 +12838,645 @@ index 0000000..06fb166
 +#endif
 +unsigned int rpi_shader[] = {
 +// ::mc_setup_uv
-+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
-+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
-+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
-+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000
-+/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
-+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
-+/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
-+/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-+/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-+/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-+/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-+/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
-+/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9
-+/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif
-+/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif
-+/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif
-+/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
-+/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
-+/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-+/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
++/* [0x00000000] */ 0x95801ff6, 0xd002591e, // mov tmurs, 1          ; mov ra_link, unif
++/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif
++/* [0x00000010] */ 0x159a7d80, 0x10020827, // mov r0, elem_num
++/* [0x00000018] */ 0x0c027c00, 0x14020427, // add ra_x, ra0.16b, r0
++/* [0x00000020] */ 0x15027d80, 0x12020767, // mov ra_y, ra0.16a
++/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
++/* [0x00000030] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000038] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
++/* [0x00000040] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
++/* [0x00000048] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
++/* [0x00000050] */ 0x15827d80, 0x10021427, // mov rb16, unif
++/* [0x00000058] */ 0x0c827380, 0x10021627, // add rb24, r1, unif
++/* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
++/* [0x00000068] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
++/* [0x00000070] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
++/* [0x00000078] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
++/* [0x00000080] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
++/* [0x00000088] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
++/* [0x00000090] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
++/* [0x00000098] */ 0x00000000, 0xe0020327, // mov ra12, 0
++/* [0x000000a0] */ 0x00000000, 0xe0020367, // mov ra13, 0
++/* [0x000000a8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
++/* [0x000000b0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
++/* [0x000000b8] */ 0x00000000, 0xe0020267, // mov ra9, 0
++/* [0x000000c0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
++/* [0x000000c8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0                      ; mov r1, ra_y
++/* [0x000000d0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
++/* [0x000000d8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3          ; mov r2, ra_u2v_ref_offset
++/* [0x000000e0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
++/* [0x000000e8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
++/* [0x000000f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
++/* [0x000000f8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0                      ; mov ra_x, r0
++/* [0x00000100] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000108] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
++/* [0x00000110] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
++/* [0x00000118] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
++/* [0x00000120] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif
++/* [0x00000128] */ 0x15827d80, 0x100009e7, // mov -, unif
++/* [0x00000130] */ 0x15827d80, 0x100009e7, // mov -, unif
++/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000140] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000150] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000158] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
++/* [0x00000160] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000168] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000170] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000178] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000180] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000188] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
++/* [0x00000190] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000198] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
++/* [0x000001a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000001a8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
++/* [0x000001b0] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
++/* [0x000001b8] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
 +// ::mc_filter_uv
-+/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0         ; mov r1, unif
-+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
-+/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3        ; mov ra1, unif
-+/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
-+/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1     ; mov vw_setup, rb28
-+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
-+/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
-+/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
-+/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
-+/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
-+/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
-+/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27    ; mov ra3, unif
-+/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif         ; mov rb8,  ra3.8a
-+/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif    ; mov rb9,  ra3.8b
-+/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop                   ; mov rb10, ra3.8c
-+/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
-+/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
-+/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
-+/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
++/* [0x000001c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000001c8] */ 0x15827d80, 0x100200a7, // mov ra2, unif
++/* [0x000001d0] */ 0x959a0dbf, 0x10024823, // mov r0, elem_num      ; mov r3, unif
++/* [0x000001d8] */ 0x0c0a7c00, 0x14020827, // add r0, ra2.16b, r0
++/* [0x000001e0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x000001e8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1
++/* [0x000001f0] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
++/* [0x000001f8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
++/* [0x00000200] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3        ; mov ra1, unif
++/* [0x00000208] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
++/* [0x00000210] */ 0x9509cdbf, 0x12024731, // mov ra_y_next, ra2.16a ; mov vw_setup, rb28
++/* [0x00000218] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
++/* [0x00000220] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
++/* [0x00000228] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
++/* [0x00000230] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
++/* [0x00000238] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
++/* [0x00000240] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9     ; mov -, vw_wait
++/* [0x00000248] */ 0x00000018, 0xf02809e7, // brr.anyz -, r:filter_uv_1
++/* [0x00000250] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
++/* [0x00000258] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
++/* [0x00000260] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27    ; mov ra3, unif
++/* [0x00000268] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16
++/* [0x00000270] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10
++/* [0x00000278] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11
++// :filter_uv_1
++/* [0x00000280] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000288] */ 0x950e0ff6, 0x18024048, // mov ra1, unif         ; mov rb8,  ra3.8a
++/* [0x00000290] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif    ; mov rb9,  ra3.8b
++/* [0x00000298] */ 0x800e7036, 0x1c0049ca, // nop                   ; mov rb10, ra3.8c
++/* [0x000002a0] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
++/* [0x000002a8] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
++/* [0x000002b0] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
++/* [0x000002b8] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
 +// :uvloop
-+/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
-+/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-+/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000330] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
-+/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
-+/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
-+/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
-+/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
-+/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
-+/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
-+/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
-+/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
-+/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
-+/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
-+/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
-+/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
-+/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x000002c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
++/* [0x000002c8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
++/* [0x000002d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x000002d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++/* [0x000002e0] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift    ; v8min r0, r0, rb_k255
++/* [0x000002e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x000002f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x000002f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
++/* [0x00000300] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8min r1, r1, rb_k255
++/* [0x00000308] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
++/* [0x00000310] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000318] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
++/* [0x00000320] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8     @ "mul_used", 0
++/* [0x00000328] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1     @ "mul_used", 0
++/* [0x00000330] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9     @ "mul_used", 0
++/* [0x00000338] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2     @ "mul_used", 0
++/* [0x00000340] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10    @ "mul_used", 0
++/* [0x00000348] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3     @ "mul_used", 0
++/* [0x00000350] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11    @ "mul_used", 0
++/* [0x00000358] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
++/* [0x00000360] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
++/* [0x00000368] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
++/* [0x00000370] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
++/* [0x00000378] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
++/* [0x00000380] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
++/* [0x00000388] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
++/* [0x00000390] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
++/* [0x00000398] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
++/* [0x000003a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
++/* [0x000003a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x000003b0] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
++/* [0x000003b8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
++/* [0x000003c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
++/* [0x000003c8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
++/* [0x000003d0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
++/* [0x000003d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
++/* [0x000003e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
++/* [0x000003e8] */ 0x959dafff, 0x10025c49, // mov vw_setup, rb26    ; mov ra9, rb26
++/* [0x000003f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000003f8] */ 0x959ddfff, 0x10025c4a, // mov vw_setup, rb29    ; mov ra10, rb29
++/* [0x00000400] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x00000408] */ 0x15827d80, 0x100202e7, // mov ra11, unif
 +// ::mc_filter_uv_b0
-+/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0                ; mov r1, unif
-+/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next
-+/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3  	     ; mov ra1, unif
-+/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3        ; mov ra0, unif
-+/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1            ; mov vw_setup, rb21
-+/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
-+/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
-+/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
-+/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
-+/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
-+/* [0x000004c0] */ 0x918101f6, 0xd0025803, // shl r0,   r0, i_shift16      ; mov ra3, unif
-+/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a
-+/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b
-+/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c
-+/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d
-+/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov      rb14, unif
-+/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif    ; mov r3, 0
++/* [0x00000410] */ 0x15827d80, 0x100009e7, // mov -, unif
++/* [0x00000418] */ 0x15827d80, 0x100200a7, // mov ra2, unif
++/* [0x00000420] */ 0x959a0dbf, 0x10024823, // mov r0, elem_num      ; mov r3, unif
++/* [0x00000428] */ 0x0c0a7c00, 0x14020827, // add r0, ra2.16b, r0
++/* [0x00000430] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000438] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1
++/* [0x00000440] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
++/* [0x00000448] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
++/* [0x00000450] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3        ; mov ra1, unif
++/* [0x00000458] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
++/* [0x00000460] */ 0x150a7d80, 0x12020727, // mov ra_y_next, ra2.16a
++/* [0x00000468] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
++/* [0x00000470] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
++/* [0x00000478] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
++/* [0x00000480] */ 0x0c043dc0, 0xd20207e7, // add ra31, ra1.16a, 3
++/* [0x00000488] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
++/* [0x00000490] */ 0x8c0601bf, 0x14025803, // add r0,   r0, ra1.16b        ; mov ra3, unif
++/* [0x00000498] */ 0x918101f6, 0xd002480e, // shl r0,   r0, i_shift16      ; mov rb14, unif
++/* [0x000004a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
++/* [0x000004a8] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a
++/* [0x000004b0] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b
++/* [0x000004b8] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c
++/* [0x000004c0] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d
++/* [0x000004c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x000004d0] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif    ; mov r3, 0
 +// :uvloop_b0
-+/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
-+/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-+/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000560] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
-+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
-+/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
-+/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
-+/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
-+/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
-+/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
-+/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
-+/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
-+/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
-+/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif
-+/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif
-+/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000004d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17  ; v8adds r3, r3, ra_k1          ; ldtmu0
++/* [0x000004e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next       ; ldtmu1
++/* [0x000004e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x000004f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
++/* [0x000004f8] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255
++/* [0x00000500] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x00000508] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000510] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1     ; mul24 r2, r2, r3
++/* [0x00000518] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2     ; v8min r1, r1, rb_k255
++/* [0x00000520] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
++/* [0x00000528] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000530] */ 0x40027030, 0x180049e3, // nop                   ; mul24      r3, ra0.8a,       r0
++/* [0x00000538] */ 0x40038031, 0xd800c9e3, // nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
++/* [0x00000540] */ 0x4003f030, 0xda0049e2, // nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
++/* [0x00000548] */ 0x40037031, 0xda00c9e2, // nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
++/* [0x00000550] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00000558] */ 0x40036031, 0xdc00c9e3, // nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00000560] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
++/* [0x00000568] */ 0x40035031, 0xde00c9e3, // nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x00000570] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3        ; mov r3, rb31
++/* [0x00000578] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4     ; mov ra12, ra13
++/* [0x00000580] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
++/* [0x00000588] */ 0x55389db7, 0x10024361, // mov ra13, ra14        ; mul24 r1, ra14, rb9
++/* [0x00000590] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15        ; mul24 r2, ra15, rb10
++/* [0x00000598] */ 0x55308037, 0x100243e0, // mov ra15, r0          ; mul24 r0, ra12, rb8
++/* [0x000005a0] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0        ; mov ra8.16b, ra7
++/* [0x000005a8] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2        ; mul24 r0, ra15, rb11
++/* [0x000005b0] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0        ; mov ra7, rb6
++/* [0x000005b8] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31
++/* [0x000005c0] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6    ; mov rb6, ra5
++/* [0x000005c8] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
++/* [0x000005d0] */ 0x95104ff6, 0x10024144, // mov ra5, rb4          ; mov rb4, ra4
++/* [0x000005d8] */ 0x95185ff6, 0x10024105, // mov ra4, rb5          ; mov rb5, ra6
++/* [0x000005e0] */ 0x95207ff6, 0x10024187, // mov ra6, rb7          ; mov rb7, ra8
++/* [0x000005e8] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3
++/* [0x000005f0] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin
++/* [0x000005f8] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3        ; mov -, unif
++/* [0x00000600] */ 0x95810ff6, 0xd0020827, // mov r0, i_shift16     ; mov -, unif
++/* [0x00000608] */ 0x00010000, 0xe0020867, // mov r1, 0x10000
++/* [0x00000610] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12
++/* [0x00000618] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1
++/* [0x00000620] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1
++/* [0x00000628] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1
++/* [0x00000630] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0      ; mul24 rb4, rb4, r1
++/* [0x00000638] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30
++/* [0x00000640] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4      ; mov.ifc rb6, rb5
++/* [0x00000648] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6      ; mov.ifc rb4, rb7
++/* [0x00000650] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin
++/* [0x00000658] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5      ; mov.ifn rb6, rb4
++/* [0x00000660] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4      ; mov.ifn rb4, rb5
++/* [0x00000668] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6      ; mov.ifn rb5, rb7
++// :uv_b0_post12
++/* [0x00000670] */ 0x95105dbf, 0x100248a3, // mov r2,  ra4          ; mov r3,  rb5
++/* [0x00000678] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0      ; mul24 rb5, rb6, r1
++/* [0x00000680] */ 0x959e749b, 0x100241c6, // mov ra7, r2           ; mov rb6, r3
++/* [0x00000688] */ 0x95187dbf, 0x100248a3, // mov r2, ra6           ; mov r3, rb7
++/* [0x00000690] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0      ; mul24 rb7, rb4, r1
++/* [0x00000698] */ 0x959e749b, 0x10024144, // mov ra5, r2           ; mov rb4, r3
 +// ::mc_filter_uv_b
-+/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
-+/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0                      ; mov ra_y_next, unif
-+/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8
-+/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3                     ; mov ra1, unif
-+/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3              ; mov ra0, unif
-+/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
-+/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
-+/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
-+/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
-+/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21     ; mov ra3, unif
-+/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
-+/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
-+/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov      ra1, unif  ; mov rb8,  ra3.8a
-+/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif  ; mov rb9,  ra3.8b
-+/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop                 ; mov rb10, ra3.8c
-+/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0           ; mov rb11, ra3.8d
-+/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
-+/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
++// :uv_b0_post_fin
++/* [0x000006a0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000006a8] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9       ; mov -, vw_wait
++/* [0x000006b0] */ 0x00000018, 0xf02809e7, // brr.anyz -, r:uv_filter_b_1
++/* [0x000006b8] */ 0x959a0ff6, 0x10024020, // mov ra0, unif         ; mov r0, elem_num
++/* [0x000006c0] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
++/* [0x000006c8] */ 0x0c027c00, 0x14020827, // add r0, ra0.16b, r0
++/* [0x000006d0] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16
++/* [0x000006d8] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10
++/* [0x000006e0] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11
++// :uv_filter_b_1
++/* [0x000006e8] */ 0x930001f6, 0xd202581c, // max r0, r0, 0                      ; mov ra_y_next, ra0.16a
++/* [0x000006f0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
++/* [0x000006f8] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8
++/* [0x00000700] */ 0x8c8270f6, 0x10020827, // add r0, r0, r3                     ; mov -, unif
++/* [0x00000708] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3              ; mov ra0, unif
++/* [0x00000710] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000720] */ 0x950e0ff6, 0x18024048, // mov      ra1, unif  ; mov rb8,  ra3.8a
++/* [0x00000728] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif  ; mov rb9,  ra3.8b
++/* [0x00000730] */ 0x8c0d3eb6, 0x1c02468a, // add ra_frame_base_next, rb_x_next, r2 ; mov rb10, ra3.8c
++/* [0x00000738] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0           ; mov rb11, ra3.8d
++/* [0x00000740] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
++/* [0x00000748] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
 +// :uvloop_b
-+/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
-+/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20
-+/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2         ; v8subs r1, r1, rb20
-+/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-+/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000748] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
-+/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
-+/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
-+/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
-+/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
-+/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
-+/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
-+/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
-+/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
-+/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0          ; mul24 r0, vpm, ra4
-+/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
-+/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
-+/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop                     ; mul24 r0, r0, rb14
-+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
-+/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
-+/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x00000750] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
++/* [0x00000758] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
++/* [0x00000760] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x00000768] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++/* [0x00000770] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift     ; v8min r0, r0, rb_k255
++/* [0x00000778] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x00000780] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000788] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
++/* [0x00000790] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2         ; v8min r1, r1, rb_k255
++/* [0x00000798] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
++/* [0x000007a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x000007a8] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
++/* [0x000007b0] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8     @ "mul_used", 0
++/* [0x000007b8] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1     @ "mul_used", 0
++/* [0x000007c0] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9     @ "mul_used", 0
++/* [0x000007c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2     @ "mul_used", 0
++/* [0x000007d0] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10    @ "mul_used", 0
++/* [0x000007d8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3     @ "mul_used", 0
++/* [0x000007e0] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11    @ "mul_used", 0
++/* [0x000007e8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
++/* [0x000007f0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
++/* [0x000007f8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
++/* [0x00000800] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
++/* [0x00000808] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15          ; mul24 r2, ra15, rb10
++/* [0x00000810] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
++/* [0x00000818] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0        ; mov ra8.16b, ra7
++/* [0x00000820] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2        ; mul24 r0, ra15, rb11
++/* [0x00000828] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0        ; mul24 r0, ra7.16b, rb14
++/* [0x00000830] */ 0x55586fce, 0x100241e1, // mov ra7, rb6          ; mul24 r1, r1, ra_k256
++/* [0x00000838] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14        ; mov rb6, ra5
++/* [0x00000840] */ 0x55044fce, 0x12024161, // mov ra5, rb4          ; mul24 r1, r1, ra1.16a
++/* [0x00000848] */ 0x8c127236, 0x10024844, // add r1, r1, r0        ; mov rb4, ra4
++/* [0x00000850] */ 0x55585fce, 0x10024121, // mov ra4, rb5          ; mul24 r1, r1, ra_k256
++/* [0x00000858] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12      ; mov rb5, ra6
++/* [0x00000860] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31  ; mov ra6, rb7
++/* [0x00000868] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b
++/* [0x00000870] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13
++/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait        ; mov rb7, ra8
++/* [0x00000880] */ 0x150e7d80, 0x18020c27, // mov vpm, ra3.8a
++/* [0x00000888] */ 0x959dafff, 0x10025c49, // mov vw_setup, rb26    ; mov ra9, rb26
++/* [0x00000890] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000898] */ 0x959ddfff, 0x10025c4a, // mov vw_setup, rb29    ; mov ra10, rb29
++/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x000008a8] */ 0x15827d80, 0x100202e7, // mov ra11, unif
++// ::mc_exit_c
++/* [0x000008b0] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9      ; mov -, vw_wait
++/* [0x000008b8] */ 0x00000020, 0xf02809e7, // brr.anyz -, r:exit_c_1
++/* [0x000008c0] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000008d8] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16
++/* [0x000008e0] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10
++/* [0x000008e8] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11
++/* [0x000008f0] */ 0x009e7000, 0x100009e7, // nop
 +// ::mc_exit
-+/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+// ::mc_interrupt_exit8
-+/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000908] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
++// :exit_c_1
++/* [0x000008f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00000900] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00000908] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00000910] */ 0x159f2fc0, 0xb00009e7, // mov  -, vw_wait ; nop ; ldtmu1
++/* [0x00000918] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
++/* [0x00000920] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
 +/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop        ; nop
++/* [0x00000930] */ 0x009e7000, 0x100009e7, // nop        ; nop
 +// ::mc_setup
-+/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16
-+/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif
++/* [0x00000938] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1          ; mov ra8, unif
 +/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif
 +/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif
 +/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif
-+/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif
-+/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
-+/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
-+/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
-+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8
-+/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-+/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
-+/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x000009f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-+/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10
-+/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-+/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
-+/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
-+/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
-+/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
-+/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
-+/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
-+/* [0x00000aa8] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
-+/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x00000ad8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
-+/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif
-+/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
-+/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
-+/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
++/* [0x00000958] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x00000960] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00000968] */ 0x0d0c1dc0, 0xd4021667, // sub rb_frame_width_minus_1, ra3.16b, 1
++/* [0x00000970] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_frame_height_minus_1, ra3.16a, 1
++/* [0x00000978] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000980] */ 0x15827380, 0x10021627, // or  rb24, r1, unif
++/* [0x00000988] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
++/* [0x00000990] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3
++/* [0x00000998] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x000009a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1
++/* [0x000009a8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
++/* [0x000009b0] */ 0x0c201dc0, 0xd4020767, // add ra_y, ra8.16b, 1
++/* [0x000009b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
++/* [0x000009c0] */ 0x0c267c00, 0x100208a7, // add r2, ra9, r0
++/* [0x000009c8] */ 0x13200dc0, 0xd4020867, // max r1, ra8.16b, 0
++/* [0x000009d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x000009d8] */ 0x409d000f, 0x100049e1, // nop                   ; mul24 r1, r1, rb_pitch
++/* [0x000009e0] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1       ; mov ra_frame_base, r2
++/* [0x000009e8] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3
++/* [0x000009f0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x000009f8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1
++/* [0x00000a00] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
++/* [0x00000a08] */ 0x0c281dc0, 0xd4120567, // add ra_y2, ra10.16b, 1
++/* [0x00000a10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
++/* [0x00000a18] */ 0x0c2e7c00, 0x100208a7, // add r2, ra11, r0
++/* [0x00000a20] */ 0x13280dc0, 0xd4020867, // max r1, ra10.16b, 0
++/* [0x00000a28] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000a30] */ 0x409d000f, 0x100049e1, // nop                   ; mul24 r1, r1, rb_pitch
++/* [0x00000a38] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1       ; mov ra_frame_base2, r2
++/* [0x00000a40] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
++/* [0x00000a48] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
++/* [0x00000a50] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
++/* [0x00000a58] */ 0x00000000, 0xe0024208, // mov ra8,  0           ; mov rb8,  0
++/* [0x00000a60] */ 0x00000000, 0xe0024249, // mov ra9,  0           ; mov rb9,  0
++/* [0x00000a68] */ 0x00000000, 0xe002428a, // mov ra10, 0           ; mov rb10, 0
++/* [0x00000a70] */ 0x00000000, 0xe00242cb, // mov ra11, 0           ; mov rb11, 0
++/* [0x00000a78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000a80] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000a88] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000a90] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000a98] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
++/* [0x00000aa0] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000aa8] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000ab0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000ab8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000ac0] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000ac8] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
++/* [0x00000ad0] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
++/* [0x00000ad8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000ae0] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
++/* [0x00000ae8] */ 0x55810d8f, 0x100049e1, // mov -, unif           ; mul24 r1, r1, rb_pitch
++/* [0x00000af0] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
++/* [0x00000af8] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
++/* [0x00000b00] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000b08] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
++/* [0x00000b10] */ 0x409d000f, 0x100049e1, // nop                   ; mul24 r1, r1, rb_pitch
++/* [0x00000b18] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
 +// :per_block_setup
-+/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif  ; mov r1, elem_num
-+/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
-+/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8                          ; mov ra_y_next, ra1.16b
-+/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
-+/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
-+/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
-+/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3                     ; mov ra_y2_next, ra1.16b
-+/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
-+/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
-+/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5
-+/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7
-+/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
-+/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
-+/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
-+/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27                 ; mov r0, unif
-+/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16
-+/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3
-+/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
-+/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
-+/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
-+/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400
-+/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
-+/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
-+/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
-+/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
-+/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
-+/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
-+/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
-+/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
-+/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
-+/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
-+/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
-+/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
-+/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
-+/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
-+/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
-+/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
-+/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
-+/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
-+/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d    ; mov r0, unif
-+/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c    ; mov r1, rb13
-+/* [0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1          ; mov rb4, ra3.8a
-+/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3          ; mov rb5, ra3.8b
-+/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3            ; mov rb6, ra3.8c
-+/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0                 ; mov rb7, ra3.8d
-+/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9
++/* [0x00000b20] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000b28] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b30] */ 0x959a0ff6, 0x10024061, // mov ra1, unif  ; mov r1, elem_num
++/* [0x00000b38] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
++/* [0x00000b40] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next
++/* [0x00000b48] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
++/* [0x00000b50] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000b58] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
++/* [0x00000b60] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
++/* [0x00000b68] */ 0x15067d80, 0x14020727, // mov ra_y_next, ra1.16b
++/* [0x00000b70] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
++/* [0x00000b78] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
++/* [0x00000b80] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
++/* [0x00000b88] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
++/* [0x00000b98] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
++/* [0x00000ba0] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b
++/* [0x00000ba8] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
++/* [0x00000bb0] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
++/* [0x00000bb8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
++/* [0x00000bc0] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
++/* [0x00000bc8] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5
++/* [0x00000bd0] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7
++/* [0x00000bd8] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
++/* [0x00000be0] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
++/* [0x00000be8] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
++/* [0x00000bf0] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27                 ; mov r0, unif
++/* [0x00000bf8] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16          ; mov ra5, unif
++/* [0x00000c00] */ 0x01040400, 0xe00208a7, // mov r2, 0x01040400
++/* [0x00000c08] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3                     ; mov rb14, ra5.16a
++/* [0x00000c10] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
++/* [0x00000c18] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
++/* [0x00000c20] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
++/* [0x00000c28] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d
++/* [0x00000c30] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c
++/* [0x00000c38] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
++/* [0x00000c40] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
++/* [0x00000c48] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
++/* [0x00000c50] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
++/* [0x00000c58] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
++/* [0x00000c60] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
++/* [0x00000c68] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
++/* [0x00000c70] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
++/* [0x00000c78] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
++/* [0x00000c80] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
++/* [0x00000c88] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
++/* [0x00000c90] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
++/* [0x00000c98] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
++/* [0x00000ca0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
++/* [0x00000ca8] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
++/* [0x00000cb0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
++/* [0x00000cb8] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d
++/* [0x00000cc0] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c
++/* [0x00000cc8] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a            ; mov ra18, unif
++/* [0x00000cd0] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b
++/* [0x00000cd8] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c
++/* [0x00000ce0] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18
++/* [0x00000ce8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000cf0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13
++/* [0x00000cf8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9
++/* [0x00000d00] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0                  ; mov rb7, ra3.8d
 +// ::mc_filter
-+/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15
++/* [0x00000d08] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1
 +// :yloop
-+/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
-+/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-+/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
-+/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00000df8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
-+/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
-+/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
-+/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
-+/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
-+/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
-+/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
-+/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
-+/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
-+/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
-+/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
-+/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
-+/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
-+/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
-+/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
-+/* [0x00000ea8] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
-+/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
-+/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
-+/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
-+/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
-+/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
-+/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
-+/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
-+/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
-+/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
-+/* [0x00000f00] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
-+/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
-+/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
-+/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
-+/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup
-+/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x00000d10] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
++/* [0x00000d18] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
++/* [0x00000d20] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x00000d28] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++/* [0x00000d30] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
++/* [0x00000d38] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x00000d40] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000d48] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
++/* [0x00000d50] */ 0x8c616c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8min r0, r0, rb_k255
++/* [0x00000d58] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00000d60] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000d68] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
++/* [0x00000d70] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8min r1, r1, rb_k255
++/* [0x00000d78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000d80] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
++/* [0x00000d88] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
++/* [0x00000d90] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
++/* [0x00000d98] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
++/* [0x00000da0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
++/* [0x00000da8] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
++/* [0x00000db0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
++/* [0x00000db8] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
++/* [0x00000dc0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
++/* [0x00000dc8] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
++/* [0x00000dd0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
++/* [0x00000dd8] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
++/* [0x00000de0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
++/* [0x00000de8] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
++/* [0x00000df0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
++/* [0x00000df8] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
++/* [0x00000e00] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
++/* [0x00000e08] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
++/* [0x00000e10] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
++/* [0x00000e18] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
++/* [0x00000e20] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
++/* [0x00000e28] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
++/* [0x00000e30] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
++/* [0x00000e38] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
++/* [0x00000e40] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
++/* [0x00000e48] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
++/* [0x00000e50] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
++/* [0x00000e58] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
++/* [0x00000e60] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
++/* [0x00000e68] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
++/* [0x00000e70] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
++/* [0x00000e78] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
++/* [0x00000e80] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
++/* [0x00000e88] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x00000e90] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
++/* [0x00000e98] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
++/* [0x00000ea0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
++/* [0x00000ea8] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
++/* [0x00000eb0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
++/* [0x00000eb8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
++/* [0x00000ec0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
++/* [0x00000ec8] */ 0xfffffc38, 0xf0f809e7, // brr -, r:per_block_setup
++/* [0x00000ed0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
++/* [0x00000ed8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x00000ee0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
 +// ::mc_filter_b
-+/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16
 +// :yloopb
-+/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
-+/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-+/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
-+/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
-+/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
-+/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
-+/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
-+/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
-+/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
-+/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
-+/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
-+/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
-+/* [0x00001050] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
-+/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
-+/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
-+/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
-+/* [0x00001070] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
-+/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
-+/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
-+/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
-+/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
-+/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
-+/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
-+/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
-+/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
-+/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
-+/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
-+/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
-+/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
-+/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0          ; mov r2, rb12
-+/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
-+/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop                     ; mul24 r0, r1, rb14
-+/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
-+/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
-+/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
-+/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup
-+/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x00000ee8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
++/* [0x00000ef0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
++/* [0x00000ef8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x00000f00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++/* [0x00000f08] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
++/* [0x00000f10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x00000f18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000f20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
++/* [0x00000f28] */ 0x8c616c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8min r0, r0, rb_k255
++/* [0x00000f30] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00000f38] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000f40] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
++/* [0x00000f48] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8min r1, r1, rb_k255
++/* [0x00000f50] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000f58] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
++/* [0x00000f60] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
++/* [0x00000f68] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
++/* [0x00000f70] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
++/* [0x00000f78] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
++/* [0x00000f80] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
++/* [0x00000f88] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
++/* [0x00000f90] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
++/* [0x00000f98] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
++/* [0x00000fa0] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
++/* [0x00000fa8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
++/* [0x00000fb0] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
++/* [0x00000fb8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
++/* [0x00000fc0] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
++/* [0x00000fc8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
++/* [0x00000fd0] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
++/* [0x00000fd8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
++/* [0x00000fe0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
++/* [0x00000fe8] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
++/* [0x00000ff0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
++/* [0x00000ff8] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
++/* [0x00001000] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
++/* [0x00001008] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
++/* [0x00001010] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
++/* [0x00001018] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
++/* [0x00001020] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
++/* [0x00001028] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
++/* [0x00001030] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
++/* [0x00001038] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
++/* [0x00001040] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
++/* [0x00001048] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
++/* [0x00001050] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0          ; mov r2, rb12
++/* [0x00001058] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
++/* [0x00001060] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x00001068] */ 0x409ce00f, 0x100049e0, // nop                     ; mul24 r0, r1, rb14
++/* [0x00001070] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2          ; mul24 r1, r1 << 8, ra18.16a << 8    @ "mul_used", 0
++/* [0x00001078] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
++/* [0x00001080] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
++/* [0x00001088] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
++/* [0x00001090] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
++/* [0x00001098] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
++/* [0x000010a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
++/* [0x000010a8] */ 0xfffffa58, 0xf0f809e7, // brr -, r:per_block_setup
++/* [0x000010b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
++/* [0x000010b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x000010c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++// ::mc_interrupt_exit12c
++/* [0x000010c8] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9      ; mov -, vw_wait
++/* [0x000010d0] */ 0x00000020, 0xf02809e7, // brr.anyz -, r:exit12_c_1
++/* [0x000010d8] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000010e8] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000010f0] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16
++/* [0x000010f8] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10
++/* [0x00001100] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11
++/* [0x00001108] */ 0x00000000, 0xe0020267, // mov ra9, 0
 +// ::mc_interrupt_exit12
-+/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00001170] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
++// :exit12_c_1
++/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00001118] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00001120] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00001128] */ 0x159f2fc0, 0xb00009e7, // mov  -, vw_wait ; nop ; ldtmu1
++/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001178] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001180] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001188] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
++/* [0x00001190] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
++/* [0x00001198] */ 0x009e7000, 0x100009e7, // nop        ; nop
 +// ::mc_exit1
-+/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop        ; nop
++/* [0x000011a0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
++/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x000011b0] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x000011b8] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x000011c0] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x000011c8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
++/* [0x000011d0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
++/* [0x000011d8] */ 0x009e7000, 0x100009e7, // nop        ; nop
 +// ::mc_end
 +};
 +#ifdef __HIGHC__
 +#pragma Align_to(8, rpi_shader)
 +#endif
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+diff --git b/libavcodec/rpi_shader.h a/libavcodec/rpi_shader.h
 new file mode 100644
-index 0000000..9772796
+index 0000000..3b1229e
 --- /dev/null
-+++ b/libavcodec/rpi_shader.h
-@@ -0,0 +1,19 @@
++++ a/libavcodec/rpi_shader.h
+@@ -0,0 +1,20 @@
 +#ifndef rpi_shader_H
 +#define rpi_shader_H
 +
 +extern unsigned int rpi_shader[];
 +
 +#define mc_setup_uv (rpi_shader + 0)
-+#define mc_filter_uv (rpi_shader + 132)
-+#define mc_filter_uv_b0 (rpi_shader + 274)
-+#define mc_filter_uv_b (rpi_shader + 392)
-+#define mc_exit (rpi_shader + 540)
-+#define mc_interrupt_exit8 (rpi_shader + 558)
-+#define mc_setup (rpi_shader + 588)
-+#define mc_filter (rpi_shader + 872)
-+#define mc_filter_b (rpi_shader + 992)
-+#define mc_interrupt_exit12 (rpi_shader + 1114)
-+#define mc_exit1 (rpi_shader + 1152)
-+#define mc_end (rpi_shader + 1168)
++#define mc_filter_uv (rpi_shader + 112)
++#define mc_filter_uv_b0 (rpi_shader + 260)
++#define mc_filter_uv_b (rpi_shader + 424)
++#define mc_exit_c (rpi_shader + 556)
++#define mc_exit (rpi_shader + 574)
++#define mc_setup (rpi_shader + 590)
++#define mc_filter (rpi_shader + 834)
++#define mc_filter_b (rpi_shader + 954)
++#define mc_interrupt_exit12c (rpi_shader + 1074)
++#define mc_interrupt_exit12 (rpi_shader + 1092)
++#define mc_exit1 (rpi_shader + 1128)
++#define mc_end (rpi_shader + 1144)
 +
 +#endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+diff --git b/libavcodec/rpi_shader.qasm a/libavcodec/rpi_shader.qasm
 new file mode 100644
-index 0000000..aa9e1e7
+index 0000000..6fd6af5
 --- /dev/null
-+++ b/libavcodec/rpi_shader.qasm
-@@ -0,0 +1,1098 @@
++++ a/libavcodec/rpi_shader.qasm
+@@ -0,0 +1,1150 @@
++
++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
++# the warning that we are using rotation & ra/rb registers. r0..3 can be
++# rotated through all 16 elems; ra regs can only be rotated through their
++# local 4.  As it happens this is what is wanted here as we do not want the
++# constants from the other half of the calc.
++
 +# register allocation
 +#
 +# ra0...ra7                                     eight horizontal filter coefficients
@@ -13420,7 +13493,7 @@ index 0000000..aa9e1e7
 +#
 +# rb8...rb11                                    eight vertical filter coefficients
 +
-+# ra4                                           y: Fiter, UV: 0x10000
++# ra4                                           y: Filter, UV: part of b0 -> b stash
 +
 +# rb12                                          offset to add before shift (round + weighting offsets)
 +# rb13                                          shift: denom + 6 + 9
@@ -13442,10 +13515,10 @@ index 0000000..aa9e1e7
 +# ra22 ra_k256                                  256
 +# ra23 ra_y2_next                               ra_y2_next
 +#
-+# rb20                                          0xffffff00
-+# rb21                                          vpm_setup for reading/writing 16bit results into VPM
++# rb20                                          -- free --
++# rb21                                          -- free --
 +# rb22 rb_k255                                  255
-+# rb23                                          24
++# rb23                                          -- free --
 +#
 +# rb24                                          vdw_setup_1(dst_pitch)
 +# rb25                                          frame width-1
@@ -13462,9 +13535,10 @@ index 0000000..aa9e1e7
 +# ra27                                          next ra25
 +# ra28                                          next y
 +# ra29                                          y for next texture access
-+# ra30                                          64
 +#
-+# ra31                                          next kernel address
++# Use an even numbered register as a link register to avoid corrupting flags
++# ra30                                          next kernel address
++# ra31                                          chroma-B height+3; free otherwise
 +
 +.set rb_frame_width_minus_1,       rb25
 +.set rb_frame_height_minus_1,      rb30
@@ -13496,22 +13570,46 @@ index 0000000..aa9e1e7
 +.set rb_k255,                      rb22
 +.set ra_k256,                      ra22
 +
++.set ra_link,                      ra30
++
 +# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
 +.set i_shift16,                    -16
 +.set i_shift21,                    -11
++.set i_shift30,                     -2
++
++# Much of the setup code is common between Y & C
++# Macros that express this - obviously these can't be overlapped
++# so are probably unsuitable for loop code
++
++.macro m_calc_dma_regs, r_vpm, r_dma
++  mov r2, qpu_num
++  asr r1, r2, 2
++  shl r1, r1, 6
++  and r0, r2, 3
++  or  r0, r0, r1
++
++  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
++  add r_vpm, r0, r1  # VPM 8bit storage
++
++  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
++  shl r0, r0, 5
++  add r_dma, r0, r1  # DMA out
++.endm
++
 +
 +################################################################################
 +# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
 +::mc_setup_uv
-+
-+# Read starting kernel
-+mov ra31, unif
++  mov tmurs, 1          ; mov ra_link, unif        # No swap TMUs ; Next fn
 +
 +# Load first request location
-+add ra_x, unif, elem_num # Store x
-+mov ra_y, unif # Store y
++mov ra0, unif
++mov r0, elem_num
++
++add ra_x, ra0.16b, r0   # Store x
++mov ra_y, ra0.16a       # Store y
 +mov ra_frame_base, unif # Store frame u base
-+nop
++mov r1, vdw_setup_1(0)  # Merged with dst_stride shortly, delay slot for ra_frame_base
 +sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
 +
 +# Read image dimensions
@@ -13521,77 +13619,59 @@ index 0000000..aa9e1e7
 +# get source pitch
 +mov rb16, unif
 +
-+# get destination pitch
-+mov r0, unif
-+mov r1, vdw_setup_1(0)
-+add rb24, r1, r0
++# get destination vdw setup
++add rb24, r1, unif      # dst_stride
 +
 +# load constants
++  mov ra_k1, 1
++  mov ra_k256, 256
++  mov rb_k255, 255
 +
-+mov ra4, 0x10000
-+mov ra_k1, 1
-+mov ra_k256, 256
-+mov ra30, 64
++# touch registers to keep simulator happy
 +
-+mov rb20, 0xffffff00
-+mov rb_k255, 255
-+mov rb23, 24
++  # ra/b4..7: B0 -> B stash registers
++  mov ra4, 0 ; mov rb4, 0
++  mov ra5, 0 ; mov rb5, 0
++  mov ra6, 0 ; mov rb6, 0
++  mov ra7, 0 ; mov rb7, 0
 +
-+# touch vertical context to keep simulator happy
++  # ra12..15: vertical scroll registers
++  mov ra12, 0
++  mov ra13, 0
++  mov ra14, 0
++  mov ra15, 0
 +
-+mov ra8, 0
-+mov ra9, 0
-+mov ra10, 0
-+mov ra11, 0
-+mov ra12, 0
-+mov ra13, 0
-+mov ra14, 0
-+mov ra15, 0
++  # ra9 - delayed setup - must be 0 initially
++  mov ra9, 0
 +
 +# Compute base address for first and second access
 +mov r0, ra_x           # Load x
-+max r0, r0, 0; mov r1, ra_y # Load y
++max r0, r0, 0                      ; mov r1, ra_y # Load y
 +min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base  # Load the frame base
-+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
++shl ra_xshift_next, r0, 3          ; mov r2, ra_u2v_ref_offset
 +add ra_y, r1, 1
 +add r0, r0, r3
 +and r0, r0, ~3
-+max r1, r1, 0 ; mov ra_x, r0 # y
++max r1, r1, 0                      ; mov ra_x, r0 # y
 +min r1, r1, rb_frame_height_minus_1
 +# submit texture requests for first line
 +add r2, r2, r0 ; mul24 r1, r1, rb_pitch
 +add t0s, r0, r1 ; mov ra_frame_base, r2
 +add t1s, r2, r1
 +
-+mov r2, 9
-+add rb13, r2, unif  # denominator
++add rb13, 9, unif   # denominator
 +mov -, unif         # Unused
 +
-+# Compute part of VPM to use for DMA output
-+mov r2, unif
-+shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-+and r2, r2, 15
-+mov r1, r2
-+asr r1, r1, 2
-+shl r1, r1, 6
-+mov r0, r2
-+and r0, r0, 3
-+add r0, r0, r1
++mov -, unif   # ??? same as (register) qpu_num
 +
-+mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+add rb28, r0, r1  # VPM 8bit storage
-+asr r2, r0, 1     # r0 = bc0000d
-+mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
-+add rb21, r2, r1  # VPM for 16bit intermediates
-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+shl r0, r0, 5
-+add rb27, r0, r1  # DMA out
++# Compute part of VPM to use for DMA output
++m_calc_dma_regs rb28, rb27
 +
 +# submit texture requests for second line
 +max r1, ra_y, 0
 +min r1, r1, rb_frame_height_minus_1
 +add ra_y, ra_y, 1
-+bra -, ra31
++bra -, ra_link
 +nop ; mul24 r1, r1, rb_pitch
 +add t0s, r1, ra_x
 +add t1s, r1, ra_frame_base
@@ -13605,20 +13685,24 @@ index 0000000..aa9e1e7
 +# At this point we have already issued two pairs of texture requests for the current block
 +# ra_x, ra_x16_base point to the current coordinates for this block
 +::mc_filter_uv
-+mov ra31, unif
++mov ra_link, unif
 +
 +# per-channel shifts were calculated on the *previous* invocation
 +
 +# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num    # x
-+max r0, r0, 0         ; mov r1, unif # y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
++mov ra2, unif         # x_y
++mov r0, elem_num      ; mov r3, unif          # frame_base
++
++add r0, ra2.16b, r0   # x
++max r0, r0, 0
++min r0, r0, rb_frame_width_minus_1
 +# compute offset from frame base u to frame base v
 +sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
 +shl ra_xshift_next, r0, 3
 +add r0, r0, r3        ; mov ra1, unif  # ; width_height
 +and rb_x_next, r0, ~3 ; mov ra0, unif  # H filter coeffs
-+mov ra_y_next, r1     ; mov vw_setup, rb28
++mov ra_y_next, ra2.16a ; mov vw_setup, rb28
++
 +add ra_frame_base_next, rb_x_next, r2
 +
 +# set up VPM write
@@ -13628,9 +13712,19 @@ index 0000000..aa9e1e7
 +add rb17, ra1.16a, 1
 +add rb18, ra1.16a, 3
 +shl r0,   ra1.16a, 7
++
++  mov.setf -, ra9     ; mov -, vw_wait
++  brr.anyz -, r:filter_uv_1
++
 +add r0,   r0, ra1.16b    # Combine width and height of destination area
 +shl r0,   r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
 +add rb26, r0, rb27    ; mov ra3, unif  # ; V filter coeffs
++# >>> (skip V DMA if never requested)
++
++  sub vw_setup, ra9, -16
++  mov vw_setup, ra10
++  mov vw_addr, ra11
++:filter_uv_1
 +
 +mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
 +
@@ -13662,12 +13756,12 @@ index 0000000..aa9e1e7
 +shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
 +mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
 +mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
++shr r1, r4, ra_xshift    ; v8min r0, r0, rb_k255  # v8min masks out all but bottom byte
 +
 +max r2, ra_y, 0  # y
 +min r2, r2, rb_frame_height_minus_1
 +add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
++add t0s, ra_x, r2    ; v8min r1, r1, rb_k255
 +add t1s, ra_frame_base, r2
 +
 +# generate seven shifted versions
@@ -13677,13 +13771,13 @@ index 0000000..aa9e1e7
 +
 +# apply horizontal filter
 +nop                  ; mul24      r3, ra0.8a,       r0
-+nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
-+nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
-+nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
-+sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
-+nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
-+nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8     @ "mul_used", 0
++nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1     @ "mul_used", 0
++nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9     @ "mul_used", 0
++sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2     @ "mul_used", 0
++nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10    @ "mul_used", 0
++add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3     @ "mul_used", 0
++nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11    @ "mul_used", 0
 +sub r0, r2, r3       ; mov r3, rb31
 +sub.setf -, r3, 4    ; mov ra12, ra13
 +brr.anyn -, r:uvloop
@@ -13707,24 +13801,15 @@ index 0000000..aa9e1e7
 +asr r1, r1, rb13
 +min r1, r1, rb_k255       # Delay 2
 +max vpm, r1, 0         # Delay 3
++# >>>
 +
-+# DMA out for U
-+
-+mov vw_setup, rb26 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
-+# DMA out for V
-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+# Could potentially push this write into the start of the next pipeline stage.
-+mov r0, 16
-+mov -, vw_wait
-+
-+bra -, ra31
-+add vw_setup, rb26, r0 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
++# DMA out for U & stash for V
++  mov vw_setup, rb26    ; mov ra9, rb26 # VDW setup 0
++  bra -, ra_link
++  mov vw_setup, rb29    ; mov ra10, rb29 # Stride
++  mov vw_addr, unif     # u_dst_addr
++  mov ra11, unif        # v_dst_addr
++# >>>
 +
 +################################################################################
 +
@@ -13733,19 +13818,23 @@ index 0000000..aa9e1e7
 +# At this point we have already issued two pairs of texture requests for the current block
 +# ra_x, ra_x16_base point to the current coordinates for this block
 +::mc_filter_uv_b0
-+mov ra31, unif
++mov -, unif                  # Ignore chain address - always "b"
 +
 +# per-channel shifts were calculated on the *previous* invocation
 +
 +# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num       # x
-+max r0, r0, 0                ; mov r1, unif # y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ;
++mov ra2, unif         # x_y
++mov r0, elem_num      ; mov r3, unif          # frame_base
++
++add r0, ra2.16b, r0   # x
++max r0, r0, 0
++min r0, r0, rb_frame_width_minus_1
++# compute offset from frame base u to frame base v
++sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
 +shl ra_xshift_next, r0, 3
-+add r0, r0, r3  	     ; mov ra1, unif   # ; width_height
-+and rb_x_next, r0, ~3        ; mov ra0, unif   # ; H filter coeffs
-+mov ra_y_next, r1            ; mov vw_setup, rb21
++add r0, r0, r3        ; mov ra1, unif  # ; width_height
++and rb_x_next, r0, ~3 ; mov ra0, unif  # H filter coeffs
++mov ra_y_next, ra2.16a
 +
 +add ra_frame_base_next, rb_x_next, r2
 +
@@ -13753,14 +13842,12 @@ index 0000000..aa9e1e7
 +# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the
 +# filter code. Unpack into b regs for V
 +
-+# set up VPM write, we need to save 16bit precision
-+
 +sub rb29, rb24, ra1.16b         # Compute vdw_setup1(dst_pitch-width)
 +add rb17, ra1.16a, 1
-+add rb18, ra1.16a, 3
++add ra31, ra1.16a, 3
 +shl r0,   ra1.16a, 7
-+add r0,   r0, ra1.16b           # Combine width and height of destination area
-+shl r0,   r0, i_shift16      ; mov ra3, unif  # ; V filter coeffs
++add r0,   r0, ra1.16b        ; mov ra3, unif   # Combine width and height of destination area ; V filter coeffs
++shl r0,   r0, i_shift16      ; mov rb14, unif  # U weight L0
 +add rb26, r0, rb27
 +
 +mov rb8, ra3.8a
@@ -13773,8 +13860,8 @@ index 0000000..aa9e1e7
 +
 +mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
 +
-+mov      rb14, unif                 # U weight L0
 +mov.ifnz rb14, unif    ; mov r3, 0  # V weight L0 ; Loop counter
++
 +# rb14 unused in b0 but will hang around till the second pass
 +
 +# retrieve texture results and pick out bytes
@@ -13785,62 +13872,127 @@ index 0000000..aa9e1e7
 +# retrieve texture results and pick out bytes
 +# then submit two more texture requests
 +
-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
++  sub.setf -, r3, rb17  ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
++  shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next       ; ldtmu1
++  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++  mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
++  shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255          # v8min masks out all but bottom byte
 +
-+max r2, ra_y, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+add t1s, ra_frame_base, r2
++  max r2, ra_y, 0       # y
++  min r2, r2, rb_frame_height_minus_1
++  add ra_y, ra_y, 1     ; mul24 r2, r2, r3
++  add t0s, ra_x, r2     ; v8min r1, r1, rb_k255
++  add t1s, ra_frame_base, r2
 +
 +# generate seven shifted versions
 +# interleave with scroll of vertical context
 +
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
 +
-+nop                  ; mul24      r3, ra0.8a,       r0
-+nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
-+nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
-+nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
-+sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
-+nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
-+nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+sub r0, r2, r3       ; mov r3, rb31
-+sub.setf -, r3, 4    ; mov ra12, ra13
-+brr.anyn -, r:uvloop_b0
-+mov ra13, ra14          ; mul24 r1, ra14, rb9  # ra14 is about to be ra13
-+mov ra14, ra15
-+mov ra15, r0            ; mul24 r0, ra12, rb8
++  nop                   ; mul24      r3, ra0.8a,       r0
++  nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
++  nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
++  nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
++  sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
++  nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++  add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
++  nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++  sub r0, r2, r3        ; mov r3, rb31
++  sub.setf -, r3, 4     ; mov ra12, ra13
++  brr.anyn -, r:uvloop_b0
++  mov ra13, ra14        ; mul24 r1, ra14, rb9   # ra14 is about to be ra13
++  mov ra14, ra15        ; mul24 r2, ra15, rb10  # ra15 is about to be ra14
++  mov ra15, r0          ; mul24 r0, ra12, rb8
 +# >>> .anyn uvloop_b0
 +
-+# apply vertical filter and write to VPM
++# apply vertical filter and write to B-FIFO
 +
-+sub r1, r1, r0          ; mul24 r0, ra14, rb10
-+sub.setf -, r3, rb18
-+brr.anyn -, r:uvloop_b0
-+add r1, r1, r0          ; mul24 r0, ra15, rb11
-+sub r1, r1, r0          ; mov -, vw_wait
-+asr vpm, r1, 6
-+# >>> .anyn uvloop_b0
++  sub r1, r1, r0        ; mov ra8.16b, ra7      # start of B FIFO writes
++  add r1, r1, r2        ; mul24 r0, ra15, rb11  # N.B. ra15 write gap
++  sub r1, r1, r0        ; mov ra7, rb6
 +
-+# in pass0 we don't really need to save any results, but need to discard the uniforms
-+# DMA out for U
++# FIFO goes:
++# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b
++# This arrangement optimizes the inner loop FIFOs at the expense of making the
++# bulk shift between loops quite a bit nastier
++# a8 used as temp
 +
-+bra -, ra31
-+mov -, unif           # Delay 1
-+mov -, unif           # Delay 2
-+nop                   # Delay 3
++  sub.setf -, r3, ra31
++  asr ra8.16a, r1, 6    ; mov rb6, ra5          # This discards the high bits that might be bad
++  brr.anyn -, r:uvloop_b0
++  mov ra5, rb4          ; mov rb4, ra4
++  mov ra4, rb5          ; mov rb5, ra6
++  mov ra6, rb7          ; mov rb7, ra8
++# >>>
 +
++# 1st half done all results now in the a/b4..7 fifo
++
++# Need to bulk rotate FIFO for heights other than 16
++# plausible heights are 16, 12, 8, 6, 4, 3, 2 and that is all we deal with
++# we are allowed 3/4 cb_size w/h :-(
++
++# Destination uniforms discarded
++# At the end drop through to _b - we will always do b after b0
++
++  sub.setf -, 15, r3    # 12 + 3 of preroll
++  brr.anyn -, r:uv_b0_post_fin                  # h > 12 (n) => 16 (do nothing)
++  sub r3, 11, r3        ; mov -, unif           # r3 = shifts wanted ; Discard u_dst_addr
++  mov r0, i_shift16     ; mov -, unif           # ; Discard v_dst_addr
++  mov r1, 0x10000
++# >>>
++  brr.anyz -, r:uv_b0_post12                    # h == 12 deal with specially
++# If h != 16 && h != 12 then h <= 8 so
++# shift 8 with discard (.16b = .16a on all regs)
++  shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1
++  shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1
++  shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1
++# >>>
++  shl ra4, ra4, r0      ; mul24 rb4, rb4, r1
++
++  shl.setf -, r3, i_shift30  # b2 -> C, b1 -> N
++# Shift 4
++  mov.ifc ra7, ra4      ; mov.ifc rb6, rb5
++  mov.ifc ra5, ra6      ; mov.ifc rb4, rb7
++  # If we shifted by 4 here then the max length remaining is 4
++  # so that is it
++
++  brr -, r:uv_b0_post_fin
++# Shift 2
++  mov.ifn ra7, ra5      ; mov.ifn rb6, rb4
++  mov.ifn ra5, ra4      ; mov.ifn rb4, rb5
++  mov.ifn ra4, ra6      ; mov.ifn rb5, rb7
++  # 6 / 2 so need 6 outputs
++# >>>
++
++:uv_b0_post12
++# this one is annoying as we need to swap halves of things that don't
++# really want to be swapped
++
++# b7a, a6a, b5a, a4a
++# b4a, a5a, b6a, a7a
++# b7b, a6b, b5b, a4b
++# b4b, a5b, b6b, a7b
++
++  mov r2,  ra4          ; mov r3,  rb5
++  shl ra4, ra7, r0      ; mul24 rb5, rb6, r1
++  mov ra7, r2           ; mov rb6, r3
++
++  mov r2, ra6           ; mov r3, rb7
++  shl ra6, ra5, r0      ; mul24 rb7, rb4, r1
++  mov ra5, r2           ; mov rb4, r3
++
++:uv_b0_post_fin
++  # drop through
 +
 +################################################################################
 +
 +::mc_filter_uv_b
-+mov ra31, unif
++
++  mov ra_link, unif
++  mov.setf -, ra9       ; mov -, vw_wait  # Delayed V DMA
++  brr.anyz -, r:uv_filter_b_1
++
++  mov ra0, unif         ; mov r0, elem_num
 +
 +# per-channel shifts were calculated on the *previous* invocation
 +
@@ -13848,30 +14000,23 @@ index 0000000..aa9e1e7
 +mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
 +
 +# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num    # x
-+max r0, r0, 0                      ; mov ra_y_next, unif # y
++add r0, ra0.16b, r0    # x
++# >>>
++  sub vw_setup, ra9, -16
++  mov vw_setup, ra10
++  mov vw_addr, ra11
++:uv_filter_b_1
++
++max r0, r0, 0                      ; mov ra_y_next, ra0.16a # y
 +min r0, r0, rb_frame_width_minus_1 ; mov r3, unif        # V frame_base
 +# compute offset from frame base u to frame base v
 +sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8 # U frame_base
-+add r0, r0, r3                     ; mov ra1, unif       # width_height
++add r0, r0, r3                     ; mov -, unif         # discard width_height
 +and rb_x_next, r0, ~3              ; mov ra0, unif       # H filter coeffs
 +
-+sub rb29, rb24, ra1.16b  # Compute vdw_setup1(dst_pitch-width)
-+add rb17, ra1.16a, 1
-+add rb18, ra1.16a, 3
-+shl r0,   ra1.16a, 7
++# rb17, rb26, rb29, ra31 inherited from B0 as w/h must be the same
 +
-+add ra_frame_base_next, rb_x_next, r2
-+
-+# r0 is currently height<<7
-+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
-+shl r3, r0, i_shift21     ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs
-+shr r3, r3, 8
-+add vr_setup, r3, rb21
-+
-+add r0, r0, ra1.16b    # Combine width and height of destination area
-+shl r0, r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
-+add rb26, r0, rb27
++mov ra3, unif #  V filter coeffs
 +
 +# get filter coefficients
 +
@@ -13882,7 +14027,7 @@ index 0000000..aa9e1e7
 +# The unif read occurs unconditionally, only the write is conditional
 +mov      ra1, unif  ; mov rb8,  ra3.8a    # U offset/weight ;
 +mov.ifnz ra1, unif  ; mov rb9,  ra3.8b    # V offset/weight ;
-+nop                 ; mov rb10, ra3.8c
++add ra_frame_base_next, rb_x_next, r2 ; mov rb10, ra3.8c
 +mov r3, 0           ; mov rb11, ra3.8d    # Loop counter ;
 +
 +shl r1, ra1.16b, rb13
@@ -13902,12 +14047,12 @@ index 0000000..aa9e1e7
 +shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
 +mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
 +mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
++shr r1, r4, ra_xshift     ; v8min r0, r0, rb_k255  # v8min masks out all but bottom byte
 +
 +max r2, ra_y, 0  # y
 +min r2, r2, rb_frame_height_minus_1
 +add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+add t0s, ra_x, r2         ; v8subs r1, r1, rb20
++add t0s, ra_x, r2         ; v8min r1, r1, rb_k255
 +add t1s, ra_frame_base, r2
 +
 +# generate seven shifted versions
@@ -13916,100 +14061,105 @@ index 0000000..aa9e1e7
 +mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
 +
 +nop                  ; mul24      r3, ra0.8a,       r0
-+nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
-+nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
-+nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
-+sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
-+nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
-+nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8     @ "mul_used", 0
++nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1     @ "mul_used", 0
++nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9     @ "mul_used", 0
++sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2     @ "mul_used", 0
++nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10    @ "mul_used", 0
++add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3     @ "mul_used", 0
++nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11    @ "mul_used", 0
 +sub r0, r2, r3       ; mov r3, rb31
 +sub.setf -, r3, 4    ; mov ra12, ra13
 +brr.anyn -, r:uvloop_b
 +mov ra13, ra14          ; mul24 r1, ra14, rb9
-+mov ra14, ra15
++mov ra14, ra15          ; mul24 r2, ra15, rb10
 +mov ra15, r0            ; mul24 r0, ra12, rb8
 +# >>> .anyn uvloop_b
 +
 +# apply vertical filter and write to VPM
 +
-+sub r1, r1, r0          ; mul24 r0, ra14, rb10
-+add r1, r1, r0          ; mul24 r0, ra15, rb11
-+# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it
-+sub r1, r1, r0          ; mul24 r0, vpm, ra4  # ra4 = 0x10000
-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
-+asr r1, r1, 14          # shift2=6
++  sub r1, r1, r0        ; mov ra8.16b, ra7      # FIFO rotate (all ra/b4..7)
++  add r1, r1, r2        ; mul24 r0, ra15, rb11
++  sub r1, r1, r0        ; mul24 r0, ra7.16b, rb14
++  mov ra7, rb6          ; mul24 r1, r1, ra_k256
++  asr r1, r1, 14        ; mov rb6, ra5 # shift2=6
 +
-+asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
-+nop                     ; mul24 r0, r0, rb14
++  mov ra5, rb4          ; mul24 r1, r1, ra1.16a
++  add r1, r1, r0        ; mov rb4, ra4
 +
-+add r1, r1, r0          ; mov -, vw_wait
-+shl r1, r1, 8           # Lose bad top 8 bits & sign extend
++  mov ra4, rb5          ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
++  add r1, r1, rb12      ; mov rb5, ra6          # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
 +
-+add r1, r1, rb12        # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
++  sub.setf -, r3, ra31  ; mov ra6, rb7
++  brr.anyn -, r:uvloop_b
++  asr ra3.8as, r1, rb13
++  mov -, vw_wait        ; mov rb7, ra8          #  vw_wait is B-reg (annoyingly) ; Final FIFO mov
++  mov vpm, ra3.8a
++# >>>
 +
-+brr.anyn -, r:uvloop_b
-+asr r1, r1, rb13         # Delay 1
-+min r1, r1, rb_k255       # Delay 2
-+max vpm, r1, 0         # Delay 3
++# DMA out for U & stash for V
++
++  mov vw_setup, rb26    ; mov ra9, rb26 # VDW setup 0
++  bra -, ra_link
++  mov vw_setup, rb29    ; mov ra10, rb29 # Stride
++  mov vw_addr, unif     # u_dst_addr
++  mov ra11, unif        # v_dst_addr
 +
 +
-+# DMA out for U
-+
-+mov vw_setup, rb26 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
-+# DMA out for V
-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+# Could potentially push this write into the start of the next pipeline stage.
-+mov r0, 16
-+mov -, vw_wait
-+
-+bra -, ra31
-+add vw_setup, rb26, r0 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
 +
 +################################################################################
 +
 +# mc_exit()
 +
++::mc_exit_c
++  mov.setf -, ra9      ; mov -, vw_wait
++# Annoyingly it looks like condition codes don't work on writes to special
++# registers so we have to branch around the writes
++  brr.anyz -, r:exit_c_1
++  nop
++  nop
++  nop
++# >>>
++
++  sub vw_setup, ra9, -16
++  mov vw_setup, ra10
++  mov vw_addr, ra11
++  nop
++:exit_c_1
++
 +::mc_exit
-+mov  -, vw_wait # wait on the VDW
++  ldtmu0
++  ldtmu1
++  ldtmu0
++  mov  -, vw_wait ; nop ; ldtmu1 # wait on the VDW
 +
-+mov -,srel(0)
++  mov -,srel(0)
 +
-+ldtmu0
-+ldtmu1
-+ldtmu0
-+ldtmu1
-+
-+nop        ; nop ; thrend
-+nop        ; nop # delay slot 1
-+nop        ; nop # delay slot 2
++  nop        ; nop ; thrend
++  nop        ; nop # delay slot 1
++  nop        ; nop # delay slot 2
 +
 +# mc_interrupt_exit8()
-+::mc_interrupt_exit8
-+mov  -, vw_wait # wait on the VDW
-+
-+ldtmu0
-+ldtmu1
-+ldtmu0
-+ldtmu1
-+
-+mov -,sacq(0) # 1
-+mov -,sacq(0) # 2
-+mov -,sacq(0) # 3
-+mov -,sacq(0) # 4
-+mov -,sacq(0) # 5
-+mov -,sacq(0) # 6
-+mov -,sacq(0) # 7
-+
-+nop        ; nop ; thrend
-+mov interrupt, 1; nop # delay slot 1
-+nop        ; nop # delay slot 2
-+
++#::mc_interrupt_exit8
++#mov  -, vw_wait # wait on the VDW
++#
++#ldtmu0
++#ldtmu1
++#ldtmu0
++#ldtmu1
++#
++#mov -,sacq(0) # 1
++#mov -,sacq(0) # 2
++#mov -,sacq(0) # 3
++#mov -,sacq(0) # 4
++#mov -,sacq(0) # 5
++#mov -,sacq(0) # 6
++#mov -,sacq(0) # 7
++#
++#nop        ; nop ; thrend
++#mov interrupt, 1; nop # delay slot 1
++#nop        ; nop # delay slot 2
++#
 +
 +
 +
@@ -14022,115 +14172,79 @@ index 0000000..aa9e1e7
 +################################################################################
 +# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel)
 +::mc_setup
-+  mov r3, 16
-+
 +  # Need to save these because we need to know the frame dimensions before computing texture coordinates
-+  mov ra8, unif  # y_x
-+  mov ra9, unif  # ref_y_base
-+  mov ra10, unif # y2_x2
-+  mov ra11, unif # ref_y2_base
++  mov tmurs, 1          ; mov ra8, unif         # No TMU swap ; y_x
++  mov ra9, unif         # ref_y_base
++  mov ra10, unif        # y2_x2
++  mov ra11, unif        # ref_y2_base
 +
 +# Read image dimensions
-+  mov r1, unif # width_height
-+  shl r0,r1,r3
-+  asr r1,r1,r3 # width
-+  asr r0,r0,r3 # height
-+  sub rb_frame_width_minus_1,r1,1
-+  sub rb_frame_height_minus_1,r0,1
-+
-+# get source pitch
-+  mov rb_pitch, unif # src_pitch
++  mov ra3, unif         # width_height
++  mov rb_pitch, unif    # src_pitch [ra3 delay]
++  sub rb_frame_width_minus_1, ra3.16b, 1
++  sub rb_frame_height_minus_1, ra3.16a, 1
 +
 +# get destination pitch
-+  mov r0, unif       # dst_pitch
 +  mov r1, vdw_setup_1(0)
-+  add rb24, r1, r0
++  or  rb24, r1, unif    # dst_pitch
 +
 +# Compute base address for first and second access
-+  mov r1, ra8 # y_x
-+  shl r0,r1,r3 # r0 is x<<16
-+  asr r1,r1,r3 # r1 is y
-+  asr r0,r0,r3 # r0 is x
-+  add r0, r0, elem_num # Load x
++  mov r3, elem_num
++  add r0, ra8.16a, r3   # Load x + elem_num
 +  max r0, r0, 0
-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9  # Load the frame base
++  min r0, r0, rb_frame_width_minus_1
 +  shl ra_xshift_next, r0, 3 # Compute shifts
-+  add ra_y, r1, 1
-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+  add r2, r2, r0  # r2 is address for frame0 (not including y offset)
-+  max r1, r1, 0
++  add ra_y, ra8.16b, 1
++  and r0, r0, ~3        # r0 gives the clipped and aligned x coordinate
++  add r2, ra9, r0       # r2 is address for frame0 (not including y offset)
++  max r1, ra8.16b, 0
 +  min r1, r1, rb_frame_height_minus_1
-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-+  add t0s, r2, r1 ; mov ra_frame_base, r2
++  nop                   ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
++  add t0s, r2, r1       ; mov ra_frame_base, r2
 +
-+  mov r1, ra10 # y_x
-+  shl r0,r1,r3 # r0 is x<<16
-+  asr r1,r1,r3 # r1 is y
-+  asr r0,r0,r3 # r0 is x
-+  add r0, r0, elem_num # Load x
++  # r3 still contains elem_num
++  add r0, ra10.16a, r3  # Load x
 +  max r0, r0, 0
-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11  # Load the frame base
++  min r0, r0, rb_frame_width_minus_1
 +  shl rx_xshift2_next, r0, 3 # Compute shifts
-+  add ra_y2, r1, 1
-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+  add r2, r2, r0  # r2 is address for frame1 (not including y offset)
-+  max r1, r1, 0
++  add ra_y2, ra10.16b, 1
++  and r0, r0, ~3        # r0 gives the clipped and aligned x coordinate
++  add r2, ra11, r0      # r2 is address for frame1 (not including y offset)
++  max r1, ra10.16b, 0
 +  min r1, r1, rb_frame_height_minus_1
-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-+  add t1s, r2, r1 ; mov ra_frame_base2, r2
-+
++  nop                   ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame1
++  add t1s, r2, r1       ; mov ra_frame_base2, r2
 +
 +# load constants
 +
 +  mov ra_k1, 1
 +  mov ra_k256, 256
-+  mov ra30, 64
-+
-+  mov rb20, 0xffffff00
 +  mov rb_k255, 255
-+  mov rb23, 24
 +
 +# touch vertical context to keep simulator happy
 +
-+  mov ra8, 0
-+  mov ra9, 0
-+  mov ra10, 0
-+  mov ra11, 0
-+  mov ra12, 0
-+  mov ra13, 0
-+  mov ra14, 0
-+  mov ra15, 0
++  mov ra8,  0           ; mov rb8,  0
++  mov ra9,  0           ; mov rb9,  0
++  mov ra10, 0           ; mov rb10, 0
++  mov ra11, 0           ; mov rb11, 0
 +
 +# Compute part of VPM to use
-+  mov r2, qpu_num
-+  mov r1, r2
-+  asr r1, r1, 2
-+  shl r1, r1, 6
-+  mov r0, r2
-+  and r0, r0, 3
-+  add r0, r0, r1
-+  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+  add rb28, r0, r1  # VPM for saving data
-+  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+  shl r0, r0, 5
-+  add rb27, r0, r1  # Command for dma output
++  m_calc_dma_regs rb28, rb27
 +
 +# Weighted prediction denom
-+  add rb13, unif, 9  # unif = weight denom + 6
-+
-+  mov -, unif # Unused
++  add rb13, unif, 9     # unif = weight denom + 6
 +
 +# submit texture requests for second line
 +  max r1, ra_y, 0
 +  min r1, r1, rb_frame_height_minus_1
 +  add ra_y, ra_y, 1
-+  nop ; mul24 r1, r1, rb_pitch
++  mov -, unif           ; mul24 r1, r1, rb_pitch  # unused ;
 +  add t0s, r1, ra_frame_base
 +
 +  max r1, ra_y2, 0
 +  min r1, r1, rb_frame_height_minus_1
 +  add ra_y2, ra_y2, 1
-+  nop ; mul24 r1, r1, rb_pitch
++  nop                   ; mul24 r1, r1, rb_pitch
 +  add t1s, r1, ra_frame_base2
 +
 +# FALL THROUGHT TO PER-BLOCK SETUP
@@ -14139,7 +14253,7 @@ index 0000000..aa9e1e7
 +# P and B blocks share the same setup code to save on Icache space
 +:per_block_setup
 +  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+  mov ra31, unif
++  mov ra_link, unif
 +
 +  mov ra1, unif  ; mov r1, elem_num  # y_x ; elem_num has implicit unpack??
 +
@@ -14153,7 +14267,7 @@ index 0000000..aa9e1e7
 +  max r0, r0, 0
 +  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
 +  shl ra_xshift_next, r0, 3 # Compute shifts
-+  mov r3, 8                          ; mov ra_y_next, ra1.16b
++  mov ra_y_next, ra1.16b
 +  and r0, r0, ~3                     ; mov ra1, unif # y2_x2
 +  add ra_frame_base_next, r2, r0
 +
@@ -14161,7 +14275,7 @@ index 0000000..aa9e1e7
 +  max r0, r0, 0
 +  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
 +  shl rx_xshift2_next, r0, 3         # Compute shifts
-+  add r3, r3, r3                     ; mov ra_y2_next, ra1.16b  # r3 = 16 ;
++  mov ra_y2_next, ra1.16b
 +  and r0, r0, ~3                     ; mov ra1, unif  # width_height ; r0 gives the clipped and aligned x coordinate
 +  add rx_frame_base2_next, r2, r0    # r2 is address for frame1 (not including y offset)
 +
@@ -14178,8 +14292,9 @@ index 0000000..aa9e1e7
 +  add rb26, r0, rb27                 ; mov r0, unif   # Packed filter offsets
 +
 +# get filter coefficients and discard unused B frame values
-+  shl.ifz r0, r0, i_shift16      # Pick half to use
-+  shl ra8, r0, 3
++  shl.ifz r0, r0, i_shift16          ; mov ra5, unif    #  Pick half to use ; L0 offset/weight
++  mov r2, 0x01040400                 # [ra5 delay]
++  shl ra8, r0, 3                     ; mov rb14, ra5.16a
 +
 +# Pack the 1st 4 filter coefs for H & V tightly
 +
@@ -14187,9 +14302,8 @@ index 0000000..aa9e1e7
 +  ror ra2.8a, r1, ra8.8d
 +  ror ra0.8a, r1, ra8.8c
 +
-+  mov r1,0x01040400
-+  ror ra2.8b, r1, ra8.8d
-+  ror ra0.8b, r1, ra8.8c
++  ror ra2.8b, r2, ra8.8d
++  ror ra0.8b, r2, ra8.8c
 +
 +  mov r1,0x050b0a00  # -ve
 +  ror ra2.8c, r1, ra8.8d
@@ -14215,27 +14329,31 @@ index 0000000..aa9e1e7
 +  ror ra3.8c, r1, ra8.8d
 +  ror ra1.8c, r1, ra8.8c
 +
-+# Extract weighted prediction information in parallel
-+
 +  mov r1,0x01010000  # -ve
-+  ror ra3.8d, r1, ra8.8d    ; mov r0, unif      # ; weight L1 weight L1 (hi16)/weight L0 (lo16)
-+  ror ra1.8d, r1, ra8.8c    ; mov r1, rb13      # ; rb13 = weight denom + 6 + 9
++  ror ra3.8d, r1, ra8.8d
++  ror ra1.8d, r1, ra8.8c
 +
-+# r3 = 16 from (long way) above
-+  shl r1, unif, r1          ; mov rb4, ra3.8a   # combined offet = ((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) ;
-+  asr ra18, r0, r3          ; mov rb5, ra3.8b
-+  bra -, ra31
-+  shl r0, r0, r3            ; mov rb6, ra3.8c
-+  mov r3, 0                 ; mov rb7, ra3.8d   # loop count ;
-+  asr rb12, r1, 9
++# Extract weighted prediction information in parallel
++# We are annoyingly A src limited here
 +
-+# >>> branch ra31
++  mov rb4, ra3.8a            ; mov ra18, unif
++  mov rb5, ra3.8b
++  mov rb6, ra3.8c
++  mov.ifnz ra5, ra18
++
++  bra -, ra_link
++
++  shl r0, ra5.16b, rb13      # Offset calc
++  asr rb12, r0, 9            # For B l1 & L0 offsets should be identical so it doesn't matter which we use
++  mov r3, 0                  ; mov rb7, ra3.8d
++# >>> branch ra_link
 +#
 +# r3 = 0
-+# ra18 = weight L1
-+# r0   = weight L0 << 16 (will be put into rb14 in filter preamble)
-+# rb13 = weight denom + 6 + 9
-+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
++# ra18.16a = weight L1
++# ra5.16a  = weight L0/L1 depending on side (wanted for 2x mono-pred)
++# rb12     = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
++# rb13     = weight denom + 6 + 9
++# rb14     = weight L0
 +
 +
 +################################################################################
@@ -14244,8 +14362,9 @@ index 0000000..aa9e1e7
 +# At this point we have already issued two pairs of texture requests for the current block
 +
 +::mc_filter
-+# r0 = weight << 16; We want weight * 2 in rb14
-+  asr rb14, r0, 15
++# ra5.16a = weight; We want weight * 2 in rb14
++
++  shl rb14, ra5.16a, 1
 +
 +# r3 = 0
 +
@@ -14269,12 +14388,12 @@ index 0000000..aa9e1e7
 +  max r2, ra_y, 0  # y
 +  min r2, r2, rb_frame_height_minus_1
 +  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
++  add t0s, ra_frame_base, r2   ; v8min r0, r0, rb_k255 # v8min masks out all but bottom byte
 +
 +  max r2, ra_y2, 0  # y
 +  min r2, r2, rb_frame_height_minus_1
 +  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
-+  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
++  add t1s, ra_frame_base2, r2  ; v8min r1, r1, rb_k255
 +
 +# generate seven shifted versions
 +# interleave with scroll of vertical context
@@ -14283,21 +14402,21 @@ index 0000000..aa9e1e7
 +
 +# apply horizontal filter
 +  nop                  ; mul24      r3, ra0.8a,      r0
-+  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
-+  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
-+  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
-+  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
-+  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
-+  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
-+  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
-+  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
-+  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
-+  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
-+  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
 +  sub r0, r2, r3       ; mov r3, rb31
 +
 +  sub.setf -, r3, 8       ; mov r1,   ra8
@@ -14359,7 +14478,7 @@ index 0000000..aa9e1e7
 +
 +::mc_filter_b
 +  # r0 = weightL0 << 16, we want it in rb14
-+  asr rb14, r0, i_shift16
++#  asr rb14, r0, i_shift16
 +
 +:yloopb
 +# retrieve texture results and pick out bytes
@@ -14377,12 +14496,12 @@ index 0000000..aa9e1e7
 +  max r2, ra_y, 0  # y
 +  min r2, r2, rb_frame_height_minus_1
 +  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
++  add t0s, ra_frame_base, r2   ; v8min r0, r0, rb_k255 # v8min masks out all but bottom byte
 +
 +  max r2, ra_y2, 0  # y
 +  min r2, r2, rb_frame_height_minus_1
 +  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
-+  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
++  add t1s, ra_frame_base2, r2  ; v8min r1, r1, rb_k255
 +
 +# generate seven shifted versions
 +# interleave with scroll of vertical context
@@ -14391,21 +14510,21 @@ index 0000000..aa9e1e7
 +
 +# apply horizontal filter
 +  nop                  ; mul24      r3, ra0.8a,      r0
-+  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
-+  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
-+  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
-+  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
-+  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
-+  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
-+  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
-+  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
-+  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
-+  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
-+  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
 +  sub r0, r2, r3       ; mov r3, rb31
 +
 +  sub.setf -, r3, 8       ; mov r1,   ra8
@@ -14417,7 +14536,6 @@ index 0000000..aa9e1e7
 +  # >>> .anyn yloopb
 +
 +  # apply vertical filter and write to VPM
-+
 +  nop                     ; mul24 r0, rb8,  ra2.8a
 +  nop                     ; mul24 r1, rb9,  ra2.8b
 +  sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
@@ -14433,7 +14551,7 @@ index 0000000..aa9e1e7
 +
 +  asr r1, r1, 14
 +  nop                     ; mul24 r0, r1, rb14
-+  add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
++  add r0, r0, r2          ; mul24 r1, r1 << 8, ra18.16a << 8    @ "mul_used", 0
 +
 +  add r1, r1, r0          ; mov -, vw_wait
 +  shl r1, r1, 8
@@ -14450,26 +14568,26 @@ index 0000000..aa9e1e7
 +  mov vw_addr, unif # start the VDW   Delay 3
 +
 +################################################################################
++::mc_interrupt_exit12c
++  mov.setf -, ra9      ; mov -, vw_wait
++  brr.anyz -, r:exit12_c_1
++  nop
++  nop
++  nop
++# >>>
++
++  sub vw_setup, ra9, -16
++  mov vw_setup, ra10
++  mov vw_addr, ra11
++  mov ra9, 0
++:exit12_c_1
 +
 +# mc_interrupt_exit12()
 +::mc_interrupt_exit12
-+  mov  -, vw_wait # wait on the VDW
-+
-+  # Dummy wait to test instructions
-+#  mov r3,1000000
-+#:dummy_loop
-+#  sub.setf r3, r3, 1
-+#  nop
-+#  nop
-+#  brr.anynn -, r:dummy_loop
-+#  nop
-+#  nop
-+#  nop
-+
-+  ldtmu0
 +  ldtmu0
 +  ldtmu1
-+  ldtmu1
++  ldtmu0
++  mov  -, vw_wait ; nop ; ldtmu1  # wait on the VDW
 +
 +  mov -,sacq(0) # 1
 +  mov -,sacq(0) # 2
@@ -14502,477 +14620,12 @@ index 0000000..aa9e1e7
 +
 +::mc_end
 +# Do not add code here because mc_end must appear after all other code.
-diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
+diff --git b/libavcodec/rpi_zc.c a/libavcodec/rpi_zc.c
 new file mode 100644
-index 0000000..db41a4d
+index 0000000..9ac22aa
 --- /dev/null
-+++ b/libavcodec/rpi_user_vcsm.h
-@@ -0,0 +1,459 @@
-+/*****************************************************************************
-+* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
-+*
-+* This program is the proprietary software of Broadcom Corporation and/or
-+* its licensors, and may only be used, duplicated, modified or distributed
-+* pursuant to the terms and conditions of a separate, written license
-+* agreement executed between you and Broadcom (an "Authorized License").
-+* Except as set forth in an Authorized License, Broadcom grants no license
-+* (express or implied), right to use, or waiver of any kind with respect to
-+* the Software, and Broadcom expressly reserves all rights in and to the
-+* Software and all intellectual property rights therein.  IF YOU HAVE NO
-+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
-+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
-+* THE SOFTWARE.
-+*
-+* Except as expressly set forth in the Authorized License,
-+* 1. This program, including its structure, sequence and organization,
-+*    constitutes the valuable trade secrets of Broadcom, and you shall use
-+*    all reasonable efforts to protect the confidentiality thereof, and to
-+*    use this information only in connection with your use of Broadcom
-+*    integrated circuit products.
-+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
-+*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
-+*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
-+*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
-+*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
-+*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
-+*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
-+*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
-+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
-+*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
-+*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
-+*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
-+*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
-+*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
-+*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
-+*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
-+*****************************************************************************/
-+
-+#ifndef __USER_VCSM__H__INCLUDED__
-+#define __USER_VCSM__H__INCLUDED__
-+
-+/* VideoCore Shared Memory - user interface library.
-+**
-+** This library provides all the necessary abstraction for any application to
-+** make use of the shared memory service which is distributed accross a kernel
-+** driver and a videocore service.
-+**
-+** It is an application design decision to choose or not to use this service.
-+**
-+** The logical flow of operations that a user application needs to follow when
-+** using this service is:
-+**
-+**       1) Initialize the service.
-+**       2) Allocate shared memory blocks.
-+**       3) Start using the allocated blocks.
-+**          - In order to gain ownership on a block, lock the allocated block,
-+**            locking a block returns a valid address that the user application
-+**            can access.
-+**          - When finished with using the block for the current execution cycle
-+**            or function, and so when giving up the ownership, unlock the block.
-+**       4) A block can be locked/unlocked as many times required - within or outside
-+**          of - a specific execution context.
-+**       5) To completely release an allocated block, free it.
-+**       6) If the service is no longer required, terminate it.
-+**
-+**
-+** Some generic considerations:
-+
-+** Allocating memory blocks.
-+**
-+**   Memory blocks can be allocated in different manners depending on the cache
-+**   behavior desired.  A given block can either be:
-+
-+**       - Allocated in a non cached fashion all the way through host and videocore.
-+**       - Allocated in a cached fashion on host OR videocore.
-+**       - Allocated in a cached fashion on host AND videocore.
-+**
-+**   It is an application decision to determine how to allocate a block.  Evidently
-+**   if the application will be doing substantial read/write accesses to a given block,
-+**   it is recommended to allocate the block at least in a 'host cached' fashion for
-+**   better results.
-+**
-+**
-+** Locking memory blocks.
-+**
-+**   When the memory block has been allocated in a host cached fashion, locking the
-+**   memory block (and so taking ownership of it) will trigger a cache invalidation.
-+**
-+**   For the above reason and when using host cached allocation, it is important that
-+**   an application properly implements the lock/unlock mechanism to ensure cache will
-+**   stay coherent, otherwise there is no guarantee it will at all be.
-+**
-+**   It is possible to dynamically change the host cache behavior (ie cached or non
-+**   cached) of a given allocation without needing to free and re-allocate the block.
-+**   This feature can be useful for such application which requires access to the block
-+**   only at certain times and not otherwise.  By changing the cache behavior dynamically
-+**   the application can optimize performances for a given duration of use.
-+**   Such dynamic cache behavior remapping only applies to host cache and not videocore
-+**   cache.  If one requires to change the videocore cache behavior, then a new block
-+**   must be created to replace the old one.
-+**
-+**   On successful locking, a valid pointer is returned that the application can use
-+**   to access to data inside the block.  There is no guarantee that the pointer will
-+**   stay valid following the unlock action corresponding to this lock.
-+**
-+**
-+** Unocking memory blocks.
-+**
-+**   When the memory block has been allocated in a host cached fashion, unlocking the
-+**   memory block (and so forgiving its ownership) will trigger a cache flush unless
-+**   explicitely asked not to flush the cache for performances reasons.
-+**
-+**   For the above reason and when using host cached allocation, it is important that
-+**   an application properly implements the lock/unlock mechanism to ensure cache will
-+**   stay coherent, otherwise there is no guarantee it will at all be.
-+**
-+**
-+** A complete API is defined below.
-+*/
-+
-+#ifdef __cplusplus
-+extern "C"
-+{
-+#endif
-+
-+/* Different status that can be dumped.
-+*/
-+typedef enum
-+{
-+   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
-+                                    // Result of the walk is seen in the videocore
-+                                    // log.
-+   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
-+                                    // driver (ie for all processes).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
-+                                    // driver (for current process).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
-+                                    // driver (for current process).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
-+                                    // VCSM_STATUS_HOST_WALK_MAP.
-+                                    //
-+   VCSM_STATUS_NONE,                // Must be last - invalid.
-+
-+} VCSM_STATUS_T;
-+
-+/* Different kind of cache behavior.
-+*/
-+typedef enum
-+{
-+   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
-+   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
-+   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
-+   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
-+
-+} VCSM_CACHE_TYPE_T;
-+
-+/* Initialize the vcsm processing.
-+**
-+** Must be called once before attempting to do anything else.
-+**
-+** Returns 0 on success, -1 on error.
-+*/
-+int vcsm_init( void );
-+
-+
-+/* Terminates the vcsm processing.
-+**
-+** Must be called vcsm services are no longer needed, it will
-+** take care of removing any allocation under the current process
-+** control if deemed necessary.
-+*/
-+void vcsm_exit( void );
-+
-+
-+/* Queries the status of the the vcsm.
-+**
-+** Triggers dump of various kind of information, see the
-+** different variants specified in VCSM_STATUS_T.
-+**
-+** Pid is optional.
-+*/
-+void vcsm_status( VCSM_STATUS_T status, int pid );
-+
-+
-+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
-+** allocator.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc( unsigned int size, char *name );
-+
-+
-+/* Allocates a cached block of memory of size 'size' via the vcsm memory
-+** allocator, the type of caching requested is passed as argument of the
-+** function call.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
-+
-+
-+/* Shares an allocated block of memory via the vcsm memory allocator.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc_share( unsigned int handle );
-+
-+
-+/* Resizes a block of memory allocated previously by vcsm_alloc.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** The handle must be unlocked by user prior to attempting any
-+** resize action.
-+**
-+** On error, the original size allocated against the handle
-+** remains available the same way it would be following a
-+** successful vcsm_malloc.
-+*/
-+int vcsm_resize( unsigned int handle, unsigned int new_size );
-+
-+
-+/* Frees a block of memory that was successfully allocated by
-+** a prior call the vcms_alloc.
-+**
-+** The handle should be considered invalid upon return from this
-+** call.
-+**
-+** Whether any memory is actually freed up or not as the result of
-+** this call will depends on many factors, if all goes well it will
-+** be freed.  If something goes wrong, the memory will likely end up
-+** being freed up as part of the vcsm_exit process.  In the end the
-+** memory is guaranteed to be freed one way or another.
-+*/
-+void vcsm_free( unsigned int handle );
-+
-+
-+/* Retrieves a videocore opaque handle from a mapped user address
-+** pointer.  The videocore handle will correspond to the actual
-+** memory mapped in videocore.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** Note: the videocore opaque handle is distinct from the user
-+**       opaque handle (allocated via vcsm_malloc) and it is only
-+**       significant for such application which knows what to do
-+**       with it, for the others it is just a number with little
-+**       use since nothing can be done with it (in particular
-+**       for safety reason it cannot be used to map anything).
-+*/
-+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
-+
-+
-+/* Retrieves a videocore opaque handle from a opaque handle
-+** pointer.  The videocore handle will correspond to the actual
-+** memory mapped in videocore.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** Note: the videocore opaque handle is distinct from the user
-+**       opaque handle (allocated via vcsm_malloc) and it is only
-+**       significant for such application which knows what to do
-+**       with it, for the others it is just a number with little
-+**       use since nothing can be done with it (in particular
-+**       for safety reason it cannot be used to map anything).
-+*/
-+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
-+
-+
-+/* Retrieves a user opaque handle from a mapped user address
-+** pointer.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+*/
-+unsigned int vcsm_usr_handle( void *usr_ptr );
-+
-+
-+/* Retrieves a mapped user address from an opaque user
-+** handle.
-+**
-+** Returns:        0 on error
-+**                 a non-zero address on success.
-+**
-+** On success, the address corresponds to the pointer
-+** which can access the data allocated via the vcsm_malloc
-+** call.
-+*/
-+void *vcsm_usr_address( unsigned int handle );
-+
-+
-+/* Locks the memory associated with this opaque handle.
-+**
-+** Returns:        NULL on error
-+**                 a valid pointer on success.
-+**
-+** A user MUST lock the handle received from vcsm_malloc
-+** in order to be able to use the memory associated with it.
-+**
-+** On success, the pointer returned is only valid within
-+** the lock content (ie until a corresponding vcsm_unlock_xx
-+** is invoked).
-+*/
-+void *vcsm_lock( unsigned int handle );
-+
-+
-+/* Locks the memory associated with this opaque handle.  The lock
-+** also gives a chance to update the *host* cache behavior of the
-+** allocated buffer if so desired.  The *videocore* cache behavior
-+** of the allocated buffer cannot be changed by this call and such
-+** attempt will be ignored.
-+**
-+** The system will attempt to honour the cache_update mode request,
-+** the cache_result mode will provide the final answer on which cache
-+** mode is really in use.  Failing to change the cache mode will not
-+** result in a failure to lock the buffer as it is an application
-+** decision to choose what to do if (cache_result != cache_update)
-+**
-+** The value returned in cache_result can only be considered valid if
-+** the returned pointer is non NULL.  The cache_result pointer may be
-+** NULL if the application does not care about the actual outcome of
-+** its action with regards to the cache behavior change.
-+**
-+** Returns:        NULL on error
-+**                 a valid pointer on success.
-+**
-+** A user MUST lock the handle received from vcsm_malloc
-+** in order to be able to use the memory associated with it.
-+**
-+** On success, the pointer returned is only valid within
-+** the lock content (ie until a corresponding vcsm_unlock_xx
-+** is invoked).
-+*/
-+void *vcsm_lock_cache( unsigned int handle,
-+                       VCSM_CACHE_TYPE_T cache_update,
-+                       VCSM_CACHE_TYPE_T *cache_result );
-+
-+
-+/* Unlocks the memory associated with this user mapped address.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking a mapped address, the user should no longer
-+** attempt to reference it.
-+*/
-+int vcsm_unlock_ptr( void *usr_ptr );
-+
-+
-+/* Unlocks the memory associated with this user mapped address.
-+** Apply special processing that would override the otherwise
-+** default behavior.
-+**
-+** If 'cache_no_flush' is specified:
-+**    Do not flush cache as the result of the unlock (if cache
-+**    flush was otherwise applicable in this case).
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking a mapped address, the user should no longer
-+** attempt to reference it.
-+*/
-+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
-+
-+
-+/* Unlocks the memory associated with this user opaque handle.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking an opaque handle, the user should no longer
-+** attempt to reference the mapped addressed once associated
-+** with it.
-+*/
-+int vcsm_unlock_hdl( unsigned int handle );
-+
-+
-+/* Unlocks the memory associated with this user opaque handle.
-+** Apply special processing that would override the otherwise
-+** default behavior.
-+**
-+** If 'cache_no_flush' is specified:
-+**    Do not flush cache as the result of the unlock (if cache
-+**    flush was otherwise applicable in this case).
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking an opaque handle, the user should no longer
-+** attempt to reference the mapped addressed once associated
-+** with it.
-+*/
-+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
-+
-+/* Clean and/or invalidate the memory associated with this user opaque handle
-+**
-+** Returns:        non-zero on error
-+**
-+** structure contains a list of flush/invalidate commands. Commands are:
-+** 0: nop
-+** 1: invalidate       given virtual range in L1/L2
-+** 2: clean            given virtual range in L1/L2
-+** 3: clean+invalidate given virtual range in L1/L2
-+** 4: flush all L1/L2
-+*/
-+struct vcsm_user_clean_invalid_s {
-+   struct {
-+      unsigned int cmd;
-+      unsigned int handle;
-+      unsigned int addr;
-+      unsigned int size;
-+   } s[8];
-+};
-+
-+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
-+
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#endif /* __USER_VCSM__H__INCLUDED__ */
-diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
-new file mode 100644
-index 0000000..9580165
---- /dev/null
-+++ b/libavcodec/rpi_zc.c
-@@ -0,0 +1,406 @@
++++ a/libavcodec/rpi_zc.c
+@@ -0,0 +1,453 @@
 +#include "config.h"
 +#ifdef RPI
 +#include "rpi_qpu.h"
@@ -14985,6 +14638,7 @@ index 0000000..9580165
 +typedef struct ZcPool
 +{
 +    int numbytes;
++    unsigned int n;
 +    struct ZcPoolEnt * head;
 +    pthread_mutex_t lock;
 +} ZcPool;
@@ -14993,27 +14647,48 @@ index 0000000..9580165
 +{
 +    // It is important that we start with gmem as other bits of code will expect to see that
 +    GPU_MEM_PTR_T gmem;
++    unsigned int n;
 +    struct ZcPoolEnt * next;
 +    struct ZcPool * pool;
 +} ZcPoolEnt;
 +
-+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const int size)
++#if 1
++//#define ALLOC_PAD       0x1000
++#define ALLOC_PAD       0
++#define ALLOC_ROUND     0x1000
++//#define ALLOC_N_OFFSET  0x100
++#define ALLOC_N_OFFSET  0
++#define STRIDE_ROUND    0x80
++#define STRIDE_OR       0x80
++#else
++#define ALLOC_PAD       0
++#define ALLOC_ROUND     0x1000
++#define ALLOC_N_OFFSET  0
++#define STRIDE_ROUND    32
++#define STRIDE_OR       0
++#endif
++
++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size)
 +{
 +    ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt));
 +
++    // Add ALLOC_PAD (currently 0), then round up to a multiple of ALLOC_ROUND (4k)
++    const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
++
 +    if (zp == NULL) {
 +        av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
 +        goto fail0;
 +    }
 +
-+    if (gpu_malloc_cached(size, &zp->gmem) != 0)
++    if (gpu_malloc_cached(alloc_size, &zp->gmem) != 0)
 +    {
-+        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", size);
++        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size);
 +        goto fail1;
 +    }
 +
 +    zp->next = NULL;
 +    zp->pool = pool;
++    zp->n = pool->n++;
 +    return zp;
 +
 +fail1:
@@ -15062,6 +14737,10 @@ index 0000000..9580165
 +    }
 +
 +    pthread_mutex_unlock(&pool->lock);
++
++    // Start with our buffer empty of preconceptions
++//    rpi_cache_flush_one_gm_ptr(&zp->gmem, RPI_CACHE_FLUSH_MODE_INVALIDATE);
++
 +    return zp;
 +}
 +
@@ -15127,7 +14806,8 @@ index 0000000..9580165
 +    const unsigned int video_width, const unsigned int video_height)
 +{
 +    AVRpiZcFrameGeometry geo;
-+    geo.stride_y = (video_width + 32 + 31) & ~31;
++    geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
++//    geo.stride_y = ((video_width + 32 + 31) & ~31);
 +    geo.stride_c = geo.stride_y / 2;
 +//    geo.height_y = (video_height + 15) & ~15;
 +    geo.height_y = (video_height + 32 + 31) & ~31;
@@ -15139,13 +14819,21 @@ index 0000000..9580165
 +{
 +    ZcPoolEnt *const zp = zc_pool_alloc(pool, size);
 +    AVBufferRef * buf;
++    intptr_t idata = (intptr_t)zp->gmem.arm;
++#if ALLOC_N_OFFSET != 0
++    intptr_t noff = (zp->n * ALLOC_N_OFFSET) & (ALLOC_PAD - 1);
++#endif
 +
 +    if (zp == NULL) {
 +        av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size);
 +        goto fail0;
 +    }
 +
-+    if ((buf = av_buffer_create(zp->gmem.arm, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
++#if ALLOC_N_OFFSET != 0
++    idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? ALLOC_PAD : 0);
++#endif
++
++    if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
 +    {
 +        av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n");
 +        goto fail2;
@@ -15317,6 +15005,18 @@ index 0000000..9580165
 +    return p == NULL ? -1 : p->vc_handle;
 +}
 +
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
++{
++    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
++    return p == NULL ? 0 : fr_ref->data - p->arm;
++}
++
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
++{
++    return fr_ref == NULL ? 0 : fr_ref->size;
++}
++
++
 +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
 +{
 +    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
@@ -15379,12 +15079,12 @@ index 0000000..9580165
 +
 +#endif  // RPI
 +
-diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
+diff --git b/libavcodec/rpi_zc.h a/libavcodec/rpi_zc.h
 new file mode 100644
-index 0000000..f0109f4
+index 0000000..4dd7a8b
 --- /dev/null
-+++ b/libavcodec/rpi_zc.h
-@@ -0,0 +1,83 @@
++++ a/libavcodec/rpi_zc.h
+@@ -0,0 +1,88 @@
 +#ifndef LIBAVCODEC_RPI_ZC_H
 +#define LIBAVCODEC_RPI_ZC_H
 +
@@ -15439,6 +15139,11 @@ index 0000000..f0109f4
 +// Get the vc_handle from the frame ref
 +// Returns -1 if ref doesn't look valid
 +int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
++// Get offset from the start of the memory referenced
++// by the vc_handle to valid data; returns 0 if ref doesn't look valid
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
++// Get the length of the buffer data; returns 0 if ref is NULL
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
 +// Get the number of bytes allocated from the frame ref
 +// Returns 0 if ref doesn't look valid
 +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
@@ -15468,10 +15173,10 @@ index 0000000..f0109f4
 +
 +#endif
 +
-diff --git a/libavcodec/utils.c b/libavcodec/utils.c
-index f7adb52..708526e 100644
---- a/libavcodec/utils.c
-+++ b/libavcodec/utils.c
+diff --git b/libavcodec/utils.c a/libavcodec/utils.c
+index 3e8677d..f1efc0d 100644
+--- b/libavcodec/utils.c
++++ a/libavcodec/utils.c
 @@ -26,6 +26,12 @@
   */
  
@@ -15496,7 +15201,7 @@ index f7adb52..708526e 100644
  #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
  static int default_lockmgr_cb(void **arg, enum AVLockOp op)
  {
-@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
+@@ -508,6 +518,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
      return ret;
  }
  
@@ -15544,7 +15249,7 @@ index f7adb52..708526e 100644
  static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
  {
      FramePool *pool = avctx->internal->pool;
-@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
+@@ -555,6 +606,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
              av_buffer_pool_uninit(&pool->pools[i]);
              pool->linesize[i] = linesize[i];
              if (size[i]) {
@@ -15559,10 +15264,48 @@ index f7adb52..708526e 100644
                  pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
                                                       CONFIG_MEMORY_POISONING ?
                                                          NULL :
-diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
-index b31d233..2767306 100644
---- a/libavformat/mpegts.c
-+++ b/libavformat/mpegts.c
+diff --git b/libavformat/matroskaenc.c a/libavformat/matroskaenc.c
+index 9c7a213..af941ce 100644
+--- b/libavformat/matroskaenc.c
++++ a/libavformat/matroskaenc.c
+@@ -2223,7 +2223,7 @@ static int mkv_check_new_extra_data(AVFormatContext *s, AVPacket *pkt)
+ 
+     switch (par->codec_id) {
+     case AV_CODEC_ID_FLAC:
+-        if (side_data_size && (s->pb->seekable & AVIO_SEEKABLE_NORMAL) && !mkv->is_live) {
++        if (side_data_size && (s->pb->seekable & AVIO_SEEKABLE_NORMAL)) {
+             AVCodecParameters *codecpriv_par;
+             int64_t curpos;
+             if (side_data_size != par->extradata_size) {
+diff --git b/libavformat/mov.c a/libavformat/mov.c
+index f2296f8..4550cf0 100644
+--- b/libavformat/mov.c
++++ a/libavformat/mov.c
+@@ -1186,12 +1186,6 @@ static void mov_metadata_creation_time(AVDictionary **metadata, int64_t time)
+     if (time) {
+         if(time >= 2082844800)
+             time -= 2082844800;  /* seconds between 1904-01-01 and Epoch */
+-
+-        if ((int64_t)(time * 1000000ULL) / 1000000 != time) {
+-            av_log(NULL, AV_LOG_DEBUG, "creation_time is not representable\n");
+-            return;
+-        }
+-
+         avpriv_dict_set_timestamp(metadata, "creation_time", time * 1000000);
+     }
+ }
+@@ -5794,7 +5788,6 @@ static int mov_read_close(AVFormatContext *s)
+     av_freep(&mov->fragment_index_data);
+ 
+     av_freep(&mov->aes_decrypt);
+-    av_freep(&mov->chapter_tracks);
+ 
+     return 0;
+ }
+diff --git b/libavformat/mpegts.c a/libavformat/mpegts.c
+index 3eff152..30dfb14 100644
+--- b/libavformat/mpegts.c
++++ a/libavformat/mpegts.c
 @@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
  #endif
      { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264       },
@@ -15572,11 +15315,11 @@ index b31d233..2767306 100644
      { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000   },
      { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC       },
      { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS       },
-diff --git a/libavformat/utils.c b/libavformat/utils.c
-index 6f343f2..83f26d5 100644
---- a/libavformat/utils.c
-+++ b/libavformat/utils.c
-@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
+diff --git b/libavformat/utils.c a/libavformat/utils.c
+index a059046..ef70074 100644
+--- b/libavformat/utils.c
++++ a/libavformat/utils.c
+@@ -748,7 +748,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
          int default_stream_index = av_find_default_stream_index(s);
          if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) {
              for (i = 0; i < s->nb_streams; i++) {
@@ -15585,11 +15328,11 @@ index 6f343f2..83f26d5 100644
                      continue;
                  s->streams[i]->pts_wrap_reference = pts_wrap_reference;
                  s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
-diff --git a/libavutil/buffer.c b/libavutil/buffer.c
-index 694e116..203ca7b 100644
---- a/libavutil/buffer.c
-+++ b/libavutil/buffer.c
-@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
+diff --git b/libavutil/buffer.c a/libavutil/buffer.c
+index 8d1aa5f..649876d 100644
+--- b/libavutil/buffer.c
++++ a/libavutil/buffer.c
+@@ -355,3 +355,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
  
      return ret;
  }
@@ -15599,11 +15342,11 @@ index 694e116..203ca7b 100644
 +  BufferPoolEntry *buf = av_buffer_get_opaque(ref);
 +  return buf->opaque;
 +}
-diff --git a/libavutil/buffer.h b/libavutil/buffer.h
-index 0c0ce12..82e0bc3 100644
---- a/libavutil/buffer.h
-+++ b/libavutil/buffer.h
-@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
+diff --git b/libavutil/buffer.h a/libavutil/buffer.h
+index 73b6bd0..d907de3 100644
+--- b/libavutil/buffer.h
++++ a/libavutil/buffer.h
+@@ -284,6 +284,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
   */
  AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
  
@@ -15613,11 +15356,11 @@ index 0c0ce12..82e0bc3 100644
  /**
   * @}
   */
-diff --git a/pi-util/conf.sh b/pi-util/conf.sh
+diff --git b/pi-util/conf.sh a/pi-util/conf.sh
 new file mode 100755
 index 0000000..8b596a2
 --- /dev/null
-+++ b/pi-util/conf.sh
++++ a/pi-util/conf.sh
 @@ -0,0 +1,33 @@
 +echo "Configure for Pi2/3"
 +
@@ -15652,11 +15395,11 @@ index 0000000..8b596a2
 +
 +# gcc option for getting asm listing
 +# -Wa,-ahls
-diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
+diff --git b/pi-util/conf_h265.csv a/pi-util/conf_h265.csv
 new file mode 100644
-index 0000000..61d1399
+index 0000000..d3db338
 --- /dev/null
-+++ b/pi-util/conf_h265.csv
++++ a/pi-util/conf_h265.csv
 @@ -0,0 +1,144 @@
 +1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
 +2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
@@ -15783,7 +15526,7 @@ index 0000000..61d1399
 +1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
 +1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
 +1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
-+2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5
++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
 +1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
 +2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
 +1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
@@ -15802,12 +15545,12 @@ index 0000000..61d1399
 +1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
 +1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
-diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
+diff --git b/pi-util/ffconf.py a/pi-util/ffconf.py
 new file mode 100644
-index 0000000..38f942f
+index 0000000..c896bc6
 --- /dev/null
-+++ b/pi-util/ffconf.py
-@@ -0,0 +1,146 @@
++++ a/pi-util/ffconf.py
+@@ -0,0 +1,154 @@
 +#!/usr/bin/env python
 +
 +import os
@@ -15851,16 +15594,18 @@ index 0000000..38f942f
 +    except:
 +        pass
 +
-+    rv = False
 +    if  m1 and m2 and m1.group() == m2.group():
 +        print >> flog, "Match: " + m1.group()
-+        rv = True
++        rv = 0
 +    elif not m1:
 +        print >> flog, "****** Cannot find m1"
++        rv = 3
 +    elif not m2:
 +        print >> flog, "****** Cannot find m2"
++        rv = 2
 +    else:
 +        print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
++        rv = 1
 +    flog.close()
 +    return rv
 +
@@ -15906,19 +15651,25 @@ index 0000000..38f942f
 +            print "==== ", name,
 +            sys.stdout.flush()
 +
-+            if (not testone(os.path.join(conf_root, name), name, a[2], a[3])) :
-+                if exp_test == 1:
-+                    failures.append(name)
-+                    print ": * FAIL *"
-+                else:
-+                    print ": fail"
-+            else:
++            rv = testone(os.path.join(conf_root, name), name, a[2], a[3])
++            if (rv == 0):
 +                if exp_test == 2:
 +                    print ": * OK *"
 +                    unx_success.append(name)
 +                else:
 +                    print ": ok"
-+
++            elif exp_test > 1 and rv == 1:
++                print ": fail"
++            else:
++                failures.append(name)
++                if rv == 1:
++                    print ": * FAIL *"
++                elif (rv == 2) :
++                    print ": * CRASH *"
++                elif (rv == 3) :
++                    print ": * MD5 MISSING *"
++                else :
++                    print ": * BANG *"
 +
 +    if failures or unx_success:
 +        print "Unexpected Failures:", failures
@@ -15954,11 +15705,11 @@ index 0000000..38f942f
 +
 +    doconf(csva, args.tests)
 +
-diff --git a/pi-util/qasm.py b/pi-util/qasm.py
+diff --git b/pi-util/qasm.py a/pi-util/qasm.py
 new file mode 100644
 index 0000000..1eacc04
 --- /dev/null
-+++ b/pi-util/qasm.py
++++ a/pi-util/qasm.py
 @@ -0,0 +1,2502 @@
 +#!/usr/bin/env python
 +
@@ -18462,11 +18213,25 @@ index 0000000..1eacc04
 +
 +if __name__ == '__main__':
 +   main()
-diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py
+diff --git b/pi-util/qem.sh a/pi-util/qem.sh
+new file mode 100644
+index 0000000..20ce7ee
+--- /dev/null
++++ a/pi-util/qem.sh
+@@ -0,0 +1,8 @@
++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
++QASM=python\ pi-util/qasm.py
++SRC_FILE=libavcodec/rpi_shader.qasm
++DST_BASE=shader
++
++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
++
+diff --git b/pi-util/rebase_liblinks.py a/pi-util/rebase_liblinks.py
 new file mode 100755
 index 0000000..6a9a33f
 --- /dev/null
-+++ b/pi-util/rebase_liblinks.py
++++ a/pi-util/rebase_liblinks.py
 @@ -0,0 +1,37 @@
 +#!/usr/bin/env python
 +
@@ -18505,11 +18270,11 @@ index 0000000..6a9a33f
 +
 +
 +
-diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh
+diff --git b/pi-util/syncroot.sh a/pi-util/syncroot.sh
 new file mode 100755
 index 0000000..d8bdd91
 --- /dev/null
-+++ b/pi-util/syncroot.sh
++++ a/pi-util/syncroot.sh
 @@ -0,0 +1,43 @@
 +set -e
 +
@@ -18554,4 +18319,84 @@ index 0000000..d8bdd91
 +pi-util/rebase_liblinks.py $DST
 +
 +
-
+diff --git b/pi-util/v3dusage.py a/pi-util/v3dusage.py
+new file mode 100644
+index 0000000..7e336a9
+--- /dev/null
++++ a/pi-util/v3dusage.py
+@@ -0,0 +1,75 @@
++#!/usr/bin/env python
++
++import sys
++import argparse
++import re
++
++def main():
++    argp = argparse.ArgumentParser(description="QPU/VPU perf summary")
++    argp.add_argument("logfile")
++    args = argp.parse_args()
++
++
++    rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
++
++    ttotal = {'idle':0.0}
++    tstart = {}
++    time0 = None
++    idle_start = None
++    qpu_op_no = 0
++    op_count = 0
++
++    with open(args.logfile, "rt") as infile:
++        for line in infile:
++            match = rmatch.match(line)
++            if match:
++#                print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
++                time = float(match.group(1))
++                unit = match.group(3)
++                opstart = not match.group(2)
++                optype = match.group(7)
++                hascb = match.group(8) != "0"
++
++                if unit == 'qpu1':
++                    unit = unit + "." + str(qpu_op_no)
++                    if not opstart:
++                        if hascb or optype == 'EXECUTE_SYNC':
++                            qpu_op_no = 0
++                        else:
++                            qpu_op_no += 1
++
++                # Ignore sync type
++                if optype == 'EXECUTE_SYNC':
++                    continue
++
++                if not time0:
++                    time0 = time
++
++                if opstart:
++                    tstart[unit] = time;
++                elif unit in tstart:
++                    op_count += 1
++                    if not unit in ttotal:
++                        ttotal[unit] = 0.0
++                    ttotal[unit] += time - tstart[unit]
++                    del tstart[unit]
++
++                if not idle_start and not tstart:
++                    idle_start = time
++                elif idle_start and tstart:
++                    ttotal['idle'] += time - idle_start
++                    idle_start = None
++
++    if not time0:
++        print "No v3d profile records found"
++    else:
++        tlogged = time - time0
++
++        print "Logged time:", tlogged, "  Op count:", op_count
++        for unit in sorted(ttotal):
++            print b'%6s: %10.3f    %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
++
++
++if __name__ == '__main__':
++   main()
++
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch
index 721a065449..5240cf58ce 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch
@@ -22,4 +22,3 @@ index 2fd3f2b..7165652 100644
          if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
              *poutbuf      = NULL;
              *poutbuf_size = 0;
-
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch
index 15d449d284..37b53e8fb6 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch
@@ -53,4 +53,3 @@ index aca8382..f473f6c 100644
 -- 
 2.7.4
 
-
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-tls-1.2.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-tls-1.2.patch
deleted file mode 100644
index 848158d727..0000000000
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-tls-1.2.patch
+++ /dev/null
@@ -1,17 +0,0 @@
---- a/libavformat/tls_openssl.c
-+++ b/libavformat/tls_openssl.c
-@@ -233,12 +233,13 @@ static int tls_open(URLContext *h, const char *uri, int flags, AVDictionary **op
-     if ((ret = ff_tls_open_underlying(c, h, uri, options)) < 0)
-         goto fail;
-
--    p->ctx = SSL_CTX_new(c->listen ? TLSv1_server_method() : TLSv1_client_method());
-+    p->ctx = SSL_CTX_new(c->listen ? SSLv23_server_method() : SSLv23_client_method());
-     if (!p->ctx) {
-         av_log(h, AV_LOG_ERROR, "%s\n", ERR_error_string(ERR_get_error(), NULL));
-         ret = AVERROR(EIO);
-         goto fail;
-     }
-+    SSL_CTX_set_options(p->ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3);
-     if (c->ca_file) {
-         if (!SSL_CTX_load_verify_locations(p->ctx, c->ca_file, NULL))
-             av_log(h, AV_LOG_ERROR, "SSL_CTX_load_verify_locations %s\n", ERR_error_string(ERR_get_error(), NULL));