diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk
index c1732c2d35..cdcc49e5fc 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.2sf"
-PKG_VERSION="2bc20c5"
-PKG_SHA256="e595cce4aa616c6f36bc110626172be43c87ffe013d2c6aa20aa2cdbaba49b39"
+PKG_VERSION="0f2298f"
+PKG_SHA256="354ae9f98e83b3a9b614076cc665ada7792c77a6707f85712088c4c90c772fec"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk
index 51e66313e2..e20f45f9f5 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.dumb"
-PKG_VERSION="fbe7090"
-PKG_SHA256="8b91aec227250e9ec25010db1775aaf443e8923618b5097a9805db286929c7da"
+PKG_VERSION="be3e3d6"
+PKG_SHA256="f95d1175cba66b4443b089d8e788a0709c92d4556d45b1abc11647cb67e2a34d"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk
index fe15c40aee..b61f3997cb 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.gsf"
-PKG_VERSION="6af240a"
-PKG_SHA256="1b0dbb73c7c071d892798d9ece2980bbe4eddf90d9f0fb99ff646aa55fac6061"
+PKG_VERSION="67d9cd8"
+PKG_SHA256="7ded6afc8dab0c65a1795845288f86bfe37ce376c6bcd389e9b624d240dd93ce"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk
index 485de8fbaf..f0c75733e2 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.modplug"
-PKG_VERSION="0f5274e"
-PKG_SHA256="31ece57a1848c53b135f4971939ec723ea8368d51fa9431365427c60b52fad00"
+PKG_VERSION="72018cd"
+PKG_SHA256="e799c0a7405c4df89058b91b0925f0e7860d750c1613e3ef38e141f12fa78904"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk
index bbe61b19e4..ba53a12ee6 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.qsf"
-PKG_VERSION="945257e"
-PKG_SHA256="ac7d301ff3d7b4caef0a23e88ecf7cf84da37f425c4cf3e2bc0d74731df8b3ce"
+PKG_VERSION="932874a"
+PKG_SHA256="a384b487bca722c62e31791df81a9750871308ad6c1c0434893db038efcda024"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk
index d71dfeeaec..bfff9d19b2 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.sidplay"
-PKG_VERSION="172cf89"
-PKG_SHA256="ba580bcd662791c38fd1c8ef9b824084028c15d6bcce41908cf5595ad0bd9329"
+PKG_VERSION="28bd921"
+PKG_SHA256="e0f35803697d055f5defbb1a405804149860bc49a451819ade1b00fb2724a5dc"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk
index 463839eb1b..16ec072b09 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.timidity"
-PKG_VERSION="5567057"
-PKG_SHA256="a6bfa95a08bdc2ceb8adf163d8c78c274fbb406df3dcc9d3bc78b753a8c814a7"
+PKG_VERSION="8d37e2c"
+PKG_SHA256="c99f3271409414e0675392a10b590eb77c81801eb86f69043948b65f5e706607"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk
index 95099d6951..14db4260f2 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.vgmstream"
-PKG_VERSION="9569fe5"
-PKG_SHA256="f872b029370dc613194bd93536558a118f352eed0ca3035f36a8202c39143d33"
+PKG_VERSION="43e05e4"
+PKG_SHA256="7b57a437514e9ac31736f72f5be7634560e116b31949969d028a1f63a51be893"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audioencoder.flac/package.mk b/packages/mediacenter/kodi-binary-addons/audioencoder.flac/package.mk
index 11d0401b01..d1b1a015b7 100644
--- a/packages/mediacenter/kodi-binary-addons/audioencoder.flac/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audioencoder.flac/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audioencoder.flac"
-PKG_VERSION="1d540c6"
-PKG_SHA256="eb8eba562012a1048129f679ef0bd240a776fbca73b843084723af895212b0fe"
+PKG_VERSION="ed75200"
+PKG_SHA256="25f4449024fcaba0ccf519e565ae679e701dafb36035a56afd2197f6121c9bba"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk b/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk
index 53c233452e..ac5afcd75e 100644
--- a/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="imagedecoder.raw"
-PKG_VERSION="8d9c448"
-PKG_SHA256="f0ec7c790fe37131c5a51b0dbe0f095bd0329dc6601c02bd6cd4627cf994f607"
+PKG_VERSION="aa45f0a"
+PKG_SHA256="5883d0f49e0f88e00a13dfcccf622032f0e0df5b9f67e99747d98fd500bbffb8"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
index 6c19522058..358285b995 100644
--- a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="inputstream.adaptive"
-PKG_VERSION="1656efc"
-PKG_SHA256="68e72db74706dc6a03f7d19125e4c9e62868b6aed078c6fb595f8b326a54f732"
+PKG_VERSION="dde3921"
+PKG_SHA256="a3b2f2c47a9545921980fe1b81825538fd877c0ad9809ed266c80f5cba7544e6"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.kodi.tv"
 PKG_URL="https://github.com/peak3d/inputstream.adaptive/archive/$PKG_VERSION.tar.gz"
diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk
index 76acd71cd2..b76ab3cfab 100644
--- a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="inputstream.rtmp"
-PKG_VERSION="e094fa3"
-PKG_SHA256="00e82db4cac59296f267192e6fc12dbeebf63db34985ed35819680757c76c663"
+PKG_VERSION="26260c9"
+PKG_SHA256="e55d808ed6a23138aa3abe94300013fb5656cba0efe210306db92c80d523185e"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.kodi.tv"
 PKG_URL="https://github.com/notspiff/inputstream.rtmp/archive/$PKG_VERSION.tar.gz"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk
index 008d9d2978..7a05d2f2e6 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.mythtv"
-PKG_VERSION="b46e5be"
-PKG_SHA256="f9a5cd6c172ce5f4a4cb1db05dae0cf8adfc43776fd9a3a8ef55ca0865ce2e52"
+PKG_VERSION="8965048"
+PKG_SHA256="a894d858a17c448ac66ea6631004135d3170d23c15b220b1e48c149a7c4c2bfe"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
index 76091f2fb4..70dbd3b122 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.nextpvr"
-PKG_VERSION="a855663"
-PKG_SHA256="471e2ef3922bb26d5df83b2bb71a78ee322861c736dd72ae21a45593317c55ee"
+PKG_VERSION="03933e9"
+PKG_SHA256="b0d32816deed7e744e9785d23fffaa8e63d8dadb416aa841cc061f4cb559dd4d"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk
index 9e3444bf8d..0d281f75d7 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.njoy"
-PKG_VERSION="99874de"
-PKG_SHA256="3bbd2b992825d2d786f2ce86d0d7161ceb9c8c97bb3cd4a6c365cce75cc2836c"
+PKG_VERSION="cc1cb56"
+PKG_SHA256="35425e762e780fc19759cdbc504a25f23be15e0da25a58c30056aeb9709061c1"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.octonet/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.octonet/package.mk
index dcf90defed..44242c552a 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.octonet/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.octonet/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.octonet"
-PKG_VERSION="a77cf11"
-PKG_SHA256="9217e8e0bec3b882dd0c7cb30b9488be64514514e91dbad31556da1ad435b166"
+PKG_VERSION="e9b4c05"
+PKG_SHA256="01bd1f5584cc5f781c09e33e0123b70037edcda35cfc02b5d50f5536fdb56608"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk
index 635b5441fa..148260af50 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.teleboy"
-PKG_VERSION="a0c218b"
-PKG_SHA256="eabe85ec76c140c9703598266c59d1b16197dc4e3461c7c7e4d13f61051a4439"
+PKG_VERSION="94bb643"
+PKG_SHA256="92d62261385eb7b9852252070075ae968354c2dd6f96f8fd46cc2196d27e619c"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk
index 3f12d45577..933f7c5a42 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.vbox"
-PKG_VERSION="b5da811"
-PKG_SHA256="b5fd9c726df32d49bd08ed565b551c6a62b864d8768870240f0d7dc288f221ff"
+PKG_VERSION="ff01396"
+PKG_SHA256="c4d6a0dc2f89c47de7ffc1fa2e1e7b2bb92ae1bf77b5ffcbdc5dccd6537d0c35"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
index 3a90f1d8f3..3150de64dd 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.vdr.vnsi"
-PKG_VERSION="7e6e385"
-PKG_SHA256="618ba2c7c33be4df580b29c913caf47430d979d91b41013a96c006fcb9407e11"
+PKG_VERSION="f3f80d5"
+PKG_SHA256="f89bebb6b81f8ad21b520837e227fd175d7e7bc59d5d492484e3528f14c50766"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
index 97914fa313..0d687cd107 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.vuplus"
-PKG_VERSION="9bfd868"
-PKG_SHA256="54f59345f9f226c528572a274ddb26e33f0e551786e7c926f7429c35340280b4"
+PKG_VERSION="5e154bc"
+PKG_SHA256="aa193e058c746dd459665d13289411073f29c7f2d740e0f17c3b870faae19158"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
index 55ad63bf08..53a75cb170 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.wmc"
-PKG_VERSION="d820bf8"
-PKG_SHA256="723e25571da3261d70f7911dc72bd881ca394b67d2dd9b4b022fcfe2aa754acd"
+PKG_VERSION="7e2cb4b"
+PKG_SHA256="d935ecf8dcc137953698cb7ea3bc7c8e3674dfaca2c038045fcbec481d9cd35c"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
index 74d3e40485..56f40ac647 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.zattoo"
-PKG_VERSION="a24879b"
-PKG_SHA256="670308d5982dd4ce18b620c485c36aafa019acb292f7924cdc560a286c48540c"
+PKG_VERSION="73009cd"
+PKG_SHA256="431960430b354250dbb4e9f3b78fe6ee0046762a5b505139b94073580e0b05bd"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk
index 3557417f9e..356e2ddf95 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.asteroids"
-PKG_VERSION="d606895"
-PKG_SHA256="d8cf9d2ced18a6a13a11c1f8749266563ad4a847c8785241fe3dc8575b4cf69f"
+PKG_VERSION="2418981"
+PKG_SHA256="f69ce2b58494f7ba8e714c9c8f738661e0d9ff56fc96dcec225295a1359748c9"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk
index d4c8f8ddd2..d89b713e7d 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.asterwave"
-PKG_VERSION="4326ddc"
-PKG_SHA256="f29d6dd707ef5cd69abcec14af71a2e9623caf207fd12a0e9e0e0379fc3bf798"
+PKG_VERSION="5bb1c48"
+PKG_SHA256="1213695199587155d9f46a7c96586ee46cebfb1b5d373c1b1e2ba77de19381af"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk
index ec132f9b19..28fbe5d69d 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.biogenesis"
-PKG_VERSION="c1ea27c"
-PKG_SHA256="a13adce077df37926da5fb4fd4f3f61902b19c7400fa9d7dfc92ab982efd379b"
+PKG_VERSION="5241aec"
+PKG_SHA256="57185a419f7c32dfefeb7c82ed3f07f6f8840f2ac7da5d4c03d023a2cda44238"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk
index 25d9344a25..506d703c04 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.cpblobs"
-PKG_VERSION="6acb909"
-PKG_SHA256="9d238824d5cb2dccd76ef40bac2b4ec3f38e815d4167c8d86f78501c52ca7b28"
+PKG_VERSION="be324f3"
+PKG_SHA256="10669b1dd1b7f5677af468e73ec48270218c3112cc83b95168f2f3b426b38d00"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk
index 4112b21d20..7d5735c911 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.greynetic"
-PKG_VERSION="8b7b810"
-PKG_SHA256="305ab296fb6a60f538309b095332f58f67d6e542ec380a886a6107eab02e5a91"
+PKG_VERSION="6aefc4b"
+PKG_SHA256="b23ff0b2db842eebb58c147057ac835184f121c065ce0d33c2d03534ea95d28f"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk
index 053adf667f..465556b4d4 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.matrixtrails"
-PKG_VERSION="3323406"
-PKG_SHA256="f1030704c6b6e179a074edbe36fb41cce2cbe580da26ad41848aad044b690aad"
+PKG_VERSION="99c5649"
+PKG_SHA256="46da66cd6b41b02d04e1c7ad01baf9294fa76e18596bdcf0d9e1fa595a7281a4"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk
index 00a2e9ffed..477f8e41fe 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.pingpong"
-PKG_VERSION="cfd0a05"
-PKG_SHA256="c99ca83607dd9313ffde1ba809df9339cc923e1f9fc7be7c88af6b5b41b49a0a"
+PKG_VERSION="3a27396"
+PKG_SHA256="e87d270e05b446174a937b0e1d468812476f332ed0c194387adbbdf2df1c2163"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk
index 9a61c88ca6..19a70d7178 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.pyro"
-PKG_VERSION="97e997e"
-PKG_SHA256="d873b67eb516a625a07554cab44495414dfb2aea92874ee268ad35702959b01c"
+PKG_VERSION="f91a732"
+PKG_SHA256="3f016bef45d36c0b8a6ab16b6b82c7c47b433d349db9d025d049d04901457ffc"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk
index 5a5833e61b..7c93436adb 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.stars"
-PKG_VERSION="e0da61c"
-PKG_SHA256="be90a6b4158b4298ca5ebf4b25fb98d9a784c01659e2454cc0aa2e142aa935d4"
+PKG_VERSION="bb61e49"
+PKG_SHA256="418e5c0dcf010b83b2cdf7ca00ff27b663359d0706ed00ac85fd841a3e943f43"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk b/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk
index 49270135cb..6ab26bf66a 100644
--- a/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensavers.rsxs"
-PKG_VERSION="e302833"
-PKG_SHA256="9d05d2315616cb578818243d8c7cb7486f5407613f4e8ca5d87a109bc73d380c"
+PKG_VERSION="3b74bb6"
+PKG_SHA256="5ea9b045e98a3ebccd12a2c4c238f97493d9128d68f50fd208365c5666a443f2"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk b/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk
index b8543b9b00..47a774a2e3 100644
--- a/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="vfs.rar"
-PKG_VERSION="53294af"
-PKG_SHA256="5007f097ffafb64c61bf31a902959a334819fdd26eb273d52a8437382eda6200"
+PKG_VERSION="22292bc"
+PKG_SHA256="4d2df1c6dc31f46dedf828f057ed90ca83f400c6f521c1a05510f82999febcaa"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk
index 2e23137975..0158a01d6b 100644
--- a/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="visualization.fishbmc"
-PKG_VERSION="ccc919c"
-PKG_SHA256="0a642873a2ba5acea271d04600160c7143c050f6b637db7d55a76ecb627c6e21"
+PKG_VERSION="3dae2bd"
+PKG_SHA256="471765286c6054717980510edf5d49390b0d4f38289c83830a9e0a444202825c"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk
index 56ac1c1b78..748a1f518a 100644
--- a/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="visualization.goom"
-PKG_VERSION="0c93889" -PKG_SHA256="62b5b8d9f63650633a447c21f17b5341d11404a7238c2dda20283990d031cf5a" +PKG_VERSION="65f1d9c" +PKG_SHA256="7436332d329c275a5fd1a395b1312919726ed83d7d5375ca08fb305b49b2c590" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk index c9fb41eb85..0447467721 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="visualization.pictureit" -PKG_VERSION="66f88ff" -PKG_SHA256="8b91e71e4c7828a9bfa3df3fdce07aa5b0f9fab153bdf255a53833405f5f7e41" +PKG_VERSION="8eb74a6" +PKG_SHA256="358ced879c541974a4a2dbbaa7c6f633e77adb066bb639bb585e26a50820fd43" PKG_REV="2" PKG_ARCH="x86_64" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk index 3e8850c0b8..74cb56b432 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="visualization.projectm" -PKG_VERSION="a39f73b" -PKG_SHA256="5bf7d97acb5a4144b0c82397d39ea099eb9b4cc3c74aeb18f73352aee12bc06f" +PKG_VERSION="bc05ed8" +PKG_SHA256="b1e1db697502aa6810277b69d0e0141e40b6fb9cbd4f08298cceff0152544102" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk index ce97fda3ba..d839059b12 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="visualization.shadertoy" -PKG_VERSION="1717f36" -PKG_SHA256="988015445128036f79f006a0df9c1692838436b4420e418bf84a7113bfc46300" +PKG_VERSION="764d59d" +PKG_SHA256="0b050831c6f9b7de89d7cebb6d6b7984a4675db3744cd5c7c8aebaf6251c9181" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk index 4a0072f5e8..e7ab5dcf78 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="visualization.spectrum" -PKG_VERSION="9b4a792" -PKG_SHA256="6d2120bd1c1cb04233998736fd7ae43e42388b4b44ebe92331cd7c0064b37bf8" +PKG_VERSION="d75d995" +PKG_SHA256="1d838196a38bca5b1ca6b29f340165e4249513c548f9f183b2b07fbd10dae268" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk index 0ec0d8c268..5072591ebb 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk +++ 
b/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="visualization.waveform" -PKG_VERSION="0e78a14" -PKG_SHA256="19d3377424daf6fd5a90e1707e71bd8ab34fe94b9a703c184f0e17ab8a73f514" +PKG_VERSION="8204be7" +PKG_SHA256="457d861a8ef5a054339effe803b4aae801256282b098db63ae45aa90a9c30c9e" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk index ddcd2f6348..efa80c96ca 100644 --- a/packages/mediacenter/kodi/package.mk +++ b/packages/mediacenter/kodi/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="kodi" -PKG_VERSION="ef76936" -PKG_SHA256="6fd4354279bcdb6b20fcc8968ca54125027e88ad55444c8d91da1ab9b436c59d" +PKG_VERSION="9d82343" +PKG_SHA256="c22b044ca692798049b731a69bec501b88fb41a59304b9d3b85d51331d2bdea7" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="http://www.kodi.tv" diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index e8561d2993..9444284475 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -18,8 +18,8 @@ PKG_NAME="ffmpeg" # Current branch is: release/3.3-kodi -PKG_VERSION="30554d7" -PKG_SHA256="a1bc2f092e1b11ea3271a8fdcef8ec2f9bee7e1cf05f0a1b89ec7f903fee6d14" +PKG_VERSION="20f6654" +PKG_SHA256="34d4f16d529b03d276fe7cbab8c7d12c4dfd51f0c1f78c5f38fab4a66a836deb" PKG_ARCH="any" PKG_LICENSE="LGPLv2.1+" PKG_SITE="https://ffmpeg.org" diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch index 2786d22397..1fc696eac4 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch @@ -1,8 +1,16 @@ diff --git a/.gitignore b/.gitignore -index 524fb73..305632b 100644 +index 524fb73c16..bcc983739f 100644 --- a/.gitignore +++ b/.gitignore -@@ -23,6 +23,7 @@ +@@ -1,6 +1,7 @@ + *.a + *.o + *.o.* ++*.bin + *.d + *.def + *.dll +@@ -23,6 +24,7 @@ .\#* /.config /.version @@ -11,7 +19,7 @@ index 524fb73..305632b 100644 /ffplay /ffprobe diff --git a/ffmpeg.c b/ffmpeg.c -index 4b4dae4..9a7c29c 100644 +index 4b4dae47fe..0149e73f46 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -23,6 +23,11 @@ @@ -20,13 +28,21 @@ index 4b4dae4..9a7c29c 100644 +#ifdef RPI +#define RPI_DISPLAY -+#define RPI_ZERO_COPY ++#define RPI_DISPLAY_ALL 0 +#endif + #include "config.h" #include #include -@@ -69,6 +74,25 @@ +@@ -43,6 +48,7 @@ + #include "libavformat/avformat.h" + #include "libavdevice/avdevice.h" + #include "libswresample/swresample.h" ++#include "libavutil/atomic.h" + #include "libavutil/opt.h" + #include "libavutil/channel_layout.h" + #include "libavutil/parseutils.h" +@@ -69,6 +75,25 @@ # include "libavfilter/buffersrc.h" # include "libavfilter/buffersink.h" @@ -38,21 +54,21 @@ index 4b4dae4..9a7c29c 100644 +#include +#include +#include ++#include +#include +#include +#include +#include +#pragma GCC diagnostic pop -+#ifdef RPI_ZERO_COPY +#include "libavcodec/rpi_qpu.h" -+#endif ++#include "libavutil/rpi_sand_fns.h" +#include "libavcodec/rpi_zc.h" +#endif + #if HAVE_SYS_RESOURCE_H #include #include -@@ -165,6 +189,182 @@ static int restore_tty; +@@ -165,6 +190,241 @@ static int restore_tty; static void free_input_threads(void); #endif @@ -60,39 
+76,36 @@ index 4b4dae4..9a7c29c 100644 + +#define NUM_BUFFERS 4 + -+static MMAL_COMPONENT_T* rpi_display = NULL; -+static MMAL_POOL_T *rpi_pool = NULL; -+static volatile int rpi_display_count = 0; + -+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h) ++typedef struct rpi_display_env_s ++{ ++ MMAL_COMPONENT_T* display; ++ MMAL_COMPONENT_T* isp; ++ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup ++ MMAL_CONNECTION_T * conn; ++ ++ MMAL_POOL_T *rpi_pool; ++ volatile int rpi_display_count; ++ enum AVPixelFormat avfmt; ++} rpi_display_env_t; ++ ++static rpi_display_env_t * rpi_display_env = NULL; ++ ++ ++static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port) +{ + MMAL_POOL_T* pool; -+ size_t i; -+ size_t size = (w*h*3)/2; -+#ifdef RPI_ZERO_COPY + mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image? + pool = mmal_port_pool_create(port, NUM_BUFFERS, 0); + assert(pool); -+#else -+ pool = mmal_port_pool_create(port, NUM_BUFFERS, size); -+ -+ for (i = 0; i < NUM_BUFFERS; ++i) -+ { -+ MMAL_BUFFER_HEADER_T* buffer = pool->header[i]; -+ char * bufPtr = buffer->data; -+ memset(bufPtr, i*30, w*h); -+ memset(bufPtr+w*h, 128, (w*h)/2); -+ } -+#endif + + return pool; +} + +static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { -+#ifdef RPI_ZERO_COPY ++ rpi_display_env_t *const de = (rpi_display_env_t *)port->userdata; + av_rpi_zc_unref(buffer->user_data); -+ --rpi_display_count; -+#endif ++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, -1); + mmal_buffer_header_release(buffer); +} + @@ -100,9 +113,12 @@ index 4b4dae4..9a7c29c 100644 + mmal_buffer_header_release(buffer); +} + -+static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h) ++#define DISPLAY_PORT_DEPTH 4 ++ ++static rpi_display_env_t * ++display_init(const enum AVPixelFormat req_fmt, size_t x, size_t y, size_t w, size_t h) +{ -+ MMAL_COMPONENT_T* display; ++ MMAL_STATUS_T err; + MMAL_DISPLAYREGION_T region = + { + .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)}, @@ -111,51 +127,113 @@ index 4b4dae4..9a7c29c 100644 + .fullscreen = 0, + .dest_rect = {x, y, w, h} + }; ++#if RPI_ZC_SAND_8_IN_10_BUF ++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10 || av_rpi_is_sand_format(req_fmt)) ? AV_PIX_FMT_SAND128 : req_fmt; ++#else ++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10) ? AV_PIX_FMT_SAND128 : req_fmt; ++#endif + const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h); ++ rpi_display_env_t * de; ++ int isp_req = (fmt == AV_PIX_FMT_SAND64_10); + -+ bcm_host_init(); // TODO is this needed? -+ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display); -+ assert(display); ++ bcm_host_init(); // Needs to be done by someone... + -+ mmal_port_parameter_set(display->input[0], ®ion.hdr); ++ if ((de = av_mallocz(sizeof(*de))) == NULL) { ++ return NULL; ++ } ++ ++ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display); ++ av_assert0(de->display); ++ de->port_in = de->display->input[0]; ++ ++ if (isp_req) ++ { ++ mmal_component_create("vc.ril.isp", &de->isp); ++ de->port_in = de->isp->input[0]; ++ } ++ ++ mmal_port_parameter_set(de->display->input[0], ®ion.hdr); + + { -+ MMAL_ES_FORMAT_T* format = display->input[0]->format; -+ format->encoding = fmt == AV_PIX_FMT_SAND128 ? 
MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420; ++ MMAL_PORT_T * const port = de->port_in; ++ MMAL_ES_FORMAT_T* const format = port->format; ++ port->userdata = (struct MMAL_PORT_USERDATA_T *)de; ++ port->buffer_num = DISPLAY_PORT_DEPTH; ++ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : ++ fmt == AV_PIX_FMT_SAND64_10 ? MMAL_ENCODING_YUVUV64_16 : ++ MMAL_ENCODING_I420; + format->es->video.width = geo.stride_y; -+ format->es->video.height = geo.height_y; ++ format->es->video.height = (fmt == AV_PIX_FMT_SAND128 || fmt == AV_PIX_FMT_SAND64_10) ? ++ (h + 15) & ~15 : geo.height_y; // Magic + format->es->video.crop.x = 0; + format->es->video.crop.y = 0; + format->es->video.crop.width = w; + format->es->video.crop.height = h; -+ mmal_port_format_commit(display->input[0]); ++ mmal_port_format_commit(port); + } + -+ mmal_component_enable(display); ++ de->rpi_pool = display_alloc_pool(de->port_in); ++ mmal_port_enable(de->port_in,display_cb_input); + -+ rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y); ++ if (isp_req) { ++ MMAL_PORT_T * const port_out = de->isp->output[0]; ++ mmal_log_dump_port(de->port_in); ++ mmal_format_copy(port_out->format, de->port_in->format); ++ if (fmt == AV_PIX_FMT_SAND64_10) { ++ if ((err = mmal_port_parameter_set_int32(de->port_in, MMAL_PARAMETER_CCM_SHIFT, 5)) != MMAL_SUCCESS || ++ (err = mmal_port_parameter_set_int32(port_out, MMAL_PARAMETER_OUTPUT_SHIFT, 1)) != MMAL_SUCCESS) ++ { ++ av_log(NULL, AV_LOG_WARNING, "Failed to set ISP output port shift\n"); ++ } ++ else ++ av_log(NULL, AV_LOG_WARNING, "Set ISP output port shift OK\n"); + -+ mmal_port_enable(display->input[0],display_cb_input); -+ mmal_port_enable(display->control,display_cb_control); ++ } ++ port_out->format->encoding = MMAL_ENCODING_I420; ++ mmal_log_dump_port(port_out); ++ if ((err = mmal_port_format_commit(port_out)) != MMAL_SUCCESS) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Failed to set ISP output port format\n"); ++ goto fail; ++ } ++ if ((err = mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING)) != MMAL_SUCCESS) { ++ av_log(NULL, AV_LOG_ERROR, "Failed to create connection\n"); ++ goto fail; ++ } ++ if ((err = mmal_connection_enable(de->conn)) != MMAL_SUCCESS) { ++ av_log(NULL, AV_LOG_ERROR, "Failed to enable connection\n"); ++ goto fail; ++ } ++ mmal_port_enable(de->isp->control,display_cb_control); ++ mmal_component_enable(de->isp); ++ } ++ ++ mmal_component_enable(de->display); ++ mmal_port_enable(de->display->control,display_cb_control); ++ de->avfmt = fmt; + + printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt); + -+ return display; ++ return de; ++ ++fail: ++ // **** Free stuff ++ return NULL; +} + -+static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr) ++static void display_frame(struct AVCodecContext * const s, rpi_display_env_t * const de, const AVFrame* const fr) +{ + MMAL_BUFFER_HEADER_T* buf; + -+ if (!display || !rpi_pool) ++ if (de == NULL) + return; + -+ if (rpi_display_count >= 3) { ++ if (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + av_log(s, AV_LOG_VERBOSE, "Frame dropped\n"); + return; + } + -+ buf = mmal_queue_get(rpi_pool->queue); ++ buf = mmal_queue_get(de->rpi_pool->queue); + if (!buf) { + // Running too fast so drop the frame + printf("Q alloc failure\n"); @@ -165,67 +243,64 @@ index 4b4dae4..9a7c29c 100644 + buf->cmd = 0; + buf->offset = 0; // Offset to 
valid data + buf->flags = 0; -+#ifdef RPI_ZERO_COPY -+{ -+ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1); -+ if (fr_buf == NULL) { -+ mmal_buffer_header_release(buf); -+ return; -+ } -+ -+ buf->user_data = fr_buf; -+ buf->data = av_rpi_zc_vc_handle(fr_buf); -+ buf->offset = av_rpi_zc_offset(fr_buf); -+ buf->length = av_rpi_zc_length(fr_buf); -+ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); -+#if 0 + { -+ unsigned int n; -+ for (n = 0; n < fr->width; n += 128) { -+ memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2); ++ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, de->avfmt, 1); ++ if (fr_buf == NULL) { ++ mmal_buffer_header_release(buf); ++ return; + } ++ ++ buf->user_data = fr_buf; ++ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal ++ buf->offset = av_rpi_zc_offset(fr_buf); ++ buf->length = av_rpi_zc_length(fr_buf); ++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); ++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, 1); + } -+#endif -+ ++rpi_display_count; -+} -+#else -+{ -+#error YYY -+ int w = fr->width; -+ int h = fr->height; -+ int w2 = (w+31)&~31; -+ int h2 = (h+15)&~15; -+ -+ buf->length = (w2 * h2 * 3)/2; -+ buf->user_data = NULL; -+ -+ //mmal_buffer_header_mem_lock(buf); -+ memcpy(buf->data, fr->data[0], w2 * h); -+ memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4); -+ memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4); -+ //mmal_buffer_header_mem_unlock(buf); -+} -+#endif -+ -+ while (rpi_display_count >= 3) { ++#if RPI_DISPLAY_ALL ++ while (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + usleep(5000); + } ++#endif + -+ if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS) ++ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS) + { -+ printf("** send failed: depth=%d\n", rpi_display_count); -+ display_cb_input(NULL, buf); ++ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count); ++ display_cb_input(de->port_in, buf); + } +} + -+static void display_exit(MMAL_COMPONENT_T* display) ++static void display_exit(rpi_display_env_t ** const pde) +{ ++ rpi_display_env_t * const de = *pde; ++ *pde = NULL; ++ ++ if (de != NULL) { +// sleep(120); -+ if (display) { -+ mmal_component_destroy(display); -+ } -+ if (rpi_pool) { -+ mmal_port_pool_destroy(display->input[0], rpi_pool); ++ ++ if (de->port_in != NULL) { ++ mmal_port_disable(de->port_in); ++ } ++ ++ // The above disable should kick out all buffers - check that ++ if (avpriv_atomic_int_get(&de->rpi_display_count) != 0) { ++ av_log(NULL, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", avpriv_atomic_int_get(&de->rpi_display_count)); ++ } ++ ++ if (de->conn != NULL) { ++ mmal_connection_destroy(de->conn); ++ } ++ if (de->isp != NULL) { ++ mmal_component_destroy(de->isp); ++ } ++ if (de->display != NULL) { ++ mmal_component_destroy(de->display); ++ } ++ if (de->rpi_pool != NULL) { ++ mmal_port_pool_destroy(de->display->input[0], de->rpi_pool); ++ } ++ ++ av_free(de); + } +} + @@ -235,29 +310,30 @@ index 4b4dae4..9a7c29c 100644 /* sub2video hack: Convert subtitles to video with alpha to insert them in filter graphs. This is a temporary solution until libavfilter gets real subtitles support. 
-@@ -576,6 +776,11 @@ static void ffmpeg_cleanup(int ret) +@@ -576,6 +836,11 @@ static void ffmpeg_cleanup(int ret) avformat_close_input(&input_files[i]->ctx); av_freep(&input_files[i]); } + +#ifdef RPI_DISPLAY -+ display_exit(rpi_display); ++ display_exit(&rpi_display_env); +#endif + for (i = 0; i < nb_input_streams; i++) { InputStream *ist = input_streams[i]; -@@ -588,6 +793,9 @@ static void ffmpeg_cleanup(int ret) +@@ -587,7 +852,9 @@ static void ffmpeg_cleanup(int ret) + av_freep(&ist->filters); av_freep(&ist->hwaccel_device); av_freep(&ist->dts_buffer); - -+#ifdef RPI_ZERO_COPY +- ++#ifdef RPI_DISPLAY + av_rpi_zc_uninit(ist->dec_ctx); +#endif avcodec_free_context(&ist->dec_ctx); av_freep(&input_streams[i]); -@@ -618,6 +826,7 @@ static void ffmpeg_cleanup(int ret) +@@ -618,6 +885,7 @@ static void ffmpeg_cleanup(int ret) } term_exit(); ffmpeg_exited = 1; @@ -265,28 +341,28 @@ index 4b4dae4..9a7c29c 100644 } void remove_avoptions(AVDictionary **a, AVDictionary *b) -@@ -1053,6 +1262,15 @@ static void do_video_out(OutputFile *of, +@@ -1053,6 +1321,15 @@ static void do_video_out(OutputFile *of, if (ost->source_index >= 0) ist = input_streams[ost->source_index]; +#ifdef RPI_DISPLAY + if (next_picture && ist != NULL) + { -+ if (!rpi_display) -+ rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); -+ display_frame(ist->dec_ctx, rpi_display, next_picture); ++ if (rpi_display_env == NULL) ++ rpi_display_env = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); ++ display_frame(ist->dec_ctx, rpi_display_env, next_picture); + } +#endif + frame_rate = av_buffersink_get_frame_rate(filter); if (frame_rate.num > 0 && frame_rate.den > 0) duration = 1/(av_q2d(frame_rate) * av_q2d(enc->time_base)); -@@ -2884,6 +3102,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) +@@ -2884,6 +3161,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ist->dec_ctx->opaque = ist; ist->dec_ctx->get_format = get_format; ist->dec_ctx->get_buffer2 = get_buffer; + -+#ifdef RPI_ZERO_COPY ++#ifdef RPI_DISPLAY + // Overrides the above get_buffer2 + av_rpi_zc_init(ist->dec_ctx); +#endif @@ -295,39 +371,44 @@ index 4b4dae4..9a7c29c 100644 av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0); diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 0dd0c7b..b9732c5 100644 +index 0dd0c7b1bb..99755a297e 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -5,6 +5,12 @@ NAME = avcodec +@@ -5,6 +5,15 @@ NAME = avcodec HEADERS = avcodec.h \ avdct.h \ avfft.h \ + rpi_qpu.h \ + rpi_shader.h \ -+ rpi_shader_cmd.h \ ++ rpi_shader_cmd.h \ ++ rpi_shader_template.h \ ++ rpi_shader_template_fn.h \ + rpi_mailbox.h \ -+ rpi_hevc_transform.h \ ++ rpi_hevc_transform8.h \ ++ rpi_hevc_transform10.h \ + rpi_zc.h \ d3d11va.h \ dirac.h \ dv_profile.h \ -@@ -47,6 +53,10 @@ OBJS = allcodecs.o \ +@@ -47,6 +56,11 @@ OBJS = allcodecs.o \ resample.o \ resample2.o \ utils.o \ + rpi_qpu.o \ + rpi_shader.o \ ++ rpi_shader_template.o \ + rpi_mailbox.o \ + rpi_zc.o \ vorbis_parser.o \ xiph.o \ -@@ -1103,3 +1113,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h +@@ -1103,3 +1117,30 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h endif + +QASM_PY := ../local/bin/qasm.py ++VASMVIDCORE := ../local/bin/vasmvidcore_std + +ifneq ("$(wildcard $(QASM_PY))","") +$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm @@ -337,9 +418,23 @@ 
index 0dd0c7b..b9732c5 100644 + $(QASM_PY) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ +endif + -+$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h ++ifneq ("$(wildcard $(VASMVIDCORE))","") ++$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@ ++$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@ ++ ++$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin ++ python pi-util/make_array.py $< ++$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin ++ python pi-util/make_array.py $< ++ ++endif ++ ++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h ++$(SUBDIR)hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c -index 4df4772..ca05158 100644 +index 4df4772e02..ca05158de8 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -696,6 +696,7 @@ static void register_all(void) @@ -351,10 +446,10 @@ index 4df4772..ca05158 100644 REGISTER_PARSER(MJPEG, mjpeg); REGISTER_PARSER(MLP, mlp); diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index 1eeac54..a94a240 100644 +index 1eeac5449e..7e23777f5d 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile -@@ -134,9 +134,13 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ +@@ -134,9 +134,14 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ @@ -364,13 +459,14 @@ index 1eeac54..a94a240 100644 arm/hevcdsp_idct_neon.o \ - arm/hevcdsp_qpel_neon.o + arm/hevcdsp_cres_neon.o \ ++ arm/hevcdsp_res16_neon.o \ + arm/hevcdsp_qpel_neon.o \ + arm/hevcdsp_sao_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h -index fdbf86b..0a3980a 100644 +index fdbf86b45e..0a3980a1ef 100644 --- a/libavcodec/arm/cabac.h +++ b/libavcodec/arm/cabac.h @@ -26,13 +26,34 @@ @@ -553,7 +649,7 @@ index fdbf86b..0a3980a 100644 #endif /* AVCODEC_ARM_CABAC_H */ diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h new file mode 100644 -index 0000000..31d3c59 +index 0000000000..31d3c59205 --- /dev/null +++ b/libavcodec/arm/hevc_cabac.h @@ -0,0 +1,491 @@ @@ -1048,9 +1144,239 @@ index 0000000..31d3c59 +#endif /* HAVE_ARMV6T2_INLINE */ + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ +diff --git a/libavcodec/arm/hevc_idct_fn_neon.S b/libavcodec/arm/hevc_idct_fn_neon.S +new file mode 100644 +index 0000000000..380d3c8d3b +--- /dev/null ++++ b/libavcodec/arm/hevc_idct_fn_neon.S +@@ -0,0 +1,224 @@ ++@ Included multiple times from hevc_idct_neon.S ++@ Macros defined there ++ ++#define DC_SHIFT (15 - BIT_DEPTH) ++#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) ++#define TRN_SHIFT (20 - BIT_DEPTH) ++ ++function JOIN(ff_hevc_idct_4x4_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q0, r1 ++ vdup.16 q1, r1 ++ vst1.16 {q0, q1}, [r0] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_8x8_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 
++ vmov.16 q15, q8 ++ vstm r0, {q8-q15} ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_16x16_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 ++ vmov.16 q15, q8 ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ vstm r0, {q8-q15} ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_32x32_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ mov r3, #16 ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 ++ vmov.16 q15, q8 ++1: subs r3, #1 ++ vstm r0!, {q8-q15} ++ bne 1b ++ bx lr ++endfunc ++ ++ ++function JOIN(ff_hevc_transform_4x4_neon_, BIT_DEPTH), export=1 ++ vpush {d8-d15} ++ vld1.16 {q14, q15}, [r0] // coeffs ++ ldr r3, =0x00240053 // 36 and 83 ++ vmov.32 d0[0], r3 ++ ++ tr4_shift d28, d29, d30, d31, #7 ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ tr4_shift d28, d29, d30, d31, #(TRN_SHIFT) ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ vst1.16 {q14, q15}, [r0] ++ vpop {d8-d15} ++ bx lr ++endfunc ++ ++ ++ ++function JOIN(ff_hevc_transform_luma_4x4_neon_, BIT_DEPTH), export=1 ++ vpush {d8-d15} ++ vld1.16 {q14, q15}, [r0] // coeffs ++ ldr r3, =0x4a // 74 ++ vmov.32 d0[0], r3 ++ ldr r3, =0x1d // 29 ++ vmov.32 d0[1], r3 ++ ldr r3, =0x37 // 55 ++ vmov.32 d1[0], r3 ++ ++ tr4_luma_shift d28, d29, d30, d31, #7 ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ tr4_luma_shift d28, d29, d30, d31, #(TRN_SHIFT) ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ vst1.16 {q14, q15}, [r0] ++ vpop {d8-d15} ++ bx lr ++endfunc ++ ++ ++ ++function JOIN(ff_hevc_transform_8x8_neon_, BIT_DEPTH), export=1 ++ push {r4-r8} ++ vpush {d8-d15} ++ mov r5, #16 ++ ++ adrl r3, tr4f ++ vld1.16 {d0, d1}, [r3] ++ ++ // left half ++ vld1.16 {d24}, [r0], r5 ++ vld1.16 {d25}, [r0], r5 ++ vld1.16 {d26}, [r0], r5 ++ vld1.16 {d27}, [r0], r5 ++ vld1.16 {d28}, [r0], r5 ++ vld1.16 {d29}, [r0], r5 ++ vld1.16 {d30}, [r0], r5 ++ vld1.16 {d31}, [r0], r5 ++ sub r0, #128 ++ tr8_begin d25, d27, d29, d31 ++ tr4 d24, d26, d28, d30 ++ tr8_end #7 ++ vst1.16 {d2}, [r0], r5 ++ vst1.16 {d3}, [r0], r5 ++ vst1.16 {d4}, [r0], r5 ++ vst1.16 {d5}, [r0], r5 ++ vst1.16 {d6}, [r0], r5 ++ vst1.16 {d7}, [r0], r5 ++ vst1.16 {d8}, [r0], r5 ++ vst1.16 {d9}, [r0], r5 ++ sub r0, #128 ++ //skip right half if col_limit in r1 is less than 4 ++ cmp r1, #4 ++ blt 1f ++ //right half ++ add r0, #8 ++ vld1.16 {d24}, [r0], r5 ++ vld1.16 {d25}, [r0], r5 ++ vld1.16 {d26}, [r0], r5 ++ vld1.16 {d27}, [r0], r5 ++ vld1.16 {d28}, [r0], r5 ++ vld1.16 {d29}, [r0], r5 ++ vld1.16 {d30}, [r0], r5 ++ vld1.16 {d31}, [r0], r5 ++ sub r0, #128 ++ tr8_begin d25, d27, d29, d31 ++ tr4 d24, d26, d28, d30 ++ tr8_end #7 ++ vst1.16 {d2}, [r0], r5 ++ vst1.16 {d3}, [r0], r5 ++ vst1.16 {d4}, [r0], r5 ++ vst1.16 {d5}, [r0], r5 ++ vst1.16 {d6}, [r0], r5 ++ vst1.16 {d7}, [r0], r5 ++ vst1.16 {d8}, [r0], r5 ++ vst1.16 {d9}, [r0], r5 ++ sub r0, #136 ++1: ++ // top half ++ vldm r0, {q12-q15} // coeffs ++ transpose_16b_4x4 d24, d26, d28, d30 ++ transpose_16b_4x4 d25, d27, d29, d31 ++ tr8_begin d26, d30, d27, d31 ++ tr4 d24, d28, d25, d29 ++ tr8_end #(TRN_SHIFT) ++ transpose_16b_4x4 d2, d3, d4, d5 ++ transpose_16b_4x4 d6, d7, d8, d9 ++ vswp d7, d5 ++ vswp d7, d8 ++ vswp d3, d6 ++ vswp d6, d4 ++ vstm r0!, {q1-q4} ++ ++ // 
bottom half ++ vldm r0, {q12-q15} // coeffs ++ transpose_16b_4x4 d24, d26, d28, d30 ++ transpose_16b_4x4 d25, d27, d29, d31 ++ tr8_begin d26, d30, d27, d31 ++ tr4 d24, d28, d25, d29 ++ tr8_end #(TRN_SHIFT) ++ transpose_16b_4x4 d2, d3, d4, d5 ++ transpose_16b_4x4 d6, d7, d8, d9 ++ vswp d7, d5 ++ vswp d7, d8 ++ vswp d3, d6 ++ vswp d6, d4 ++ //vstm r0, {q1-q4} ++ vst1.16 {q1-q2}, [r0] ++ add r0, #32 ++ vst1.16 {q3-q4}, [r0] ++ sub r0, #32 ++ vpop {d8-d15} ++ pop {r4-r8} ++ bx lr ++endfunc ++ ++#undef DC_SHIFT ++#undef DC_ADD ++#undef TRN_SHIFT ++ diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S new file mode 100644 -index 0000000..373576b +index 0000000000..373576b4cb --- /dev/null +++ b/libavcodec/arm/hevc_misc_neon.S @@ -0,0 +1,62 @@ @@ -1118,10 +1444,10 @@ index 0000000..373576b + diff --git a/libavcodec/arm/hevcdsp_cres_neon.S b/libavcodec/arm/hevcdsp_cres_neon.S new file mode 100644 -index 0000000..880b26e +index 0000000000..bafefd4318 --- /dev/null +++ b/libavcodec/arm/hevcdsp_cres_neon.S -@@ -0,0 +1,275 @@ +@@ -0,0 +1,296 @@ +#include "libavutil/arm/asm.S" +#include "neon.S" + @@ -1138,7 +1464,8 @@ index 0000000..880b26e +@ add_residual4x4_c( +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] ++@ ptrdiff_t stride, [r2] ++@ int dc_v) [r3] + +function ff_hevc_add_residual_4x4_u_neon_8, export=1 + vld1.8 {d16}, [r0, :64], r2 @@ -1146,8 +1473,8 @@ index 0000000..880b26e + vld1.8 {d18}, [r0, :64], r2 + vld1.8 {d19}, [r0, :64], r2 + vld1.16 {q0, q1}, [r1] -+ vmov.i64 q2, #0 -+ vmov.i64 q3, #0 ++ vdup.16 q2, r3 ++ vdup.16 q3, r3 + vmovl.u8 q10, d16 + sub r0, r0, r2, lsl #2 + vmovl.u8 q11, d17 @@ -1174,9 +1501,11 @@ index 0000000..880b26e +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] ++@ int dc_v) [r3] + +function ff_hevc_add_residual_8x8_u_neon_8, export=1 + mov r12, #4 ++ vdup.16 q15, r3 +1: + vld2.8 {d16, d17}, [r0, :128], r2 + vld2.8 {d18, d19}, [r0, :128] @@ -1186,9 +1515,13 @@ index 0000000..880b26e + sub r0, r2 + vmovl.u8 q11, d18 + vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d17 + vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d19 + vqmovun.s16 d16, q0 ++ vqmovun.s16 d17, q2 + vqmovun.s16 d18, q1 ++ vqmovun.s16 d19, q3 + vst2.8 {d16, d17}, [r0, :128], r2 + vst2.8 {d18, d19}, [r0, :128], r2 + bne 1b @@ -1199,9 +1532,11 @@ index 0000000..880b26e +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] ++@ int dc_v) [r3] + +function ff_hevc_add_residual_16x16_u_neon_8, export=1 + mov r12, #16 ++ vdup.16 q15, r3 +1: + vld2.8 {q8, q9}, [r0, :256] + vld1.16 {q0, q1}, [r1, :256]! 
@@ -1210,8 +1545,12 @@ index 0000000..880b26e + vmovl.u8 q11, d17 + vqadd.s16 q0, q10 + vqadd.s16 q1, q11 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 + vqmovun.s16 d16, q0 + vqmovun.s16 d17, q1 ++ vqmovun.s16 d18, q2 ++ vqmovun.s16 d19, q3 + vst2.8 {q8, q9}, [r0, :256], r2 + bne 1b + bx lr @@ -1231,8 +1570,8 @@ index 0000000..880b26e + vld1.8 {d18}, [r0, :64], r2 + vld1.8 {d19}, [r0, :64], r2 + vld1.16 {q2, q3}, [r1] -+ vmov.i64 q0, #0 -+ vmov.i64 q1, #0 ++ vdup.16 q0, r3 ++ vdup.16 q1, r3 + vmovl.u8 q10, d16 + sub r0, r0, r2, lsl #2 + vmovl.u8 q11, d17 @@ -1262,6 +1601,7 @@ index 0000000..880b26e + +function ff_hevc_add_residual_8x8_v_neon_8, export=1 + mov r12, #4 ++ vdup.16 q15, r3 +1: + vld2.8 {d16, d17}, [r0, :128], r2 + vld2.8 {d18, d19}, [r0, :128] @@ -1272,8 +1612,12 @@ index 0000000..880b26e + vmovl.u8 q11, d19 + vqadd.s16 q0, q10 + vqadd.s16 q1, q11 ++ vaddw.u8 q2, q15, d16 ++ vaddw.u8 q3, q15, d18 + vqmovun.s16 d17, q0 ++ vqmovun.s16 d16, q2 + vqmovun.s16 d19, q1 ++ vqmovun.s16 d18, q3 + vst2.8 {d16, d17}, [r0, :128], r2 + vst2.8 {d18, d19}, [r0, :128], r2 + bne 1b @@ -1287,14 +1631,19 @@ index 0000000..880b26e + +function ff_hevc_add_residual_16x16_v_neon_8, export=1 + mov r12, #16 ++ vdup.16 q15, r3 +1: + vld2.8 {q8, q9}, [r0, :256] + vld1.16 {q0, q1}, [r1, :256]! + subs r12, #1 + vmovl.u8 q10, d18 + vmovl.u8 q11, d19 ++ vaddw.u8 q2, q15, d16 ++ vaddw.u8 q3, q15, d17 + vqadd.s16 q0, q10 + vqadd.s16 q1, q11 ++ vqmovun.s16 d16, q2 ++ vqmovun.s16 d17, q3 + vqmovun.s16 d18, q0 + vqmovun.s16 d19, q1 + vst2.8 {q8, q9}, [r0, :256], r2 @@ -1395,10 +1744,8 @@ index 0000000..880b26e +@ 32x32 chroma never occurs so NIF + +@ ============================================================================ -+ -+ diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S -index 166bddb..9bd0a42 100644 +index 166bddb104..15c4329cdb 100644 --- a/libavcodec/arm/hevcdsp_deblock_neon.S +++ b/libavcodec/arm/hevcdsp_deblock_neon.S @@ -15,7 +15,7 @@ @@ -1410,66 +1757,204 @@ index 166bddb..9bd0a42 100644 */ -@@ -31,6 +31,9 @@ +@@ -24,70 +24,238 @@ + + .macro hevc_loop_filter_chroma_start + ldr r12, [r2] +- ldr r3, [r2, #4] +- add r2, r3, r12 +- cmp r2, #0 ++ ldr r2, [r2, #4] ++ orrs r2, r12, r2, lsl #16 + it eq bxeq lr .endm +-.macro hevc_loop_filter_chroma_body +- vsubl.u8 q3, d4, d2 +- vsubl.u8 q11, d18, d19 +- vshl.i16 q3, #2 +- vadd.i16 q11, q3 +- vdup.16 d0, r12 +- vdup.16 d1, r3 +- vrshr.s16 q11, q11, #3 +- vneg.s16 q12, q0 +@ Uses: d2, d4, d18, d19 +@ Returns: d2, d4 -+@ Modifies: d0-d7, d22-d25 - .macro hevc_loop_filter_chroma_body - vsubl.u8 q3, d4, d2 - vsubl.u8 q11, d18, d19 -@@ -49,6 +52,33 @@ - vqmovun.s16 d4, q2 - .endm - ++@ Modifies: d0-d7, d22-d25, r12 + -+@ Uses r2[0:7], r2[8:15] -+@ Modifies: d0-d7, d22-d25 -+.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1 -+ vsubl.u8 q3, \Q0, \P0 -+ vsubl.u8 q11, \P1, \Q1 -+ vshl.i16 q3, #2 -+ vadd.i16 q11, q3 ++.macro hevc_loop_filter_chroma_body P1, P0, Q0, Q1 ++ vsubl.u8 q0, \Q0, \P0 ++ vsubl.u8 q1, \P1, \Q1 ++ vdup.16 d4, r2 ++ lsr r2, r2, #16 ++ vshl.i16 q0, #2 ++ ldr r12, [sp, #0] @ r12 = &no_q ++ vadd.i16 q0, q1 ++ ldrh r3, [r3] @ r3[0:8] = no_p[0], r3[8:15] = no_p[1] ++ vdup.16 d5, r2 + -+ @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all) -+ vdup.16 d0, r2 -+ vmovl.u8 q0, d0 -+ vuzp.16 d0, d1 -+ -+ vrshr.s16 q11, q11, #3 -+ vneg.s16 q12, q0 ++ vrshr.s16 q0, q0, #3 ++ ldrh r12, [r12] ++ vneg.s16 q3, q2 ++ vmin.s16 q0, q0, q2 + vmovl.u8 q2, \Q0 -+ vmin.s16 q11, q11, q0 -+ vmax.s16 q11, q11, q12 -+ vaddw.u8 q1, 
q11, \P0 -+ vsub.i16 q2, q11 ++ vmax.s16 q0, q0, q3 ++ vaddw.u8 q1, q0, \P0 ++ vsub.i16 q2, q0 ++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] + vqmovun.s16 \P0, q1 + vqmovun.s16 \Q0, q2 +.endm + ++@ Uses r2 (tc a;b) ++@ Modifies: q0-q3 ++@ On exit ++@ r12 (and flags) contain no_p;no_q ++.macro hevc_loop_filter_chroma_body_16 P1, P0, Q0, Q1, bit_depth ++ vsub.i16 q0, \Q0, \P0 ++ lsl r12, r2, #(\bit_depth - 8) ++ vsub.i16 q1, \P1, \Q1 ++ vshl.i16 q0, #2 ++ vdup.16 d4, r12 ++ lsr r12, r12, #16 ++ vadd.i16 q0, q1 ++ ldrh r3, [r3] ++ vdup.16 d5, r12 ++ ++ vrshr.s16 q0, q0, #3 ++ vneg.s16 q3, q2 ++ movw r12, #(1 << \bit_depth) - 1 ++ vmin.s16 q0, q0, q2 ++ vmax.s16 q0, q0, q3 ++ vdup.i16 q3, r12 ++ ldr r12, [sp, #0] ++ ++ vadd.i16 \P0, q0, \P0 ++ vsub.i16 \Q0, q0 ++ ++ vmov.i64 q2, #0 ++ ldrh r12, [r12] ++ vmin.s16 \P0, q3 ++ vmin.s16 \Q0, q3 ++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] ++ vmax.s16 \P0, q2 ++ vmax.s16 \Q0, q2 ++.endm ++ ++ ++@ Preserves r12 ++@ Clobbers r2 ++.macro hevc_loop_filter_uv_body2 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v ++ vsubl.u8 q0, \Q0u, \P0u ++ vsubl.u8 q1, \Q0v, \P0v ++ vsubl.u8 q2, \P1u, \Q1u ++ vsubl.u8 q3, \P1v, \Q1v ++ vshl.i16 q0, #2 ++ vshl.i16 q1, #2 ++ vadd.i16 q0, q2 ++ vdup.16 d4, r2 ++ lsr r2, #16 ++ vadd.i16 q1, q3 ++ ++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all) ++ vrshr.s16 q0, #3 ++ vdup.16 d6, r2 + vmovl.u8 q2, d4 +- vmin.s16 q11, q11, q0 +- vmax.s16 q11, q11, q12 +- vaddw.u8 q1, q11, d2 +- vsub.i16 q2, q11 +- vqmovun.s16 d2, q1 +- vqmovun.s16 d4, q2 ++ vmovl.u8 q3, d6 ++ vuzp.16 d4, d5 ++ vrshr.s16 q1, #3 ++ vuzp.16 d6, d7 ++ ++ vmin.s16 q0, q2 ++ vneg.s16 q2, q2 ++ vmin.s16 q1, q3 ++ vneg.s16 q3, q3 ++ vmax.s16 q0, q2 ++ vaddw.u8 q2, q0, \P0u ++ vmax.s16 q1, q3 ++ vaddw.u8 q3, q1, \P0v ++ ++ vqmovun.s16 \P0u, q2 ++ vmovl.u8 q2, \Q0u ++ vqmovun.s16 \P0v, q3 ++ vmovl.u8 q3, \Q0v ++ vsub.i16 q2, q0 ++ vsub.i16 q3, q1 ++ ++ vqmovun.s16 \Q0u, q2 ++ vqmovun.s16 \Q0v, q3 + .endm + ++@ Preserves r12 ++@ Clobbers r2 ++.macro hevc_loop_filter_uv_body2_16 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v, bit_depth ++ vsub.i16 q0, \Q0u, \P0u ++ vsub.i16 q1, \Q0v, \P0v ++ vsub.i16 q2, \P1u, \Q1u ++ vsub.i16 q3, \P1v, \Q1v ++ vshl.i16 q0, #2 ++ vshl.i16 q1, #2 ++ vadd.i16 q0, q2 ++ vdup.16 d4, r2 ++ lsr r2, #16 ++ vadd.i16 q1, q3 ++ ++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all) ++ vrshr.s16 q0, #3 ++ vdup.16 d6, r2 ++ vshll.u8 q2, d4, #\bit_depth - 8 ++ vshll.u8 q3, d6, #\bit_depth - 8 ++ vuzp.16 d4, d5 ++ vrshr.s16 q1, #3 ++ vuzp.16 d6, d7 ++ ++ movw r2, #(1 << \bit_depth) - 1 ++ vmin.s16 q0, q2 ++ vneg.s16 q2, q2 ++ vmin.s16 q1, q3 ++ vneg.s16 q3, q3 ++ vmax.s16 q0, q2 ++ vmov.i64 q2, #0 ++ vmax.s16 q1, q3 ++ vdup.i16 q3, r2 ++ vadd.i16 \P0u, q0 ++ vsub.i16 \Q0u, q0 ++ vadd.i16 \P0v, q1 ++ vsub.i16 \Q0v, q1 ++ ++ vmax.s16 \P0u, q2 ++ vmax.s16 \Q0u, q2 ++ vmax.s16 \P0v, q2 ++ vmax.s16 \Q0v, q2 ++ vmin.s16 \P0u, q3 ++ vmin.s16 \Q0u, q3 ++ vmin.s16 \P0v, q3 ++ vmin.s16 \Q0v, q3 ++.endm ++ + + .macro hevc_loop_filter_luma_start ldr r12, [r3] ldr r3, [r3, #4] -@@ -60,15 +90,17 @@ - lsr r3, #16 +- lsl r3, #16 +- orr r3, r12 +- cmp r3, #0 ++ orrs r3, r12, r3, lsl #16 + it eq + bxeq lr +- lsr r3, #16 .endm -.macro hevc_loop_filter_luma_body -+@ Uses: r2, r3, r12 -+@ Modifies: r5, r6, r7, r8, r9 -+function hevc_loop_filter_luma_body -+ vmovl.u8 q15, d23 -+ vmovl.u8 q14, d22 -+ vmovl.u8 q13, d21 -+ vmovl.u8 q12, d20 -+ vmovl.u8 q11, d19 -+ vmovl.u8 q10, d18 -+ 
vmovl.u8 q9, d17 - vmovl.u8 q8, d16 +- vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 @@ -1477,46 +1962,103 @@ index 166bddb..9bd0a42 100644 - vmovl.u8 q13, d26 - vmovl.u8 q14, d28 - vmovl.u8 q15, d30 ++@ Uses: r2, r3, r12 ++@ Modifies: r5, r6, r7, r8, r9 ++ ++@ Input: ++@ r2 beta (raw: needs shift for bitdepth > 8) ++@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8) ++@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8) ++@ [sp,#96] &no_p[0] ++@ [sp,#100] &no_q[0] ++@ ++@ Input & output ++@ 8-bit: d16-d23 ++@ 16-bit: q8-q15 ++@ ++@ Output ++@ Z r10==0 ++@ r10[ 0:7 ] no_p[0] ++@ r10[ 8:15] no_p[1] ++@ r10[16:23] no_q[0] ++@ r10[24:31] no_q[1] ++ ++.macro m_filter_luma bit_depth ++.if \bit_depth == 8 ++ vmovl.u8 q15, d23 ++ vmovl.u8 q14, d22 ++ vmovl.u8 q13, d21 ++ vmovl.u8 q12, d20 ++ vmovl.u8 q11, d19 ++ vmovl.u8 q10, d18 ++ vmovl.u8 q9, d17 ++ vmovl.u8 q8, d16 ++.endif vadd.i16 q7, q9, q11 ++.if \bit_depth > 8 ++ lsl r2, r2, #(\bit_depth - 8) ++.endif vadd.i16 q6, q14, q12 -@@ -77,7 +109,6 @@ ++.if \bit_depth > 8 ++ lsl r3, r3, #(\bit_depth - 8) ++.endif + vsub.i16 q7, q10 ++ ldr r5, [sp, #96] @ Bolt no_x values together into r10 + vsub.i16 q6, q13 vabd.s16 q7, q7, q10 vabd.s16 q6, q6, q13 - - ++ ldrh r10, [r5] + vdup.16 q0, r2 vmov q4, q7 vmov q5, q6 -@@ -152,7 +183,7 @@ +- vdup.16 d4, r12 ++ ldr r5, [sp, #100] ++ vdup.16 d4, r3 ++ lsr r3, r3, #16 + vtrn.16 q7, q4 ++ ldrh r5, [r5] + vtrn.16 q6, q5 + + vshl.u64 q7, #32 + vshr.u64 q4, #32 + vshl.u64 q6, #32 ++ orr r10, r10, r5, lsl #16 + vshr.u64 q5, #32 + vshr.u64 q7, #32 + vshr.u64 q6, #32 +@@ -152,7 +320,7 @@ and r9, r8, r7 cmp r9, #0 - beq weakfilter_\@ -+ beq weakfilter_ ++ beq 1f vadd.i16 q2, q11, q12 vadd.i16 q4, q9, q8 -@@ -210,11 +241,11 @@ +@@ -210,11 +378,11 @@ vbit q13, q3, q5 vbit q14, q2, q5 -weakfilter_\@: -+weakfilter_: ++1: mvn r8, r8 and r9, r8, r7 cmp r9, #0 - beq ready_\@ -+ beq ready_ ++ beq 2f vdup.16 q4, r2 -@@ -275,75 +306,345 @@ weakfilter_\@: +@@ -275,111 +443,1041 @@ weakfilter_\@: vbit q11, q0, q5 vbit q12, q4, q5 -ready_\@: -+ready_: ++2: ++.if \bit_depth == 8 vqmovun.s16 d16, q8 - vqmovun.s16 d18, q9 - vqmovun.s16 d20, q10 @@ -1525,7 +2067,7 @@ index 166bddb..9bd0a42 100644 - vqmovun.s16 d26, q13 - vqmovun.s16 d28, q14 - vqmovun.s16 d30, q15 --.endm ++ cmp r10, #0 + vqmovun.s16 d17, q9 + vqmovun.s16 d18, q10 + vqmovun.s16 d19, q11 @@ -1533,7 +2075,30 @@ index 166bddb..9bd0a42 100644 + vqmovun.s16 d21, q13 + vqmovun.s16 d22, q14 + vqmovun.s16 d23, q15 ++.else ++ movw r12, #(1 << \bit_depth - 1) ++ vmov.i64 q0, #0 ++ vdup.i16 q1, r12 ++ @ q8 & q15 should be unaltered and so don't require clipping ++ vmax.s16 q9, q0 ++ cmp r10, #0 ++ vmax.s16 q10, q0 ++ vmax.s16 q11, q0 ++ vmax.s16 q12, q0 ++ vmax.s16 q13, q0 ++ vmax.s16 q14, q0 ++ vmin.s16 q9, q1 ++ vmin.s16 q10, q1 ++ vmin.s16 q11, q1 ++ vmin.s16 q12, q1 ++ vmin.s16 q13, q1 ++ vmin.s16 q14, q1 ++.endif + mov pc, lr + .endm + ++function hevc_loop_filter_luma_body ++ m_filter_luma 8 +endfunc + +@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), np_p (sp[0]), no_q (sp[4]), src2 (sp[8])) @@ -1545,7 +2110,16 @@ index 166bddb..9bd0a42 100644 + b v_loop_luma_common +endfunc + - ++ ++@ void ff_hevc_v_loop_filter_luma_neon( ++@ uint8_t *_pix, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int _beta, [r2] ++@ int *_tc, [r3] ++@ uint8_t *_no_p, [sp+0] ++@ uint8_t *_no_q) [sp+4] ++ ++ function ff_hevc_v_loop_filter_luma_neon, export=1 hevc_loop_filter_luma_start - push {r5-r11} @@ -1553,14 +2127,6 @@ index 
166bddb..9bd0a42 100644 + + sub r4, r0, #4 +v_loop_luma_common: -+ @ Why this isn't a bitmask to start with I have no idea... -+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 -+ ldr r5, [sp, #32] -+ ldrh r10, [r5] -+ ldr r5, [sp, #36] -+ ldrh r5, [r5] -+ orr r10, r10, r5, lsl #16 @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1] -+ vpush {d8-d15} - sub r0, #4 - vld1.8 {d16}, [r0], r1 @@ -1617,44 +2183,38 @@ index 166bddb..9bd0a42 100644 + + @ no_p[1] + tst r10, #0xff00 -+ itt ne -+ addne r4, r4, r1, lsl #2 ++ add r2, r4, r1, lsl #2 + bne 1f + vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 + vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 + vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 -+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 -+ ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32] ++1: ++ @ no_p[0] ++ tst r10, #0xff ++ bne 1f ++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r2:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r2:32], r1 ++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r2:32], r1 ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r2:32] +1: + @ no_q[1] + tst r10, #0xff000000 -+ itt ne -+ addne r0, r0, r1, lsl #2 -+ bne 2f ++ add r2, r0, r1, lsl #2 ++ bne 1f + vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 + vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 + vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 -+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 -+ -+2: -+ @ no_p[0] -+ tst r10, #0xff -+ bne 3f -+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 -+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 -+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 -+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32] -+ -+3: ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32] ++1: + @ no_q[0] + tst r10, #0xff0000 -+ bne 4f -+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 -+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 -+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 -+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32] -+ -+4: ++ bne 1f ++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r2:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 ++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r2:32], r1 ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] ++1: +bypasswrite: vpop {d8-d15} - pop {r5-r11} @@ -1662,6 +2222,81 @@ index 166bddb..9bd0a42 100644 + pop {r4-r10,pc} endfunc ++.macro m_filter_v_luma_common_16 bit_depth ++ vpush {d8-d15} ++ ++ @ Uses slightly fewer instructions to do laned loads than unlaned ++ @ and transpose. 
This also means that we can use the same code for ++ @ both split & unsplit deblock ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1 ++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 ++ ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 ++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 ++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 ++ ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 ++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 ++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4] ++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ neg r1, r1 ++ ++ @ p[1] ++ tst r10, #0xff00 ++ add r2, r4, r1, lsl #2 ++ bne 1f ++ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 ++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4] ++1: ++ @ p[0] ++ tst r10, #0xff ++ bne 1f ++ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r2], r1 ++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r2], r1 ++ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r2], r1 ++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r2] ++1: ++ @ q[1] ++ tst r10, #0xff000000 ++ add r2, r0, r1, lsl #2 ++ bne 1f ++ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 ++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0] ++1: ++ @ q[0] ++ tst r10, #0xff0000 ++ bne 1f ++ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r2], r1 ++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 ++ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r2], r1 ++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] ++1: ++ vpop {d8-d15} ++ pop {r4-r10,pc} ++.endm ++ ++ ++ ++ +@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] +@ ptrdiff_t stride, [r1] +@ int beta, [r2] @@ -1711,13 +2346,6 @@ index 166bddb..9bd0a42 100644 + neg r1, r1 + add r0, r0, r1 + -+ @ Why this isn't a bitmask to start with I have no idea... 
-+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 -+ ldr r5, [sp, #32] -+ ldrh r10, [r5] -+ ldr r5, [sp, #36] -+ ldrh r5, [r5] -+ orrs r10, r10, r5, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] + bne 1f + + vst1.8 {d22}, [r0], r1 @@ -1768,8 +2396,81 @@ index 166bddb..9bd0a42 100644 + + pop {r4-r10,pc} + - endfunc - ++endfunc ++ ++ ++.macro m_filter_h_luma_16 bit_depth ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} ++ ++ vpush {d8-d15} ++ sub r0, r0, r1, lsl #2 ++ ++ vld1.16 { q8}, [r0], r1 ++ vld1.16 { q9}, [r0], r1 ++ vld1.16 {q10}, [r0], r1 ++ vld1.16 {q11}, [r0], r1 ++ vld1.16 {q12}, [r0], r1 ++ vld1.16 {q13}, [r0], r1 ++ vld1.16 {q14}, [r0], r1 ++ vld1.16 {q15}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ vpop {d8-d15} ++ ++ sub r0, r1 ++ neg r1, r1 ++ bne 1f ++ ++ vst1.16 {q14}, [r0], r1 ++ vst1.16 {q13}, [r0], r1 ++ vst1.16 {q12}, [r0], r1 ++ vst1.16 {q11}, [r0], r1 ++ vst1.16 {q10}, [r0], r1 ++ vst1.16 { q9}, [r0] ++ pop {r4-r10,pc} ++ ++@ Partial write ++1: ++ tst r10, #0xff0000 ++ mov r2, r0 ++ bne 1f ++ vst1.16 {d28}, [r2], r1 ++ vst1.16 {d26}, [r2], r1 ++ vst1.16 {d24}, [r2] ++ ++1: ++ tst r10, #0xff000000 ++ add r2, r0, #8 ++ bne 1f ++ vst1.16 {d29}, [r2], r1 ++ vst1.16 {d27}, [r2], r1 ++ vst1.16 {d25}, [r2] ++ ++1: ++ tst r10, #0xff ++ @ r0 = r0 + r1 * 3 ++ add r0, r0, r1 ++ add r0, r0, r1, lsl # 1 ++ add r2, r0, #8 ++ bne 1f ++ vst1.16 {d22}, [r0], r1 ++ vst1.16 {d20}, [r0], r1 ++ vst1.16 {d18}, [r0] ++ ++1: ++ tst r10, #0xff00 ++ bne 1f ++ vst1.16 {d23}, [r2], r1 ++ vst1.16 {d21}, [r2], r1 ++ vst1.16 {d19}, [r2] ++ ++1: ++ pop {r4-r10,pc} ++.endm ++ ++ +@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 @@ -1783,9 +2484,7 @@ index 166bddb..9bd0a42 100644 + vld2.8 {d26,d27}, [r0], r1 + vld2.8 {d28,d29}, [r0] + sub r0, r0, r1, lsl #1 -+ hevc_loop_filter_uv_body d16, d18, d26, d28 -+ lsr r2, r2, #16 -+ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29 + cmp r3, #0 + bne 1f + vst2.8 {d18,d19}, [r0], r1 @@ -1795,122 +2494,509 @@ index 166bddb..9bd0a42 100644 + @ At least one no_f bit is set + @ Which means we need to break this apart in an ugly fashion +1: vzip.8 d18, d19 ++ lsls r2, r3, #31 @ b0 -> N, b1 -> C + vzip.8 d26, d27 + sub r1, r1, #8 + -+ tst r3, #1 -+ bne 1f ++ bmi 1f + vst1.8 {d18}, [r0] +1: add r0, r0, #8 -+ tst r3, #2 -+ bne 2f ++ bcs 2f + vst1.8 {d19}, [r0] -+2: add r0, r0, r1 ++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C ++ add r0, r0, r1 + -+ tst r3, #4 -+ bne 1f ++ bmi 1f + vst1.8 {d26}, [r0] -+1: add r0, r0, #8 -+ tst r3, #8 -+ it ne -+ bxne lr ++1: it cs ++ bxcs lr ++ add r0, r0, #8 + vst1.8 {d27}, [r0] + bx lr + +endfunc + + ++@ void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++@ ++@ Macro here actual function near bottom ++ ++.macro m_filter_h_uv_16 bit_depth ++ sub r0, r0, r1, lsl #1 ++ vld2.16 {q8, q9 }, [r0], r1 ++ vld2.16 {q10, q11}, [r0], r1 ++ vld2.16 {q12, q13}, [r0], r1 ++ vld2.16 {q14, q15}, [r0] ++ sub r0, r0, r1, lsl #1 ++ ++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth ++ ++ cmp r3, #0 ++ bne 1f ++ vst2.16 {q10, q11}, [r0], r1 ++ vst2.16 {q12, q13}, [r0] ++ bx lr ++ ++ @ At least one no_f bit is set ++ @ Which means we need to break this apart in an ugly fashion ++1: vzip.16 
q10, q11 ++ lsls r2, r3, #31 @ b0 -> N, b1 -> C ++ vzip.16 q12, q13 ++ sub r1, r1, #16 ++ ++ bmi 1f ++ vst1.16 {q10}, [r0] ++1: add r0, r0, #16 ++ bcs 2f ++ vst1.16 {q11}, [r0] ++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C ++ add r0, r0, r1 ++ ++ bmi 1f ++ vst1.16 {q12}, [r0] ++1: it cs ++ bxcs lr ++ add r0, r0, #16 ++ vst1.16 {q13}, [r0] ++ bx lr ++.endm ++ ++ +@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 +@ uint8_t * src_l, // r3 +@ unsigned int no_f); // sp[0] +@ -+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++ +function ff_hevc_v_loop_filter_uv2_neon_8, export=1 + vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1 -+ vld4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0], r1 ++ vld4.8 {d20[0], d21[0], d22[0], d23[0]}, [r0], r1 ++ sub r12, r0, r3 + + vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 -+ vld4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 ++ vld4.8 {d20[1], d21[1], d22[1], d23[1]}, [r0], r1 ++ cmp r12, #4 + + vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 -+ vld4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 ++ vld4.8 {d20[2], d21[2], d22[2], d23[2]}, [r0], r1 + + vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 -+ vld4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 ++ vld4.8 {d20[3], d21[3], d22[3], d23[3]}, [r0], r1 + + vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 -+ vld4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ vld4.8 {d20[4], d21[4], d22[4], d23[4]}, [r0], r1 + + vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 -+ vld4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 ++ vld4.8 {d20[5], d21[5], d22[5], d23[5]}, [r0], r1 + + vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 -+ vld4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 ++ vld4.8 {d20[6], d21[6], d22[6], d23[6]}, [r0], r1 + + vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3] -+ vld4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0] -+ -+ hevc_loop_filter_uv_body d16, d18, d26, d28 -+ lsr r2, r2, #16 -+ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ vld4.8 {d20[7], d21[7], d22[7], d23[7]}, [r0] ++ it eq ++ ldreq r12, [sp, #0] + ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 ++ cmp r12, #0 ++ add r3, #2 + neg r1, r1 -+ -+ ldr r2, [sp, #0] -+ -+ @ p[1] -+ tst r2, #2 -+ itt ne -+ addne r3, r3, r1, lsl #2 + bne 1f -+ vst4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3], r1 -+ vst4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 -+ vst4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 -+ vst4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 + ++@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ so it is worth having this special case ++ vst4.8 {d18[7], d19[7], d20[7], d21[7]}, [r3], r1 ++ vst4.8 {d18[6], d19[6], d20[6], d21[6]}, [r3], r1 ++ vst4.8 {d18[5], d19[5], d20[5], d21[5]}, [r3], r1 ++ vst4.8 {d18[4], d19[4], d20[4], d21[4]}, [r3], r1 ++ vst4.8 {d18[3], d19[3], d20[3], d21[3]}, [r3], r1 ++ vst4.8 {d18[2], d19[2], d20[2], d21[2]}, [r3], r1 ++ vst4.8 {d18[1], d19[1], d20[1], d21[1]}, [r3], r1 ++ vst4.8 {d18[0], d19[0], d20[0], d21[0]}, [r3] ++ bx lr ++ ++@ Either split or partial +1: -+ @ q[1] -+ tst r2, #8 -+ itt ne -+ addne r0, r0, r1, lsl #2 -+ bne 2f -+ vst4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0], r1 -+ vst4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 -+ vst4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 -+ vst4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ ldr r12, [sp, #0] ++ lsls r12, #29 @ b2 -> N, b3 -> C ++ add r2, r0, r1, lsl #2 ++ bcs 1f ++ vst2.8 {d20[7], d21[7]}, [r0], r1 
++ vst2.8 {d20[6], d21[6]}, [r0], r1 ++ vst2.8 {d20[5], d21[5]}, [r0], r1 ++ vst2.8 {d20[4], d21[4]}, [r0] ++1: ++ bmi 2f ++ vst2.8 {d20[3], d21[3]}, [r2], r1 ++ vst2.8 {d20[2], d21[2]}, [r2], r1 ++ vst2.8 {d20[1], d21[1]}, [r2], r1 ++ vst2.8 {d20[0], d21[0]}, [r2] + +2: -+ @ p[0] -+ tst r2, #1 -+ bne 3f -+ vst4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 -+ vst4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 -+ vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 -+ vst4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3] -+ ++ lsls r12, #2 ++ add r2, r3, r1, lsl #2 ++ bcs 3f ++ vst2.8 {d18[7], d19[7]}, [r3], r1 ++ vst2.8 {d18[6], d19[6]}, [r3], r1 ++ vst2.8 {d18[5], d19[5]}, [r3], r1 ++ vst2.8 {d18[4], d19[4]}, [r3] +3: -+ @ q[0] -+ tst r2, #4 -+ it ne -+ bxne lr -+ vst4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 -+ vst4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 -+ vst4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 -+ vst4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0] -+ ++ it mi ++ bxmi lr ++ vst2.8 {d18[3], d19[3]}, [r2], r1 ++ vst2.8 {d18[2], d19[2]}, [r2], r1 ++ vst2.8 {d18[1], d19[1]}, [r2], r1 ++ vst2.8 {d18[0], d19[0]}, [r2] + bx lr -+endfunc + endfunc + ++ ++@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++.macro m_filter_v_uv2_16 bit_depth ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r3], r1 ++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 ++ sub r12, r0, r3 ++ ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r3], r1 ++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ cmp r12, #8 ++ ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r3], r1 ++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 ++ ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r3], r1 ++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r3], r1 ++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r3], r1 ++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r3], r1 ++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r3] ++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] ++ it eq ++ ldreq r12, [sp, #0] ++ ++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth ++ cmp r12, #0 ++ add r3, #4 ++ neg r1, r1 ++ bne 1f ++ ++@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ so it is worth having this special case ++ vst4.16 {d21[3], d23[3],d25[3], d27[3]}, [r3], r1 ++ vst4.16 {d21[2], d23[2],d25[2], d27[2]}, [r3], r1 ++ vst4.16 {d21[1], d23[1],d25[1], d27[1]}, [r3], r1 ++ vst4.16 {d21[0], d23[0],d25[0], d27[0]}, [r3], r1 ++ vst4.16 {d20[3], d22[3],d24[3], d26[3]}, [r3], r1 ++ vst4.16 {d20[2], d22[2],d24[2], d26[2]}, [r3], r1 ++ vst4.16 {d20[1], d22[1],d24[1], d26[1]}, [r3], r1 ++ vst4.16 {d20[0], d22[0],d24[0], d26[0]}, [r3], r1 ++ bx lr ++ ++@ Either split or partial ++1: ++ ldr r12, [sp, #0] ++ lsls r12, #29 @ b2 -> N, b3 -> C ++ add r2, r0, r1, lsl #2 ++ bcs 1f ++ vst2.16 {d25[3], d27[3]}, [r0], r1 ++ vst2.16 {d25[2], d27[2]}, [r0], r1 ++ vst2.16 {d25[1], d27[1]}, [r0], r1 ++ vst2.16 {d25[0], d27[0]}, [r0] ++1: ++ bmi 2f ++ vst2.16 {d24[3], d26[3]}, [r2], r1 ++ vst2.16 {d24[2], d26[2]}, [r2], r1 ++ vst2.16 {d24[1], d26[1]}, [r2], r1 ++ vst2.16 {d24[0], d26[0]}, [r2] ++ ++2: ++ lsls r12, #2 ++ add r2, r3, r1, lsl #2 ++ 
bcs 3f ++ vst2.16 {d21[3], d23[3]}, [r3], r1 ++ vst2.16 {d21[2], d23[2]}, [r3], r1 ++ vst2.16 {d21[1], d23[1]}, [r3], r1 ++ vst2.16 {d21[0], d23[0]}, [r3] ++3: ++ it mi ++ bxmi lr ++ vst2.16 {d20[3], d22[3]}, [r2], r1 ++ vst2.16 {d20[2], d22[2]}, [r2], r1 ++ vst2.16 {d20[1], d22[1]}, [r2], r1 ++ vst2.16 {d20[0], d22[0]}, [r2] ++ bx lr ++.endm ++ + + function ff_hevc_v_loop_filter_chroma_neon, export=1 hevc_loop_filter_chroma_start ++ ++ sub r0, #2 ++ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r0], r1 ++ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0], r1 ++ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r0], r1 ++ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r0], r1 ++ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r0], r1 ++ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r0], r1 ++ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r0], r1 ++ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r0], r1 ++ ++ sub r0, r0, r1, lsl #3 ++ add r0, r0, #1 ++ hevc_loop_filter_chroma_body d16, d17, d18, d19 ++ bne 1f ++ ++ vst2.8 {d17[0], d18[0]}, [r0], r1 ++ vst2.8 {d17[1], d18[1]}, [r0], r1 ++ vst2.8 {d17[2], d18[2]}, [r0], r1 ++ vst2.8 {d17[3], d18[3]}, [r0], r1 ++ vst2.8 {d17[4], d18[4]}, [r0], r1 ++ vst2.8 {d17[5], d18[5]}, [r0], r1 ++ vst2.8 {d17[6], d18[6]}, [r0], r1 ++ vst2.8 {d17[7], d18[7]}, [r0], r1 ++ bx lr ++ ++1: ++ tst r12, #0xff @ P0a ++ bne 2f ++ ++ vst1.8 {d17[0]}, [r0], r1 ++ vst1.8 {d17[1]}, [r0], r1 ++ vst1.8 {d17[2]}, [r0], r1 ++ vst1.8 {d17[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++2: ++ tst r12, #0xff0000 @ Q0a ++ add r0, #1 ++ bne 3f ++ vst1.8 {d18[0]}, [r0], r1 ++ vst1.8 {d18[1]}, [r0], r1 ++ vst1.8 {d18[2]}, [r0], r1 ++ vst1.8 {d18[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++3: ++ tst r12, #0xff000000 @ Q0b ++ add r0, r0, r1, lsl #2 ++ bne 4f ++ vst1.8 {d18[4]}, [r0], r1 ++ vst1.8 {d18[5]}, [r0], r1 ++ vst1.8 {d18[6]}, [r0], r1 ++ vst1.8 {d18[7]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++4: ++ tst r12, #0xff00 @ P0b ++ it ne ++ bxne lr ++ ++ sub r0, #1 ++ vst1.8 {d17[4]}, [r0], r1 ++ vst1.8 {d17[5]}, [r0], r1 ++ vst1.8 {d17[6]}, [r0], r1 ++ vst1.8 {d17[7]}, [r0], r1 ++ bx lr ++ ++endfunc ++ ++ ++.macro m_filter_v_chroma_16 bit_depth ++ hevc_loop_filter_chroma_start ++ sub r0, #4 -@@ -383,3 +684,128 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1 - vst1.8 {d4}, [r0] ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r0], r1 ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0], r1 ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r0], r1 ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r0], r1 ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r0], r1 ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r0], r1 ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r0], r1 ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0], r1 ++ ++ sub r0, r0, r1, lsl #3 ++ add r0, r0, #2 ++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth ++ bne 1f ++ ++ vst2.16 {d18[0], d20[0]}, [r0], r1 ++ vst2.16 {d18[1], d20[1]}, [r0], r1 ++ vst2.16 {d18[2], d20[2]}, [r0], r1 ++ vst2.16 {d18[3], d20[3]}, [r0], r1 ++ vst2.16 {d19[0], d21[0]}, [r0], r1 ++ vst2.16 {d19[1], d21[1]}, [r0], r1 ++ vst2.16 {d19[2], d21[2]}, [r0], r1 ++ vst2.16 {d19[3], d21[3]}, [r0], r1 ++ bx lr ++ ++1: ++ tst r12, #0xff @ P0a ++ bne 2f ++ ++ vst1.16 {d18[0]}, [r0], r1 ++ vst1.16 {d18[1]}, [r0], r1 ++ vst1.16 {d18[2]}, [r0], r1 ++ vst1.16 {d18[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++2: ++ tst r12, #0xff0000 @ Q0a ++ add r0, #1 ++ bne 3f ++ vst1.16 {d20[0]}, [r0], r1 ++ vst1.16 {d20[1]}, [r0], r1 ++ vst1.16 {d20[2]}, [r0], r1 ++ vst1.16 {d20[3]}, [r0], r1 ++ sub r0, r0, r1, 
lsl #2 ++ ++3: ++ tst r12, #0xff000000 @ Q0b ++ add r0, r0, r1, lsl #2 ++ bne 4f ++ vst1.16 {d21[0]}, [r0], r1 ++ vst1.16 {d21[1]}, [r0], r1 ++ vst1.16 {d21[2]}, [r0], r1 ++ vst1.16 {d21[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++4: ++ tst r12, #0xff00 @ P0b ++ it ne ++ bxne lr ++ ++ sub r0, #1 ++ vst1.16 {d19[0]}, [r0], r1 ++ vst1.16 {d19[1]}, [r0], r1 ++ vst1.16 {d19[2]}, [r0], r1 ++ vst1.16 {d19[3]}, [r0], r1 ++ bx lr ++.endm ++ ++ ++@ void ff_hevc_h_loop_filter_chroma_neon( ++@ uint8_t *_pix, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int *_tc, [r2] ++@ uint8_t *_no_p, [r3] ++@ uint8_t *_no_q); [sp+0] ++ ++function ff_hevc_h_loop_filter_chroma_neon, export=1 ++ hevc_loop_filter_chroma_start ++ sub r0, r0, r1, lsl #1 + vld1.8 {d16}, [r0], r1 + vld1.8 {d17}, [r0], r1 + vld1.8 {d18}, [r0], r1 +- vld1.8 {d2}, [r0], r1 +- vld1.8 {d4}, [r0], r1 +- vld1.8 {d19}, [r0], r1 +- vld1.8 {d20}, [r0], r1 +- vld1.8 {d21}, [r0], r1 +- sub r0, r0, r1, lsl #3 +- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 +- hevc_loop_filter_chroma_body +- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 +- vst1.8 {d16}, [r0], r1 ++ vld1.8 {d19}, [r0] ++ sub r0, r0, r1, lsl #1 ++ hevc_loop_filter_chroma_body d16, d17, d18, d19 ++ bne 1f @ Partial write + vst1.8 {d17}, [r0], r1 +- vst1.8 {d18}, [r0], r1 +- vst1.8 {d2}, [r0], r1 +- vst1.8 {d4}, [r0], r1 +- vst1.8 {d19}, [r0], r1 +- vst1.8 {d20}, [r0], r1 +- vst1.8 {d21}, [r0] ++ vst1.8 {d18}, [r0] ++ bx lr ++1: ++ tst r12, #0xff ++ vmov r2, r3, d17 ++ it eq ++ streq r2, [r0] ++ tst r12, #0xff00 ++ it eq ++ streq r3, [r0, #4] ++ ++ add r0, r1 ++ tst r12, #0xff0000 ++ vmov r2, r3, d18 ++ it eq ++ streq r2, [r0] ++ tst r12, #0xff000000 ++ it eq ++ streq r3, [r0, #4] ++ bx lr endfunc + +-function ff_hevc_h_loop_filter_chroma_neon, export=1 ++.macro m_filter_h_chroma_16 bit_depth + hevc_loop_filter_chroma_start + sub r0, r0, r1, lsl #1 +- vld1.8 {d18}, [r0], r1 +- vld1.8 {d2}, [r0], r1 +- vld1.8 {d4}, [r0], r1 +- vld1.8 {d19}, [r0] ++ vld1.16 {q8}, [r0], r1 ++ vld1.16 {q9}, [r0], r1 ++ vld1.16 {q10}, [r0], r1 ++ vld1.16 {q11}, [r0] + sub r0, r0, r1, lsl #1 +- hevc_loop_filter_chroma_body +- vst1.8 {d2}, [r0], r1 +- vst1.8 {d4}, [r0] ++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth ++ bne 1f @ Partial write ++ vst1.16 {q9}, [r0], r1 ++ vst1.16 {q10}, [r0] ++ bx lr ++1: ++ tst r12, #0xff ++ bne 2f ++ vst1.16 {d18}, [r0] ++2: ++ tst r12, #0xff00 ++ bne 3f ++ add r0, #8 ++ vst1.16 {d19}, [r0] ++ sub r0, #8 ++3: ++ tst r12, #0xff0000 ++ add r0, r1 ++ bne 4f ++ vst1.16 {d20}, [r0] ++4: ++ tst r12, #0xff000000 ++ it ne ++ bxne lr ++ add r0, #8 ++ vst1.16 {d21}, [r0] ++ + bx lr ++.endm ++ + +/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_i + * int *curr_rpl0, int *curr_ @@ -2036,9 +3122,54 @@ index 166bddb..9bd0a42 100644 + b 11b +endfunc + ++@ ============================================================================= ++@ ++@ 10 bit ++ ++function hevc_loop_filter_luma_body_10 ++ m_filter_luma 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_luma_neon_10, export=1 ++ m_filter_h_luma_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_luma2_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} @ 8 regs = 32 bytes ++ ++ ldr r4, [sp, #40] ++ b v_loop_luma_common_10 ++endfunc ++ ++function ff_hevc_v_loop_filter_luma_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} ++ ++ sub r4, r0, #8 ++v_loop_luma_common_10: ++ m_filter_v_luma_common_16 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_uv_neon_10, export=1 ++ 
m_filter_h_uv_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_uv2_neon_10, export=1 ++ m_filter_v_uv2_16 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_chroma_neon_10, export=1 ++ m_filter_h_chroma_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_chroma_neon_10, export=1 ++ m_filter_v_chroma_16 10 + endfunc ++ diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S new file mode 100644 -index 0000000..00eab9e +index 0000000000..00eab9eeee --- /dev/null +++ b/libavcodec/arm/hevcdsp_epel_neon.S @@ -0,0 +1,337 @@ @@ -2379,11 +3510,399 @@ index 0000000..00eab9e + .byte 4, 28, 46, 6 + .byte 2, 16, 54, 4 + .byte 2, 10, 58, 2 +diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S +index e39d00634b..ee2111f9b2 100644 +--- a/libavcodec/arm/hevcdsp_idct_neon.S ++++ b/libavcodec/arm/hevcdsp_idct_neon.S +@@ -21,82 +21,6 @@ + #include "libavutil/arm/asm.S" + #include "neon.S" + +-function ff_hevc_idct_4x4_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q0, r1 +- vdup.16 q1, r1 +- vst1.16 {q0, q1}, [r0] +- bx lr +-endfunc +- +-function ff_hevc_idct_8x8_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +- vstm r0, {q8-q15} +- bx lr +-endfunc +- +-function ff_hevc_idct_16x16_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +- vstm r0!, {q8-q15} +- vstm r0!, {q8-q15} +- vstm r0!, {q8-q15} +- vstm r0, {q8-q15} +- bx lr +-endfunc +- +-function ff_hevc_idct_32x32_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- mov r3, #16 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +-1: subs r3, #1 +- vstm r0!, {q8-q15} +- bne 1b +- bx lr +-endfunc +- + function ff_hevc_add_residual_4x4_neon_8, export=1 + vldm r1, {q0-q1} + vld1.32 d4[0], [r0], r2 +@@ -168,6 +92,131 @@ function ff_hevc_add_residual_32x32_neon_8, export=1 + bx lr + endfunc + ++ ++@ ff_hevc_add_residual_4x4_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_4x4_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ ++ vld1.32 d4[0], [r0], r1 ++ vld1.32 d4[1], [r0], r1 ++ vld1.32 d5[0], [r0], r1 ++ vld1.32 d5[1], [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ vaddw.u8 q0, q15, d4 ++ vaddw.u8 q1, q15, d5 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.32 d0[0], [r0], r1 ++ vst1.32 d0[1], [r0], r1 ++ vst1.32 d1[0], [r0], r1 ++ vst1.32 d1[1], [r0], r1 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_4x4_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_4x4_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #4 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_8x8_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_8x8_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #8 ++ ++1: subs r3, #1 ++ vld1.8 d16, [r0] ++ vaddw.u8 q0, q15, d16 ++ 
vqmovun.s16 d0, q0 ++ vst1.32 d0, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_8x8_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_8x8_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #8 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_16x16_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_16x16_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #16 ++ ++1: subs r3, #1 ++ vld1.8 {q8}, [r0] ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.8 {q0}, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_16x16_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_16x16_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #16 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_32x32_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_32x32_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #32 ++ ++1: subs r3, #1 ++ vld1.8 {q8, q9}, [r0] ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d2, q2 ++ vqmovun.s16 d3, q3 ++ vst1.8 {q0, q1}, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++ + .macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.64 \r0, \r4 + vtrn.64 \r1, \r5 +@@ -263,55 +312,6 @@ endfunc + vqrshrn.s32 \r3, q3, \shift + .endm + +-function ff_hevc_transform_4x4_neon_8, export=1 +- vpush {d8-d15} +- vld1.16 {q14, q15}, [r0] // coeffs +- ldr r3, =0x00240053 // 36 and 83 +- vmov.32 d0[0], r3 +- +- tr4_shift d28, d29, d30, d31, #7 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- tr4_shift d28, d29, d30, d31, #12 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- vst1.16 {q14, q15}, [r0] +- vpop {d8-d15} +- bx lr +-endfunc +- +-function ff_hevc_transform_luma_4x4_neon_8, export=1 +- vpush {d8-d15} +- vld1.16 {q14, q15}, [r0] // coeffs +- ldr r3, =0x4a // 74 +- vmov.32 d0[0], r3 +- ldr r3, =0x1d // 29 +- vmov.32 d0[1], r3 +- ldr r3, =0x37 // 55 +- vmov.32 d1[0], r3 +- +- tr4_luma_shift d28, d29, d30, d31, #7 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- tr4_luma_shift d28, d29, d30, d31, #12 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- vst1.16 {q14, q15}, [r0] +- vpop {d8-d15} +- bx lr +-endfunc +- + .macro tr8_begin in0, in1, in2, in3 + vmull.s16 q7, \in0, d1[1] // 89 * src1 + vmull.s16 q8, \in0, d1[0] // 75 * src1 +@@ -356,100 +356,6 @@ endfunc + vqrshrn.s32 d8, q5, \shift + .endm + +-function ff_hevc_transform_8x8_neon_8, export=1 +- push {r4-r8} +- vpush {d8-d15} +- mov r5, #16 +- +- adr r3, tr4f +- vld1.16 {d0, d1}, [r3] +- +- // left half +- vld1.16 {d24}, [r0], r5 +- vld1.16 {d25}, [r0], r5 +- vld1.16 {d26}, [r0], r5 +- vld1.16 {d27}, [r0], r5 +- vld1.16 {d28}, [r0], r5 +- vld1.16 {d29}, [r0], r5 +- vld1.16 {d30}, [r0], r5 +- vld1.16 {d31}, [r0], r5 +- sub r0, #128 +- tr8_begin d25, d27, d29, d31 +- tr4 d24, d26, d28, d30 +- tr8_end #7 +- vst1.16 {d2}, [r0], r5 +- vst1.16 {d3}, [r0], r5 +- vst1.16 {d4}, [r0], r5 +- vst1.16 {d5}, [r0], r5 +- vst1.16 {d6}, [r0], r5 +- vst1.16 {d7}, [r0], r5 +- vst1.16 {d8}, [r0], r5 +- vst1.16 {d9}, [r0], r5 +- sub r0, #128 +- //skip right half if col_limit in r1 is less than 4 +- 
cmp r1, #4 +- blt 1f +- //right half +- add r0, #8 +- vld1.16 {d24}, [r0], r5 +- vld1.16 {d25}, [r0], r5 +- vld1.16 {d26}, [r0], r5 +- vld1.16 {d27}, [r0], r5 +- vld1.16 {d28}, [r0], r5 +- vld1.16 {d29}, [r0], r5 +- vld1.16 {d30}, [r0], r5 +- vld1.16 {d31}, [r0], r5 +- sub r0, #128 +- tr8_begin d25, d27, d29, d31 +- tr4 d24, d26, d28, d30 +- tr8_end #7 +- vst1.16 {d2}, [r0], r5 +- vst1.16 {d3}, [r0], r5 +- vst1.16 {d4}, [r0], r5 +- vst1.16 {d5}, [r0], r5 +- vst1.16 {d6}, [r0], r5 +- vst1.16 {d7}, [r0], r5 +- vst1.16 {d8}, [r0], r5 +- vst1.16 {d9}, [r0], r5 +- sub r0, #136 +-1: +- // top half +- vldm r0, {q12-q15} // coeffs +- transpose_16b_4x4 d24, d26, d28, d30 +- transpose_16b_4x4 d25, d27, d29, d31 +- tr8_begin d26, d30, d27, d31 +- tr4 d24, d28, d25, d29 +- tr8_end #12 +- transpose_16b_4x4 d2, d3, d4, d5 +- transpose_16b_4x4 d6, d7, d8, d9 +- vswp d7, d5 +- vswp d7, d8 +- vswp d3, d6 +- vswp d6, d4 +- vstm r0!, {q1-q4} +- +- // bottom half +- vldm r0, {q12-q15} // coeffs +- transpose_16b_4x4 d24, d26, d28, d30 +- transpose_16b_4x4 d25, d27, d29, d31 +- tr8_begin d26, d30, d27, d31 +- tr4 d24, d28, d25, d29 +- tr8_end #12 +- transpose_16b_4x4 d2, d3, d4, d5 +- transpose_16b_4x4 d6, d7, d8, d9 +- vswp d7, d5 +- vswp d7, d8 +- vswp d3, d6 +- vswp d6, d4 +- //vstm r0, {q1-q4} +- vst1.16 {q1-q2}, [r0] +- add r0, #32 +- vst1.16 {q3-q4}, [r0] +- sub r0, #32 +- vpop {d8-d15} +- pop {r4-r8} +- bx lr +-endfunc + + .align 4 + tr4f: +@@ -463,3 +369,11 @@ tr16: + .word 0x00500046 // 80, d2[2] = 70 + .word 0x0039002b // 57, d2[0] = 43 + .word 0x00190009 // 25, d2[2] = 9 ++ ++#define BIT_DEPTH 8 ++#include "hevc_idct_fn_neon.S" ++ ++#undef BIT_DEPTH ++#define BIT_DEPTH 10 ++#include "hevc_idct_fn_neon.S" ++ diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c -index 1a3912c..c87e9d3 100644 +index 1a3912c609..3b7e5bd148 100644 --- a/libavcodec/arm/hevcdsp_init_neon.c +++ b/libavcodec/arm/hevcdsp_init_neon.c -@@ -22,11 +22,26 @@ +@@ -22,11 +22,41 @@ #include "libavutil/arm/cpu.h" #include "libavcodec/hevcdsp.h" #include "hevcdsp_arm.h" @@ -2395,6 +3914,11 @@ index 1a3912c..c87e9d3 100644 void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + ++void ff_hevc_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_v_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_h_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ +#ifdef RPI +void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, + unsigned int _stride, unsigned int beta, const int32_t tc[2], @@ -2405,65 +3929,196 @@ index 1a3912c..c87e9d3 100644 +void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, + uint8_t * src_l, + unsigned int no_f); ++ ++void ff_hevc_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void 
ff_hevc_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); +#endif + void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit); void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit); void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs); -@@ -43,6 +58,52 @@ void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, +@@ -34,6 +64,15 @@ void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs); + void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs); + void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs); + void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs); ++ ++void ff_hevc_transform_4x4_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_transform_8x8_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_idct_4x4_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_8x8_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_16x16_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_32x32_dc_neon_10(int16_t *coeffs); ++void ff_hevc_transform_luma_4x4_neon_10(int16_t *coeffs); ++ + void ff_hevc_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); + void ff_hevc_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, +@@ -43,6 +82,157 @@ void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ ++void ff_hevc_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++ ++void ff_hevc_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ +#if RPI_HEVC_SAND +void ff_hevc_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); ++ ptrdiff_t stride, int dc_v); +void ff_hevc_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); ++ ptrdiff_t stride, int dc_v); +void ff_hevc_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); ++ ptrdiff_t stride, int dc_v); +void ff_hevc_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); ++ ptrdiff_t stride, int dc_u); +void ff_hevc_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); ++ ptrdiff_t stride, int dc_u); +void ff_hevc_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); ++ ptrdiff_t stride, int dc_u); +void ff_hevc_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); +void ff_hevc_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); +void 
ff_hevc_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++ ++ ++void ff_hevc_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); +#endif + -+void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); ++void ff_hevc_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); + -+void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); ++void ff_hevc_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int 
eo, int width, int height); ++void ff_hevc_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); + -+void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); ++#if RPI_HEVC_SAND ++void ff_hevc_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); + -+void ff_hevc_sao_edge_c_w64_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, -+ const int16_t *sao_offset_table_u, const int16_t *sao_offset_table_v, int eo); ++void ff_hevc_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); + -+void ff_hevc_sao_band_c_neon_8(uint8_t *_dst, const uint8_t *_src, ++void ff_hevc_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); + ++void ff_hevc_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t 
*sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++#endif ++ ++void ff_hevc_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ ++void ff_hevc_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ + #define PUT_PIXELS(name) \ void name(int16_t *dst, uint8_t *src, \ ptrdiff_t srcstride, int height, \ -@@ -58,6 +119,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); +@@ -58,6 +248,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8); #undef PUT_PIXELS @@ -2479,227 +4134,110 @@ index 1a3912c..c87e9d3 100644 static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int width); -@@ -142,14 +212,239 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t +@@ -142,14 +341,124 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); } -+static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, ++ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, ++ MvField *curr, MvField *neigh, uint8_t *bs); ++ ++ ++static void ff_hevc_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) +{ -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int8_t offset_table[32] = { 0 }; -+ int k, 
y, x; -+ int shift = 3; // BIT_DEPTH - 5 -+ int cwidth = 0; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ for (k = 0; k < 4; k++) -+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; -+ -+ if (height % 8 == 0) -+ cwidth = width; -+ -+ switch(cwidth){ -+ case 8: -+ ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 16: -+ ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 32: -+ ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 64: -+ ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ default: -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } ++ ff_hevc_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); ++} ++static void ff_hevc_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); +} + -+static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ++static void ff_hevc_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++static void ff_hevc_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++ ++#if SAO_FILTER_N == 6 ++static void ff_hevc_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); ++} ++static void ff_hevc_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); ++} ++ ++static void ff_hevc_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++} 
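++
++/* Widths of 24 and 48 are handled by splitting the block across the
++ * existing 16+8 and 32+16 wide kernels rather than by dedicated ones.
++ * Note that the pointer offsets double in the _10 variants (e.g.
++ * _dst + 32 rather than _dst + 16 for a 16 pixel step) because 10-bit
++ * pixels occupy two bytes each.
++ */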
++static void ff_hevc_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++} ++ ++#if RPI_HEVC_SAND ++static void ff_hevc_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++static void ff_hevc_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++ ++static void ff_hevc_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height) +{ -+ // Width 32 already dealt with -+ // width 16 code works in double lines -+ if (width == 16 && (height & 1) == 0) { -+ ff_hevc_sao_band_c_neon_8(_dst, _src, stride_src, stride_dst, -+ sao_offset_val_u, sao_left_class_u, -+ sao_offset_val_v, sao_left_class_v, -+ width, height); -+ } -+ else -+ { -+ const int shift = 3; // BIT_DEPTH - 5 -+ int k, y, x; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int8_t offset_table_u[32] = { 0 }; -+ int8_t offset_table_v[32] = { 0 }; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ for (k = 0; k < 4; k++) -+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; -+ for (k = 0; k < 4; k++) -+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width * 2; x += 2) -+ { -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); -+ } -+ dst += stride_dst; -+ src += stride_src; -+ -+ } -+ } ++ ff_hevc_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); +} -+ -+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 
0 : -1)) -+static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, -+ int16_t *_sao_offset_val, int eo, int width, int height) ++static void ff_hevc_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) +{ -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ int8_t sao_offset_val[8]; // padding of 3 for vld -+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE); -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ int cwidth = 0; -+ -+ for (x = 0; x < 5; x++) { -+ sao_offset_val[x] = _sao_offset_val[edge_idx[x]]; -+ } -+ -+ if (height % 8 == 0) -+ cwidth = width; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ switch (cwidth) { -+ case 32: -+ switch(eo) { -+ case 0: -+ ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 1: -+ ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 2: -+ ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 3: -+ ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ } -+ break; -+ case 64: -+ switch(eo) { -+ case 0: -+ ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 1: -+ ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 2: -+ ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 3: -+ ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ } -+ break; -+ default: -+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ int diff0 = CMP(src[x], src[x + a_stride]); -+ int diff1 = CMP(src[x], src[x + b_stride]); -+ int idx = diff0 + diff1; -+ if (idx) -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+ } ++ ff_hevc_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); +} ++#endif ++#endif + + -+static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height) -+{ -+ const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); + -+ if (width == 32 && (height & 7) == 0) { -+ ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height, _sao_offset_val_u, _sao_offset_val_v, eo); -+ } -+ else -+ { -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 
4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ int8_t sao_offset_val_u[8]; // padding of 3 for vld -+ int8_t sao_offset_val_v[8]; // padding of 3 for vld -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ -+ for (x = 0; x < 5; x++) { -+ sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]]; -+ sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]]; -+ } -+ -+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width * 2; x += 2) { -+ int diff0u = CMP(src[x], src[x + a_stride]); -+ int diff1u = CMP(src[x], src[x + b_stride]); -+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); -+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]); -+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+ } -+} -+#undef CMP -+ -+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, -+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, -+ MvField *curr, MvField *neigh, uint8_t *bs); ++#if (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) != 160 ++#error SAO edge src stride not 160 - value used in .S ++#endif + av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) { @@ -2710,7 +4248,9 @@ index 1a3912c..c87e9d3 100644 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; + c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon; c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; ++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; ++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon; +#ifdef RPI + c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8; + c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8; @@ -2719,10 +4259,14 @@ index 1a3912c..c87e9d3 100644 c->idct[0] = ff_hevc_transform_4x4_neon_8; c->idct[1] = ff_hevc_transform_8x8_neon_8; c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8; -@@ -160,7 +455,25 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -160,7 +469,53 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->add_residual[1] = ff_hevc_add_residual_8x8_neon_8; c->add_residual[2] = ff_hevc_add_residual_16x16_neon_8; c->add_residual[3] = ff_hevc_add_residual_32x32_neon_8; ++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_8; ++ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_8; ++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_8; ++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_8; +#if RPI_HEVC_SAND + c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_8; + c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_8; @@ -2733,19 +4277,43 @@ index 1a3912c..c87e9d3 100644 + c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_8; + c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_8; + c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_8; ++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_8; ++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_8; ++ 
c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_8; +#endif c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; -+ for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) { -+ c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper; -+ c->sao_band_filter_c[x] = ff_hevc_sao_band_c_neon_wrapper; -+ c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper; -+ c->sao_edge_filter_c[x] = ff_hevc_sao_edge_c_neon_wrapper; -+ } -+ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_neon_8; // width=32 ++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_8; ++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_8; ++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_8; ++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_8; ++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_8; ++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_8; ++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_8; ++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_8; ++ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_8; ++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_8; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_8; ++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_8; ++#endif ++#if RPI_HEVC_SAND ++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_8; ++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_8; ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_8; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_8; ++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_8; ++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_8; ++ ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_8; ++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_8; ++#endif ++#endif put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8; put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8; put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8; -@@ -201,7 +514,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -201,7 +556,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper; @@ -2767,22 +4335,711 @@ index 1a3912c..c87e9d3 100644 c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; -@@ -221,4 +548,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -221,4 +590,82 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8; c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8; } ++ else if (bit_depth == 10) { ++ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon_10; ++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon_10; ++ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon_10; ++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon_10; ++#ifdef RPI ++ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_10; 
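++ /* luma2/uv/uv2 are RPI-specific extended deblock entry points; this
++ 10-bit block mirrors the 8-bit RPI assignments earlier in the file. */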
++ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_10; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_10; ++#endif ++ c->idct[0] = ff_hevc_transform_4x4_neon_10; ++ c->idct[1] = ff_hevc_transform_8x8_neon_10; ++ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_10; ++ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_10; ++ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_10; ++ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_10; ++ c->add_residual[0] = ff_hevc_add_residual_4x4_neon_10; ++ c->add_residual[1] = ff_hevc_add_residual_8x8_neon_10; ++ c->add_residual[2] = ff_hevc_add_residual_16x16_neon_10; ++ c->add_residual[3] = ff_hevc_add_residual_32x32_neon_10; ++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_10; ++ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_10; ++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_10; ++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_10; ++#if RPI_HEVC_SAND ++ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_10; ++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_10; ++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_10; ++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_10; ++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_10; ++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_10; ++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_10; ++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_10; ++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_10; ++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_10; ++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_10; ++ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_10; ++#endif ++ c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_10; ++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_10; ++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_10; ++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_10; ++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_10; ++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_10; ++ ++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_10; ++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_10; ++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_10; ++ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_10; ++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_10; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_10; ++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_10; ++#endif ++#if RPI_HEVC_SAND ++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_10; ++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_10; ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_10; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_10; ++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_10; ++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_10; ++ ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_10; ++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_10; ++#endif ++#endif ++ } + + assert(offsetof(MvField, mv) == 0); + assert(offsetof(MvField, ref_idx) == 8); + assert(offsetof(MvField, pred_flag) == 10); + c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon; } +diff --git a/libavcodec/arm/hevcdsp_res16_neon.S b/libavcodec/arm/hevcdsp_res16_neon.S +new file mode 100644 +index 0000000000..7cc5cd5e5c +--- /dev/null ++++ b/libavcodec/arm/hevcdsp_res16_neon.S +@@ -0,0 +1,610 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++#define BIT_DEPTH 
10
++
++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
++ vmax.s16 \Q0, \Q_MIN
++ vmax.s16 \Q1, \Q_MIN
++ vmax.s16 \Q2, \Q_MIN
++ vmax.s16 \Q3, \Q_MIN
++ vmin.s16 \Q0, \Q_MAX
++ vmin.s16 \Q1, \Q_MAX
++ vmin.s16 \Q2, \Q_MAX
++ vmin.s16 \Q3, \Q_MAX
++.endm
++
++@ add_residual4x4(
++@ uint8_t *_dst, [r0]
++@ int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_add_residual_4x4_neon_, BIT_DEPTH), export=1
++ vld1.16 {q10, q11}, [r1]
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vld1.16 {d0}, [r0, :64], r2
++ vld1.16 {d1}, [r0, :64], r2
++ vld1.16 {d2}, [r0, :64], r2
++ vld1.16 {d3}, [r0, :64], r2
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++ vqadd.s16 q0, q10
++ vqadd.s16 q1, q11
++ sub r0, r0, r2, lsl #2
++ vmax.s16 q0, q0, q8
++ vmax.s16 q1, q1, q8
++ vmin.s16 q0, q0, q9
++ vmin.s16 q1, q1, q9
++ vst1.16 {d0}, [r0, :64], r2
++ vst1.16 {d1}, [r0, :64], r2
++ vst1.16 {d2}, [r0, :64], r2
++ vst1.16 {d3}, [r0, :64], r2
++ bx lr
++
++endfunc
++
++@ add_residual4x4_dc(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc) [r2]
++
++function JOIN(ff_hevc_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vld1.16 {d0}, [r0, :64], r1
++ vld1.16 {d1}, [r0, :64], r1
++ vdup.16 q15, r2
++ vld1.16 {d2}, [r0, :64], r1
++ vld1.16 {d3}, [r0, :64], r1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++ vqadd.s16 q0, q15
++ vqadd.s16 q1, q15
++ sub r0, r0, r1, lsl #2
++ vmax.s16 q0, q0, q8
++ vmax.s16 q1, q1, q8
++ vmin.s16 q0, q0, q9
++ vmin.s16 q1, q1, q9
++ vst1.16 {d0}, [r0, :64], r1
++ vst1.16 {d1}, [r0, :64], r1
++ vst1.16 {d2}, [r0, :64], r1
++ vst1.16 {d3}, [r0, :64], r1
++ bx lr
++
++endfunc
++
++
++@ add_residual8x8(
++@ uint8_t *_dst, [r0]
++@ int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_add_residual_8x8_neon_, BIT_DEPTH), export=1
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++ mov r12, #2
++1:
++ vldm r1!, {q10-q13}
++ vld1.16 {q0}, [r0, :128], r2
++ subs r12, #1
++ vld1.16 {q1}, [r0, :128], r2
++ vqadd.s16 q0, q10
++ vld1.16 {q2}, [r0, :128], r2
++ vqadd.s16 q1, q11
++ vld1.16 {q3}, [r0, :128], r2
++ vqadd.s16 q2, q12
++ vqadd.s16 q3, q13
++ sub r0, r0, r2, lsl #2
++ vmax.s16 q0, q0, q8
++ vmax.s16 q1, q1, q8
++ vmax.s16 q2, q2, q8
++ vmax.s16 q3, q3, q8
++ vmin.s16 q0, q0, q9
++ vmin.s16 q1, q1, q9
++ vst1.16 {q0}, [r0, :128], r2
++ vmin.s16 q2, q2, q9
++ vst1.16 {q1}, [r0, :128], r2
++ vmin.s16 q3, q3, q9
++ vst1.16 {q2}, [r0, :128], r2
++ vst1.16 {q3}, [r0, :128], r2
++ bne 1b
++ bx lr
++
++endfunc
++
++@ add_residual4x4_dc_c(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc_uv) [r2]
++
++function JOIN(ff_hevc_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1
++ mov r12, #1
++ vdup.32 q15, r2
++ b 9f
++endfunc
++
++@ add_residual8x8_dc(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc) [r2]
++
++function JOIN(ff_hevc_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1
++ mov r12, #2
++ vdup.16 q15, r2
++9:
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++1:
++ vld1.16 {q0}, [r0, :128], r1
++ subs r12, #1
++ vld1.16 {q1}, [r0, :128], r1
++ vqadd.s16 q0, q15
++ vld1.16 {q2}, [r0, :128], r1
++ vqadd.s16 q1, q15
++ vld1.16 {q3}, [r0, :128], r1
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q15
++ sub r0, r0, r1, lsl #2
++ vmax.s16 q0, q8
++ vmax.s16 q1, q8
++ vmax.s16 q2, q8
++ vmax.s16 q3, q8
++ vmin.s16 q0, q9
++ vmin.s16 q1, q9
++ vst1.16 {q0}, [r0, :128], r1
++ vmin.s16 q2, q9
++ vst1.16 {q1}, [r0, :128], r1
++ vmin.s16 q3, q9
++ vst1.16 {q2}, [r0,
:128], r1
++ vst1.16 {q3}, [r0, :128], r1
++ bne 1b
++ bx lr
++
++endfunc
++
++@ add_residual16x16(
++@ uint8_t *_dst, [r0]
++@ int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_add_residual_16x16_neon_, BIT_DEPTH), export=1
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++ mov r12, #8
++1:
++ vldm r1!, {q10-q13}
++ @ For RPI Sand we could guarantee :256 but not for general
++ @ non-RPI allocation. :128 is as good as we can claim
++ vld1.16 {q0, q1}, [r0, :128], r2
++ subs r12, #1
++ vld1.16 {q2, q3}, [r0, :128]
++ vqadd.s16 q0, q10
++ vqadd.s16 q1, q11
++ vqadd.s16 q2, q12
++ vqadd.s16 q3, q13
++ sub r0, r2
++ vmax.s16 q0, q0, q8
++ vmax.s16 q1, q1, q8
++ vmax.s16 q2, q2, q8
++ vmax.s16 q3, q3, q8
++ vmin.s16 q0, q0, q9
++ vmin.s16 q1, q1, q9
++ vmin.s16 q2, q2, q9
++ vmin.s16 q3, q3, q9
++ vst1.16 {q0, q1}, [r0, :128], r2
++ vst1.16 {q2, q3}, [r0, :128], r2
++ bne 1b
++ bx lr
++endfunc
++
++@ add_residual8x8_dc_c(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc_uv) [r2]
++
++function JOIN(ff_hevc_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1
++ mov r12, #4
++ vdup.32 q15, r2
++ b 9f
++endfunc
++
++@ add_residual16x16_dc(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc) [r2]
++
++function JOIN(ff_hevc_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1
++ vdup.i16 q15, r2
++ mov r12, #8
++9:
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++1:
++ @ For RPI Sand we could guarantee :256 but not for general
++ @ non-RPI allocation. :128 is as good as we can claim
++ vld1.16 {q0, q1}, [r0, :128], r1
++ subs r12, #1
++ vld1.16 {q2, q3}, [r0, :128]
++ vqadd.s16 q0, q15
++ vqadd.s16 q1, q15
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q15
++ sub r0, r1
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0, q1}, [r0, :128], r1
++ vst1.16 {q2, q3}, [r0, :128], r1
++ bne 1b
++ bx lr
++
++endfunc
++
++
++@ add_residual32x32(
++@ uint8_t *_dst, [r0]
++@ int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_add_residual_32x32_neon_, BIT_DEPTH), export=1
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++ mov r12, #32
++1:
++ vldm r1!, {q10-q13}
++ vldm r0, {q0-q3}
++ subs r12, #1
++ vqadd.s16 q0, q10
++ vqadd.s16 q1, q11
++ vqadd.s16 q2, q12
++ vqadd.s16 q3, q13
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vstm r0, {q0-q3}
++ add r0, r2
++ bne 1b
++ bx lr
++
++endfunc
++
++@ add_residual16x16_dc_c(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc_uv) [r2]
++
++function JOIN(ff_hevc_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1
++ mov r12, #16
++ vdup.32 q15, r2
++ b 9f
++endfunc
++
++@ add_residual32x32_dc(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc) [r2]
++
++function JOIN(ff_hevc_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1
++ vdup.i16 q15, r2
++ mov r12, #32
++9:
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++1:
++ vldm r0, {q0-q3}
++ subs r12, #1
++ vqadd.s16 q0, q15
++ vqadd.s16 q1, q15
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q15
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vstm r0, {q0-q3}
++ add r0, r1
++ bne 1b
++ bx lr
++
++endfunc
++
++@ ============================================================================
++@ U add
++
++@ add_residual4x4_u(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc) [r3]
++
++function JOIN(ff_hevc_add_residual_4x4_u_neon_, BIT_DEPTH), export=1
++ vld1.16 {q10, q11}, [r1, :256]
++ vdup.16 q15, r3
++ movw r3, #(1 << BIT_DEPTH) - 1
++
vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, r0, r2, lsl #2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, r2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, #32 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256]! ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1, :256] ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, r0, r2, lsl #2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! 
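++ @ The vld2.16s above deinterleave the CbCr rows: U samples land in
++ @ q0/q2 and V samples in q1/q3, so the residual (q10/q11) is applied
++ @ to the V lanes below while the U lanes just get the dc bias in q15.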
++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, r2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, #32 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256]! ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 ++ vldm r1, {q10-q13} ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r0, r2, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++ add r3, r1, #(8*8*2) @ Offset to V ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ vld1.16 {q12, q13}, [r3, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ add r3, r1, #(16*16*2) @ Offset to V ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ vld1.16 {q12, q13}, [r3, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, #32 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst2.16 {q0, q1}, [r0, :256]! 
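++ @ First 32 bytes stored with writeback; the second store then adds r2,
++ @ which was reduced by 32 at entry, so r0 steps one full stride per row.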
++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S new file mode 100644 -index 0000000..08a021d +index 0000000000..30113d9c93 --- /dev/null +++ b/libavcodec/arm/hevcdsp_sao_neon.S -@@ -0,0 +1,862 @@ +@@ -0,0 +1,1882 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi + * @@ -2806,124 +5063,211 @@ index 0000000..08a021d +#include "libavutil/arm/asm.S" +#include "neon.S" + -+.macro init_sao_band -+ pld [r1] -+ vld1.8 {q0, q1}, [r2] // offset table -+ ldr r2, [sp, #0] // stride_dst -+ ldr r12, [sp, #4] // height -+ vmov.u8 q3, #128 -+.endm ++.set EDGE_SRC_STRIDE, 160 ++ ++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128 ++ vshr.u8 q12, q8, #3 ++ vadd.s8 q8, \Q_K128 ++ vshr.u8 q13, q9, #3 ++ vadd.s8 q9, \Q_K128 ++ ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 + -+// 128 in q3 -+// input q8 - q11 -+.macro sao_band_64 -+ vtbl.8 d24, {d0, d1, d2, d3}, d24 -+ vadd.s8 q8, q3 -+ vtbl.8 d25, {d0, d1, d2, d3}, d25 -+ vadd.s8 q9, q3 -+ vtbl.8 d26, {d0, d1, d2, d3}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d0, d1, d2, d3}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0, d1, d2, d3}, d28 + vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0, d1, d2, d3}, d29 ++ vshr.u8 q12, q10, #3 ++ vadd.s8 q10, \Q_K128 + vqadd.s8 q9, q13 -+ vtbl.8 d30, {d0, d1, d2, d3}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d0, d1, d2, d3}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 ++ vshr.u8 q13, q11, #3 ++ vadd.s8 q11, \Q_K128 ++ ++ vsub.s8 q8, \Q_K128 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vsub.s8 q9, \Q_K128 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vqadd.s8 q10, q12 ++ vqadd.s8 q11, q13 ++ vsub.s8 q10, \Q_K128 ++ vsub.s8 q11, \Q_K128 +.endm + -+function ff_hevc_sao_band_w8_neon_8, export=1 -+ init_sao_band -+1: subs r12, #8 -+ vld1.8 {d16}, [r1, :64], r3 -+ vld1.8 {d17}, [r1, :64], r3 -+ vshr.u8 q12, q8, #3 -+ vld1.8 {d18}, [r1, :64], r3 -+ vld1.8 {d19}, [r1, :64], r3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {d20}, [r1, :64], r3 -+ vld1.8 {d21}, [r1, :64], r3 -+ vshr.u8 q14, q10, #3 -+ vld1.8 {d22}, [r1, :64], r3 -+ vld1.8 {d23}, [r1, :64], r3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {d16}, [r0, :64], r2 -+ vst1.8 {d17}, [r0, :64], r2 -+ vst1.8 {d18}, [r0, :64], r2 -+ vst1.8 {d19}, [r0, :64], r2 -+ vst1.8 {d20}, [r0, :64], r2 -+ vst1.8 {d21}, [r0, :64], r2 -+ vst1.8 {d22}, [r0, :64], r2 -+ vst1.8 {d23}, [r0, :64], r2 -+ bne 1b ++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128 ++ vshr.u8 q12, q8, #3 ++ vadd.s8 q8, \Q_K128 + -+ bx lr ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ ++ vqadd.s8 q8, q12 ++ vsub.s8 q8, \Q_K128 ++.endm ++ ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ Clobbers q12, q13 ++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vshrn.i16 d26, \Q2, #(\bit_depth - 5) ++ vshrn.i16 d27, \Q3, #(\bit_depth - 5) ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vtbl.8 d26, \XLAT0, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vaddw.s8 \Q2, d26 ++ vaddw.s8 \Q3, d27 ++ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX ++.endm ++ ++@ Clobbers q12 
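++@ Same narrow/lookup/widen scheme as the 64-byte variant, on two quads:
++@ vshrn takes the band index from the top five bits, vtbl fetches the
++@ signed offset, vaddw adds it at 16-bit precision, and the result is
++@ clamped to [0, (1 << bit_depth) - 1] via Q_MIN/Q_MAX.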
++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++.endm ++ ++ ++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) ++@ so we are quite safe stuffing it into a byte array ++@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma ++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of ++@ precision ++ ++@ This, somewhat nasty, bit of code builds the {d0-d3} translation ++@ array via the stack ++@ Given that sao_left_class > 28 can cause wrap we can't just poke ++@ all 4 bytes in at once ++@ ++@ It also loads other common regs ++ ++function band_load_y ++ vmov.i64 q0, #0 ++ ldr r12, [sp, #8] @ &sao_offset_val[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ vmov.i64 q1, #0 ++ ldr r12, [sp, #12] @ sao_left_class ++ ++ mov r4, sp ++ sub sp, #32 ++ and sp, #~63 @ Align stack so we can wrap with a simple AND ++ vst1.8 {q0, q1}, [sp, :256] @ Put zero array on stack ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array ++ mov sp, r4 ++ ++ ldr r12, [sp, #20] @ height ++ pld [r1] ++ ++ sub r12, #1 ++ add r4, r1, r3 ++ bx lr +endfunc + -+function ff_hevc_sao_band_w16_neon_8, export=1 -+ init_sao_band -+1: subs r12, #4 -+ vld1.8 {q8}, [r1, :128], r3 -+ vshr.u8 q12, q8, #3 -+ vld1.8 {q9}, [r1, :128], r3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vld1.8 {q11}, [r1, :128], r3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {q8}, [r0, :128], r2 -+ vst1.8 {q9}, [r0, :128], r2 -+ vst1.8 {q10}, [r0, :128], r2 -+ vst1.8 {q11}, [r0, :128], r2 -+ bne 1b + -+ bx lr -+endfunc ++function band_load_c ++ vmov.i64 q2, #0 ++ ldr r12, [sp, #8] @ &sao_offset_val1[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ vmov.i64 q3, #0 ++ ldr r12, [sp, #12] @ sao_left_class + -+function ff_hevc_sao_band_w32_neon_8, export=1 -+ init_sao_band -+1: subs r12, #2 -+ vld1.8 {q8-q9}, [r1, :128], r3 -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {q8-q9}, [r0, :128], r2 -+ vst1.8 {q10-q11}, [r0, :128], r2 -+ bne 1b ++ mov r4, sp @ Remember SP ++ sub sp, #32 ++ and sp, #~63 @ Align stack so we can wrap with a simple AND + -+ bx lr -+endfunc ++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array + -+function ff_hevc_sao_band_w64_neon_8, export=1 -+ init_sao_band ++ @ And again for the 2nd set ++ ldr r12, [r4, #16] @ &sao_offset_val2[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ ldr r12, [r4, #20] @ sao_left_class2 ++ ++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack (again) ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! 
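++ @ The AND #~32 below wraps the write pointer back into the 32-byte
++ @ table (the stack was 64-byte aligned above), reproducing the
++ @ (k + sao_left_class) & 31 wrap of the generic C code.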
++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q2, q3}, [sp, :256] @ Pop modified array ++ ++ mov sp, r4 ++ ++ ldr r12, [sp, #28] @ height ++ pld [r1] + -+ push {r4, lr} + subs r12, #1 -+ mov r4, r1 -+ it ne -+ addne r4, r3 ++ add r4, r1, r3 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_sao_band_64_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_64_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 + +1: subs r12, #1 + vldm r1, {q8-q11} + pld [r4] -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 + add r1, r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ + it ne + addne r4, r3 + vstm r0, {q8-q11} @@ -2933,8 +5277,113 @@ index 0000000..08a021d + pop {r4, pc} +endfunc + ++@ ff_hevc_sao_band_32_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] + -+@ ff_hevc_sao_band_c_w64_neon_8( ++function ff_hevc_sao_band_32_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 ++ ++1: subs r12, #2 ++ vld1.8 { q8, q9 }, [r1, :128], r3 ++ vld1.8 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 { q8, q9 }, [r0, :128], r2 ++ vst1.8 {q10, q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_16_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_16_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 ++ ++1: subs r12, #4 ++ vld1.8 { q8}, [r1, :128], r3 ++ vld1.8 { q9}, [r1, :128], r3 ++ vld1.8 {q10}, [r1, :128], r3 ++ vld1.8 {q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 { q8}, [r0, :128], r2 ++ vst1.8 { q9}, [r0, :128], r2 ++ vst1.8 {q10}, [r0, :128], r2 ++ vst1.8 {q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_8_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_8_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ ldr lr, [sp, #16] @ width ++ vmov.u8 q15, #128 ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #2 ++ vld1.8 {d16}, [r1, :64], r3 ++ vld1.8 {d17}, [r1, :64], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 {d16}, [r0, :64], r2 ++ vst1.8 {d17}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #4 ++ vld1.32 {d16[0]}, [r1, :32], r3 ++ vld1.32 {d16[1]}, [r1, :32], r3 ++ vld1.32 {d17[0]}, [r1, :32], r3 ++ vld1.32 {d17[1]}, [r1, :32], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.32 {d16[0]}, [r0, :32], r2 ++ vst1.32 {d16[1]}, [r0, :32], r2 ++ vst1.32 {d17[0]}, [r0, :32], r2 ++ vst1.32 {d17[1]}, [r0, :32], r2 ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_32_neon_8( +@ uint8_t * dst [r0] +@ uint8_t 
* src [r1] +@ uint32_t dst_stride [r2] @@ -2946,707 +5395,1535 @@ index 0000000..08a021d +@ int width sp[16] +@ int height sp[20] + -+@ As this is often done in-place on the frame buffer it is worth preloading -+@ the pixel values but we want to beware of loading ouside our buffer to avoid -+@ loading stuff into the cache that should still be invalid (in use by QPU, VPU) ++function ff_hevc_sao_band_c_32_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c + -+function ff_hevc_sao_band_c_neon_8, export=1 -+ mov r12, sp -+ push {r4-r8, lr} // 24 bytes ++ vmov.i8 q15, #128 ++ sub r3, #32 ++ sub r2, #32 + -+ ldm r12, {r4-r7} ++1: subs r12, #1 ++ vld2.8 { q8, q9 }, [r1, :128]! ++ vld2.8 {q10, q11}, [r1, :128], r3 + -+ add r4, #2 -+ add r6, #2 -+ vld1.16 {d16}, [r4] @ Unaligned -+ lsl r5, r5, #3 -+ vld1.16 {d18}, [r6] -+ pld [r1] -+ vmov.i8 d17, #0 -+ mov r4, r1 -+ vmov.i8 d19, #0 -+ lsl r7, r7, #3 -+ vdup.8 q1, r5 -+ ldr r5, [r12, #16] @ width -+ vdup.8 q2, r7 -+ ldr r12, [r12, #20] -+ vqmovn.s16 d0, q8 -+ cmp r5, #16 @ At some point we may want a table lookup -+ vqmovn.s16 d1, q9 -+ vmov.i8 q3, #128 -+ beq 16f ++ pld [r4] + -+ @ d0 U lookup -+ @ d1 V lookup -+ @ q1 U raw offset -+ @ q2 V raw offset -+ @ q3 #128 ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 + -+ @ r4 = r1 = src - Inteded for preload pointer -+ @ r12 = height ++ vst2.8 { q8, q9 }, [r0, :128]! ++ vst2.8 {q10, q11}, [r0, :128], r2 ++ ++ itt ne ++ addne r4, r3 ++ addne r4, #32 ++ ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_16_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_sao_band_c_16_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c ++ vmov.i8 q15, #128 ++ ++1: subs r12, #2 ++ vld2.8 { q8, q9 }, [r1, :128], r3 ++ vld2.8 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vst2.8 { q8, q9 }, [r0, :128], r2 ++ vst2.8 {q10, q11}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_8_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_sao_band_c_8_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c ++ ldr lr, [sp, #16] @ width ++ vmov.u8 q15, #128 ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #1 ++ vld2.8 {d16, d17}, [r1, :128], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #1 ++ vld1.8 {d16}, [r1, :64], r3 ++ vld1.8 {d17}, [r1, :64], r3 ++ vuzp.8 d16, d17 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vzip.8 d16, d17 ++ vst1.8 {d16}, [r0, :64], r2 ++ vst1.8 {d17}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++ ++@ ff_hevc_sao_band_64_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_64_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q2, #0 ++ vdup.i16 q3, lr ++ bl band_load_y ++ vpush 
{q4-q7} ++ ++1: subs r12, #1 ++ vldm r1, {q4-q11} ++ add r1, r3 ++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ vstm r0, {q4-q11} ++ add r0, r2 ++ bpl 1b ++ ++ vpop {q4-q7} ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_64_neon_10, export=1 ++ band_64_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_32_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_32_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q2, #0 ++ vdup.i16 q3, lr ++ bl band_load_y ++ ++1: subs r12, #1 ++ vldm r1, {q8-q11} ++ add r1, r3 ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ vstm r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_32_neon_10, export=1 ++ band_32_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_16_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_16_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ bl band_load_y ++ ++1: subs r12, #2 ++ vld1.16 { q8, q9 }, [r1, :128], r3 ++ vld1.16 {q10, q11}, [r1, :128], r3 ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 { q8, q9 }, [r0, :128], r2 ++ vst1.16 {q10, q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_16_neon_10, export=1 ++ band_16_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_8_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_8_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ bl band_load_y ++ ldr lr, [sp, #16] ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #2 ++ vld1.16 { q8}, [r1, :128], r3 ++ vld1.16 { q9}, [r1, :128], r3 ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 { q8}, [r0, :128], r2 ++ vst1.16 { q9}, [r0, :128], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #4 ++ vld1.16 {d16}, [r1, :64], r3 ++ vld1.16 {d17}, [r1, :64], r3 ++ vld1.16 {d18}, [r1, :64], r3 ++ vld1.16 {d19}, [r1, :64], r3 ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 {d16}, [r0, :64], r2 ++ vst1.16 {d17}, [r0, :64], r2 ++ vst1.16 {d18}, [r0, :64], r2 ++ vst1.16 {d19}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_8_neon_10, export=1 ++ band_8_16 10 ++endfunc ++ ++ ++@ ff_hevc_sao_band_c_32_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_32_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ vpush {q4-q7} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ 
vdup.i16 q15, lr ++ sub r2, #96 ++ ++1: subs r12, #1 ++ ++ vld2.16 { q4, q5 }, [r1, :128]! ++ vld2.16 { q6, q7 }, [r1, :128]! ++ vld2.16 { q8, q9 }, [r1, :128]! ++ vld2.16 {q10, q11}, [r1, :128], r3 ++ ++ pld [r4] ++ sub r1, #96 ++ ++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth + -+ @ Might (unlikely) be called with height == 1 -+ subs r12, #1 + it ne + addne r4, r3 + -+1: -+ subs r12, #1 -+ vld2.8 {q8-q9}, [r1, :128]! -+ vsub.u8 q12, q8, q1 -+ vld2.8 {q10-q11}, [r1, :128], r3 -+ vsub.u8 q14, q10, q1 -+ vsub.u8 q13, q9, q2 -+ sub r1, #32 -+ vsub.u8 q15, q11, q2 -+ pld [r4] -+ vshr.u8 q12, #3 -+ vadd.s8 q8, q3 -+ vshr.u8 q13, #3 -+ vadd.s8 q9, q3 ++ vst2.16 { q4, q5 }, [r0, :128]! ++ vst2.16 { q6, q7 }, [r0, :128]! ++ vst2.16 { q8, q9 }, [r0, :128]! ++ vst2.16 {q10, q11}, [r0, :128], r2 + -+ vtbl.8 d24, {d0}, d24 -+ vshr.u8 q14, #3 -+ vtbl.8 d25, {d0}, d25 -+ vshr.u8 q15, #3 -+ vtbl.8 d26, {d1}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d1}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0}, d28 -+ vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0}, d29 -+ vqadd.s8 q9, q13 -+ vtbl.8 d30, {d1}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d1}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 -+ -+ it ne -+ addne r4, r3 @ Do not inc on final pass -+ vst2.8 {q8-q9}, [r0, :128]! -+ vst2.8 {q10-q11}, [r0, :128], r2 -+ sub r0, #32 + bpl 1b + -+ pop {r4-r8, pc} -+ -+@ -- width 16 (UV pairs) -- -+16: -+ subs r12, #2 -+ it ne -+ addne r4, r4, r3, lsl #1 -+ -+1: -+ subs r12, #2 -+ vld2.8 {q8-q9}, [r1, :128], r3 -+ vsub.u8 q12, q8, q1 -+ vld2.8 {q10-q11}, [r1, :128], r3 -+ vsub.u8 q14, q10, q1 -+ vsub.u8 q13, q9, q2 -+ pld [r4] -+ vsub.u8 q15, q11, q2 -+ pld [r4, r3] -+ vshr.u8 q12, #3 -+ vadd.s8 q8, q3 -+ vshr.u8 q13, #3 -+ vadd.s8 q9, q3 -+ -+ vtbl.8 d24, {d0}, d24 -+ vshr.u8 q14, #3 -+ vtbl.8 d25, {d0}, d25 -+ vshr.u8 q15, #3 -+ vtbl.8 d26, {d1}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d1}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0}, d28 -+ vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0}, d29 -+ vqadd.s8 q9, q13 -+ vtbl.8 d30, {d1}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d1}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 -+ -+ it ne -+ addne r4, r4, r3, lsl #1 -+ vst2.8 {q8-q9}, [r0, :128], r2 -+ vst2.8 {q10-q11}, [r0, :128], r2 -+ bpl 1b -+ -+ pop {r4-r8, pc} ++ vpop {q4-q7} ++ pop {r4, pc} ++.endm + ++function ff_hevc_sao_band_c_32_neon_10, export=1 ++ band_c_32_16 10 +endfunc + + -+.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3 -+ vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0 -+ vcgt.u8 \out1, \in3, \in1 // c > a -> -1 , otherwise 0 part 2 -+ vcgt.u8 \tmp1, \in1, \in3 // a > c -> -1 , otherwise 0 part 2 -+ vsub.s8 \out0, \tmp0, \out0 // diff0 -+ vsub.s8 \out1, \tmp1, \out1 // diff0 part 2 ++@ ff_hevc_sao_band_c_16_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_16_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ sub r2, #32 ++ sub r3, #32 ++ ++1: subs r12, #1 ++ ++ vld2.16 { q8, q9 }, [r1, :128]! 
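++ @ vld2.16 splits the CbCr pairs as they load: q8/q10 collect the U
++ @ samples and q9/q11 the V samples, letting sao_band_64b_16 apply the
++ @ U table in {d0-d3} and the V table in {d4-d7} per plane.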
++ vld2.16 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 { q8, q9 }, [r0, :128]! ++ vst2.16 {q10, q11}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} +.endm + ++function ff_hevc_sao_band_c_16_neon_10, export=1 ++ band_c_16_16 10 ++endfunc + -+// input -+// a in q0 - q3 -+// c in q4 - q7 -+// b in q8 - q11 -+// offset table r4,r5 and r6,r7 -+// r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C -+// output in q0 - q3 -+// clobbers q12 - q15 + -+@ a <- c <- b ++@ ff_hevc_sao_band_c_8_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_8_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ ldr lr, [sp, #24] @ width ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #1 ++ vld2.16 { q8, q9 }, [r1, :128], r3 ++ ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 { q8, q9 }, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #2 ++ vld2.16 {d16, d17}, [r1, :128], r3 ++ vld2.16 {d18, d19}, [r1, :128], r3 ++ ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 {d16, d17}, [r0, :128], r2 ++ vst2.16 {d18, d19}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_c_8_neon_10, export=1 ++ band_c_8_16 10 ++endfunc ++ ++ ++@ ============================================================================= ++@ SAO EDGE ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ [r5] translate values +@ -+@ It appears that Neon can stall if you try and use results too soon so we try to -+@ spread our instruction out ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 + -+.macro edgeidx64 ++function edge_64b_body_8 + -+ vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q1 -+ vcgt.u8 q14, q6, q2 -+ vcgt.u8 q15, q7, q3 ++ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q1 ++ vcgt.u8 q14, q6, q2 ++ vcgt.u8 q15, q7, q3 + -+ vcgt.u8 q0, q0, q4 // a > c -> -1 , otherwise 0 -+ vcgt.u8 q1, q1, q5 -+ vcgt.u8 q2, q2, q6 -+ vcgt.u8 q3, q3, q7 ++ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0 ++ vcgt.u8 q1, q5 ++ vcgt.u8 q2, q6 ++ vcgt.u8 q3, q7 + -+ vsub.s8 q0, q0, q12 // a = sign(c-a) -+ vsub.s8 q1, q1, q13 -+ vsub.s8 q2, q2, q14 -+ vsub.s8 q3, q3, q15 ++ vsub.s8 q0, q12 @ a = sign(c-a) ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 + -+ vcgt.u8 q12, q4, q8 // c > b -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q9 -+ vcgt.u8 q14, q6, q10 -+ vcgt.u8 q15, q7, q11 ++ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q9 ++ vcgt.u8 q14, q6, q10 ++ vcgt.u8 q15, q7, q11 + -+ vsub.s8 q0, q0, q12 -+ vsub.s8 q1, q1, q13 -+ vsub.s8 q2, q2, q14 -+ vsub.s8 q3, q3, q15 ++ vsub.s8 q0, q12 ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 + -+ vcgt.u8 q12, q8, q4 // c < b -> -1 , otherwise 0 -+ vcgt.u8 q13, q9, q5 -+ vcgt.u8 q14, q10, q6 -+ vcgt.u8 q15, q11, q7 ++ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0 ++ vcgt.u8 q13, q9, q5 ++ vcgt.u8 q14, q10, q6 ++ vcgt.u8 q15, q11, q7 + -+ vadd.s8 q0, q0, q12 // a = sign(c-a) + sign(c-b) -+ vadd.s8 q1, q1, q13 -+ vmov.u8 q12, #2 -+ vadd.s8 q2, q2, q14 -+ vadd.s8 q3, q3, q15 ++ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b) ++ vadd.s8 q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s8 q2, q14 ++ vadd.s8 q3, q15 + -+ vadd.s8 q0, q0, q12 -+ vadd.s8 q1, q1, q12 -+ @ whilst vmov dn, rm, rn exists it is a vfp instruction -+ @ and causes a stall till neon pipe empty - so don't do that! 
-+ vmov d26[0], r4 -+ vmov d26[1], r5 -+ vmov d27[0], r6 -+ vmov d27[1], r7 -+ vadd.s8 q2, q2, q12 -+ vuzp.8 q0, q1 -+ vmov.u8 q15, #128 -+ vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b) ++ vadd.s8 q0, q12 ++ vadd.s8 q1, q12 + -+ vtbl.8 d0, {d26}, d0 -+ vadd.s8 q12, q4, q15 // Add -128 so we can use saturating signed add ++ vld1.8 {d26, d27}, [r5] + -+ vtbl.8 d1, {d26}, d1 -+ vadd.s8 q14, q5, q15 ++ vadd.s8 q2, q12 ++ vuzp.8 q0, q1 ++ vmov.u8 q15, #128 ++ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b) + -+ vtbl.8 d2, {d27}, d2 -+ vuzp.8 q2, q3 ++ vtbl.8 d0, {d26}, d0 ++ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add + -+ vtbl.8 d3, {d27}, d3 ++ vtbl.8 d1, {d26}, d1 ++ vadd.s8 q14, q5, q15 + -+ vtbl.8 d4, {d26}, d4 -+ vzip.8 q0, q1 ++ vtbl.8 d2, {d27}, d2 ++ vuzp.8 q2, q3 + -+ vtbl.8 d5, {d26}, d5 -+ vqadd.s8 q0, q0, q12 -+ vqadd.s8 q1, q1, q14 -+ vadd.s8 q12, q6, q15 // Add -128 so we can use saturating signed add ++ vtbl.8 d3, {d27}, d3 + -+ vtbl.8 d6, {d27}, d6 -+ vadd.s8 q14, q7, q15 // Add -128 so we can use saturating signed add ++ vtbl.8 d4, {d26}, d4 ++ vzip.8 q0, q1 + -+ vtbl.8 d7, {d27}, d7 -+ vzip.8 q2, q3 ++ vtbl.8 d5, {d26}, d5 ++ vqadd.s8 q0, q12 ++ vqadd.s8 q1, q14 ++ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add + -+ vsub.s8 q0, q0, q15 -+ vqadd.s8 q2, q2, q12 -+ vqadd.s8 q3, q3, q14 -+ vsub.s8 q1, q1, q15 -+ vsub.s8 q2, q2, q15 -+ vsub.s8 q3, q3, q15 ++ vtbl.8 d6, {d27}, d6 ++ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add + -+.endm ++ vtbl.8 d7, {d27}, d7 ++ vzip.8 q2, q3 ++ ++ vsub.s8 q0, q15 ++ vqadd.s8 q2, q12 ++ vqadd.s8 q3, q14 ++ vsub.s8 q1, q15 ++ vsub.s8 q2, q15 ++ vsub.s8 q3, q15 ++ ++ bx lr ++endfunc ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ r4 upper clip value ++@ [r5] translate values ++@ ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 ++ ++function edge_64b_body_16 ++ ++ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q1 ++ vcgt.u16 q14, q6, q2 ++ vcgt.u16 q15, q7, q3 ++ ++ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 ++ vcgt.u16 q1, q1, q5 ++ vcgt.u16 q2, q2, q6 ++ vcgt.u16 q3, q3, q7 ++ ++ vsub.s16 q0, q0, q12 // a = sign(c-a) ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q9 ++ vcgt.u16 q14, q6, q10 ++ vcgt.u16 q15, q7, q11 ++ ++ vsub.s16 q0, q0, q12 ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 ++ vcgt.u16 q13, q9, q5 ++ vcgt.u16 q14, q10, q6 ++ vcgt.u16 q15, q11, q7 ++ ++ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) ++ vadd.s16 q1, q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s16 q2, q2, q14 ++ vadd.s16 q3, q3, q15 ++ ++ vmovn.s16 d0, q0 ++ vmovn.s16 d1, q1 ++ vmovn.s16 d2, q2 ++ vmovn.s16 d3, q3 ++ ++ vuzp.8 q0, q1 ++ ++ vld1.8 {d26, d27}, [r5] ++ ++ vadd.s8 q0, q0, q12 ++ vadd.s8 q1, q1, q12 ++ ++ vtbl.8 d0, {d26}, d0 ++ vtbl.8 d1, {d26}, d1 ++ vtbl.8 d2, {d27}, d2 ++ vtbl.8 d3, {d27}, d3 ++ ++ vmov.i64 q12, #0 ++ ++ vzip.8 q0, q1 ++ ++ vdup.i16 q13, r4 ++ ++ @ Avoid overwrite whilst widening ++ vaddw.s8 q2, q6, d2 ++ vaddw.s8 q3, q7, d3 ++ vaddw.s8 q1, q5, d1 ++ vaddw.s8 q0, q4, d0 ++ ++ @ now clip ++ clip16_4 q2, q3, q1, q0, q12, q13 + -+function edge_w64_body -+ edgeidx64 -+ vstm r0, {q0-q3} -+ add r0, r0, r2 + bx lr +endfunc + -+.macro init_edge_64 -+ push {r4-r8,lr} -+ ldr r12, [sp, #24] // height -+ ldr r5, [sp, #28] // sao_offset_val_table -+ ldrd r4, r5, [r5] -+ mov r6, r4 -+ mov r7, r5 -+.endm + -+function ff_hevc_sao_edge_eo0_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+ sub r1, #8 -+1: subs r12, #1 -+ vld1.64 {d7}, [r1, :64]! -+ vld1.64 {q4-q5}, [r1, :128]! // load c -+ vld1.64 {q6-q7}, [r1, :128]! 
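[Editor's note: the 16-bit body above performs the same classification in 16-bit lanes, narrows the small sign sums to bytes purely so vtbl can do the table lookup, then widens the looked-up offsets back with vaddw and clamps against the bit-depth maximum (q12 = #0, q13 = r4). A hedged scalar equivalent, names again invented for illustration:]

    /* 16-bit variant of the edge body: identical classification, but the
     * final clamp is [0, (1 << bit_depth) - 1] rather than the implicit
     * 8-bit saturation of the vqadd.s8 path. */
    static inline uint16_t sao_edge_pixel_16(uint16_t a, uint16_t c, uint16_t b,
                                             const int8_t xlat[5],
                                             int max_pel) /* (1 << bd) - 1 */
    {
        const int idx = 2 + ((c > a) - (a > c)) + ((c > b) - (b > c));
        const int v = (int)c + xlat[idx];
        return v < 0 ? 0 : v > max_pel ? (uint16_t)max_pel : (uint16_t)v;
    }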
-+ vld1.64 {d24}, [r1, :64], r3 -+ sub r1, #72 -+ // load a -+ vext.8 q0, q3, q4, #15 -+ vext.8 q1, q4, q5, #15 -+ vext.8 q2, q5, q6, #15 -+ vext.8 q3, q6, q7, #15 -+ // load b -+ vext.8 q8, q4, q5, #1 -+ vext.8 q9, q5, q6, #1 -+ vext.8 q10, q6, q7, #1 -+ vext.8 q11, q7, q12, #1 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3, q9, q10 ++@ ++@ d16, d17 (q8) xlat U, V ++@ q14.u8 #2 ++@ q15.u8 #128 ++ ++function edge_16b_body_8 ++ vcgt.u8 q3, q1, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u8 q0, q1 @ a > c -> -1 , otherwise 0 ++ vcgt.u8 q9, q1, q2 @ c > b -> -1 , otherwise 0 ++ vcgt.u8 q10, q2, q1 @ c < b -> -1 , otherwise 0 ++ ++ vsub.s8 q0, q3 ++ vsub.s8 q10, q9 ++ vadd.s8 q0, q10 @ a = sign(c-a) ++ ++ vadd.s8 q0, q14 ++ vuzp.8 d0, d1 ++ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ vqadd.s8 q0, q3 ++ vsub.s8 q0, q15 ++ ++ bx lr +endfunc + -+function ff_hevc_sao_edge_eo1_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+ sub r1, r3 ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3 ++@ ++@ q12, #0 ++@ d16, d17 xlat U, V ++@ q14.u8 #2 ++@ q15.u16 max ++function edge_16b_body_16 ++ vcgt.u16 q3, q1, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u16 q0, q1 @ a > c -> -1 , otherwise 0 ++ vsub.s16 q0, q3 @ a = sign(c-a) ++ vcgt.u16 q3, q1, q2 @ c > b -> -1 , otherwise 0 ++ vsub.s16 q0, q3 ++ vcgt.u16 q3, q2, q1 @ c < b -> -1 , otherwise 0 ++ vadd.s16 q0, q3 @ a = sign(c-a) + sign(c-b) ++ ++ vmovn.s16 d0, q0 ++ @ d1 will have random contents that we transform but ++ @ that doesn't matter as we then discard them ++ vuzp.8 d0, d1 ++ ++ vadd.s8 q0, q0, q14 ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ ++ vaddw.s8 q0, q1, d0 ++ ++ @ now clip ++ vmax.s16 q0, q12 ++ vmin.s16 q0, q15 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_sao_edge_[c_]xx_neon( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only ++@ int eo, [sp, #sp_base + 0] ++@ int width, [sp, #sp_base + 4] ++@ int height) [sp, #sp_base + 8] ++ ++.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0 ++ push {r4-r6, lr} @ 16 bytes ++.set sp_base, 16 ++ ++@ Build translate registers ++@ As translate values can only be 0-4 we don't care about junk in the rest ++@ of the register ++ mov r12, #2 ++.if \is_chroma ++ ldr r4, [sp, #16] ++.set sp_base, sp_base + 4 ++.endif ++ vld1.8 {d16[2]}, [r3], r12 ++ vld1.8 {d16[0]}, [r3], r12 ++ vld1.8 {d16[1]}, [r3], r12 ++ vld1.8 {d16[3]}, [r3], r12 ++ vld1.8 {d16[4]}, [r3] ++.if \is_chroma ++ vld1.8 {d17[2]}, [r4], r12 ++ vld1.8 {d17[0]}, [r4], r12 ++ vld1.8 {d17[1]}, [r4], r12 ++ vld1.8 {d17[3]}, [r4], r12 ++ vld1.8 {d17[4]}, [r4] ++.else ++ vmov d17, d16 ++.endif ++ ++@ Setup constant registers ++.if \bit_depth > 8 ++ movw r4, (1 << \bit_depth) - 1 ++.endif ++.if \setup_16b ++.if \bit_depth > 8 ++ vmov.i64 q12, #0 ++ vdup.16 q15, r4 ++.else ++ vmov.u8 q15, #128 ++.endif ++ vmov.u8 q14, #2 ++.endif ++ movw r3, EDGE_SRC_STRIDE ++ ++@ If setup_64b we need the xlat table on the stack and q4-q7 saved ++.if \setup_64b ++ sub r5, sp, #16 ++ vpush {q4-q8} @ 80 bytes, q8 pushed first ++.set sp_base, sp_base + 80 ++.endif ++ ++@ Get jump address ++@ We have a special case for width 4 as the calling code doesn't detect it ++@ If we may 
have w4 then we add a 2nd jump table after the 1st ++.if \check_w4 ++ ldr r12, [sp, #sp_base + 4] @ width ++ cmp r12, #8 ++.endif ++ ldr r12, [sp, #sp_base + 0] @ e0 ++ adr r6, \jump_tab ++.if \check_w4 ++ it lt ++ addlt r6, #16 ++.endif ++ ldr r6, [r6, r12, lsl #2] ++ ++ ldr r12, [sp, #sp_base + 8] @ height ++ ++@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes ++.if \do2 ++ push {r0, r1, r6, r12} ++ blx r6 ++ pop {r0, r1, r6, r12} ++ ++ add r0, #64 ++ add r1, #64 ++.endif ++ ++ blx r6 ++ ++@ Tidy up & return ++.if \setup_64b ++ vpop {q4-q8} @ spurious but harmless load of q8 ++.endif ++ pop {r4-r6, pc} ++.endm ++ ++ ++.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 ++.endm ++ ++.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1 ++.endm ++ ++ ++.macro edge_64b_e0, body_fn, pb ++ mov r6, lr ++ sub r1, #8 ++1: vldm r1, {d7-d16} ++ subs r12, #1 ++ add r1, r3 + // load a -+ vld1.8 {q0-q1}, [r1, :128]! -+ vld1.8 {q2-q3}, [r1, :128], r3 -+ sub r1, #32 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+1: subs r12, #1 ++ vext.8 q0, q3, q4, #(16 - \pb) ++ vext.8 q1, q4, q5, #(16 - \pb) ++ vext.8 q2, q5, q6, #(16 - \pb) ++ vext.8 q3, q6, q7, #(16 - \pb) + // load b -+ vld1.8 {q8-q9}, [r1, :128]! -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ sub r1, #32 -+ bl edge_w64_body ++ vext.8 q11, q7, q8, #\pb @ Avoid overwrite ++ vext.8 q8, q4, q5, #\pb ++ vext.8 q9, q5, q6, #\pb ++ vext.8 q10, q6, q7, #\pb ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_32bx2_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #2 ++ ++ vld1.8 {q4-q5}, [r1] ++ sub r1, #\pb ++ vld1.8 {q0-q1}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {q8-q9}, [r1], r3 ++ sub r1, #\pb ++ vld1.8 {q6-q7}, [r1] ++ sub r1, #\pb ++ vld1.8 {q2-q3}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {q10-q11}, [r1], r3 ++ sub r1, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {q0,q1}, [r0], r2 ++ vst1.8 {q2,q3}, [r0], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_16b_e0, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ sub r3, #\pb * 2 ++ ++1: subs r12, #1 ++ ++ vld1.64 {q0}, [r1] @ load a ++ add r1, #\pb ++ vld1.64 {q1}, [r1, :128] @ load c ++ add r1, #\pb ++ vld1.64 {q2}, [r1], r3 @ load b ++ ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_8bx2_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #2 ++ ++ vld1.8 {d2}, [r1, :64] ++ sub r1, #\pb ++ vld1.8 {d0}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {d4}, [r1], r3 ++ sub r1, #\pb ++ vld1.8 {d3}, [r1, :64] ++ sub r1, #\pb ++ vld1.8 {d1}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {d5}, [r1], r3 ++ sub r1, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #4 ++ ++ vld1.32 {d2[0]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d0[0]}, [r1] ++ add r1, #(\pb * 2) ++ vld1.32 {d4[0]}, [r1], r3 @ R ++ vld1.32 {d4[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d2[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d0[1]}, [r1], r3 @ L ++ vld1.32 {d1[0]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[0]}, [r1] ++ add r1, #\pb ++ vld1.32 {d5[0]}, [r1], r3 @ R ++ vld1.32 {d5[1]}, [r1] ++ sub r1, #(\pb * 2) ++ vld1.32 {d1[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[1]}, [r1], r3 @ M ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], 
r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++ ++.macro edge_64b_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {q0-q1}, [r1, :128]! ++ vld1.8 {q2-q3}, [r1, :128], r3 ++ sub r1, #32 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ sub r1, #32 ++1: subs r12, #1 ++ // load b ++ vld1.8 {q8-q9}, [r1, :128]! ++ vld1.8 {q10-q11}, [r1, :128], r3 ++ sub r1, #32 ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 + // copy c to a -+ vmov.64 q0, q4 -+ vmov.64 q1, q5 -+ vmov.64 q2, q6 -+ vmov.64 q3, q7 ++ vmov.64 q0, q4 ++ vmov.64 q1, q5 ++ vmov.64 q2, q6 ++ vmov.64 q3, q7 + // copy b to c -+ vmov.64 q4, q8 -+ vmov.64 q5, q9 -+ vmov.64 q6, q10 -+ vmov.64 q7, q11 -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++ vmov.64 q4, q8 ++ vmov.64 q5, q9 ++ vmov.64 q6, q10 ++ vmov.64 q7, q11 ++ bgt 1b ++ bx r6 ++.endm + -+function ff_hevc_sao_edge_eo2_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+1: sub r1, r3 ++.macro edge_32bx2_e1, body_fn ++ mov r6, lr ++ sub r1, r3 + // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ sub r1, #1 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #31 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ add r1, #1 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #33 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++ vld1.8 {q0-q1}, [r1, :128], r3 ++ vld1.8 {q4-q5}, [r1, :128], r3 + -+function ff_hevc_sao_edge_eo3_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+1: sub r1, r3 -+ // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ add r1, #1 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #33 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ sub r1, #1 -+ vld1.8 {q8-q9}, [r1]! 
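[Editor's note: the class-1 (vertical) loops avoid reloading rows. Three row windows live in registers and are rotated a <- c, c <- b each iteration, so only the new bottom row is fetched per line. As a scalar outline, reusing sao_edge_pixel_8 from the earlier sketch; the outline function itself is illustrative:]

    static void sao_edge_e1_outline(uint8_t *dst, ptrdiff_t stride_dst,
                                    const uint8_t *src, ptrdiff_t stride_src,
                                    unsigned int width, unsigned int height,
                                    const int8_t xlat[5])
    {
        const uint8_t *a = src - stride_src;    /* row above */
        const uint8_t *c = src;                 /* current row */
        for (unsigned int y = 0; y != height; y++) {
            const uint8_t *b = c + stride_src;  /* only new load per row */
            for (unsigned int x = 0; x != width; x++)
                dst[x] = sao_edge_pixel_8(a[x], c[x], b[x], xlat);
            a = c;                              /* rotate the window */
            c = b;
            dst += stride_dst;
        }
    }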
-+ vld1.8 {q10-q11}, [r1] -+ sub r1, #31 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++1: subs r12, #2 ++ @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vmov q2, q4 ++ vmov q3, q5 ++ vld1.8 {q8-q9}, [r1, :128], r3 ++ vld1.8 {q10-q11}, [r1, :128], r3 ++ vmov q6, q8 ++ vmov q7, q9 + ++ bl \body_fn + -+@ void ff_hevc_sao_edge_c_eo1_w64_neon_8( -+@ uint8_t *_dst, r0 -+@ uint8_t *_src, r1 -+@ ptrdiff_t stride_dst, r2 -+@ ptrdiff_t stride_src, r3 -+@ int height, sp[0] -+@ int16_t *sao_offset_table_u, sp[4] -+@ int16_t *sao_offset_table_v); sp[8] -+@ int eo sp[12] ++ vst1.8 {q0,q1}, [r0], r2 ++ vst1.8 {q2,q3}, [r0], r2 + -+function ff_hevc_sao_edge_c_w64_neon_8, export=1 -+ push {r4-r8,lr} // 6 reg = 24 -+ ldr r5, [sp, #28] // sao_offset_val_table_u -+ ldr r7, [sp, #32] // sao_offset_val_table_v -+ -+ @ Load and rearrange offsets -+ @ Also "convert" from 16bit to 8bit -+ ldrb r4, [r5, #2] -+ ldrb r8, [r5, #4] -+ ldrb r6, [r7, #2] -+ ldrb r12, [r7, #4] -+ orr r4, r4, r8, lsl #8 -+ orr r6, r6, r12, lsl #8 -+ ldrb r8, [r5, #6] -+ ldrb r12, [r7, #6] -+ orr r4, r4, r8, lsl #24 -+ orr r6, r6, r12, lsl #24 -+ ldrb r5, [r5, #8] -+ ldrb r7, [r7, #8] -+ -+ ldr r12, [sp, #36] // e0 -+ adr r8, edge_c_tbl_w64 -+ ldr r8, [r8, r12, lsl #2] -+ -+ ldr r12, [sp, #24] // height -+ vpush {d8-d15} -+ mov pc, r8 -+ -+edge_c_tbl_w64: -+ .word ff_hevc_sao_edge_c_eo0_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo1_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo2_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo3_w64_neon_8 -+ -+ff_hevc_sao_edge_c_eo0_w64_neon_8: -+ sub r1, #8 -+1: subs r12, #1 -+ vld1.64 {d7}, [r1, :64]! -+ vld1.64 {q4-q5}, [r1, :128]! // load c -+ vld1.64 {q6-q7}, [r1, :128]! 
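[Editor's note: a reference point while reading the e0/e1/e2 loaders — the four EO classes differ only in which neighbour pair is compared, with pb the byte step per pixel (1 for luma, 2 for interleaved chroma). A summary table, not code from the patch:]

    /* Neighbour steps per EO class; 'a' sits at -step and 'b' at +step
     * (dx is scaled by pb, dy by the source stride).  Class 3 is class 2
     * with the horizontal component negated, which is why the e3 macros
     * later simply instantiate the e2 macros with -pb. */
    static const struct { int dx, dy; } eo_step[4] = {
        {  1, 0 },  /* 0: horizontal */
        {  0, 1 },  /* 1: vertical */
        {  1, 1 },  /* 2: 135-degree diagonal (a up-left, b down-right) */
        { -1, 1 },  /* 3: 45-degree diagonal  (a up-right, b down-left) */
    };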
-+ vld1.64 {d24}, [r1, :64], r3 -+ sub r1, #72 -+ // load a -+ vext.8 q0, q3, q4, #14 -+ vext.8 q1, q4, q5, #14 -+ vext.8 q2, q5, q6, #14 -+ vext.8 q3, q6, q7, #14 -+ // load b -+ vext.8 q8, q4, q5, #2 -+ vext.8 q9, q5, q6, #2 -+ vext.8 q10, q6, q7, #2 -+ vext.8 q11, q7, q12, #2 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+ -+ff_hevc_sao_edge_c_eo1_w64_neon_8: -+ sub r1, r3 -+ // load a -+ vldm r1, {q0-q3} -+ add r1, r3 -+ // load c -+ vldm r1, {q4-q7} -+ add r1, r3 -+1: subs r12, #1 -+ // load b -+ vldm r1, {q8-q11} -+ add r1, r3 -+ bl edge_w64_body + // copy c to a -+ vmov.64 q0, q4 -+ vmov.64 q1, q5 -+ vmov.64 q2, q6 -+ vmov.64 q3, q7 ++ vmov.64 q0, q8 ++ vmov.64 q1, q9 ++ + // copy b to c -+ vmov.64 q4, q8 -+ vmov.64 q5, q9 -+ vmov.64 q6, q10 -+ vmov.64 q7, q11 -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++ vmov.64 q4, q10 ++ vmov.64 q5, q11 ++ bgt 1b ++ bx r6 ++.endm + -+ff_hevc_sao_edge_c_eo2_w64_neon_8: -+1: sub r1, r3 ++.macro edge_16b_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {q0}, [r1, :128], r3 ++ // load c ++ vld1.8 {q1}, [r1, :128], r3 ++1: subs r12, #1 ++ // load b ++ vld1.8 {q2}, [r1, :128], r3 ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ // copy c to a ++ vmov.64 q0, q1 ++ // copy b to c ++ vmov.64 q1, q2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_8bx2_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {d0}, [r1, :64], r3 ++ vld1.8 {d2}, [r1, :64], r3 ++ ++1: subs r12, #2 ++ @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vmov.64 d1, d2 ++ vld1.8 {d4}, [r1, :64], r3 ++ vld1.8 {d5}, [r1, :64], r3 ++ vmov.64 d3, d4 ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0], r2 ++ vst1.8 {d1}, [r0], r2 ++ ++ // copy c to a ++ vmov.64 d0, d4 ++ // copy b to c ++ vmov.64 d2, d5 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e1, body_fn ++ mov r6, lr ++debug_me: ++ sub r1, r3 ++ // load a ++ vld1.32 {d0[0]}, [r1], r3 ++ vld1.32 {d0[1]}, [r1], r3 ++ ++1: subs r12, #4 ++ @ Given the data duplication here we could probably do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vld1.32 {d4[0]}, [r1], r3 ++ vld1.32 {d4[1]}, [r1], r3 ++ vld1.32 {d5[0]}, [r1], r3 ++ vld1.32 {d5[1]}, [r1], r3 ++ ++ vmov.32 d1, d4 ++ vext.32 d2, d0, d4, #1 ++ vext.32 d3, d4, d5, #1 ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ ++ vmov.32 d0, d5 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_64b_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #32 ++ sub r3, #(32 - \pb) ++ ++1: sub r1, r3 + // load a + // TODO: fix unaligned load + // don't reload a like in eo1 -+ sub r1, #2 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #30 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ add r1, #2 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #34 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++ vld1.8 {q0-q1}, [r1]! ++ vld1.8 {q2-q3}, [r1], r3 ++ subs r12, #1 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ // load b ++ vld1.8 {q8-q9}, [r1]! ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #(64 + \pb) ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bgt 1b + -+ff_hevc_sao_edge_c_eo3_w64_neon_8: -+1: sub r1, r3 -+ // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ add r1, #2 -+ vld1.8 {q0-q1}, [r1]! 
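[Editor's note: the narrower variants pack several rows into one iteration rather than run a vector body half empty — an 8-pixel-wide block is processed two rows at a time (subs r12, #2), a 4-pixel one four rows at a time. A scalar sketch of the row pairing, illustrative only and reusing sao_edge_pixel_8:]

    /* 8-wide blocks: rows y and y+1 share one pass through the body, the
     * two rows occupying d0/d1 of the same q register in the assembler. */
    static void sao_edge_8wide_pair(uint8_t *dst, ptrdiff_t stride_dst,
                                    const uint8_t *a, const uint8_t *c,
                                    const uint8_t *b, ptrdiff_t stride_src,
                                    const int8_t xlat[5])
    {
        for (unsigned int row = 0; row != 2; row++)
            for (unsigned int x = 0; x != 8; x++)
                dst[row * stride_dst + x] =
                    sao_edge_pixel_8(a[row * stride_src + x],
                                     c[row * stride_src + x],
                                     b[row * stride_src + x], xlat);
    }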
-+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #34 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ sub r1, #2 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #30 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc -+ -+ -+.macro init_edge_32 -+ ldr r12, [sp, #4] // sao_offset_val_table -+ vld1.32 {d31}, [r12] -+ ldr r12, [sp] // height ++ add r3, #(32 - \pb) ++ bx r6 +.endm + -+.macro diff out0, tmp0, in0, in1 -+ vcgt.u8 \out0, \in1, \in0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 \tmp0, \in0, \in1 // a > c -> -1 , otherwise 0 -+ vsub.s8 \out0, \tmp0, \out0 // diff0 ++.macro edge_32bx2_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ ++1: sub r1, r3 ++ vld1.8 {q0-q1}, [r1], r3 ++ vld1.8 {q2-q3}, [r1] ++ subs r12, #2 ++ // load c ++ add r1, #\pb ++ vld1.8 {q4-q5}, [r1, :128], r3 ++ vld1.8 {q6-q7}, [r1, :128] ++ // load b ++ add r1, #\pb ++ vld1.8 {q8-q9}, [r1], r3 ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #(\pb * 2) ++ ++ bl \body_fn ++ ++ vst1.8 {q0-q1}, [r0], r2 ++ vst1.8 {q2-q3}, [r0], r2 ++ bgt 1b ++ ++ bx r6 +.endm + -+.macro table32 -+ vmov.s8 q10, #2 -+ vadd.s8 q0, q10 -+ vadd.s8 q1, q10 -+ vmov.s8 q10, #128 -+ vtbl.8 d0, {d31}, d0 -+ vadd.s8 q11, q2, q10 -+ vtbl.8 d1, {d31}, d1 -+ vadd.s8 q12, q3, q10 -+ vtbl.8 d2, {d31}, d2 -+ vqadd.s8 q11, q0 -+ vtbl.8 d3, {d31}, d3 -+ vqadd.s8 q12, q1 -+ vsub.s8 q0, q11, q10 -+ vsub.s8 q1, q12, q10 -+ vst1.8 {q0-q1}, [r0, :128], r2 ++.macro edge_16b_e2, body_fn, pb ++ mov r6, lr ++ add r3, #\pb ++ ++1: sub r1, r3 ++ // load a ++ vld1.8 {q0}, [r1], r3 ++ subs r12, #1 ++ // load c ++ vld1.8 {q1}, [r1, :128], r3 ++ // load b ++ vld1.8 {q2}, [r1] ++ sub r1, #\pb ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ bgt 1b ++ bx r6 +.endm + -+function ff_hevc_sao_edge_eo0_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {q4-q7} -+ sub r1, #4 -+1: subs r12, #1 -+ vld1.8 {q13-q14}, [r1]! 
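[Editor's note: the 4-pixel-wide path goes one step further — a whole 4x4 block fits in one 16-byte register, one 32-bit lane per row, so the loaders gather the needed source rows (six of them, y-1 through y+4, for the diagonal classes) a word at a time before calling the shared body. The gather in scalar form; the helper name is invented:]

    #include <stddef.h>
    #include <string.h>

    /* Pack four 4-pixel rows into one 16-byte "register". */
    static void gather_4x4(uint8_t out[16], const uint8_t *src,
                           ptrdiff_t stride)
    {
        for (int row = 0; row != 4; row++)   /* one 32-bit lane per row */
            memcpy(out + 4 * row, src + row * stride, 4);
    }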
-+ vld1.32 d30, [r1], r3 -+ sub r1, #32 -+ // a -+ vext.8 q0, q13, q14, #3 -+ vext.8 q1, q14, q15, #3 -+ vshr.u64 d24, d30, #24 -+ // c -+ vext.8 q2, q13, q14, #4 -+ vext.8 q3, q14, q15, #4 -+ vshr.u64 d16, d30, #32 -+ // diff0 -+ diff32 q13, q14, q4, q5, q0, q1, q2, q3 -+ diff d18, d25, d24, d16 -+ // -diff1 -+ vext.s8 q0, q13, q14, #1 -+ vext.s8 q1, q14, q9, #1 ++.macro edge_8bx2_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb + -+ vsub.s8 q0, q13, q0 //diff0 + diff1 -+ vsub.s8 q1, q14, q1 -+ table32 -+ bne 1b -+ vpop {q4-q7} ++1: sub r1, r3 ++ vld1.8 {d0}, [r1], r3 ++ vld1.8 {d1}, [r1] ++ subs r12, #2 ++ // load c ++ add r1, #\pb ++ vld1.8 {d2}, [r1, :64], r3 ++ vld1.8 {d3}, [r1, :64] ++ // load b ++ add r1, #\pb ++ vld1.8 {d4}, [r1], r3 ++ vld1.8 {d5}, [r1] ++ sub r1, #(\pb * 2) + -+ bx lr ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0], r2 ++ vst1.8 {d1}, [r0], r2 ++ bgt 1b ++ ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ ++1: sub r1, r3 ++ @ line 0 {d0[0], -, - } r1 lo ++ vld1.32 {d0[0]}, [r1], r3 ++ subs r12, #4 ++ @ Line 1 {d0[1], d2[0], - } r1 lo ++ vld1.32 {d0[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d2[0]}, [r1], r3 ++ @ Line 2 {d1[0], d2[1], d4[0]} r1 mid ++ vld1.32 {d2[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d1[0]}, [r1] ++ add r1, #\pb * 2 ++ vld1.32 {d4[0]}, [r1], r3 ++ @ Line 2 {d1[1], d3[0], d4[1]} r1 hi ++ vld1.32 {d4[1]}, [r1] ++ sub r1, #\pb * 2 ++ vld1.32 {d1[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[0]}, [r1], r3 ++ @ Line 3 {-, d3[1], d5[0]} r1 mid ++ vld1.32 {d3[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d5[0]}, [r1], r3 ++ @ Line 4 {-, -, d5[1]} r1 hi ++ vld1.32 {d5[1]}, [r1] ++ sub r1, #(\pb * 2) ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ bgt 1b ++ ++ bx r6 ++.endm ++ ++.macro edge_64b_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_64b_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_32bx2_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_32bx2_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_16b_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_16b_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_8bx2_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_8bx2_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_4bx4_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_4bx4_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_64b_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_64b_e0 \body_fn, \pb ++10: edge_64b_e1 \body_fn ++20: edge_64b_e2 \body_fn, \pb ++30: edge_64b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_32bx2_e0 \body_fn, \pb ++10: edge_32bx2_e1 \body_fn ++20: edge_32bx2_e2 \body_fn, \pb ++30: edge_32bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_16b_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_32bx2_e0 \body_fn_64b, \pb ++10: edge_32bx2_e1 \body_fn_64b ++20: edge_32bx2_e2 \body_fn_64b, \pb ++30: edge_32bx2_e3 \body_fn_64b, \pb ++5: edge_16b_e0 \body_fn_16b, \pb 
++15: edge_16b_e1 \body_fn_16b ++25: edge_16b_e2 \body_fn_16b, \pb ++35: edge_16b_e3 \body_fn_16b, \pb ++.endm ++ ++.macro edge_16b_8bx2_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++5: edge_8bx2_e0 \body_fn, \pb ++15: edge_8bx2_e1 \body_fn ++25: edge_8bx2_e2 \body_fn, \pb ++35: edge_8bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_8bx2_4bx4_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_8bx2_e0 \body_fn, \pb ++10: edge_8bx2_e1 \body_fn ++20: edge_8bx2_e2 \body_fn, \pb ++30: edge_8bx2_e3 \body_fn, \pb ++5: edge_4bx4_e0 \body_fn, \pb ++15: edge_4bx4_e1 \body_fn ++25: edge_4bx4_e2 \body_fn, \pb ++35: edge_4bx4_e3 \body_fn, \pb ++.endm ++ ++@ void ff_hevc_sao_edge_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_8_neon_8, export=1 ++ edge_16b_init 8, 0, 1, 99f ++99: ++ edge_8bx2_4bx4_bodies edge_16b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo1_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {q4-q7} -+ // load a -+ sub r1, r3 -+ vld1.8 {q0-q1}, [r1, :128], r3 -+ // load c -+ vld1.8 {q2-q3}, [r1, :128], r3 -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a ) -+1: subs r12, #1 -+ // load b -+ vld1.8 {q8-q9}, [r1, :128], r3 -+ diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b ) -+ vadd.s8 q0, q4, q12 //diff0 + diff1 -+ vadd.s8 q1, q5, q13 -+ table32 -+ // CMP ( c, a ) -+ vneg.s8 q12, q4 -+ vneg.s8 q13, q5 -+ // c -+ vmov.64 q2, q8 -+ vmov.64 q3, q9 -+ bne 1b -+ vpop {q4-q7} -+ bx lr ++@ void ff_hevc_sao_edge_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_16_neon_8, export=1 ++ edge_16b_init 8, 0, 0, 99f ++99: ++ edge_16b_bodies edge_16b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo2_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {d8-d15} -+ // load a -+ sub r1, r3 -+ sub r1, #8 -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {d24}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q0, q10, q11, #7 -+ vext.8 q1, q11, q12, #7 -+ // load c -+ vld1.8 {d9}, [r1, :64]! -+ vld1.8 {q2-q3}, [r1, :64], r3 -+ sub r1, #8 -+ vext.8 q4, q4, q2, #15 -+1: subs r12, #1 -+ // load b -+ vld1.8 {q10-q11}, [r1, :64]! 
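[Editor's note: the exported entry points are thin — the init macro loads and permutes the offset table, then indexes one of the inline .word jump tables with eo. When the caller may pass width 4 (check_w4), a second four-entry table sits immediately after the first and the dispatch skips 16 bytes forward. In C terms; the function-pointer shape is a paraphrase, not the actual assembler ABI:]

    typedef void (*edge_body_fn)(void);

    /* Dispatch as performed by edge_xxb_init: 'tab' mirrors the .word
     * table, eo selects the class body, and widths below 8 use the
     * second table ("cmp r12, #8 / addlt r6, #16" in the assembler). */
    static void sao_edge_dispatch(const edge_body_fn tab[8], int eo,
                                  int width, int check_w4)
    {
        const edge_body_fn *t = (check_w4 && width < 8) ? tab + 4 : tab;
        t[eo]();    /* eo in 0..3 */
    }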
-+ vld1.8 {q12}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q8, q10, q11, #9 -+ vext.8 q9, q11, q12, #9 -+ vext.8 q6, q10, q11, #8 -+ vext.8 q7, q11, q12, #8 -+ vext.8 q5, q10, q11, #7 -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 -+ table32 -+ // inputs for next loop iteration -+ // a -+ vmov.8 q0, q4 -+ vext.8 q1, q2, q3, #15 -+ // c -+ vmov.8 q2, q6 -+ vmov.8 q3, q7 -+ vmov.8 q4, q5 -+ bne 1b -+ vpop {d8-d15} -+ bx lr ++@ void ff_hevc_sao_edge_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_32_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo3_w32_neon_8, export=1 -+ init_edge_32 -+ sub r1, r3 -+ // load a -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {d24}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q0, q10, q11, #1 -+ vext.8 q1, q11, q12, #1 -+ // load c -+ vld1.8 {q2-q3}, [r1, :64]! -+ vld1.8 {d30}, [r1, :64], r3 -+ sub r1, #40 -+1: subs r12, #1 -+ // load b -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {q12}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q8, q10, q11, #7 -+ vext.8 q9, q11, q12, #7 -+ vext.8 q14, q12, q10, #7 ++@ void ff_hevc_sao_edge_64_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] + -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 ++function ff_hevc_sao_edge_64_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 1 ++endfunc + -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 -+ table32 ++@ ff_hevc_sao_edge_c_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] + -+ // inputs for next loop iteration -+ // a -+ vext.8 q0, q2, q3, #1 -+ vext.8 q1, q3, q15, #1 -+ // c -+ vext.8 q2, q8, q9, #1 -+ vext.8 q3, q9, q14, #1 -+ vext.8 d30, d28, d2, #1 -+ bne 1b -+ bx lr ++function ff_hevc_sao_edge_c_8_neon_8, export=1 ++ edge_16b_init 8, 1, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_8, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_16_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_32_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ void ff_hevc_sao_edge_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, 
#8] ++ ++function ff_hevc_sao_edge_8_neon_10, export=1 ++ edge_16b_init 10, 0, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_sao_edge_16_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_16_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_sao_edge_64_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++@ We simply split the 32 case into 2 vertical stripes ++@ and call the fns for w32 ++@ ++@ Calling code will always have src != dst so we don't have to worry ++@ about edge effects ++ ++function ff_hevc_sao_edge_64_neon_10, export=1 ++ edge_64b_init 10, 0, 1, 99f ++endfunc ++ ++@ void ff_hevc_sao_edge_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_32_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_8_neon_10, export=1 ++ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 ++99: ++ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4 ++endfunc ++ ++@ ff_hevc_sao_edge_c_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_32_neon_10, export=1 ++ edge_64b_init 10, 1, 1, 99f ++endfunc ++ ++ ++@ ff_hevc_sao_edge_c_16_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_16_neon_10, export=1 ++ edge_64b_init 10, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 4 +endfunc + diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h -index 57334df..7648294 100644 +index 57334df3fc..7648294965 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -443,6 +443,8 @@ enum AVCodecID { @@ -3692,7 +6969,7 @@ index 57334df..7648294 100644 * discarded by the caller from the end of the stream to get the original * audio without any trailing padding. 
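[Editor's note, before the C-side changes: the 10-bit width-64 and chroma width-32 entry points above deliberately have no bodies of their own — edge_xxb_init's do2 path runs the shared 64-byte body twice, 64 bytes apart, because those rows are 128 bytes wide. A sketch under that assumption; the body signature is invented:]

    /* do2: one 64-byte stripe, step forward 64 bytes, second stripe
     * (the push/pop of r0/r1 around the first blx in the assembler). */
    static void sao_edge_do2(uint8_t *dst, const uint8_t *src, int height,
                             void (*body)(uint8_t *dst, const uint8_t *src,
                                          int height))
    {
        body(dst, src, height);             /* left stripe */
        body(dst + 64, src + 64, height);   /* right stripe */
    }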
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h -index 1bf1c62..ccfa991 100644 +index 1bf1c620d6..ccfa991f60 100644 --- a/libavcodec/cabac.h +++ b/libavcodec/cabac.h @@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; @@ -3712,7 +6989,7 @@ index 1bf1c62..ccfa991 100644 const uint8_t *bytestream; const uint8_t *bytestream_end; diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c -index 9711019..9f99a2c 100644 +index 9711019e9d..9f99a2c927 100644 --- a/libavcodec/codec_desc.c +++ b/libavcodec/codec_desc.c @@ -1622,6 +1622,48 @@ static const AVCodecDescriptor codec_descriptors[] = { @@ -3765,7 +7042,7 @@ index 9711019..9f99a2c 100644 /* various PCM "codecs" */ { diff --git a/libavcodec/h264.h b/libavcodec/h264.h -index 86df5eb..22c4f1d 100644 +index 86df5eb9b3..22c4f1d82a 100644 --- a/libavcodec/h264.h +++ b/libavcodec/h264.h @@ -41,7 +41,9 @@ enum { @@ -3779,7 +7056,7 @@ index 86df5eb..22c4f1d 100644 #endif /* AVCODEC_H264_H */ diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c -index bc35a61..055828c 100644 +index 2564c6c6c3..f939fa3fc9 100644 --- a/libavcodec/h264_parser.c +++ b/libavcodec/h264_parser.c @@ -60,6 +60,8 @@ typedef struct H264ParseContext { @@ -3876,10 +7153,10 @@ index bc35a61..055828c 100644 + .split = h264_split, +}; diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h -index de77d2a..a63db2b 100644 +index de77d2ac43..2568fd88b3 100644 --- a/libavcodec/hevc.h +++ b/libavcodec/hevc.h -@@ -21,6 +21,45 @@ +@@ -21,6 +21,47 @@ #ifndef AVCODEC_HEVC_H #define AVCODEC_HEVC_H @@ -3895,8 +7172,6 @@ index de77d2a..a63db2b 100644 + #include "rpi_qpu.h" + #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU + -+ // Define RPI_WORKER to launch a worker thread for pixel processing tasks -+ #define RPI_WORKER + // By passing jobs to a worker thread we hope to be able to catch up during slow frames + // This has no effect unless RPI_WORKER is defined + // N.B. 
The extra thread count is effectively RPI_MAX_JOBS - 1 as @@ -3919,6 +7194,10 @@ index de77d2a..a63db2b 100644 + #define RPI_HEVC_SAND 0 + #endif + ++ ++ #define RPI_QPU_EMU_Y 0 ++ #define RPI_QPU_EMU_C 0 ++ + #define RPI_TSTATS 0 +#endif + @@ -3926,7 +7205,7 @@ index de77d2a..a63db2b 100644 * Table 7-3: NAL unit type codes */ diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c -index e27c54e..09727d9 100644 +index e27c54ed4b..925bccd188 100644 --- a/libavcodec/hevc_cabac.c +++ b/libavcodec/hevc_cabac.c @@ -21,6 +21,8 @@ @@ -3943,7 +7222,7 @@ index e27c54e..09727d9 100644 #include "hevcdec.h" +#ifdef RPI -+#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" +#endif + +// BY22 is probably faster than simple bypass if the processor has @@ -4288,7 +7567,7 @@ index e27c54e..09727d9 100644 { return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); } -@@ -968,90 +1229,395 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, +@@ -968,90 +1229,470 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); } @@ -4301,7 +7580,7 @@ index e27c54e..09727d9 100644 + +#ifndef coeff_abs_level_remaining_decode_bypass +static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param) -+{ + { + CABACContext * const c = &s->HEVClc->cc; + uint32_t y; + unsigned int prefix; @@ -4342,7 +7621,7 @@ index e27c54e..09727d9 100644 +#endif + +static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param) - { ++{ + CABACContext * const c = &s->HEVClc->cc; int prefix = 0; int suffix = 0; @@ -4418,7 +7697,7 @@ index e27c54e..09727d9 100644 + rv = (rv << 1) | b; + } + return rv; -+} + } +#endif + + @@ -4502,7 +7781,7 @@ index e27c54e..09727d9 100644 + (*stat_coeff)++; + else if (x == 0 && *stat_coeff > 0) + (*stat_coeff)--; - } ++} +#endif + + @@ -4559,22 +7838,21 @@ index e27c54e..09727d9 100644 + int * const pPrev_sig) +{ + while (--i >= 0) { -+ unsigned int x_cg = scan_x_cg[i]; -+ unsigned int y_cg = scan_y_cg[i]; ++ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag; ++ const unsigned int x_cg = scan_x_cg[i]; + + // For the flag decode we only care about Z/NZ but -+ // we use the full Right + Down * 2 when calculating -+ // significant coeff flags so we obtain it here -+ //. ++ // we use the full Right * 2 + Down when calculating ++ // significant coeff flags so we obtain it here. ++ // + // The group flag array is one longer than it needs to + // be so we don't need to check for y_cg limits -+ unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) | -+ (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1); ++ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); + + if (i == 0 || + significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig)) + { -+ significant_coeff_group_flag[y_cg] |= (1 << x_cg); ++ gf_y[0] |= (1 << x_cg); + *pPrev_sig = prev_sig; + break; + } @@ -4592,31 +7870,46 @@ index e27c54e..09727d9 100644 + unsigned int stride = frame->linesize[c_idx]; + unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; + unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; -+ const int is_sliced = rpi_sliced_frame(frame); ++ const int is_sliced = av_rpi_is_sand_frame(frame); + uint8_t * dst = !is_sliced ? + s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : + c_idx == 0 ? 
-+ rpi_sliced_frame_pos_y(frame, x, y) : -+ rpi_sliced_frame_pos_c(frame, x, y); ++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); + + if (s->enable_rpi) { -+ const unsigned int i = s->num_pred_cmds[s->pass0_job]; -+ HEVCPredCmd * const pc = s->univ_pred_cmds[s->pass0_job] + i - 1; ++ const unsigned int i = s->jb0->intra.n; ++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; + + if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && + pc->ta.dst == dst) + { -+ av_assert0(pc->size == log2_trafo_size && ++ av_assert1(pc->size == log2_trafo_size && + pc->c_idx == 1 && -+ pc->ta.buf + (1 << (log2_trafo_size * 2)) && + pc->ta.stride == stride); + + pc->type = RPI_PRED_ADD_RESIDUAL_C; + } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride); ++ ++ // Rewrite as add residual - must rewrite all fields as different union member ++ pc->type = RPI_PRED_ADD_RESIDUAL_V; ++ pc->c_idx = c_idx; ++ pc->ta.buf = coeffs; ++ pc->ta.dst = dst; ++ pc->ta.stride = stride; ++ pc->ta.dc = dc; ++ } + else + { + HEVCPredCmd * const cmd = pc + 1; -+ s->num_pred_cmds[s->pass0_job] = i + 1; ++ s->jb0->intra.n = i + 1; + + cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); + cmd->size = log2_trafo_size; @@ -4624,20 +7917,81 @@ index e27c54e..09727d9 100644 + cmd->ta.buf = coeffs; + cmd->ta.dst = dst; + cmd->ta.stride = stride; ++ cmd->ta.dc = 0; + } + } + else if (!is_sliced || c_idx == 0) { + s->hevcdsp.add_residual[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); + } +#if RPI_HEVC_SAND ++ // * These should probably never happen + else if (c_idx == 1) { -+ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); + } + else { -+ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); + } +#endif +} ++ ++ ++static void rpi_add_dc(HEVCContext * const s, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ const unsigned int stride = frame->linesize[c_idx]; ++ const unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; ++ const unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; ++ const int is_sliced = av_rpi_is_sand_frame(frame); ++ uint8_t * const dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? 
++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); ++ ++ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); ++ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); ++ ++ if (s->enable_rpi) { ++ const unsigned int i = s->jb0->intra.n; ++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->ta.dc = (int16_t)coeff; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride && ++ (pc->dc.dc & ~0xffff) == 0); ++ ++ pc->dc.dc |= (coeff << 16); ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ s->jb0->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_DC + c_idx; ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->dc.dst = dst; ++ cmd->dc.stride = stride; ++ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; ++ } ++ } ++} ++ ++ +#endif void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, @@ -4680,6 +8034,7 @@ index e27c54e..09727d9 100644 +#endif +#ifdef RPI + int use_vpu; ++ int use_dc = 0; +#endif + int16_t *coeffs; + uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero @@ -4701,7 +8056,6 @@ index e27c54e..09727d9 100644 + const int c_idx_nz = (c_idx != 0); + + int may_hide_sign; -+ // Derive QP for dequant if (!lc->cu.cu_transquant_bypass_flag) { @@ -4710,7 +8064,7 @@ index e27c54e..09727d9 100644 static const uint8_t rem6[51 + 4 * 6 + 1] = { 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, -@@ -1067,9 +1633,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1067,9 +1708,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, }; int qp_y = lc->qp_y; @@ -4731,7 +8085,7 @@ index e27c54e..09727d9 100644 } if (c_idx == 0) { -@@ -1102,39 +1678,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1102,39 +1753,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, qp += s->ps.sps->qp_bd_offset; } @@ -4822,7 +8176,7 @@ index e27c54e..09727d9 100644 &last_significant_coeff_x, &last_significant_coeff_y); if (last_significant_coeff_x > 3) { -@@ -1162,119 +1775,134 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1162,119 +1850,147 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int last_x_c = last_significant_coeff_x & 3; int last_y_c = last_significant_coeff_y & 3; @@ -4879,53 +8233,35 @@ index e27c54e..09727d9 100644 - for (i = num_last_subset; i >= 0; i--) { - int n, m; - int x_cg, y_cg, x_c, y_c, pos; -+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant -+ -+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; -+ -+ { -+ const unsigned int ccount = 1 << (log2_trafo_size * 2); -+#ifdef RPI -+ use_vpu = 0; -+ if (s->enable_rpi) { -+ use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4; -+ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 
0 : log2_trafo_size - 2, ccount); -+#if HAVE_NEON -+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); -+#else -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+#endif -+ } -+ else -+#endif -+ { -+ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+ } -+ } -+ -+ i = num_last_subset; -+ do { - int implicit_non_zero_coeff = 0; +- int implicit_non_zero_coeff = 0; - int64_t trans_coeff_level; - int prev_sig = 0; - int offset = i << 4; - int rice_init = 0; -+ int n_end; - - uint8_t significant_coeff_flag_idx[16]; -- uint8_t nb_significant_coeff_flag = 0; - +- uint8_t significant_coeff_flag_idx[16]; +- uint8_t nb_significant_coeff_flag = 0; ++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant + - x_cg = scan_x_cg[i]; - y_cg = scan_y_cg[i]; -- ++ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; + - if ((i < num_last_subset) && (i > 0)) { - int ctx_cg = 0; - if (x_cg < (1 << (log2_trafo_size - 2)) - 1) - ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg]; - if (y_cg < (1 << (log2_trafo_size - 2)) - 1) - ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1]; -- ++ { ++ const unsigned int ccount = 1 << (log2_trafo_size * 2); ++#ifdef RPI ++ use_vpu = 0; ++ if (s->enable_rpi) { ++ const int special = trans_skip_or_bypass || lc->tu.cross_pf; // These need special processinmg ++ use_dc = (num_coeff == 1) && !special && ++ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2); + - significant_coeff_group_flag[x_cg][y_cg] = - significant_coeff_group_flag_decode(s, c_idx, ctx_cg); - implicit_non_zero_coeff = 1; @@ -4933,9 +8269,37 @@ index e27c54e..09727d9 100644 - significant_coeff_group_flag[x_cg][y_cg] = - ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) || - (x_cg == 0 && y_cg == 0)); -- } -- ++ if (use_dc) { ++ // Just need a little empty space ++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++ // No need to clear ++ } ++ else ++ { ++ use_vpu = !special && log2_trafo_size >= 4; ++ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); ++#if HAVE_NEON ++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); ++#else ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++#endif ++ } + } ++ else ++#endif ++ { ++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++ } ++ } + - last_scan_pos = num_coeff - offset - 1; ++ i = num_last_subset; ++ do { ++ int implicit_non_zero_coeff = 0; ++ int n_end; ++ ++ uint8_t significant_coeff_flag_idx[16]; + unsigned int nb_significant_coeff_flag = 0; if (i == num_last_subset) { @@ -4967,23 +8331,24 @@ index e27c54e..09727d9 100644 + H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 + V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 + }; ++ // N.B. 
prev_sig = Right * 2 + Down + static const uint8_t ctx_idx_maps[3][4][16] = { + { + D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + }, + { + H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + }, + { + V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + } }; @@ -5021,7 +8386,7 @@ index e27c54e..09727d9 100644 if (log2_trafo_size == 3) { scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; } else { -@@ -1288,34 +1916,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1288,34 +2004,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } } @@ -5070,11 +8435,12 @@ index e27c54e..09727d9 100644 significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; nb_significant_coeff_flag++; } -@@ -1325,141 +1949,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1325,141 +2037,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } - n_end = nb_significant_coeff_flag; +- + if (nb_significant_coeff_flag != 0) { + const unsigned int gt1_idx_delta = (c_idx_nz << 2) | + ((i != 0 && !c_idx_nz) ? 2 : 0) | @@ -5122,9 +8488,6 @@ index e27c54e..09727d9 100644 + coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2); + } -+ // Probably not worth the overhead of starting by22 for just one value -+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); - - if (n_end) { - int first_nz_pos_in_cg; - int last_nz_pos_in_cg; @@ -5135,6 +8498,9 @@ index e27c54e..09727d9 100644 - int sum_abs = 0; - int sign_hidden; - int sb_type; ++ // Probably not worth the overhead of starting by22 for just one value ++ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); + + if (coded_val) + { + if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { @@ -5145,18 +8511,13 @@ index e27c54e..09727d9 100644 + const unsigned int c_rice_param = *stat_coeff >> 2; + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); +- // initialize first elem of coeff_bas_level_greater1_flag +- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; + trans_coeff_level = 3 + last_coeff_abs_level_remaining; + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + } + } -- // initialize first elem of coeff_bas_level_greater1_flag -- int ctx_set = (i > 0 && c_idx == 0) ? 
2 : 0; -+ { -+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; -+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; -+ const unsigned int scale_m = blk_scale[xy_off->scale]; - - if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { - if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) - sb_type = 2 * (c_idx == 0 ? 1 : 0); @@ -5164,7 +8525,11 @@ index e27c54e..09727d9 100644 - sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1; - c_rice_param = lc->stat_coeff[sb_type] / 4; - } -- ++ { ++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; ++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; ++ const unsigned int scale_m = blk_scale[xy_off->scale]; + - if (!(i == num_last_subset) && greater1_ctx == 0) - ctx_set++; - greater1_ctx = 1; @@ -5246,9 +8611,6 @@ index e27c54e..09727d9 100644 + { + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param); + const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; -+ -+ sum_abs += last_coeff_abs_level_remaining + 1; -+ *level = trans_coeff_level; - for (m = 0; m < n_end; m++) { - n = significant_coeff_flag_idx[m]; @@ -5269,6 +8631,9 @@ index e27c54e..09727d9 100644 - if (lc->stat_coeff[sb_type] > 0) - lc->stat_coeff[sb_type]--; - rice_init = 1; ++ sum_abs += last_coeff_abs_level_remaining + 1; ++ *level = trans_coeff_level; ++ + if (stat_coeff != NULL) + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + stat_coeff = NULL; @@ -5373,7 +8738,7 @@ index e27c54e..09727d9 100644 if (lc->cu.cu_transquant_bypass_flag) { if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1469,7 +2137,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1469,7 +2225,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); } } else { @@ -5382,7 +8747,7 @@ index e27c54e..09727d9 100644 int rot = s->ps.sps->transform_skip_rotation_enabled_flag && log2_trafo_size == 2 && lc->cu.pred_mode == MODE_INTRA; -@@ -1489,7 +2157,13 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1489,10 +2245,23 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { s->hevcdsp.transform_4x4_luma(coeffs); @@ -5396,13 +8761,27 @@ index e27c54e..09727d9 100644 + { int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); if (max_xy == 0) - s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); -@@ -1512,7 +2186,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); ++ { ++#ifdef RPI ++ if (use_dc) ++ rpi_add_dc(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ else ++#endif ++ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); ++ } + else { + int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; + if (max_xy < 4) +@@ -1512,7 +2281,14 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); } } +#ifdef RPI -+ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ if (!use_dc) ++ { ++ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ } +#else s->hevcdsp.add_residual[log2_trafo_size-2](dst, coeffs, stride); +#endif @@ -5410,7 +8789,7 @@ index e27c54e..09727d9 100644 void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int 
log2_cb_size) diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c -index 14e7c8d..0256b01 100644 +index b53f4cc721..b56f4d20f6 100644 --- a/libavcodec/hevc_filter.c +++ b/libavcodec/hevc_filter.c @@ -22,6 +22,12 @@ @@ -5426,26 +8805,31 @@ index 14e7c8d..0256b01 100644 #include "libavutil/common.h" #include "libavutil/internal.h" -@@ -30,6 +36,11 @@ +@@ -30,6 +36,16 @@ #include "bit_depth_template.c" +#ifdef RPI +#include "rpi_qpu.h" ++#endif ++#if RPI_HEVC_SAND +#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" ++#else ++#define RPI_ZC_SAND_8_IN_10_BUF 0 +#endif + #define LUMA 0 #define CB 1 #define CR 2 -@@ -138,6 +149,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) +@@ -138,6 +154,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; } +static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) +{ -+#ifdef RPI -+ return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift; ++#if RPI_HEVC_SAND ++ return c_idx != 0 && av_rpi_is_sand_frame(s->frame) ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; +#else + return s->ps.sps->pixel_shift; +#endif @@ -5454,7 +8838,75 @@ index 14e7c8d..0256b01 100644 static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, ptrdiff_t stride_dst, ptrdiff_t stride_src) { -@@ -192,7 +212,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, +@@ -160,12 +185,21 @@ int i, j; + } + } + ++// "DSP" these? + static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) + { +- if (pixel_shift) +- *(uint16_t *)dst = *(uint16_t *)src; +- else +- *dst = *src; ++ switch (pixel_shift) ++ { ++ case 2: ++ *(uint32_t *)dst = *(uint32_t *)src; ++ break; ++ case 1: ++ *(uint16_t *)dst = *(uint16_t *)src; ++ break; ++ default: ++ *dst = *src; ++ break; ++ } + } + + static void copy_vert(uint8_t *dst, const uint8_t *src, +@@ -173,18 +207,29 @@ static void copy_vert(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride_dst, ptrdiff_t stride_src) + { + int i; +- if (pixel_shift == 0) { +- for (i = 0; i < height; i++) { +- *dst = *src; +- dst += stride_dst; +- src += stride_src; +- } +- } else { +- for (i = 0; i < height; i++) { +- *(uint16_t *)dst = *(uint16_t *)src; +- dst += stride_dst; +- src += stride_src; +- } ++ switch (pixel_shift) ++ { ++ case 2: ++ for (i = 0; i < height; i++) { ++ *(uint32_t *)dst = *(uint32_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ case 1: ++ for (i = 0; i < height; i++) { ++ *(uint16_t *)dst = *(uint16_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ default: ++ for (i = 0; i < height; i++) { ++ *dst = *src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; + } + } + +@@ -192,7 +237,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, ptrdiff_t stride_src, int x, int y, int width, int height, int c_idx, int x_ctb, int y_ctb) { @@ -5463,7 +8915,7 @@ index 14e7c8d..0256b01 100644 int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; -@@ -223,13 +243,14 @@ static void restore_tqb_pixels(HEVCContext *s, +@@ -223,13 +268,14 @@ static void restore_tqb_pixels(HEVCContext *s, int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); @@ -5481,21 +8933,27 @@ index 14e7c8d..0256b01 100644 for (n = 0; n < (min_pu_size >> vshift); n++) { 
memcpy(src, dst, len); src += stride_src; -@@ -245,7 +266,7 @@ static void restore_tqb_pixels(HEVCContext *s, +@@ -245,7 +291,13 @@ static void restore_tqb_pixels(HEVCContext *s, static void sao_filter_CTB(HEVCContext *s, int x, int y) { - static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; ++#if SAO_FILTER_N == 5 + static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#elif SAO_FILTER_N == 6 ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#else ++#error Confused by size of sao fn array ++#endif HEVCLocalContext *lc = s->HEVClc; int c_idx; int edges[4]; // 0 left 1 top 2 right 3 bottom -@@ -266,12 +287,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -266,12 +318,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) uint8_t right_tile_edge = 0; uint8_t up_tile_edge = 0; uint8_t bottom_tile_edge = 0; -+#ifdef RPI -+ const int sliced = rpi_sliced_frame(s->frame); ++#if RPI_HEVC_SAND ++ const int sliced = av_rpi_is_sand_frame(s->frame); + const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); +#else + const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1); @@ -5513,7 +8971,7 @@ index 14e7c8d..0256b01 100644 if (restore) { if (!edges[0]) { left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -@@ -303,7 +334,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -303,7 +365,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) } } @@ -5522,7 +8980,7 @@ index 14e7c8d..0256b01 100644 int x0 = x >> s->ps.sps->hshift[c_idx]; int y0 = y >> s->ps.sps->vshift[c_idx]; ptrdiff_t stride_src = s->frame->linesize[c_idx]; -@@ -312,28 +343,82 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -312,28 +374,84 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; @@ -5530,24 +8988,24 @@ index 14e7c8d..0256b01 100644 ptrdiff_t stride_dst; uint8_t *dst; -+#ifdef RPI -+ const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift; ++#if RPI_HEVC_SAND ++ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); + const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; + uint8_t * const src = !sliced ? -+ &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] : ++ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : + c_idx == 0 ? -+ rpi_sliced_frame_pos_y(s->frame, x0, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0, y0); + const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : + !sliced ? src - (1 << sh) : + c_idx == 0 ? -+ rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); + const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : + !sliced ? src + (width << sh) : + c_idx == 0 ? 
-+ rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0 + width, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); + + + if (sliced && c_idx > 1) { @@ -5578,7 +9036,7 @@ index 14e7c8d..0256b01 100644 + dst = lc->edge_emu_buffer; + stride_dst = 2*MAX_PB_SIZE; + copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { + s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, @@ -5599,9 +9057,11 @@ index 14e7c8d..0256b01 100644 - s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, - sao->offset_val[c_idx], sao->band_position[c_idx], - width, height); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { ++// printf("x,y=%d,%d data[1]=%p, src=%p\n", x0, y0, s->frame->data[1], src); ++ + s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, + sao->offset_val[1], sao->band_position[1], + sao->offset_val[2], sao->band_position[2], @@ -5617,7 +9077,7 @@ index 14e7c8d..0256b01 100644 } sao->type_idx[c_idx] = SAO_APPLIED; break; -@@ -341,108 +426,117 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -341,108 +459,118 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) { int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; @@ -5756,7 +9216,7 @@ index 14e7c8d..0256b01 100644 - vert_edge, - horiz_edge, - diag_edge); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { + // Class always the same for both U & V (which is just as well :-)) @@ -5786,18 +9246,42 @@ index 14e7c8d..0256b01 100644 + horiz_edge, + diag_edge); + } ++ // ??? Does this actually work for chroma ??? restore_tqb_pixels(s, src, dst, stride_src, stride_dst, x, y, width, height, c_idx); sao->type_idx[c_idx] = SAO_APPLIED; -@@ -452,6 +546,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -450,8 +578,30 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + } + } } ++ ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && ++ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2])) ++ { ++ const unsigned int stride1 = s->frame->linesize[0]; ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); ++ const unsigned int xoff = (x >> 8) * stride2 * stride1; ++ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); ++ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; ++ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; ++ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; ++ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; ++ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); ++ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y; ++ ++// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); ++ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); ++ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); ++ } ++#endif } +// Returns 2 or 0. 
static int get_pcm(HEVCContext *s, int x, int y) { int log2_min_pu_size = s->ps.sps->log2_min_pu_size; -@@ -478,7 +573,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -478,7 +628,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) uint8_t *src; int x, y; int chroma, beta; @@ -5806,7 +9290,7 @@ index 14e7c8d..0256b01 100644 uint8_t no_p[2] = { 0 }; uint8_t no_q[2] = { 0 }; -@@ -495,6 +590,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -495,6 +645,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->ps.sps->pcm.loop_filter_disable_flag) || s->ps.pps->transquant_bypass_enable_flag; @@ -5822,7 +9306,7 @@ index 14e7c8d..0256b01 100644 if (x0) { left_tc_offset = s->deblock[ctb - 1].tc_offset; left_beta_offset = s->deblock[ctb - 1].beta_offset; -@@ -528,19 +632,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -528,19 +687,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; @@ -5840,14 +9324,14 @@ index 14e7c8d..0256b01 100644 - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); + } -+#ifdef RPI -+ if (rpi_sliced_frame(s->frame)) { ++#if RPI_HEVC_SAND ++ if (av_rpi_is_sand_frame(s->frame)) { + + // This copes properly with no_p/no_q -+ s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y), ++ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), + s->frame->linesize[LUMA], + beta, tc, no_p, no_q, -+ rpi_sliced_frame_pos_y(s->frame, x - 4, y)); ++ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); + } + else +#endif @@ -5882,21 +9366,21 @@ index 14e7c8d..0256b01 100644 } } -@@ -560,7 +696,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -560,7 +751,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; - src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_y(s->frame, x, y) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? ++ av_rpi_sand_frame_pos_y(s->frame, x, y) : +#endif + &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x, y - 1); no_p[1] = get_pcm(s, x + 4, y - 1); -@@ -570,6 +711,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -570,6 +766,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[LUMA], beta, tc, no_p, no_q); } else @@ -5916,17 +9400,19 @@ index 14e7c8d..0256b01 100644 s->hevcdsp.hevc_h_loop_filter_luma(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q); -@@ -578,6 +732,91 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -578,6 +787,96 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } if (s->ps.sps->chroma_format_idc) { -+#ifdef RPI -+ if (rpi_sliced_frame(s->frame)) { ++#if RPI_HEVC_SAND ++ if (av_rpi_is_sand_frame(s->frame)) { + const int v = 2; + const int h = 2; + + // vertical filtering chroma + for (y = y0; y < y_end; y += 8 * v) { ++// const int demi_y = y + 4 * v >= s->ps.sps->height; ++ const int demi_y = 0; + for (x = x0 ? 
x0 : 8 * h; x < x_end; x += 8 * h) { + const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; + const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; @@ -5934,7 +9420,7 @@ index 14e7c8d..0256b01 100644 + if ((bs0 == 2) || (bs1 == 2)) { + const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; + const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; -+ unsigned int no_f = 0; ++ unsigned int no_f = !demi_y ? 0 : 2 | 8; + + // tc_offset here should be set to cur_tc_offset I think + const uint32_t tc4 = @@ -5954,10 +9440,10 @@ index 14e7c8d..0256b01 100644 + continue; + } + -+ s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + s->frame->linesize[1], + tc4, -+ rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), + no_f); + } + } @@ -5972,6 +9458,9 @@ index 14e7c8d..0256b01 100644 + x_end2 = x_end - 8 * h; + + for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { ++// const int demi_x = x + 4 * v >= s->ps.sps->width; ++ const int demi_x = 0; ++ + const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; + const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; + if ((bs0 == 2) || (bs1 == 2)) { @@ -5980,7 +9469,7 @@ index 14e7c8d..0256b01 100644 + const uint32_t tc4 = + ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | + ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); -+ unsigned int no_f = 0; ++ unsigned int no_f = !demi_x ? 0 : 2 | 8; + + if (tc4 == 0) + continue; @@ -5996,7 +9485,7 @@ index 14e7c8d..0256b01 100644 + continue; + } + -+ s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + s->frame->linesize[1], + tc4, no_f); + } @@ -6008,21 +9497,21 @@ index 14e7c8d..0256b01 100644 for (chroma = 1; chroma <= 2; chroma++) { int h = 1 << s->ps.sps->hshift[chroma]; int v = 1 << s->ps.sps->vshift[chroma]; -@@ -594,7 +833,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -594,7 +893,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0; c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0; - src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? 
++ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +#endif + &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x - 1, y); no_p[1] = get_pcm(s, x - 1, y + (4 * v)); -@@ -604,9 +848,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -604,9 +908,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -6046,21 +9535,21 @@ index 14e7c8d..0256b01 100644 } } -@@ -627,7 +885,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -627,7 +945,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0; c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0; - src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? ++ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +#endif + &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x, y - 1); no_p[1] = get_pcm(s, x + (4 * h), y - 1); -@@ -637,6 +900,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -637,6 +960,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -6080,7 +9569,7 @@ index 14e7c8d..0256b01 100644 s->hevcdsp.hevc_h_loop_filter_chroma(src, s->frame->linesize[chroma], c_tc, no_p, no_q); -@@ -647,69 +923,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -647,69 +983,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } } @@ -6150,7 +9639,7 @@ index 14e7c8d..0256b01 100644 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_trafo_size) -@@ -720,10 +933,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -720,10 +993,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_min_tu_size = s->ps.sps->log2_min_tb_size; int min_pu_width = s->ps.sps->min_pu_width; int min_tu_width = s->ps.sps->min_tb_width; @@ -6176,7 +9665,7 @@ index 14e7c8d..0256b01 100644 boundary_upper = y0 > 0 && !(y0 & 7); if (boundary_upper && -@@ -735,34 +960,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -735,34 +1020,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_upper = 0; @@ -6253,7 +9742,7 @@ index 14e7c8d..0256b01 100644 boundary_left = x0 > 0 && !(x0 & 7); if (boundary_left && ((!s->sh.slice_loop_filter_across_slices_enabled_flag && -@@ -773,64 +1020,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -773,64 +1080,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_left = 0; @@ -6356,7 +9845,7 @@ index 14e7c8d..0256b01 100644 } } } -@@ 
-839,11 +1076,105 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -839,11 +1136,105 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, #undef CB #undef CR @@ -6425,7 +9914,7 @@ index 14e7c8d..0256b01 100644 + // Call VPU + { + const vpu_qpu_job_h vqj = vpu_qpu_job_new(); -+ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands ++ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(s->ps.sps->bit_depth), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands + vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id); + vpu_qpu_job_finish(vqj); + } @@ -6462,61 +9951,167 @@ index 14e7c8d..0256b01 100644 if (s->ps.sps->sao_enabled) { int y_end = y >= s->ps.sps->height - ctb_size; if (y && x) -@@ -852,16 +1183,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +@@ -852,16 +1243,45 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) sao_filter_CTB(s, x - ctb_size, y); if (y && x_end) { sao_filter_CTB(s, x, y - ctb_size); - if (s->threads_type & FF_THREAD_FRAME ) +- ff_thread_report_progress(&s->ref->tf, y, 0); + if (s->threads_type == FF_THREAD_FRAME ) { +#if RPI_INTER + rpi_flush_ref_frame_progress(s,&s->ref->tf, y); +#endif - ff_thread_report_progress(&s->ref->tf, y, 0); ++ ff_hevc_progress_signal_recon(s, y); + } } if (x_end && y_end) { sao_filter_CTB(s, x , y); - if (s->threads_type & FF_THREAD_FRAME ) +- ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); + if (s->threads_type == FF_THREAD_FRAME ) { +#if RPI_INTER + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size); +#endif - ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); ++ ff_hevc_progress_signal_recon(s, y + ctb_size); + } -+ } + } +- } else if (s->threads_type & FF_THREAD_FRAME && x_end) +- ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); + } else if (s->threads_type == FF_THREAD_FRAME && x_end) { + //int newh = y + ctb_size - 4; + //int currh = s->ref->tf.progress->data[0]; + //if (((y + ctb_size)&63)==0) +#ifdef RPI_DEBLOCK_VPU + if (s->enable_rpi_deblock) { -+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi -+ if (done_deblock) { -+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); -+ } ++ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi ++ if (done_deblock) { ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); ++ } + } else { +#if RPI_INTER -+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +#endif -+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); - } -- } else if (s->threads_type & FF_THREAD_FRAME && x_end) ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); ++ } +#else +#if RPI_INTER + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); -+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi +#endif - ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); +#endif + } } void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) +diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c +index a8f7876b59..1c6f15bde3 100644 +--- a/libavcodec/hevc_mvs.c ++++ b/libavcodec/hevc_mvs.c +@@ -112,7 +112,7 @@ static av_always_inline int compare_mv_ref_idx(struct MvField A, struct MvField + return 0; + } 
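/* For reference, a per-component sketch of the temporal MV scaling that
 * mv_scale() below implements (HEVC 8.5.3.1.8). td and tb are POC
 * distances (clipped to int8 range in the full function) and av_clip()/
 * av_clip_int16() are the usual libavutil helpers:
 *
 *     int tx    = (0x4000 + abs(td / 2)) / td;               // ~16384/td
 *     int scale = av_clip((tb * tx + 32) >> 6, -4096, 4095); // tb/td in Q8
 *     comp      = av_clip_int16((scale * v + 127 + (scale * v < 0)) >> 8);
 *
 * The constification in this hunk leaves that arithmetic unchanged. */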
+ +-static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) ++static av_always_inline void mv_scale(Mv * const dst, const Mv * const src, int td, int tb) + { + int tx, scale_factor; + +@@ -126,10 +126,10 @@ static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) + (scale_factor * src->y < 0)) >> 8); + } + +-static int check_mvset(Mv *mvLXCol, Mv *mvCol, +- int colPic, int poc, +- RefPicList *refPicList, int X, int refIdxLx, +- RefPicList *refPicList_col, int listCol, int refidxCol) ++static int check_mvset(Mv * const mvLXCol, const Mv * const mvCol, ++ const int colPic, const int poc, ++ const RefPicList * const refPicList, const int X, const int refIdxLx, ++ const RefPicList * const refPicList_col, const int listCol, const int refidxCol) + { + int cur_lt = refPicList[X].isLongTerm[refIdxLx]; + int col_lt = refPicList_col[listCol].isLongTerm[refidxCol]; +@@ -160,11 +160,11 @@ static int check_mvset(Mv *mvLXCol, Mv *mvCol, + refPicList_col, L ## l, temp_col.ref_idx[l]) + + // derive the motion vectors section 8.5.3.1.8 +-static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, +- int refIdxLx, Mv *mvLXCol, int X, +- int colPic, RefPicList *refPicList_col) ++static int derive_temporal_colocated_mvs(const HEVCContext * const s, const MvField temp_col, ++ const int refIdxLx, Mv * const mvLXCol, const int X, ++ const int colPic, const RefPicList * const refPicList_col) + { +- RefPicList *refPicList = s->ref->refPicList; ++ const RefPicList * const refPicList = s->ref->refPicList; + + if (temp_col.pred_flag == PF_INTRA) + return 0; +@@ -215,20 +215,20 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, + /* + * 8.5.3.1.7 temporal luma motion vector prediction + */ +-static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, +- int nPbW, int nPbH, int refIdxLx, +- Mv *mvLXCol, int X) ++static int temporal_luma_motion_vector(HEVCContext * const s, const int x0, const int y0, ++ const int nPbW, const int nPbH, const int refIdxLx, ++ Mv * const mvLXCol, const int X) + { + MvField *tab_mvf; + MvField temp_col; + int x, y, x_pu, y_pu; +- int min_pu_width = s->ps.sps->min_pu_width; ++ const int min_pu_width = s->ps.sps->min_pu_width; + int availableFlagLXCol = 0; + int colPic; + +- HEVCFrame *ref = s->ref->collocated_ref; ++ HEVCFrame * const ref = s->ref->collocated_ref; + +- if (!ref) { ++ if (ref == NULL || ref->tab_mvf == NULL) { + memset(mvLXCol, 0, sizeof(*mvLXCol)); + return 0; + } +@@ -240,14 +240,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, + x = x0 + nPbW; + y = y0 + nPbH; + +- if (tab_mvf && +- (y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && ++ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && + y < s->ps.sps->height && + x < s->ps.sps->width) { + x &= ~15; + y &= ~15; + if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_await_progress(&ref->tf, y, 0); ++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); + x_pu = x >> s->ps.sps->log2_min_pu_size; + y_pu = y >> s->ps.sps->log2_min_pu_size; + temp_col = TAB_MVF(x_pu, y_pu); +@@ -255,13 +254,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, + } + + // derive center collocated motion vector +- if (tab_mvf && !availableFlagLXCol) { ++ if (!availableFlagLXCol) { + x = x0 + (nPbW >> 1); + y = y0 + (nPbH >> 1); + x &= ~15; + y &= ~15; + if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_await_progress(&ref->tf, y, 0); ++ ff_hevc_progress_wait_mv(s, s->jb0, 
ref, y); + x_pu = x >> s->ps.sps->log2_min_pu_size; + y_pu = y >> s->ps.sps->log2_min_pu_size; + temp_col = TAB_MVF(x_pu, y_pu); diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c -index acd55cc..c1716c2 100644 +index f2c26c4598..74e152d4b3 100644 --- a/libavcodec/hevc_ps.c +++ b/libavcodec/hevc_ps.c -@@ -780,7 +780,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) +@@ -819,7 +819,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) switch (sps->bit_depth) { case 8: if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8; @@ -6529,19 +10124,66 @@ index acd55cc..c1716c2 100644 if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P; if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P; break; -@@ -1001,6 +1006,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, - sps->amp_enabled_flag = get_bits1(gb); - sps->sao_enabled = get_bits1(gb); +@@ -831,7 +836,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) + break; + case 10: + if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY10; ++#if RPI_HEVC_SAND ++ // *** Horrid kludge s.t. we start out with sand format ++ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND64_10 : AV_PIX_FMT_YUV420P10; ++#else + if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P10; ++#endif + if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P10; + if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P10; + break; +@@ -1097,7 +1107,6 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, + skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); + if (sps_extension_flag[0]) { + int extended_precision_processing_flag; +- int high_precision_offsets_enabled_flag; + int cabac_bypass_alignment_enabled_flag; -+ av_log(avctx, AV_LOG_INFO, "sao_enabled=%d\n", sps->sao_enabled); -+ - sps->pcm_enabled_flag = get_bits1(gb); - if (sps->pcm_enabled_flag) { - sps->pcm.bit_depth = get_bits(gb, 4) + 1; + sps->transform_skip_rotation_enabled_flag = get_bits1(gb); +@@ -1112,10 +1121,10 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, + "extended_precision_processing_flag not yet implemented\n"); + + sps->intra_smoothing_disabled_flag = get_bits1(gb); +- high_precision_offsets_enabled_flag = get_bits1(gb); +- if (high_precision_offsets_enabled_flag) ++ sps->high_precision_offsets_enabled_flag = get_bits1(gb); ++ if (sps->high_precision_offsets_enabled_flag) + av_log(avctx, AV_LOG_WARNING, +- "high_precision_offsets_enabled_flag not yet implemented\n"); ++ "high_precision_offsets_enabled_flag not fully implemented\n"); + + sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); + +diff --git a/libavcodec/hevc_ps.h b/libavcodec/hevc_ps.h +index 44de3980e1..f45d28cd80 100644 +--- a/libavcodec/hevc_ps.h ++++ b/libavcodec/hevc_ps.h +@@ -206,6 +206,7 @@ typedef struct HEVCSPS { + int implicit_rdpcm_enabled_flag; + int explicit_rdpcm_enabled_flag; + int intra_smoothing_disabled_flag; ++ int high_precision_offsets_enabled_flag; + int persistent_rice_adaptation_enabled_flag; + + ///< coded frame dimension in various units diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c -index 9103c84..eb26e7d 100644 +index d2759ba5f5..1dcc238c5a 100644 --- a/libavcodec/hevc_refs.c +++ b/libavcodec/hevc_refs.c +@@ -23,7 +23,7 @@ + + #include "libavutil/avassert.h" + #include "libavutil/pixdesc.h" +- 
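/* (rpi_sand_fns.h is added for av_rpi_is_sand_format(), used by
 * ff_hevc_output_frame() below: a sand frame cannot be cropped by
 * offsetting its plane pointers, so the conformance window is exported
 * as side data instead.) */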
++#include "libavutil/rpi_sand_fns.h" + #include "internal.h" + #include "thread.h" + #include "hevc.h" @@ -206,7 +206,8 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) HEVCFrame *frame = &s->DPB[min_idx]; AVFrame *dst = out; @@ -6562,7 +10204,7 @@ index 9103c84..eb26e7d 100644 - int off = ((frame->window.left_offset >> hshift) << pixel_shift) + - (frame->window.top_offset >> vshift) * dst->linesize[i]; - dst->data[i] += off; -+ if (fmt == AV_PIX_FMT_SAND128) ++ if (av_rpi_is_sand_format(fmt)) + { + // Sand cannot be windowed by offset so add side data if we have an offset + const HEVCWindow * const window = &frame->window; @@ -6588,11 +10230,21 @@ index 9103c84..eb26e7d 100644 } av_log(s->avctx, AV_LOG_DEBUG, "Output frame with POC %d.\n", frame->poc); +@@ -427,8 +445,7 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc) + frame->sequence = s->seq_decode; + frame->flags = 0; + +- if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_report_progress(&frame->tf, INT_MAX, 0); ++ ff_hevc_progress_set_all_done(frame); + + return frame; + } diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c -index f9e8ff0..8a3d874 100644 +index 5579a4df43..fd48468c6c 100644 --- a/libavcodec/hevcdec.c +++ b/libavcodec/hevcdec.c -@@ -42,8 +42,207 @@ +@@ -42,8 +42,346 @@ #include "hevcdec.h" #include "profiles.h" @@ -6600,26 +10252,17 @@ index f9e8ff0..8a3d874 100644 + #include "rpi_qpu.h" + #include "rpi_shader.h" + #include "rpi_shader_cmd.h" ++ #include "rpi_shader_template.h" + #include "rpi_zc.h" ++ #include "libavutil/rpi_sand_fns.h" + + // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory + #define RPI_CACHE_UNIF_MVS 1 + -+ // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*) -+ //#define RPI_SIMULATE_QPUS -+ #ifdef RPI_WORKER -+ #include "pthread.h" -+ #endif -+ ++ #include "pthread.h" + #include "libavutil/atomic.h" + + static void worker_core(HEVCContext * const s); -+ -+ // We can pred any block height, but caching may make some heights better than others -+ // Currently it doesn't seem to make a lot of difference -+ // 0 => any height -+ #define Y_P_MAX_H 0 -+ #define Y_B_MAX_H 0 +#endif + +#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards @@ -6644,14 +10287,15 @@ index f9e8ff0..8a3d874 100644 + +// UV still has min 4x4 pred +// Allow for even spread +1 for setup, +1 for rounding -+// If we have load sharingw e will want different (bigger) numbers and/or a non-constant chunk size ++// As we have load sharing this can (in theory) be exceeded so we have to ++// check after each CTU, but it is a good base size + +// Worst case (all 4x4) commands per CTU +#define QPU_Y_CMD_PER_CTU_MAX (8 * 8) +#define QPU_C_CMD_PER_CTU_MAX (4 * 4) + -+#define UV_COMMANDS_PER_QPU (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 / QPU_N_UV + 2) -+#define Y_COMMANDS_PER_QPU (((RPI_MAX_WIDTH * 64) / (4 * 4)) / QPU_N_Y + 2) ++#define QPU_C_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX) ++#define QPU_Y_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX) + +// The QPU code for UV blocks only works up to a block width of 8 +#define RPI_CHROMA_BLOCK_WIDTH 8 @@ -6679,35 +10323,127 @@ index f9e8ff0..8a3d874 100644 + mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn +}; + ++static const int * const inter_pred_setup_c10_qpu[12] = { ++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, 
mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn ++}; ++ +static const int * const inter_pred_setup_y_qpu[12] = { + mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, + mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, + mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn +}; + ++static const int * const inter_pred_setup_y10_qpu[12] = { ++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn ++}; ++ +static const int * const inter_pred_sync_qpu[12] = { + mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, + mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, + mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 +}; + ++static const int * const inter_pred_sync10_qpu[12] = { ++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, ++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, ++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 ++}; ++ +static const int * const inter_pred_exit_c_qpu[12] = { -+ mc_interrupt_exit12c, mc_exit_c, mc_exit_c, mc_exit_c, -+ mc_exit_c, mc_exit_c, mc_exit_c, mc_exit_c, -+ mc_exit_c, mc_exit_c, mc_exit_c, mc_exit_c ++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn ++}; ++ ++static const int * const inter_pred_exit_c10_qpu[12] = { ++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn +}; + +static const int * const inter_pred_exit_y_qpu[12] = { -+ mc_interrupt_exit12, mc_exit, mc_exit, mc_exit, -+ mc_exit, mc_exit, mc_exit, mc_exit, -+ mc_exit, mc_exit, mc_exit, mc_exit ++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn +}; + ++static const int * const inter_pred_exit_y10_qpu[12] = { ++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn ++}; ++ ++typedef struct ipe_chan_info_s ++{ ++ const unsigned int n; ++ const int * const * setup_fns; ++ const int * const * sync_fns; ++ const int * const * exit_fns; ++} ipe_chan_info_t; ++ ++typedef struct ipe_init_info_s ++{ ++ ipe_chan_info_t luma; ++ ipe_chan_info_t chroma; ++} ipe_init_info_t; ++ ++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 ++ { // 8 ++ .luma = {QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, ++ .chroma = {QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} ++ }, ++ { // 9 ++ .luma = {0}, ++ .chroma = {0} ++ }, ++ { // 10 ++ .luma = {QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu}, ++ .chroma = {QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} ++ } ++ ++}; ++ ++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici) ++{ ++ const unsigned int n = ici->n; ++ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word ++ ++ ipe->n = n; ++ ipe->max_fill = q1_size - ipe->min_gap; ++ for(unsigned int i = 0; i < n; i++) { ++ 
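/* (Queue layout: the single gptr allocation is cut into n equal
 * word-aligned slices of q1_size bytes, one command FIFO per QPU; the
 * loop body seeds each FIFO's write pointers and the GPU addresses of
 * that QPU's setup/sync/exit code fragments. max_fill keeps min_gap of
 * headroom because, as noted earlier, load sharing can overrun the even
 * split and is only checked once per CTU.) */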
HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base = ++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); ++ q->code_setup = qpu_fn(ici->setup_fns[i]); ++ q->code_sync = qpu_fn(ici->sync_fns[i]); ++ q->code_exit = qpu_fn(ici->exit_fns[i]); ++ } ++} ++ ++static void rpi_hevc_qpu_set_fns(HEVCContext * const s, const unsigned int bit_depth) ++{ ++ const ipe_init_info_t * const iii = ipe_init_infos + bit_depth - 8; ++ ++ av_assert0(bit_depth >= 8 && bit_depth <= 16); ++ ++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth); ++ ++ for (unsigned int i = 0; i != RPI_MAX_JOBS; ++i) { ++ HEVCRpiJob *const jb = s->jobs + i; ++ set_ipe_from_ici(&jb->chroma_ip, &iii->chroma); ++ set_ipe_from_ici(&jb->luma_ip, &iii->luma); ++ } ++} ++ + +#endif + + -+#ifdef RPI_WORKER ++#ifdef RPI + +//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); +//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); @@ -6715,108 +10451,154 @@ index f9e8ff0..8a3d874 100644 +#define LOG_ENTER +#define LOG_EXIT + ++#define USE_SEM 1 ++ +// Call this when we have completed pass0 and wish to trigger pass1 for the current job -+static void worker_submit_job(HEVCContext *s) ++static void worker_submit_job(HEVCContext * const s) +{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ s->worker_tail++; -+ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot -+ pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_ENTER ++ sem_post(&s->jb0->sem_in); ++ s->jb0->pending = 1; ++ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot ++ s->jb0 = s->jobs + s->pass0_job; ++ LOG_EXIT +} + +// Call this to say we have completed pass1 -+static void worker_complete_job(HEVCContext *s) ++static void worker_complete_job(HEVCContext * const s) +{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ s->worker_head++; -+ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot -+ pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_ENTER ++ sem_t * const sem = &s->jb1->sem_out; ++ // Must set job no before signalling as otherwise rpi_do_all_passes ++ // may call worker_core from the main thread with a bad job number ++ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot ++ s->jb1 = s->jobs + s->pass1_job; ++ sem_post(sem); ++ LOG_EXIT +} + -+// Call this to wait for all jobs to have completed at the end of a frame -+static void worker_wait(HEVCContext *s) -+{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ while( s->worker_head !=s->worker_tail) -+ { -+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); -+ } -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT -+} + +// Call worker_pass0_ready to wait until the s->pass0_job slot becomes +// available to receive the next job. 
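// (Pipeline sketch, as implied by worker_submit_job()/worker_start()
// above: pass 0 on the decode thread fills jobs[pass0_job] and posts
// sem_in; the worker thread runs worker_core() on jobs[pass1_job] and
// posts sem_out, which is what is waited on here before a slot is
// reused.)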
+static void worker_pass0_ready(HEVCContext *s) +{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ // tail is number of submitted jobs -+ // head is number of completed jobs -+ // tail-head is number of outstanding jobs in the queue -+ // we need to ensure there is at least 1 space left for us to use -+ while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS) -+ { -+ // Wait until another job is completed -+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); ++ LOG_ENTER ++ HEVCRpiJob * const jb = s->jb0; ++ if (jb->pending) { ++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) ++ /* Loop */; ++ jb->pending = 0; + } -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_EXIT ++} ++ ++// Call this to wait for all jobs to have completed at the end of a frame ++static void worker_wait(HEVCContext * const s) ++{ ++ LOG_ENTER ++ unsigned int i; ++ for (i = 0; i != RPI_MAX_JOBS; ++i) { ++ HEVCRpiJob * const jb = s->jobs + i; ++ if (jb->pending) { ++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) ++ /* Loop */; ++ jb->pending = 0; ++ } ++ } ++ LOG_EXIT +} + +static void *worker_start(void *arg) +{ -+ HEVCContext *s = (HEVCContext *)arg; -+ while(1) { -+ pthread_mutex_lock(&s->worker_mutex); ++ HEVCContext * const s = (HEVCContext *)arg; + -+ while( !s->kill_worker && s->worker_tail - s->worker_head <= 0) ++ for (;;) + { -+ pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex); -+ } -+ pthread_mutex_unlock(&s->worker_mutex); ++ HEVCRpiJob * const jb = s->jb1; ++ while (sem_wait(&jb->sem_in) == -1 && errno == EINTR) ++ /* Loop */; ++ if (jb->terminate) ++ break; + -+ if (s->kill_worker) { -+ break; ++ LOG_ENTER ++ worker_core(s); ++ worker_complete_job(s); ++ LOG_EXIT + } -+ LOG_ENTER -+ worker_core(s); -+ -+ worker_complete_job(s); -+ LOG_EXIT -+ } -+ return NULL; ++ return NULL; +} + ++static void worker_pic_free_all(HEVCContext * const s) ++{ ++ unsigned int i; ++ ++ // Free coeff stuff - allocation not the same for all buffers ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ { ++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs; ++ ++ if (cf->s[0].buf != NULL) ++ av_freep(&cf->mptr); ++ if (cf->s[2].buf != NULL) ++ gpu_free(&cf->gptr); ++ memset(cf, 0, sizeof(*cf)); ++ } ++} ++ ++static int worker_pic_alloc_all(HEVCContext * const s, const unsigned int coeff_count) ++{ ++ unsigned int i; ++ ++ // Free coeff stuff - allocation not the same for all buffers ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ { ++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs; ++ ++// av_assert0(cf->s[0].n == 0 && cf->s[0].buf == NULL); ++// av_assert0(cf->s[1].n == 0 && cf->s[1].buf == NULL); ++// av_assert0(cf->s[2].n == 0 && cf->s[2].buf == NULL); ++// av_assert0(cf->s[3].n == 0 && cf->s[3].buf == NULL); ++ ++ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0) ++ goto fail; ++ cf->s[2].buf = (int16_t *)cf->gptr.arm; ++ cf->s[3].buf = cf->s[2].buf + coeff_count; ++ ++ // Must be 64 byte aligned for our zero apping code so over-allocate & ++ // round ++ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0] + 63))) == NULL) ++ goto fail; ++ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63); ++ } ++ return 0; ++ ++fail: ++ printf("%s: **** Failed\n", __func__); ++ worker_pic_free_all(s); ++ return -1; ++} ++ ++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf) ++{ ++ unsigned int i; ++ for (i = 0; i != 4; ++i) { ++ cf->s[i].n = 0; ++ } ++} +#endif ++ + /** * NOTE: Each function hls_foo correspond to the function foo in the * 
specification (HLS stands for High Level Syntax). -@@ -56,6 +255,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 +@@ -56,6 +394,23 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 /* free everything allocated by pic_arrays_init() */ static void pic_arrays_free(HEVCContext *s) { +#ifdef RPI -+ int job; -+ for(job=0;jobcoeffs_buf_arm[job][0]) { -+ gpu_free(&s->coeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = 0; -+ } -+ if (s->coeffs_buf_arm[job][2]) { -+ gpu_free(&s->coeffs_buf_accelerated[job]); -+ s->coeffs_buf_arm[job][2] = 0; -+ } -+ } ++ worker_pic_free_all(s); +#endif ++ +#ifdef RPI_DEBLOCK_VPU + { + int i; @@ -6833,7 +10615,7 @@ index f9e8ff0..8a3d874 100644 av_freep(&s->sao); av_freep(&s->deblock); -@@ -92,6 +317,89 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) +@@ -92,6 +447,74 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) int ctb_count = sps->ctb_width * sps->ctb_height; int min_pu_size = sps->min_pu_width * sps->min_pu_height; @@ -6842,32 +10624,17 @@ index f9e8ff0..8a3d874 100644 + const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS; + const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; + const int coefs_per_row = coefs_per_luma + coefs_per_chroma; -+ int job; + + av_assert0(sps); -+// s->max_ctu_count = sps->ctb_width; -+// printf("CTB with=%d\n", sps->ctb_width); -+// s->max_ctu_count = coefs_per_luma / coefs_in_ctb; -+ s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width); -+ s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y; -+ s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV; ++ s->max_ctu_count = coefs_per_luma / coefs_in_ctb; ++#if RPI_ROUND_TO_LINES ++ // Round down to an integral quantity of lines ++ if (s->max_ctu_count > sps->ctb_width) ++ s->max_ctu_count -= s->max_ctu_count % sps->ctb_width; ++#endif + -+ for(job=0;jobcoeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; -+ if (!s->coeffs_buf_arm[job][0]) -+ goto fail; -+ -+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data -+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; -+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; -+ if (!s->coeffs_buf_arm[job][2]) -+ goto fail; -+ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. 
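/* (Old and new code share the same two-ended scheme: coefficient
 * stream 2 fills forward from the start of the GPU buffer while
 * stream 3 fills backward from buf + coeff_count, so both fit in one
 * allocation and meet in the middle. rpi_alloc_coeff_buf() later in
 * this patch allocates buf_no 3 at cfe->buf - (cfe->n + n) to match.) */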
-+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; -+ } -+ } ++ if (worker_pic_alloc_all(s, coefs_per_row) != 0) ++ goto fail; +#endif +#ifdef RPI_DEBLOCK_VPU + { @@ -6923,7 +10690,7 @@ index f9e8ff0..8a3d874 100644 s->bs_width = (width >> 2) + 1; s->bs_height = (height >> 2) + 1; -@@ -138,6 +446,29 @@ fail: +@@ -138,6 +561,29 @@ fail: return AVERROR(ENOMEM); } @@ -6950,19 +10717,21 @@ index f9e8ff0..8a3d874 100644 + } +} + - static void pred_weight_table(HEVCContext *s, GetBitContext *gb) + static int pred_weight_table(HEVCContext *s, GetBitContext *gb) { int i = 0; -@@ -332,7 +663,7 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, +@@ -351,8 +797,8 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt) { #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) - enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; +- int ret, i; + enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; - int ret, i; ++ int ret; pic_arrays_free(s); -@@ -351,6 +682,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + s->ps.sps = NULL; +@@ -370,6 +816,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm switch (sps->pix_fmt) { case AV_PIX_FMT_YUV420P: case AV_PIX_FMT_YUVJ420P: @@ -6975,7 +10744,20 @@ index f9e8ff0..8a3d874 100644 #if CONFIG_HEVC_DXVA2_HWACCEL *fmt++ = AV_PIX_FMT_DXVA2_VLD; #endif -@@ -384,6 +721,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +@@ -384,6 +836,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + #endif + break; + case AV_PIX_FMT_YUV420P10: ++#if RPI_HEVC_SAND ++ // Currently geometry calc is stuffed for big sizes ++ if (sps->width < 2048 && sps->height <= 1088) { ++ *fmt++ = AV_PIX_FMT_SAND64_10; ++ } ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -403,6 +861,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm ret = ff_thread_get_format(s->avctx, pix_fmts); if (ret < 0) goto fail; @@ -6983,24 +10765,58 @@ index f9e8ff0..8a3d874 100644 s->avctx->pix_fmt = ret; } else { -@@ -406,11 +744,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +@@ -412,26 +871,36 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + ff_hevc_pred_init(&s->hpc, sps->bit_depth); + ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth); + ff_videodsp_init (&s->vdsp, sps->bit_depth); ++#ifdef RPI ++ rpi_hevc_qpu_set_fns(s, sps->bit_depth); ++#endif + +- for (i = 0; i < 3; i++) { +- av_freep(&s->sao_pixel_buffer_h[i]); +- av_freep(&s->sao_pixel_buffer_v[i]); +- } ++ av_freep(&s->sao_pixel_buffer_h[0]); ++ av_freep(&s->sao_pixel_buffer_v[0]); + + if (sps->sao_enabled && !s->avctx->hwaccel) { +- int c_count = (sps->chroma_format_idc != 0) ? 3 : 1; +- int c_idx; ++ const unsigned int c_count = (sps->chroma_format_idc != 0) ? 
3 : 1; ++ unsigned int c_idx; ++ size_t vsize[3] = {0}; ++ size_t hsize[3] = {0}; + for(c_idx = 0; c_idx < c_count; c_idx++) { int w = sps->width >> sps->hshift[c_idx]; int h = sps->height >> sps->vshift[c_idx]; -+ // ******** Very very nasty allocation kludge for plaited Chroma - s->sao_pixel_buffer_h[c_idx] = +- s->sao_pixel_buffer_h[c_idx] = - av_malloc((w * 2 * sps->ctb_height) << -+ av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) << - sps->pixel_shift); - s->sao_pixel_buffer_v[c_idx] = +- sps->pixel_shift); +- s->sao_pixel_buffer_v[c_idx] = - av_malloc((h * 2 * sps->ctb_width) << -+ av_malloc((h * 2 * sps->ctb_width * (1 + (c_idx == 1))) << - sps->pixel_shift); +- sps->pixel_shift); ++ // ctb height & width are a min of 8 so this must a multiple of 16 ++ // so no point rounding up! ++ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; ++ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; } ++ ++ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] ++ // when we have plaited chroma ++ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); ++ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); ++ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; ++ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; ++ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; ++ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; } -@@ -678,6 +1017,11 @@ static int hls_slice_header(HEVCContext *s) - (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) { - pred_weight_table(s, gb); + + s->ps.sps = sps; +@@ -699,6 +1168,11 @@ static int hls_slice_header(HEVCContext *s) + if (ret < 0) + return ret; } + else + { @@ -7010,20 +10826,25 @@ index f9e8ff0..8a3d874 100644 sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { -@@ -933,6 +1277,34 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { +@@ -954,6 +1428,39 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { return 0; } +#ifdef RPI ++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCContext * const s) ++{ ++ return s->jb0->intra.cmds + s->jb0->intra.n++; ++} ++ +static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) +{ + // U & V done on U call in the case of sliced frames -+ if (rpi_sliced_frame(s->frame) && c_idx > 1) ++ if (av_rpi_is_sand_frame(s->frame) && c_idx > 1) + return; + + if (s->enable_rpi) { + HEVCLocalContext *lc = s->HEVClc; -+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ HEVCPredCmd *cmd = rpi_new_intra_cmd(s); + cmd->type = RPI_PRED_INTRA; + cmd->size = log2_trafo_size; + cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; @@ -7032,7 +10853,7 @@ index f9e8ff0..8a3d874 100644 + cmd->i_pred.y = y0; + cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; + } -+ else if (rpi_sliced_frame(s->frame) && c_idx != 0) { ++ else if (av_rpi_is_sand_frame(s->frame) && c_idx != 0) { + s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx); + } + else { @@ -7045,7 +10866,7 @@ index f9e8ff0..8a3d874 100644 static int hls_transform_unit(HEVCContext *s, int x0, int y0, int xBase, int yBase, int cb_xBase, int cb_yBase, int log2_cb_size, int log2_trafo_size, -@@ -945,8 +1317,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -966,8 +1473,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { int trafo_size = 1 << log2_trafo_size; ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); @@ -7058,7 +10879,7 @@ index f9e8ff0..8a3d874 100644 } if (cbf_luma || cbf_cb[0] || cbf_cr[0] || -@@ -1032,7 +1407,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1053,7 +1563,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -7070,7 +10891,7 @@ index f9e8ff0..8a3d874 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1061,7 +1440,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1082,7 +1596,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -7082,7 +10903,7 @@ index f9e8ff0..8a3d874 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1090,7 +1473,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1111,7 +1629,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -7094,7 +10915,7 @@ index f9e8ff0..8a3d874 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1100,7 +1487,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1121,7 +1643,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -7106,7 +10927,7 @@ index f9e8ff0..8a3d874 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1112,26 +1503,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1133,26 +1659,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); @@ -7153,7 +10974,7 @@ index f9e8ff0..8a3d874 100644 } } } -@@ -1277,47 +1688,120 @@ do { +@@ -1298,47 +1844,119 @@ do { return 0; } @@ -7189,12 +11010,12 @@ index f9e8ff0..8a3d874 100644 - if (s->ps.sps->chroma_format_idc) { - s->hevcdsp.put_pcm(dst1, stride1, +#if RPI_HEVC_SAND -+ if (rpi_sliced_frame(s->frame)) { -+ s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, 
y0), ++ if (av_rpi_is_sand_frame(s->frame)) { ++ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), + s->frame->linesize[0], + cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); + -+ s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), ++ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), + s->frame->linesize[1], cb_size >> s->ps.sps->hshift[1], cb_size >> s->ps.sps->vshift[1], @@ -7233,10 +11054,9 @@ index f9e8ff0..8a3d874 100644 +#ifdef RPI +int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n) +{ -+ int16_t * const coeffs = (buf_no != 3) ? -+ s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] : -+ s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n; -+ s->num_coeffs[s->pass0_job][buf_no] += n; ++ HEVCRpiCoeffEnv *const cfe = s->jb0->coeffs.s + buf_no; ++ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n); ++ cfe->n += n; + return coeffs; +} +#endif @@ -7281,7 +11101,7 @@ index f9e8ff0..8a3d874 100644 + + // Add command + { -+ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(s); + cmd->type = RPI_PRED_I_PCM; + cmd->size = log2_cb_size; + cmd->i_pcm.src = coeffs; @@ -7299,7 +11119,7 @@ index f9e8ff0..8a3d874 100644 /** * 8.5.3.2.2.1 Luma sample unidirectional interpolation process * -@@ -1349,6 +1833,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1370,6 +1988,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, (s->sh.slice_type == HEVC_SLICE_B && s->ps.pps->weighted_bipred_flag); int idx = ff_hevc_pel_weight[block_w]; @@ -7310,7 +11130,7 @@ index f9e8ff0..8a3d874 100644 x_off += mv->x >> 2; y_off += mv->y >> 2; src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1395,7 +1883,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1416,7 +2038,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, * @param mv1 motion vector1 (relative to block position) to get pixel data from * @param current_mv current motion vector structure */ @@ -7319,7 +11139,7 @@ index f9e8ff0..8a3d874 100644 AVFrame *ref0, const Mv *mv0, int x_off, int y_off, int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) { -@@ -1419,6 +1907,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1440,6 +2062,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); @@ -7330,7 +11150,7 @@ index f9e8ff0..8a3d874 100644 if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { -@@ -1504,6 +1996,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, +@@ -1525,6 +2151,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, intptr_t _mx = mx << (1 - hshift); intptr_t _my = my << (1 - vshift); @@ -7341,7 +11161,7 @@ index f9e8ff0..8a3d874 100644 x_off += mv->x >> (2 + hshift); y_off += mv->y >> (2 + vshift); src0 += y_off * srcstride + (x_off * (1 
<< s->ps.sps->pixel_shift)); -@@ -1568,6 +2064,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF +@@ -1589,6 +2219,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF int hshift = s->ps.sps->hshift[1]; int vshift = s->ps.sps->vshift[1]; @@ -7352,13 +11172,143 @@ index f9e8ff0..8a3d874 100644 intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); -@@ -1695,14 +2195,582 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, - } +@@ -1662,13 +2296,112 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF + _mx1, _my1, block_w); } --static void hls_prediction_unit(HEVCContext *s, int x0, int y0, -- int nPbW, int nPbH, -- int log2_cb_size, int partIdx, int idx) +-static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref, +- const Mv *mv, int y0, int height) ++#ifdef RPI ++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int val, const int field) ++{ ++ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { ++ HEVCContext *const fs = ref->tf.owner[field]->priv_data; ++ HEVCRPiFrameProgressState * const pstate = fs->progress_states + field; ++ sem_t * sem = NULL; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ if (((volatile int *)ref->tf.progress->data)[field] < val) { ++ HEVCRPiFrameProgressWait * const pwait = &jb->progress_wait; ++ ++ av_assert0(pwait->req == -1 && pwait->next == NULL); ++ ++ pwait->req = val; ++ pwait->next = NULL; ++ if (pstate->first == NULL) ++ pstate->first = pwait; ++ else ++ pstate->last->next = pwait; ++ pstate->last = pwait; ++ sem = &pwait->sem; ++ } ++ pthread_mutex_unlock(&pstate->lock); ++ ++ if (sem != NULL) { ++ while (sem_wait(sem) != 0) ++ av_assert0(errno == EINTR); ++ } ++ } ++} ++ ++void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field) ++{ ++ HEVCRPiFrameProgressState *const pstate = s->progress_states + field; ++ ++ ((int *)s->ref->tf.progress->data)[field] = val; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ { ++ HEVCRPiFrameProgressWait ** ppwait = &pstate->first; ++ HEVCRPiFrameProgressWait * pwait; ++ ++ while ((pwait = *ppwait) != NULL) { ++ if (pwait->req > val) ++ { ++ ppwait = &pwait->next; ++ pstate->last = pwait; ++ } ++ else ++ { ++ *ppwait = pwait->next; ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_post(&pwait->sem); ++ } ++ } ++ } ++ pthread_mutex_unlock(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_init_state(HEVCRPiFrameProgressState * const pstate) + { +- int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); ++ pstate->first = NULL; ++ pstate->last = NULL; ++ pthread_mutex_init(&pstate->lock, NULL); ++} + +- if (s->threads_type == FF_THREAD_FRAME ) +- ff_thread_await_progress(&ref->tf, y, 0); ++static void ff_hevc_rpi_progress_init_wait(HEVCRPiFrameProgressWait * const pwait) ++{ ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_init(&pwait->sem, 0, 0); ++} ++ ++static void ff_hevc_rpi_progress_kill_state(HEVCRPiFrameProgressState * const pstate) ++{ ++ av_assert0(pstate->first == NULL); ++ pthread_mutex_destroy(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_kill_wait(HEVCRPiFrameProgressWait * const pwait) ++{ ++ sem_destroy(&pwait->sem); ++} ++#endif ++ ++static void hevc_await_progress(HEVCContext *s, const HEVCFrame * const ref, 
++ const Mv * const mv, const int y0, const int height) ++{ ++ if (s->threads_type == FF_THREAD_FRAME) { ++ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); ++ ++#ifdef RPI ++ if (s->enable_rpi) { ++ int16_t *const pr = s->jb0->progress + ref->dpb_no; ++ if (*pr < y) { ++ *pr = y; ++ } ++ } ++ else ++#endif ++ // It is a const ThreadFrame but the prototype isn't ++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); ++ } + } + + static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +@@ -1707,23 +2440,551 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, + ff_hevc_hls_mvd_coding(s, x0, y0, 1); + } + +- mv->pred_flag += PF_L1; +- mvp_flag = ff_hevc_mvp_lx_flag_decode(s); +- ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size, +- part_idx, merge_idx, mv, mvp_flag, 1); +- mv->mv[1].x += lc->pu.mvd.x; +- mv->mv[1].y += lc->pu.mvd.y; ++ mv->pred_flag += PF_L1; ++ mvp_flag = ff_hevc_mvp_lx_flag_decode(s); ++ ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size, ++ part_idx, merge_idx, mv, mvp_flag, 1); ++ mv->mv[1].x += lc->pu.mvd.x; ++ mv->mv[1].y += lc->pu.mvd.y; ++ } ++} ++ + +#if RPI_INTER + @@ -7374,7 +11324,7 @@ index f9e8ff0..8a3d874 100644 + + yp->load += load_val; + ipe->used_grp = 1; -+ ((uint32_t *)yp->qpu_mc_curr)[-1] = fn; // Link is always last el of previous cmd ++ yp->qpu_mc_curr->data[-1] = fn; // Link is always last el of previous cmd + + return yp; +} @@ -7384,8 +11334,8 @@ index f9e8ff0..8a3d874 100644 +{ + for (unsigned int i = 0; i != ipe->n; ++i) { + HEVCRpiInterPredQ * const q = ipe->q + i; -+ ((uint32_t *)q->qpu_mc_curr)[-1] = q->code_sync; -+ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)((uint32_t *)q->qpu_mc_curr + 1); ++ q->qpu_mc_curr->data[-1] = q->code_sync; ++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1); + q->load = 0; + } +} @@ -7428,39 +11378,40 @@ index f9e8ff0..8a3d874 100644 + } +} + -+static void rpi_alloc_inter_pred(HEVCRpiInterPredEnv * const ipe, -+ const unsigned int n, const unsigned int n_grp, -+ const unsigned int q1_size, const unsigned int min_gap, -+ const int * const * const setup_fns, -+ const int * const * const sync_fns, -+ const int * const * const exit_fns) ++static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, ++ const unsigned int n_max, const unsigned int n_grp, ++ const unsigned int total_size, const unsigned int min_gap) +{ -+ unsigned int i; -+ + memset(ipe, 0, sizeof(*ipe)); -+ av_assert0((ipe->q = av_mallocz(n * sizeof(*ipe->q))) != NULL); -+ ipe->n = n; ++ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL); + ipe->n_grp = n_grp; -+ ipe->q1_size = q1_size; -+ ipe->max_fill = ipe->q1_size - min_gap; ++ ipe->min_gap = min_gap; + +#if RPI_CACHE_UNIF_MVS -+ gpu_malloc_cached(n * q1_size, &ipe->gptr); ++ gpu_malloc_cached(total_size, &ipe->gptr); +#else -+ gpu_malloc_uncached(n * q1_size, &ipe->gptr); ++ gpu_malloc_uncached(total_size, &ipe->gptr); +#endif -+ -+ for(i = 0; i < n; i++) { -+ HEVCRpiInterPredQ * const q = ipe->q + i; -+ q->qpu_mc_curr = q->qpu_mc_base = -+ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); -+ q->code_setup = qpu_fn(setup_fns[i]); -+ q->code_sync = qpu_fn(sync_fns[i]); -+ q->code_exit = qpu_fn(exit_fns[i]); -+ } +} + + ++#if RPI_QPU_EMU_Y ++#define get_mc_address_y(f) ((f)->data[0]) ++#else ++#define get_mc_address_y(f) get_vc_address_y(f) ++#endif ++#if RPI_QPU_EMU_C ++#define get_mc_address_u(f) ((f)->data[1]) ++#else ++#define get_mc_address_u(f) get_vc_address_u(f) ++#endif ++ ++static inline int 
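[Editor's note: the ff_hevc_rpi_progress_wait_field()/ff_hevc_rpi_progress_signal_field() pair added above replaces ffmpeg's ff_thread_await_progress() with a mutex-protected list of (row, semaphore) waiters per field. A minimal standalone sketch of that pattern, using illustrative names rather than the patch's own types:]

```c
#include <pthread.h>
#include <semaphore.h>

typedef struct prog_wait {
    int req;                    /* row this waiter needs            */
    struct prog_wait *next;
    sem_t sem;
} prog_wait;

typedef struct prog_state {
    int done;                   /* highest row published so far     */
    prog_wait *first;
    pthread_mutex_t lock;
} prog_state;

/* Block until at least `row` rows of the frame are complete. */
static void prog_await(prog_state *ps, int row)
{
    prog_wait w = { .req = row, .next = NULL };
    int queued = 0;

    sem_init(&w.sem, 0, 0);
    pthread_mutex_lock(&ps->lock);
    if (ps->done < row) {       /* re-test under the lock, as the patch does */
        w.next = ps->first;     /* LIFO here; the patch keeps FIFO order     */
        ps->first = &w;
        queued = 1;
    }
    pthread_mutex_unlock(&ps->lock);

    if (queued)
        while (sem_wait(&w.sem) != 0)
            ;                   /* restart if interrupted by a signal */
    sem_destroy(&w.sem);
}

/* Publish completion up to `row` and wake every satisfied waiter. */
static void prog_signal(prog_state *ps, int row)
{
    pthread_mutex_lock(&ps->lock);
    ps->done = row;
    for (prog_wait **pp = &ps->first; *pp != NULL; ) {
        prog_wait *w = *pp;
        if (w->req <= row) {
            *pp = w->next;      /* unlink before waking */
            sem_post(&w->sem);
        } else {
            pp = &w->next;
        }
    }
    pthread_mutex_unlock(&ps->lock);
}
```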
offset_depth_adj(const HEVCContext *const s, const int wt) ++{ ++ return s->ps.sps->high_precision_offsets_enabled_flag ? wt : ++ wt << (s->ps.sps->bit_depth - 8); ++} ++ +static void +rpi_pred_y(HEVCContext *const s, const int x0, const int y0, + const int nPbW, const int nPbH, @@ -7469,175 +11420,157 @@ index f9e8ff0..8a3d874 100644 + const int weight_offset, + AVFrame *const src_frame) +{ -+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); + const unsigned int mx = mv->x & 3; + const unsigned int my = mv->y & 3; + const unsigned int my_mx = (my << 8) | mx; + const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; -+ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); -+ uint32_t dst_addr = get_vc_address_y(s->frame) + y_off; -+ const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul); -+ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].luma_ip; ++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); ++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; ++ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); + + if (my_mx == 0) + { + const int x1 = x0 + (mv->x >> 2); + const int y1 = y0 + (mv->y >> 2); -+ -+#if Y_P_MAX_H == 0 + const int bh = nPbH; -+ const int start_y = 0; -+#else -+ for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * Y_P_MAX_H) -+ { -+ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); -+#endif + -+ for (int start_x = 0; start_x < nPbW; start_x += 16) -+ { -+ const int bw = FFMIN(nPbW - start_x, 16); -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu_filter_y_p00); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; ++ for (int start_x = 0; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; + +#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ ++ts->y_pred1_x0y0; ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred1_x0y0; + -+ if (nPbW > 8) -+ ++ts->y_pred1_wgt8; -+ else -+ ++ts->y_pred1_wle8; ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; + -+ if (nPbH > 16) -+ ++ts->y_pred1_hgt16; -+ else -+ ++ts->y_pred1_hle16; -+ } -+#endif -+ -+ src1->x = x1 + start_x; -+ src1->y = y1 + start_y; -+ src1->base = src_vc_address_y; -+ cmd_y->w = bw; -+ cmd_y->h = bh; -+ cmd_y->wo1 = wo; -+ cmd_y->dst_addr = dst_addr + start_x; -+ yp->last_l0 = &cmd_y->next_src1; -+ *(qpu_mc_pred_y_p00_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; + } -+#if Y_P_MAX_H != 0 -+ } +#endif ++ ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src_vc_address_y; ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->wo1 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } + } + else + { + const int x1_m3 = x0 + (mv->x >> 2) - 3; + const int y1_m3 = y0 + (mv->y >> 2) - 3; -+ -+#if Y_P_MAX_H == 0 -+ const int bh = nPbH; -+ const int start_y = 0; -+#else -+ for (int start_y = 0; start_y < nPbH; start_y += 
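[Editor's note: rpi_pred_y() above splits each quarter-pel luma MV into an integer offset (mv >> 2) and a 2-bit fraction (mv & 3), packs the fractions into a single QPU uniform, and pre-scales the weighted-prediction offset for >8-bit streams via offset_depth_adj(). A hedged sketch of the same arithmetic; PACK2 here is assumed to pack two halfwords:]

```c
#include <stdint.h>

#define PACK2(hi, lo) (((uint32_t)(hi) << 16) | ((uint32_t)(lo) & 0xffff))

/* Split a quarter-pel MV component into an integer pel step and a
 * 0..3 phase that selects the interpolation filter
 * (mirrors mv >> 2 / mv & 3 in the patch). */
static inline void mv_split_qpel(int mv, int *ipel, unsigned *frac)
{
    *ipel = mv >> 2;            /* arithmetic shift, rounds toward -inf */
    *frac = (unsigned)mv & 3;
}

/* Scale an offset to the stream bit depth (unless high-precision
 * offsets are enabled) and pack weight+offset into one uniform.
 * The *2+1 folds the rounding term in, as the patch does. */
static inline uint32_t pack_weight_offset(int offset, int weight,
                                          int bit_depth, int high_precision)
{
    if (!high_precision)
        offset <<= bit_depth - 8;
    return PACK2(offset * 2 + 1, weight);
}
```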
Y_P_MAX_H, dst_addr += s->frame->linesize[0] * Y_P_MAX_H) -+ { -+ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); -+#endif -+ const uint32_t src_yx_y = y1_m3 + start_y; -+ int start_x = 0; ++ const unsigned int bh = nPbH; ++ int start_x = 0; + +#if 1 -+ // As Y-pred operates on two independant 8-wide src blocks we can merge -+ // this pred with the previous one if it the previous one is 8 pel wide, -+ // the same height as the current block, immediately to the left of our -+ // current dest block and mono-pred. ++ // As Y-pred operates on two independant 8-wide src blocks we can merge ++ // this pred with the previous one if it the previous one is 8 pel wide, ++ // the same height as the current block, immediately to the left of our ++ // current dest block and mono-pred. + -+ qpu_mc_pred_y_p_t *const last_y8_p = s->last_y8_p; -+ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + 8 == dst_addr) -+ { -+ const int bw = FFMIN(nPbW, 8); -+ qpu_mc_src_t *const last_y8_src2 = s->last_y8_l1; ++ qpu_mc_pred_y_p_t *const last_y8_p = s->last_y8_p; ++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) ++ { ++ const int bw = FFMIN(nPbW, 8); ++ qpu_mc_src_t *const last_y8_src2 = s->last_y8_l1; + -+ last_y8_src2->x = x1_m3; -+ last_y8_src2->y = src_yx_y; -+ last_y8_src2->base = src_vc_address_y; -+ last_y8_p->w += bw; -+ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); -+ last_y8_p->wo2 = wo; ++ last_y8_src2->x = x1_m3; ++ last_y8_src2->y = y1_m3; ++ last_y8_src2->base = src_vc_address_y; ++ last_y8_p->w += bw; ++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); ++ last_y8_p->wo2 = wo; + -+ s->last_y8_p = NULL; -+ s->last_y8_l1 = NULL; -+ start_x = bw; ++ s->last_y8_p = NULL; ++ s->last_y8_l1 = NULL; ++ start_x = bw; +#if RPI_TSTATS -+ ++s->tstats.y_pred1_y8_merge; ++ ++s->tstats.y_pred1_y8_merge; +#endif -+ } -+#endif -+ -+ for (; start_x < nPbW; start_x += 16) -+ { -+ const int bw = FFMIN(nPbW - start_x, 16); -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu_filter); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ if (mx == 0 && my == 0) -+ ++ts->y_pred1_x0y0; -+ else if (mx == 0) -+ ++ts->y_pred1_x0; -+ else if (my == 0) -+ ++ts->y_pred1_y0; -+ else -+ ++ts->y_pred1_xy; -+ -+ if (nPbW > 8) -+ ++ts->y_pred1_wgt8; -+ else -+ ++ts->y_pred1_wle8; -+ -+ if (nPbH > 16) -+ ++ts->y_pred1_hgt16; -+ else -+ ++ts->y_pred1_hle16; -+ } -+#endif -+ src1->x = x1_m3 + start_x; -+ src1->y = src_yx_y; -+ src1->base = src_vc_address_y; -+ if (bw <= 8) -+ { -+ src2->x = MC_DUMMY_X; -+ src2->y = MC_DUMMY_Y; -+ src2->base = s->qpu_dummy_frame; -+ } -+ else -+ { -+ src2->x = x1_m3 + start_x + 8; -+ src2->y = src_yx_y; -+ src2->base = src_vc_address_y; -+ } -+ cmd_y->w = bw; -+ cmd_y->h = bh; -+ cmd_y->mymx21 = my2_mx2_my_mx; -+ cmd_y->wo1 = wo; -+ cmd_y->wo2 = wo; -+ cmd_y->dst_addr = dst_addr + start_x; -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ *(qpu_mc_pred_y_p_t **)&yp->qpu_mc_curr = cmd_y + 1; -+ -+ if (bw == 8) { -+ s->last_y8_l1 = src2; -+ s->last_y8_p = cmd_y; -+ } -+ } -+#if Y_P_MAX_H != 0 + } +#endif ++ ++ for (; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t 
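[Editor's note: the merge path above exploits the fact that a filtered luma command carries two independent 8-wide source descriptors, so an 8-wide mono-pred leaves the second slot free for the next block to its right. The eligibility test is roughly the predicate below, sketched with simplified command fields:]

```c
#include <stddef.h>
#include <stdint.h>

typedef struct y_cmd {
    uint32_t dst_addr;          /* destination of the queued command  */
    uint16_t w, h;
    int      second_src_used;   /* both 8-wide lanes already claimed? */
} y_cmd;

/* True if an up-to-8-wide block of height bh writing to dst can be
 * folded into the previous command: same height, free second lane,
 * and a destination exactly 8 pels (scaled by the frame's x shift,
 * xshl) to the right of the previous block. */
static int can_merge_y8(const y_cmd *prev, uint32_t dst,
                        unsigned bh, unsigned xshl)
{
    return prev != NULL &&
           !prev->second_src_used &&
           prev->h == bh &&
           prev->dst_addr + (8u << xshl) == dst;
}
```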
*const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ if (mx == 0 && my == 0) ++ ++ts->y_pred1_x0y0; ++ else if (mx == 0) ++ ++ts->y_pred1_x0; ++ else if (my == 0) ++ ++ts->y_pred1_y0; ++ else ++ ++ts->y_pred1_xy; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ src1->x = x1_m3 + start_x; ++ src1->y = y1_m3; ++ src1->base = src_vc_address_y; ++ if (bw <= 8) ++ { ++ src2->x = MC_DUMMY_X; ++ src2->y = MC_DUMMY_Y; ++#if RPI_QPU_EMU_Y ++ src2->base = s->qpu_dummy_frame_emu; ++#else ++ src2->base = s->qpu_dummy_frame_qpu; ++#endif ++ } ++ else ++ { ++ src2->x = x1_m3 + start_x + 8; ++ src2->y = y1_m3; ++ src2->base = src_vc_address_y; ++ } ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo; ++ cmd_y->wo2 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ ++ if (bw == 8) { ++ s->last_y8_l1 = src2; ++ s->last_y8_p = cmd_y; ++ } ++ } + } +} + @@ -7649,7 +11582,7 @@ index f9e8ff0..8a3d874 100644 + AVFrame *const src_frame, + AVFrame *const src_frame2) +{ -+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); + const Mv * const mv = mv_field->mv + 0; + const Mv * const mv2 = mv_field->mv + 1; + @@ -7662,15 +11595,16 @@ index f9e8ff0..8a3d874 100644 + const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; + const unsigned int ref_idx0 = mv_field->ref_idx[0]; + const unsigned int ref_idx1 = mv_field->ref_idx[1]; -+ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + -+ s->sh.luma_offset_l1[ref_idx1] + 1; ++ const uint32_t wt_offset = ++ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1; + const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); + const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); + -+ uint32_t dst = get_vc_address_y(s->frame) + y_off; -+ const uint32_t src1_base = get_vc_address_y(src_frame); -+ const uint32_t src2_base = get_vc_address_y(src_frame2); -+ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].luma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); ++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip; + + if (my2_mx2_my_mx == 0) + { @@ -7678,52 +11612,42 @@ index f9e8ff0..8a3d874 100644 + const int y1 = y0 + (mv->y >> 2); + const int x2 = x0 + (mv2->x >> 2); + const int y2 = y0 + (mv2->y >> 2); -+ -+#if Y_B_MAX_H == 0 + const int bh = nPbH; -+ const int start_y = 0; -+#else -+ for (int start_y = 0; start_y < nPbH; start_y += Y_B_MAX_H, dst += s->frame->linesize[0] * Y_B_MAX_H) -+ { -+ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); -+#endif -+ // Can do chunks a full 16 wide if we don't want the H filter -+ for (int start_x=0; start_x < nPbW; start_x += 16) -+ { -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu_filter_y_b00); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if 
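[Editor's note: in rpi_pred_y_b() the two lists' offsets collapse into a single wt_offset = o0 + o1 + 1 that is packed beside each weight. That matches the standard explicit weighted bi-prediction formula; for reference, the per-sample maths (final clipping to the sample range omitted), where log2Wd is the weight denominator adjusted for the intermediate precision:]

```c
/* HEVC explicit weighted bi-prediction of one sample. s0/s1 are the
 * two interpolated predictions at intermediate precision, w0/w1 the
 * per-list weights, o0/o1 the per-list offsets. The (o0 + o1 + 1)
 * term is exactly the wt_offset the patch packs into its uniforms. */
static inline int weighted_bipred_sample(int s0, int s1,
                                         int w0, int w1,
                                         int o0, int o1, int log2Wd)
{
    return (s0 * w0 + s1 * w1 + ((o0 + o1 + 1) << log2Wd)) >> (log2Wd + 1);
}
```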
RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ ++ts->y_pred2_x0y0; + -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } -+#endif -+ src1->x = x1 + start_x; -+ src1->y = y1 + start_y; -+ src1->base = src1_base; -+ src2->x = x2 + start_x; -+ src2->y = y2 + start_y; -+ src2->base = src2_base; -+ cmd_y->w = FFMIN(nPbW - start_x, 16); -+ cmd_y->h = bh; -+ cmd_y->mymx21 = 0; -+ cmd_y->wo1 = wo1; -+ cmd_y->wo2 = wo2; -+ cmd_y->dst_addr = dst + start_x; -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ *(qpu_mc_pred_y_p_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ // Can do chunks a full 16 wide if we don't want the H filter ++ for (int start_x=0; start_x < nPbW; start_x += 16) ++ { ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred2_x0y0; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; + } -+#if Y_P_MAX_H != 0 -+ } +#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 16); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = 0; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } + } + else + { @@ -7732,118 +11656,106 @@ index f9e8ff0..8a3d874 100644 + const int y1 = y0 + (mv->y >> 2) - 3; + const int x2 = x0 + (mv2->x >> 2) - 3; + const int y2 = y0 + (mv2->y >> 2) - 3; -+ -+#if Y_B_MAX_H == 0 + const int bh = nPbH; -+ const int start_y = 0; -+#else -+ for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H, dst += s->frame->linesize[0] * Y_B_MAX_H) -+ { -+ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); -+#endif -+ for (int start_x=0; start_x < nPbW; start_x += 8) -+ { // B blocks work 8 at a time -+ // B weights aren't doubled as the QPU code does the same -+ // amount of work as it does for P -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu_filter_b); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ const unsigned int mmx = mx | mx2; -+ const unsigned int mmy = my | my2; -+ if (mmx == 0 && mmy == 0) -+ ++ts->y_pred2_x0y0; -+ else if (mmx == 0) -+ ++ts->y_pred2_x0; -+ else if (mmy == 0) -+ ++ts->y_pred2_y0; -+ else -+ ++ts->y_pred2_xy; + -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } -+#endif -+ src1->x = x1 + start_x; -+ src1->y = y1 + start_y; -+ src1->base = src1_base; -+ src2->x = x2 + start_x; -+ src2->y = y2 + start_y; -+ src2->base = src2_base; -+ cmd_y->w = FFMIN(nPbW - start_x, 8); -+ cmd_y->h = bh; -+ cmd_y->mymx21 = my2_mx2_my_mx; -+ cmd_y->wo1 = wo1; -+ cmd_y->wo2 = wo2; -+ cmd_y->dst_addr = dst + start_x; -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ *(qpu_mc_pred_y_p_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ for (int start_x=0; start_x < nPbW; start_x += 8) ++ { // B blocks work 8 at a time ++ // B weights aren't doubled as the QPU code does the same ++ // amount of work as it does for P ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, 
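[Editor's note: the load figure handed to rpi_nxt_pred() is simply the number of source rows a command will fetch: bh for the unfiltered paths, bh + 7 once the 8-tap luma filter needs its support rows, bh + 3 for 4-tap chroma (and roughly double for bi-pred chroma). Generalised as a sketch:]

```c
/* Source rows fetched by a vertically filtered block: the block height
 * plus (taps - 1) support rows. Used as the relative cost estimate
 * when picking the least-loaded QPU queue, hence the bh + 7 and
 * bh + 3 constants seen in the calls above. */
static inline unsigned mc_fetch_rows(unsigned bh, unsigned taps)
{
    return bh + taps - 1;   /* 8-tap luma: bh + 7; 4-tap chroma: bh + 3 */
}
```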
bh + 7, s->qpu.y_bxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ const unsigned int mmx = mx | mx2; ++ const unsigned int mmy = my | my2; ++ if (mmx == 0 && mmy == 0) ++ ++ts->y_pred2_x0y0; ++ else if (mmx == 0) ++ ++ts->y_pred2_x0; ++ else if (mmy == 0) ++ ++ts->y_pred2_y0; ++ else ++ ++ts->y_pred2_xy; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; + } -+#if Y_B_MAX_H != 0 -+ } +#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 8); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } + } +} + -+ ++// h/v shifts fixed at one as that is all the qasm copes with +static void -+rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c, ++rpi_pred_c(HEVCContext * const s, const unsigned int lx, const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, + const Mv * const mv, + const int16_t * const c_weights, + const int16_t * const c_offsets, + AVFrame * const src_frame) +{ -+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // = s->ps.sps->hshift[1]; ++ const int vshift = 1; // = s->ps.sps->vshift[1]; + + const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; + const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; -+ const uint32_t src_base_u = get_vc_address_u(src_frame); ++ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); + const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; + const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; -+ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); -+ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].chroma_ip; ++ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]); ++ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]); ++ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; + -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) + { -+ const int bh = FFMIN(nPbH_c-start_y, 16); ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); ++ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; ++ qpu_mc_src_t ** const plast_lx = (lx == 0) ? 
&cp->last_l0 : &cp->last_l1; ++ qpu_mc_src_t * const last_lx = *plast_lx; ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) -+ { -+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, s->qpu_filter_uv); -+ qpu_mc_pred_c_p_t * const u = &cp->qpu_mc_curr->c.p; -+ qpu_mc_src_t * const last_l0 = cp->last_l0; -+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ last_l0->x = x1_c + start_x; -+ last_l0->y = y1_c + start_y; -+ last_l0->base = src_base_u; -+ u[0].h = bh; -+ u[0].w = bw; -+ u[0].coeffs_x = x_coeffs; -+ u[0].coeffs_y = y_coeffs; -+ u[0].wo_u = wo_u; -+ u[0].wo_v = wo_v; -+ u[0].dst_addr_c = dst_base_u + start_x * 2; -+ cp->last_l0 = &u->next_src; -+ *(qpu_mc_pred_c_p_t **)&cp->qpu_mc_curr = u + 1; -+ } -+ -+ dst_base_u += s->frame->linesize[1] * 16; ++ last_lx->x = x1_c + start_x; ++ last_lx->y = y1_c; ++ last_lx->base = src_base_u; ++ cmd_c->h = bh; ++ cmd_c->w = bw; ++ cmd_c->coeffs_x = x_coeffs; ++ cmd_c->coeffs_y = y_coeffs; ++ cmd_c->wo_u = wo_u; ++ cmd_c->wo_v = wo_v; ++ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); ++ *plast_lx = &cmd_c->next_src; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); + } + return; +} + ++// h/v shifts fixed at one as that is all the qasm copes with +static void +rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, @@ -7855,9 +11767,9 @@ index f9e8ff0..8a3d874 100644 + AVFrame * const src_frame, + AVFrame * const src_frame2) +{ -+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // s->ps.sps->hshift[1]; ++ const int vshift = 1; // s->ps.sps->vshift[1]; + const Mv * const mv = mv_field->mv + 0; + const Mv * const mv2 = mv_field->mv + 1; + @@ -7876,52 +11788,53 @@ index f9e8ff0..8a3d874 100644 + const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; + const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; + -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ const uint32_t src1_base = get_vc_address_u(src_frame); -+ const uint32_t src2_base = get_vc_address_u(src_frame2); -+ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].chroma_ip; ++ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]); ++ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]); + -+ for (int start_y = 0; start_y < nPbH_c; start_y += 16) ++ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) + { -+ const unsigned int bh = FFMIN(nPbH_c-start_y, 16); ++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) -+ { -+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); ++ qpu_mc_pred_c_b_t * const u = 
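[Editor's note: rpi_pred_c() hard-wires hshift/vshift to 1 because the QPU code only handles 4:2:0. At that subsampling a chroma MV has 1/8-pel resolution, so the low three bits give the filter phase used to index the packed 4-tap coefficient table, and the remaining bits the integer position, minus one for the filter's left/top tap. The index arithmetic, sketched:]

```c
/* Sub-pel phase of a chroma MV component: the low (2 + shift) bits.
 * For 4:2:0 (shift == 1) this is a 1/8-pel phase in 0..7
 * (mirrors av_mod_uintp2(mv, 2 + hshift) above). */
static inline unsigned chroma_phase(int mv, unsigned shift)
{
    return (unsigned)mv & ((1u << (2 + shift)) - 1);
}

/* Integer chroma position, with -1 for the 4-tap filter support
 * (mirrors the x1_c / y1_c computation above). */
static inline int chroma_int_pos(int pos0, int mv, unsigned shift)
{
    return pos0 + (mv >> (2 + shift)) - 1;
}
```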
&cp->qpu_mc_curr->c.b; ++ qpu_mc_src_t * const src_l0 = cp->last_l0; ++ qpu_mc_src_t * const src_l1 = cp->last_l1; + -+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu_filter_uv_b0); -+ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; -+ qpu_mc_src_t * const src_l0 = cp->last_l0; -+ qpu_mc_src_t * const src_l1 = cp->last_l1; ++ src_l0->x = x1_c + start_x; ++ src_l0->y = y1_c; ++ src_l0->base = src1_base; ++ src_l1->x = x2_c + start_x; ++ src_l1->y = y2_c; ++ src_l1->base = src2_base; + -+ src_l0->x = x1_c + start_x; -+ src_l0->y = y1_c + start_y; -+ src_l0->base = src1_base; -+ src_l1->x = x2_c + start_x; -+ src_l1->y = y2_c + start_y; -+ src_l1->base = src2_base; -+ -+ u[0].h = bh; -+ u[0].w = bw; -+ u[0].coeffs_x1 = coefs0_x; -+ u[0].coeffs_y1 = coefs0_y; -+ u[0].weight_u1 = c_weights[0]; // Weight L0 U -+ u[0].weight_v1 = c_weights[1]; // Weight L0 V -+ u[0].coeffs_x2 = coefs1_x; -+ u[0].coeffs_y2 = coefs1_y; -+ u[0].wo_u2 = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]); -+ u[0].wo_v2 = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]); -+ u[0].dst_addr_c = dst_base_u + start_x * 2; -+ -+ cp->last_l0 = &u[0].next_src1; -+ cp->last_l1 = &u[0].next_src2; -+ *(qpu_mc_pred_c_b_t **)&cp->qpu_mc_curr = u + 1; -+ } -+ -+ dst_base_u += s->frame->linesize[1] * 16; -+ } -+} ++ u[0].h = bh; ++ u[0].w = bw; ++ u[0].coeffs_x1 = coefs0_x; ++ u[0].coeffs_y1 = coefs0_y; ++ u[0].weight_u1 = c_weights[0]; // Weight L0 U ++ u[0].weight_v1 = c_weights[1]; // Weight L0 V ++ u[0].coeffs_x2 = coefs1_x; ++ u[0].coeffs_y2 = coefs1_y; ++ u[0].wo_u2 = wo_u2; ++ u[0].wo_v2 = wo_v2; ++ u[0].dst_addr_c = dst_base_u + (start_x << xshl); + ++ cp->last_l0 = &u[0].next_src1; ++ cp->last_l1 = &u[0].next_src2; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + } + } + +-static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +- int nPbW, int nPbH, +- int log2_cb_size, int partIdx, int idx) + +#endif + @@ -7939,7 +11852,7 @@ index f9e8ff0..8a3d874 100644 int merge_idx = 0; struct MvField current_mv = {{{ 0 }}}; -@@ -1720,8 +2788,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1741,8 +3002,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int y_cb = y0 >> log2_min_cb_size; int x_pu, y_pu; int i, j; @@ -7949,7 +11862,7 @@ index f9e8ff0..8a3d874 100644 if (!skip_flag) lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); -@@ -1765,12 +2832,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1786,12 +3046,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -7971,7 +11884,7 @@ index f9e8ff0..8a3d874 100644 if (s->ps.sps->chroma_format_idc) { +#if RPI_INTER + if (s->enable_rpi) { -+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, ++ rpi_pred_c(s, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, + s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], + ref0->frame); + return; @@ -7980,7 +11893,7 @@ index f9e8ff0..8a3d874 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); -@@ -1784,12 +2868,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1805,12 +3082,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> 
s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -8002,7 +11915,7 @@ index f9e8ff0..8a3d874 100644 if (s->ps.sps->chroma_format_idc) { +#if RPI_INTER + if (s->enable_rpi) { -+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, ++ rpi_pred_c(s, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, + s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], + ref1->frame); + return; @@ -8011,7 +11924,7 @@ index f9e8ff0..8a3d874 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); -@@ -1804,11 +2905,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1825,11 +3119,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -8044,7 +11957,7 @@ index f9e8ff0..8a3d874 100644 chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 0); -@@ -2083,7 +3204,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) +@@ -2104,7 +3418,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); ret = hls_pcm_sample(s, x0, y0, log2_cb_size); if (s->ps.sps->pcm.loop_filter_disable_flag) @@ -8054,21 +11967,22 @@ index f9e8ff0..8a3d874 100644 if (ret < 0) return ret; -@@ -2306,6 +3429,373 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, +@@ -2327,6 +3643,524 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]])); } +#ifdef RPI +static void rpi_execute_dblk_cmds(HEVCContext *s) +{ -+ int n; -+ int job = s->pass1_job; -+ int ctb_size = 1 << s->ps.sps->log2_ctb_size; -+ int (*p)[2] = s->dblk_cmds[job]; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) { -+ ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size); ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ HEVCRpiDeblkEnv *const de = &s->jb1->deblk; ++ unsigned int i; ++ ++ for (i = 0; i != de->n; ++i) ++ { ++ ff_hevc_hls_filters(s, de->blks[i].x_ctb, de->blks[i].y_ctb, ctb_size); + } -+ s->num_dblk_cmds[job] = 0; ++ de->n = 0; +} + +#if 0 @@ -8101,10 +12015,11 @@ index f9e8ff0..8a3d874 100644 +#endif + + -+// I-pred, transform_and_add for all blocks types done here -+// All ARM +#define RPI_OPT_SEP_PRED 0 + ++ ++// I-pred, transform_and_add for all blocks types done here ++// All ARM +#if RPI_OPT_SEP_PRED +static void rpi_execute_pred_cmds(HEVCContext * const s, const int do_luma, const int do_chroma) +#else @@ -8112,15 +12027,15 @@ index f9e8ff0..8a3d874 100644 +#endif +{ + int i; -+ int job = s->pass1_job; -+ const HEVCPredCmd *cmd = s->univ_pred_cmds[job]; -+#ifdef RPI_WORKER ++ HEVCRpiIntraPredEnv * iap = &s->jb1->intra; ++ const HEVCPredCmd *cmd = iap->cmds; ++#ifdef RPI + HEVCLocalContextIntra *lc = &s->HEVClcIntra; +#else + HEVCLocalContext *lc = s->HEVClc; +#endif + -+ for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) { ++ for(i = iap->n; i > 0; i--, cmd++) { +// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); +#if 
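[Editor's note: rpi_execute_dblk_cmds() above is the replay half of a record/replay pair: pass 0 only notes each CTB's coordinates, and pass 1 runs the deblocking filter over the whole batch. A cut-down sketch of that batching, where the filter callback stands in for ff_hevc_hls_filters():]

```c
typedef struct dblk_blk { unsigned short x_ctb, y_ctb; } dblk_blk;

typedef struct dblk_q {
    unsigned  n, cap;
    dblk_blk *blks;
} dblk_q;

/* Pass 0: record a CTB; a full queue tells the caller to submit the job. */
static int dblk_push(dblk_q *q, unsigned x, unsigned y)
{
    if (q->n == q->cap)
        return -1;
    q->blks[q->n].x_ctb   = (unsigned short)x;
    q->blks[q->n++].y_ctb = (unsigned short)y;
    return 0;
}

/* Pass 1: replay the filter over the recorded CTBs, then reset. */
static void dblk_replay(dblk_q *q,
                        void (*filter)(unsigned x_ctb, unsigned y_ctb))
{
    for (unsigned i = 0; i != q->n; ++i)
        filter(q->blks[i].x_ctb, q->blks[i].y_ctb);
    q->n = 0;
}
```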
RPI_OPT_SEP_PRED + if (!(cmd->c_idx == 0 ? do_luma : do_chroma)) { @@ -8137,7 +12052,7 @@ index f9e8ff0..8a3d874 100644 + lc->na.cand_up_left = (cmd->na >> 2) & 1; + lc->na.cand_up = (cmd->na >> 1) & 1; + lc->na.cand_up_right = (cmd->na >> 0) & 1; -+ if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0) ++ if (!av_rpi_is_sand_frame(s->frame) || cmd->c_idx == 0) + s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); + else + s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); @@ -8146,17 +12061,25 @@ index f9e8ff0..8a3d874 100644 + case RPI_PRED_ADD_RESIDUAL: + s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); + break; ++ case RPI_PRED_ADD_DC: ++ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; +#if RPI_HEVC_SAND + case RPI_PRED_ADD_RESIDUAL_U: -+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); + break; + case RPI_PRED_ADD_RESIDUAL_V: -+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); + break; + case RPI_PRED_ADD_RESIDUAL_C: + s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); + break; ++ case RPI_PRED_ADD_DC_U: ++ case RPI_PRED_ADD_DC_V: ++ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; +#endif ++ + case RPI_PRED_I_PCM: + pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); + break; @@ -8170,7 +12093,7 @@ index f9e8ff0..8a3d874 100644 + if (do_luma) +#endif + { -+ s->num_pred_cmds[job] = 0; ++ iap->n = 0; + } +} + @@ -8183,9 +12106,8 @@ index f9e8ff0..8a3d874 100644 +static void rpi_begin(HEVCContext *s) +{ +#if RPI_INTER -+ int job = s->pass0_job; -+ int i; -+ HEVCRpiJob * const jb = s->jobs + job; ++ unsigned int i; ++ HEVCRpiJob * const jb = s->jb0; + HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; + HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; + @@ -8196,7 +12118,7 @@ index f9e8ff0..8a3d874 100644 + const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1]; + + rpi_inter_pred_reset(cipe); -+ for(i=0; i < QPU_N_UV;i++) { ++ for (i = 0; i < cipe->n; i++) { + HEVCRpiInterPredQ * const cp = cipe->q + i; + qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; + @@ -8205,9 +12127,9 @@ index f9e8ff0..8a3d874 100644 + u->next_src1.base = 0; + u->pic_cw = pic_width_c; + u->pic_ch = pic_height_c; -+ u->stride2 = rpi_sliced_frame_stride2(s->frame); -+ u->stride1 = s->frame->linesize[1]; -+ u->wdenom = s->sh.chroma_log2_weight_denom + 6; ++ u->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ u->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ u->wdenom = s->sh.chroma_log2_weight_denom; + cp->last_l0 = &u->next_src1; + + u->next_fn = 0; @@ -8216,12 +12138,12 @@ index f9e8ff0..8a3d874 100644 + u->next_src2.base = 0; + cp->last_l1 = &u->next_src2; + -+ *(qpu_mc_pred_c_s_t **)&cp->qpu_mc_curr = u + 1; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + } + + rpi_inter_pred_reset(yipe); -+ for(i=0;i < QPU_N_Y;i++) { -+ HEVCRpiInterPredQ * const yp = s->jobs[job].luma_ip.q + i; ++ for (i = 0; i < yipe->n; i++) { ++ HEVCRpiInterPredQ * const yp = yipe->q + i; + qpu_mc_pred_y_s_t * const y = 
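[Editor's note: rpi_execute_pred_cmds() above walks a tagged-union command list: pass 0 appends one fixed-size record per intra prediction, residual add or PCM block, and pass 1 switches on the tag. The shape of that structure, heavily cut down (the real HEVCPredCmd union appears in the hevcdec.h part of this patch):]

```c
#include <stdint.h>

enum pred_cmd_type { CMD_INTRA, CMD_ADD_RESIDUAL, CMD_ADD_DC };

typedef struct pred_cmd {
    uint8_t type;               /* which union member is live */
    uint8_t size;               /* log2 of the block size     */
    union {
        struct { uint16_t x, y; }                 intra;
        struct { uint8_t *dst; int16_t *coeffs; } res;
        struct { uint8_t *dst; int dc; }          dc;
    };
} pred_cmd;

static void run_pred_cmds(const pred_cmd *c, unsigned n)
{
    for (; n != 0; --n, ++c) {
        switch (c->type) {
        case CMD_INTRA:        /* intra-predict the block at (x, y)  */
            break;
        case CMD_ADD_RESIDUAL: /* add the coefficient block onto dst */
            break;
        case CMD_ADD_DC:       /* add a constant DC term onto dst    */
            break;
        }
    }
}
```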
&yp->qpu_mc_base->y.s; + + y->next_src1.x = 0; @@ -8232,18 +12154,23 @@ index f9e8ff0..8a3d874 100644 + y->next_src2.base = 0; + y->pic_h = pic_height_y; + y->pic_w = pic_width_y; -+ y->stride2 = rpi_sliced_frame_stride2(s->frame); -+ y->stride1 = s->frame->linesize[0]; -+ y->wdenom = s->sh.luma_log2_weight_denom + 6; ++ y->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ y->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ y->wdenom = s->sh.luma_log2_weight_denom; + y->next_fn = 0; + yp->last_l0 = &y->next_src1; + yp->last_l1 = &y->next_src2; + -+ *(qpu_mc_pred_y_s_t **)&yp->qpu_mc_curr = y + 1; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); + } + + s->last_y8_p = NULL; + s->last_y8_l1 = NULL; ++ ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { ++ jb->progress[i] = -1; ++ } ++ +#endif + s->ctu_count = 0; +} @@ -8251,13 +12178,15 @@ index f9e8ff0..8a3d874 100644 + + +#if RPI_INTER -+static unsigned int mc_terminate_add(HEVCContext * const s, ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_qpu(HEVCContext * const s, + const vpu_qpu_job_h vqj, + rpi_cache_flush_env_t * const rfe, + HEVCRpiInterPredEnv * const ipe) +{ + unsigned int i; + uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; ++ unsigned int max_block = 0; + + if (!ipe->used) { + return 0; @@ -8272,18 +12201,20 @@ index f9e8ff0..8a3d874 100644 + HEVCRpiInterPredQ * const yp = ipe->q + i; + qpu_mc_src_t *const p0 = yp->last_l0; + qpu_mc_src_t *const p1 = yp->last_l1; ++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; + -+ ((uint32_t *)yp->qpu_mc_curr)[-1] = yp->code_exit; ++ if (block_size > max_block) ++ max_block = block_size; + -+ av_assert0((char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base <= ipe->q1_size); ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; + + // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched + p0->x = MC_DUMMY_X; + p0->y = MC_DUMMY_Y; -+ p0->base = s->qpu_dummy_frame; ++ p0->base = s->qpu_dummy_frame_qpu; + p1->x = MC_DUMMY_X; + p1->y = MC_DUMMY_Y; -+ p1->base = s->qpu_dummy_frame; ++ p1->base = s->qpu_dummy_frame_qpu; + + yp->last_l0 = NULL; + yp->last_l1 = NULL; @@ -8294,13 +12225,73 @@ index f9e8ff0..8a3d874 100644 + } + +#if RPI_CACHE_UNIF_MVS -+ rpi_cache_flush_add_gm_ptr(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++ // We don't need invalidate here as the uniforms aren't changed by the QPU ++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing ++ // new values which seems to give us a small performance advantage ++ // ++ // In most cases we will not have a completely packed set of uniforms and as ++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the ++ // fullest ++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, ++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, ++ ipe->n, ipe->max_fill + ipe->min_gap); +#endif -+ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, (uint32_t *)mail); ++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); + + return 1; +} ++#endif + ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_emu(HEVCContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ if (!ipe->used) { ++ return 0; ++ } ++ ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t 
*const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_emu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_emu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ } ++ ++ return 1; ++} ++#endif ++ ++ ++#if RPI_QPU_EMU_Y ++#define mc_terminate_add_y mc_terminate_add_emu ++#else ++#define mc_terminate_add_y mc_terminate_add_qpu ++#endif ++#if RPI_QPU_EMU_C ++#define mc_terminate_add_c mc_terminate_add_emu ++#else ++#define mc_terminate_add_c mc_terminate_add_qpu ++#endif +#endif + +#ifdef RPI @@ -8322,47 +12313,33 @@ index f9e8ff0..8a3d874 100644 +#endif + vpu_qpu_wait_h sync_y; + -+ const int job = s->pass1_job; -+ unsigned int flush_start = 0; -+ unsigned int flush_count = 0; ++ HEVCRpiJob * const jb = s->jb1; ++ int pred_y, pred_c; + + const vpu_qpu_job_h vqj = vpu_qpu_job_new(); + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); + -+ if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) { -+ vpu_qpu_job_add_vpu(vqj, -+ vpu_get_fn(), -+ vpu_get_constants(), -+ s->coeffs_buf_vc[job][2], -+ s->num_coeffs[job][2] >> 8, -+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], -+ s->num_coeffs[job][3] >> 10, -+ 0); -+ -+ rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+ } -+ -+ -+#if RPI_INTER + { -+ int (*d)[2] = s->dblk_cmds[job]; -+ unsigned int high=(*d)[1]; -+ int n; ++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ if (cf->s[3].n + cf->s[2].n != 0) ++ { ++ const unsigned int csize = sizeof(cf->s[3].buf[0]); ++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; ++ vpu_qpu_job_add_vpu(vqj, ++ vpu_get_fn(s->ps.sps->bit_depth), ++ vpu_get_constants(), ++ cf->gptr.vc, ++ cf->s[2].n >> 8, ++ cf->gptr.vc + offset32, ++ cf->s[3].n >> 10, ++ 0); + -+ flush_start = high; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) { -+ unsigned int y = (*d)[1]; -+ flush_start = FFMIN(flush_start, y); -+ high=FFMAX(high,y); ++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); ++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); + } -+ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->ps.sps->height) - flush_start; + } + -+ if (mc_terminate_add(s, vqj, rfe, &s->jobs[job].chroma_ip) != 0) -+ { -+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ 0, flush_start, s->ps.sps->width, flush_count, s->ps.sps->vshift[1], 0, 1); -+ } ++ pred_c = mc_terminate_add_c(s, vqj, rfe, &jb->chroma_ip); + +// We can take a sync here and try to locally overlap QPU processing with ARM +// but testing showed a slightly negative benefit with noticable extra complexity @@ -8370,25 +12347,109 @@ index f9e8ff0..8a3d874 100644 + vpu_qpu_job_add_sync_this(vqj, &sync_c); +#endif + -+ if (mc_terminate_add(s, vqj, rfe, &s->jobs[job].luma_ip) != 0) -+ { -+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ 0, flush_start, s->ps.sps->width, flush_count, s->ps.sps->vshift[1], 1, 0); -+ } -+#endif ++ pred_y = mc_terminate_add_y(s, vqj, rfe, &jb->luma_ip); + + vpu_qpu_job_add_sync_this(vqj, &sync_y); + ++ ++ // We are expecting a contiguous Z-shaped set of blocks ++ // So 
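[Editor's note: the offset32 computation above implies the two VPU transform sizes share one GPU buffer: 16x16 coefficient blocks fill it from the bottom while 32x32 blocks stack down from the top, so a single allocation serves both without guessing the split point. A sketch of such a two-ended allocator; this is my reading of the layout, not code from the patch:]

```c
#include <stddef.h>
#include <stdint.h>

typedef struct coeff_buf {
    int16_t *base;      /* start of the shared buffer   */
    size_t   cap;       /* capacity in coefficients     */
    size_t   n_lo;      /* used from the bottom (16x16) */
    size_t   n_hi;      /* used from the top (32x32)    */
} coeff_buf;

static int16_t *alloc_lo(coeff_buf *b, size_t n)
{
    if (b->n_lo + b->n_hi + n > b->cap)
        return NULL;                    /* full: flush the job first */
    int16_t *p = b->base + b->n_lo;
    b->n_lo += n;
    return p;
}

static int16_t *alloc_hi(coeff_buf *b, size_t n)
{
    if (b->n_lo + b->n_hi + n > b->cap)
        return NULL;
    b->n_hi += n;
    return b->base + b->cap - b->n_hi;  /* grows downward from the end */
}
```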
generate up to 3 blocks: ++ // 1st line ++ // body ++ // last line ++ // This will work even if we don't have the expected geometry ++ if (pred_y || pred_c) ++ { ++ const HEVCRpiDeblkEnv *const de = &jb->deblk; ++ const HEVCRpiDeblkBlk * db = de->blks + 0; ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ unsigned int x0 = db->x_ctb; ++ unsigned int xx = x0 + ctb_size; ++ unsigned int y0 = db->y_ctb; ++ ++ unsigned int blks_tlbr[3][4] = {{~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}}; ++ unsigned int b = 0; ++ unsigned int i; ++ ++ for (i = 1, ++db; i < de->n; ++i, ++db) ++ { ++ if (db->x_ctb == xx && db->y_ctb == y0) { ++ xx += ctb_size; ++ } ++ else ++ { ++ unsigned int * const tlbr = blks_tlbr[b]; ++ if (tlbr[0] > y0) ++ tlbr[0] = y0; ++ if (tlbr[1] > x0) ++ tlbr[1] = x0; ++ if (tlbr[2] < y0 + ctb_size) ++ tlbr[2] = y0 + ctb_size; ++ if (tlbr[3] < xx) ++ tlbr[3] = xx; ++ x0 = db->x_ctb; ++ xx = x0 + ctb_size; ++ y0 = db->y_ctb; ++ b = 1; ++ } ++ } ++ ++ if (blks_tlbr[b][0] != ~0U) ++ ++b; ++ ++ { ++ unsigned int * const tlbr = blks_tlbr[b]; ++ tlbr[0] = y0; ++ tlbr[1] = x0; ++ tlbr[2] = y0 + ctb_size; ++ tlbr[3] = xx; ++ } ++ ++ // ??? Coalesce blocks ??? ++ for (i = 0; i <= b; ++i) { ++ const unsigned int * const tlbr = blks_tlbr[i]; ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE, ++ tlbr[1], tlbr[0], tlbr[3] - tlbr[1], tlbr[2] - tlbr[0], s->ps.sps->vshift[1], pred_y, pred_c); ++ } ++ } ++ ++ + // Having accumulated some commands - do them + rpi_cache_flush_finish(rfe); ++ ++ // Await progress as required ++ { ++ unsigned int i; ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { ++ if (jb->progress[i] >= 0) { ++ ff_hevc_progress_wait_recon(s, jb, s->DPB + i, jb->progress[i]); ++ } ++ } ++ } ++ + vpu_qpu_job_finish(vqj); + -+ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); ++ worker_pic_reset(&jb->coeffs); + -+ // We would do ARM inter prediction here but no longer -+ // Look back in git if you find you want it back - As we have -+ // no arm/neon sand pred code there doesn't seem a lot of point -+ // keeping it around ++ // If we have emulated VPU ops - do it here ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ if (av_rpi_is_sand8_frame(s->frame)) ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ rpi_shader_c8(s, &jb->luma_ip, NULL); ++#else ++ rpi_shader_c8(s, NULL, &jb->chroma_ip); ++#endif ++ else ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ rpi_shader_c16(s, &jb->luma_ip, NULL); ++#else ++ rpi_shader_c16(s, NULL, &jb->chroma_ip); ++#endif ++#endif + +#if RPI_OPT_SEP_PRED + // Wait for transform completion @@ -8416,6 +12477,9 @@ index f9e8ff0..8a3d874 100644 + +static void rpi_do_all_passes(HEVCContext *s) +{ ++ // Called from main thread - must be no pending background jobs ++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); ++ + // Do the various passes - common with the worker code + worker_core(s); + // Prepare next batch @@ -8428,68 +12492,66 @@ index f9e8ff0..8a3d874 100644 static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) { HEVCContext *s = avctxt->priv_data; -@@ -2315,6 +3805,18 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2336,6 +4170,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) int y_ctb = 0; int ctb_addr_ts = 
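[Editor's note: the cache-invalidate pass above walks the recorded CTBs expecting a Z-shaped scan and closes a rectangle whenever the run stops continuing at x + ctb_size on the same row; the patch then folds the middle rows into one box, giving at most three. The core row-run accumulation, sketched (n must be at least 1):]

```c
typedef struct rect { unsigned top, left, bottom, right; } rect;

/* Scan (x, y) CTB coordinates in decode order; each contiguous
 * same-row run becomes one rectangle. Returns the number of rects
 * written (runs beyond max_out are dropped in this sketch). */
static unsigned coalesce_ctb_runs(const unsigned (*ctb)[2], unsigned n,
                                  unsigned ctb_size,
                                  rect *out, unsigned max_out)
{
    unsigned nr = 0;
    unsigned x0 = ctb[0][0], xx = x0 + ctb_size, y0 = ctb[0][1];

    for (unsigned i = 1; i != n; ++i) {
        if (ctb[i][0] == xx && ctb[i][1] == y0) {
            xx += ctb_size;                 /* run continues rightwards */
        } else {
            if (nr < max_out)               /* close the current strip  */
                out[nr++] = (rect){ y0, x0, y0 + ctb_size, xx };
            x0 = ctb[i][0];
            xx = x0 + ctb_size;
            y0 = ctb[i][1];
        }
    }
    if (nr < max_out)
        out[nr++] = (rect){ y0, x0, y0 + ctb_size, xx };
    return nr;
}
```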
s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; +#ifdef RPI -+ s->enable_rpi = s->ps.sps->bit_depth == 8 && -+ s->frame->format == AV_PIX_FMT_SAND128 && -+ !s->ps.pps->cross_component_prediction_enabled_flag; -+ -+ if (!s->enable_rpi) { -+ if (s->ps.pps->cross_component_prediction_enabled_flag) -+ printf("Cross component\n"); -+ } ++ // * We don't support cross_component_prediction_enabled_flag but as that ++ // must be 0 unless we have 4:4:4 there is no point testing for it as we ++ // only deal with sand which is never 4:4:4 ++ // [support wouldn't be hard] ++ s->enable_rpi = ++ ((s->ps.sps->bit_depth == 8 && s->frame->format == AV_PIX_FMT_SAND128) || ++ (s->ps.sps->bit_depth == 10 && s->frame->format == AV_PIX_FMT_SAND64_10)); +#endif + //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]); + if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); return AVERROR_INVALIDDATA; -@@ -2328,6 +3830,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2349,8 +4194,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) } } -+#ifdef RPI_WORKER -+ s->pass0_job = 0; -+ s->pass1_job = 0; -+#endif +#ifdef RPI ++ // Worker must be idle at start ++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); + rpi_begin(s); +#endif + while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { - int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; +- int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -@@ -2335,6 +3845,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + x_ctb = (ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; - hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); - -+ - ff_hevc_cabac_init(s, ctb_addr_ts); - - hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); -@@ -2344,6 +3855,49 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2365,6 +4216,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); + +#ifdef RPI ++ // Report progress so we can use our MVs in other frames ++ // If we are tiled then this isn't really optimal but given that tiling ++ // can change on a per pic basis (described in PPS) other schemes are ++ // quite a lot harder ++ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) { ++ ff_hevc_progress_signal_mv(s, y_ctb + ctb_size - 1); ++ } ++ + if (s->enable_rpi) { -+ int q_full = (s->ctu_count >= s->max_ctu_count); ++ int q_full = (++s->ctu_count >= s->max_ctu_count); + -+ if (rpi_inter_pred_next_ctu(&s->jobs[s->pass0_job].luma_ip) != 0) ++ if (rpi_inter_pred_next_ctu(&s->jb0->luma_ip) != 0) + q_full = 1; -+ if (rpi_inter_pred_next_ctu(&s->jobs[s->pass0_job].chroma_ip) != 0) ++ if (rpi_inter_pred_next_ctu(&s->jb0->chroma_ip) != 0) + q_full = 1; + -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; -+ s->ctu_count++; ++ 
s->jb0->deblk.blks[s->jb0->deblk.n].x_ctb = x_ctb; ++ s->jb0->deblk.blks[s->jb0->deblk.n++].y_ctb = y_ctb; + + if (q_full) { -+#ifdef RPI_WORKER + if (s->used_for_ref) + { +// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); @@ -8508,9 +12570,6 @@ index f9e8ff0..8a3d874 100644 + // Non-ref frame so do it all on this thread + rpi_do_all_passes(s); + } -+#else -+ rpi_do_all_passes(s); -+#endif + } + + } @@ -8520,7 +12579,7 @@ index f9e8ff0..8a3d874 100644 if (more_data < 0) { s->tab_slice_address[ctb_addr_rs] = -1; return more_data; -@@ -2352,9 +3906,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2373,9 +4270,40 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ctb_addr_ts++; ff_hevc_save_states(s, ctb_addr_ts); @@ -8533,12 +12592,10 @@ index f9e8ff0..8a3d874 100644 +#ifdef RPI + -+#ifdef RPI_WORKER + // Wait for the worker to finish all its jobs + if (s->enable_rpi) { + worker_wait(s); + } -+#endif + + // Finish off any half-completed rows + if (s->enable_rpi && s->ctu_count) { @@ -8563,7 +12620,7 @@ index f9e8ff0..8a3d874 100644 if (x_ctb + ctb_size >= s->ps.sps->width && y_ctb + ctb_size >= s->ps.sps->height) ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); -@@ -2389,6 +3976,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int +@@ -2410,6 +4338,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int s = s1->sList[self_id]; lc = s->HEVClc; @@ -8575,7 +12632,7 @@ index f9e8ff0..8a3d874 100644 if(ctb_row) { ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); -@@ -2771,6 +4363,33 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) +@@ -2792,6 +4725,33 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) if (ret < 0) return ret; @@ -8609,44 +12666,77 @@ index f9e8ff0..8a3d874 100644 if (s->sh.first_slice_in_pic_flag) { if (s->max_ra == INT_MAX) { if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { -@@ -2894,10 +4513,19 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) +@@ -2915,10 +4875,25 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) } } -fail: - if (s->ref && s->threads_type == FF_THREAD_FRAME) -+fail: // Also success path -+ if (s->ref && s->threads_type == FF_THREAD_FRAME) { -+#if RPI_INTER -+ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); -+#endif - ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); +- ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); - -+ } -+#if RPI_INTER -+ else if (s->ref && s->enable_rpi) { -+ // When running single threaded we need to flush the whole frame -+ flush_frame(s,s->frame); -+ } ++fail: // Also success path ++ if (s->ref != NULL) { ++ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) { ++#ifdef RPI ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); +#endif ++ ff_hevc_progress_signal_all_done(s); ++ } ++#ifdef RPI ++ // * Flush frame will become confused if we pass it something ++ // that doesn't have an expected number of planes (e.g. 400) ++ // So only flush if we are sure we can. 
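[Editor's note: the q_full submission path above is one side of a two-semaphore pipeline: the CABAC thread fills a job slot and posts its sem_in, the worker runs pass 1 and posts sem_out, and a slot cannot be refilled until its sem_out has been taken back. A self-contained sketch of that handshake; termination here uses a sentinel value where the patch uses a terminate flag:]

```c
#include <pthread.h>
#include <semaphore.h>

#define NSLOTS 2

typedef struct slot {
    sem_t in, out;
    int   work;                 /* stands in for the real job state */
} slot;

static slot slots[NSLOTS];

static void slots_init(void)
{
    for (int i = 0; i < NSLOTS; i++) {
        sem_init(&slots[i].in, 0, 0);
        sem_init(&slots[i].out, 0, 0);
        slots[i].work = 0;
    }
}

static void *worker(void *arg)
{
    (void)arg;
    for (unsigned i = 0;; i = (i + 1) % NSLOTS) {
        sem_wait(&slots[i].in);         /* wait for a submitted job */
        if (slots[i].work < 0)
            return NULL;                /* sentinel: shut down      */
        /* ... pass-1 pixel work on slots[i] would run here ...     */
        sem_post(&slots[i].out);        /* hand the slot back       */
    }
}

/* Producer side: submit into slot i; reclaim before reusing it. */
static void submit(unsigned i, int work)
{
    slots[i].work = work;
    sem_post(&slots[i].in);
}

static void reclaim(unsigned i)
{
    sem_wait(&slots[i].out);
}
```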
++ else if (s->enable_rpi) { ++ // Flush frame to real memory as we expect to be able to pass ++ // it straight on to mmal ++ flush_frame(s, s->frame); ++ } ++#endif ++ } return ret; } -@@ -3150,6 +4778,48 @@ fail: +@@ -3171,6 +5146,83 @@ fail: return AVERROR(ENOMEM); } -+#ifdef RPI_WORKER -+static av_cold void hevc_init_worker(HEVCContext *s) ++#ifdef RPI ++static av_cold void hevc_init_worker(HEVCContext * const s) +{ + int err; -+ pthread_cond_init(&s->worker_cond_head, NULL); -+ pthread_cond_init(&s->worker_cond_tail, NULL); -+ pthread_mutex_init(&s->worker_mutex, NULL); + -+ s->worker_tail=0; -+ s->worker_head=0; -+ s->kill_worker=0; ++ ++ memset(s->jobs, 0, sizeof(s->jobs)); ++ ++ for (unsigned int job = 0; job < RPI_MAX_JOBS; job++) { ++ HEVCRpiJob * const jb = s->jobs + job; ++ ++ sem_init(&jb->sem_in, 0, 0); ++ sem_init(&jb->sem_out, 0, 0); ++ ff_hevc_rpi_progress_init_wait(&jb->progress_wait); ++ ++ jb->intra.n = 0; ++ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS); ++ ++ // ** Sizeof the union structure might be overkill but at the moment it ++ // is correct (it certainly isn't going to be too small) ++ ++ rpi_inter_pred_alloc(&jb->chroma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t), ++ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t)); ++ rpi_inter_pred_alloc(&jb->luma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t), ++ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t)); ++ ++ jb->deblk.n = 0; ++ jb->deblk.blks = av_malloc(sizeof(jb->deblk.blks[0]) * RPI_MAX_DEBLOCK_CMDS); ++ } ++ s->pass0_job = 0; ++ s->pass1_job = 0; ++ s->jb0 = s->jobs + 0; ++ s->jb1 = s->jobs + 0; ++ + err = pthread_create(&s->worker_thread, NULL, worker_start, s); + if (err) { + printf("Failed to create worker thread\n"); + exit(-1); + } +} + -+static av_cold void hevc_exit_worker(HEVCContext *s) -+{ -+ void *res; -+ s->kill_worker=1; -+ pthread_cond_broadcast(&s->worker_cond_tail); -+ pthread_join(s->worker_thread, &res); -+ -+ pthread_cond_destroy(&s->worker_cond_head); -+ pthread_cond_destroy(&s->worker_cond_tail); -+ pthread_mutex_destroy(&s->worker_mutex); -+ -+ s->worker_tail=0; -+ s->worker_head=0; -+ s->kill_worker=0; -+} -+ +static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) +{ + av_freep(&ipe->q); + gpu_free(&ipe->gptr); +} + ++static av_cold void hevc_exit_worker(HEVCContext *s) ++{ ++ void *res; ++ unsigned int i; ++ ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ s->jobs[i].terminate = 1; ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ sem_post(&s->jobs[i].sem_in); ++ pthread_join(s->worker_thread, &res); ++ ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ { ++ HEVCRpiJob * const jb = s->jobs + i; ++ ++ sem_destroy(&jb->sem_in); ++ sem_destroy(&jb->sem_out); ++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); ++ av_freep(&jb->intra.cmds); ++ av_freep(&jb->deblk.blks); ++ rpi_free_inter_pred(&jb->chroma_ip); ++ rpi_free_inter_pred(&jb->luma_ip); ++ } ++} ++ +#endif + static av_cold int hevc_decode_free(AVCodecContext *avctx) { HEVCContext *s = avctx->priv_data; @@ -3182,10 +5234,19 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) av_freep(&s->cabac_state); +- for (i = 0; i < 3; i++) { +- av_freep(&s->sao_pixel_buffer_h[i]); +- av_freep(&s->sao_pixel_buffer_v[i]); +#ifdef RPI + -+#ifdef RPI_WORKER + hevc_exit_worker(s); -+#endif -+ -+ for(i=0;i<RPI_MAX_JOBS;i++) { -+ av_freep(&s->univ_pred_cmds[i]); -+ -+#if RPI_INTER -+ 
rpi_free_inter_pred(&s->jobs[i].chroma_ip); -+ rpi_free_inter_pred(&s->jobs[i].luma_ip); -+#endif -+ } -+ + vpu_qpu_term(); ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_kill_state(s->progress_states + i); + } + + av_rpi_zc_uninit(avctx); +#endif + - for (i = 0; i < 3; i++) { - av_freep(&s->sao_pixel_buffer_h[i]); - av_freep(&s->sao_pixel_buffer_v[i]); -@@ -3202,10 +4893,14 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) ++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] ++ av_freep(&s->sao_pixel_buffer_v[0]); + av_frame_free(&s->output_frame); + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { +@@ -3223,6 +5284,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) return 0; } @@ -8717,14 +12811,7 @@ index f9e8ff0..8a3d874 100644 static av_cold int hevc_init_context(AVCodecContext *avctx) { HEVCContext *s = avctx->priv_data; - int i; -+#ifdef RPI -+ unsigned int job; -+#endif - - s->avctx = avctx; - -@@ -3215,6 +4910,59 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) +@@ -3236,6 +5298,37 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) s->HEVClcList[0] = s->HEVClc; s->sList[0] = s; @@ -8738,53 +12825,39 @@ index f9e8ff0..8a3d874 100644 + if (vpu_qpu_init() != 0) + goto fail; + -+ for(job = 0; job < RPI_MAX_JOBS; job++) { -+ s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS); -+ if (!s->univ_pred_cmds[job]) -+ goto fail; -+ } -+ +#if RPI_INTER -+ -+ for (job = 0; job < RPI_MAX_JOBS; job++) { -+ HEVCRpiJob * const jb = s->jobs + job; -+ // ** Sizeof the union structure might be overkill but at the moment it -+ // is correct (it certainly isn't going to be too samll) -+ -+ rpi_alloc_inter_pred(&jb->chroma_ip, -+ QPU_N_UV, QPU_N_GRP_UV, -+ UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), -+ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t), -+ inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu); -+ rpi_alloc_inter_pred(&jb->luma_ip, -+ QPU_N_Y, QPU_N_GRP_Y, -+ Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), -+ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t), -+ inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu); ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ { ++ static const uint32_t dframe[1] = {0x80808080}; ++ s->qpu_dummy_frame_emu = (const uint8_t *)dframe; + } -+ -+ s->qpu_filter_uv = qpu_fn(mc_filter_uv); -+ s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0); -+ s->qpu_dummy_frame = qpu_fn(mc_start); // Use our code as a dummy frame -+ s->qpu_filter = qpu_fn(mc_filter); -+ s->qpu_filter_y_p00 = qpu_fn(mc_filter_y_p00); -+ s->qpu_filter_y_b00 = qpu_fn(mc_filter_y_b00); -+ s->qpu_filter_b = qpu_fn(mc_filter_b); ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ s->qpu_dummy_frame_qpu = qpu_fn(mc_start); // Use our code as a dummy frame ++#endif +#endif + //gpu_malloc_uncached(2048*64,&s->dummy); + + s->enable_rpi = 0; + -+#ifdef RPI_WORKER ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_init_state(s->progress_states + i); ++ } + hevc_init_worker(s); +#endif -+ -+#endif + s->cabac_state = av_malloc(HEVC_CONTEXTS); if (!s->cabac_state) goto fail; -@@ -3357,9 +5105,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) +@@ -3249,6 +5342,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) + if (!s->DPB[i].frame) + goto fail; + s->DPB[i].tf.f = s->DPB[i].frame; ++ s->DPB[i].dpb_no = i; + } + + s->max_ra = INT_MAX; +@@ -3378,9 +5472,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) } if((avctx->active_thread_type & FF_THREAD_FRAME) && 
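[Editor's note: hevc_init_context() above brings the RPi state up in layers (zero-copy AVContext hooks, then the VPU/QPU services, then the per-field progress state and the worker), and hevc_decode_free() tears them down in reverse. For illustration only, the usual ffmpeg-style goto-fail shape for layered init of this kind, with stubs standing in for av_rpi_zc_init(), vpu_qpu_init() and hevc_init_worker():]

```c
/* Illustrative stubs; each init returns 0 on success. */
static int  zc_init(void)     { return 0; }
static void zc_uninit(void)   {}
static int  vpu_init(void)    { return 0; }
static void vpu_term(void)    {}
static int  worker_init(void) { return 0; }

static int rpi_ctx_init(void)
{
    if (zc_init() < 0)
        goto fail0;
    if (vpu_init() != 0)
        goto fail1;
    if (worker_init() != 0)
        goto fail2;
    return 0;

fail2:
    vpu_term();                 /* unwind in reverse order */
fail1:
    zc_uninit();
fail0:
    return -1;
}
```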
avctx->thread_count > 1) @@ -8797,7 +12870,7 @@ index f9e8ff0..8a3d874 100644 return 0; } -@@ -3418,6 +5166,8 @@ AVCodec ff_hevc_decoder = { +@@ -3439,6 +5533,8 @@ AVCodec ff_hevc_decoder = { .update_thread_context = hevc_update_thread_context, .init_thread_copy = hevc_init_thread_copy, .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | @@ -8807,7 +12880,7 @@ index f9e8ff0..8a3d874 100644 .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE, .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), diff --git a/libavcodec/hevcdec.h b/libavcodec/hevcdec.h -index 0c78812..c268d39 100644 +index 0c7881286c..70394aab76 100644 --- a/libavcodec/hevcdec.h +++ b/libavcodec/hevcdec.h @@ -334,17 +334,6 @@ typedef struct CodingUnit { @@ -8828,11 +12901,17 @@ index 0c78812..c268d39 100644 typedef struct NeighbourAvailable { int cand_bottom_left; int cand_left; -@@ -421,7 +410,17 @@ typedef struct HEVCFrame { +@@ -419,9 +408,23 @@ typedef struct HEVCFrame { + * A combination of HEVC_FRAME_FLAG_* + */ uint8_t flags; ++ ++ // Entry no in DPB - can be used as a small unique ++ // frame identifier (within the current thread) ++ uint8_t dpb_no; } HEVCFrame; -+#ifdef RPI_WORKER ++#ifdef RPI +typedef struct HEVCLocalContextIntra { + TransformUnit tu; + NeighbourAvailable na; @@ -8846,7 +12925,7 @@ index 0c78812..c268d39 100644 uint8_t cabac_state[HEVC_CONTEXTS]; uint8_t stat_coeff[4]; -@@ -436,8 +435,6 @@ typedef struct HEVCLocalContext { +@@ -436,8 +439,6 @@ typedef struct HEVCLocalContext { int qPy_pred; @@ -8855,7 +12934,7 @@ index 0c78812..c268d39 100644 uint8_t ctb_left_flag; uint8_t ctb_up_flag; uint8_t ctb_up_right_flag; -@@ -453,7 +450,6 @@ typedef struct HEVCLocalContext { +@@ -453,7 +454,6 @@ typedef struct HEVCLocalContext { int ct_depth; CodingUnit cu; PredictionUnit pu; @@ -8863,7 +12942,7 @@ index 0c78812..c268d39 100644 #define BOUNDARY_LEFT_SLICE (1 << 0) #define BOUNDARY_LEFT_TILE (1 << 1) -@@ -464,6 +460,149 @@ typedef struct HEVCLocalContext { +@@ -464,6 +464,207 @@ typedef struct HEVCLocalContext { int boundary_flags; } HEVCLocalContext; @@ -8874,6 +12953,7 @@ index 0c78812..c268d39 100644 +// but allocate more memory and increase the latency before data in the next frame can be processed +#define RPI_NUM_CHUNKS 4 +#define RPI_CHUNK_SIZE 12 ++#define RPI_ROUND_TO_LINES 0 + +// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code +#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE) @@ -8920,6 +13000,9 @@ index 0c78812..c268d39 100644 + RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx + RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx + RPI_PRED_ADD_RESIDUAL_C, // Merged U+V ++ RPI_PRED_ADD_DC, ++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C ++ RPI_PRED_ADD_DC_V, + RPI_PRED_INTRA, + RPI_PRED_I_PCM, + RPI_PRED_CMD_MAX @@ -8934,8 +13017,14 @@ index 0c78812..c268d39 100644 + struct { // TRANSFORM_ADD + uint8_t * dst; + const int16_t * buf; -+ uint32_t stride; ++ uint16_t stride; // Should be good enough for all pic fmts we use ++ int16_t dc; + } ta; ++ struct { ++ uint8_t * dst; ++ uint32_t stride; ++ int dc; ++ } dc; + struct { // INTRA + uint16_t x; + uint16_t y; @@ -8953,6 +13042,7 @@ index 0c78812..c268d39 100644 +#endif + +#ifdef RPI ++#include + +union qpu_mc_pred_cmd_s; +struct qpu_mc_pred_y_p_s; @@ -8979,13 +13069,60 @@ index 0c78812..c268d39 100644 + int used; // 0 if nothing in any Q, 1 otherwise + int used_grp; // 0 if nothing in any Q in the current group + unsigned int max_fill; ++ unsigned int min_gap; + GPU_MEM_PTR_T gptr; -+ 
unsigned int q1_size; // size of 1 uniform Q +} HEVCRpiInterPredEnv; + ++typedef struct HEVCRpiIntraPredEnv { ++ unsigned int n; // Number of commands ++ HEVCPredCmd * cmds; ++} HEVCRpiIntraPredEnv; ++ ++typedef struct HEVCRpiCeoffEnv { ++ unsigned int n; ++ uint16_t * buf; ++} HEVCRpiCoeffEnv; ++ ++typedef struct HEVCRpiCeoffsEnv { ++ HEVCRpiCoeffEnv s[4]; ++ GPU_MEM_PTR_T gptr; ++ void * mptr; ++} HEVCRpiCoeffsEnv; ++ ++typedef struct HEVCRpiDeblkBlk { ++ uint16_t x_ctb; ++ uint16_t y_ctb; ++} HEVCRpiDeblkBlk; ++ ++typedef struct HEVCRpiDeblkEnv { ++ unsigned int n; ++ HEVCRpiDeblkBlk * blks; ++} HEVCRpiDeblkEnv; ++ ++typedef struct HEVCRPiFrameProgressWait { ++ int req; ++ struct HEVCRPiFrameProgressWait * next; ++ sem_t sem; ++} HEVCRPiFrameProgressWait; ++ ++typedef struct HEVCRPiFrameProgressState { ++ struct HEVCRPiFrameProgressWait * first; ++ struct HEVCRPiFrameProgressWait * last; ++ pthread_mutex_t lock; ++} HEVCRPiFrameProgressState; ++ +typedef struct HEVCRpiJob { ++ volatile int terminate; ++ int pending; ++ sem_t sem_in; // set by main ++ sem_t sem_out; // set by worker + HEVCRpiInterPredEnv chroma_ip; + HEVCRpiInterPredEnv luma_ip; ++ int16_t progress[32]; // index by dpb_no ++ HEVCRpiIntraPredEnv intra; ++ HEVCRpiCoeffsEnv coeffs; ++ HEVCRpiDeblkEnv deblk; ++ HEVCRPiFrameProgressWait progress_wait; +} HEVCRpiJob; + +#if RPI_TSTATS @@ -9013,43 +13150,20 @@ index 0c78812..c268d39 100644 typedef struct HEVCContext { const AVClass *c; // needed by private avoptions AVCodecContext *avctx; -@@ -472,6 +611,9 @@ typedef struct HEVCContext { - - HEVCLocalContext *HEVClcList[MAX_NB_THREADS]; - HEVCLocalContext *HEVClc; -+#ifdef RPI_WORKER -+ HEVCLocalContextIntra HEVClcIntra; -+#endif - - uint8_t threads_type; - uint8_t threads_number; -@@ -479,6 +621,90 @@ typedef struct HEVCContext { +@@ -479,6 +680,69 @@ typedef struct HEVCContext { int width; int height; + int used_for_ref; // rpi +#ifdef RPI + int enable_rpi; -+ HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS]; -+ int buf_width; -+ GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS]; -+ GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS]; -+ int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4]; -+ unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4]; -+ int num_coeffs[RPI_MAX_JOBS][4]; -+ int num_xfm_cmds[RPI_MAX_JOBS]; -+ int num_mv_cmds_y[RPI_MAX_JOBS]; -+ int num_mv_cmds_c[RPI_MAX_JOBS]; -+ int num_pred_cmds[RPI_MAX_JOBS]; -+ int num_dblk_cmds[RPI_MAX_JOBS]; -+ int vpu_id; -+ int pass0_job; // Pass0 does coefficient decode -+ int pass1_job; // Pass1 does pixel processing ++ unsigned int pass0_job; // Pass0 does coefficient decode ++ unsigned int pass1_job; // Pass1 does pixel processing + int ctu_count; // Number of CTUs done in pass0 so far + int max_ctu_count; // Number of CTUs when we trigger a round of processing -+ int ctu_per_y_chan; // Number of CTUs per luma QPU -+ int ctu_per_uv_chan; // Number of CTUs per chroma QPU + ++ HEVCRpiJob * jb0; ++ HEVCRpiJob * jb1; + HEVCRpiJob jobs[RPI_MAX_JOBS]; +#if RPI_TSTATS + HEVCRpiStats tstats; @@ -9059,29 +13173,19 @@ index 0c78812..c268d39 100644 + struct qpu_mc_src_s * last_y8_l1; + + // Function pointers -+ uint32_t qpu_filter_uv; -+ uint32_t qpu_filter_uv_b0; -+ uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory -+ uint32_t qpu_filter; -+ uint32_t qpu_filter_b; -+ uint32_t qpu_filter_y_p00; -+ uint32_t qpu_filter_y_b00; ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ const uint8_t * qpu_dummy_frame_emu; ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ uint32_t qpu_dummy_frame_qpu; // Not a frame - 
just a bit of memory ++#endif ++ HEVCRpiQpu qpu; +#endif + -+#ifdef RPI_WORKER + pthread_t worker_thread; -+ pthread_cond_t worker_cond_head; -+ pthread_cond_t worker_cond_tail; -+ pthread_mutex_t worker_mutex; -+ -+ int worker_tail; // Contains the number of posted jobs -+ int worker_head; // Contains the number of completed jobs -+ int kill_worker; // set to 1 to terminate the worker -+#endif -+ -+#define RPI_DEBLOCK_VPU_Q_COUNT 2 + +#ifdef RPI_DEBLOCK_VPU ++#define RPI_DEBLOCK_VPU_Q_COUNT 2 + int enable_rpi_deblock; + + int uv_setup_width; @@ -9109,22 +13213,22 @@ index 0c78812..c268d39 100644 + unsigned int dvq_n; + +#endif ++ HEVCLocalContextIntra HEVClcIntra; ++ HEVCRPiFrameProgressState progress_states[2]; +#endif + uint8_t *cabac_state; /** 1 if the independent slice segment header was successfully parsed */ -@@ -596,6 +822,9 @@ typedef struct HEVCContext { +@@ -595,7 +859,6 @@ typedef struct HEVCContext { + uint16_t white_point[2]; uint32_t max_mastering_luminance; uint32_t min_mastering_luminance; - -+#ifdef RPI -+ int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2]; -+#endif +- } HEVCContext; int ff_hevc_decode_nal_sei(HEVCContext *s); -@@ -703,6 +932,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -703,6 +966,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); @@ -9136,7 +13240,7 @@ index 0c78812..c268d39 100644 /** * Reset SEI values that are stored on the Context. * e.g. Caption data that was extracted during NAL -@@ -716,4 +950,15 @@ extern const uint8_t ff_hevc_qpel_extra_before[4]; +@@ -716,4 +984,89 @@ extern const uint8_t ff_hevc_qpel_extra_before[4]; extern const uint8_t ff_hevc_qpel_extra_after[4]; extern const uint8_t ff_hevc_qpel_extra[4]; @@ -9149,11 +13253,85 @@ index 0c78812..c268d39 100644 +extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); +#endif + ++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int val, const int field); ++ ++void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field); ++ ++// All of these expect that s->threads_type == FF_THREAD_FRAME ++ ++static inline void ff_hevc_progress_wait_mv(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int y) ++{ ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); ++ else ++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); ++} ++ ++static inline void ff_hevc_progress_signal_mv(HEVCContext * const s, const int y) ++{ ++ if (s->enable_rpi && s->used_for_ref) ++ ff_hevc_rpi_progress_signal_field(s, y, 1); ++} ++ ++static inline void ff_hevc_progress_wait_recon(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int y) ++{ ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); ++ else ++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); ++} ++ ++static inline void ff_hevc_progress_signal_recon(HEVCContext * const s, const int y) ++{ ++ if (s->used_for_ref) ++ { ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_signal_field(s, y, 0); ++ else ++ ff_thread_report_progress(&s->ref->tf, y, 0); ++ } ++} ++ ++static inline void ff_hevc_progress_signal_all_done(HEVCContext * const s) ++{ ++ if (s->enable_rpi) ++ { ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); ++ } ++ else ++ 
ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); ++} ++ ++#else ++ ++// Use #define as that allows us to discard "jb" which won't exist in non-RPI world ++#define ff_hevc_progress_wait_mv(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) ++#define ff_hevc_progress_wait_recon(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) ++#define ff_hevc_progress_signal_mv(s, y) ++#define ff_hevc_progress_signal_recon(s, y) ff_thread_report_progress(&s->ref->tf, y, 0) ++#define ff_hevc_progress_signal_all_done(s) ff_thread_report_progress(&s->ref->tf, INT_MAX, 0) ++ +#endif ++ ++// Set all done - signal nothing (used in missing refs) ++// Works for both rpi & non-rpi ++static inline void ff_hevc_progress_set_all_done(HEVCFrame * const ref) ++{ ++ if (ref->tf.progress != NULL) ++ { ++ int * const p = (int *)&ref->tf.progress->data; ++ p[0] = INT_MAX; ++ p[1] = INT_MAX; ++ } ++} + #endif /* AVCODEC_HEVCDEC_H */ diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c -index 23e923f..c4f1a6c 100644 +index 23e923f8e5..82009c4ed4 100644 --- a/libavcodec/hevcdsp.c +++ b/libavcodec/hevcdsp.c @@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { @@ -9277,13 +13455,14 @@ index 23e923f..c4f1a6c 100644 void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) { #undef FUNC -@@ -193,12 +307,38 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +@@ -193,12 +307,54 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) +#if !RPI_HEVC_SAND +#define SLICED_LOOP_FILTERS(depth) +#define SLICED_ADD_RESIDUAL(depth) ++#define SLICED_SAO(depth) +#else +#define SLICED_ADD_RESIDUAL(depth)\ + hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ @@ -9298,13 +13477,24 @@ index 23e923f..c4f1a6c 100644 + hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ + hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ + hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ -+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth); ++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ ++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) +#define SLICED_LOOP_FILTERS(depth)\ + hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ + hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ + hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) -+#endif ++#define SLICED_SAO(depth)\ ++ for (i = 0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ ++ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \ ++ } \ ++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ ++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) + ++#endif + #define HEVC_DSP(depth) \ hevcdsp->put_pcm = FUNC(put_pcm, depth); \ @@ -9312,31 +13502,39 @@ index 23e923f..c4f1a6c 100644 hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \ ++ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ ++ 
hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ ++ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ ++ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ + SLICED_ADD_RESIDUAL(depth); \ hevcdsp->dequant = FUNC(dequant, depth); \ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \ -@@ -225,6 +365,19 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +@@ -212,18 +368,13 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ + hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ + \ +- hevcdsp->sao_band_filter[0] = \ +- hevcdsp->sao_band_filter[1] = \ +- hevcdsp->sao_band_filter[2] = \ +- hevcdsp->sao_band_filter[3] = \ +- hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter, depth); \ +- hevcdsp->sao_edge_filter[0] = \ +- hevcdsp->sao_edge_filter[1] = \ +- hevcdsp->sao_edge_filter[2] = \ +- hevcdsp->sao_edge_filter[3] = \ +- hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth); \ ++ for (i = 0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ ++ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ ++ } \ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ ++ SLICED_SAO(depth); \ \ -+ hevcdsp->sao_band_filter_c[0] = \ -+ hevcdsp->sao_band_filter_c[1] = \ -+ hevcdsp->sao_band_filter_c[2] = \ -+ hevcdsp->sao_band_filter_c[3] = \ -+ hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth); \ -+ hevcdsp->sao_edge_filter_c[0] = \ -+ hevcdsp->sao_edge_filter_c[1] = \ -+ hevcdsp->sao_edge_filter_c[2] = \ -+ hevcdsp->sao_edge_filter_c[3] = \ -+ hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth); \ -+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ -+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth); \ -+ \ QPEL_FUNCS(depth); \ QPEL_UNI_FUNCS(depth); \ - QPEL_BI_FUNCS(depth); \ -@@ -232,6 +385,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +@@ -232,6 +383,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) EPEL_UNI_FUNCS(depth); \ EPEL_BI_FUNCS(depth); \ \ @@ -9344,7 +13542,7 @@ index 23e923f..c4f1a6c 100644 hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ -@@ -257,6 +411,8 @@ int i = 0; +@@ -257,6 +409,8 @@ int i = 0; break; } @@ -9354,7 +13552,7 @@ index 23e923f..c4f1a6c 100644 ff_hevc_dsp_init_x86(hevcdsp, bit_depth); if (ARCH_ARM) diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h -index eefb3cd..9e44e7f 100644 +index eefb3cd152..4b48055def 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h @@ -25,6 +25,7 @@ @@ -9365,7 +13563,7 @@ index eefb3cd..9e44e7f 100644 #include "get_bits.h" #define MAX_PB_SIZE 64 -@@ -42,11 +43,30 @@ typedef struct SAOParams { +@@ -42,11 +43,39 @@ typedef struct SAOParams { uint8_t type_idx[3]; ///< sao_type_idx } SAOParams; @@ -9379,48 +13577,67 @@ index eefb3cd..9e44e7f 100644 + int8_t ref_idx[2]; + int8_t pred_flag; +} MvField; ++ ++#ifdef RPI ++#define SAO_FILTER_N 6 ++#else ++#define SAO_FILTER_N 5 ++#endif ++ + typedef struct HEVCDSPContext { void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, struct GetBitContext *gb, int 
pcm_bit_depth); void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); ++ void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc); +#if RPI_HEVC_SAND -+ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); -+ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); ++ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v); ++ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u); + + void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); ++ void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv); + void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, + struct GetBitContext *gb, int pcm_bit_depth); +#endif void (*dequant)(int16_t *coeffs, int16_t log2_size); -@@ -60,14 +80,23 @@ typedef struct HEVCDSPContext { +@@ -58,16 +87,31 @@ typedef struct HEVCDSPContext { - void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + void (*idct_dc[4])(int16_t *coeffs); + +- void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, int16_t *sao_offset_val, int sao_left_class, int width, int height); -+ void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++#if RPI_HEVC_SAND ++ void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); ++#endif /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */ - void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, +- void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++ void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, int16_t *sao_offset_val, int sao_eo_class, int width, int height); -+ void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++#if RPI_HEVC_SAND ++ void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, + const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height); ++#endif void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++#if RPI_HEVC_SAND + void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, + uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++#endif void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, int height, intptr_t mx, intptr_t my, int width); -@@ -120,6 +149,22 @@ typedef struct HEVCDSPContext { +@@ -120,6 +164,22 @@ typedef struct HEVCDSPContext { void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, int32_t 
*tc, uint8_t *no_p, uint8_t *no_q); @@ -9444,24 +13661,23 @@ index eefb3cd..9e44e7f 100644 void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth); diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c -index 25f1a81..d475b3d 100644 +index 75763ce85e..60053d4a95 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c -@@ -26,6 +26,10 @@ +@@ -26,6 +26,8 @@ #include "bit_depth_template.c" #include "hevcdsp.h" -+#ifdef RPI -+#include "rpi_zc.h" -+#endif ++#include "rpi_shader_template.h" + static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, GetBitContext *gb, int pcm_bit_depth) { -@@ -41,6 +45,29 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height +@@ -41,6 +43,30 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height } } ++#if RPI_HEVC_SAND +static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height, + GetBitContext *gb, int pcm_bit_depth) +{ @@ -9483,18 +13699,34 @@ index 25f1a81..d475b3d 100644 + dst += stride; + } +} -+ ++#endif + static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res, ptrdiff_t stride, int size) { -@@ -58,6 +85,44 @@ static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res, +@@ -58,6 +84,106 @@ static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res, } } ++static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size; x++) { ++ dst[x] = av_clip_pixel(dst[x] + dc); ++ } ++ dst += stride; ++ } ++} ++ ++ +#if RPI_HEVC_SAND -+static av_always_inline void FUNC(add_residual_u_v)(uint8_t *_dst, const int16_t *res, -+ ptrdiff_t stride, int size) ++static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, const int dc_v, int size) +{ + int x, y; + pixel *dst = (pixel *)_dst; @@ -9504,6 +13736,25 @@ index 25f1a81..d475b3d 100644 + for (y = 0; y < size; y++) { + for (x = 0; x < size * 2; x += 2) { + dst[x] = av_clip_pixel(dst[x] + *res); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); ++ res++; ++ } ++ dst += stride; ++ } ++} ++ ++static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, const int dc_u, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + dc_u); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + *res); + res++; + } + dst += stride; @@ -9518,6 +13769,10 @@ index 25f1a81..d475b3d 100644 + const int16_t * ru = res; + const int16_t * rv = res + size * size; + ++// rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1); ++// rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0); ++// rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0); ++ + stride /= sizeof(pixel); + + for (y = 0; y < size; y++) { @@ -9527,39 +13782,82 @@ index 25f1a81..d475b3d 100644 + } + dst += stride; + } ++ ++// rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1); +} ++ ++ ++static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ const int dc_v = dc >> 16; ++ const int dc_u = (dc << 16) >> 16; ++ ++ stride 
/= sizeof(pixel);
++
++ for (y = 0; y < size; y++) {
++ for (x = 0; x < size * 2; x += 2) {
++ dst[x] = av_clip_pixel(dst[x] + dc_u);
++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
++ }
++ dst += stride;
++ }
++}
++
++
+#endif
+
 static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res, ptrdiff_t stride)
 {
@@ -82,6 +208,132 @@ static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
 FUNC(add_residual)(_dst, res, stride, 32);
 }
 
++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 16);
++}
++
++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 32);
++}
++
+#if RPI_HEVC_SAND
+// -- U -- (plaited)
+
+static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_v)
+{
-+ FUNC(add_residual_u_v)(_dst, res, stride, 4);
++ FUNC(add_residual_u)(_dst, res, stride, dc_v, 4);
+}
+
+static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_v)
+{
-+ FUNC(add_residual_u_v)(_dst, res, stride, 8);
++ FUNC(add_residual_u)(_dst, res, stride, dc_v, 8);
+}
+
+static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_v)
+{
-+ FUNC(add_residual_u_v)(_dst, res, stride, 16);
++ FUNC(add_residual_u)(_dst, res, stride, dc_v, 16);
+}
+
+static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_v)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+// -- V -- (plaited)
+
+static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_u)
+{
-+ FUNC(add_residual_u_v)(_dst + 1, res, stride, 4);
++ FUNC(add_residual_v)(_dst, res, stride, dc_u, 4);
+}
+
+static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_u)
+{
-+ FUNC(add_residual_u_v)(_dst + 1, res, stride, 8);
++ FUNC(add_residual_v)(_dst, res, stride, dc_u, 8);
+}
+
+static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_u)
+{
-+ FUNC(add_residual_u_v)(_dst + 1, res, stride, 16);
++ FUNC(add_residual_v)(_dst, res, stride, dc_u, 16);
+}
+
+static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_u)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 4);
+}
+
+static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 8);
+}
+
+static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 16);
+}
+
+static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc_c)(uint8_t
*_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ +#endif + + static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) { int16_t *coeffs = (int16_t *) _coeffs; -@@ -361,7 +510,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, +@@ -352,6 +604,32 @@ static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride + } + } + ++ ++#if BIT_DEPTH == 10 ++#if RPI_HEVC_SAND ++// We need a 32 bit variation for the _c restores so hijack bit depth 10 ++#undef pixel ++#undef BIT_DEPTH ++#define pixel uint32_t ++#define BIT_DEPTH 32 ++#endif ++// All 16 bit variations are the same ++#define sao_edge_restore_0_10 sao_edge_restore_0_9 ++#define sao_edge_restore_1_10 sao_edge_restore_1_9 ++#define sao_edge_restore_0_11 sao_edge_restore_0_9 ++#define sao_edge_restore_1_11 sao_edge_restore_1_9 ++#define sao_edge_restore_0_12 sao_edge_restore_0_9 ++#define sao_edge_restore_1_12 sao_edge_restore_1_9 ++#define sao_edge_restore_0_13 sao_edge_restore_0_9 ++#define sao_edge_restore_1_13 sao_edge_restore_1_9 ++#define sao_edge_restore_0_14 sao_edge_restore_0_9 ++#define sao_edge_restore_1_14 sao_edge_restore_1_9 ++#define sao_edge_restore_0_15 sao_edge_restore_0_9 ++#define sao_edge_restore_1_15 sao_edge_restore_1_9 ++#define sao_edge_restore_0_16 sao_edge_restore_0_9 ++#define sao_edge_restore_1_16 sao_edge_restore_1_9 ++#endif ++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 + static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, + int *borders, int _width, int _height, +@@ -361,7 +639,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, int x, y; pixel *dst = (pixel *)_dst; pixel *src = (pixel *)_src; @@ -9632,7 +13985,7 @@ index 25f1a81..d475b3d 100644 int sao_eo_class = sao->eo_class[c_idx]; int init_x = 0, width = _width, height = _height; -@@ -370,33 +518,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, +@@ -370,33 +647,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, if (sao_eo_class != SAO_EO_VERT) { if (borders[0]) { @@ -9670,7 +14023,7 @@ index 25f1a81..d475b3d 100644 height--; } } -@@ -411,7 +555,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -411,7 +684,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, int x, y; pixel *dst = (pixel *)_dst; pixel *src = (pixel *)_src; @@ -9678,7 +14031,7 @@ index 25f1a81..d475b3d 100644 int sao_eo_class = sao->eo_class[c_idx]; int init_x = 0, init_y = 0, width = _width, height = _height; -@@ -420,34 +563,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -420,34 +692,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, if (sao_eo_class != SAO_EO_VERT) { if (borders[0]) { @@ -9717,24 +14070,22 @@ index 25f1a81..d475b3d 100644 height--; } } -@@ -488,6 +627,127 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -487,6 +755,121 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + } } - ++#endif ++#if BIT_DEPTH == 32 ++#undef BIT_DEPTH ++#undef pixel ++#define BIT_DEPTH 10 ++#define pixel uint16_t ++#endif + +// --- Plaited chroma versions + -+#if BIT_DEPTH != 8 -+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const 
uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height) -+{ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else ++#if RPI_HEVC_SAND ++ +static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, @@ -9760,23 +14111,17 @@ index 25f1a81..d475b3d 100644 + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 2) + { -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); ++// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); ++// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); ++ // *** & 31 shouldn't be wanted but just now we generate broken input that ++ // crashes us in 10-bit world ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); + } + dst += stride_dst; + src += stride_src; + } +} -+#endif -+ -+#if BIT_DEPTH != 8 -+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, -+ int eo, int width, int height) { -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else + +static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, @@ -9794,9 +14139,12 @@ index 25f1a81..d475b3d 100644 + int a_stride, b_stride; + int x, y; + ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ + stride_dst /= sizeof(pixel); + width *= 2; + ++ av_assert0(width <= 64); ++ + a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; + b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; + for (y = 0; y < height; y++) { @@ -9814,43 +14162,42 @@ index 25f1a81..d475b3d 100644 + dst += stride_dst; + } +} -+#endif + -+#if BIT_DEPTH != 8 -+static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else ++// Do once ++#if BIT_DEPTH == 8 +// Any old 2 byte 'normal' restore will work for these -+#define sao_edge_restore_c_0_8 sao_edge_restore_0_10 -+#define sao_edge_restore_c_1_8 sao_edge_restore_1_10 ++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 ++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 ++// We need 32 bit for 9 bit+ ++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 ++#define 
sao_edge_restore_c_1_11 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 +#endif + ++#endif // RPI_HEVC_SAND + + #undef CMP - //////////////////////////////////////////////////////////////////////////////// -@@ -1690,3 +1950,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, +@@ -1690,3 +2073,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, #undef TQ1 #undef TQ2 #undef TQ3 + -+#ifdef RPI ++#if RPI_HEVC_SAND + +// line zero +#define P3 pix_l[0 * xstride] @@ -10064,7 +14411,7 @@ index 25f1a81..d475b3d 100644 +#endif + diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c -index 7a86ed3..7d32c4a 100644 +index 7a86ed3d31..7d32c4ab14 100644 --- a/libavcodec/hevcpred.c +++ b/libavcodec/hevcpred.c @@ -24,6 +24,7 @@ @@ -10146,7 +14493,7 @@ index 7a86ed3..7d32c4a 100644 case 9: HEVC_PRED(9); diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h -index eb17663..00ba3f9 100644 +index eb17663683..00ba3f94c0 100644 --- a/libavcodec/hevcpred.h +++ b/libavcodec/hevcpred.h @@ -38,6 +38,17 @@ typedef struct HEVCPredContext { @@ -10168,10 +14515,10 @@ index eb17663..00ba3f9 100644 void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c -index 6ae87cc..c14dddd 100644 +index 6fe33546b1..2f9f5f2798 100644 --- a/libavcodec/hevcpred_template.c +++ b/libavcodec/hevcpred_template.c -@@ -20,13 +20,55 @@ +@@ -20,13 +20,110 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -10183,34 +14530,90 @@ index 6ae87cc..c14dddd 100644 #include "hevcpred.h" +#ifdef RPI -+#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" +#endif + +#define DUMP_PRED 0 + #define POS(x, y) src[(x) + stride * (y)] -+#if PRED_C -+ ++// REPEAT_INCLUDE defined at EOF ++#if defined(RPI) && !defined(INCLUDED_ONCE) +typedef uint8_t (* c8_dst_ptr_t)[2]; +typedef const uint8_t (* c8_src_ptr_t)[2]; ++typedef uint16_t (* c16_dst_ptr_t)[2]; ++typedef const uint16_t (* c16_src_ptr_t)[2]; ++ ++// *** On ARM make these NEON registers ++typedef struct pixel4_16 { ++ uint16_t x[4]; ++} pixel4_16; ++typedef struct pixel4_32 { ++ uint32_t x[4]; ++} pixel4_32; ++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) ++{ ++ pixel4_16 t = {{x, x, x, x}}; ++ return t; ++} ++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) ++{ ++ pixel4_32 t = {{x, x, x, x}}; ++ return t; ++} ++#endif ++ ++#if PRED_C ++// For chroma we double pixel size so we copy pairs ++#undef pixel ++#undef pixel2 ++#undef pixel4 ++#undef dctcoef ++#undef INIT_CLIP ++#undef no_rnd_avg_pixel4 ++#undef rnd_avg_pixel4 ++#undef AV_RN2P ++#undef AV_RN4P ++#undef AV_RN4PA ++#undef AV_WN2P ++#undef AV_WN4P ++#undef AV_WN4PA ++#undef CLIP ++#undef FUNC ++#undef FUNCC ++#undef av_clip_pixel ++#undef PIXEL_SPLAT_X4 + +#if BIT_DEPTH == 8 -+#undef BIT_DEPTH -+#define BIT_DEPTH 16 -+#include "bit_depth_template.c" -+#undef FUNC -+#define FUNC(a) FUNC3(a, 8, _c) ++#define pixel 
uint16_t ++#define pixel4 pixel4_16 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 ++#define cpel uint8_t ++#define c_src_ptr_t c8_src_ptr_t ++#define c_dst_ptr_t c8_dst_ptr_t +#else -+#undef FUNC -+#define FUNC FUNCC ++#define pixel uint32_t ++#define pixel4 pixel4_32 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 ++#define cpel uint16_t ++#define c_src_ptr_t c16_dst_ptr_t ++#define c_dst_ptr_t c16_dst_ptr_t ++#endif ++#define AV_RN4P(p) (*(pixel4*)(p)) ++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) ++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) +#endif + ++ ++// Get PW prior to horrid PRED_C trickery ++#if BIT_DEPTH == 8 ++#define PW 1 ++#else ++#define PW 2 +#endif + -+#if DUMP_PRED -+#ifndef DEBUG_ONCE -+#define DEBUG_ONCE ++ ++#if DUMP_PRED && !defined(INCLUDE_ONCE) +static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) +{ + for (unsigned int y = 0; y != size; y++, data += stride * 2) { @@ -10222,17 +14625,16 @@ index 6ae87cc..c14dddd 100644 + printf("\n"); +} +#endif -+#endif + static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0, int log2_size, int c_idx) { -@@ -69,8 +111,11 @@ do { \ +@@ -69,8 +166,11 @@ do { \ AV_WN4P(&ptr[i], a); \ else \ a = PIXEL_SPLAT_X4(ptr[i + 3]) - -+#ifdef RPI_WORKER ++#ifdef RPI + HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ; +#else HEVCLocalContext *lc = s->HEVClc; @@ -10240,7 +14642,7 @@ index 6ae87cc..c14dddd 100644 int i; int hshift = s->ps.sps->hshift[c_idx]; int vshift = s->ps.sps->vshift[c_idx]; -@@ -79,15 +124,23 @@ do { \ +@@ -79,15 +179,23 @@ do { \ int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; int size_in_luma_v = size << vshift; int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; @@ -10256,18 +14658,18 @@ index 6ae87cc..c14dddd 100644 - ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); + const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); +#if defined(RPI) -+ pixel *const src = s->frame->format != AV_PIX_FMT_SAND128 ? ++ pixel *const src = !av_rpi_is_sand_frame(s->frame) ? + (pixel*)s->frame->data[c_idx] + x + y * stride : + c_idx == 0 ? -+ (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) : -+ (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y); ++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : ++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); +#else pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; +#endif int min_pu_width = s->ps.sps->min_pu_width; -@@ -95,14 +148,20 @@ do { \ +@@ -95,14 +203,20 @@ do { \ lc->tu.intra_pred_mode; pixel4 a; pixel left_array[2 * MAX_TB_SIZE + 1]; @@ -10288,7 +14690,7 @@ index 6ae87cc..c14dddd 100644 int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); int cand_left = lc->na.cand_left; int cand_up_left = lc->na.cand_up_left; -@@ -114,6 +173,26 @@ do { \ +@@ -114,6 +228,27 @@ do { \ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - (x0 + size_in_luma_h)) >> hshift; @@ -10301,10 +14703,11 @@ index 6ae87cc..c14dddd 100644 +#endif + +#if defined(RPI) -+ if (s->frame->format == AV_PIX_FMT_SAND128) { ++ if (av_rpi_is_sand_frame(s->frame)) { ++ // N.B. 
stride is in pixels (not bytes) or in the case of chroma pixel-pairs + const AVFrame * const frame = s->frame; + const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 -+ const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride; ++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; + if ((x & mask) == 0) + src_l -= stripe_adj; + if (((x + size) & mask) == 0) @@ -10315,7 +14718,7 @@ index 6ae87cc..c14dddd 100644 if (s->ps.pps->constrained_intra_pred_flag == 1) { int size_in_luma_pu_v = PU(size_in_luma_v); int size_in_luma_pu_h = PU(size_in_luma_h); -@@ -163,23 +242,24 @@ do { \ +@@ -163,23 +298,24 @@ do { \ top[-1] = 128; } if (cand_up_left) { @@ -10347,29 +14750,29 @@ index 6ae87cc..c14dddd 100644 size - bottom_left_size); } -@@ -268,7 +348,11 @@ do { \ +@@ -268,7 +404,11 @@ do { \ cand_up_left = 1; cand_left = 1; } else { // No samples available -+#if PRED_C && BIT_DEPTH == 16 -+ left[-1] = 0x8080; ++#if PRED_C ++ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8)); +#else left[-1] = (1 << (BIT_DEPTH - 1)); +#endif EXTEND(top, left[-1], 2 * size); EXTEND(left, left[-1], 2 * size); } -@@ -287,6 +371,9 @@ do { \ +@@ -287,6 +427,9 @@ do { \ top[-1] = left[-1]; // Filtering process -+ // Sand128 can only apply to chroma_format_idc == 1 so we don't need to ++ // Sand can only apply to chroma_format_idc == 1 so we don't need to + // worry about chroma smoothing for that case +#if !PRED_C if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { if (mode != INTRA_DC && size != 4){ int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; -@@ -342,13 +429,46 @@ do { \ +@@ -342,6 +485,30 @@ do { \ mode); break; } @@ -10399,24 +14802,8 @@ index 6ae87cc..c14dddd 100644 +#endif } -+#if !PRED_C || BIT_DEPTH == 16 #define INTRA_PRED(size) \ - static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ - { \ - FUNC(intra_pred)(s, x0, y0, size, c_idx); \ - } -+#else -+#define INTRA_PRED(size) \ -+static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ -+{ \ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#endif - - INTRA_PRED(2) - INTRA_PRED(3) -@@ -357,6 +477,7 @@ INTRA_PRED(5) +@@ -357,6 +524,7 @@ INTRA_PRED(5) #undef INTRA_PRED @@ -10424,7 +14811,7 @@ index 6ae87cc..c14dddd 100644 static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int trafo_size) -@@ -371,13 +492,46 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to +@@ -371,6 +539,29 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); } @@ -10435,9 +14822,9 @@ index 6ae87cc..c14dddd 100644 +{ + int x, y; + int size = 1 << trafo_size; -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; + + for (y = 0; y < size; y++, src += stride) + { @@ -10452,26 +14839,9 @@ index 6ae87cc..c14dddd 100644 +} +#endif -+#if !PRED_C || BIT_DEPTH == 16 #define PRED_PLANAR(size)\ static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ - const uint8_t *left, ptrdiff_t 
stride) \ - { \ - FUNC(pred_planar)(src, top, left, stride, size + 2); \ - } -+#else -+#define PRED_PLANAR(size)\ -+static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ -+ const uint8_t *left, ptrdiff_t stride) \ -+{ \ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF", __func__); \ -+ abort(); \ -+} -+#endif - - PRED_PLANAR(0) - PRED_PLANAR(1) -@@ -386,6 +540,7 @@ PRED_PLANAR(3) +@@ -386,6 +577,7 @@ PRED_PLANAR(3) #undef PRED_PLANAR @@ -10479,7 +14849,7 @@ index 6ae87cc..c14dddd 100644 static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int log2_size, int c_idx) -@@ -416,7 +571,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, +@@ -416,7 +608,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, POS(0, y) = (left[y] + 3 * dc + 2) >> 2; } } @@ -10490,9 +14860,9 @@ index 6ae87cc..c14dddd 100644 +{ + unsigned int i, j; + const unsigned int size = (1 << log2_size); -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; + unsigned int dc0 = size; + unsigned int dc1 = size; + @@ -10533,7 +14903,7 @@ index 6ae87cc..c14dddd 100644 static av_always_inline void FUNC(pred_angular)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, -@@ -428,15 +629,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +@@ -428,15 +666,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, const pixel *top = (const pixel *)_top; const pixel *left = (const pixel *)_left; @@ -10549,7 +14919,7 @@ index 6ae87cc..c14dddd 100644 int angle = intra_pred_angle[mode - 2]; pixel ref_array[3 * MAX_TB_SIZE + 4]; pixel *ref_tmp = ref_array + size; -@@ -509,6 +701,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +@@ -509,6 +738,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, } } } @@ -10561,26 +14931,26 @@ index 6ae87cc..c14dddd 100644 + int mode, int size) +{ + int x, y; -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ c_src_ptr_t top = (c_src_ptr_t)_top; ++ c_src_ptr_t left = (c_src_ptr_t)_left; + + const int angle = intra_pred_angle[mode - 2]; -+ uint8_t ref_array[3 * MAX_TB_SIZE + 4][2]; -+ c8_dst_ptr_t ref_tmp = ref_array + size; -+ c8_src_ptr_t ref; ++ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; ++ c_dst_ptr_t ref_tmp = ref_array + size; ++ c_src_ptr_t ref; + const int last = (size * angle) >> 5; + + if (mode >= 18) { + ref = top - 1; + if (angle < 0 && last < -1) { -+ memcpy(ref_tmp, top - 1, (size + 1) * 2); ++ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW); + for (x = last; x <= -1; x++) + { + ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; + ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; + } -+ ref = (c8_src_ptr_t)ref_tmp; ++ ref = (c_src_ptr_t)ref_tmp; + } + + for (y = 0; y < size; y++, src += stride) { @@ -10594,19 +14964,19 @@ index 6ae87cc..c14dddd 100644 + fact * ref[x + idx + 2][1] + 16) >> 5; + } + } else { -+ memcpy(src, ref + idx + 1, size * 2); ++ memcpy(src, ref + idx + 1, size * 2 * PW); + } + } + } else { + ref = left - 1; + if (angle < 0 && last < -1) { -+ memcpy(ref_tmp, left - 1, (size + 1) * 2); ++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); + for (x = last; x 
<= -1; x++) + { + ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; + ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; + } -+ ref = (c8_src_ptr_t)ref_tmp; ++ ref = (c_src_ptr_t)ref_tmp; + } + + for (x = 0; x < size; x++, src++) { @@ -10633,8 +15003,29 @@ index 6ae87cc..c14dddd 100644 static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, const uint8_t *left, +@@ -538,6 +844,10 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, + FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5); + } + ++#undef cpel ++#undef c_src_ptr_t ++#undef c_dst_ptr_t ++ + #undef EXTEND_LEFT_CIP + #undef EXTEND_RIGHT_CIP + #undef EXTEND_UP_CIP +@@ -549,3 +859,9 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, + #undef EXTEND + #undef MIN_TB_ADDR_ZS + #undef POS ++#undef PW ++ ++#ifndef INCLUDED_ONCE ++#define INCLUDED_ONCE ++#endif ++ diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c -index 81fcebc..7858478 100644 +index 81fcebce23..7858478b5d 100644 --- a/libavcodec/mmaldec.c +++ b/libavcodec/mmaldec.c @@ -24,6 +24,9 @@ @@ -10656,10 +15047,10 @@ index 81fcebc..7858478 100644 #include "avcodec.h" diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c -index 54b7be1..894dcdc 100644 +index 8f85e9362d..23080e8910 100644 --- a/libavcodec/mpeg4videodec.c +++ b/libavcodec/mpeg4videodec.c -@@ -2247,6 +2247,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +@@ -2249,6 +2249,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) if (ctx->divx_version >= 0) s->workaround_bugs |= FF_BUG_HPEL_CHROMA; @@ -10669,7 +15060,7 @@ index 54b7be1..894dcdc 100644 } if (s->workaround_bugs & FF_BUG_STD_QPEL) { -@@ -2271,6 +2274,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +@@ -2273,6 +2276,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) s->workaround_bugs, ctx->lavc_build, ctx->xvid_build, ctx->divx_version, ctx->divx_build, s->divx_packed ? 
"p" : ""); @@ -10678,95 +15069,40 @@ index 54b7be1..894dcdc 100644 s->codec_id == AV_CODEC_ID_MPEG4 && avctx->idct_algo == FF_IDCT_AUTO) { diff --git a/libavcodec/raw.c b/libavcodec/raw.c -index 7146e3a..240b274 100644 +index 7146e3a0f8..a8dcb1c251 100644 --- a/libavcodec/raw.c +++ b/libavcodec/raw.c -@@ -273,6 +273,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { +@@ -273,6 +273,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, + /* RPI */ +#ifdef RPI + { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, +#endif + /* special */ { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c -index d181b74..84f8e8c 100644 +index d181b74570..3fe75711c1 100644 --- a/libavcodec/rawenc.c +++ b/libavcodec/rawenc.c -@@ -31,6 +31,7 @@ +@@ -31,6 +31,8 @@ #include "libavutil/intreadwrite.h" #include "libavutil/imgutils.h" #include "libavutil/internal.h" +#include "libavutil/avassert.h" ++#include "libavutil/rpi_sand_fns.h" static av_cold int raw_encode_init(AVCodecContext *avctx) { -@@ -49,6 +50,101 @@ FF_ENABLE_DEPRECATION_WARNINGS +@@ -49,6 +51,71 @@ FF_ENABLE_DEPRECATION_WARNINGS return 0; } -+// x0 & width in luma units (so chroma * 2) -+// x0 odd for v -+static uint8_t * sand_copy_line_u(uint8_t * dst, const uint8_t * src, -+ unsigned int x0, const unsigned int width, -+ const unsigned int stride1, const unsigned int stride2) -+{ -+ unsigned int xend; -+ -+ // Skip any empty slices -+ src += (x0 & ~(stride1 - 1)) * stride2; -+ x0 &= (stride1 - 1); -+ -+ xend = x0 + width; -+ for (unsigned int x = 0; x < xend; x += stride1) -+ { -+ const unsigned int w = FFMIN(stride1, xend - x) - x0; -+ for (unsigned int i = 0; i < w; i += 2) -+ *dst++ = src[x0 + i]; -+ src += stride1 * stride2; -+ x0 &= 1; -+ } -+ -+ return dst; -+} -+ -+static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int width, const unsigned int height) -+{ -+ for (unsigned int y = y0; y < height + y0; ++y) { -+ dst = sand_copy_line_u(dst, frame->data[1] + y * frame->linesize[1], x0, width, frame->linesize[1], frame->linesize[3]); -+ } -+ return dst; -+} -+ -+static uint8_t * sand_copy_line_y(uint8_t * dst, const uint8_t * src, -+ unsigned int x0, const unsigned int width, -+ const unsigned int stride1, const unsigned int stride2) -+{ -+ unsigned int xend; -+ -+ // Skip any empty slices -+ src += (x0 & ~(stride1 - 1)) * stride2; -+ x0 &= (stride1 - 1); -+ -+ xend = x0 + width; -+ for (unsigned int x = 0; x < xend; x += stride1) -+ { -+ const unsigned int w = FFMIN(stride1, xend - x) - x0; -+ memcpy(dst, src + x0, w); -+ dst += w; -+ src += stride1 * stride2; -+ x0 = 0; -+ } -+ return dst; -+} -+ -+static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) +{ + const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); @@ -10781,8 +15117,6 @@ index d181b74..84f8e8c 100644 + if (sd != NULL) { + const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; + -+// printf("PScan: h/w=%d/%d, off=%d,%d\n", pscan->height, pscan->width, pscan->position[0][0], pscan->position[0][0]); -+ + x0 = si->left_offset; + y0 
= si->top_offset; + } @@ -10793,26 +15127,55 @@ index d181b74..84f8e8c 100644 + + dst = pkt->data; + -+ // Luma is "easy" -+ for (int y = y0; y < height + y0; ++y) { -+ dst = sand_copy_line_y(dst, frame->data[0] + y * frame->linesize[0], x0, width, frame->linesize[0], frame->linesize[3]); -+ } -+ -+ // Chroma is dull -+ dst = cpy_sand_c(dst, frame, x0 & ~1, y0 / 2, width, height / 2); -+ dst = cpy_sand_c(dst, frame, x0 | 1, y0 / 2, width, height / 2); ++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); ++ dst += width * height; ++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); + return 0; +} ++ ++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); ++ int size; ++ int width = frame->width; ++ int height = frame->height; ++ int x0 = 0; ++ int y0 = 0; ++ uint8_t * dst; ++ int ret; ++ ++ if (sd != NULL) { ++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ ++ x0 = si->left_offset; ++ y0 = si->top_offset; ++ } ++ ++ size = width * height * 3; ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); ++ dst += width * height * 2; ++ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); ++ return 0; ++} ++ + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *frame, int *got_packet) { -@@ -58,6 +154,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, +@@ -58,6 +125,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, if (ret < 0) return ret; -+ if (frame->format == AV_PIX_FMT_SAND128) { -+ ret = raw_sand_as_yuv420(avctx, pkt, frame); ++ if (av_rpi_is_sand_frame(frame)) { ++ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame); + *got_packet = (ret == 0); + return ret; + } @@ -10820,13 +15183,4018 @@ index d181b74..84f8e8c 100644 if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) return ret; if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, -diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h +diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s new file mode 100644 -index 0000000..4309f1c +index 0000000000..391f761df9 --- /dev/null -+++ b/libavcodec/rpi_hevc_transform.h ++++ b/libavcodec/rpi_hevc_transform.s +@@ -0,0 +1,923 @@ ++# ****************************************************************************** ++# Argon Design Ltd. ++# (c) Copyright 2015 Argon Design Ltd. All rights reserved. 
++#
++# Module : HEVC
++# Author : Peter de Rivaz
++# ******************************************************************************
++
++# HEVC VPU Transform
++#
++# Transform matrix can be thought of as
++# output row vector = input row vector * transMatrix2
++#
++# The even rows of the matrix are symmetric
++# The odd rows of the matrix are antisymmetric
++#
++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
++#
++# EXAMPLE
++# (a b c d) (1 2 2 1)
++# (3 4 -4 -3)
++# (5 6 6 5)
++# (7 8 -8 -7)
++#
++# x=(a c)(1 2) = 1a+5c 2a+6c
++# (5 6)
++#
++# y=(b d)(3 4) = 3b+7d 4b+8d
++# (7 8)
++#
++# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
++# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
++#
++# Final results are (u , v[::-1])
++#
++#
++# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
++# Apply the even matrix first and stop before rounding
++# Then apply the odd matrix in a full manner:
++#
++# First step is to compute partial products with the first input (16 cycles)
++# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output
++# 2a 4b 6c 8d
++# 2a -4b 6c -8d
++# 1a -3b 5c -7d
++#
++# Second step is to sum partial products into final position (8 cycles)
++# 1a+3b+5c+7d
++# 2a+4b+6c+8d
++# 2a-4b+6c-8d
++# 1a-3b+5c-7d
++#
++# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
++#
++# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
++#
++# For 8x8 we could compute two in parallel.
++#
++#
++
++# Columns are transformed first
++#
++# Store top left half of transMatrix2 in HX(32,0)
++# Store bottom left half of transMatrix2 in HX(32,32)
++#
++# For 16x16
++# HX(0:15,0) contains input data before transform
++# HY(0:15,0) contains 32bit output data after transform
++# HX(32,0) contains even rows of left half of transMatrix2
++# HX(32,32) contains odd rows of left half of transMatrix2
++# HY(48,0) contains partial products ready for summing
++#
++
++
++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 16x16 transforms to be done
++# coeffs32
++# num32: number of 32x32 transforms
++# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
++#
++
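The even/odd split documented in the comments above is easy to sanity-check in scalar C. The following stand-alone sketch is illustrative only and is not part of the patch: it uses the toy 4-point coefficients from the EXAMPLE comment (not the real HEVC matrices), forms the x and y partial products from the even and odd inputs, and rebuilds the full product with the u = x + y, v = x - y butterfly, asserting that this matches a direct row-vector times matrix multiply.

#include <assert.h>
#include <stdio.h>

/* Toy 4-point version of the even/odd decomposition described above.
 * Rows 0 and 2 of the matrix are symmetric, rows 1 and 3 antisymmetric,
 * so out = in * M can be built from two half-size products plus a butterfly. */
static void trans4_butterfly(const int in[4], int out[4])
{
    const int even[2][2] = { {1, 2}, {5, 6} }; /* left halves of rows 0, 2 */
    const int odd[2][2]  = { {3, 4}, {7, 8} }; /* left halves of rows 1, 3 */
    int x[2], y[2];

    for (int j = 0; j < 2; j++) {
        x[j] = in[0] * even[0][j] + in[2] * even[1][j]; /* x = (a c) * even */
        y[j] = in[1] * odd[0][j]  + in[3] * odd[1][j];  /* y = (b d) * odd  */
    }
    out[0] = x[0] + y[0];   /* u = x + y            */
    out[1] = x[1] + y[1];
    out[2] = x[1] - y[1];   /* v = x - y, stored in */
    out[3] = x[0] - y[0];   /* reverse order        */
}

int main(void)
{
    const int m[4][4] = { { 1, 2,  2,  1 }, { 3, 4, -4, -3 },
                          { 5, 6,  6,  5 }, { 7, 8, -8, -7 } };
    const int in[4] = { 10, -3, 7, 1 };
    int out[4];

    trans4_butterfly(in, out);
    for (int j = 0; j < 4; j++) {            /* compare with direct product */
        int ref = 0;
        for (int i = 0; i < 4; i++)
            ref += in[i] * m[i][j];
        assert(ref == out[j]);
    }
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
    return 0;
}

This halving of the matrix work is what the 32x32 path below exploits: even rows go through col_trans_16, odd rows through col_trans_odd_16, and the vadd/vsub butterfly in trans32 combines them.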
++.equ TRANS_SHIFT, 20 - BIT_DEPTH
++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1)
++.equ TRANS_ASL2, 16 - TRANS_SHIFT
++
++
++hevc_trans_16x16:
++ cmp r5,1
++ beq memclear16
++ cmp r5,2
++ beq hevc_deblock_16x16
++ cmp r5,3
++ beq hevc_uv_deblock_16x16
++ cmp r5,4
++ beq hevc_uv_deblock_16x16_with_clear
++ cmp r5,5
++ beq hevc_run_command_list
++
++ push r6-r15, lr # TODO cut down number of used registers
++ mov r14,r3 # coeffs32
++ mov r15,r4 # num32
++ mov r3, 16*2 # Stride of transMatrix2 in bytes
++ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
++
++ add r0, 16*16*2 # For 32x32 transforms we also need this matrix
++ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++ # Now use r0 to describe which matrix we are working on.
++ # Allows us to prefetch the next block of coefficients for efficiency.
++ mov r0,0 # This describes the location where we read our coefficients from
++ mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
++ mov r7,16*16*2 # Total block size
++ mov r8,64*16 # Value used to swap from current to next VRF location
++ vldh HX(0++,0)+r0,(r1 += r3) REP 16
++ mov r4,64 # Constant used for rounding first pass
++ mov r5,TRANS_RND2 # Constant used for rounding second pass
++
++ # At start of block r0,r1 point to the current block (that has already been loaded)
++block_loop:
++ eor r0,r8
++ add r1,r7
++ # Prefetch the next block
++ vldh HX(0++,0)+r0,(r1 += r3) REP 16
++ eor r0,r8
++ sub r1,r7
++
++ # Transform the current block
++ bl col_trans_16
++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate
++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
++ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble?
++ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position
++
++ bl col_trans_16
++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate
++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag)
++
++ # Save results - note there has been a transposition during the processing so we save columns
++ vsth VX(0,32++)+r0, (r1 += r3) REP 16
++
++ # Move onto next block
++ eor r0,r8
++ add r1,r7
++
++ addcmpbgt r2,-1,0,block_loop
++
++ # Now go and do any 32x32 transforms
++ b hevc_trans_32x32
++
++ pop r6-r15, pc
++
++# r1,r2,r3 r7,r8 should be preserved
++# HX(0++,0)+r0 is the block to be transformed
++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
++# Use HY(48,0) for intermediate results
++# r0 can be used, but should be returned to its original value at the end
++col_trans_16:
++ add r6,r0,16 # Final value for this loop
++col_trans_16_loop:
++ # First compute partial products for a single column
++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
++ # Then sum up the results and place back
++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++ addcmpblt r0,1,r6,col_trans_16_loop
++ sub r0,16 # put r0 back to its original value
++ b lr
++
++col_trans_odd_16:
++ add r6,r0,16 # Final value for this loop
++col_trans_odd_16_loop:
++ # First compute partial products for a single column
++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
++ # Then sum up the results and place back
++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++ addcmpblt r0,1,r6,col_trans_odd_16_loop
++ sub r0,16 # put r0 back to its original value
++ b lr
++
++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 32x32 transforms to be done
++#
++hevc_trans_32x32:
++ mov r1,r14 # coeffs
++ mov r2,r15 # num
++
++ # Fetch odd transform matrix
++ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of
coefficients) ++ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix ++ #add r0, 16*16*2 ++ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer ++ mov r7, 16*16*2 # Total block size ++ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) ++ # set r8 to 32byte aligned stack pointer ++ add r8,sp,31 ++ lsr r8,5 ++ lsl r8,5 ++ mov r9,r8 # Backup of the temporary storage ++ mov r10,r1 # Backup of the coefficient buffer ++block_loop32: ++ ++ # COLUMN TRANSFORM ++ mov r4, 64 # Constant used for rounding first pass ++ mov r5, 9 # left shift used for rounding first pass ++ ++ # Transform the first 16 columns ++ mov r1,r10 # Input Coefficient buffer ++ mov r8,r9 # Output temporary storage ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ # ROW TRANSFORM ++ mov r4, TRANS_RND2 # Constant used for rounding second pass ++ mov r5, TRANS_ASL2 # left shift used for rounding second pass ++ ++ mov r1,r9 # Input temporary storage ++ mov r8,r10 # Output Coefficient buffer ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ add r10, 32*32*2 # move onto next block of coefficients ++ addcmpbgt r2,-1,0,block_loop32 ++ ++ add sp,sp,32*32*2+32 # Restore stack ++ ++ pop r6-r15, pc ++ ++trans32: ++ push lr ++ # We can no longer afford the VRF space to do prefetching when doing 32x32 ++ # Fetch the even rows ++ vldh HX(0++,0),(r1 += r3) REP 16 ++ # Fetch the odd rows ++ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 ++ ++ # Transform the even rows using even matrix ++ mov r0, 0 # Even rows ++ bl col_trans_16 ++ ++ # Now transform the odd rows using odd matrix ++ mov r0, 64*16 # Odd rows ++ bl col_trans_odd_16 ++ ++ # Now apply butterfly to compute the first 16 results ++ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ # 16bit results now in HX(48,32) ++ mov r0,r8 ++ mov r6,32*2 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ ++ # Now apply butterfly to compute the second 16 results (in reverse order) ++ vsub HY(63,0),HY(0 ,0),HY(16,0) ++ vsub HY(62,0),HY(1 ,0),HY(17,0) ++ vsub HY(61,0),HY(2 ,0),HY(18,0) ++ vsub HY(60,0),HY(3 ,0),HY(19,0) ++ vsub HY(59,0),HY(4 ,0),HY(20,0) ++ vsub HY(58,0),HY(5 ,0),HY(21,0) ++ vsub HY(57,0),HY(6 ,0),HY(22,0) ++ vsub HY(56,0),HY(7 ,0),HY(23,0) ++ vsub HY(55,0),HY(8 ,0),HY(24,0) ++ vsub HY(54,0),HY(9 ,0),HY(25,0) ++ vsub HY(53,0),HY(10,0),HY(26,0) ++ vsub HY(52,0),HY(11,0),HY(27,0) ++ vsub HY(51,0),HY(12,0),HY(28,0) ++ vsub HY(50,0),HY(13,0),HY(29,0) ++ vsub HY(49,0),HY(14,0),HY(30,0) ++ vsub HY(48,0),HY(15,0),HY(31,0) ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ add r0,r8,32 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ pop pc ++ ++memclear16: ++ # r0 is address ++ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified) ++ vmov HX(0++,0),0 REP 16 ++ mov r2,32 ++loop: ++ vsth HX(0++,0),(r0+=r2) REP 16 ++ add r0,16*16*2 ++ sub r1,16*16 ++ cmp r1,0 ++ bgt loop ++ b lr ++ ++ ++################################################################################ ++# HEVC VPU Deblock ++# ++# Vertical edges before horizontal ++# Decision 
can change every 4 pixels, but only 8 pixel boundaries are deblocked ++# ++# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge. ++# The VPU code works in units of 16x16 blocks. ++# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time). ++# One final horizontal filter is required at the end. ++# PCM is not allowed in this code. ++# ++# ++# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering) ++# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering. ++ ++.set P0,63 ++.set P1,62 ++.set P2,61 ++.set P3,60 ++.set Q0,59 ++.set Q1,58 ++.set Q2,57 ++.set Q3,56 ++ ++.set dp,32 ++.set dq,33 ++.set d,34 ++.set decision,35 ++.set beta,36 ++.set beta2,37 ++.set beta3,38 ++.set ptest,39 ++.set qtest,40 ++.set pqtest,41 ++.set thresh,42 ++.set deltatest, 44 ++.set deltap1, 45 ++.set tc25, 46 ++.set setup,47 ++.set tc,48 ++.set tc25,49 ++.set tc2, 50 ++.set do_filter, 51 ++.set delta, 52 ++.set tc10, 53 ++.set delta0, 54 ++.set delta1, 55 ++.set zeros, 0 ++.set setup_input, 1 ++.set deltaq1, 2 ++ ++ ++ ++# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image. ++# Row has num16 16x16 blocks across ++# Beta goes from 0 to 64 ++# tc goes from 0 to 24 ++# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number] ++# has 8 bytes per edge ++# has 16 bytes per direction ++# has 32 bytes per 16x16 block ++# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4)) ++hevc_deblock_16x16: ++ push r6-r15, lr ++ mov r9,r4 ++ mov r4,r3 ++ mov r13,r2 ++ mov r2,r0 ++ mov r10,r0 ++ subscale4 r0,r1 ++ mov r8,63 ++ mov r6,-3 ++ vmov H(zeros,0),0 ++# r7 is number of blocks still to load ++# r0 is location of current block - 4 * stride ++# r1 is stride ++# r2 is location of current block ++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical ++# r4 is setup ++# r5 is for temporary calculations ++# r8 holds 63 ++# r6 holds -3 ++# r9 holds the number of 16 high rows to process ++# r10 holds the original img base ++# r11 returns 0 if no filtering was done on the edge ++# r12 saves a copy of this ++# r13 is copy of width ++ ++process_row: ++ # First iteration does not do horizontal filtering on previous ++ mov r7, r13 ++ mov r3,0 ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) # We may wish to prefetch these ++ vstb H(zeros,0),(r4) ++ bl vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 ++ bl vert_filter ++ sub r3,8 ++ b start_deblock_loop ++deblock_loop: ++ # Middle iterations do vertical on current block and horizontal on preceding ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) ++ vstb H(zeros,0),(r4) ++ bl vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl vert_filter ++ sub r3,8 ++ vldb H(setup_input,0), -16(r4) ++ vstb H(zeros,0),-16(r4) ++ bl horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl horz_filter ++ sub r3,8*64 ++ addcmpbeq r12,0,0,skip_save_top ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels 
for the previous block ++skip_save_top: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++start_deblock_loop: ++ # move onto next 16x16 (could do this with circular buffer support instead) ++ add r3,16 ++ and r3,r8 ++ add r4,32 ++ # Perform loop counter operations (may work with an addcmpbgt as well?) ++ add r0,16 ++ add r2,16 ++ sub r7,1 ++ cmp r7,0 # Are there still more blocks to load ++ bgt deblock_loop ++ ++ # Final iteration needs to just do horizontal filtering ++ vldb H(setup_input,0), -16(r4) ++ vstb H(zeros,0),-16(r4) ++ bl horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl horz_filter ++ sub r3,64*8 ++ addcmpbeq r12,0,0,skip_save_top2 ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++skip_save_top2: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++ ++# Now look to see if we should do another row ++ sub r9,1 ++ cmp r9,0 ++ bgt start_again ++ pop r6-r15, pc ++start_again: ++ # Need to sort out r0,r2 to point to next row down ++ addscale16 r10,r1 ++ mov r2,r10 ++ subscale4 r0,r2,r1 ++ b process_row ++ ++ ++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered ++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations ++ ++vert_filter: ++ push lr ++ ++ vmov HX(P3,0), V(16,12)+r3 ++ vmov HX(P2,0), V(16,13)+r3 ++ vmov HX(P1,0), V(16,14)+r3 ++ vmov HX(P0,0), V(16,15)+r3 ++ vmov HX(Q0,0), V(16,16)+r3 ++ vmov HX(Q1,0), V(16,17)+r3 ++ vmov HX(Q2,0), V(16,18)+r3 ++ vmov HX(Q3,0), V(16,19)+r3 ++ ++ bl do_luma_filter ++ ++ vadds V(16,13)+r3, HX(P2,0), 0 ++ vadds V(16,14)+r3, HX(P1,0), 0 ++ vadds V(16,15)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds V(16,16)+r3, HX(Q0,0), 0 ++ vadds V(16,17)+r3, HX(Q1,0), 0 ++ vadds V(16,18)+r3, HX(Q2,0), 0 ++ ++ pop pc ++ ++# Filter edge at H(16,0)+r3 ++horz_filter: ++ push lr ++ ++ vmov HX(P3,0), H(12,0)+r3 ++ vmov HX(P2,0), H(13,0)+r3 ++ vmov HX(P1,0), H(14,0)+r3 ++ vmov HX(P0,0), H(15,0)+r3 ++ vmov HX(Q0,0), H(16,0)+r3 ++ vmov HX(Q1,0), H(17,0)+r3 ++ vmov HX(Q2,0), H(18,0)+r3 ++ vmov HX(Q3,0), H(19,0)+r3 ++ ++ bl do_luma_filter ++ ++ vadds H(13,0)+r3, HX(P2,0), 0 ++ vadds H(14,0)+r3, HX(P1,0), 0 ++ vadds H(15,0)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds H(16,0)+r3, HX(Q0,0), 0 ++ vadds H(17,0)+r3, HX(Q1,0), 0 ++ vadds H(18,0)+r3, HX(Q2,0), 0 ++ ++ pop pc ++ ++# r4 points to array of beta/tc for each 4 length edge ++do_luma_filter: ++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8 ++ valtl HX(beta,0),H(setup,0),H(setup,0) ++ valtu HX(tc,0),H(setup,0),H(setup,0) ++ vmul HX(tc25,0), HX(tc,0), 5 ++ vadd HX(tc25,0),HX(tc25,0), 1 ++ vasr HX(tc25,0), HX(tc25,0), 1 ++ ++ # Compute decision ++ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1 ++ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1 ++ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0 ++ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0 ++ ++ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1 ++ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1 ++ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0 ++ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0 ++ ++ vadd HX(d,0), HX(dp,0), HX(dq,0) ++ vasr HX(beta2,0),HX(beta,0),2 ++ vasr HX(beta3,0),HX(beta,0),3 ++ ++ # Compute flags that are negative if all conditions pass ++ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC ++ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC ++ vsub 
HX(decision,0), HX(decision,0), HX(beta3,0) SETF
++
++ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
++ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
++ vadd HX(decision,0), HX(d,0), HX(d,0) IFN
++ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
++ vmov HX(decision,0), 1 IFNN
++ vadd H(decision,0),H(decision,3),0 IFN
++ vadd H(decision,16),H(decision,19),0 IFN
++ vmov -,HX(decision,0) SETF # N marks strong filter
++ vmov HX(decision,0), 1 IFNN # NN marks normal filter
++
++ vadd HX(do_filter,0), HX(d,3), HX(d,0)
++ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
++ vmov HX(decision,0),0 IFNN # Z marks no filter
++
++ # Expand out decision (currently valid one every 4 pixels) 0...1...2...3
++ # First extract out even terms
++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3
++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123
++ # Now expand back
++ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
++ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
++
++ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
++
++ # Do a quick check to see if there is anything to do
++ mov r11, 0 # Signal no filtering
++ vmov -,1 IFNZ SUMS r5
++ cmp r5,0
++ beq filtering_done
++ mov r11, 1 # Signal some filtering
++ # And whether there is any strong filtering
++ vmov -,1 IFN SUMS r5
++ cmp r5,0
++ beq normal_filtering
++
++ ##############################################################################
++ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
++ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tc2 is tc/2, while here it is tc*2
++
++ # Take a copy of the original pixels for use in decision calculation
++ vmov HX(P0,32),HX(P0,0)
++ vmov HX(Q0,32),HX(Q0,0)
++ vmov HX(P1,32),HX(P1,0)
++ vmov HX(Q1,32),HX(Q1,0)
++ vmov HX(P2,32),HX(P2,0)
++ vmov HX(Q2,32),HX(Q2,0)
++
++ vadd -,HX(P2,32),4 CLRA SACC
++ vshl -,HX(P1,32),1 SACC
++ vshl -,HX(P0,32),1 SACC
++ vshl -,HX(Q0,32),1 SACC
++ vshl HX(delta,0),HX(Q1,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(P0,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
++
++ vadd -,HX(P2,32),2 CLRA SACC
++ vadd -,HX(P1,32),HX(P0,32) SACC
++ vshl HX(delta,0),HX(Q0,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 2
++ vsub HX(delta,0),HX(delta,0),HX(P1,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
++
++ vadd -,HX(Q0,32),4 CLRA SACC
++ vadd -,HX(P1,32),HX(P0,32) SACC
++ vmul -,HX(P2,32),3 SACC
++ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(P2,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
++ #vmov HX(P2,0),3 IFN
++
++ # Now reverse all P/Qs
++
++ vadd -,HX(Q2,32),4 CLRA SACC
++ vshl -,HX(Q1,32),1 SACC
++ vshl -,HX(Q0,32),1 SACC
++ vshl -,HX(P0,32),1 SACC
++ vshl HX(delta,0),HX(P1,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(Q0,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
++
++ vadd -,HX(Q2,32),2 CLRA SACC
++ vadd -,HX(Q1,32),HX(Q0,32) SACC
++ vshl HX(delta,0),HX(P0,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 2
++ vsub HX(delta,0),HX(delta,0),HX(Q1,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
++
++ vadd -,HX(P0,32),4 CLRA SACC
++ vadd -,HX(Q1,32),HX(Q0,32) SACC
++ vmul -,HX(Q2,32),3 SACC
++ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(Q2,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
++
++ ##############################################################################
++ # Normal filtering
++normal_filtering:
++ # Invert the decision flags
++ # (done as two instructions: the assembler has a bug that drops SETF from the combined form)
++ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
++ vmov -, HX(tc10,0) SETF # IFN means normal filtering
++
++ vmov -,1 IFN SUMS r5
++ cmp r5,0
++ beq filtering_done
++
++ vasr HX(tc2,0), HX(tc,0), 1
++ vmul HX(tc10,0), HX(tc,0), 10
++
++ vasr HX(thresh,0), HX(beta,0), 1
++ vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
++ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
++
++ vadd HX(ptest,0),HX(dp,3),HX(dp,0)
++ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
++ vadd HX(qtest,0),HX(dq,3),HX(dq,0)
++ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
++ # Expand ptest and qtest together
++ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q
++ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
++ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
++ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
++ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
++
++ vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
++ vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
++ vmov -,8 CLRA SACC
++ vmul -,HX(delta0,0), 9 SACC
++ vmul HX(delta0,0),HX(delta1,0), r6 SACC
++ vasr HX(delta0,0), HX(delta0,0), 4
++ vdist HX(deltatest,0), HX(delta0,0), 0
++ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
++ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
++
++ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
++
++ vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
++ vadd HX(deltap1,0), HX(deltap1,0), 1
++ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
++ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
++ vasr HX(deltap1,0), HX(deltap1,0), 1
++ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
++
++ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
++ vadd HX(deltaq1,0), HX(deltaq1,0), 1
++ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
++ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
++ vrsub -, HX(delta0,0), 0 SACC
++ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
++ vasr HX(deltaq1,0), HX(deltaq1,0), 1
++ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
++
++ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
++ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
++
++ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
++ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
++
++ vmov -,HX(deltatest,0) SETF
++ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
++ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
++
++ #vmov HX(P2,0),1 IFN
++
++filtering_done:
++ b lr
++
++
++hevc_uv_deblock_16x16:
++ push r6-r15, lr
++ mov r14,0
++ b hevc_uv_start
++hevc_uv_deblock_16x16_with_clear:
++ push r6-r15, lr
++ mov r14,1
++ b hevc_uv_start
++
++hevc_uv_start:
++ mov r9,r4
++ mov r4,r3
++ mov r13,r2
++ mov r2,r0
++ mov r10,r0
++ subscale4 r0,r1
++ mov r8,63
++ mov r6,-3
++ vmov H(zeros,0),0
++# r7 is number of blocks still to load
++# r0 is
location of current block - 4 * stride ++# r1 is stride ++# r2 is location of current block ++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical ++# r4 is setup ++# r5 is for temporary calculations ++# r8 holds 63 ++# r6 holds -3 ++# r9 holds the number of 16 high rows to process ++# r10 holds the original img base ++# r11 returns 0 if no filtering was done on the edge ++# r12 saves a copy of this ++# r13 is copy of width ++# r14 is 1 if we should clear the old contents, or 0 if not ++ ++uv_process_row: ++ # First iteration does not do horizontal filtering on previous ++ mov r7, r13 ++ mov r3,0 ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) # We may wish to prefetch these ++ cmp r14,1 ++ bne uv_skip0 ++ vstb H(zeros,0),(r4) ++uv_skip0: ++ bl uv_vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 ++ bl uv_vert_filter ++ sub r3,8 ++ b uv_start_deblock_loop ++uv_deblock_loop: ++ # Middle iterations do vertical on current block and horizontal on preceding ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) ++ cmp r14,1 ++ bne uv_skip1 ++ vstb H(zeros,0),(r4) ++uv_skip1: ++ bl uv_vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_vert_filter ++ sub r3,8 ++ vldb H(setup_input,0), -16(r4) ++ cmp r14,1 ++ bne uv_skip3 ++ vstb H(zeros,0),-16(r4) ++uv_skip3: ++ bl uv_horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_horz_filter ++ sub r3,8*64 ++ addcmpbeq r12,0,0,uv_skip_save_top ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++uv_skip_save_top: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++uv_start_deblock_loop: ++ # move onto next 16x16 (could do this with circular buffer support instead) ++ add r3,16 ++ and r3,r8 ++ add r4,32 ++ # Perform loop counter operations (may work with an addcmpbgt as well?) 
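An aside on the setup bytes consumed by this loop: they follow the layout documented above for hevc_deblock_16x16, setup[num16][2][2][2][4], i.e. 8 bytes per edge, 16 bytes per direction, 32 bytes per 16x16 block, which matches r4 advancing by 32 per block. A hypothetical C view of that indexing follows; the typedef and helper are invented for illustration and appear nowhere in the patch.

#include <stdint.h>

/* One 16x16 block's worth of deblock parameters, per the comments above:
 * [0=vert,1=horz][0=first edge,1=second edge][0=beta,1=tc][edge segment 0..3] */
typedef uint8_t hevc_deblock_setup_t[2][2][2][4];   /* 32 bytes per block */

/* Record beta/tc for one 4-pixel edge segment (hypothetical helper). */
static inline void set_edge_params(hevc_deblock_setup_t *setup, unsigned block,
                                   unsigned horz, unsigned second_edge,
                                   unsigned segment, uint8_t beta, uint8_t tc)
{
    setup[block][horz][second_edge][0][segment] = beta; /* beta: 0..64 */
    setup[block][horz][second_edge][1][segment] = tc;   /* tc:   0..24 */
}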
++ add r0,16 ++ add r2,16 ++ sub r7,1 ++ cmp r7,0 # Are there still more blocks to load ++ bgt uv_deblock_loop ++ ++ # Final iteration needs to just do horizontal filtering ++ vldb H(setup_input,0), -16(r4) ++ cmp r14,1 ++ bne uv_skip2 ++ vstb H(zeros,0),-16(r4) ++uv_skip2: ++ bl uv_horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_horz_filter ++ sub r3,64*8 ++ addcmpbeq r12,0,0,uv_skip_save_top2 ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++uv_skip_save_top2: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++ ++# Now look to see if we should do another row ++ sub r9,1 ++ cmp r9,0 ++ bgt uv_start_again ++ pop r6-r15, pc ++uv_start_again: ++ # Need to sort out r0,r2 to point to next row down ++ addscale16 r10,r1 ++ mov r2,r10 ++ subscale4 r0,r2,r1 ++ b uv_process_row ++ ++ ++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered ++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations ++ ++uv_vert_filter: ++ push lr ++ ++ vmov HX(P1,0), V(16,14)+r3 ++ vmov HX(P0,0), V(16,15)+r3 ++ vmov HX(Q0,0), V(16,16)+r3 ++ vmov HX(Q1,0), V(16,17)+r3 ++ ++ bl do_chroma_filter ++ ++ vadds V(16,15)+r3, HX(P0,0), 0 ++ vadds V(16,16)+r3, HX(Q0,0), 0 ++ ++ pop pc ++ ++# Filter edge at H(16,0)+r3 ++uv_horz_filter: ++ push lr ++ ++ vmov HX(P1,0), H(14,0)+r3 ++ vmov HX(P0,0), H(15,0)+r3 ++ vmov HX(Q0,0), H(16,0)+r3 ++ vmov HX(Q1,0), H(17,0)+r3 ++ ++ bl do_chroma_filter ++ ++ vadds H(15,0)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds H(16,0)+r3, HX(Q0,0), 0 ++ ++ pop pc ++ ++# r4 points to array of beta/tc for each 4 length edge ++do_chroma_filter: ++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8 ++ valtl HX(tc,0),H(setup,0),H(setup,0) ++ ++ vsub HX(delta,0),HX(Q0,0),HX(P0,0) ++ vshl HX(delta,0),HX(delta,0),2 CLRA SACC ++ vsub -,HX(P1,0),HX(Q1,0) SACC ++ vmov HX(delta,0),4 SACC ++ vasr HX(delta,0),HX(delta,0),3 ++ vclamps HX(delta,0), HX(delta,0), HX(tc,0) ++ vadd HX(P0,0),HX(P0,0),HX(delta,0) ++ vsub HX(Q0,0),HX(Q0,0),HX(delta,0) ++ b lr ++ ++# r0 = list ++# r1 = number ++hevc_run_command_list: ++ push r6-r7, lr ++ mov r6, r0 ++ mov r7, r1 ++loop_cmds: ++ ld r0,(r6) # How to encode r6++? 
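An aside on the data this loop walks: each command is six 32-bit words, loaded into r0..r5 by this run of ld/add pairs and dispatched through the cmp/beq chain at the top of hevc_trans_16x16 (r5 selects the operation: 0 transform, 1 memclear16, 2 luma deblock, 3 uv deblock, 4 uv deblock with clear, 5 a nested command list). A hypothetical ARM-side builder for such a stream might look like this; the struct and enum names are invented for the sketch and are not part of the patch.

#include <stdint.h>

enum vpu_op {                         /* values tested against r5 above */
    VPU_OP_TRANSFORM        = 0,
    VPU_OP_MEMCLEAR16       = 1,
    VPU_OP_DEBLOCK          = 2,
    VPU_OP_UV_DEBLOCK       = 3,
    VPU_OP_UV_DEBLOCK_CLEAR = 4,
    VPU_OP_RUN_COMMAND_LIST = 5,
};

struct vpu_cmd {
    uint32_t r0, r1, r2, r3, r4, r5;  /* loaded in this order by loop_cmds */
};

/* Append one command to the list; returns the next free slot. */
static struct vpu_cmd *vpu_cmd_push(struct vpu_cmd *p,
                                    uint32_t a0, uint32_t a1, uint32_t a2,
                                    uint32_t a3, uint32_t a4, enum vpu_op op)
{
    *p = (struct vpu_cmd){ a0, a1, a2, a3, a4, (uint32_t)op };
    return p + 1;
}

The remaining loads and the branch back to loop_cmds continue below.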
++ add r6,4 ++ ld r1,(r6) ++ add r6,4 ++ ld r2,(r6) ++ add r6,4 ++ ld r3,(r6) ++ add r6,4 ++ ld r4,(r6) ++ add r6,4 ++ ld r5,(r6) ++ add r6,4 ++ bl hevc_trans_16x16 ++ sub r7,1 ++ cmp r7,0 ++ bgt loop_cmds ++ ++ pop r6-r7, pc +diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h +new file mode 100644 +index 0000000000..b0e9902d82 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform10.h @@ -0,0 +1,3070 @@ -+unsigned char rpi_hevc_transform [] = { ++static const unsigned char rpi_hevc_transform10 [] = { ++21, ++106, ++0, ++144, ++47, ++1, ++37, ++106, ++0, ++144, ++66, ++1, ++53, ++106, ++0, ++144, ++192, ++4, ++69, ++106, ++0, ++144, ++192, ++4, ++85, ++106, ++0, ++144, ++220, ++5, ++169, ++3, ++62, ++64, ++79, ++64, ++3, ++232, ++32, ++0, ++0, ++0, ++12, ++248, ++0, ++136, ++0, ++0, ++192, ++248, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++12, ++248, ++0, ++168, ++0, ++0, ++192, ++248, ++0, ++0, ++0, ++96, ++3, ++232, ++32, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++8, ++232, ++0, ++4, ++0, ++0, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++4, ++232, ++64, ++0, ++0, ++0, ++5, ++232, ++0, ++2, ++0, ++0, ++128, ++69, ++113, ++66, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++128, ++69, ++113, ++70, ++128, ++144, ++40, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++16, ++0, ++76, ++254, ++48, ++192, ++9, ++4, ++32, ++8, ++0, ++0, ++4, ++254, ++0, ++144, ++128, ++2, ++0, ++8, ++2, ++0, ++128, ++144, ++23, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++20, ++0, ++76, ++254, ++48, ++192, ++6, ++4, ++32, ++8, ++0, ++0, ++140, ++248, ++44, ++0, ++0, ++0, ++32, ++48, ++4, ++0, ++128, ++69, ++113, ++66, ++242, ++140, ++211, ++192, ++34, ++31, ++41, ++3, ++70, ++192, ++80, ++7, ++164, ++255, ++36, ++204, ++96, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++70, ++192, ++80, ++7, ++164, ++255, ++36, ++204, ++224, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++225, ++64, ++242, ++64, ++3, ++232, ++128, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++57, ++239, ++224, ++247, ++255, ++255, ++72, ++192, ++95, ++207, ++88, ++122, ++88, ++124, ++137, ++64, ++26, ++64, ++4, ++232, ++64, ++0, ++0, ++0, ++149, ++96, ++161, ++64, ++152, ++64, ++128, ++144, ++35, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++27, ++0, ++4, ++232, ++0, ++2, ++0, ++0, ++101, ++96, ++145, ++64, ++168, ++64, ++128, ++144, ++19, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++11, ++0, ++74, ++232, ++0, ++8, ++0, ++0, ++242, ++140, ++221, ++192, ++57, ++239, ++32, ++8, ++0, ++0, ++41, ++3, ++239, ++3, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++248, ++4, ++0, ++12, ++248, ++0, ++132, ++64, ++0, ++192, ++248, ++4, ++0, ++0, ++96, ++255, ++159, ++154, ++255, ++0, ++232, ++0, ++4, ++0, ++0, ++255, ++159, ++165, ++255, ++4, ++255, ++48, ++204, ++16, ++3, ++224, ++251, ++62, ++0, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++128, ++64, ++6, ++232, ++64, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++32, ++247, ++240, ++207, ++16, ++3, ++32, ++247, ++176, ++207, ++17, ++19, ++32, ++247, ++112, ++207, ++18, ++35, ++32, ++247, ++48, ++207, ++19, ++51, ++32, ++247, ++240, ++206, ++20, ++67, ++32, 
++247, ++176, ++206, ++21, ++83, ++32, ++247, ++112, ++206, ++22, ++99, ++32, ++247, ++48, ++206, ++23, ++115, ++32, ++247, ++240, ++205, ++24, ++131, ++32, ++247, ++176, ++205, ++25, ++147, ++32, ++247, ++112, ++205, ++26, ++163, ++32, ++247, ++48, ++205, ++27, ++179, ++32, ++247, ++240, ++204, ++28, ++195, ++32, ++247, ++176, ++204, ++29, ++211, ++32, ++247, ++112, ++204, ++30, ++227, ++32, ++247, ++48, ++204, ++31, ++243, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++0, ++237, ++32, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++111, ++3, ++4, ++254, ++0, ++128, ++0, ++4, ++0, ++248, ++0, ++0, ++2, ++232, ++32, ++0, ++0, ++0, ++140, ++248, ++32, ++0, ++0, ++0, ++224, ++35, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++193, ++232, ++0, ++1, ++0, ++0, ++1, ++106, ++116, ++30, ++90, ++0, ++169, ++3, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++137, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++129, ++0, ++131, ++102, ++0, ++158, ++67, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++108, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++100, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++161, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++150, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++182, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++112, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++101, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++103, ++255, ++239, ++3, ++0, ++254, ++0, ++143, ++92, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++93, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++210, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++211, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++107, ++0, ++8, ++255, ++99, ++23, ++0, ++212, ++192, ++51, ++0, ++0, 
++8, ++255, ++163, ++23, ++0, ++228, ++192, ++51, ++0, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++52, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++52, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++0, ++143, ++12, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++13, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++18, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++19, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++33, ++0, ++8, ++255, ++99, ++3, ++0, ++212, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++3, ++0, ++228, ++192, ++51, ++0, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++4, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++4, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++137, ++47, ++240, ++40, ++246, ++2, ++140, ++47, ++240, ++128, ++245, ++99, ++140, ++5, ++4, ++0, ++247, ++99, ++140, ++1, ++20, ++88, ++246, ++99, ++140, ++1, ++20, ++0, ++247, ++35, ++136, ++62, ++226, ++32, ++247, ++35, ++136, ++32, ++210, ++0, ++247, ++34, ++136, ++63, ++2, ++208, ++246, ++34, ++136, ++0, ++4, ++0, ++247, ++99, ++136, ++58, ++162, ++32, ++247, ++99, ++136, ++33, ++146, ++0, ++247, ++98, ++136, ++59, ++18, ++208, ++246, ++98, ++136, ++0, ++20, ++0, ++247, ++162, ++136, ++33, ++2, ++88, ++246, ++98, ++137, ++2, ++68, ++88, ++246, ++162, ++137, ++3, ++68, ++208, ++254, ++227, ++136, ++60, ++242, ++192, ++243, ++188, ++11, ++208, ++254, ++227, ++136, ++56, ++178, ++192, ++243, ++188, ++10, ++32, ++255, ++226, ++136, ++38, ++58, ++192, ++243, ++60, ++0, ++208, ++254, ++227, ++136, ++59, ++242, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++49, ++58, ++192, ++243, ++60, ++128, ++0, ++255, ++226, ++136, ++34, ++34, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++37, ++58, ++192, ++243, ++60, ++128, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++194, ++8, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++255, ++202, ++40, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++254, ++0, ++240, ++35, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++226, ++140, ++34, ++34, ++195, ++243, ++60, ++0, ++32, ++255, ++227, ++140, ++36, ++58, ++192, ++243, ++60, ++0, ++0, ++254, ++192, ++136, ++0, ++4, ++0, ++240, ++0, ++160, ++16, ++246, ++226, ++136, ++35, ++50, ++16, ++246, ++226, ++136, ++35, ++50, ++32, ++246, ++226, ++136, ++35, ++50, ++32, ++254, ++226, ++136, ++35, ++58, ++192, ++243, ++60, ++0, ++11, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++115, ++5, ++106, ++0, ++144, ++173, ++1, ++27, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++227, ++0, ++64, ++246, ++163, ++140, ++1, ++4, ++0, ++246, ++192, ++175, ++63, ++2, ++0, ++246, ++192, ++174, ++59, ++2, ++0, ++246, ++128, ++175, ++62, ++2, ++0, ++246, ++128, ++174, ++58, ++2, ++0, ++246, ++64, ++175, ++61, ++2, ++0, ++246, ++64, ++174, ++57, ++2, ++0, ++255, ++43, ++240, ++4, ++212, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++228, ++192, ++243, ++128, 
++10, ++64, ++254, ++43, ++240, ++1, ++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++191, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++143, ++52, ++242, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++212, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++180, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, ++190, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++143, ++52, ++226, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++180, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++212, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++196, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++189, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++143, ++52, ++210, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++148, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++164, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++228, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++187, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++142, ++52, ++178, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++148, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++244, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, ++186, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++142, ++52, ++162, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++244, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++148, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++132, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++185, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++142, ++52, ++146, ++192, ++243, ++60, ++128, ++64, ++255, ++98, ++141, ++0, ++52, ++192, ++243, ++0, ++0, ++0, ++254, ++0, ++240, ++53, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++177, ++0, ++88, ++246, ++163, ++140, ++1, ++4, ++128, ++245, ++99, ++141, ++10, ++4, ++88, ++246, ++162, ++138, ++1, ++68, ++0, ++247, ++162, ++138, ++36, ++162, ++88, ++254, ++162, ++138, ++3, ++164, ++192, ++243, ++128, ++11, ++0, ++255, ++226, ++137, ++32, ++2, ++195, ++243, ++60, ++0, ++32, ++247, ++226, ++137, ++42, ++114, ++0, ++255, ++34, ++138, ++33, ++18, ++195, ++243, ++60, ++0, ++32, ++247, ++34, ++138, ++42, ++130, ++16, ++246, ++98, ++138, ++40, ++114, ++16, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++226, ++137, ++41, ++146, ++40, ++246, ++34, ++138, ++41, ++146, ++32, ++247, ++163, ++141, ++63, ++178, ++32, ++247, ++227, ++141, ++62, ++162, ++0, ++254, ++0, ++240, ++8, ++4, ++0, ++240, ++128, ++11, ++128, ++253, ++35, ++240, ++9, ++100, ++192, ++243, ++128, 
++10, ++128, ++253, ++163, ++141, ++128, ++115, ++192, ++243, ++152, ++10, ++88, ++246, ++163, ++141, ++4, ++100, ++208, ++246, ++35, ++139, ++0, ++100, ++32, ++255, ++34, ++139, ++53, ++202, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++139, ++0, ++4, ++0, ++240, ++0, ++160, ++240, ++246, ++163, ++141, ++48, ++98, ++0, ++247, ++99, ++139, ++63, ++210, ++0, ++247, ++98, ++139, ++1, ++212, ++88, ++254, ++98, ++139, ++1, ++212, ++192, ++243, ++128, ++11, ++32, ++255, ++99, ++139, ++62, ++98, ++192, ++243, ++188, ++10, ++88, ++246, ++98, ++139, ++1, ++212, ++240, ++246, ++98, ++139, ++50, ++210, ++0, ++247, ++163, ++128, ++59, ++146, ++0, ++247, ++160, ++128, ++1, ++36, ++88, ++254, ++160, ++128, ++1, ++36, ++192, ++243, ++128, ++11, ++0, ++247, ++163, ++128, ++58, ++98, ++64, ++255, ++35, ++240, ++0, ++100, ++192, ++243, ++128, ++10, ++64, ++255, ++163, ++128, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++160, ++128, ++1, ++36, ++240, ++246, ++160, ++128, ++50, ++34, ++8, ++255, ++227, ++143, ++54, ++242, ++192, ++243, ++60, ++128, ++40, ++255, ++227, ++142, ++54, ++178, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++39, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++143, ++45, ++226, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++44, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++40, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++142, ++2, ++162, ++192, ++243, ++60, ++128, ++90, ++0, ++169, ++3, ++14, ++96, ++4, ++31, ++169, ++3, ++30, ++96, ++1, ++31, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++132, ++24, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++143, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++135, ++0, ++131, ++102, ++0, ++158, ++71, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++132, ++24, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++112, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++104, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++134, ++24, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++123, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++112, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++178, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++134, ++24, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++72, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++61, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, 
++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++95, ++255, ++239, ++3, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++47, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++13, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++140, ++47, ++240, ++32, ++247, ++35, ++141, ++63, ++178, ++64, ++254, ++35, ++141, ++2, ++68, ++192, ++243, ++128, ++11, ++32, ++255, ++35, ++240, ++58, ++226, ++192, ++243, ++188, ++10, ++0, ++254, ++0, ++141, ++4, ++4, ++0, ++240, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++240, ++246, ++35, ++141, ++48, ++66, ++0, ++247, ++227, ++143, ++52, ++242, ++32, ++247, ++227, ++142, ++52, ++178, ++90, ++0, ++161, ++3, ++6, ++64, ++23, ++64, ++96, ++8, ++70, ++98, ++97, ++8, ++70, ++98, ++98, ++8, ++70, ++98, ++99, ++8, ++70, ++98, ++100, ++8, ++70, ++98, ++101, ++8, ++70, ++98, ++255, ++159, ++8, ++250, ++23, ++102, ++7, ++106, ++112, ++30, ++33, ++3, ++}; +diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h +new file mode 100644 +index 0000000000..2901b6568d +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform8.h +@@ -0,0 +1,3070 @@ ++static const unsigned char rpi_hevc_transform8 [] = { +21, +106, +0, @@ -13896,932 +22264,9 @@ index 0000000..4309f1c +33, +3, +}; -diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s -new file mode 100644 -index 0000000..5543093 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform.s -@@ -0,0 +1,917 @@ -+# ****************************************************************************** -+# Argon Design Ltd. -+# (c) Copyright 2015 Argon Design Ltd. All rights reserved. 
-+# -+# Module : HEVC -+# Author : Peter de Rivaz -+# ****************************************************************************** -+ -+# HEVC VPU Transform -+# -+# Transform matrix can be thought of as -+# output row vector = input row vector * transMatrix2 -+# -+# The even rows of the matrix are symmetric -+# The odd rows of the matrix are antisymmetric -+# -+# So only need to compute the first half of the results, then can compute the remainder with a butterfly -+# -+# EXAMPLE -+# (a b c d) (1 2 2 1) -+# (3 4 -4 -3) -+# (5 6 6 5) -+# (7 8 -8 -7) -+# -+# x=(a c)(1 2) = 1a+5c 2a+6c -+# (5 6) -+# -+# y=(b d)(3 4) = 3b+7d 4b+8d -+# (7 8) -+# -+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d -+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d -+# -+# Final results are (u , v[::-1]) -+# -+# -+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0) -+# Apply the even matrix first and stop before rounding -+# Then apply the odd matrix in a full manner: -+# -+# First step is to compute partial products with the first input (16 cycles) -+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output -+# 2a 4b 6c 8d -+# 2a -4b 6c -8d -+# 1a -3b 5c -7d -+# -+# Second step is to sum partial products into final position (8 cycles) -+# 1a+3b+5c+7d -+# 2a+4b+6c+8d -+# 2a-4b+6c-8d -+# 1a-3b+5c-7d -+# -+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format) -+# -+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds) -+# -+# For 8x8 we could compute two in parallel. -+# -+# -+ -+# Columns are transformed first -+# -+# Store top left half of transMatrix2 in -+# Store bottom left half of transMatrix2 in HX(32,32) -+# -+# For 16x16 -+# HX(0:15,0) contains input data before transform -+# HY(0:15,0) contains 32bit output data after transform -+# HX(32,0) contains even rows of left half of transMatrix2 -+# HX(32,32) contains odd rows of left half of transMatrix2 -+# HY(48,0) contains partial products ready for summing -+# -+ -+ -+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# coeffs32 -+# num32: number of 32x32 transforms -+# command 0 for transform, 1 for memclear16(int16_t *dst,num16) -+# -+hevc_trans_16x16: -+ cmp r5,1 -+ beq memclear16 -+ cmp r5,2 -+ beq hevc_deblock_16x16 -+ cmp r5,3 -+ beq hevc_uv_deblock_16x16 -+ cmp r5,4 -+ beq hevc_uv_deblock_16x16_with_clear -+ cmp r5,5 -+ beq hevc_run_command_list -+ -+ push r6-r15, lr # TODO cut down number of used registers -+ mov r14,r3 # coeffs32 -+ mov r15,r4 # num32 -+ mov r3, 16*2 # Stride of transMatrix2 in bytes -+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix -+ -+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix -+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ # Now use r0 to describe which matrix we are working on. -+ # Allows us to prefetch the next block of coefficients for efficiency. 
-+ mov r0,0 # This describes the location where we read our coefficients from -+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) -+ mov r7,16*16*2 # Total block size -+ mov r8,64*16 # Value used to swap from current to next VRF location -+ vldh HX(0++,0)+r0,(r1 += r3) REP 16 -+ mov r4,64 # Constant used for rounding first pass -+ mov r5,1<<11 # Constant used for rounding second pass -+ -+ # At start of block r0,r1 point to the current block (that has already been loaded) -+block_loop: -+ eor r0,r8 -+ add r1,r7 -+ # Prefetch the next block -+ vldh HX(0++,0)+r0,(r1 += r3) REP 16 -+ eor r0,r8 -+ sub r1,r7 -+ -+ # Transform the current block -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? -+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position -+ -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) -+ -+ # Save results - note there has been a transposition during the processing so we save columns -+ vsth VX(0,32++)+r0, (r1 += r3) REP 16 -+ -+ # Move onto next block -+ eor r0,r8 -+ add r1,r7 -+ -+ addcmpbgt r2,-1,0,block_loop -+ -+ # Now go and do any 32x32 transforms -+ b hevc_trans_32x32 -+ -+ pop r6-r15, pc -+ -+# r1,r2,r3 r7,r8 should be preserved -+# HX(0++,0)+r0 is the block to be transformed -+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients -+# Use HY(48,0) for intermediate results -+# r0 can be used, but should be returned to its original value at the end -+col_trans_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+col_trans_odd_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_odd_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_odd_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# -+hevc_trans_32x32: -+ mov r1,r14 # coeffs -+ mov r2,r15 # num -+ -+ # Fetch odd transform matrix -+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients) -+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix -+ #add 
r0, 16*16*2 -+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer -+ mov r7, 16*16*2 # Total block size -+ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) -+ # set r8 to 32byte aligned stack pointer -+ add r8,sp,31 -+ lsr r8,5 -+ lsl r8,5 -+ mov r9,r8 # Backup of the temporary storage -+ mov r10,r1 # Backup of the coefficient buffer -+block_loop32: -+ -+ # COLUMN TRANSFORM -+ mov r4, 64 # Constant used for rounding first pass -+ mov r5, 9 # left shift used for rounding first pass -+ -+ # Transform the first 16 columns -+ mov r1,r10 # Input Coefficient buffer -+ mov r8,r9 # Output temporary storage -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ # ROW TRANSFORM -+ mov r4, 1<<11 # Constant used for rounding second pass -+ mov r5, 4 # left shift used for rounding second pass -+ -+ mov r1,r9 # Input temporary storage -+ mov r8,r10 # Output Coefficient buffer -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ add r10, 32*32*2 # move onto next block of coefficients -+ addcmpbgt r2,-1,0,block_loop32 -+ -+ add sp,sp,32*32*2+32 # Restore stack -+ -+ pop r6-r15, pc -+ -+trans32: -+ push lr -+ # We can no longer afford the VRF space to do prefetching when doing 32x32 -+ # Fetch the even rows -+ vldh HX(0++,0),(r1 += r3) REP 16 -+ # Fetch the odd rows -+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 -+ -+ # Transform the even rows using even matrix -+ mov r0, 0 # Even rows -+ bl col_trans_16 -+ -+ # Now transform the odd rows using odd matrix -+ mov r0, 64*16 # Odd rows -+ bl col_trans_odd_16 -+ -+ # Now apply butterfly to compute the first 16 results -+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ # 16bit results now in HX(48,32) -+ mov r0,r8 -+ mov r6,32*2 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ -+ # Now apply butterfly to compute the second 16 results (in reverse order) -+ vsub HY(63,0),HY(0 ,0),HY(16,0) -+ vsub HY(62,0),HY(1 ,0),HY(17,0) -+ vsub HY(61,0),HY(2 ,0),HY(18,0) -+ vsub HY(60,0),HY(3 ,0),HY(19,0) -+ vsub HY(59,0),HY(4 ,0),HY(20,0) -+ vsub HY(58,0),HY(5 ,0),HY(21,0) -+ vsub HY(57,0),HY(6 ,0),HY(22,0) -+ vsub HY(56,0),HY(7 ,0),HY(23,0) -+ vsub HY(55,0),HY(8 ,0),HY(24,0) -+ vsub HY(54,0),HY(9 ,0),HY(25,0) -+ vsub HY(53,0),HY(10,0),HY(26,0) -+ vsub HY(52,0),HY(11,0),HY(27,0) -+ vsub HY(51,0),HY(12,0),HY(28,0) -+ vsub HY(50,0),HY(13,0),HY(29,0) -+ vsub HY(49,0),HY(14,0),HY(30,0) -+ vsub HY(48,0),HY(15,0),HY(31,0) -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ add r0,r8,32 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ pop pc -+ -+memclear16: -+ # r0 is address -+ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified) -+ vmov HX(0++,0),0 REP 16 -+ mov r2,32 -+loop: -+ vsth HX(0++,0),(r0+=r2) REP 16 -+ add r0,16*16*2 -+ sub r1,16*16 -+ cmp r1,0 -+ bgt loop -+ b lr -+ -+ -+################################################################################ -+# HEVC VPU Deblock -+# -+# Vertical edges before horizontal -+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked -+# -+# ARM is responsible for 
storing beta and tc for each 4 pixels horiz and vert edge. -+# The VPU code works in units of 16x16 blocks. -+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time). -+# One final horizontal filter is required at the end. -+# PCM is not allowed in this code. -+# -+# -+# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering) -+# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering. -+ -+.set P0,63 -+.set P1,62 -+.set P2,61 -+.set P3,60 -+.set Q0,59 -+.set Q1,58 -+.set Q2,57 -+.set Q3,56 -+ -+.set dp,32 -+.set dq,33 -+.set d,34 -+.set decision,35 -+.set beta,36 -+.set beta2,37 -+.set beta3,38 -+.set ptest,39 -+.set qtest,40 -+.set pqtest,41 -+.set thresh,42 -+.set deltatest, 44 -+.set deltap1, 45 -+.set tc25, 46 -+.set setup,47 -+.set tc,48 -+.set tc25,49 -+.set tc2, 50 -+.set do_filter, 51 -+.set delta, 52 -+.set tc10, 53 -+.set delta0, 54 -+.set delta1, 55 -+.set zeros, 0 -+.set setup_input, 1 -+.set deltaq1, 2 -+ -+ -+ -+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image. -+# Row has num16 16x16 blocks across -+# Beta goes from 0 to 64 -+# tc goes from 0 to 24 -+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number] -+# has 8 bytes per edge -+# has 16 bytes per direction -+# has 32 bytes per 16x16 block -+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4)) -+hevc_deblock_16x16: -+ push r6-r15, lr -+ mov r9,r4 -+ mov r4,r3 -+ mov r13,r2 -+ mov r2,r0 -+ mov r10,r0 -+ subscale4 r0,r1 -+ mov r8,63 -+ mov r6,-3 -+ vmov H(zeros,0),0 -+# r7 is number of blocks still to load -+# r0 is location of current block - 4 * stride -+# r1 is stride -+# r2 is location of current block -+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical -+# r4 is setup -+# r5 is for temporary calculations -+# r8 holds 63 -+# r6 holds -3 -+# r9 holds the number of 16 high rows to process -+# r10 holds the original img base -+# r11 returns 0 if no filtering was done on the edge -+# r12 saves a copy of this -+# r13 is copy of width -+ -+process_row: -+ # First iteration does not do horizontal filtering on previous -+ mov r7, r13 -+ mov r3,0 -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) # We may wish to prefetch these -+ vstb H(zeros,0),(r4) -+ bl vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 -+ bl vert_filter -+ sub r3,8 -+ b start_deblock_loop -+deblock_loop: -+ # Middle iterations do vertical on current block and horizontal on preceding -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) -+ vstb H(zeros,0),(r4) -+ bl vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl vert_filter -+ sub r3,8 -+ vldb H(setup_input,0), -16(r4) -+ vstb H(zeros,0),-16(r4) -+ bl horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl horz_filter -+ sub r3,8*64 -+ addcmpbeq r12,0,0,skip_save_top -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+skip_save_top: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 
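
The deblock loop above is easier to follow as scalar pseudo-code. Below is a minimal C sketch of the control flow only, with hypothetical helpers (load_block, vert_filter_pair, horz_filter_pair, store_block, store_rows_above) standing in for the VPU loads, the vert_filter/horz_filter calls and the conditional stores; this is for orientation and is not code from the patch:

    /* Sketch: vertical edges of block b are filtered as soon as it is
     * loaded; the horizontal edges of block b-1 are filtered one
     * iteration later, mirroring deblock_loop above. */
    void load_block(int b), vert_filter_pair(int b);
    int  horz_filter_pair(int b);            /* nonzero if any edge filtered */
    void store_block(int b), store_rows_above(int b);

    static void deblock_row_sketch(int num16w)
    {
        for (int b = 0; b < num16w; b++) {
            load_block(b);
            vert_filter_pair(b);              /* two 8-pixel vertical edges */
            if (b > 0) {                      /* horizontal pass lags by 1  */
                if (horz_filter_pair(b - 1))
                    store_rows_above(b - 1);  /* 4 context rows, if touched */
                store_block(b - 1);
            }
        }
        if (horz_filter_pair(num16w - 1))     /* final horizontal-only pass */
            store_rows_above(num16w - 1);
        store_block(num16w - 1);
    }

The one-block lag exists because vertically filtering the left edge of block b also rewrites the rightmost pixels of block b-1, so b-1's horizontal edges cannot be filtered until b has been loaded and vertically filtered.
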
-+start_deblock_loop: -+ # move onto next 16x16 (could do this with circular buffer support instead) -+ add r3,16 -+ and r3,r8 -+ add r4,32 -+ # Perform loop counter operations (may work with an addcmpbgt as well?) -+ add r0,16 -+ add r2,16 -+ sub r7,1 -+ cmp r7,0 # Are there still more blocks to load -+ bgt deblock_loop -+ -+ # Final iteration needs to just do horizontal filtering -+ vldb H(setup_input,0), -16(r4) -+ vstb H(zeros,0),-16(r4) -+ bl horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl horz_filter -+ sub r3,64*8 -+ addcmpbeq r12,0,0,skip_save_top2 -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+skip_save_top2: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+ -+# Now look to see if we should do another row -+ sub r9,1 -+ cmp r9,0 -+ bgt start_again -+ pop r6-r15, pc -+start_again: -+ # Need to sort out r0,r2 to point to next row down -+ addscale16 r10,r1 -+ mov r2,r10 -+ subscale4 r0,r2,r1 -+ b process_row -+ -+ -+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered -+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations -+ -+vert_filter: -+ push lr -+ -+ vmov HX(P3,0), V(16,12)+r3 -+ vmov HX(P2,0), V(16,13)+r3 -+ vmov HX(P1,0), V(16,14)+r3 -+ vmov HX(P0,0), V(16,15)+r3 -+ vmov HX(Q0,0), V(16,16)+r3 -+ vmov HX(Q1,0), V(16,17)+r3 -+ vmov HX(Q2,0), V(16,18)+r3 -+ vmov HX(Q3,0), V(16,19)+r3 -+ -+ bl do_luma_filter -+ -+ vadds V(16,13)+r3, HX(P2,0), 0 -+ vadds V(16,14)+r3, HX(P1,0), 0 -+ vadds V(16,15)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds V(16,16)+r3, HX(Q0,0), 0 -+ vadds V(16,17)+r3, HX(Q1,0), 0 -+ vadds V(16,18)+r3, HX(Q2,0), 0 -+ -+ pop pc -+ -+# Filter edge at H(16,0)+r3 -+horz_filter: -+ push lr -+ -+ vmov HX(P3,0), H(12,0)+r3 -+ vmov HX(P2,0), H(13,0)+r3 -+ vmov HX(P1,0), H(14,0)+r3 -+ vmov HX(P0,0), H(15,0)+r3 -+ vmov HX(Q0,0), H(16,0)+r3 -+ vmov HX(Q1,0), H(17,0)+r3 -+ vmov HX(Q2,0), H(18,0)+r3 -+ vmov HX(Q3,0), H(19,0)+r3 -+ -+ bl do_luma_filter -+ -+ vadds H(13,0)+r3, HX(P2,0), 0 -+ vadds H(14,0)+r3, HX(P1,0), 0 -+ vadds H(15,0)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds H(16,0)+r3, HX(Q0,0), 0 -+ vadds H(17,0)+r3, HX(Q1,0), 0 -+ vadds H(18,0)+r3, HX(Q2,0), 0 -+ -+ pop pc -+ -+# r4 points to array of beta/tc for each 4 length edge -+do_luma_filter: -+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8 -+ valtl HX(beta,0),H(setup,0),H(setup,0) -+ valtu HX(tc,0),H(setup,0),H(setup,0) -+ vmul HX(tc25,0), HX(tc,0), 5 -+ vadd HX(tc25,0),HX(tc25,0), 1 -+ vasr HX(tc25,0), HX(tc25,0), 1 -+ -+ # Compute decision -+ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1 -+ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1 -+ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0 -+ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0 -+ -+ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1 -+ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1 -+ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0 -+ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0 -+ -+ vadd HX(d,0), HX(dp,0), HX(dq,0) -+ vasr HX(beta2,0),HX(beta,0),2 -+ vasr HX(beta3,0),HX(beta,0),3 -+ -+ # Compute flags that are negative if all conditions pass -+ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC -+ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC -+ vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF -+ -+ vdist HX(decision,0), HX(P0,0), 
HX(Q0,0) IFN -+ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF -+ vadd HX(decision,0), HX(d,0), HX(d,0) IFN -+ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF -+ vmov HX(decision,0), 1 IFNN -+ vadd H(decision,0),H(decision,3),0 IFN -+ vadd H(decision,16),H(decision,19),0 IFN -+ vmov -,HX(decision,0) SETF # N marks strong filter -+ vmov HX(decision,0), 1 IFNN # NN marks normal filter -+ -+ vadd HX(do_filter,0), HX(d,3), HX(d,0) -+ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter -+ vmov HX(decision,0),0 IFNN # Z marks no filter -+ -+ # Expand out decision (currently valid one every 4 pixels) 0...1...2...3 -+ # First extract out even terms -+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3 -+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123 -+ # Now expand back -+ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233 -+ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333 -+ -+ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering -+ -+ # Do a quick check to see if there is anything to do -+ mov r11, 0 # Signal no filtering -+ vmov -,1 IFNZ SUMS r5 -+ cmp r5,0 -+ beq filtering_done -+ mov r11, 1 # Signal some filtering -+ # And whether there is any strong filtering -+ vmov -,1 IFN SUMS r5 -+ cmp r5,0 -+ beq normal_filtering -+ -+ ############################################################################## -+ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!) -+ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tx2 is tc/2, while here it is tc*2 -+ -+ # Take a copy of the original pixels for use in decision calculation -+ vmov HX(P0,32),HX(P0,0) -+ vmov HX(Q0,32),HX(Q0,0) -+ vmov HX(P1,32),HX(P1,0) -+ vmov HX(Q1,32),HX(Q1,0) -+ vmov HX(P2,32),HX(P2,0) -+ vmov HX(Q2,32),HX(Q2,0) -+ -+ vadd -,HX(P2,32),4 CLRA SACC -+ vshl -,HX(P1,32),1 SACC -+ vshl -,HX(P0,32),1 SACC -+ vshl -,HX(Q0,32),1 SACC -+ vshl HX(delta,0),HX(Q1,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(P0,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN -+ -+ vadd -,HX(P2,32),2 CLRA SACC -+ vadd -,HX(P1,32),HX(P0,32) SACC -+ vshl HX(delta,0),HX(Q0,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 2 -+ vsub HX(delta,0),HX(delta,0),HX(P1,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN -+ -+ vadd -,HX(Q0,32),4 CLRA SACC -+ vadd -,HX(P1,32),HX(P0,32) SACC -+ vmul -,HX(P2,32),3 SACC -+ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(P2,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN -+ #vmov HX(P2,0),3 IFN -+ -+ # Now reverse all P/Qs -+ -+ vadd -,HX(Q2,32),4 CLRA SACC -+ vshl -,HX(Q1,32),1 SACC -+ vshl -,HX(Q0,32),1 SACC -+ vshl -,HX(P0,32),1 SACC -+ vshl HX(delta,0),HX(P1,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(Q0,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN -+ -+ vadd -,HX(Q2,32),2 CLRA SACC -+ vadd -,HX(Q1,32),HX(Q0,32) SACC -+ vshl HX(delta,0),HX(P0,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 2 -+ vsub HX(delta,0),HX(delta,0),HX(Q1,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN -+ -+ vadd -,HX(P0,32),4 CLRA 
SACC -+ vadd -,HX(Q1,32),HX(Q0,32) SACC -+ vmul -,HX(Q2,32),3 SACC -+ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(Q2,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN -+ -+ ############################################################################## -+ # Normal filtering -+normal_filtering: -+ # Invert the decision flags -+ # make instruction more complicated as assembler has error and loses SETF -+ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering -+ vmov -, HX(tc10,0) SETF # IFN means normal filtering -+ -+ vmov -,1 IFN SUMS r5 -+ cmp r5,0 -+ beq filtering_done -+ -+ vasr HX(tc2,0), HX(tc,0), 1 -+ vmul HX(tc10,0), HX(tc,0), 10 -+ -+ vasr HX(thresh,0), HX(beta,0), 1 -+ vadd HX(thresh,0), HX(thresh,0), HX(beta,0) -+ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC -+ -+ vadd HX(ptest,0),HX(dp,3),HX(dp,0) -+ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel -+ vadd HX(qtest,0),HX(dq,3),HX(dq,0) -+ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel -+ # Expand ptest and qtest together -+ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q -+ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........ -+ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq -+ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0) -+ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0) -+ -+ vsub HX(delta0,0), HX(Q0,0), HX(P0,0) -+ vsub HX(delta1,0), HX(Q1,0), HX(P1,0) -+ vmov -,8 CLRA SACC -+ vmul -,HX(delta0,0), 9 SACC -+ vmul HX(delta0,0),HX(delta1,0), r6 SACC -+ vasr HX(delta0,0), HX(delta0,0), 4 -+ vdist HX(deltatest,0), HX(delta0,0), 0 -+ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something -+ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later -+ -+ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0) -+ -+ vadd HX(deltap1,0), HX(P2,0), HX(P0,0) -+ vadd HX(deltap1,0), HX(deltap1,0), 1 -+ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC -+ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC -+ vasr HX(deltap1,0), HX(deltap1,0), 1 -+ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0) -+ -+ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0) -+ vadd HX(deltaq1,0), HX(deltaq1,0), 1 -+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC -+ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0) -+ vrsub -, HX(delta0,0), 0 SACC -+ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC -+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 -+ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0) -+ -+ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN -+ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN -+ -+ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1 -+ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN -+ -+ vmov -,HX(deltatest,0) SETF -+ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1 -+ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN -+ -+ #vmov HX(P2,0),1 IFN -+ -+filtering_done: -+ b lr -+ -+ -+hevc_uv_deblock_16x16: -+ push r6-r15, lr -+ mov r14,0 -+ b hevc_uv_start -+hevc_uv_deblock_16x16_with_clear: -+ push r6-r15, lr -+ mov r14,1 -+ b hevc_uv_start -+ -+hevc_uv_start: -+ mov r9,r4 -+ mov r4,r3 -+ mov r13,r2 -+ mov r2,r0 -+ mov r10,r0 -+ subscale4 r0,r1 -+ mov r8,63 -+ mov r6,-3 -+ vmov H(zeros,0),0 -+# r7 is number of blocks still to load -+# r0 is location of current block - 4 * stride -+# r1 is stride -+# r2 is location of current 
block -+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical -+# r4 is setup -+# r5 is for temporary calculations -+# r8 holds 63 -+# r6 holds -3 -+# r9 holds the number of 16 high rows to process -+# r10 holds the original img base -+# r11 returns 0 if no filtering was done on the edge -+# r12 saves a copy of this -+# r13 is copy of width -+# r14 is 1 if we should clear the old contents, or 0 if not -+ -+uv_process_row: -+ # First iteration does not do horizontal filtering on previous -+ mov r7, r13 -+ mov r3,0 -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) # We may wish to prefetch these -+ cmp r14,1 -+ bne uv_skip0 -+ vstb H(zeros,0),(r4) -+uv_skip0: -+ bl uv_vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 -+ bl uv_vert_filter -+ sub r3,8 -+ b uv_start_deblock_loop -+uv_deblock_loop: -+ # Middle iterations do vertical on current block and horizontal on preceding -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) -+ cmp r14,1 -+ bne uv_skip1 -+ vstb H(zeros,0),(r4) -+uv_skip1: -+ bl uv_vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_vert_filter -+ sub r3,8 -+ vldb H(setup_input,0), -16(r4) -+ cmp r14,1 -+ bne uv_skip3 -+ vstb H(zeros,0),-16(r4) -+uv_skip3: -+ bl uv_horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_horz_filter -+ sub r3,8*64 -+ addcmpbeq r12,0,0,uv_skip_save_top -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+uv_skip_save_top: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+uv_start_deblock_loop: -+ # move onto next 16x16 (could do this with circular buffer support instead) -+ add r3,16 -+ and r3,r8 -+ add r4,32 -+ # Perform loop counter operations (may work with an addcmpbgt as well?) 
-+ add r0,16 -+ add r2,16 -+ sub r7,1 -+ cmp r7,0 # Are there still more blocks to load -+ bgt uv_deblock_loop -+ -+ # Final iteration needs to just do horizontal filtering -+ vldb H(setup_input,0), -16(r4) -+ cmp r14,1 -+ bne uv_skip2 -+ vstb H(zeros,0),-16(r4) -+uv_skip2: -+ bl uv_horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_horz_filter -+ sub r3,64*8 -+ addcmpbeq r12,0,0,uv_skip_save_top2 -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+uv_skip_save_top2: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+ -+# Now look to see if we should do another row -+ sub r9,1 -+ cmp r9,0 -+ bgt uv_start_again -+ pop r6-r15, pc -+uv_start_again: -+ # Need to sort out r0,r2 to point to next row down -+ addscale16 r10,r1 -+ mov r2,r10 -+ subscale4 r0,r2,r1 -+ b uv_process_row -+ -+ -+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered -+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations -+ -+uv_vert_filter: -+ push lr -+ -+ vmov HX(P1,0), V(16,14)+r3 -+ vmov HX(P0,0), V(16,15)+r3 -+ vmov HX(Q0,0), V(16,16)+r3 -+ vmov HX(Q1,0), V(16,17)+r3 -+ -+ bl do_chroma_filter -+ -+ vadds V(16,15)+r3, HX(P0,0), 0 -+ vadds V(16,16)+r3, HX(Q0,0), 0 -+ -+ pop pc -+ -+# Filter edge at H(16,0)+r3 -+uv_horz_filter: -+ push lr -+ -+ vmov HX(P1,0), H(14,0)+r3 -+ vmov HX(P0,0), H(15,0)+r3 -+ vmov HX(Q0,0), H(16,0)+r3 -+ vmov HX(Q1,0), H(17,0)+r3 -+ -+ bl do_chroma_filter -+ -+ vadds H(15,0)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds H(16,0)+r3, HX(Q0,0), 0 -+ -+ pop pc -+ -+# r4 points to array of beta/tc for each 4 length edge -+do_chroma_filter: -+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8 -+ valtl HX(tc,0),H(setup,0),H(setup,0) -+ -+ vsub HX(delta,0),HX(Q0,0),HX(P0,0) -+ vshl HX(delta,0),HX(delta,0),2 CLRA SACC -+ vsub -,HX(P1,0),HX(Q1,0) SACC -+ vmov HX(delta,0),4 SACC -+ vasr HX(delta,0),HX(delta,0),3 -+ vclamps HX(delta,0), HX(delta,0), HX(tc,0) -+ vadd HX(P0,0),HX(P0,0),HX(delta,0) -+ vsub HX(Q0,0),HX(Q0,0),HX(delta,0) -+ b lr -+ -+# r0 = list -+# r1 = number -+hevc_run_command_list: -+ push r6-r7, lr -+ mov r6, r0 -+ mov r7, r1 -+loop_cmds: -+ ld r0,(r6) # How to encode r6++? 
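
The sequence of loads around this point defines the command-stream format: hevc_run_command_list walks a flat array of six-word records, loading each into r0-r5 and calling hevc_trans_16x16. A hypothetical C-side view of the same walk (struct and function names invented for illustration):

    /* Sketch of the list consumed by hevc_run_command_list: six 32-bit
     * words per command, passed straight through as r0..r5. */
    #include <stdint.h>

    typedef struct vpu_cmd {
        uint32_t r[6];   /* arguments for one hevc_trans_16x16 call */
    } vpu_cmd_t;

    static void run_command_list_sketch(const vpu_cmd_t *cmd, int number,
                                        void (*trans16)(uint32_t, uint32_t,
                                                        uint32_t, uint32_t,
                                                        uint32_t, uint32_t))
    {
        for (int i = 0; i < number; i++, cmd++)
            trans16(cmd->r[0], cmd->r[1], cmd->r[2],
                    cmd->r[3], cmd->r[4], cmd->r[5]);
    }

The "How to encode r6++?" comment is asking whether the VPU load has a post-increment addressing form that would fold away the separate add r6,4 instructions; the sketch sidesteps that with plain pointer arithmetic.
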
-+ add r6,4 -+ ld r1,(r6) -+ add r6,4 -+ ld r2,(r6) -+ add r6,4 -+ ld r3,(r6) -+ add r6,4 -+ ld r4,(r6) -+ add r6,4 -+ ld r5,(r6) -+ add r6,4 -+ bl hevc_trans_16x16 -+ sub r7,1 -+ cmp r7,0 -+ bgt loop_cmds -+ -+ pop r6-r7, pc diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c new file mode 100644 -index 0000000..0255f5d +index 0000000000..0255f5dd44 --- /dev/null +++ b/libavcodec/rpi_mailbox.c @@ -0,0 +1,149 @@ @@ -14976,7 +22421,7 @@ index 0000000..0255f5d + diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h new file mode 100644 -index 0000000..b316878 +index 0000000000..b3168788d2 --- /dev/null +++ b/libavcodec/rpi_mailbox.h @@ -0,0 +1,58 @@ @@ -15040,10 +22485,10 @@ index 0000000..b316878 +#endif diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c new file mode 100644 -index 0000000..36c8ab6 +index 0000000000..e872b855b7 --- /dev/null +++ b/libavcodec/rpi_qpu.c -@@ -0,0 +1,878 @@ +@@ -0,0 +1,935 @@ +#ifdef RPI +#include +#include @@ -15062,8 +22507,9 @@ index 0000000..36c8ab6 +#include "rpi_mailbox.h" +#include "rpi_qpu.h" +#include "rpi_shader.h" -+#include "rpi_hevc_transform.h" -+#include "rpi_zc.h" ++#include "rpi_hevc_transform8.h" ++#include "rpi_hevc_transform10.h" ++#include "libavutil/rpi_sand_fns.h" + +#pragma GCC diagnostic push +// Many many redundant decls in the header files @@ -15090,7 +22536,7 @@ index 0000000..36c8ab6 +#define vcos_verify_ge0(x) ((x)>=0) + +// Size in 32bit words -+#define QPU_CODE_SIZE 2048 ++#define QPU_CODE_SIZE 4098 +#define VPU_CODE_SIZE 2048 + +static const short rpi_transMatrix2even[32][16] = { // Even rows first @@ -15133,7 +22579,8 @@ index 0000000..36c8ab6 +struct GPU +{ + unsigned int qpu_code[QPU_CODE_SIZE]; -+ unsigned int vpu_code[VPU_CODE_SIZE]; ++ unsigned int vpu_code8[VPU_CODE_SIZE]; ++ unsigned int vpu_code10[VPU_CODE_SIZE]; + short transMatrix2even[16*16*2]; +}; + @@ -15145,8 +22592,9 @@ index 0000000..36c8ab6 +#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) + +struct rpi_cache_flush_env_s { -+ unsigned int n; -+ struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++// unsigned int n; ++// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++ struct vcsm_user_clean_invalid2_s v; +}; + +#define WAIT_COUNT_MAX 16 @@ -15188,6 +22636,7 @@ index 0000000..36c8ab6 + int open_count; + int init_count; + int mb; ++ int vpu_i_cache_flushed; + GPU_MEM_PTR_T code_gm_ptr; + vq_wait_pool_t wait_pool; +#if RPI_TRACE_TIME_VPU_QPU_WAIT @@ -15260,8 +22709,8 @@ index 0000000..36c8ab6 + +// GPU_MEM_PTR_T alloc fns +static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { -+ p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); ++ p->numbytes = (numbytes + 255) & ~255; // Round up ++ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); @@ -15272,12 +22721,14 @@ index 0000000..36c8ab6 + av_assert0(p->arm); + p->vc = mbox_mem_lock(mb, p->vc_handle); + av_assert0(p->vc); ++// printf("***** %s, %d\n", __func__, numbytes); ++ + return 0; +} + +static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { + p->numbytes = numbytes; -+ p->vcsm_handle = 
vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); ++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" ); + av_assert0(p->vcsm_handle); + p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); + av_assert0(p->vc_handle); @@ -15285,6 +22736,7 @@ index 0000000..36c8ab6 + av_assert0(p->arm); + p->vc = mbox_mem_lock(mb, p->vc_handle); + av_assert0(p->vc); ++// printf("***** %s, %d\n", __func__, numbytes); + return 0; +} + @@ -15293,6 +22745,7 @@ index 0000000..36c8ab6 + vcsm_unlock_ptr(p->arm); + vcsm_free(p->vcsm_handle); + memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again ++// printf("***** %s\n", __func__); +} + + @@ -15349,9 +22802,14 @@ index 0000000..36c8ab6 + } + // And the VPU code + { -+ int num_bytes = sizeof(rpi_hevc_transform); ++ int num_bytes = sizeof(rpi_hevc_transform8); + av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes); ++ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes); ++ } ++ { ++ int num_bytes = sizeof(rpi_hevc_transform10); ++ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); ++ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes); + } + // And the transform coefficients + memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); @@ -15442,10 +22900,18 @@ index 0000000..36c8ab6 + gpu_unlock_unref(ge); +} + -+unsigned int vpu_get_fn(void) { ++unsigned int vpu_get_fn(const unsigned int bit_depth) { + // Make sure that the gpu is initialized + av_assert0(gpu != NULL); -+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code); ++ switch (bit_depth){ ++ case 8: ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); ++ case 10: ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); ++ default: ++ av_assert0(0); ++ } ++ return 0; +} + +unsigned int vpu_get_constants(void) { @@ -15475,95 +22941,75 @@ index 0000000..36c8ab6 +// +// Cache flush functions + ++#define CACHE_EL_MAX 16 + +rpi_cache_flush_env_t * rpi_cache_flush_init() +{ -+ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t)); -+ if (rfe == NULL) -+ return NULL; ++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) + ++ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX); ++ if (rfe == NULL) ++ return NULL; + -+ rfe->n = 0; -+ return rfe; ++ rfe->v.op_count = 0; ++ return rfe; +} + +void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) +{ -+ if (rfe != NULL) -+ free(rfe); ++ if (rfe != NULL) ++ free(rfe); +} + +int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) +{ -+ int rc = 0; -+ unsigned int na; -+ unsigned int nr; ++ int rc = 0; + -+ // Clear any reamaining ents in the final block -+ if ((nr = rfe->n % CFE_ENTS_PER_A) != 0) -+ memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0])); ++ if (vcsm_clean_invalid2(&rfe->v) != 0) ++ rc = -1; + -+ for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na) -+ { -+ if (vcsm_clean_invalid(rfe->a + na) != 0) -+ rc = -1; -+ } ++ free(rfe); + -+ free(rfe); ++ if (rc == 0) ++ return 0; + -+ if (rc == 0) -+ return 0; -+ -+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); -+ return rc; ++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); ++ return rc; +} + -+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const 
unsigned int mode) ++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) +{ -+ // Deal with empty pointer trivially -+ if (gm == NULL || gm->numbytes == 0) -+ return; ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; + -+ { -+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); -+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; ++ av_assert0(rfe->v.op_count <= CACHE_EL_MAX); + -+ av_assert0(rfe->n < CFE_ENT_COUNT); -+ -+ a->s[n].cmd = mode; -+ a->s[n].handle = gm->vcsm_handle; -+ a->s[n].addr = (unsigned int)gm->arm; -+ a->s[n].size = gm->numbytes; -+ ++rfe->n; -+ } ++ b->invalidate_mode = mode; ++ b->block_count = blocks; ++ b->start_address = gm->arm + offset0; ++ b->block_size = block_size; ++ b->inter_block_stride = block_stride; +} + +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, + const unsigned int offset, const unsigned int size) +{ -+ // Deal with empty pointer trivially -+ if (gm == NULL || size == 0) -+ return; ++ // Deal with empty pointer trivially ++ if (gm == NULL || size == 0) ++ return; + -+// printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes); ++ av_assert0(offset <= gm->numbytes); ++ av_assert0(size <= gm->numbytes); ++ av_assert0(offset + size <= gm->numbytes); + -+ av_assert0(offset <= gm->numbytes); -+ av_assert0(size <= gm->numbytes); -+ av_assert0(offset + size <= gm->numbytes); -+ -+ { -+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); -+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; -+ -+ av_assert0(rfe->n < CFE_ENT_COUNT); -+ -+ a->s[n].cmd = mode; -+ a->s[n].handle = gm->vcsm_handle; -+ a->s[n].addr = (unsigned int)gm->arm + offset; -+ a->s[n].size = size; -+ ++rfe->n; -+ } ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); +} + ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++{ ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); ++} ++ ++ +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) +{ +#if !RPI_ONE_BUF @@ -15580,6 +23026,8 @@ index 0000000..36c8ab6 + } +} + ++// Flush an area of a frame ++// Width, height, x0, y0 in luma pels +void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, + const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, + const unsigned int uv_shift, const int do_luma, const int do_chroma) @@ -15610,7 +23058,7 @@ index 0000000..36c8ab6 + rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); + } + } -+ else if (!rpi_sliced_frame(frame)) ++ else if (!av_rpi_is_sand_frame(frame)) + { + const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); + if (do_luma) { @@ -15623,17 +23071,30 @@ index 0000000..36c8ab6 + } + else + { -+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); -+// printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 'c' : ' '); -+ // **** Use x0! 
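
Taken together, the rpi_cache_flush_* functions in this region form a small batching API: ranges are queued into the env and a single vcsm_clean_invalid2 call flushes them all when finish is called. A hedged usage sketch follows; RPI_CACHE_FLUSH_MODE_WB_INVALIDATE is an assumed enumerator of rpi_cache_flush_mode_t, whose actual values are declared elsewhere in rpi_qpu.h:

    /* Sketch: queue one range, flush in one ioctl. finish() frees the
     * env whether or not the flush succeeds; abort() frees it unused. */
    #include "rpi_qpu.h"   /* GPU_MEM_PTR_T, rpi_cache_flush_* */

    static int flush_gm_sketch(const GPU_MEM_PTR_T *gm,
                               unsigned int offset, unsigned int size)
    {
        rpi_cache_flush_env_t *const rfe = rpi_cache_flush_init();
        if (rfe == NULL)
            return -1;
        rpi_cache_flush_add_gm_range(rfe, gm,
                                     RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, /* assumed name */
                                     offset, size);
        return rpi_cache_flush_finish(rfe);
    }

Batching matters here because each flush is a kernel round trip; queueing all of a frame's ranges and issuing one vcsm_clean_invalid2 keeps the per-slice overhead down.
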
-+ for (int x = 0; x < x0 + width; x += frame->linesize[0]) { -+ if (do_luma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, rpi_sliced_frame_off_y(frame, x, y0), y_size); -+ } -+ if (do_chroma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, -+ (frame->data[1] - gm->arm) + rpi_sliced_frame_off_c(frame, x >> 1, y0 >> 1), uv_size); -+ } ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); ++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); ++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C ++ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); ++ ++ if (do_chroma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); ++ b->block_size = uv_size; ++ b->inter_block_stride = stride1 * stride2; ++ } ++ if (do_luma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); ++ b->block_size = y_size; ++ b->inter_block_stride = stride1 * stride2; + } + } +} @@ -15788,13 +23249,17 @@ index 0000000..36c8ab6 + vqj->mask |= VPU_QPU_MASK_VPU; + + j->command = EXECUTE_VPU; -+ j->u.v.q[0] = vpu_code; ++ // The bottom two bits of the execute address contain no-flush flags ++ // b0 will flush the VPU I-cache if unset so we nearly always want that set ++ // as we never reload code ++ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed; + j->u.v.q[1] = r0; + j->u.v.q[2] = r1; + j->u.v.q[3] = r2; + j->u.v.q[4] = r3; + j->u.v.q[5] = r4; + j->u.v.q[6] = r5; ++ gpu->vpu_i_cache_flushed = 1; + } +} + @@ -15921,13 +23386,50 @@ index 0000000..36c8ab6 + return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code); +} + ++ ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth) ++{ ++ // Dummy values we can catch with emulation ++ qf->y_pxx = ~1U; ++ qf->y_bxx = ~2U; ++ qf->y_p00 = ~3U; ++ qf->y_b00 = ~4U; ++ qf->c_pxx = ~5U; ++ qf->c_bxx = ~6U; ++ ++ switch (bit_depth) { ++ case 8: ++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); ++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); ++ qf->y_bxx = qpu_fn(mc_filter_y_bxx); ++ qf->y_p00 = qpu_fn(mc_filter_y_p00); ++ qf->y_b00 = qpu_fn(mc_filter_y_b00); ++ qf->c_pxx = qpu_fn(mc_filter_c_p); ++ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1); ++ qf->c_bxx = qpu_fn(mc_filter_c_b); ++ break; ++ case 10: ++ qf->c_pxx = qpu_fn(mc_filter_c10_p); ++ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1); ++ qf->c_bxx = qpu_fn(mc_filter_c10_b); ++ qf->y_pxx = qpu_fn(mc_filter_y10_pxx); ++ qf->y_bxx = qpu_fn(mc_filter_y10_bxx); ++ qf->y_p00 = qpu_fn(mc_filter_y10_p00); ++ qf->y_b00 = qpu_fn(mc_filter_y10_b00); ++ break; ++ default: ++ return -1; ++ } ++ return 0; ++} ++ +#endif // RPI diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h new file mode 100644 -index 0000000..636e420 +index 0000000000..485a08f8ba --- /dev/null +++ b/libavcodec/rpi_qpu.h -@@ -0,0 +1,201 @@ +@@ -0,0 +1,206 @@ +#ifndef RPI_QPU_H +#define RPI_QPU_H + @@ -16072,6 +23574,8 @@ index 0000000..636e420 +void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const 
rpi_cache_flush_mode_t mode); +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, + const unsigned int offset, const unsigned int size); ++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode); +void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, + const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, @@ -16082,12 +23586,22 @@ index 0000000..636e420 + + +// QPU specific functions ++ ++typedef struct HEVCRpiQpu { ++ uint32_t c_pxx; ++ uint32_t c_pxx_l1; ++ uint32_t c_bxx; ++ uint32_t y_pxx; ++ uint32_t y_bxx; ++ uint32_t y_p00; ++ uint32_t y_b00; ++} HEVCRpiQpu; ++ ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); ++ +uint32_t qpu_fn(const int * const mc_fn); + -+#define QPU_N_GRP_UV 4 -+#define QPU_N_UV 12 -+#define QPU_N_GRP_Y 4 // 4 QPUs per TMU -+#define QPU_N_Y 12 ++#define QPU_N_GRP 4 +#define QPU_N_MAX 12 + +#define QPU_MAIL_EL_VALS 2 @@ -16109,8 +23623,7 @@ index 0000000..636e420 +int vpu_qpu_job_start(const vpu_qpu_job_h vqj); +int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); + -+ -+extern unsigned int vpu_get_fn(void); ++extern unsigned int vpu_get_fn(const unsigned int bit_depth); +extern unsigned int vpu_get_constants(void); + +// Waits for previous post_codee to complete and Will null out *wait_h after use @@ -16118,12 +23631,6 @@ index 0000000..636e420 +int vpu_qpu_init(void); +void vpu_qpu_term(void); + -+// Simple test of shader code -+extern int rpi_test_shader(void); -+ -+extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst); -+extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch); -+ +extern int gpu_get_mailbox(void); +void gpu_ref(void); +void gpu_unref(void); @@ -16131,10 +23638,10 @@ index 0000000..636e420 +#endif diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c new file mode 100644 -index 0000000..f2842b6 +index 0000000000..2c6541a8fb --- /dev/null +++ b/libavcodec/rpi_shader.c -@@ -0,0 +1,734 @@ +@@ -0,0 +1,1570 @@ +#include "rpi_shader.h" + +#ifdef _MSC_VER @@ -16164,706 +23671,1542 @@ index 0000000..f2842b6 +// ::mc_setup_c_qn +/* [0x00000008] */ 0x00000001, 0xe0020927, // mov tmurs, 1 +/* [0x00000010] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_base, unif -+/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 -+/* [0x00000028] */ 0x0c9e7000, 0x10021667, // add rb_max_x, r0, r0 -+/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 -+/* [0x00000038] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 -+/* [0x00000040] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000048] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000050] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif ; mov ra12, 0 -+/* [0x00000058] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif ; mov ra13, 0 -+/* [0x00000060] */ 0x00000000, 0xe00059ce, // nop ; mov ra14, 0 -+/* 
[0x00000068] */ 0x8c5103f6, 0x1802560f, // add rb_dma1_base, r1, rb_pitch ; mov ra15, ra_k0 -+/* [0x00000070] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num -+/* [0x00000078] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 -+/* [0x00000080] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num -+/* [0x00000088] */ 0x0c027d80, 0x14020827, // add r0, ra0.16b, ra0.16b -+/* [0x00000090] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x -+/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a -+/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000000a8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x000000b0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x000000b8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x000000c0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x000000c8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000000d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x000000d8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 -+/* [0x000000e0] */ 0x0c809f80, 0xd0021367, // add rb_wt_den_p15, 9, unif -+/* [0x000000e8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x000000f0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000100] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000108] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000110] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000118] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000138] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x00000140] */ 0x15827d80, 0x10020667, // mov ra_base2, unif -+/* [0x00000148] */ 0x0c027d80, 0x14020827, // add r0, ra0.16b, ra0.16b -+/* [0x00000150] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a -+/* [0x00000158] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000160] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000168] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000170] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000178] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x00000180] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x00000188] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000190] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 -+/* [0x00000198] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 -+/* [0x000001a0] */ 0x95442ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y -+// :c_preload -+/* [0x000001a8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000001b0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x000001b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000001c0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000001c8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x000001d0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x000001d8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:c_preload -+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000001e8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch 
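
Both the :c_preload loop here and the :1 loop in the replacement dump further down prime the texture units the same way before the filter loop starts: PREREAD rows are requested per TMU, with the row index clamped to the picture so off-screen fetches replicate the edge rows. A rough scalar model (sketch only; tmu_fetch is a stand-in for writing the t0s/t1s address registers, and preread corresponds to the assembler constant PREREAD):

    /* Scalar model of the TMU priming loop (:c_preload / :1). */
    #include "libavutil/common.h"               /* av_clip() */

    void tmu_fetch(int unit, unsigned int addr); /* hypothetical stand-in */

    static void preload_sketch(unsigned int base, unsigned int base2,
                               int y, int y2, int pitch, int max_y,
                               int preread)
    {
        for (int i = 0; i < preread; i++) {
            tmu_fetch(0, base  + av_clip(y++,  0, max_y) * pitch); /* t0s */
            tmu_fetch(1, base2 + av_clip(y2++, 0, max_y) * pitch); /* t1s */
        }
    }
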
-+/* [0x000001f0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 -+/* [0x000001f8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000200] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x00000208] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000210] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x00000218] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x00000220] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 -+// ::mc_filter_uv -+/* [0x00000228] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000230] */ 0x14981dc0, 0xd00229e7, // and.setf -, elem_num, 1 -+/* [0x00000238] */ 0xec0a7d89, 0x14024821, // add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 -+/* [0x00000240] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x -+/* [0x00000248] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x00000250] */ 0x935401f6, 0xd4024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next -+/* [0x00000258] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif -+/* [0x00000260] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00000268] */ 0x9481c1f6, 0xd0025800, // and r0, r0, -4 ; mov ra0, unif -+/* [0x00000270] */ 0x800a7036, 0x122059d3, // nop ; mov ra_y_next, ra2.16a -+/* [0x00000278] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 -+/* [0x00000280] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000288] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a -+/* [0x00000290] */ 0x0c9e7600, 0x100206a7, // add ra_base_next, r3, r0 -+/* [0x00000298] */ 0x119c73c0, 0xd0020827, // shl r0, r1, 7 -+/* [0x000002a0] */ 0x8d818eb6, 0x10025743, // sub rb_dma1, rb_dma1_base, r2 ; mov ra3, unif -+/* [0x000002a8] */ 0x8c8013f6, 0xd0025456, // add rb_i_tmu, r1, 3 - PREREAD ; mov ra_wt_off_mul_l0, unif -+/* [0x000002b0] */ 0x8c8033f6, 0xd002d496, // add rb_lcount, r1, 3 ; mov.ifnz ra_wt_off_mul_l0, unif -+/* [0x000002b8] */ 0x8c0e70b6, 0x18024808, // add r0, r0, r2 ; mov rb8, ra3.8a -+/* [0x000002c0] */ 0x910d01f6, 0xda024809, // shl r0, r0, i_shift16 ; mov rb9, ra3.8b -+/* [0x000002c8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x000002d0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y -+/* [0x000002d8] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb_wt_den_p15 ; mov rb10, ra3.8c -+/* [0x000002e0] */ 0x950c0ff6, 0xde02494b, // mov r5quad, 0 ; mov rb11, ra3.8d -+/* [0x000002e8] */ 0x8f8013f6, 0xd002531e, // asr rb_wt_off, r1, 1 ; mov ra_link, unif -+/* [0x000002f0] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 -+/* [0x000002f8] */ 0x0000ff00, 0xe20210e7, // mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -+// :uvloop -+/* [0x00000300] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 -+/* [0x00000308] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+/* [0x00000310] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, 8 ; mov.ifnz r3, ra_y -+/* [0x00000318] */ 0x8c6817f6, 0xd0029818, // add r0, r3, 1 ; mov.ifz ra_base, ra_base_next -+/* [0x00000320] */ 0x94981f80, 0xd02279d1, // and.setf -, 1, elem_num ; mov ra_y, r0 -+/* [0x00000328] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 -+/* [0x00000330] */ 0x9227f792, 0xd00288e1, // min r3, r3, ra9 ; mov.ifz r1, r2 << 1 -+/* [0x00000338] */ 0x559d049f, 0x10044822, // mov.ifz r0, r2 ; mul24 r2, 
r3, rb_pitch -+/* [0x00000340] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000348] */ 0x95143ff6, 0x100279c4, // mov.setf -, rb3 ; mov ra4, ra5 -+/* [0x00000350] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+/* [0x00000358] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000360] */ 0x40034031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000368] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000370] */ 0x40032031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000378] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d , r1 -+/* [0x00000380] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x00000388] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 -+/* [0x00000390] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 -+/* [0x00000398] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 -+/* [0x000003a0] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 -+/* [0x000003a8] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x000003b0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x000003b8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000003c8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 -+/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb_wt_off -+/* [0x000003e0] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x000003e8] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb_wt_den_p15 -+/* [0x000003f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000003f8] */ 0x15067d80, 0x18020c27, // mov vpm, ra1.8a -+/* [0x00000400] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000408] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb_dma0 -+/* [0x00000410] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb_dma1 -+/* [0x00000418] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest -+// ::mc_filter_uv_b0 -+/* [0x00000420] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000428] */ 0x14981dc0, 0xd00229e7, // and.setf -, elem_num, 1 -+/* [0x00000430] */ 0xec0a7d89, 0x14024821, // add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 -+/* [0x00000438] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+/* [0x00000440] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x00000448] */ 0x935401f6, 0xd4125815, // max r0, r0, 0 ; mov ra_xshift, ra_xshift_next -+/* [0x00000450] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif -+/* [0x00000458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00000460] */ 0x9481c1f6, 0xd0025800, // and r0, r0, -4 ; mov ra0, unif -+/* [0x00000468] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 -+/* [0x00000470] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000478] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a -+/* [0x00000480] */ 0x0c9e7600, 0x100206a7, // add ra_base_next, r3, r0 -+/* [0x00000488] */ 0x918073f6, 0xd0025802, // shl r0, r1, 7 ; mov ra2, unif -+/* [0x00000490] */ 0x0d9d8e80, 0x10021767, // sub rb_dma1, rb_dma1_base, r2 -+/* [0x00000498] 
*/ 0x0c9c13c0, 0xd0021467, // add rb_i_tmu, r1, 3 - PREREAD -+/* [0x000004a0] */ 0x0c9c33c0, 0xd00214a7, // add rb_lcount, r1, 3 -+/* [0x000004a8] */ 0x8c8270b6, 0x10125816, // add r0, r0, r2 ; mov ra_wt_mul_l0, unif -+/* [0x000004b0] */ 0x915201bf, 0x1c12d816, // shl r0, r0, ra_k16 ; mov.ifnz ra_wt_mul_l0, unif -+/* [0x000004b8] */ 0x8c81b1f6, 0x10025683, // add rb_dma0, r0, rb_dma0_base ; mov ra3, unif -+/* [0x000004c0] */ 0x159defc0, 0x10020267, // mov ra9, rb_max_y -+/* [0x000004c8] */ 0xec0e7d89, 0x14024821, // add r0, ra3.16b, ra3.16b ; v8subs r1, r1, r1 -+/* [0x000004d0] */ 0x8c0c21f6, 0x12125813, // add r0, r0, rb_elem_x ; mov ra_y2_next, ra3.16a -+/* [0x000004d8] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x000004e0] */ 0x935011bf, 0x18024800, // max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next -+/* [0x000004e8] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif -+/* [0x000004f0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x000004f8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000500] */ 0x94827076, 0x10025843, // and r1, r0, r1 ; mov ra3, unif -+/* [0x00000508] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000510] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1 ; mov rb8, ra3.8a -+/* [0x00000518] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 -+/* [0x00000520] */ 0x950e0ff6, 0x1a024489, // mov ra_wt_off_mul_l1, unif ; mov rb9, ra3.8b -+/* [0x00000528] */ 0x950e0ff6, 0x1c06448a, // mov.ifnz ra_wt_off_mul_l1, unif ; mov rb10, ra3.8c -+/* [0x00000530] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif -+/* [0x00000538] */ 0x950c0ff6, 0xde02494b, // mov r5quad,0 ; mov rb11, ra3.8d -+/* [0x00000540] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 -+/* [0x00000548] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif -+/* [0x00000550] */ 0x0000ff00, 0xe20210e7, // mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -+// :uvloop_b -+/* [0x00000558] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 -+/* [0x00000560] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next -+/* [0x00000568] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, 8 ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00000570] */ 0x95685ff6, 0x10029118, // mov rb4, rb5 ; mov.ifz ra_base, ra_base_next -+/* [0x00000578] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y -+/* [0x00000580] */ 0x14981f80, 0xd00229e7, // and.setf -, 1, elem_num -+/* [0x00000588] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 -+/* [0x00000590] */ 0x9227f792, 0xd00288e1, // min r3, r3, ra9 ; mov.ifz r1, r2 << 1 -+/* [0x00000598] */ 0x559d049f, 0x10044823, // mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x000005a0] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_k255 -+/* [0x000005a8] */ 0x95143ff6, 0x100279c4, // mov.setf -, rb3 ; mov ra4, ra5 -+/* [0x000005b0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+/* [0x000005b8] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000005c0] */ 0x40034031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x000005c8] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000005d0] */ 0x40032031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x000005d8] */ 0x4c0274f1, 
0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1
-+/* [0x000005e0] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
-+/* [0x000005e8] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6
-+/* [0x000005f0] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, 8 ; mov r3, ra_y2
-+/* [0x000005f8] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7
-+/* [0x00000600] */ 0x14981f80, 0xd00229e7, // and.setf -, 1, elem_num
-+/* [0x00000608] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
-+/* [0x00000610] */ 0x9227f792, 0xd00288e1, // min r3, r3, ra9 ; mov.ifz r1, r2 << 1
-+/* [0x00000618] */ 0x559d049f, 0x10044823, // mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch
-+/* [0x00000620] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_k255
-+/* [0x00000628] */ 0x950c3ff6, 0x100269c7, // mov.setf -, rb3 ; mov rb7, ra3
-+/* [0x00000630] */ 0x540563f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra1.8a, r0
-+/* [0x00000638] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00000640] */ 0x40074031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000648] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00000650] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00000658] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
-+/* [0x00000660] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000668] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
-+/* [0x00000670] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10
-+/* [0x00000678] */ 0x4d08443e, 0x180241e0, // sub ra7, r2, r0 ; mul24 r0, rb4, ra2.8a
-+/* [0x00000680] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+/* [0x00000688] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
-+/* [0x00000690] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8
-+/* [0x00000698] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9
-+/* [0x000006a0] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
-+/* [0x000006a8] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256
-+/* [0x000006b0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256
-+/* [0x000006b8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
-+/* [0x000006c0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1
-+/* [0x000006c8] */ 0x0c9e7280, 0x10020867, // add r1, r1, r2
-+/* [0x000006d0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
-+/* [0x000006d8] */ 0xfffffe60, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000006e0] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15
-+/* [0x000006e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000006f0] */ 0x150e7d80, 0x18020c27, // mov vpm, ra3.8a
-+/* [0x000006f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000700] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb_dma0
-+/* [0x00000708] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb_dma1
-+/* [0x00000710] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest
++/* [0x00000018] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00000020] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
++/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_base, unif
++/* [0x00000030] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
++/* [0x00000038] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift
++/* [0x00000040] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
++/* [0x00000048] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
++/* [0x00000050] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask
++/* [0x00000058] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00000060] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
++/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000078] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch
++/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
++/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num
++/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
++/* [0x000000b0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000000e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
++/* [0x000000f0] */ 0x0c80ff80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif
++/* [0x000000f8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000100] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000110] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000118] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00000120] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000128] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000130] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000138] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000140] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000148] */ 0x15827d80, 0x10020027, // mov ra0, unif
++/* [0x00000150] */ 0x15827d80, 0x10020667, // mov ra_base2, unif
++/* [0x00000158] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00000160] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
++/* [0x00000168] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000170] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000178] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000180] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000188] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x00000190] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x00000198] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000001a0] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
++/* [0x000001a8] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0
++/* [0x000001b0] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y
++// :1
++/* [0x000001b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000001c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000001d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000001d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x000001e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x000001e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000001f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00000200] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
++/* [0x00000208] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000210] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
++/* [0x00000218] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000220] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
++/* [0x00000228] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
++/* [0x00000230] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
++// ::mc_filter_c_p
++/* [0x00000238] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000240] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000248] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00000250] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00000258] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00000260] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00000268] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00000270] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3
++/* [0x00000278] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000280] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000288] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000290] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00000298] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x000002a0] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x000002a8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x000002b0] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x000002b8] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
++/* [0x000002c0] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
++/* [0x000002c8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x000002d0] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000002d8] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
++/* [0x000002e0] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
++/* [0x000002e8] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
++/* [0x000002f0] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
++// :1
++/* [0x000002f8] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
++/* [0x00000300] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++/* [0x00000308] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x00000310] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++/* [0x00000318] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00000320] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00000328] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
++/* [0x00000330] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
++/* [0x00000338] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x00000340] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x00000348] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000350] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000358] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000360] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000368] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
++/* [0x00000370] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000378] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
++/* [0x00000380] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
++/* [0x00000388] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8
++/* [0x00000390] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
++/* [0x00000398] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x000003a0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x000003a8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x000003b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x000003b8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
++/* [0x000003c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x000003c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x000003d0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000003d8] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
++/* [0x000003e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000003e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000003f0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x000003f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00000400] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00000408] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00000410] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00000418] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000420] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b
++/* [0x00000428] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00000430] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00000438] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++// ::mc_filter_c_p_l1
++/* [0x00000440] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000448] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000450] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00000458] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00000460] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00000468] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00000470] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00000478] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3
++/* [0x00000480] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000488] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000490] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000498] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x000004a0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x000004a8] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x000004b0] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x000004b8] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x000004c0] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
++/* [0x000004c8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
++/* [0x000004d0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x000004d8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000004e0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
++/* [0x000004e8] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
++/* [0x000004f0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
++/* [0x000004f8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
++// :1
++/* [0x00000500] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++/* [0x00000508] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++/* [0x00000510] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x00000518] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++/* [0x00000520] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00000528] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00000530] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
++/* [0x00000538] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
++/* [0x00000540] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x00000548] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x00000550] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000558] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000560] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000568] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000570] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
++/* [0x00000578] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000580] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
++/* [0x00000588] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
++/* [0x00000590] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8
++/* [0x00000598] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
++/* [0x000005a0] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x000005a8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x000005b0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x000005b8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x000005c0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
++/* [0x000005c8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x000005d0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x000005d8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000005e0] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
++/* [0x000005e8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000005f0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000005f8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x00000600] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00000608] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00000610] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00000618] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00000620] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000628] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b
++/* [0x00000630] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00000638] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00000640] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++// ::mc_filter_c_b
++/* [0x00000648] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000650] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000658] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
++/* [0x00000660] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++/* [0x00000668] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
++/* [0x00000670] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00000678] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
++/* [0x00000680] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000688] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
++/* [0x00000690] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000698] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000006a0] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
++/* [0x000006a8] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
++/* [0x000006b0] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif
++/* [0x000006b8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x000006c0] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif
++/* [0x000006c8] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
++/* [0x000006d0] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
++/* [0x000006d8] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
++/* [0x000006e0] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif
++/* [0x000006e8] */ 0x110c1dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift
++/* [0x000006f0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
++/* [0x000006f8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
++/* [0x00000700] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a
++/* [0x00000708] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b
++/* [0x00000710] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000718] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
++/* [0x00000720] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c
++/* [0x00000728] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000730] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif
++/* [0x00000738] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0
++/* [0x00000740] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d
++/* [0x00000748] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15
++/* [0x00000750] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif
++// :1
++/* [0x00000758] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
++/* [0x00000760] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
++/* [0x00000768] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00000770] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
++/* [0x00000778] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
++/* [0x00000780] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
++/* [0x00000788] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++/* [0x00000790] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00000798] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x000007a0] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5
++/* [0x000007a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x000007b0] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x000007b8] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x000007c0] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x000007c8] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000007d0] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1
++/* [0x000007d8] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
++/* [0x000007e0] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6
++/* [0x000007e8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
++/* [0x000007f0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7
++/* [0x000007f8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
++/* [0x00000800] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++/* [0x00000808] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00000810] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask
++/* [0x00000818] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
++/* [0x00000820] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000828] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000830] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000838] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000840] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
++/* [0x00000848] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x00000850] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000858] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10
++/* [0x00000860] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
++/* [0x00000868] */ 0x8f0c05f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
++/* [0x00000870] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00000878] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00000880] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8
++/* [0x00000888] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9
++/* [0x00000890] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00000898] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256
++/* [0x000008a0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256
++/* [0x000008a8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
++/* [0x000008b0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1
++/* [0x000008b8] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height
++/* [0x000008c0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x000008c8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000008d0] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
++/* [0x000008d8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000008e0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000008e8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x000008f0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000008f8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00000900] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00000908] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00000910] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000918] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
++/* [0x00000920] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00000928] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00000930] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
 +// ::mc_sync_q0
-+/* [0x00000718] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000720] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000728] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000730] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000738] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000740] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000748] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000750] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000758] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000938] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000948] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000950] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000958] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000960] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000968] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000970] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000978] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q1
-+/* [0x00000760] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000770] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000778] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000780] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000788] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000980] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000990] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000998] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x000009a0] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000009a8] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q2
-+/* [0x00000790] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000798] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000007a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000007a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000007b0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000007b8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
++/* [0x000009b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000009b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000009c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000009c8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x000009d0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000009d8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q3
-+/* [0x000007c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000007c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000007d0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000007d8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000007e0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000007e8] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000009e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000009f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000009f8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a00] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a08] */ 0x009e7000, 0x100009e7, // nop
 +// ::mc_sync_q4
-+/* [0x000007f0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000007f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000800] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000808] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000810] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000818] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000820] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000828] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000830] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a10] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a18] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a20] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a28] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a30] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a38] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a40] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a48] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a50] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q5
-+/* [0x00000838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000850] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000858] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000860] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a58] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a68] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a70] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a78] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a80] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q6
-+/* [0x00000868] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000878] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000880] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000888] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000890] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a88] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a90] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000aa0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000aa8] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ab0] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q7
-+/* [0x00000898] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000008a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000008a8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000008b0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000008b8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000008c0] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00000ab8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000ac8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000ad0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000ad8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop
 +// ::mc_sync_q8
-+/* [0x000008c8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000008d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000008d8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000008e0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000008e8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000008f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000008f8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000900] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000908] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000ae8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000af0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000af8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b00] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b08] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b18] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b20] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b28] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q9
-+/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000920] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000928] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000930] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000938] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b48] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b50] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b58] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q10
-+/* [0x00000940] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000948] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000950] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000958] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000960] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000968] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b78] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b80] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b88] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q11
-+/* [0x00000970] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000980] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000988] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000990] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000998] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit
-+// ::mc_exit_c
-+/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x000009a8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
-+/* [0x000009b0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x000009b8] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1
-+/* [0x000009c0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x000009c8] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x000009d0] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_interrupt_exit12
-+// ::mc_interrupt_exit12c
-+/* [0x000009d8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x000009e0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
-+/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x000009f0] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1
-+/* [0x000009f8] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a00] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x00000a08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
-+/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00000b90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000ba0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000ba8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000bb0] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000bb8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c_qn
++// ::mc_exit_y_qn
++/* [0x00000bc0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00000bc8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000bd0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00000bd8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00000be0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000be8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000bf0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x00000bf8] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00000c00] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c_q0
++// ::mc_exit_y_q0
++/* [0x00000c08] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00000c10] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00000c20] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00000c28] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000c30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000c38] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000c40] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x00000c48] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
++/* [0x00000c50] */ 0x009e7000, 0x100009e7, // nop
 +// ::mc_setup_y_q0
-+/* [0x00000a18] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000c58] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_setup_y_qn
-+/* [0x00000a20] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
-+/* [0x00000a28] */ 0x15827d80, 0x10020267, // mov ra9, unif
-+/* [0x00000a30] */ 0x15827d80, 0x10020067, // mov ra1, unif
-+/* [0x00000a38] */ 0x15827d80, 0x100202e7, // mov ra11, unif
-+/* [0x00000a40] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
-+/* [0x00000a48] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
-+/* [0x00000a50] */ 0x15827d80, 0x100200e7, // mov ra3, unif
-+/* [0x00000a58] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
-+/* [0x00000a60] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1
-+/* [0x00000a68] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
-+/* [0x00000a70] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
-+/* [0x00000a78] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000a80] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch
-+/* [0x00000a88] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
-+/* [0x00000a90] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
-+/* [0x00000a98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000aa0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000aa8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00000ab0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
-+/* [0x00000ab8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
-+/* [0x00000ac0] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00000ac8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000ad0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000ad8] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
-+/* [0x00000ae0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x00000ae8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000af0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000af8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00000b00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000b08] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00000b10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000b18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000b20] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
-+/* [0x00000b28] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
-+/* [0x00000b30] */ 0x95042ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
-+// :y_preload
-+/* [0x00000b38] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x00000b40] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
-+/* [0x00000b48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00000b50] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x00000b58] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
-+/* [0x00000b60] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
-+/* [0x00000b68] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:y_preload
-+/* [0x00000b70] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00000b78] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x00000b80] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
-+/* [0x00000b88] */ 0x0c809dc0, 0xd0021367, // add rb_wt_den_p15, unif, 9
-+/* [0x00000b90] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000b98] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
-+/* [0x00000ba0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000ba8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
-+/* [0x00000bb0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
-+/* [0x00000bb8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000bc0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
-+/* [0x00000bc8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000bd0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000bd8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
-+/* [0x00000be0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000be8] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
-+/* [0x00000bf0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000bf8] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
-+/* [0x00000c00] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
-+/* [0x00000c08] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
-+// :per_block_setup
-+/* [0x00000c10] */ 0x935401f6, 0xd4125815, // max r0, r0, 0 ; mov ra_xshift, ra_xshift_next
-+/* [0x00000c18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000c20] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00000c28] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
-+/* [0x00000c30] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif
-+/* [0x00000c38] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
-+/* [0x00000c40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000c48] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
-+/* [0x00000c50] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
-+/* [0x00000c58] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x00000c60] */ 0x930401f6, 0xd2125813, // max r0, r0, 0 ; mov ra_y2_next, ra1.16a
-+/* [0x00000c68] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
-+/* [0x00000c70] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00000c78] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
-+/* [0x00000c80] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00000c88] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000c90] */ 0x8c9dc07f, 0x10024831, // add r0, r0, r1 ; mov vw_setup, rb_vpm_init
-+/* [0x00000c98] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
-+/* [0x00000ca0] */ 0x0d418f80, 0x14021767, // sub rb_dma1, rb_dma1_base, ra_width
-+/* [0x00000ca8] */ 0x8c405df6, 0xd2025460, // add rb_i_tmu, ra_height, 7 - PREREAD ; mov r0, ra_height
-+/* [0x00000cb0] */ 0x12527180, 0x1c020827, // min r0, r0, ra_k16
-+/* [0x00000cb8] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7
-+/* [0x00000cc0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000cc8] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width
-+/* [0x00000cd0] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
-+/* [0x00000cd8] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif
-+/* [0x00000ce0] */ 0x918101f6, 0xd0045816, // shl.ifz r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00000ce8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3
-+/* [0x00000cf0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
-+/* [0x00000cf8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
-+/* [0x00000d00] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
-+/* [0x00000d08] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
-+/* [0x00000d10] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
-+/* [0x00000d18] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
-+/* [0x00000d20] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
-+/* [0x00000d28] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
-+/* [0x00000d30] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
-+/* [0x00000d38] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
-+/* [0x00000d40] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
-+/* [0x00000d48] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
-+/* [0x00000d50] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
-+/* [0x00000d58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
-+/* [0x00000d60] */ 0x90216387, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, rb_k255
-+/* [0x00000d68] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
-+/* [0x00000d70] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
-+/* [0x00000d78] */ 0x90216387, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, rb_k255
-+/* [0x00000d80] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
-+/* [0x00000d88] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
-+/* [0x00000d90] */ 0x90216387, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, rb_k255
-+/* [0x00000d98] */ 0x954a0dbf, 0x10064597, // mov.ifnz ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif
-+/* [0x00000da0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
-+/* [0x00000da8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
-+/* [0x00000db0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000db8] */ 0x90216387, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, rb_k255
-+/* [0x00000dc0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3
-+/* [0x00000dc8] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif
-+// ::mc_filter
-+/* [0x00000dd0] */ 0xfffffe20, 0xf0f807a7, // brr ra_link, r:per_block_setup
-+/* [0x00000dd8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x00000de0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000de8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00000df0] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1
-+// :yloop
-+/* [0x00000df8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
-+/* [0x00000e00] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+/* [0x00000e08] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x00000e10] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x00000e18] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x00000e20] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00000e28] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
-+/* [0x00000e30] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00000e38] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
-+/* [0x00000e40] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+/* [0x00000e48] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255
-+/* [0x00000e50] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000e58] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0
-+/* [0x00000e60] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+/* [0x00000e68] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+/* [0x00000e70] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+/* [0x00000e78] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00000e80] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x00000e88] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+/* [0x00000e90] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x00000e98] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00000ea0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000ea8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+/* [0x00000eb0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x00000eb8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+/* [0x00000ec0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00000ec8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+/* [0x00000ed0] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x00000ed8] */ 0x8d208bf6, 0xd00269e1, // sub.setf -, r5, 8 ; mov r1, ra8
-+/* [0x00000ee0] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
-+/* [0x00000ee8] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00000ef0] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
-+/* [0x00000ef8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
-+/* [0x00000f00] */ 0x8d9e74c9, 0x100242cb, // sub ra11, r2, r3 ; mov rb11, r1
-+/* [0x00000f08] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
-+/* [0x00000f10] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
-+/* [0x00000f18] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
-+/* [0x00000f20] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
-+/* [0x00000f28] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
-+/* [0x00000f30] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
-+/* [0x00000f38] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
-+/* [0x00000f40] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
-+/* [0x00000f48] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
-+/* [0x00000f50] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
-+/* [0x00000f58] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000f60] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
-+/* [0x00000f68] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb_wt_off
-+/* [0x00000f70] */ 0x914083f6, 0xd2024860, // shl r1, r1, 8 ; mov r0, ra_height
-+/* [0x00000f78] */ 0xfffffe60, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00000f80] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15
-+/* [0x00000f88] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait
-+/* [0x00000f90] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a
-+/* [0x00000f98] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0
-+/* [0x00000fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00000fa8] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0
-+/* [0x00000fb0] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1
-+/* [0x00000fb8] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest
-+/* [0x00000fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00000fc8] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23
-+/* [0x00000fd0] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0
-+/* [0x00000fd8] */ 0xfffffe00, 0xf0f809e7, // brr -, r:yloop
-+/* [0x00000fe0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
-+/* [0x00000fe8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
-+/* [0x00000ff0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
-+// ::mc_filter_b
-+/* [0x00000ff8] */ 0xfffffbf8, 0xf0f807a7, // brr ra_link, r:per_block_setup
-+/* [0x00001000] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x00001008] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00001010] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+// :yloopb
-+/* [0x00001018] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
-+/* [0x00001020] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+/* [0x00001028] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x00001030] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x00001038] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x00001040] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00001048] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
-+/* [0x00001050] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00001058] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
-+/* [0x00001060] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+/* [0x00001068] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255
-+/* [0x00001070] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00001078] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0
-+/* [0x00001080] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+/* [0x00001088] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+/* [0x00001090] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+/* [0x00001098] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+/* [0x000010a0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x000010a8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+/* [0x000010b0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x000010b8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+/* [0x000010c0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x000010c8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+/* [0x000010d0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x000010d8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+/* [0x000010e0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x000010e8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+/* [0x000010f0] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x000010f8] */ 0x8d208bf6, 0xd00269e1, // sub.setf -, r5, 8 ; mov r1, ra8
-+/* [0x00001100] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
-+/* [0x00001108] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001110] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
-+/* [0x00001118] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
-+/* [0x00001120] */ 0x8d9e74c9, 0x100242cb, // sub ra11, r2, r3 ; mov rb11, r1
-+/* [0x00001128] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
-+/* [0x00001130] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
-+/* [0x00001138] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
-+/* [0x00001140] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
-+/* [0x00001148] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
-+/* [0x00001150] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
-+/* [0x00001158] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
-+/* [0x00001160] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
-+/* [0x00001168] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
-+/* [0x00001170] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
-+/* [0x00001178] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00001180] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001188] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
-+/* [0x00001190] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+/* [0x00001198] */ 0x914083f6, 0xd2024860, // shl r1, r1, 8 ; mov r0, ra_height
-+/* [0x000011a0] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x000011a8] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15
-+/* [0x000011b0] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait
-+/* [0x000011b8] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a
-+/* [0x000011c0] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0
-+/* [0x000011c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000011d0] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0
-+/* [0x000011d8] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1
-+/* [0x000011e0] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest
-+/* [0x000011e8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x000011f0] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23
-+/* [0x000011f8] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0
-+/* [0x00001200] */ 0xfffffdf8, 0xf0f809e7, // brr -, r:yloopb
-+/* [0x00001208] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
-+/* [0x00001210] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
-+/* [0x00001218] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++/* [0x00000c60] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00000c68] */ 0x15827d80, 0x10020267, // mov ra9, unif
++/* [0x00000c70] */ 0x15827d80, 0x10020067, // mov ra1, unif
++/* [0x00000c78] */ 0x15827d80, 0x100202e7, // mov ra11, unif
++/* [0x00000c80] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00000c88] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
++/* [0x00000c90] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
++/* [0x00000c98] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask
++/* [0x00000ca0] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00000ca8] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x00000cb0] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1
++/* [0x00000cc0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
++/* [0x00000cc8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00000cd0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000cd8] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch
++/* [0x00000ce0] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
++/* [0x00000ce8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
++/* [0x00000cf0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000cf8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000d00] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000d08] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
++/* [0x00000d10] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
++/* [0x00000d18] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000d20] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000d28] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000d30] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
++/* [0x00000d38] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00000d40] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000d48] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000d50] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000d58] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000d60] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000d68] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000d70] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000d78] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
++/* [0x00000d80] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
++/* [0x00000d88] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
++// :1
++/* [0x00000d90] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000d98] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x00000da0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000da8] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00000db0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x00000db8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x00000dc0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000dc8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000dd0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00000dd8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
++/* [0x00000de0] */ 0x0c80fdc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth
++/* [0x00000de8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000df0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000df8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000e00] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000e08] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00000e10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000e18] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000e20] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000e28] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000e30] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000e38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000e40] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
++/* [0x00000e48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000e50] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
++/* [0x00000e58] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
++/* [0x00000e60] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
++// :per_block_setup_8
++/* [0x00000e68] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00000e70] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000e78] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000e80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000e88] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
++/* [0x00000e90] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
++/* [0x00000e98] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000ea0] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
++/* [0x00000ea8] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
++/* [0x00000eb0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00000eb8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
++/* [0x00000ec0] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
++/* [0x00000ec8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000ed0] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
++/* [0x00000ed8] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
++/* [0x00000ee0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000ee8] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
++/* [0x00000ef0] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
++/* [0x00000ef8] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00000f00] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00000f08] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7
++/* [0x00000f10] */ 0x119c71c0, 0xd0020827, // shl r0, r0, v_dma_h_shift
++/* [0x00000f18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000f20] */ 0x119d01c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift
++/* [0x00000f28] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif
++/* [0x00000f30] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000f38] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255
++/* [0x00000f40] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
++/* [0x00000f48] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
++/* [0x00000f50] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
++/* [0x00000f58] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
++/* [0x00000f60] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
++/* [0x00000f68] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
++/* [0x00000f70] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
++/* [0x00000f78] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
++/* [0x00000f80] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
++/* [0x00000f88] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
++/* [0x00000f90] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
++/* [0x00000f98] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
++/* [0x00000fa0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
++/* [0x00000fa8] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
++/* [0x00000fb0] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3
++/* [0x00000fb8] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
++/* [0x00000fc0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
++/* [0x00000fc8] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3
++/* [0x00000fd0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
++/* [0x00000fd8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
++/* [0x00000fe0] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3
++/* [0x00000fe8] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif
++/* [0x00000ff0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
++/* [0x00000ff8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
++/* [0x00001000] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001008] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3
++/* [0x00001010] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15
++/* [0x00001018] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif
++// ::mc_filter_y_pxx
++/* [0x00001020] */ 0xfffffe28, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x00001028] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00001030] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
++/* [0x00001038] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00001040] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1
++// :1
++/* [0x00001048] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++/* [0x00001050] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++/* [0x00001058] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x00001060] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x00001068] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x00001070] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00001078] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++/* [0x00001080] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00001088] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
++/* [0x00001090] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++/* [0x00001098] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
++/* [0x000010a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++/* [0x000010a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x000010b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x000010b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x000010c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x000010c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x000010d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x000010d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x000010e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000010e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x000010f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x000010f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x00001100] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x00001108] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x00001110] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001118] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x00001120] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00001128] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00001130] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00001138] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001140] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00001148] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00001150] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00001158] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001160] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001168] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001170] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001178] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001180] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001188] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001190] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001198] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000011a0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x000011a8] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height ++/* [0x000011b0] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x000011b8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000011c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x000011c8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000011d0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000011d8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000011e0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000011e8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000011f0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x000011f8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001200] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001208] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x00001210] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001218] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001220] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_bxx ++/* [0x00001228] */ 0xfffffc20, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00001230] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001238] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00001240] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++// :1 ++/* [0x00001248] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00001250] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001258] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001260] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001268] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* 
++/* [0x00001270] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00001278] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++/* [0x00001280] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00001288] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
++/* [0x00001290] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++/* [0x00001298] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
++/* [0x000012a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++/* [0x000012a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x000012b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x000012b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x000012c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x000012c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x000012d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x000012d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x000012e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000012e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x000012f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x000012f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x00001300] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x00001308] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x00001310] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001318] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x00001320] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x00001328] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10
++/* [0x00001330] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
++/* [0x00001338] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001340] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
++/* [0x00001348] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
++/* [0x00001350] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
++/* [0x00001358] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
++/* [0x00001360] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
++/* [0x00001368] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
++/* [0x00001370] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
++/* [0x00001378] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
++/* [0x00001380] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
++/* [0x00001388] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
++/* [0x00001390] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x00001398] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x000013a0] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x000013a8] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
++/* [0x000013b0] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
++/* [0x000013b8] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3
++/* [0x000013c0] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000013c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
++/* [0x000013d0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000013d8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000013e0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x000013e8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000013f0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000013f8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00001400] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001408] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001410] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b
++/* [0x00001418] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00001420] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00001428] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y_p00
-+/* [0x00001220] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x00001228] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next
-+/* [0x00001230] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
-+/* [0x00001238] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00001240] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00001248] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00001250] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
-+/* [0x00001258] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif
-+/* [0x00001260] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
-+/* [0x00001268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00001270] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif
-+/* [0x00001278] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
-+/* [0x00001280] */ 0x0d418f80, 0x14021767, // sub rb_dma1, rb_dma1_base, ra_width
-+/* [0x00001288] */ 0x8d402df6, 0xd2025460, // sub rb_i_tmu, ra_height, PREREAD ; mov r0, ra_height
-+/* [0x00001290] */ 0x12527180, 0x1c020827, // min r0, r0, ra_k16
-+/* [0x00001298] */ 0x8c8001f6, 0xd0025496, // add rb_lcount, r0, 0 ; mov ra_wt_off_mul_l0, unif
-+/* [0x000012a0] */ 0x918071f6, 0xd0024817, // shl r0, r0, 7 ; mov rb_dest, unif
-+/* [0x000012a8] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width
-+/* [0x000012b0] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
-+/* [0x000012b8] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base
-+/* [0x000012c0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3
-+/* [0x000012c8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif
-+// :yloop_p00
-+/* [0x000012d0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
-+/* [0x000012d8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
-+/* [0x000012e0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x000012e8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x000012f0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x000012f8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00001300] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255
-+/* [0x00001308] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
-+/* [0x00001310] */ 0x9140f3f6, 0xd2024860, // shl r1, r1, 15 ; mov r0, ra_height
-+/* [0x00001318] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb_wt_off
-+/* [0x00001320] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:yloop_p00
-+/* [0x00001328] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15
-+/* [0x00001330] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait
-+/* [0x00001338] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a
-+/* [0x00001340] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0
-+/* [0x00001348] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001350] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0
-+/* [0x00001358] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1
-+/* [0x00001360] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest
-+/* [0x00001368] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001370] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23
-+/* [0x00001378] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0
-+/* [0x00001380] */ 0xffffff30, 0xf0f809e7, // brr -, r:yloop_p00
-+/* [0x00001388] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
-+/* [0x00001390] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
-+/* [0x00001398] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++/* [0x00001430] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00001438] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next
++/* [0x00001440] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
++/* [0x00001448] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00001450] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00001458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00001460] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
++/* [0x00001468] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif
++/* [0x00001470] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
++/* [0x00001478] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001480] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif
++/* [0x00001488] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
++/* [0x00001490] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
++/* [0x00001498] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x000014a0] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x000014a8] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
++/* [0x000014b0] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif
++/* [0x000014b8] */ 0x918101f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif
++/* [0x000014c0] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base
++/* [0x000014c8] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3
++/* [0x000014d0] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif
++// :1
++/* [0x000014d8] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
++/* [0x000014e0] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
++/* [0x000014e8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x000014f0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x000014f8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x00001500] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00001508] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x00001510] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x00001518] */ 0x915cf3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height
++/* [0x00001520] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00001528] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001530] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
++/* [0x00001538] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001540] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001548] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x00001550] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001558] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001560] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00001568] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001570] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001578] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b
++/* [0x00001580] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00001588] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00001590] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y_b00
-+/* [0x000013a0] */ 0xfffff850, 0xf0f807a7, // brr ra_link, r:per_block_setup
-+/* [0x000013a8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x000013b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000013b8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x000013c0] */ 0x00000007, 0xe0020827, // mov r0, 7
-+/* [0x000013c8] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0
-+/* [0x000013d0] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0
-+/* [0x000013d8] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0
-+/* [0x000013e0] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0
-+/* [0x000013e8] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
-+// :yloop_b00
-+/* [0x000013f0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
-+/* [0x000013f8] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+/* [0x00001400] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x00001408] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x00001410] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x00001418] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00001420] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
-+/* [0x00001428] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00001430] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
-+/* [0x00001438] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+/* [0x00001440] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255
-+/* [0x00001448] */ 0x545963c6, 0x12024860, // and r1, r1, rb_k255 ; mul24 r0, r0, ra_wt_mul_l0
-+/* [0x00001450] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
-+/* [0x00001458] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1
-+/* [0x00001460] */ 0x119ce3c0, 0xd0020867, // shl r1, r1, 14
-+/* [0x00001468] */ 0x8c40c3f6, 0x12024860, // add r1, r1, rb_wt_off ; mov r0, ra_height
-+/* [0x00001470] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:yloop_b00
-+/* [0x00001478] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15
-+/* [0x00001480] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait
-+/* [0x00001488] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a
-+/* [0x00001490] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0
-+/* [0x00001498] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000014a0] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0
-+/* [0x000014a8] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1
-+/* [0x000014b0] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest
-+/* [0x000014b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x000014c0] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23
-+/* [0x000014c8] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0
-+/* [0x000014d0] */ 0xffffff00, 0xf0f809e7, // brr -, r:yloop_b00
-+/* [0x000014d8] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
-+/* [0x000014e0] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
-+/* [0x000014e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++/* [0x00001598] */ 0xfffff8b0, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x000015a0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x000015a8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
++/* [0x000015b0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x000015b8] */ 0x00000007, 0xe0020827, // mov r0, 7
++/* [0x000015c0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0
++/* [0x000015c8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0
++/* [0x000015d0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0
++/* [0x000015d8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0
++/* [0x000015e0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++// :1
++/* [0x000015e8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++/* [0x000015f0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++/* [0x000015f8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x00001600] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x00001608] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x00001610] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00001618] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++/* [0x00001620] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00001628] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x00001630] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++/* [0x00001638] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
++/* [0x00001640] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
++/* [0x00001648] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
++/* [0x00001650] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1
++/* [0x00001658] */ 0x915ce3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height
++/* [0x00001660] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00001668] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001670] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
++/* [0x00001678] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001680] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001688] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x00001690] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001698] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000016a0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x000016a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000016b0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000016b8] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b
++/* [0x000016c0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x000016c8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x000016d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++// ::mc_setup_c10_q0
++/* [0x000016d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_c10_qn
++/* [0x000016e0] */ 0x00000001, 0xe0020927, // mov tmurs, 1
++/* [0x000016e8] */ 0x15827d80, 0x10020027, // mov ra0, unif
++/* [0x000016f0] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x000016f8] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
++/* [0x00001700] */ 0x15827d80, 0x10020627, // mov ra_base, unif
++/* [0x00001708] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
++/* [0x00001710] */ 0x119c21c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift
++/* [0x00001718] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
++/* [0x00001720] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
++/* [0x00001728] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask
++/* [0x00001730] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00001738] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
++/* [0x00001740] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00001748] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00001750] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch
++/* [0x00001758] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
++/* [0x00001760] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
++/* [0x00001768] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
++/* [0x00001770] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
++/* [0x00001778] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00001780] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
++/* [0x00001788] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
++/* [0x00001790] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00001798] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
++/* [0x000017a0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x000017a8] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x000017b0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000017b8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x000017c0] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
++/* [0x000017c8] */ 0x0c80df80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif
++/* [0x000017d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x000017d8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
++/* [0x000017e0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
++/* [0x000017e8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
++/* [0x000017f0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x000017f8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
++/* [0x00001800] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00001808] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
++/* [0x00001810] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
++/* [0x00001818] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00001820] */ 0x15827d80, 0x10020027, // mov ra0, unif
++/* [0x00001828] */ 0x15827d80, 0x10020667, // mov ra_base2, unif
++/* [0x00001830] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00001838] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
++/* [0x00001840] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00001848] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00001850] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x00001858] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x00001860] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001868] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
++/* [0x00001870] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0
++/* [0x00001878] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y
++// :1
++/* [0x00001880] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00001888] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x00001890] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00001898] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000018a0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x000018a8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x000018b0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000018b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000018c0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000018c8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
++/* [0x000018d0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000018d8] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
++/* [0x000018e0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000018e8] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
++/* [0x000018f0] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
++/* [0x000018f8] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
++// ::mc_filter_c10_p
++/* [0x00001900] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001908] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001910] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00001918] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00001920] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00001928] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00001930] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00001938] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00001940] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001948] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00001950] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x00001958] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00001960] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00001968] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00001970] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
++/* [0x00001978] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
++/* [0x00001980] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x00001988] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x00001990] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
++/* [0x00001998] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
++/* [0x000019a0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
++/* [0x000019a8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
++// :1
++/* [0x000019b0] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
++/* [0x000019b8] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++/* [0x000019c0] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x000019c8] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++/* [0x000019d0] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x000019d8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x000019e0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
++/* [0x000019e8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
++/* [0x000019f0] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x000019f8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x00001a00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001a08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001a10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001a18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001a20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
++/* [0x00001a28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
++/* [0x00001a30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001a38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
++/* [0x00001a40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8
++/* [0x00001a48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8
++/* [0x00001a50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
++/* [0x00001a58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00001a60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00001a68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x00001a70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x00001a78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
++/* [0x00001a80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x00001a88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00001a90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001a98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
++/* [0x00001aa0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001aa8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001ab0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x00001ab8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001ac0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001ac8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00001ad0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001ad8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001ae0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b
++/* [0x00001ae8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00001af0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00001af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++// ::mc_filter_c10_p_l1
++/* [0x00001b00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001b08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001b10] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00001b18] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00001b20] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00001b28] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00001b30] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00001b38] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00001b40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001b48] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00001b50] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x00001b58] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00001b60] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00001b68] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00001b70] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
++/* [0x00001b78] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
++/* [0x00001b80] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x00001b88] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x00001b90] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
++/* [0x00001b98] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
++/* [0x00001ba0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
++/* [0x00001ba8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
++// :1
++/* [0x00001bb0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++/* [0x00001bb8] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++/* [0x00001bc0] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x00001bc8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++/* [0x00001bd0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00001bd8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00001be0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
++/* [0x00001be8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
++/* [0x00001bf0] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x00001bf8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x00001c00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001c08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001c10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001c18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001c20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
++/* [0x00001c28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
++/* [0x00001c30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001c38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
++/* [0x00001c40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8
++/* [0x00001c48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8
++/* [0x00001c50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
++/* [0x00001c58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00001c60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00001c68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x00001c70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x00001c78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
++/* [0x00001c80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x00001c88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00001c90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001c98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
++/* [0x00001ca0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001ca8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001cb0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x00001cb8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001cc0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001cc8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00001cd0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001cd8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001ce0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b
++/* [0x00001ce8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00001cf0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00001cf8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++// ::mc_filter_c10_b
++/* [0x00001d00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001d08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001d10] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
++/* [0x00001d18] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++/* [0x00001d20] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
++/* [0x00001d28] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00001d30] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
++/* [0x00001d38] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
++/* [0x00001d40] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00001d48] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001d50] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
++/* [0x00001d58] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00001d60] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif
++/* [0x00001d68] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00001d70] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif
++/* [0x00001d78] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
++/* [0x00001d80] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
++/* [0x00001d88] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
++/* [0x00001d90] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif
++/* [0x00001d98] */ 0x110c2dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift
++/* [0x00001da0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
++/* [0x00001da8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
++/* [0x00001db0] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a
++/* [0x00001db8] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b
++/* [0x00001dc0] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
++/* [0x00001dc8] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c
++/* [0x00001dd0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001dd8] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif
++/* [0x00001de0] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0
++/* [0x00001de8] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d
++/* [0x00001df0] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15
++/* [0x00001df8] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif
++// :1
++/* [0x00001e00] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
++/* [0x00001e08] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
++/* [0x00001e10] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00001e18] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
++/* [0x00001e20] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
++/* [0x00001e28] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
++/* [0x00001e30] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++/* [0x00001e38] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00001e40] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x00001e48] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5
++/* [0x00001e50] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x00001e58] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001e60] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001e68] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001e70] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001e78] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1
++/* [0x00001e80] */ 0x8d9c64ff, 0xb0024885, // sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
++/* [0x00001e88] */ 0x0f9c25c0, 0xd00200e7, // asr ra3, r2, (v_bit_depth - 8)
++/* [0x00001e90] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6
++/* [0x00001e98] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
++/* [0x00001ea0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7
++/* [0x00001ea8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
++/* [0x00001eb0] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++/* [0x00001eb8] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00001ec0] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask
++/* [0x00001ec8] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
++/* [0x00001ed0] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001ed8] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001ee0] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001ee8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001ef0] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
++/* [0x00001ef8] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x00001f00] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001f08] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10
++/* [0x00001f10] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
++/* [0x00001f18] */ 0x8f0c25f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
++/* [0x00001f20] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00001f28] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00001f30] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8
++/* [0x00001f38] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9
++/* [0x00001f40] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00001f48] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256
++/* [0x00001f50] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256
++/* [0x00001f58] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
++/* [0x00001f60] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1
++/* [0x00001f68] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height
++/* [0x00001f70] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x00001f78] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001f80] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
++/* [0x00001f88] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001f90] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001f98] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x00001fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001fa8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001fb0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00001fb8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001fc8] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b
++/* [0x00001fd0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00001fd8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00001fe0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++// ::mc_sync10_q0
++/* [0x00001fe8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001ff0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001ff8] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002000] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002008] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002010] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002018] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002020] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002028] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q1
++/* [0x00002030] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002038] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002040] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002048] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002050] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002058] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q2
++/* [0x00002060] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002068] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002070] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002078] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002080] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002088] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q3
++/* [0x00002090] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002098] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000020a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000020a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x000020b0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020b8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync10_q4
++/* [0x000020c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000020c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000020d0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020d8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020e0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020e8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000020f0] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020f8] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002100] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q5
++/* [0x00002108] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002110] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002118] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002120] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002128] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002130] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q6
++/* [0x00002138] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002140] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002148] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002150] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002158] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002160] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q7
++/* [0x00002168] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002170] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002178] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002180] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002188] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002190] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync10_q8
++/* [0x00002198] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000021a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000021a8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000021b0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000021b8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000021c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000021c8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000021d0] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
++/* [0x000021d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q9
++/* [0x000021e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000021e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000021f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000021f8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002200] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002208] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q10
++/* [0x00002210] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002218] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002220] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002228] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002230] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002238] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q11
++/* [0x00002240] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002248] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002250] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002258] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002260] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002268] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c10_q0
++// ::mc_exit_y10_q0
++/* [0x00002270] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00002278] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00002280] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00002288] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00002290] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00002298] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000022a0] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000022a8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x000022b0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
++/* [0x000022b8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c10_qn
++// ::mc_exit_y10_qn
++/* [0x000022c0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x000022c8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000022d0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x000022d8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x000022e0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000022e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000022f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x000022f8] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00002300] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_setup_y10_q0
++/* [0x00002308] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_y10_qn
++/* [0x00002310] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00002318] */ 0x15827d80, 0x10020267, // mov ra9, unif
++/* [0x00002320] */ 0x15827d80, 0x10020067, // mov ra1, unif
++/* [0x00002328] */ 0x15827d80, 0x100202e7, // mov ra11, unif
++/* [0x00002330] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00002338] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
++/* [0x00002340] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
++/* [0x00002348] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask
++/* [0x00002350] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00002358] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x00002360] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
++/* [0x00002368] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1
++/* [0x00002370] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift
++/* [0x00002378] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
++/* [0x00002380] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00002388] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00002390] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch
++/* [0x00002398] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
++/* [0x000023a0] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
++/* [0x000023a8] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x000023b0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x000023b8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000023c0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x000023c8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
++/* [0x000023d0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
++/* [0x000023d8] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x000023e0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000023e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x000023f0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
++/* [0x000023f8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00002400] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002408] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00002410] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00002418] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00002420] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00002428] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00002430] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00002438] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00002440] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
++/* [0x00002448] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
++/* [0x00002450] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
++// :1
++/* [0x00002458] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00002460] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x00002468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00002470] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00002478] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x00002480] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x00002488] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00002490] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00002498] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000024a0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
++/* [0x000024a8] */ 0x0c80ddc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth
++/* [0x000024b0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x000024b8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
++/* [0x000024c0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
++/* [0x000024c8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
++/* [0x000024d0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x000024d8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
++/* [0x000024e0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x000024e8] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
++/* [0x000024f0] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
++/* [0x000024f8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00002500] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002508] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
++/* [0x00002510] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002518] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
++/* [0x00002520] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
++/* [0x00002528] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
++// :per_block_setup_10
++/* [0x00002530] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002538] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00002540] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00002548] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00002550] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00002558] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
++/* [0x00002560] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
++/* [0x00002568] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00002570] */
0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x00002578] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x00002580] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00002588] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002590] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x00002598] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x000025a0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x000025a8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x000025b0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x000025b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000025c0] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x000025c8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x000025d0] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x000025d8] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x000025e0] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 ++/* [0x000025e8] */ 0x119c81c0, 0xd0020827, // shl r0, r0, v_dma_h_shift ++/* [0x000025f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000025f8] */ 0x119cf1c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift ++/* [0x00002600] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00002608] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002610] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255 ++/* [0x00002618] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00002620] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d ++/* [0x00002628] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c ++/* [0x00002630] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00002638] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00002640] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00002648] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 ++/* [0x00002650] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d ++/* [0x00002658] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c ++/* [0x00002660] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 ++/* [0x00002668] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d ++/* [0x00002670] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c ++/* [0x00002678] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00002680] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x00002688] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 ++/* [0x00002690] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 ++/* [0x00002698] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026a0] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 ++/* [0x000026a8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x000026b0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026b8] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 ++/* [0x000026c0] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif ++/* [0x000026c8] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 ++/* [0x000026d0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026d8] */ 
0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000026e0] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 ++/* [0x000026e8] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ++/* [0x000026f0] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif ++// ::mc_filter_y10_pxx ++/* [0x000026f8] */ 0xfffffe18, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002700] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002708] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002710] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002718] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++// :1 ++/* [0x00002720] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002728] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002730] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002738] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002740] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002748] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002750] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002758] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002760] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00002768] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002770] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002778] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x00002780] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00002788] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002790] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002798] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000027a0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000027a8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000027b0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000027b8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000027c0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000027c8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000027d0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000027d8] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000027e0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000027e8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000027f0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x000027f8] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, 
r1 << 15 @ "mul_used", 0 ++/* [0x00002800] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00002808] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00002810] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002818] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00002820] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00002828] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00002830] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00002838] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00002840] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00002848] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00002850] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00002858] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00002860] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00002868] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00002870] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00002878] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x00002880] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height ++/* [0x00002888] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x00002890] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002898] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x000028a0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000028a8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000028b0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000028b8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000028c0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000028c8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x000028d0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000028d8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000028e0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x000028e8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x000028f0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x000028f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_p00 ++/* [0x00002900] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002908] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next ++/* [0x00002910] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00002918] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002920] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002928] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002930] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002938] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00002940] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif ++/* [0x00002948] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00002950] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002958] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov 
ra_width_height, unif ++/* [0x00002960] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00002968] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00002970] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00002978] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00002980] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x00002988] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002990] */ 0x9180f1f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif ++/* [0x00002998] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base ++/* [0x000029a0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 ++/* [0x000029a8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif ++// :1 ++/* [0x000029b0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000029b8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x000029c0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000029c8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000029d0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000029d8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000029e0] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000029e8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x000029f0] */ 0x915cd3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x000029f8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002a00] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002a08] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002a10] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002a18] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002a20] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002a28] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002a30] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002a38] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002a40] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002a48] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002a50] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b ++/* [0x00002a58] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002a60] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002a68] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_bxx ++/* [0x00002a70] */ 0xfffffaa0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002a78] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002a80] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002a88] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++// :1 ++/* [0x00002a90] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002a98] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz 
ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002aa0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002aa8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002ab0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002ab8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002ac0] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002ac8] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002ad0] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00002ad8] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002ae0] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002ae8] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x00002af0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00002af8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002b00] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002b08] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00002b10] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00002b18] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00002b20] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00002b28] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00002b30] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00002b38] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00002b40] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00002b48] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00002b50] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00002b58] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00002b60] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00002b68] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002b70] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00002b78] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00002b80] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002b88] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00002b90] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00002b98] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00002ba0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00002ba8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00002bb0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00002bb8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00002bc0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00002bc8] */ 
0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00002bd0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00002bd8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00002be0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00002be8] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00002bf0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 ++/* [0x00002bf8] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x00002c00] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x00002c08] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002c10] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002c18] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002c20] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002c28] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002c30] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002c38] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002c40] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002c48] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002c50] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002c58] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b ++/* [0x00002c60] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002c68] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002c70] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_b00 ++/* [0x00002c78] */ 0xfffff898, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002c80] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002c88] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002c90] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002c98] */ 0x00000007, 0xe0020827, // mov r0, 7 ++/* [0x00002ca0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 ++/* [0x00002ca8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 ++/* [0x00002cb0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++/* [0x00002cb8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 ++/* [0x00002cc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :1 ++/* [0x00002cc8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002cd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002cd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002ce0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002ce8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002cf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002cf8] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002d00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002d08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00002d10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002d18] */ 0x8c656c87, 0x10024f20, // add 
t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002d20] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x00002d28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00002d30] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 ++/* [0x00002d38] */ 0x915cc3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x00002d40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002d48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002d50] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002d58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002d60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002d68] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002d70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002d78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002d80] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002d88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002d90] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002d98] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b ++/* [0x00002da0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002da8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002db0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +// ::mc_end +}; +#ifdef __HIGHC__ @@ -16871,10 +25214,10 @@ index 0000000..f2842b6 +#endif diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h new file mode 100644 -index 0000000..a44bce9 +index 0000000000..82bf380eb4 --- /dev/null +++ b/libavcodec/rpi_shader.h -@@ -0,0 +1,35 @@ +@@ -0,0 +1,63 @@ +#ifndef rpi_shader_H +#define rpi_shader_H + @@ -16883,39 +25226,67 @@ index 0000000..a44bce9 +#define mc_setup_c_q0 (rpi_shader + 0) +#define mc_start (rpi_shader + 0) +#define mc_setup_c_qn (rpi_shader + 2) -+#define mc_filter_uv (rpi_shader + 138) -+#define mc_filter_uv_b0 (rpi_shader + 264) -+#define mc_sync_q0 (rpi_shader + 454) -+#define mc_sync_q1 (rpi_shader + 472) -+#define mc_sync_q2 (rpi_shader + 484) -+#define mc_sync_q3 (rpi_shader + 496) -+#define mc_sync_q4 (rpi_shader + 508) -+#define mc_sync_q5 (rpi_shader + 526) -+#define mc_sync_q6 (rpi_shader + 538) -+#define mc_sync_q7 (rpi_shader + 550) -+#define mc_sync_q8 (rpi_shader + 562) -+#define mc_sync_q9 (rpi_shader + 580) -+#define mc_sync_q10 (rpi_shader + 592) -+#define mc_sync_q11 (rpi_shader + 604) -+#define mc_exit (rpi_shader + 616) -+#define mc_exit_c (rpi_shader + 616) -+#define mc_interrupt_exit12 (rpi_shader + 630) -+#define mc_interrupt_exit12c (rpi_shader + 630) -+#define mc_setup_y_q0 (rpi_shader + 646) -+#define mc_setup_y_qn (rpi_shader + 648) -+#define mc_filter (rpi_shader + 884) -+#define mc_filter_b (rpi_shader + 1022) -+#define mc_filter_y_p00 (rpi_shader + 1160) -+#define mc_filter_y_b00 (rpi_shader + 1256) -+#define mc_end (rpi_shader + 1340) ++#define mc_filter_c_p (rpi_shader + 142) ++#define mc_filter_c_p_l1 (rpi_shader + 272) ++#define mc_filter_c_b (rpi_shader + 402) ++#define mc_sync_q0 (rpi_shader + 590) ++#define mc_sync_q1 (rpi_shader + 608) ++#define mc_sync_q2 (rpi_shader + 620) ++#define mc_sync_q3 (rpi_shader + 632) ++#define mc_sync_q4 (rpi_shader + 644) ++#define mc_sync_q5 (rpi_shader + 662) ++#define mc_sync_q6 (rpi_shader + 
674) ++#define mc_sync_q7 (rpi_shader + 686) ++#define mc_sync_q8 (rpi_shader + 698) ++#define mc_sync_q9 (rpi_shader + 716) ++#define mc_sync_q10 (rpi_shader + 728) ++#define mc_sync_q11 (rpi_shader + 740) ++#define mc_exit_c_qn (rpi_shader + 752) ++#define mc_exit_y_qn (rpi_shader + 752) ++#define mc_exit_c_q0 (rpi_shader + 770) ++#define mc_exit_y_q0 (rpi_shader + 770) ++#define mc_setup_y_q0 (rpi_shader + 790) ++#define mc_setup_y_qn (rpi_shader + 792) ++#define mc_filter_y_pxx (rpi_shader + 1032) ++#define mc_filter_y_bxx (rpi_shader + 1162) ++#define mc_filter_y_p00 (rpi_shader + 1292) ++#define mc_filter_y_b00 (rpi_shader + 1382) ++#define mc_setup_c10_q0 (rpi_shader + 1462) ++#define mc_setup_c10_qn (rpi_shader + 1464) ++#define mc_filter_c10_p (rpi_shader + 1600) ++#define mc_filter_c10_p_l1 (rpi_shader + 1728) ++#define mc_filter_c10_b (rpi_shader + 1856) ++#define mc_sync10_q0 (rpi_shader + 2042) ++#define mc_sync10_q1 (rpi_shader + 2060) ++#define mc_sync10_q2 (rpi_shader + 2072) ++#define mc_sync10_q3 (rpi_shader + 2084) ++#define mc_sync10_q4 (rpi_shader + 2096) ++#define mc_sync10_q5 (rpi_shader + 2114) ++#define mc_sync10_q6 (rpi_shader + 2126) ++#define mc_sync10_q7 (rpi_shader + 2138) ++#define mc_sync10_q8 (rpi_shader + 2150) ++#define mc_sync10_q9 (rpi_shader + 2168) ++#define mc_sync10_q10 (rpi_shader + 2180) ++#define mc_sync10_q11 (rpi_shader + 2192) ++#define mc_exit_c10_q0 (rpi_shader + 2204) ++#define mc_exit_y10_q0 (rpi_shader + 2204) ++#define mc_exit_c10_qn (rpi_shader + 2224) ++#define mc_exit_y10_qn (rpi_shader + 2224) ++#define mc_setup_y10_q0 (rpi_shader + 2242) ++#define mc_setup_y10_qn (rpi_shader + 2244) ++#define mc_filter_y10_pxx (rpi_shader + 2494) ++#define mc_filter_y10_p00 (rpi_shader + 2624) ++#define mc_filter_y10_bxx (rpi_shader + 2716) ++#define mc_filter_y10_b00 (rpi_shader + 2846) ++#define mc_end (rpi_shader + 2926) + +#endif diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm new file mode 100644 -index 0000000..58fd911 +index 0000000000..ba6cc13a95 --- /dev/null +++ b/libavcodec/rpi_shader.qasm -@@ -0,0 +1,1349 @@ +@@ -0,0 +1,1741 @@ + +# The @ "mul_used", 0 annotations that occur by various mul blocks suppress +# the warning that we are using rotation & ra/rb registers. r0..3 can be @@ -16935,8 +25306,22 @@ index 0000000..58fd911 +# However in the current world there seems to be no benefit (and a small +# overhead) in setting this bigger than 2. + -+.set PREREAD, 2 ++.set PREREAD, 4 + ++# Block heights - 8 & 16 are the only numbers we currently support ++ ++.set C_BLK_HEIGHT_8, 16 ++.set C_BLK_HEIGHT_16, 8 ++.set Y_BLK_HEIGHT_8, 16 ++.set Y_BLK_HEIGHT_16, 8 ++ ++# QPU counts - depend on block size ++# If we have a 2-byte format & block_size > 8 then can only afford ++# 8 QPUs ++# These numbers must match the numbers in rpi_shader_cmd.h ++ ++.set N_QPU_8, 12 ++.set N_QPU_16, 12 + +# register allocation +# @@ -16995,7 +25380,13 @@ index 0000000..58fd911 +.set ra_wt_mul_l0, ra22.16a +.set ra_wt_off_l0, ra22.16b + -+# -- free -- ra23 ++# Max pel value (for 8 bit we can get away with sat ops but not 9+) ++# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the ++# 2nd byte but as the source should never be > 3 there 0x3ff should do ++.set ra_blk_height_pmax, ra23 ++.set ra_pmax, ra23.16a ++.set ra_blk_height, ra23.8c ++# -- free -- ra23.8d + +# Loop: src frame base (L0) +.set ra_base, ra24 @@ -17021,9 +25412,9 @@ index 0000000..58fd911 +# C: (elem & 1) == 0 ? 
elem * 2 : (elem + 4) * 2 +.set rb_elem_x, rb2 + -+# rb3 -+# C: Temp (U/V flag) -+# Y: free ++# El Flags ++# After adding to self we have el even/odd on nc/c and lo/hi on nn/n ++.set rb_ef, rb3 + +# rb4-7 +# C-B: L1 H filter out FIFO +# Y:   vertical filter coefficient registers + -- free -- rb21 + -+# Setup: 255 -+.set rb_k255, rb22 ++# Setup: 0xff (8-bit) / 0xffff (9+ bit) ++.set rb_pmask, rb22 + +# Loop: destination address +.set rb_dest, rb23 + +# Setup: vdw_setup_1(dst_pitch) +.set rb_dma1_base, rb24 + +# Setup: pic width - 1 -+# In the case of chroma it is in bytes so 2 * (pic_width_c - 1) ++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width - 1)*4 etc. +.set rb_max_x, rb25 + +# Loop: height<<23 + width<<16 + vdw_setup_0 +# Macros that express this - obviously these can't be overlapped +# so are probably unsuitable for loop code + +.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma + mov r2, qpu_num +.if v_bit_depth <= 8 + # 8 bit version + asr r1, r2, 2 + shl r1, r1, 6 + and r0, r2, 3 + or  r0, r0, r1 + + mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit + add r_vpm, r0, r1  # VPM 8bit storage + + mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later + shl r0, r0, 5 + +.else + # 16 bit version + # Limited to 8 QPUs if blk height > 8 + asr r1, r2, 1 +.if v_blk_height <= 8 + shl r1, r1, 4 +.else + shl r1, r1, 5 +.endif + and r0, r2, 1 + or r0, r0, r1 + + mov r1, vpm_setup(0, 2, h16p(0, 0))   # 2 is stride - stride acts on ADDR + add r_vpm, r0, r1 + + # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into + # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) + mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later + shl r0, r0, 6 +.endif + add r_dma, r0, r1  # DMA out +.endm + + +.macro m_setup_q0 + srel -, 12 +.endm + +# Code start label +::mc_start + +################################################################################ +# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) + +.macro m_setup_c, v_bit_depth + +# Cannot use mul24 on x as x might be -ve, so must use shift +.if v_bit_depth <= 8 +.set v_x_shift, 1 +.set v_pmask, 0xff +.set v_blk_height, C_BLK_HEIGHT_8 +.else +.set v_x_shift, 2 +.set v_pmask, 0xffff +.set v_blk_height, C_BLK_HEIGHT_16 +.endif + + mov tmurs, 1 # No swap TMUs + +# Load first request location + mov ra0, unif # next_x_y + + mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] + shl rb_ef, r0, i_shift30 + + mov ra_base, unif # Store frame c base + +# Read image dimensions + sub r0, unif, 1 # pic c width + shl rb_max_x, r0, v_x_shift # rb_max_x in bytes + sub rb_max_y, unif, 1 # pic c height + +# load constants + mov ra_kff100100, 0xff100100 + mov rb_pmask, v_pmask + mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) + +# get source pitch + mov rb_xpitch, 
unif # stride2 ++ mov rb_pitch, unif # stride1 ++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly ++ add rb_dma1_base, r1, rb_pitch # vdw_setup_1 + + and r0, 1, elem_num + nop ; mul24 r0, r0, 5 ++.if v_bit_depth <= 8 + add rb_elem_x, r0, elem_num ++.else ++ add r0, r0, elem_num ++ add rb_elem_x, r0, r0 ++.endif + +# Compute base address for first and second access +# ra_base ends up with t0s base +# ra_base2 ends up with t1s base + -+ add r0, ra0.16b, ra0.16b # [rb_elem_x delay] ++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] + add r0, r0, rb_elem_x # Add elem no to x to get X for this slice + max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y + min r0, r0, rb_max_x + +# Get shift ++# Shift will always calculate as 0 for 9+ bit ++# Ideally we can optimize the shift out of the code in these cases but for now ++# it is tidier to leave it in ++.if v_bit_depth <= 8 + shl ra_xshift_next, r0, 3 ++.else ++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++.endif + -+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to + ++.if v_bit_depth <= 8 + and r0, r0, -4 ++.endif + sub r1, ra_k0, rb_pitch + and r1, r0, r1 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 + add ra_base, ra_base, r0 + -+ add rb_wt_den_p15, 9, unif # denominator ++ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator + +# Compute part of VPM to use for DMA output -+ m_calc_dma_regs rb_vpm_init, rb_dma0_base ++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? ++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + +# And again for L1, but only worrying about frame2 stuff + @@ -17201,17 +25640,21 @@ index 0000000..58fd911 +# ra_base ends up with t0s base +# ra_base2 ends up with t1s base + -+ add r0, ra0.16b, ra0.16b # Load x ++ shl r0, ra0.16b, v_x_shift + add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset + max r0, r0, 0 + min r0, r0, rb_max_x + -+# Get shift ++# Get shift (already zero if 9+ bit so ignore) ++.if v_bit_depth <= 8 + shl rb_xshift2_next, r0, 3 ++.endif + +# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs + ++.if v_bit_depth <= 8 + and r0, r0, -4 ++.endif + sub r1, ra_k0, rb_pitch + and r1, r0, r1 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch @@ -17222,7 +25665,7 @@ index 0000000..58fd911 +# r0 = ra_y, r2 = ra_y2 + mov r3, PREREAD ; mov r0, ra_y + -+:c_preload ++:1 + sub.setf r3, r3, 1 + max r1, r0, 0 + min r1, r1, rb_max_y @@ -17230,11 +25673,11 @@ index 0000000..58fd911 + add t0s, ra_base, r1 ; mov ra_y, r0 + + max r1, r2, 0 -+ brr.anynz -, r:c_preload ++ brr.anynz -, r:1b + min r1, r1, rb_max_y + add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch + add t1s, ra_base2, r1 ; mov ra_y2, r2 -+# >>> .anynz c_preload ++# >>> .anynz 1b + + mov ra_link, unif # link +# touch registers to keep simulator happy @@ -17245,6 +25688,12 @@ index 0000000..58fd911 + mov ra6, 0 ; mov rb6, 0 + mov ra7, 0 ; mov rb7, 0 +# >>> ra_link ++.endm ++ ++::mc_setup_c_q0 ++ m_setup_q0 ++::mc_setup_c_qn ++ m_setup_c 8 + +################################################################################ + @@ -17252,85 +25701,116 @@ index 0000000..58fd911 + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block -+::mc_filter_uv -+# per-channel shifts were calculated on the *previous* invocation + ++.macro m_filter_c_p, v_tmu, v_bit_depth ++ 
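++# A worked example of the packed constant loaded into ra_blk_height_pmax by
++# m_setup_c above, assuming the .set values in this file: for 8-bit chroma
++# v_blk_height is C_BLK_HEIGHT_8 = 16, so ((1 << 8) - 1) | (16 << 16) =
++# 0x001000ff, and the 10-bit luma setup gets ((1 << 10) - 1) | (8 << 16) =
++# 0x000803ff - the immediate visible in the mc_setup_y10 dump. ra_pmax is
++# then the low 16 bits (the clamp limit) and ra_blk_height is byte 2 (the
++# block height).
++#
++# m_filter_c_p below is expanded twice for 8 bit (m_filter_c_p 0, 8 as
++# ::mc_filter_c_p and m_filter_c_p 1, 8 as ::mc_filter_c_p_l1): v_tmu selects
++# the TMU and the register-file side the kernel runs on, so the L0 and L1
++# single-prediction chroma kernels can share one body, while v_bit_depth
++# selects the 8-bit or 10-bit flavour of the shift and address arithmetic.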
++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_x_mul, 4 ++.set v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_tmu == 0 ++.set vrx_xshift, rb_xshift2 # b side more convenient ++.set vrx_xshift_next, ra_xshift_next ++.set vra_y_next, ra_y_next ++.set vrx_base_next, ra_base_next ++.set vra_y, ra_y ++.set vra_base, ra_base ++.set vr_txs, t0s ++.else ++.set vrx_xshift, ra_xshift # a side more convenient ++.set vrx_xshift_next, rb_xshift2_next ++.set vra_y_next, ra_y2_next ++.set vrx_base_next, rb_base2_next ++.set vra_y, ra_y2 ++.set vra_base, ra_base2 ++.set vr_txs, t1s ++.endif ++ ++# per-channel shifts were calculated on the *previous* invocation +# get base addresses and per-channel shifts for *next* invocation + mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base + ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 ++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs ++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a + ++.if v_bit_depth <= 8 ++ shl vrx_xshift_next, r0, 3 ++ and r0, r0, -4 ++.endif ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced!
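++# The and/xor/mul24 sequence at this point splits the byte x offset into an
++# offset within the current vertical stripe (the -rb_pitch value built in r1
++# acts as the mask, so r0 ^ (r0 & -rb_pitch) is x modulo the stripe width)
++# and the stripe-selecting high bits, which are rescaled by rb_xpitch before
++# being added back onto the frame base. The exact scaling of rb_xpitch
++# (stride2) is taken on trust from the register comments earlier in the file.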
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ add vrx_base_next, r3, r0 ; mov r1, ra_height + +# set up VPM write ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight ++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight + +# ; unpack filter coefficients + ++ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2) ++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register + add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight + ++ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y + ++ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d + ++ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link ++ sub ra3, rb_wt_den_p15, ra_k1 + ++# r5 = 0 (loop counter) ++# ra9 = alias for rb_max_y ++# ra_wt_mul_l0 = weight L0 ++# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19] ++# rb_wt_off = (offset * 2 + 1) << (ra3 - 1) + +# We want (r0r1) +# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ... +# We fetch (after shift) +# C0 : C3 : C1 : C4 : C2 : C5 : ... 
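++# Loop bookkeeping, as far as it can be read from the setup above: a 4-tap
++# vertical filter producing r1 rows needs r1 + 3 source rows, hence
++# rb_lcount = r1 + 3 iterations, and once the counter reaches
++# rb_i_tmu = r1 + 3 - PREREAD the addresses being queued already belong to
++# the next invocation (PREREAD requests are always kept in flight). For the
++# weighting, with v_bit_depth = 8 and a denom uniform of 7, rb_wt_den_p15 =
++# 23 - 8 + 7 = 22 and ra3 = 21, so the weighted sum is shifted down by ra3
++# and clamped to [0, ra_pmax] (255 here, 0x3ff in the 10-bit variants) on
++# its way into the VPM.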
+ -+ mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -+ -+# r5 = 0 (loop counter) -+:uvloop ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment -+ shr r2, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+ shr r1, r2, 8 ; mov.ifnz r3, ra_y -+ add r0, r3, 1 ; mov.ifz ra_base, ra_base_next ++.if v_tmu == 0 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment ++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++.else ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment ++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++.endif + -+ and.setf -, 1, elem_num ; mov ra_y, r0 -+ max r3, r3, ra_k0 ; mov r0, r1 << 15 -+ min r3, r3, ra9 ; mov.ifz r1, r2 << 1 ++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++ min r3, r3, ra9 ; mov.ifnc r0, r2 + -+ mov.ifz r0, r2 ; mul24 r2, r3, rb_pitch -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte -+ -+# ra4 not really needed; this could be a mul24 rather than a mov but current -+# register usage means this wouldn't help -+ mov.setf -, rb3 ; mov ra4, ra5 ++ mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + +# apply horizontal filter +# The filter coeffs for the two halves of this are the same (unlike in the @@ -17338,18 +25818,29 @@ index 0000000..58fd911 +# Also as the two halves are locked together we don't need to separate the 1st +# r0 mul or the last r1 mul as they are vaild for all QPUs + -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 + nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+ sub.setf -, r5, 4 ; mul24 r0, ra0.8d , r1 -+ brr.anyn -, r:uvloop -+ add r2, r2, r3 ; mov ra5, ra6 ++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++ +# V filter =- ra4 * rb8-+ ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift) ++# Have to dup block as we need to move the brr - code is more common than it ++# looks at first glance ++.if v_bit_depth <= 8 ++ brr.anyn -, r:1b ++ add r2, r2, r3 ; mov ra5, ra6 + mov ra6, ra7 ; mul24 r1, ra7, rb10 + sub ra7, r2, r0 ; mul24 r0, ra4, rb8 -+# >>> .anyn uvloop ++.else ++ add r2, r2, r3 ; mov ra5, ra6 ++ brr.anyn -, r:1b ++ mov ra6, ra7 ; mul24 r1, ra7, rb10 ++ sub r2, r2, r0 ; mul24 r0, ra4, rb8 ++ asr ra7, r2, v_bit_depth - 8 ++.endif ++# >>> .anyn 1b + + sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay] + add r1, r1, r0 ; mul24 r0, ra7, rb11 @@ -17357,84 +25848,146 @@ index 0000000..58fd911 + sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 + asr r1, r1, 14 + nop ; mul24 r1, r1, ra_wt_mul_l0 -+ shl r1, r1, 8 ++ shl r1, r1, 8 ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, ra3 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch 
++# >>> .anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDW write pointer ++# >>> 1b ++.endm ++ ++# At 10 bits ++# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 = 0x49ed (15 bits) ++# Worst case -ve after 1st filter = -10 * 0x3ff = -10230 (kept unshifted as a conservative bound) ++# after 2nd (really we can't get this) = (74 * 18925 + 10 * 10230) >> 6 = 23480 = 0x5bb8 (15 bits) ++# (P) ++# * weight (255) = 5987400 = 0x5b5c48 (23 bits) ++# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits) ++# ... should be OK ++# ++# (B) ++# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits) ++# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits) ++# So signed overflow if we sign extend here :-( ++# ++# In practice this doesn't happen (we need a maximal offset and a very unlucky ++# filter). ++# ++# This could be fixed by offsetting the filters such that 
they are unsigned until ++# weight mul and then removing the offset with the weighting offset (I think ++# this should work) or splitting the rounding & offsetting ++ ++::mc_filter_c_p ++ m_filter_c_p 0, 8 ++ ++::mc_filter_c_p_l1 ++ m_filter_c_p 1, 8 + +################################################################################ + -+# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) ++# mc_filter_c_b + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block -+::mc_filter_uv_b0 ++ ++.macro m_filter_c_b, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++.set v_x_mul, (1 << v_x_shift) ++ +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation + mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + -+ and.setf -, elem_num, 1 # Also acts as delay slot for ra2 ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base + -+ add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 # x ; r1=0 ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 + add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base -+ max r0, r0, 0 ; mov ra_xshift, ra_xshift_next -+ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height ++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs + ++.if v_bit_depth <= 8 + shl ra_xshift_next, r0, 3 ++.endif + -+ and r0, r0, -4 ; mov ra0, unif # L0 H filter coeffs -+ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) ++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height -+ add ra_base_next, r3, r0 -+ shl r0, r1, 7 ; mov ra2, unif # ; L0 V filter coeffs ++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B + +# set up VPM write + -+ sub rb_dma1, rb_dma1_base, r2 # Compute vdw_setup1(dst_pitch-width) -+ add rb_i_tmu, r1, 3 - PREREAD -+ add rb_lcount, r1, 3 ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight ++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight + -+ add r0, r0, r2 ; mov ra_wt_mul_l0, unif # ; U weight -+ shl r0, r0, ra_k16 ; mov.ifnz ra_wt_mul_l0, unif # Shift into bits 16 upwards of the vdw_setup0 register ; V weight -+ add rb_dma0, r0, rb_dma0_base ; mov ra3, unif # ; x2_y2 ++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 ++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base ++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, rb_dma0_base ; mov 
ra1, unif # ; H filter coeffs + +# L1 - uniform layout could possibly be optimized + -+ mov ra9, rb_max_y # [ra3 delay] -+ -+ add r0, ra3.16b, ra3.16b ; v8subs r1, r1, r1 # r0=x*2 ; r1=0 -+ add r0, r0, rb_elem_x ; mov ra_y2_next, ra3.16a -+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base -+ max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B -+ min r0, r0, rb_max_x ; mov ra1, unif # H filter coeffs ++ shl r0, ra3.16b, v_x_shift # r0=x*2 ++ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs ++ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight ++ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs ++ min r0, r0, rb_max_x ; mov rb9, ra3.8b + ++.if v_bit_depth <= 8 + shl rb_xshift2_next, r0, 3 ++.endif + -+ and r0, r0, -4 -+ and r1, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight ++ and r1, r0, r1 ; mov rb10, ra3.8c + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov rb8, ra3.8a # Add stripe offsets ; start unpacking filter coeffs ++ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr + add rb_base2_next, r3, r0 + -+ mov ra_wt_off_mul_l1, unif ; mov rb9, ra3.8b # U offset/weight -+ mov.ifnz ra_wt_off_mul_l1, unif ; mov rb10, ra3.8c # V offset/weight -+ -+ mov rb_dest, unif # dst_addr -+ mov r5quad,0 ; mov rb11, ra3.8d ++ mov ra9, rb_max_y ; mov rb11, ra3.8d + shl r1, ra_wt_off_l1, rb_wt_den_p15 + asr rb_wt_off, r1, 9 ; mov ra_link, unif # link + @@ -17448,64 +26001,66 @@ index 0000000..58fd911 +# rb8-rb11 V coeffs L1 +# ra9 rb_max_y alias + -+ mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -+ -+:uvloop_b ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests + sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment + shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next -+ shr r1, r2, 8 ; mov.ifz ra_y_y2, ra_y_y2_next -+ mov rb4, rb5 ; mov.ifz ra_base, ra_base_next ++ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next + add ra_y, 1, ra_y ; mov r3, ra_y + -+ and.setf -, 1, elem_num + max r3, r3, ra_k0 ; mov r0, r1 << 15 -+ min r3, r3, ra9 ; mov.ifz r1, r2 << 1 ++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 + -+ mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch -+ add t0s, ra_base, r3 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + +# L0 H-filter +# H FIFO scrolls are spread all over this loop -+ mov.setf -, rb3 ; mov ra4, ra5 ++ mov rb4, rb5 ; mov ra4, ra5 # ? 
Just moves
+
-+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0
++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra0.8d, r1
++.if v_bit_depth <= 8
+ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
++.else
++ sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
++ asr ra3, r2, (v_bit_depth - 8)
++.endif
+
+ shr r2, r4, rb_xshift2 ; mov ra5, ra6
-+ shr r1, r2, 8 ; mov r3, ra_y2
++ shr r1, r2, v_v_shift ; mov r3, ra_y2
+ add ra_y2, r3, ra_k1 ; mov rb6, rb7
+
-+ and.setf -, 1, elem_num
+ max r3, r3, ra_k0 ; mov r0, r1 << 15
-+ min r3, r3, ra9 ; mov.ifz r1, r2 << 1
++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+
-+ mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch
-+ add t1s, ra_base2, r3 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte
++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++ add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
+
+# L1 H-filter
-+ mov.setf -, rb3 ; mov rb7, ra3
+
-+ and r1, r1, rb_k255 ; mul24 r3, ra1.8a, r0
++ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
+ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifnz r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
-+ brr.anyn -, r:uvloop_b
+# V filters - start in branch delay slots of H
++# Final asr not needed for 8-bit but we can't (currently) save a whole instruction
+ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++ brr.anyn -, r:1b
+ mov ra6, ra7 ; mul24 r3, ra7, rb10
-+ sub ra7, r2, r0 ; mul24 r0, rb4, ra2.8a
-+# >>> .anyn uvloop_b0
++ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
++ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
++# >>> .anyn 1b
+
-+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay]
+ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+ sub r2, r1, r0 ; mul24 r0, ra4, rb8
+ sub r1, r3, r0 ; mul24 r0, ra5, rb9
@@ -17516,22 +26071,46 @@
+ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
+
+ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9)
-+ add r1, r1, r2
++ add r1, r1, r2 ; mov r3, ra_blk_height
+
+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
+
-+ brr.anyn -, r:uvloop_b
-+ asr ra3.8as, r1, rb_wt_den_p15
-+ mov -, vw_wait
-+ mov vpm, ra3.8a
-+# >>> .anyn uvloop_b
++ brr.anyn -, r:1b
++ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
-+ bra -, ra_link
-+ mov vw_setup, rb_dma0
-+ mov vw_setup, rb_dma1 -+ mov vw_addr, rb_dest -+# >>> ra_link ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_c_b ++ m_filter_c_b 8 + +################################################################################ +# Exit code used by both Luma & Chroma so place between them to avoid I-cache @@ -17570,9 +26149,11 @@ index 0000000..58fd911 +# The code stalled when I had many waiters on a single sem so we have a +# "ripple" of srels to restart. Unsure why, may have been bug, but this works +# and we currently have both the memory & sems to support it. -+.macro m_sync_q, n_qpu -+ mov ra_link, unif -+ mov -, vw_wait ++.macro m_sync_q, n_qpu, n_quads ++# Do not generate code for qpu >= quads * 4 - fns should never be called ++.if n_qpu < n_quads * 4 ++ mov ra_link, unif # Can only branch to an a reg (not r0) ++ mov -, vw_wait # [ra_link delay] + +.set n_sem_sync, n_qpu - (n_qpu % 4) +.set n_sem_in, n_qpu @@ -17581,7 +26162,7 @@ index 0000000..58fd911 +.if n_qpu % 4 == 0 + +.set n_sem_quad_in, 12 + n_qpu / 4 -+.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % 3) ++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) + + sacq -, n_sem_sync + sacq -, n_sem_sync @@ -17601,51 +26182,67 @@ index 0000000..58fd911 + nop +.endif +.endif ++.endif +.endm + ++.set v_quads8, N_QPU_8 / 4 ++ +::mc_sync_q0 -+ m_sync_q 0 ++ m_sync_q 0, v_quads8 +::mc_sync_q1 -+ m_sync_q 1 ++ m_sync_q 1, v_quads8 +::mc_sync_q2 -+ m_sync_q 2 ++ m_sync_q 2, v_quads8 +::mc_sync_q3 -+ m_sync_q 3 ++ m_sync_q 3, v_quads8 +::mc_sync_q4 -+ m_sync_q 4 ++ m_sync_q 4, v_quads8 +::mc_sync_q5 -+ m_sync_q 5 ++ m_sync_q 5, v_quads8 +::mc_sync_q6 -+ m_sync_q 6 ++ m_sync_q 6, v_quads8 +::mc_sync_q7 -+ m_sync_q 7 ++ m_sync_q 7, v_quads8 +::mc_sync_q8 -+ m_sync_q 8 ++ m_sync_q 8, v_quads8 +::mc_sync_q9 -+ m_sync_q 9 ++ m_sync_q 9, v_quads8 +::mc_sync_q10 -+ m_sync_q 10 ++ m_sync_q 10, v_quads8 +::mc_sync_q11 -+ m_sync_q 11 ++ m_sync_q 11, v_quads8 + +# mc_exit() +# Chroma & Luma the same now -+::mc_exit_c -+::mc_exit ++ ++.macro m_exit_qn + m_exit_drain + nop ; nop ; thrend + nop + nop ++# >>> thrend <<< ++.endm ++ ++::mc_exit_c_qn ++::mc_exit_y_qn ++ m_exit_qn ++ ++ + +# mc_interrupt_exit12() -+::mc_interrupt_exit12c -+::mc_interrupt_exit12 ++ ++.macro m_exit_q0 + m_exit_drain + sacq -, 12 + nop ; nop ; thrend + mov interrupt, 1 + nop +# >>> thrend <<< ++.endm ++ ++::mc_exit_c_q0 ++::mc_exit_y_q0 ++ m_exit_q0 + +# LUMA CODE + @@ -17667,9 +26264,20 @@ index 0000000..58fd911 +# uint32_t next_fn; +# } qpu_mc_pred_y_s_t; + -+::mc_setup_y_q0 -+ m_setup_q0 -+::mc_setup_y_qn ++.macro m_setup_y, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_pmask, 0xff ++.set v_blk_height, Y_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 1 ++.set v_pmask, 0xffff ++.set v_blk_height, Y_BLK_HEIGHT_16 ++.endif ++ ++ + # Need to save these because we need to know the frame dimensions before computing texture coordinates + mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y + mov ra9, unif # ref_y_base @@ -17677,18 +26285,27 @@ index 
0000000..58fd911 + mov ra11, unif # ref_y2_base + +# load constants ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 ++ + + mov ra_kff100100, 0xff100100 -+ mov rb_k255, 255 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) + +# Compute part of VPM to use + +# Read image dimensions -+ mov ra3, unif # width_height -+ mov rb_xpitch, unif # stride2 ++ mov ra3, unif # width_height ++ mov rb_xpitch, unif # stride2 ++.if v_x_shift == 0 + sub rb_max_x, ra3.16b, 1 ++.else ++ sub r0, ra3.16b, 1 ++ shl rb_max_x, r0, v_x_shift ++.endif + sub rb_max_y, ra3.16a, 1 -+ mov rb_pitch, unif # stride1 ++ mov rb_pitch, unif # stride1 + +# get destination pitch + mov r1, vdw_setup_1(0) @@ -17696,38 +26313,44 @@ index 0000000..58fd911 + +# Compute base address for first and second access + mov r3, elem_num -+ add r0, ra0.16b, r3 # Load x + elem_num ++ add r0, ra0.16b, r3 # Load x + elem_num ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif + max r0, r0, 0 + min r0, r0, rb_max_x + shl ra_xshift_next, r0, 3 # Compute shifts + -+# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs ++# X is byte offset - we can only load words - mask + + and r0, r0, -4 ; v8subs r2, r2, r2 + sub r2, r2, rb_pitch + and r1, r0, r2 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets ++ add r0, r0, r1 # Add stripe offsets + add ra_base, ra9, r0 + + # r3 still contains elem_num -+ add r0, ra1.16b, r3 # Load x ++ add r0, ra1.16b, r3 # Load x ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif + max r0, r0, 0 + min r0, r0, rb_max_x -+ shl rb_xshift2_next, r0, 3 # Compute shifts ++ shl rb_xshift2_next, r0, 3 # Compute shifts + + # r2 still contains mask + and r0, r0, -4 + and r1, r0, r2 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets ++ add r0, r0, r1 # Add stripe offsets + add ra_base2, ra11, r0 + +# Do preloads + nop ; mov r0, ra0.16a # ; r0 = y + mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 + -+:y_preload ++:1 + sub.setf r3, r3, 1 + max r1, r0, 0 + min r1, r1, rb_max_y @@ -17735,15 +26358,15 @@ index 0000000..58fd911 + add t0s, ra_base, r1 ; mov ra_y, r0 + + max r1, r2, 0 -+ brr.anynz -, r:y_preload ++ brr.anynz -, r:1b + min r1, r1, rb_max_y + add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch + add t1s, ra_base2, r1 ; mov ra_y2, r2 -+# >>> .anynz y_preload ++# >>> .anynz 1b + -+ add rb_wt_den_p15, unif, 9 # weight denom + 6 ++ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom + -+ m_calc_dma_regs rb_vpm_init, rb_dma0_base ++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + + mov ra_link, unif # Next fn + @@ -17754,6 +26377,12 @@ index 0000000..58fd911 + mov ra10, 0 ; mov rb10, 0 + mov ra11, 0 ; mov rb11, 0 +# >>> ra_link ++.endm ++ ++::mc_setup_y_q0 ++ m_setup_q0 ++::mc_setup_y_qn ++ m_setup_y 8 + +################################################################################ +# @@ -17780,48 +26409,73 @@ index 0000000..58fd911 +# } qpu_mc_pred_y_p_t; +# + -+.macro luma_setup -+ brr ra_link, r:per_block_setup -+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] # [ra0 delay] ++.macro m_luma_setup, v_bit_depth ++# Hack - QASM may well have have label pasting but I have no idea how... 
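++# (A hedged C sketch of what this expansion-time dispatch amounts to -
++# names are illustrative, not code from this patch. The branch target is
++# fixed when the macro is expanded, so each instantiation pays no
++# run-time test on the bit depth:
++#
++#   static void luma_setup(void)      /* one copy built per bit depth */
++#   {
++#   #if V_BIT_DEPTH == 8
++#       per_block_setup_8();          /* x in pels == x in bytes */
++#   #else
++#       per_block_setup_10();         /* x doubled: 10-bit pels in 16 bits */
++#   #endif
++#   }
++# )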
++.if v_bit_depth == 8 ++ brr ra_link, r:per_block_setup_8 ++.elif v_bit_depth == 10 ++ brr ra_link, r:per_block_setup_10 ++.endif ++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? ++ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 + add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +.endm + -+:per_block_setup -+ max r0, r0, 0 ; mov ra_xshift, ra_xshift_next ++.macro m_per_block_setup, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next + min r0, r0, rb_max_x + + shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 ; v8subs r2, r2, r2 -+ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base ++ and r0, r0, -4 ++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base + and r1, r0, r2 ; mov ra_y_next, ra0.16a + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y + add ra_base_next, ra_base_next, r0 # [ra1 delay] + + add r0, ra1.16b, r3 # Load x2 -+ max r0, r0, 0 ; mov ra_y2_next, ra1.16a ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a + min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base + shl rb_xshift2_next, r0, 3 # Compute shifts + and r0, r0, -4 ; mov ra_width_height, unif # ; width_height -+ and r1, r0, r2 ++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov vw_setup, rb_vpm_init # Add stripe offsets ; set up VPM write ++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes + add rb_base2_next, rb_base2_next, r0 + -+# get width,height of block (unif load above) -+ sub rb_dma1, rb_dma1_base, ra_width # Compute vdw_setup1(dst_pitch-width) -+ add rb_i_tmu, ra_height, 7 - PREREAD ; mov r0, ra_height -+ min r0, r0, ra_k16 ++# get width,height of block (unif load above), r1 = width * pel_size ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) ++ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height + add rb_lcount, r0, 7 -+ shl r0, r0, 7 -+ add r0, r0, ra_width # Combine width and height of destination area -+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register ++ shl r0, r0, v_dma_h_shift ++ add r0, r0, r1 # Combine width and height of destination area ++ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register + add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets + +# get filter coefficients and discard unused B frame values -+ shl.ifz r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight -+ shl ra8, r0, 3 ++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight ++ shl ra8, r0, 3 ; mov r3, ra_k255 + +# Pack the 1st 4 filter coefs for H & V tightly +# Coeffs are all abs values here as that means mul24 works (no sign extend from .8) @@ -17845,35 +26499,41 @@ index 0000000..58fd911 +# In the 2nd vertical half we use b registers due to using a-side fifo regs + + mov r1,0x3a281100 -+ ror r0, r1, ra8.8d ; mov 
ra_wt_off_mul_l1, unif -+ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, rb_k255 ++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 + + mov r1,0x0a0b0500 # -ve + ror r0, r1, ra8.8d -+ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, rb_k255 ++ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 + + mov r1,0x04040100 + ror r0, r1, ra8.8d -+ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, rb_k255 ++ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 + -+ mov.ifnz ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address ++ mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address + + mov r1,0x01010000 # -ve + ror r0, r1, ra8.8d -+ bra -, ra_link -+ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, rb_k255 + -+ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 ++ bra -, ra_link ++ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 ++ ++ shl r0, ra_wt_off_l0, rb_wt_den_p15 # Offset calc + # For B l1 & L0 offsets should be identical so it doesn't matter which we use + asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val +# >>> branch ra_link + -+# r3 = 0 ++# r5 = 0 +# ra_wt_mul_l1 = weight L1 +# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) +# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1) +# rb_wt_den_p15 = weight denom + 6 + 9 +# rb_wt_mul_l0 = weight L0 ++.endm ++ ++:per_block_setup_8 ++ m_per_block_setup 8 ++ + + +################################################################################ @@ -17881,14 +26541,14 @@ index 0000000..58fd911 +# In a P block, y2_x2 should be y_x+8 +# At this point we have already issued two pairs of texture requests for the current block + -+::mc_filter -+ luma_setup ++.macro m_filter_y_pxx, v_bit_depth ++ m_luma_setup v_bit_depth + + shl ra_wt_mul_l0, ra_wt_mul_l0, 1 + +# r5 = 0 (loop count) + -+:yloop ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests + @@ -17906,45 +26566,39 @@ index 0000000..58fd911 + add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + + max r2, ra_y2, 0 -+ min r2, r2, rb_max_y ++ min r2, r2, rb_max_y ; mov ra7, ra8 + add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 + +# apply horizontal filter -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 + nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ 
"mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + -+ sub.setf -, r5, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 -+ brr.anyn -, r:yloop -+ mov ra9, ra10 ; mov rb9, rb10 ++ sub.setf -, r5, 8 ; mov ra9, ra10 ++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++ brr.anyn -, r:1b ++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b + mov ra10, ra11 ; mov rb10, rb11 -+ sub ra11, r2, r3 ; mov rb11, r1 -+ # >>> .anyn yloop ++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++ # >>> .anyn 1b + + # apply vertical filter and write to VPM -+ -+ nop ; mul24 r0, rb8, ra2.8a -+ nop ; mul24 r1, rb9, ra2.8b + sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c + sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d + add r1, r1, r0 ; mul24 r0, ra8, rb4 @@ -17959,38 +26613,46 @@ index 0000000..58fd911 + sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish + asr r1, r1, 14 + nop ; mul24 r1, r1, ra_wt_mul_l0 -+ add r1, r1, rb_wt_off ++ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop ++ ++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch + -+ shl r1, r1, 8 ; mov r0, ra_height -+ brr.anyn -, r:yloop -+ asr ra3.8as, r1, rb_wt_den_p15 -+ mov r1, ra_k16 ; mov -, vw_wait -+ sub r0, r0, r1 ; mov vpm, ra3.8a +# >>> branch.anyn yloop + -+# If looping again the we consumed 16 height last loop -+ # rb_dma1 (stride) remains constant -+ # rb_i_tmu remains const (based on total height) -+ # recalc rb_dma0, rb_lcount based on new segment height -+ # N.B. 
r3 is loop counter still ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link -+ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0 -+ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride -+ nop ; mov vw_addr, rb_dest # start the VDW ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 +# >>> .anyz ra_link + ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block + add rb_lcount, rb_lcount, r0 -+ shl r0, r2, i_shift23 -+ add rb_dma0, rb_dma0, r0 -+ brr -, r:yloop -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 + mov vw_setup, rb_vpm_init # Reset our VDM write pointer -+# >>> yloop ++# >>> 1b ++.endm ++ ++::mc_filter_y_pxx ++ m_filter_y_pxx 8 + + +################################################################################ @@ -17998,25 +26660,15 @@ index 0000000..58fd911 +# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +# In a P block, only the first half of coefficients contain used information. +# At this point we have already issued two pairs of texture requests for the current block -+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?) -+# Can fill in the coefficients so only -+# Can also assume default weighted prediction for B frames. +# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time? +# Or possibly by taking advantage of symmetry? -+# From 19->7 32bits per command. + -+::mc_filter_b -+ luma_setup ++.macro m_filter_y_bxx, v_bit_depth ++ m_luma_setup v_bit_depth + -+:yloopb -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+# If we knew there was no clipping then this code would get simpler. -+# Perhaps we could add on the pitch and clip using larger values? 
-+ -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 + shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y @@ -18025,44 +26677,39 @@ index 0000000..58fd911 + add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + + max r2, ra_y2, 0 -+ min r2, r2, rb_max_y ++ min r2, r2, rb_max_y ; mov ra7, ra8 + add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 + +# apply horizontal filter -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 + nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + -+ sub.setf -, r5, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 -+ brr.anyn -, r:yloopb -+ mov ra9, ra10 ; mov rb9, rb10 ++ sub.setf -, r5, 8 ; mov ra9, ra10 ++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++ brr.anyn -, r:1b ++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b + mov ra10, ra11 ; mov rb10, rb11 -+ sub ra11, r2, r3 ; mov rb11, r1 -+ # >>> .anyn yloopb ++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++ # >>> .anyn 1b + + # apply vertical filter and write to VPM -+ nop ; mul24 r0, rb8, ra2.8a -+ nop ; mul24 r1, rb9, ra2.8b + sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c + sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d + add r1, r1, r0 ; mul24 r0, ra8, rb4 @@ -18078,37 +26725,44 @@ index 0000000..58fd911 + nop ; mul24 r0, r1, ra_wt_mul_l0 + add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 + -+ add r1, r1, r0 -+ shl r1, r1, 8 ; mov r0, ra_height -+ brr.anyn -, r:yloopb -+ asr ra3.8as, r1, rb_wt_den_p15 -+ mov r1, ra_k16 ; mov -, vw_wait -+ sub r0, r0, r1 ; mov vpm, ra3.8a -+# >>> branch.anyn yloop ++ add r1, r1, r0 ; mov r3, 
ra_blk_height ++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b + -+# If looping again the we consumed 16 height last loop -+ # rb_dma1 (stride) remains constant -+ # rb_i_tmu remains const (based on total height) -+ # recalc rb_dma0, rb_lcount based on new segment height -+ # N.B. r5 is loop counter still ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link -+ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0 -+ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride -+ nop ; mov vw_addr, rb_dest # start the VDW ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 +# >>> .anyz ra_link + ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block + add rb_lcount, rb_lcount, r0 -+ shl r0, r2, i_shift23 -+ add rb_dma0, rb_dma0, r0 -+ brr -, r:yloopb -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 + mov vw_setup, rb_vpm_init # Reset our VDM write pointer -+# >>> yloopb ++# >>> 1b ++.endm ++ ++::mc_filter_y_bxx ++ m_filter_y_bxx 8 + +################################################################################ +# @@ -18121,10 +26775,28 @@ index 0000000..58fd911 +# uint32_t next_fn; +# } qpu_mc_pred_y_p00_t; + -+::mc_filter_y_p00 -+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? 
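++# A rough host-side sketch of filling this command (field names from
++# rpi_shader_cmd.h; the wo1 packing - weight in the top 16 bits,
++# offset * 2 + 1 in the low 16 - is inferred from the "wo[offset] =
++# offset*2+1" note in the shader template, so treat it as an assumption
++# rather than a spec):
++#
++#   qpu_mc_pred_y_p00_t *const c = next_cmd_slot;      /* illustrative */
++#   c->next_src1.x = x; c->next_src1.y = y; c->next_src1.base = src_base;
++#   c->w = pb_w; c->h = pb_h;
++#   c->wo1 = ((uint32_t)weight << 16) | (uint16_t)(offset * 2 + 1);
++#   c->dst_addr = dst;
++#   c->next_fn = s->qpu.y_p00;        /* code address of this kernel */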
++.macro m_filter_y_p00, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++ mov ra0, unif ; mov r3, elem_num # y_x + mov ra_xshift, ra_xshift_next # [ra0 delay] + add r0, ra0.16b, r3 ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif + + max r0, r0, 0 + min r0, r0, rb_max_x @@ -18135,23 +26807,23 @@ index 0000000..58fd911 + and r1, r0, r2 ; mov ra_y_next, ra0.16a + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height -+ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # ; set up VPM write ++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write + +# get width,height of block (unif load above) -+ sub rb_dma1, rb_dma1_base, ra_width # Compute vdw_setup1(dst_pitch-width) -+ sub rb_i_tmu, ra_height, PREREAD ; mov r0, ra_height -+ min r0, r0, ra_k16 -+ add rb_lcount, r0, 0 ; mov ra_wt_off_mul_l0, unif -+ shl r0, r0, 7 ; mov rb_dest, unif # Destination address -+ add r0, r0, ra_width # Combine width and height of destination area -+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register ++# Compute vdw_setup1(dst_pitch-width) ++ shl r1, ra_width, v_x_shift ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset ++ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr + add rb_dma0, r0, rb_dma0_base + + shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 + # For B l1 & L0 offsets should be identical so it doesn't matter which we use + asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link + -+:yloop_p00 ++:1 + sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 + nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 + shr r0, r4, ra_xshift ; mov r3, rb_pitch @@ -18159,48 +26831,55 @@ index 0000000..58fd911 + max r2, ra_y, 0 # y + min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next + add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask + + sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 -+ shl r1, r1, 15 ; mov r0, ra_height -+ add r1, r1, rb_wt_off ++ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 + -+ brr.anyn -, r:yloop_p00 -+ asr ra3.8as, r1, rb_wt_den_p15 -+ mov r1, ra_k16 ; mov -, vw_wait -+ sub r0, r0, r1 ; mov vpm, ra3.8a -+# >>> branch.anyn yloop_p00 ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b + -+# If looping again the we consumed 16 height last loop -+ # rb_dma1 (stride) remains constant -+ # rb_i_tmu remains const (based on total height) -+ # recalc rb_dma0, rb_lcount based on new segment height -+ # N.B. 
r5 is loop counter still ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link -+ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0 -+ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride -+ nop ; mov vw_addr, rb_dest # start the VDW ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 +# >>> .anyz ra_link + ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block + add rb_lcount, rb_lcount, r0 -+ shl r0, r2, i_shift23 -+ add rb_dma0, rb_dma0, r0 -+ brr -, r:yloop_p00 -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 + mov vw_setup, rb_vpm_init # Reset our VDM write pointer -+# >>> yloop_p00 ++# >>> 1b ++.endm ++ ++::mc_filter_y_p00 ++ m_filter_y_p00 8 + +################################################################################ + -+::mc_filter_y_b00 ++.macro m_filter_y_b00, v_bit_depth +# luma setup does a fair bit more than we need calculating filter coeffs +# that we will never use but it saves I-cache to use it (also simple!) -+ luma_setup ++ m_luma_setup v_bit_depth + +# Fix up vals that were expecting a filter (somewhat icky) + mov r0, 7 @@ -18210,7 +26889,7 @@ index 0000000..58fd911 + shl rb_wt_off, rb_wt_off, r0 + nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 + -+:yloop_b00 ++:1 + sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 + shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 + shr r0, r4, ra_xshift ; mov r3, rb_pitch @@ -18223,64 +26902,157 @@ index 0000000..58fd911 + max r2, ra_y2, 0 + min r2, r2, rb_max_y + add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte -+ and r1, r1, rb_k255 ; mul24 r0, r0, ra_wt_mul_l0 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte ++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 + + sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 + add r1, r0, r1 -+ shl r1, r1, 14 -+ add r1, r1, rb_wt_off ; mov r0, ra_height ++ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 + -+ brr.anyn -, r:yloop_b00 -+ asr ra3.8as, r1, rb_wt_den_p15 -+ mov r1, ra_k16 ; mov -, vw_wait -+ sub r0, r0, r1 ; mov vpm, ra3.8a -+# >>> branch.anyn yloop ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b + -+# If looping again the we consumed 16 height last loop -+ # rb_dma1 (stride) remains constant -+ # rb_i_tmu remains const (based on total height) -+ # recalc rb_dma0, rb_lcount based on new segment height -+ # N.B. 
r5 is loop counter still ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link -+ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0 -+ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride -+ nop ; mov vw_addr, rb_dest # start the VDW ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 +# >>> .anyz ra_link + ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block + add rb_lcount, rb_lcount, r0 -+ shl r0, r2, i_shift23 -+ add rb_dma0, rb_dma0, r0 -+ brr -, r:yloop_b00 -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 + mov vw_setup, rb_vpm_init # Reset our VDM write pointer -+# >>> yloopb00 ++# >>> 1b ++.endm ++ ++::mc_filter_y_b00 ++ m_filter_y_b00 8 + +################################################################################ ++################################################################################ ++# 10 BIT ++ ++::mc_setup_c10_q0 ++ m_setup_q0 ++::mc_setup_c10_qn ++ m_setup_c 10 ++ ++::mc_filter_c10_p ++ m_filter_c_p 0, 10 ++ ++::mc_filter_c10_p_l1 ++ m_filter_c_p 1, 10 ++ ++ ++::mc_filter_c10_b ++ m_filter_c_b 10 ++ ++# Even if these fns are the same as for other bit depths we want our own copy ++# to keep the code we are using in a single lump to avoid (direct map) cache ++# thrashing ++.set v_quads10, N_QPU_16 / 4 ++ ++::mc_sync10_q0 ++ m_sync_q 0, v_quads10 ++::mc_sync10_q1 ++ m_sync_q 1, v_quads10 ++::mc_sync10_q2 ++ m_sync_q 2, v_quads10 ++::mc_sync10_q3 ++ m_sync_q 3, v_quads10 ++::mc_sync10_q4 ++ m_sync_q 4, v_quads10 ++::mc_sync10_q5 ++ m_sync_q 5, v_quads10 ++::mc_sync10_q6 ++ m_sync_q 6, v_quads10 ++::mc_sync10_q7 ++ m_sync_q 7, v_quads10 ++::mc_sync10_q8 ++ m_sync_q 8, v_quads10 ++::mc_sync10_q9 ++ m_sync_q 9, v_quads10 ++::mc_sync10_q10 ++ m_sync_q 10, v_quads10 ++::mc_sync10_q11 ++ m_sync_q 11, v_quads10 ++ ++::mc_exit_y10_q0 ++::mc_exit_c10_q0 ++ m_exit_q0 ++ ++::mc_exit_y10_qn ++::mc_exit_c10_qn ++ m_exit_qn ++ ++::mc_setup_y10_q0 ++ m_setup_q0 ++::mc_setup_y10_qn ++ m_setup_y 10 ++ ++:per_block_setup_10 ++ m_per_block_setup 10 ++ ++::mc_filter_y10_pxx ++ m_filter_y_pxx 10 ++ ++::mc_filter_y10_p00 ++ m_filter_y_p00 10 ++ ++::mc_filter_y10_bxx ++ m_filter_y_bxx 10 ++ ++::mc_filter_y10_b00 ++ m_filter_y_b00 10 ++ ++ + +::mc_end +# Do not add code here because mc_end must appear after all other code. diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h new file mode 100644 -index 0000000..838b6bd +index 0000000000..9f8983da52 --- /dev/null +++ b/libavcodec/rpi_shader_cmd.h -@@ -0,0 +1,112 @@ +@@ -0,0 +1,128 @@ +#ifndef RPI_SHADER_CMD_H +#define RPI_SHADER_CMD_H + +#pragma pack(push, 4) + ++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y ++// If mixed then we are just confused and get a lot of warnings.... 
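++// The emulated build stores real host pointers in the command stream so
++// the C shader template can dereference them; the QPU build stores 32-bit
++// VideoCore bus addresses in the same fields. A hedged usage sketch
++// (vc_bus_addr_of() is an assumed helper, not an API in this patch):
++//
++//   #if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
++//       src.base = (const uint8_t *)frame->data[0];  // host pointer
++//   #else
++//       src.base = vc_bus_addr_of(frame->buf[0]);    // bus address
++//   #endif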
++typedef const uint8_t * qpu_mc_src_addr_t; ++typedef uint8_t * qpu_mc_dst_addr_t; ++#else ++typedef uint32_t qpu_mc_src_addr_t; ++typedef uint32_t qpu_mc_dst_addr_t; ++#endif ++ +typedef struct qpu_mc_src_s +{ + int16_t y; + int16_t x; -+ uint32_t base; ++ qpu_mc_src_addr_t base; +} qpu_mc_src_t; + + @@ -18292,7 +27064,7 @@ index 0000000..838b6bd + uint32_t coeffs_y; + uint32_t wo_u; + uint32_t wo_v; -+ uint32_t dst_addr_c; ++ qpu_mc_dst_addr_t dst_addr_c; + uint32_t next_fn; +} qpu_mc_pred_c_p_t; + @@ -18309,7 +27081,7 @@ index 0000000..838b6bd + uint32_t coeffs_y2; + uint32_t wo_u2; + uint32_t wo_v2; -+ uint32_t dst_addr_c; ++ qpu_mc_dst_addr_t dst_addr_c; + uint32_t next_fn; +} qpu_mc_pred_c_b_t; + @@ -18341,7 +27113,7 @@ index 0000000..838b6bd + uint32_t mymx21; + uint32_t wo1; + uint32_t wo2; -+ uint32_t dst_addr; ++ qpu_mc_dst_addr_t dst_addr; + uint32_t next_fn; +} qpu_mc_pred_y_p_t; + @@ -18350,7 +27122,7 @@ index 0000000..838b6bd + uint16_t h; + uint16_t w; + uint32_t wo1; -+ uint32_t dst_addr; ++ qpu_mc_dst_addr_t dst_addr; + uint32_t next_fn; +} qpu_mc_pred_y_p00_t; + @@ -18377,24 +27149,618 @@ index 0000000..838b6bd +typedef union qpu_mc_pred_cmd_u { + qpu_mc_pred_y_t y; + qpu_mc_pred_c_t c; ++ uint32_t data[1]; +} qpu_mc_pred_cmd_t; + ++#define QPU_MC_PRED_N_Y8 12 ++#define QPU_MC_PRED_N_C8 12 ++ ++#define QPU_MC_PRED_N_Y10 12 ++#define QPU_MC_PRED_N_C10 12 ++ +#pragma pack(pop) + +#endif + +diff --git a/libavcodec/rpi_shader_template.c b/libavcodec/rpi_shader_template.c +new file mode 100644 +index 0000000000..2d763f54ef +--- /dev/null ++++ b/libavcodec/rpi_shader_template.c +@@ -0,0 +1,66 @@ ++#ifdef RPI ++ ++#include "hevc.h" ++#include "hevcdec.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "rpi_shader_cmd.h" ++#include "rpi_shader_template.h" ++ ++typedef struct shader_track_s ++{ ++ const union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ const struct qpu_mc_src_s *last_l0; ++ const struct qpu_mc_src_s *last_l1; ++ uint32_t width; // pic_width * PW ++ uint32_t height; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++} shader_track_t; ++ ++static int wtoidx(const unsigned int w) ++{ ++ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++ return pel_weight[w]; ++} ++ ++static const int fctom(uint32_t x) ++{ ++ int rv; ++ // As it happens we can take the 2nd filter term & divide it by 8 ++ // (dropping fractions) to get the fractional move ++ rv = 8 - ((x >> 11) & 0xf); ++ av_assert2(rv >= 0 && rv <= 7); ++ return rv; ++} ++ ++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) ++{ ++ return (x << shl) >> shr; ++} ++ ++static inline int woff_p(HEVCContext *const s, int32_t x) ++{ ++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int woff_b(HEVCContext *const s, int32_t x) ++{ ++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int wweight(int32_t x) ++{ ++ return ext(x, 16, 16); ++} ++ ++ ++#define PW 1 ++#include "rpi_shader_template_fn.h" ++ ++#undef PW ++#define PW 2 ++#include "rpi_shader_template_fn.h" ++ ++#endif ++ +diff --git a/libavcodec/rpi_shader_template.h b/libavcodec/rpi_shader_template.h +new file mode 100644 +index 0000000000..ecf5b8185a +--- /dev/null ++++ b/libavcodec/rpi_shader_template.h +@@ -0,0 +1,24 @@ ++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++ ++#ifdef RPI ++struct HEVCContext; ++struct HEVCRpiInterPredEnv; ++ ++void 
rpi_shader_c8(struct HEVCContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void rpi_shader_c16(struct HEVCContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void rpi_sand_dump8(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++void rpi_sand_dump16(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++#endif ++#endif ++ +diff --git a/libavcodec/rpi_shader_template_fn.h b/libavcodec/rpi_shader_template_fn.h +new file mode 100644 +index 0000000000..b5ac2ceed6 +--- /dev/null ++++ b/libavcodec/rpi_shader_template_fn.h +@@ -0,0 +1,477 @@ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++#define PATCH_STRIDE (16 * PW) ++ ++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { ++ const pixel s = *(const pixel *)src; ++ pixel * d = (pixel *)dst; ++ for (unsigned int j = 0; j < w; j += PW) { ++ *d++ = s; ++ } ++ } ++} ++ ++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride) { ++ memcpy(dst, src, w); ++ } ++} ++ ++static void FUNC(get_patch_y)(const shader_track_t * const st, ++ uint8_t * dst, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > st->width) { ++ if (x >= st->width) ++ x = st->width - PW; ++ dr = (x + w) - st->width; ++ w = st->width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > st->height) { ++ if (y >= st->height) ++ y = st->height - 1; ++ db = (y + h) - st->height; ++ h = st->height - y; ++ } ++ ++ dst += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup ++ if (dl != 0) ++ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); ++ if (dr != 0) ++ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); ++ w += dl + dr; ++ dst -= dl; ++ ++ if (dt != 0) ++ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride); ++ if (db != 0) ++ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); ++} ++ ++ ++ ++static void FUNC(get_patch_c)(const shader_track_t * const st, ++ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ const int width = st->width; ++ const int height = st->height; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > width) { ++ if (x >= width) ++ x = width - PW; ++ dr = (x + w) - 
width; ++ w = width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > height) { ++ if (y >= height) ++ y = height - 1; ++ db = (y + h) - height; ++ h = height - y; ++ } ++ ++ dst_u += dl + dt * dst_stride; ++ dst_v += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup ++ if (dl != 0) ++ { ++ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); ++ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); ++ } ++ if (dr != 0) ++ { ++ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); ++ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); ++ } ++ w += dl + dr; ++ dst_u -= dl; ++ dst_v -= dl; ++ ++ if (dt != 0) ++ { ++ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); ++ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); ++ } ++ if (db != 0) ++ { ++ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); ++ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); ++ } ++} ++ ++// w, y, w, h in pixels ++// stride1, stride2 in bytes ++void FUNC(rpi_sand_dump)(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c) ++{ ++ const int mask = stride2 == 0 ? ~0 : stride1 - 1; ++ ++ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h); ++ ++ if (is_c) { ++ x *= 2; ++ w *= 2; ++ } ++ ++ for (int i = y; i != y + h; ++i) { ++ for (int j = x; j != x + w; ++j) { ++ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; ++ char sep = is_c && (j & 1) == 0 ? ':' : ' '; ++#if PW == 1 ++ if (j < 0 || i < 0) ++ printf("..%c", sep); ++ else ++ printf("%02x%c", *(const pixel*)p, sep); ++#else ++ if (j < 0 || i < 0) ++ printf("...%c", sep); ++ else ++ printf("%03x%c", *(const pixel*)p, sep); ++#endif ++ } ++ printf("\n"); ++ } ++} ++ ++ ++void FUNC(rpi_shader_c)(HEVCContext *const s, ++ const HEVCRpiInterPredEnv *const ipe_y, ++ const HEVCRpiInterPredEnv *const ipe_c) ++{ ++ for (int c_idx = 0; c_idx < 2; ++c_idx) ++ { ++ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c; ++ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; ++ unsigned int exit_n = 0; ++ ++ if (ipe == NULL || !ipe->used) { ++ continue; ++ } ++ ++ do { ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ const HEVCRpiInterPredQ * const q = ipe->q + i; ++ shader_track_t * const st = tracka + i; ++ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; ++ ++ for (;;) { ++ const uint32_t link = (cmd == q->qpu_mc_base) ? 
q->code_setup : ((uint32_t *)cmd)[-1]; ++ ++ if (link == q->code_setup) { ++ if (c_idx == 0) { ++ // Luma ++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; ++ ++ st->height = c->pic_h; ++ st->width = c->pic_w * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->wdenom = c->wdenom; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else { ++ // Chroma ++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; ++ ++ st->height = c->pic_ch; ++ st->width = c->pic_cw * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->wdenom = c->wdenom; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ } ++ else if (link == s->qpu.y_pxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ const int w1 = FFMIN(c->w, 8); ++ const int w2 = c->w - w1; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ if (w2 > 0) { ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ } ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); ++ if (w2 > 0) { ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); ++ } ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_bxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, ++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_p00) { ++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( ++ 
(uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); ++ ++ st->last_l0 = &c->next_src1; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_b00) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ av_assert0(c->w <= 16 && c->h <= 64); ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( ++ patch_y3, patch_y1, PATCH_STRIDE, ++ c->h, 0, 0, c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, ++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), 0, 0, c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx_l1) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l1 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_bxx) { ++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; ++ const int mx1 = fctom(c->coeffs_x1); ++ 
const int my1 = fctom(c->coeffs_y1); ++ const int mx2 = fctom(c->coeffs_x2); ++ const int my2 = fctom(c->coeffs_y2); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; ++ uint8_t patch_v1[PATCH_STRIDE * 72]; ++ uint8_t patch_u2[PATCH_STRIDE * 72]; ++ uint8_t patch_v2[PATCH_STRIDE * 72]; ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, ++ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2), ++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, ++ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2), ++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == q->code_sync) { ++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); ++ break; ++ } ++ else if (link == q->code_exit) { ++ // We expect exit to occur without other sync ++ av_assert0(i == exit_n); ++ ++exit_n; ++ break; ++ } ++ else { ++ av_assert0(0); ++ } ++ } ++ ++ st->qpu_mc_curr = cmd; ++ } ++ } while (exit_n == 0); ++ } ++} ++ ++#undef FUNC ++#undef pixel ++ diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c new file mode 100644 -index 0000000..b061fe0 +index 0000000000..b502de0a2c --- /dev/null +++ b/libavcodec/rpi_zc.c -@@ -0,0 +1,581 @@ +@@ -0,0 +1,745 @@ +#include "config.h" +#ifdef RPI ++#include "libavcodec/avcodec.h" +#include "rpi_qpu.h" +#include "rpi_mailbox.h" +#include "rpi_zc.h" +#include "libavutil/avassert.h" ++#include "libavutil/rpi_sand_fns.h" +#include + +#include "libavutil/buffer_internal.h" @@ -18421,21 +27787,11 @@ index 0000000..b061fe0 + struct ZcPool * pool; +} ZcPoolEnt; + -+#if 1 -+//#define ALLOC_PAD 0x1000 -+#define ALLOC_PAD 0 -+#define ALLOC_ROUND 0x1000 -+//#define ALLOC_N_OFFSET 0x100 -+#define ALLOC_N_OFFSET 0 -+#define STRIDE_ROUND 0x80 -+#define STRIDE_OR 0x80 -+#else +#define ALLOC_PAD 0 +#define ALLOC_ROUND 0x1000 +#define ALLOC_N_OFFSET 0 -+#define STRIDE_ROUND 32 ++#define STRIDE_ROUND 64 +#define STRIDE_OR 0 -+#endif + +#define DEBUG_ZAP0_BUFFERS 0 + @@ -18612,13 +27968,22 @@ index 0000000..b061fe0 + { + case AV_PIX_FMT_YUV420P: + geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; -+ // geo.stride_y = ((video_width + 32 + 31) & ~31); + geo.stride_c = geo.stride_y / 2; -+ // geo.height_y = (video_height + 15) & ~15; + geo.height_y = (video_height + 32 + 31) & ~31; + geo.height_c = geo.height_y / 2; + geo.planes_c = 2; + geo.stripes = 1; ++ geo.bytes_per_pel = 1; ++ break; ++ ++ case AV_PIX_FMT_YUV420P10: ++ geo.stride_y = ((video_width * 
2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++ geo.stride_c = geo.stride_y / 2; ++ geo.height_y = (video_height + 32 + 31) & ~31; ++ geo.height_c = geo.height_y / 2; ++ geo.planes_c = 2; ++ geo.stripes = 1; ++ geo.bytes_per_pel = 2; + break; + + case AV_PIX_FMT_SAND128: @@ -18653,6 +28018,7 @@ index 0000000..b061fe0 + geo.height_c = img.pitch / stripe_w - geo.height_y; + geo.planes_c = 1; + geo.stripes = (video_width + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 1; + + pthread_mutex_unlock(&sand_lock); + @@ -18661,6 +28027,45 @@ index 0000000..b061fe0 + break; + } + ++ case AV_PIX_FMT_SAND64_16: ++ case AV_PIX_FMT_SAND64_10: ++ { ++ const unsigned int stripe_w = 128; // bytes ++ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV_UV_16, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ gpu_ref(); ++ mbox_get_image_params(gpu_get_mailbox(), &new_img); ++ gpu_unref(); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.planes_c = 1; ++ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 2; ++ ++ pthread_mutex_unlock(&sand_lock); ++ break; ++ } ++ + default: + memset(&geo, 0, sizeof(geo)); + break; @@ -18733,8 +28138,12 @@ index 0000000..b061fe0 + frame->linesize[0] = geo.stride_y; + frame->linesize[1] = geo.stride_c; + frame->linesize[2] = geo.stride_c; ++ // abuse: linesize[3] = "stripe stride" ++ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). 
++ // In a general case this makes the calculation an xor and multiply rather ++ // than a divide and multiply + if (geo.stripes > 1) -+ frame->linesize[3] = geo.height_y + geo.height_c; // abuse: linesize[3] = stripe stride ++ frame->linesize[3] = geo.height_y + geo.height_c; + + frame->data[0] = buf->data; + frame->data[1] = frame->data[0] + size_y; @@ -18744,6 +28153,11 @@ index 0000000..b061fe0 + frame->extended_data = frame->data; + // Leave extended buf alone + ++#if RPI_ZC_SAND_8_IN_10_BUF != 0 ++ // *** If we intend to use this for real we will want a 2nd buffer pool ++ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = rpi_buf_pool_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge ++#endif ++ + return 0; +} + @@ -18762,7 +28176,7 @@ index 0000000..b061fe0 + rv = avcodec_default_get_buffer2(s, frame, flags); + } + else if (frame->format == AV_PIX_FMT_YUV420P || -+ frame->format == AV_PIX_FMT_SAND128) ++ av_rpi_is_sand_frame(frame)) + { + rv = rpi_get_display_buffer(s->get_buffer_context, frame); + } @@ -18792,6 +28206,7 @@ index 0000000..b061fe0 + unsigned int i; + uint8_t * psrc, * pdest; + ++ dest->format = src->format; + dest->width = src->width; + dest->height = src->height; + @@ -18823,29 +28238,142 @@ index 0000000..b061fe0 +} + + ++static AVBufferRef * zc_420p10_to_sand128(struct AVCodecContext * const s, ++ const AVFrame * const src) ++{ ++ AVFrame dest_frame; ++ AVFrame * const dest = &dest_frame; ++ unsigned int i; ++ uint8_t * psrc, * psrc2, * pdest; ++ ++ memset(dest, 0, sizeof(*dest)); ++ dest->format = AV_PIX_FMT_SAND128; ++ dest->width = src->width; ++ dest->height = src->height; ++ ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) ++ { ++ return NULL; ++ } ++ ++ // Y ++ for (i = 0, psrc = src->data[0], pdest = dest->data[0]; ++ i != dest->height; ++ ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) ++ { ++ uint16_t * s = (uint16_t*)psrc; ++ uint8_t * d = pdest; ++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[0]) ++ { ++ const unsigned int n = FFMIN(dest->linesize[0], dest->width - k); ++ for (unsigned int j = 0; j != n; ++j) ++ *d++ = (uint8_t)(*s++ >> 2); ++ d += (dest->linesize[3] - 1) * dest->linesize[0]; ++ } ++ } ++ ++ // C ++ for (i = 0, psrc = src->data[1], psrc2 = src->data[2], pdest = dest->data[1]; ++ i != dest->height / 2; ++ ++i, psrc += src->linesize[1], psrc2 += src->linesize[2], pdest += dest->linesize[1]) ++ { ++ const uint16_t * su = (uint16_t*)psrc; ++ const uint16_t * sv = (uint16_t*)psrc2; ++ uint8_t * d = pdest; ++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[1]) ++ { ++ const unsigned int n = FFMIN(dest->linesize[1], dest->width - k) / 2; ++ for (unsigned int j = 0; j != n; ++j) ++ { ++ *d++ = (uint8_t)(*su++ >> 2); ++ *d++ = (uint8_t)(*sv++ >> 2); ++ } ++ d += (dest->linesize[3] - 1) * dest->linesize[1]; ++ } ++ } ++ ++ return dest->buf[0]; ++} ++ ++ ++static AVBufferRef * zc_sand64_16_to_sand128(struct AVCodecContext * const s, ++ const AVFrame * const src, const unsigned int src_bits) ++{ ++ AVFrame dest_frame = { ++ .format = AV_PIX_FMT_SAND128, ++ .width = src->width, ++ .height = src->height ++ }; ++ AVFrame * const dest = &dest_frame; ++ const unsigned int shr = src_bits - 8; ++ ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) ++ { ++ return NULL; ++ } ++ ++ // Y ++ av_rpi_sand16_to_sand8(dest->data[0], dest->linesize[0], av_rpi_sand_frame_stride2(dest), ++ src->data[0], src->linesize[0], av_rpi_sand_frame_stride2(dest), ++ src->width, src->height, shr); ++ // C ++ 
av_rpi_sand16_to_sand8(dest->data[1], dest->linesize[1], av_rpi_sand_frame_stride2(dest), ++ src->data[1], src->linesize[1], av_rpi_sand_frame_stride2(dest), ++ src->width, src->height / 2, shr); ++ ++ return dest->buf[0]; ++} ++ ++ ++ +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, -+ const AVFrame * const frame, const int maycopy) ++ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy) +{ + assert(s != NULL); + + if (frame->format != AV_PIX_FMT_YUV420P && -+ frame->format != AV_PIX_FMT_SAND128) ++ frame->format != AV_PIX_FMT_YUV420P10 && ++ !av_rpi_is_sand_frame(frame)) + { + av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); + return NULL; + } + -+ if (frame->buf[1] != NULL) ++ if (frame->buf[1] != NULL || frame->format != expected_format) + { -+ av_assert0(frame->format == AV_PIX_FMT_YUV420P); ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL) ++ { ++// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__); ++ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]); ++ } ++#endif ++ + if (maycopy) + { -+ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); -+ return zc_copy(s, frame); ++ if (frame->buf[1] != NULL) ++ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); ++ else ++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format); ++ ++ switch (frame->format) ++ { ++ case AV_PIX_FMT_YUV420P10: ++ return zc_420p10_to_sand128(s, frame); ++ ++ case AV_PIX_FMT_SAND64_10: ++ return zc_sand64_16_to_sand128(s, frame, 10); ++ ++ default: ++ return zc_copy(s, frame); ++ } + } + else + { -+ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: NULL\n", __func__); ++ if (frame->buf[1] != NULL) ++ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__); ++ else ++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format); + return NULL; + } + } @@ -18972,10 +28500,10 @@ index 0000000..b061fe0 + diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h new file mode 100644 -index 0000000..f4aeb78 +index 0000000000..26fb3be999 --- /dev/null +++ b/libavcodec/rpi_zc.h -@@ -0,0 +1,137 @@ +@@ -0,0 +1,105 @@ +#ifndef LIBAVCODEC_RPI_ZC_H +#define LIBAVCODEC_RPI_ZC_H + @@ -18986,23 +28514,33 @@ index 0000000..f4aeb78 +// bit of memory for the frame when can then be reference counted until +// display has finished with it. 
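The conversion fallbacks above mean a caller only ever has to deal with one pattern: request a single reference in the format the display pipeline expects, and let av_rpi_zc_ref() copy or down-convert when the decoder handed over something else. A minimal calling sketch follows; display_submit() is a hypothetical sink standing in for the real display path (which is not part of this patch), av_rpi_zc_unref() is assumed to be the matching release call declared later in this header, and error handling is trimmed:

    // Hedged sketch, not patch code: display_submit() is an assumed
    // placeholder for whatever hands the GPU buffer to the renderer.
    static int send_frame_to_display(struct AVCodecContext * const avctx,
                                     const struct AVFrame * const frame)
    {
        // One ref covers the whole picture; maycopy = 1 enables the
        // 420P10 -> SAND128 and SAND64_10 -> SAND128 fallbacks above.
        AVRpiZcRefPtr ref = av_rpi_zc_ref(avctx, frame, AV_PIX_FMT_SAND128, 1);
        if (ref == NULL)
            return -1;

        // The sink must keep the buffer referenced until the hardware has
        // finished displaying it, then drop it with av_rpi_zc_unref(ref).
        display_submit(av_rpi_zc_vc_handle(ref), ref);
        return 0;
    }
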
+ -+#include "libavutil/frame.h" -+#include "libavcodec/avcodec.h" ++// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame ++// 0 disables ++// *** This option still in development ++// Only works if SAO active ++// Allocates buffers that are twice the required size ++#define RPI_ZC_SAND_8_IN_10_BUF 0 ++ ++struct AVBufferRef; ++struct AVFrame; ++struct AVCodecContext; ++enum AVPixelFormat; + +// "Opaque" pointer to whatever we are using as a buffer reference -+typedef AVBufferRef * AVRpiZcRefPtr; ++typedef struct AVBufferRef * AVRpiZcRefPtr; + +struct AVZcEnv; +typedef struct AVZcEnv * AVZcEnvPtr; + +typedef struct AVRpiZcFrameGeometry +{ -+ unsigned int stride_y; -+ unsigned int height_y; -+ unsigned int stride_c; -+ unsigned int height_c; -+ unsigned int planes_c; -+ unsigned int stripes; ++ unsigned int stride_y; // Luma stride (bytes) ++ unsigned int height_y; // Luma height (lines) ++ unsigned int stride_c; // Chroma stride (bytes) ++ unsigned int height_c; // Chroma stride (lines) ++ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1) ++ unsigned int stripes; // Number of stripes (sand) ++ unsigned int bytes_per_pel; +} AVRpiZcFrameGeometry; + + @@ -19028,7 +28566,7 @@ index 0000000..f4aeb78 +// the data, then allocate a new buffer and copy the data into it +// Otherwise return NULL +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, -+ const AVFrame * const frame, const int maycopy); ++ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy); + +// Get the vc_handle from the frame ref +// Returns -1 if ref doesn't look valid @@ -19069,52 +28607,10 @@ index 0000000..f4aeb78 + + + -+static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame) -+{ -+ return frame->linesize[3]; -+} -+ -+static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ const unsigned int stride1 = frame->linesize[0]; -+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y + stride2 * x2; -+} -+ -+static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) -+{ -+ const unsigned int stride1 = frame->linesize[0]; -+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); -+ const unsigned int x = x_c * 2; -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y_c + stride2 * x2; -+} -+ -+static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y); -+} -+ -+static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y); -+} -+ -+static inline int rpi_sliced_frame(const AVFrame * const frame) -+{ -+ return frame->format == AV_PIX_FMT_SAND128; -+} -+ -+ +#endif + diff --git a/libavcodec/utils.c b/libavcodec/utils.c -index 0c68836..b8139f5 100644 +index 9363026695..8a8b13f0df 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -26,6 +26,12 @@ @@ -19130,7 +28626,15 @@ index 0c68836..b8139f5 100644 #include "libavutil/atomic.h" #include "libavutil/attributes.h" #include "libavutil/avassert.h" -@@ -64,6 +70,10 @@ +@@ -39,6 +45,7 @@ + #include 
"libavutil/mathematics.h" + #include "libavutil/mem_internal.h" + #include "libavutil/pixdesc.h" ++#include "libavutil/rpi_sand_fns.h" + #include "libavutil/imgutils.h" + #include "libavutil/samplefmt.h" + #include "libavutil/dict.h" +@@ -64,6 +71,10 @@ #include "libavutil/ffversion.h" const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION; @@ -19141,7 +28645,7 @@ index 0c68836..b8139f5 100644 #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS static int default_lockmgr_cb(void **arg, enum AVLockOp op) { -@@ -508,6 +518,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, +@@ -508,6 +519,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, return ret; } @@ -19189,7 +28693,7 @@ index 0c68836..b8139f5 100644 static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) { FramePool *pool = avctx->internal->pool; -@@ -555,6 +606,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) +@@ -555,6 +607,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) av_buffer_pool_uninit(&pool->pools[i]); pool->linesize[i] = linesize[i]; if (size[i]) { @@ -19204,20 +28708,20 @@ index 0c68836..b8139f5 100644 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1, CONFIG_MEMORY_POISONING ? NULL : -@@ -729,6 +788,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags +@@ -729,6 +789,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags { int ret; +#ifdef RPI + // This is going to end badly if we let it continue -+ av_assert0(frame->format != AV_PIX_FMT_SAND128); ++ av_assert0(!av_rpi_is_sand_frame(frame)); +#endif + if (avctx->hw_frames_ctx) return av_hwframe_get_buffer(avctx->hw_frames_ctx, frame, 0); diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c -index ecfb872..5fa099f 100644 +index 7d306a5c33..20eeda97aa 100644 --- a/libavfilter/avfilter.c +++ b/libavfilter/avfilter.c @@ -969,6 +969,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args) @@ -19229,7 +28733,7 @@ index ecfb872..5fa099f 100644 #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR if ( !strcmp(filter->filter->name, "format") || diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c -index 3eff152..30dfb14 100644 +index 3eff1522bd..30dfb14946 100644 --- a/libavformat/mpegts.c +++ b/libavformat/mpegts.c @@ -701,7 +701,7 @@ static const StreamType ISO_types[] = { @@ -19242,7 +28746,7 @@ index 3eff152..30dfb14 100644 { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, diff --git a/libavformat/utils.c b/libavformat/utils.c -index a82bbc7..4bf5574 100644 +index ff55fc8d97..c233f57bbd 100644 --- a/libavformat/utils.c +++ b/libavformat/utils.c @@ -748,7 +748,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in @@ -19254,8 +28758,84 @@ index a82bbc7..4bf5574 100644 continue; s->streams[i]->pts_wrap_reference = pts_wrap_reference; s->streams[i]->pts_wrap_behavior = pts_wrap_behavior; +diff --git a/libavutil/Makefile b/libavutil/Makefile +index 15d95dec67..3be954257b 100644 +--- a/libavutil/Makefile ++++ b/libavutil/Makefile +@@ -60,6 +60,8 @@ HEADERS = adler32.h \ + rational.h \ + replaygain.h \ + ripemd.h \ ++ rpi_sand_fns.h \ ++ rpi_sand_fn_pw.h \ + samplefmt.h \ + sha.h \ + sha512.h \ +@@ -138,6 +140,7 @@ OBJS = adler32.o \ + reverse.o \ + rc4.o \ + ripemd.o \ ++ rpi_sand_fns.o \ + samplefmt.o \ + sha.o \ + sha512.o \ +diff --git a/libavutil/arm/Makefile 
b/libavutil/arm/Makefile +index 5da44b0542..b74b7c4e2f 100644 +--- a/libavutil/arm/Makefile ++++ b/libavutil/arm/Makefile +@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ + + NEON-OBJS += arm/float_dsp_init_neon.o \ + arm/float_dsp_neon.o \ ++ arm/rpi_sand_neon.o \ +diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S +new file mode 100644 +index 0000000000..dbffdaefa4 +--- /dev/null ++++ b/libavutil/arm/rpi_sand_neon.S +@@ -0,0 +1,40 @@ ++#include "libavutil/arm/asm.S" ++ ++@ void rpi_sand128b_stripe_to_8_10( ++@ uint8_t * dest, [r0] ++@ const uint8_t * src1, [r1] ++@ const uint8_t * src2, [r2] ++@ unsigned int lines); [r3] ++ ++.macro stripe2_to_8, bit_depth ++ vpush {q4-q7} ++1: ++ vldm r1!, {q0-q7} ++ subs r3, #1 ++ vldm r2!, {q8-q15} ++ vqrshrn.u16 d0, q0, #\bit_depth - 8 ++ vqrshrn.u16 d1, q1, #\bit_depth - 8 ++ vqrshrn.u16 d2, q2, #\bit_depth - 8 ++ vqrshrn.u16 d3, q3, #\bit_depth - 8 ++ vqrshrn.u16 d4, q4, #\bit_depth - 8 ++ vqrshrn.u16 d5, q5, #\bit_depth - 8 ++ vqrshrn.u16 d6, q6, #\bit_depth - 8 ++ vqrshrn.u16 d7, q7, #\bit_depth - 8 ++ vqrshrn.u16 d8, q8, #\bit_depth - 8 ++ vqrshrn.u16 d9, q9, #\bit_depth - 8 ++ vqrshrn.u16 d10, q10, #\bit_depth - 8 ++ vqrshrn.u16 d11, q11, #\bit_depth - 8 ++ vqrshrn.u16 d12, q12, #\bit_depth - 8 ++ vqrshrn.u16 d13, q13, #\bit_depth - 8 ++ vqrshrn.u16 d14, q14, #\bit_depth - 8 ++ vqrshrn.u16 d15, q15, #\bit_depth - 8 ++ vstm r0!, {q0-q7} ++ bne 1b ++ vpop {q4-q7} ++ bx lr ++.endm ++ ++function rpi_sand128b_stripe_to_8_10, export=1 ++ stripe2_to_8 10 ++endfunc ++ diff --git a/libavutil/buffer.c b/libavutil/buffer.c -index 8d1aa5f..649876d 100644 +index 8d1aa5fa84..649876db77 100644 --- a/libavutil/buffer.c +++ b/libavutil/buffer.c @@ -355,3 +355,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool) @@ -19269,7 +28849,7 @@ index 8d1aa5f..649876d 100644 + return buf->opaque; +} diff --git a/libavutil/buffer.h b/libavutil/buffer.h -index 73b6bd0..d907de3 100644 +index 73b6bd0b14..d907de3f1c 100644 --- a/libavutil/buffer.h +++ b/libavutil/buffer.h @@ -284,6 +284,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool); @@ -19283,7 +28863,7 @@ index 73b6bd0..d907de3 100644 * @} */ diff --git a/libavutil/frame.h b/libavutil/frame.h -index 7cb78a1..b94a635 100644 +index 7cb78a1a44..b94a63565f 100644 --- a/libavutil/frame.h +++ b/libavutil/frame.h @@ -127,6 +127,13 @@ enum AVFrameSideDataType { @@ -19315,10 +28895,10 @@ index 7cb78a1..b94a635 100644 /** * Structure to hold side data for an AVFrame. 
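For readers not fluent in NEON: the stripe2_to_8 macro above is just a rounding, saturating right shift done 128 bytes at a time. Each iteration loads one 128-byte row (64 uint16 samples) from each of two adjacent 16-bit stripes, narrows everything with vqrshrn.u16 #2 (add the rounding constant, shift right by two, clamp to 255) and stores a single 128-byte row of the 8-bit destination stripe. A plain-C sketch of the assumed-equivalent computation, for illustration only:

    #include <stdint.h>

    // Assumed C equivalent of rpi_sand128b_stripe_to_8_10 (not part of the
    // patch). src1/src2: two adjacent 10-in-16-bit sand stripes, 128 bytes
    // (64 samples) per row; dest: one 128-byte-wide 8-bit stripe.
    static void sand128b_stripe_to_8_10_ref(uint8_t *dest,
                                            const uint8_t *src1,
                                            const uint8_t *src2,
                                            unsigned int lines)
    {
        while (lines-- != 0) {
            const uint16_t *s1 = (const uint16_t *)src1;
            const uint16_t *s2 = (const uint16_t *)src2;
            for (unsigned int i = 0; i != 64; ++i) {
                const unsigned int v = (s1[i] + 2) >> 2;  // round to nearest
                dest[i] = v > 255 ? 255 : (uint8_t)v;     // saturate, as vqrshrn does
            }
            for (unsigned int i = 0; i != 64; ++i) {
                const unsigned int v = (s2[i] + 2) >> 2;
                dest[64 + i] = v > 255 ? 255 : (uint8_t)v;
            }
            dest += 128;  // vstm r0! steps 128 bytes per line
            src1 += 128;  // vldm r1!/r2! likewise
            src2 += 128;
        }
    }
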
diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c -index d4a7a8b..92a01a4 100644 +index d4a7a8ba3b..bf7e402373 100644 --- a/libavutil/pixdesc.c +++ b/libavutil/pixdesc.c -@@ -2158,6 +2158,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { +@@ -2158,6 +2158,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA, }, @@ -19333,26 +28913,463 @@ index d4a7a8b..92a01a4 100644 + { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ + }, + .flags = 0, -+ } ++ }, ++ [AV_PIX_FMT_SAND64_10] = { ++ .name = "sand64_10", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ ++ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */ ++ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */ ++ }, ++ .flags = 0, ++ }, }; #if FF_API_PLUS1_MINUS1 FF_ENABLE_DEPRECATION_WARNINGS diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h -index 5dafc34..0895b69 100644 +index 5dafc341a1..9af4c3e610 100644 --- a/libavutil/pixfmt.h +++ b/libavutil/pixfmt.h -@@ -314,6 +314,9 @@ enum AVPixelFormat { +@@ -314,6 +314,11 @@ enum AVPixelFormat { AV_PIX_FMT_P016LE, ///< like NV12, with 16bpp per component, little-endian AV_PIX_FMT_P016BE, ///< like NV12, with 16bpp per component, big-endian +// RPI - not on ifdef so can be got at by calling progs -+ AV_PIX_FMT_SAND128, ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions }; +diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h +new file mode 100644 +index 0000000000..52d52a2a83 +--- /dev/null ++++ b/libavutil/rpi_sand_fn_pw.h +@@ -0,0 +1,182 @@ ++// * Included twice from rpi_sand_fn with different PW ++ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// unclipped ++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x; ++ const unsigned int w = _w; ++ const unsigned int mask = stride1 - 1; ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { ++ memcpy(dst, p, w); ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst += 
dst_stride, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const uint8_t * p = p2; ++ uint8_t * d = dst; ++ memcpy(d, p1, w1); ++ d += w1; ++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { ++ memcpy(d, p, stride1); ++ } ++ memcpy(d, p, w3); ++ } ++ } ++} ++ ++// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V) ++ ++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ const pixel * p = (const pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * p = (const pixel *)p1; ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++} ++ ++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i 
= 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++} ++ ++ ++#undef pixel ++#undef STRCAT ++#undef FUNC ++ +diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c +new file mode 100644 +index 0000000000..b8bfad915e +--- /dev/null ++++ b/libavutil/rpi_sand_fns.c +@@ -0,0 +1,96 @@ ++#include "config.h" ++#include ++#include ++#include "rpi_sand_fns.h" ++#include "avassert.h" ++ ++#define PW 1 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#define PW 2 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#if HAVE_NEON ++void rpi_sand128b_stripe_to_8_10(uint8_t * dest, const uint8_t * src1, const uint8_t * src2, unsigned int lines); ++#endif ++ ++#if 1 ++// Simple round ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ const unsigned int rnd = (1 << shr) >> 1; ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n) { ++ *dst++ = (*src++ + rnd) >> shr; ++ } ++} ++#else ++// Dithered variation ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ unsigned int rnd = (1 << shr) >> 1; ++ const unsigned int mask = ((1 << shr) - 1); ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n) { ++ rnd = *src++ + (rnd & mask); ++ *dst++ = rnd >> shr; ++ } ++} ++#endif ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr) ++{ ++ const unsigned int n = dst_stride1 / 2; ++ unsigned int j; ++ ++ // This is true for our current layouts ++ av_assert0(dst_stride1 == src_stride1); ++ ++ // As we have the same stride1 for src & dest and src is wider than dest ++ // then if we loop on src we can always write contiguously to dest ++ // We make no effort to copy an exact width - round up to nearest src stripe ++ // as we will always have storage in dest for that ++ ++#if HAVE_NEON ++ if (shr == 3 && src_stride1 == 128) { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ rpi_sand128b_stripe_to_8_10(d, s1, s2, h); ++ } ++ } ++ else ++#endif ++ { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ cpy16_to_8(d + n, s2, n, shr); ++ } ++ } ++ } ++ ++ // Fix up a trailing dest half stripe ++ if (j < w) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, 
shr); ++ } ++ } ++} ++ +diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h +new file mode 100644 +index 0000000000..48948ecb47 +--- /dev/null ++++ b/libavutil/rpi_sand_fns.h +@@ -0,0 +1,127 @@ ++#ifndef AVUTIL_RPI_SAND_FNS ++#define AVUTIL_RPI_SAND_FNS ++ ++#include "libavutil/frame.h" ++ ++// For all these fns _x & _w are measured as coord * PW ++// For the C fns coords are in chroma pels (so luma / 2) ++// Strides are in bytes ++ ++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_planar_to_sand_c8(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_planar_to_sand_c16(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr); ++ ++ ++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) ++{ ++ // * We could repl;ace thios with a fixed 128 whic would allow the compiler ++ // to optimize a whole lot better ++ return frame->linesize[0]; ++} ++ ++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame) ++{ ++ return frame->linesize[3]; ++} ++ ++ ++static inline int av_rpi_is_sand_format(const int format) ++{ ++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16); ++} ++ ++static inline int av_rpi_is_sand_frame(const AVFrame * const frame) ++{ ++ return av_rpi_is_sand_format(frame->format); ++} ++ ++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame) ++{ ++ return (frame->format == AV_PIX_FMT_SAND128); ++} ++ ++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame) ++{ ++ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16); ++} ++ ++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame) ++{ ++ return av_rpi_is_sand8_frame(frame) ? 
0 : 1; ++} ++ ++// If x is measured in bytes (not pixels) then this works for sand64_16 as ++// well as sand128 - but in the general case we work that out ++ ++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y) ++{ ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame); ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y + stride2 * x2; ++} ++ ++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) ++{ ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1); ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y_c + stride2 * x2; ++} ++ ++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y); ++} ++ ++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y); ++} ++ ++#endif ++ diff --git a/libswscale/input.c b/libswscale/input.c -index 04a5190..837f633 100644 +index 04a5190711..0a188ba267 100644 --- a/libswscale/input.c +++ b/libswscale/input.c @@ -741,6 +741,13 @@ static void p016BEToUV_c(uint8_t *dstU, uint8_t *dstV, @@ -19369,36 +29386,38 @@ index 04a5190..837f633 100644 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos)) static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, -@@ -1124,6 +1131,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) +@@ -1124,6 +1131,10 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_P016BE: c->chrToYV12 = p016BEToUV_c; break; + case AV_PIX_FMT_SAND128: -+ c->chrToYV12 = sand128ToUV_c; ++ case AV_PIX_FMT_SAND64_10: ++ c->chrToYV12 = sand128ToUV_c; // NIF + break; } if (c->chrSrcHSubSample) { switch (srcFormat) { diff --git a/libswscale/utils.c b/libswscale/utils.c -index 4c9b53b..835f3aa 100644 +index 4c9b53bbeb..df8a793770 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c -@@ -254,6 +254,9 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { +@@ -254,6 +254,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { [AV_PIX_FMT_P010BE] = { 1, 1 }, [AV_PIX_FMT_P016LE] = { 1, 0 }, [AV_PIX_FMT_P016BE] = { 1, 0 }, +#ifdef RPI + [AV_PIX_FMT_SAND128] = { 1, 0 }, ++ [AV_PIX_FMT_SAND64_10] = { 1, 0 }, +#endif }; int sws_isSupportedInput(enum AVPixelFormat pix_fmt) diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt new file mode 100644 -index 0000000..2d45679 +index 0000000000..b1e99a6a89 --- /dev/null +++ b/pi-util/BUILD.txt -@@ -0,0 +1,24 @@ +@@ -0,0 +1,25 @@ +Building Pi FFmpeg +================== + @@ -19416,16 +29435,216 @@ index 0000000..2d45679 +in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a +lot of history you don't want. + -+If you have a copy of qasm.py in ../local then the .qasm sources will be ++If you have a copy of qasm.py in ../local/bin then the .qasm sources will be +rebuilt. 
Otherwise the prebuilt .c & .h files will be used. ++Likewise ../local/bin/vasmvidcore_std will enable VPU code rebuild + +pi-util/conf_p1.sh should configure for Pi1. Beware that as of this time +H265 QPU acceleration is broken on Pi1 and so it is disabled. + + +diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv +new file mode 100644 +index 0000000000..f05b7753f7 +--- /dev/null ++++ b/pi-util/conf_h265.2016.csv +@@ -0,0 +1,193 @@ ++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 ++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 ++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 ++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 ++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 ++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 ++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 ++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 ++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 ++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 ++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 ++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 ++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 ++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 ++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 ++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 ++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 ++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 ++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 ++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 ++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 ++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 ++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 
++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 ++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 ++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 ++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 ++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 ++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 
++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 ++2,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++2,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt ++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 ++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 ++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 ++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 ++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 ++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 ++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth ++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? 
++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 ++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5 ++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt ++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt ++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt ++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt ++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt ++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt ++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5 ++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5 
++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5 ++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5 ++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5 ++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5 ++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5 ++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5 ++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5 ++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5 ++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5 ++2,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5 ++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5 ++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt ++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt ++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5 ++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5 ++1,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5 ++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5 ++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5 ++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5 ++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5 ++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5 ++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5 ++2,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5 diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv new file mode 100644 -index 0000000..6082641 +index 0000000000..6082641271 --- /dev/null +++ b/pi-util/conf_h265.2016_HEVC_v1.csv @@ -0,0 +1,147 @@ @@ -19578,7 +29797,7 @@ index 0000000..6082641 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv new file mode 100644 -index 0000000..fc14f2a +index 0000000000..fc14f2a3c2 --- /dev/null +++ b/pi-util/conf_h265.csv @@ -0,0 +1,144 @@ @@ -19728,7 +29947,7 @@ index 0000000..fc14f2a +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh new file mode 100755 -index 0000000..ec25b81 +index 
0000000000..ec25b81c31 --- /dev/null +++ b/pi-util/conf_pi1.sh @@ -0,0 +1,31 @@ @@ -19765,7 +29984,7 @@ index 0000000..ec25b81 +# -Wa,-ahls diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh new file mode 100755 -index 0000000..f8e5e75 +index 0000000000..f8e5e75375 --- /dev/null +++ b/pi-util/conf_pi2.sh @@ -0,0 +1,30 @@ @@ -19801,12 +30020,13 @@ index 0000000..f8e5e75 +# -Wa,-ahls diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py new file mode 100755 -index 0000000..e96bad2 +index 0000000000..70f7be22bb --- /dev/null +++ b/pi-util/ffconf.py -@@ -0,0 +1,164 @@ +@@ -0,0 +1,174 @@ +#!/usr/bin/env python + ++import string +import os +import subprocess +import re @@ -19817,9 +30037,18 @@ index 0000000..e96bad2 + +ffmpeg_exec = "./ffmpeg" + -+def testone(fileroot, name, es_file, md5_file): ++def testone(fileroot, srcname, es_file, md5_file): + tmp_root = "/tmp" + ++ names = srcname.split('/') ++ while len(names) > 1: ++ tmp_root = os.path.join(tmp_root, names[0]) ++ del names[0] ++ name = names[0] ++ ++ if not os.path.exists(tmp_root): ++ os.makedirs(tmp_root) ++ + dec_file = os.path.join(tmp_root, name + ".dec.md5") + try: + os.remove(dec_file) @@ -19878,7 +30107,7 @@ index 0000000..e96bad2 + pass + elif ext == ".bit" or ext == ".bin": + es_file = f -+ elif ext == ".md5" or (ext == ".txt" and base[-4:] == "_md5"): ++ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")): + if md5_file == "?": + md5_file = f + elif base[-3:] == "yuv": @@ -19890,9 +30119,9 @@ index 0000000..e96bad2 + if not tests: + return True + for t in tests: -+ if name[0:len(t)] == t: ++ if name[0:len(t)] == t or name.find("/" + t) != -1: + return True -+ return False ++ return False + +def doconf(csva, tests, test_root): + unx_failures = [] @@ -19954,9 +30183,9 @@ index 0000000..e96bad2 + + argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester") + argp.add_argument("tests", nargs='*') -+ argp.add_argument("--test_root", default="/opt/conform/h265", help="Root dir for test") ++ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test") + argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") -+ argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename") ++ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename") + args = argp.parse_args() + + if args.csvgen: @@ -19969,14 +30198,169 @@ index 0000000..e96bad2 + + doconf(csva, args.tests, args.test_root) + +diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py +new file mode 100755 +index 0000000000..27cc453963 +--- /dev/null ++++ b/pi-util/ffperf.py +@@ -0,0 +1,124 @@ ++#!/usr/bin/env python3 ++ ++import time ++import string ++import os ++import tempfile ++import subprocess ++import re ++import argparse ++import sys ++import csv ++from stat import * ++ ++class tstats: ++ close_threshold = 0.01 ++ ++ def __init__(self, stats_dict=None): ++ if stats_dict != None: ++ self.name = stats_dict["name"] ++ self.elapsed = float(stats_dict["elapsed"]) ++ self.user = float(stats_dict["user"]) ++ self.sys = float(stats_dict["sys"]) ++ ++ def times_str(self): ++ ctime = self.sys + self.user ++ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed) ++ ++ def dict(self): ++ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys} ++ ++ def is_close(self, other): ++ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold ++ ++ def 
__lt__(self, other): ++ return self.elapsed < other.elapsed ++ def __gt__(self, other): ++ return self.elapsed > other.elapsed ++ ++ def time_file(name, prefix): ++ stats = tstats() ++ stats.name = name ++ start_time = time.clock_gettime(time.CLOCK_MONOTONIC); ++ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name, ++ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog); ++ pinfo = os.wait4(cproc.pid, 0) ++ end_time = time.clock_gettime(time.CLOCK_MONOTONIC); ++ stats.elapsed = end_time - start_time ++ stats.user = pinfo[2].ru_utime ++ stats.sys = pinfo[2].ru_stime ++ return stats ++ ++ ++def common_prefix(s1, s2): ++ for i in range(min(len(s1),len(s2))): ++ if s1[i] != s2[i]: ++ return s1[:i] ++ return s1[:i+1] ++ ++def main(): ++ global flog ++ ++ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog=""" ++To blank the screen before starting use "xdg-screensaver activate" ++(For some reason this doesn't seem to work from within python). ++""") ++ ++ argp.add_argument("streams", nargs='*') ++ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename") ++ argp.add_argument("--csv_in", help="CSV input filename") ++ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).") ++ ++ args = argp.parse_args() ++ ++ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"]) ++ csv_out.writeheader() ++ ++ stats_in = {} ++ if args.csv_in != None: ++ with open(args.csv_in, 'r', newline='') as f_in: ++ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} ++ ++ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt") ++ ++ streams = args.streams ++ if not streams: ++ if not stats_in: ++ print ("No source streams specified") ++ return 1 ++ prefix = "" if args.prefix == None else args.prefix ++ streams = [k for k in stats_in] ++ elif args.prefix != None: ++ prefix = args.prefix ++ else: ++ prefix = streams[0] ++ for f in streams[1:]: ++ prefix = common_prefix(prefix, f) ++ pp = prefix.rpartition(os.sep) ++ prefix = pp[0] + pp[1] ++ streams = [s[len(prefix):] for s in streams] ++ ++ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()): ++ print ("====", f) ++ ++ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999}) ++ for i in range(3): ++ t = tstats.time_file(f, prefix) ++ print ("...", t.times_str()) ++ if t0 > t: ++ t0 = t ++ ++ if t0.name in stats_in: ++ pstat = stats_in[t0.name] ++ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str()) ++ ++ csv_out.writerow(t0.dict()) ++ ++ print () ++ ++ return 0 ++ ++ ++if __name__ == '__main__': ++ exit(main()) ++ +diff --git a/pi-util/make_array.py b/pi-util/make_array.py +new file mode 100755 +index 0000000000..864fa5e704 +--- /dev/null ++++ b/pi-util/make_array.py +@@ -0,0 +1,19 @@ ++#!/usr/bin/env python ++ ++# Usage ++# make_array file.bin ++# Produces file.h with array of bytes. 
++# ++import sys ++for file in sys.argv[1:]: ++ prefix,suffix = file.split('.') ++ assert suffix=='bin' ++ name=prefix.split('/')[-1] ++ print 'Converting',file ++ with open(prefix+'.h','wb') as out: ++ print >>out, 'static const unsigned char',name,'[] = {' ++ with open(file,'rb') as fd: ++ for byte in fd.read(): ++ print >>out, '%d,' % ord(byte) ++ print >>out,'};' ++ diff --git a/pi-util/qem.sh b/pi-util/qem.sh new file mode 100755 -index 0000000..47dd071 +index 0000000000..5ce2eeaf72 --- /dev/null +++ b/pi-util/qem.sh @@ -0,0 +1,9 @@ +TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex -+QASM=python\ pi-util/qasm.py ++QASM=python\ ../local/bin/qasm.py +SRC_FILE=libavcodec/rpi_shader.qasm +DST_BASE=shader + @@ -19986,7 +30370,7 @@ index 0000000..47dd071 + diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py new file mode 100755 -index 0000000..5935a11 +index 0000000000..5935a11ca5 --- /dev/null +++ b/pi-util/v3dusage.py @@ -0,0 +1,128 @@ @@ -20118,4 +30502,3 @@ index 0000000..5935a11 + + do_logparse(args.logfile) + -
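Stepping back from the utility scripts, one detail of the sand helpers earlier in this patch deserves a worked example: the stripe-stride comment in rpi_zc.c notes that keeping linesize[3] as lines-per-stripe turns the stripe split into masks and an extra multiply instead of a divide, and av_rpi_sand_frame_off_y() in rpi_sand_fns.h is exactly that. A self-contained demonstration with assumed, illustrative numbers:

    #include <stdio.h>

    // Worked example of sand addressing (illustrative numbers, not from the
    // patch): stride1 = 128 bytes (stripe width), stride2 = 272 lines per
    // stripe (height_y + height_c, stashed in linesize[3]).
    int main(void)
    {
        const unsigned int stride1 = 128;
        const unsigned int stride2 = 272;
        const unsigned int x = 300, y = 7;  // byte coordinate in the plane

        // stride1 is a power of two, so no division is needed:
        const unsigned int x1 = x & (stride1 - 1);  // offset within the stripe
        const unsigned int x2 = x ^ x1;             // byte-x of the stripe start
        const unsigned int off = x1 + stride1 * y + stride2 * x2;

        // The same offset computed the slow way, with a divide:
        const unsigned int ref = (x / stride1) * (stride1 * stride2)
                               + y * stride1 + x % stride1;

        printf("off=%u ref=%u\n", off, ref);  // both print 70572
        return 0;
    }
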