diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk
index c1732c2d35..cdcc49e5fc 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.2sf"
-PKG_VERSION="2bc20c5"
-PKG_SHA256="e595cce4aa616c6f36bc110626172be43c87ffe013d2c6aa20aa2cdbaba49b39"
+PKG_VERSION="0f2298f"
+PKG_SHA256="354ae9f98e83b3a9b614076cc665ada7792c77a6707f85712088c4c90c772fec"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk
index 51e66313e2..e20f45f9f5 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.dumb"
-PKG_VERSION="fbe7090"
-PKG_SHA256="8b91aec227250e9ec25010db1775aaf443e8923618b5097a9805db286929c7da"
+PKG_VERSION="be3e3d6"
+PKG_SHA256="f95d1175cba66b4443b089d8e788a0709c92d4556d45b1abc11647cb67e2a34d"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk
index fe15c40aee..b61f3997cb 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.gsf"
-PKG_VERSION="6af240a"
-PKG_SHA256="1b0dbb73c7c071d892798d9ece2980bbe4eddf90d9f0fb99ff646aa55fac6061"
+PKG_VERSION="67d9cd8"
+PKG_SHA256="7ded6afc8dab0c65a1795845288f86bfe37ce376c6bcd389e9b624d240dd93ce"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk
index 485de8fbaf..f0c75733e2 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.modplug"
-PKG_VERSION="0f5274e"
-PKG_SHA256="31ece57a1848c53b135f4971939ec723ea8368d51fa9431365427c60b52fad00"
+PKG_VERSION="72018cd"
+PKG_SHA256="e799c0a7405c4df89058b91b0925f0e7860d750c1613e3ef38e141f12fa78904"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk
index bbe61b19e4..ba53a12ee6 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.qsf"
-PKG_VERSION="945257e"
-PKG_SHA256="ac7d301ff3d7b4caef0a23e88ecf7cf84da37f425c4cf3e2bc0d74731df8b3ce"
+PKG_VERSION="932874a"
+PKG_SHA256="a384b487bca722c62e31791df81a9750871308ad6c1c0434893db038efcda024"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk
index d71dfeeaec..bfff9d19b2 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.sidplay"
-PKG_VERSION="172cf89"
-PKG_SHA256="ba580bcd662791c38fd1c8ef9b824084028c15d6bcce41908cf5595ad0bd9329"
+PKG_VERSION="28bd921"
+PKG_SHA256="e0f35803697d055f5defbb1a405804149860bc49a451819ade1b00fb2724a5dc"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk
index 463839eb1b..16ec072b09 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.timidity"
-PKG_VERSION="5567057"
-PKG_SHA256="a6bfa95a08bdc2ceb8adf163d8c78c274fbb406df3dcc9d3bc78b753a8c814a7"
+PKG_VERSION="8d37e2c"
+PKG_SHA256="c99f3271409414e0675392a10b590eb77c81801eb86f69043948b65f5e706607"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk
index 95099d6951..14db4260f2 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audiodecoder.vgmstream"
-PKG_VERSION="9569fe5"
-PKG_SHA256="f872b029370dc613194bd93536558a118f352eed0ca3035f36a8202c39143d33"
+PKG_VERSION="43e05e4"
+PKG_SHA256="7b57a437514e9ac31736f72f5be7634560e116b31949969d028a1f63a51be893"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audioencoder.flac/package.mk b/packages/mediacenter/kodi-binary-addons/audioencoder.flac/package.mk
index 11d0401b01..d1b1a015b7 100644
--- a/packages/mediacenter/kodi-binary-addons/audioencoder.flac/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audioencoder.flac/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="audioencoder.flac"
-PKG_VERSION="1d540c6"
-PKG_SHA256="eb8eba562012a1048129f679ef0bd240a776fbca73b843084723af895212b0fe"
+PKG_VERSION="ed75200"
+PKG_SHA256="25f4449024fcaba0ccf519e565ae679e701dafb36035a56afd2197f6121c9bba"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk b/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk
index 53c233452e..ac5afcd75e 100644
--- a/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="imagedecoder.raw"
-PKG_VERSION="8d9c448"
-PKG_SHA256="f0ec7c790fe37131c5a51b0dbe0f095bd0329dc6601c02bd6cd4627cf994f607"
+PKG_VERSION="aa45f0a"
+PKG_SHA256="5883d0f49e0f88e00a13dfcccf622032f0e0df5b9f67e99747d98fd500bbffb8"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
index 6c19522058..358285b995 100644
--- a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="inputstream.adaptive"
-PKG_VERSION="1656efc"
-PKG_SHA256="68e72db74706dc6a03f7d19125e4c9e62868b6aed078c6fb595f8b326a54f732"
+PKG_VERSION="dde3921"
+PKG_SHA256="a3b2f2c47a9545921980fe1b81825538fd877c0ad9809ed266c80f5cba7544e6"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.kodi.tv"
 PKG_URL="https://github.com/peak3d/inputstream.adaptive/archive/$PKG_VERSION.tar.gz"
diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk
index 76acd71cd2..b76ab3cfab 100644
--- a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="inputstream.rtmp"
-PKG_VERSION="e094fa3"
-PKG_SHA256="00e82db4cac59296f267192e6fc12dbeebf63db34985ed35819680757c76c663"
+PKG_VERSION="26260c9"
+PKG_SHA256="e55d808ed6a23138aa3abe94300013fb5656cba0efe210306db92c80d523185e"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.kodi.tv"
 PKG_URL="https://github.com/notspiff/inputstream.rtmp/archive/$PKG_VERSION.tar.gz"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk
index 008d9d2978..7a05d2f2e6 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.mythtv"
-PKG_VERSION="b46e5be"
-PKG_SHA256="f9a5cd6c172ce5f4a4cb1db05dae0cf8adfc43776fd9a3a8ef55ca0865ce2e52"
+PKG_VERSION="8965048"
+PKG_SHA256="a894d858a17c448ac66ea6631004135d3170d23c15b220b1e48c149a7c4c2bfe"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
index 76091f2fb4..70dbd3b122 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.nextpvr"
-PKG_VERSION="a855663"
-PKG_SHA256="471e2ef3922bb26d5df83b2bb71a78ee322861c736dd72ae21a45593317c55ee"
+PKG_VERSION="03933e9"
+PKG_SHA256="b0d32816deed7e744e9785d23fffaa8e63d8dadb416aa841cc061f4cb559dd4d"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk
index 9e3444bf8d..0d281f75d7 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.njoy"
-PKG_VERSION="99874de"
-PKG_SHA256="3bbd2b992825d2d786f2ce86d0d7161ceb9c8c97bb3cd4a6c365cce75cc2836c"
+PKG_VERSION="cc1cb56"
+PKG_SHA256="35425e762e780fc19759cdbc504a25f23be15e0da25a58c30056aeb9709061c1"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.octonet/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.octonet/package.mk
index dcf90defed..44242c552a 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.octonet/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.octonet/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.octonet"
-PKG_VERSION="a77cf11"
-PKG_SHA256="9217e8e0bec3b882dd0c7cb30b9488be64514514e91dbad31556da1ad435b166"
+PKG_VERSION="e9b4c05"
+PKG_SHA256="01bd1f5584cc5f781c09e33e0123b70037edcda35cfc02b5d50f5536fdb56608"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk
index 635b5441fa..148260af50 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.teleboy"
-PKG_VERSION="a0c218b"
-PKG_SHA256="eabe85ec76c140c9703598266c59d1b16197dc4e3461c7c7e4d13f61051a4439"
+PKG_VERSION="94bb643"
+PKG_SHA256="92d62261385eb7b9852252070075ae968354c2dd6f96f8fd46cc2196d27e619c"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk
index 3f12d45577..933f7c5a42 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.vbox"
-PKG_VERSION="b5da811"
-PKG_SHA256="b5fd9c726df32d49bd08ed565b551c6a62b864d8768870240f0d7dc288f221ff"
+PKG_VERSION="ff01396"
+PKG_SHA256="c4d6a0dc2f89c47de7ffc1fa2e1e7b2bb92ae1bf77b5ffcbdc5dccd6537d0c35"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
index 3a90f1d8f3..3150de64dd 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.vdr.vnsi"
-PKG_VERSION="7e6e385"
-PKG_SHA256="618ba2c7c33be4df580b29c913caf47430d979d91b41013a96c006fcb9407e11"
+PKG_VERSION="f3f80d5"
+PKG_SHA256="f89bebb6b81f8ad21b520837e227fd175d7e7bc59d5d492484e3528f14c50766"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
index 97914fa313..0d687cd107 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.vuplus"
-PKG_VERSION="9bfd868"
-PKG_SHA256="54f59345f9f226c528572a274ddb26e33f0e551786e7c926f7429c35340280b4"
+PKG_VERSION="5e154bc"
+PKG_SHA256="aa193e058c746dd459665d13289411073f29c7f2d740e0f17c3b870faae19158"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
index 55ad63bf08..53a75cb170 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.wmc"
-PKG_VERSION="d820bf8"
-PKG_SHA256="723e25571da3261d70f7911dc72bd881ca394b67d2dd9b4b022fcfe2aa754acd"
+PKG_VERSION="7e2cb4b"
+PKG_SHA256="d935ecf8dcc137953698cb7ea3bc7c8e3674dfaca2c038045fcbec481d9cd35c"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
index 74d3e40485..56f40ac647 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.zattoo"
-PKG_VERSION="a24879b"
-PKG_SHA256="670308d5982dd4ce18b620c485c36aafa019acb292f7924cdc560a286c48540c"
+PKG_VERSION="73009cd"
+PKG_SHA256="431960430b354250dbb4e9f3b78fe6ee0046762a5b505139b94073580e0b05bd"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk
index 3557417f9e..356e2ddf95 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.asteroids"
-PKG_VERSION="d606895"
-PKG_SHA256="d8cf9d2ced18a6a13a11c1f8749266563ad4a847c8785241fe3dc8575b4cf69f"
+PKG_VERSION="2418981"
+PKG_SHA256="f69ce2b58494f7ba8e714c9c8f738661e0d9ff56fc96dcec225295a1359748c9"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk
index d4c8f8ddd2..d89b713e7d 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.asterwave"
-PKG_VERSION="4326ddc"
-PKG_SHA256="f29d6dd707ef5cd69abcec14af71a2e9623caf207fd12a0e9e0e0379fc3bf798"
+PKG_VERSION="5bb1c48"
+PKG_SHA256="1213695199587155d9f46a7c96586ee46cebfb1b5d373c1b1e2ba77de19381af"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk
index ec132f9b19..28fbe5d69d 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.biogenesis"
-PKG_VERSION="c1ea27c"
-PKG_SHA256="a13adce077df37926da5fb4fd4f3f61902b19c7400fa9d7dfc92ab982efd379b"
+PKG_VERSION="5241aec"
+PKG_SHA256="57185a419f7c32dfefeb7c82ed3f07f6f8840f2ac7da5d4c03d023a2cda44238"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk
index 25d9344a25..506d703c04 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.cpblobs"
-PKG_VERSION="6acb909"
-PKG_SHA256="9d238824d5cb2dccd76ef40bac2b4ec3f38e815d4167c8d86f78501c52ca7b28"
+PKG_VERSION="be324f3"
+PKG_SHA256="10669b1dd1b7f5677af468e73ec48270218c3112cc83b95168f2f3b426b38d00"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk
index 4112b21d20..7d5735c911 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.greynetic"
-PKG_VERSION="8b7b810"
-PKG_SHA256="305ab296fb6a60f538309b095332f58f67d6e542ec380a886a6107eab02e5a91"
+PKG_VERSION="6aefc4b"
+PKG_SHA256="b23ff0b2db842eebb58c147057ac835184f121c065ce0d33c2d03534ea95d28f"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk
index 053adf667f..465556b4d4 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.matrixtrails"
-PKG_VERSION="3323406"
-PKG_SHA256="f1030704c6b6e179a074edbe36fb41cce2cbe580da26ad41848aad044b690aad"
+PKG_VERSION="99c5649"
+PKG_SHA256="46da66cd6b41b02d04e1c7ad01baf9294fa76e18596bdcf0d9e1fa595a7281a4"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk
index 00a2e9ffed..477f8e41fe 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.pingpong"
-PKG_VERSION="cfd0a05"
-PKG_SHA256="c99ca83607dd9313ffde1ba809df9339cc923e1f9fc7be7c88af6b5b41b49a0a"
+PKG_VERSION="3a27396"
+PKG_SHA256="e87d270e05b446174a937b0e1d468812476f332ed0c194387adbbdf2df1c2163"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk
index 9a61c88ca6..19a70d7178 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.pyro"
-PKG_VERSION="97e997e"
-PKG_SHA256="d873b67eb516a625a07554cab44495414dfb2aea92874ee268ad35702959b01c"
+PKG_VERSION="f91a732"
+PKG_SHA256="3f016bef45d36c0b8a6ab16b6b82c7c47b433d349db9d025d049d04901457ffc"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk
index 5a5833e61b..7c93436adb 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.stars"
-PKG_VERSION="e0da61c"
-PKG_SHA256="be90a6b4158b4298ca5ebf4b25fb98d9a784c01659e2454cc0aa2e142aa935d4"
+PKG_VERSION="bb61e49"
+PKG_SHA256="418e5c0dcf010b83b2cdf7ca00ff27b663359d0706ed00ac85fd841a3e943f43"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk b/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk
index 49270135cb..6ab26bf66a 100644
--- a/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensavers.rsxs"
-PKG_VERSION="e302833"
-PKG_SHA256="9d05d2315616cb578818243d8c7cb7486f5407613f4e8ca5d87a109bc73d380c"
+PKG_VERSION="3b74bb6"
+PKG_SHA256="5ea9b045e98a3ebccd12a2c4c238f97493d9128d68f50fd208365c5666a443f2"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk b/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk
index b8543b9b00..47a774a2e3 100644
--- a/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="vfs.rar"
-PKG_VERSION="53294af"
-PKG_SHA256="5007f097ffafb64c61bf31a902959a334819fdd26eb273d52a8437382eda6200"
+PKG_VERSION="22292bc"
+PKG_SHA256="4d2df1c6dc31f46dedf828f057ed90ca83f400c6f521c1a05510f82999febcaa"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk
index 2e23137975..0158a01d6b 100644
--- a/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="visualization.fishbmc"
-PKG_VERSION="ccc919c"
-PKG_SHA256="0a642873a2ba5acea271d04600160c7143c050f6b637db7d55a76ecb627c6e21"
+PKG_VERSION="3dae2bd"
+PKG_SHA256="471765286c6054717980510edf5d49390b0d4f38289c83830a9e0a444202825c"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk
index 56ac1c1b78..748a1f518a 100644
--- a/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="visualization.goom"
-PKG_VERSION="0c93889" -PKG_SHA256="62b5b8d9f63650633a447c21f17b5341d11404a7238c2dda20283990d031cf5a" +PKG_VERSION="65f1d9c" +PKG_SHA256="7436332d329c275a5fd1a395b1312919726ed83d7d5375ca08fb305b49b2c590" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk index c9fb41eb85..0447467721 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="visualization.pictureit" -PKG_VERSION="66f88ff" -PKG_SHA256="8b91e71e4c7828a9bfa3df3fdce07aa5b0f9fab153bdf255a53833405f5f7e41" +PKG_VERSION="8eb74a6" +PKG_SHA256="358ced879c541974a4a2dbbaa7c6f633e77adb066bb639bb585e26a50820fd43" PKG_REV="2" PKG_ARCH="x86_64" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk index 3e8850c0b8..74cb56b432 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="visualization.projectm" -PKG_VERSION="a39f73b" -PKG_SHA256="5bf7d97acb5a4144b0c82397d39ea099eb9b4cc3c74aeb18f73352aee12bc06f" +PKG_VERSION="bc05ed8" +PKG_SHA256="b1e1db697502aa6810277b69d0e0141e40b6fb9cbd4f08298cceff0152544102" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk index ce97fda3ba..d839059b12 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="visualization.shadertoy" -PKG_VERSION="1717f36" -PKG_SHA256="988015445128036f79f006a0df9c1692838436b4420e418bf84a7113bfc46300" +PKG_VERSION="764d59d" +PKG_SHA256="0b050831c6f9b7de89d7cebb6d6b7984a4675db3744cd5c7c8aebaf6251c9181" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk index 4a0072f5e8..e7ab5dcf78 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="visualization.spectrum" -PKG_VERSION="9b4a792" -PKG_SHA256="6d2120bd1c1cb04233998736fd7ae43e42388b4b44ebe92331cd7c0064b37bf8" +PKG_VERSION="d75d995" +PKG_SHA256="1d838196a38bca5b1ca6b29f340165e4249513c548f9f183b2b07fbd10dae268" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk index 0ec0d8c268..5072591ebb 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk +++ 
b/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="visualization.waveform" -PKG_VERSION="0e78a14" -PKG_SHA256="19d3377424daf6fd5a90e1707e71bd8ab34fe94b9a703c184f0e17ab8a73f514" +PKG_VERSION="8204be7" +PKG_SHA256="457d861a8ef5a054339effe803b4aae801256282b098db63ae45aa90a9c30c9e" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk index ddcd2f6348..efa80c96ca 100644 --- a/packages/mediacenter/kodi/package.mk +++ b/packages/mediacenter/kodi/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="kodi" -PKG_VERSION="ef76936" -PKG_SHA256="6fd4354279bcdb6b20fcc8968ca54125027e88ad55444c8d91da1ab9b436c59d" +PKG_VERSION="9d82343" +PKG_SHA256="c22b044ca692798049b731a69bec501b88fb41a59304b9d3b85d51331d2bdea7" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="http://www.kodi.tv" diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index e8561d2993..9444284475 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -18,8 +18,8 @@ PKG_NAME="ffmpeg" # Current branch is: release/3.3-kodi -PKG_VERSION="30554d7" -PKG_SHA256="a1bc2f092e1b11ea3271a8fdcef8ec2f9bee7e1cf05f0a1b89ec7f903fee6d14" +PKG_VERSION="20f6654" +PKG_SHA256="34d4f16d529b03d276fe7cbab8c7d12c4dfd51f0c1f78c5f38fab4a66a836deb" PKG_ARCH="any" PKG_LICENSE="LGPLv2.1+" PKG_SITE="https://ffmpeg.org" diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch index 2786d22397..1fc696eac4 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch @@ -1,8 +1,16 @@ diff --git a/.gitignore b/.gitignore -index 524fb73..305632b 100644 +index 524fb73c16..bcc983739f 100644 --- a/.gitignore +++ b/.gitignore -@@ -23,6 +23,7 @@ +@@ -1,6 +1,7 @@ + *.a + *.o + *.o.* ++*.bin + *.d + *.def + *.dll +@@ -23,6 +24,7 @@ .\#* /.config /.version @@ -11,7 +19,7 @@ index 524fb73..305632b 100644 /ffplay /ffprobe diff --git a/ffmpeg.c b/ffmpeg.c -index 4b4dae4..9a7c29c 100644 +index 4b4dae47fe..0149e73f46 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -23,6 +23,11 @@ @@ -20,13 +28,21 @@ index 4b4dae4..9a7c29c 100644 +#ifdef RPI +#define RPI_DISPLAY -+#define RPI_ZERO_COPY ++#define RPI_DISPLAY_ALL 0 +#endif + #include "config.h" #include #include -@@ -69,6 +74,25 @@ +@@ -43,6 +48,7 @@ + #include "libavformat/avformat.h" + #include "libavdevice/avdevice.h" + #include "libswresample/swresample.h" ++#include "libavutil/atomic.h" + #include "libavutil/opt.h" + #include "libavutil/channel_layout.h" + #include "libavutil/parseutils.h" +@@ -69,6 +75,25 @@ # include "libavfilter/buffersrc.h" # include "libavfilter/buffersink.h" @@ -38,21 +54,21 @@ index 4b4dae4..9a7c29c 100644 +#include +#include +#include ++#include +#include +#include +#include +#include +#pragma GCC diagnostic pop -+#ifdef RPI_ZERO_COPY +#include "libavcodec/rpi_qpu.h" -+#endif ++#include "libavutil/rpi_sand_fns.h" +#include "libavcodec/rpi_zc.h" +#endif + #if HAVE_SYS_RESOURCE_H #include #include -@@ -165,6 +189,182 @@ static int restore_tty; +@@ -165,6 +190,241 @@ static int restore_tty; static void free_input_threads(void); #endif @@ -60,39 
+76,36 @@ index 4b4dae4..9a7c29c 100644 + +#define NUM_BUFFERS 4 + -+static MMAL_COMPONENT_T* rpi_display = NULL; -+static MMAL_POOL_T *rpi_pool = NULL; -+static volatile int rpi_display_count = 0; + -+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h) ++typedef struct rpi_display_env_s ++{ ++ MMAL_COMPONENT_T* display; ++ MMAL_COMPONENT_T* isp; ++ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup ++ MMAL_CONNECTION_T * conn; ++ ++ MMAL_POOL_T *rpi_pool; ++ volatile int rpi_display_count; ++ enum AVPixelFormat avfmt; ++} rpi_display_env_t; ++ ++static rpi_display_env_t * rpi_display_env = NULL; ++ ++ ++static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port) +{ + MMAL_POOL_T* pool; -+ size_t i; -+ size_t size = (w*h*3)/2; -+#ifdef RPI_ZERO_COPY + mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image? + pool = mmal_port_pool_create(port, NUM_BUFFERS, 0); + assert(pool); -+#else -+ pool = mmal_port_pool_create(port, NUM_BUFFERS, size); -+ -+ for (i = 0; i < NUM_BUFFERS; ++i) -+ { -+ MMAL_BUFFER_HEADER_T* buffer = pool->header[i]; -+ char * bufPtr = buffer->data; -+ memset(bufPtr, i*30, w*h); -+ memset(bufPtr+w*h, 128, (w*h)/2); -+ } -+#endif + + return pool; +} + +static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { -+#ifdef RPI_ZERO_COPY ++ rpi_display_env_t *const de = (rpi_display_env_t *)port->userdata; + av_rpi_zc_unref(buffer->user_data); -+ --rpi_display_count; -+#endif ++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, -1); + mmal_buffer_header_release(buffer); +} + @@ -100,9 +113,12 @@ index 4b4dae4..9a7c29c 100644 + mmal_buffer_header_release(buffer); +} + -+static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h) ++#define DISPLAY_PORT_DEPTH 4 ++ ++static rpi_display_env_t * ++display_init(const enum AVPixelFormat req_fmt, size_t x, size_t y, size_t w, size_t h) +{ -+ MMAL_COMPONENT_T* display; ++ MMAL_STATUS_T err; + MMAL_DISPLAYREGION_T region = + { + .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)}, @@ -111,51 +127,113 @@ index 4b4dae4..9a7c29c 100644 + .fullscreen = 0, + .dest_rect = {x, y, w, h} + }; ++#if RPI_ZC_SAND_8_IN_10_BUF ++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10 || av_rpi_is_sand_format(req_fmt)) ? AV_PIX_FMT_SAND128 : req_fmt; ++#else ++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10) ? AV_PIX_FMT_SAND128 : req_fmt; ++#endif + const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h); ++ rpi_display_env_t * de; ++ int isp_req = (fmt == AV_PIX_FMT_SAND64_10); + -+ bcm_host_init(); // TODO is this needed? -+ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display); -+ assert(display); ++ bcm_host_init(); // Needs to be done by someone... + -+ mmal_port_parameter_set(display->input[0], ®ion.hdr); ++ if ((de = av_mallocz(sizeof(*de))) == NULL) { ++ return NULL; ++ } ++ ++ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display); ++ av_assert0(de->display); ++ de->port_in = de->display->input[0]; ++ ++ if (isp_req) ++ { ++ mmal_component_create("vc.ril.isp", &de->isp); ++ de->port_in = de->isp->input[0]; ++ } ++ ++ mmal_port_parameter_set(de->display->input[0], ®ion.hdr); + + { -+ MMAL_ES_FORMAT_T* format = display->input[0]->format; -+ format->encoding = fmt == AV_PIX_FMT_SAND128 ? 
MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420; ++ MMAL_PORT_T * const port = de->port_in; ++ MMAL_ES_FORMAT_T* const format = port->format; ++ port->userdata = (struct MMAL_PORT_USERDATA_T *)de; ++ port->buffer_num = DISPLAY_PORT_DEPTH; ++ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : ++ fmt == AV_PIX_FMT_SAND64_10 ? MMAL_ENCODING_YUVUV64_16 : ++ MMAL_ENCODING_I420; + format->es->video.width = geo.stride_y; -+ format->es->video.height = geo.height_y; ++ format->es->video.height = (fmt == AV_PIX_FMT_SAND128 || fmt == AV_PIX_FMT_SAND64_10) ? ++ (h + 15) & ~15 : geo.height_y; // Magic + format->es->video.crop.x = 0; + format->es->video.crop.y = 0; + format->es->video.crop.width = w; + format->es->video.crop.height = h; -+ mmal_port_format_commit(display->input[0]); ++ mmal_port_format_commit(port); + } + -+ mmal_component_enable(display); ++ de->rpi_pool = display_alloc_pool(de->port_in); ++ mmal_port_enable(de->port_in,display_cb_input); + -+ rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y); ++ if (isp_req) { ++ MMAL_PORT_T * const port_out = de->isp->output[0]; ++ mmal_log_dump_port(de->port_in); ++ mmal_format_copy(port_out->format, de->port_in->format); ++ if (fmt == AV_PIX_FMT_SAND64_10) { ++ if ((err = mmal_port_parameter_set_int32(de->port_in, MMAL_PARAMETER_CCM_SHIFT, 5)) != MMAL_SUCCESS || ++ (err = mmal_port_parameter_set_int32(port_out, MMAL_PARAMETER_OUTPUT_SHIFT, 1)) != MMAL_SUCCESS) ++ { ++ av_log(NULL, AV_LOG_WARNING, "Failed to set ISP output port shift\n"); ++ } ++ else ++ av_log(NULL, AV_LOG_WARNING, "Set ISP output port shift OK\n"); + -+ mmal_port_enable(display->input[0],display_cb_input); -+ mmal_port_enable(display->control,display_cb_control); ++ } ++ port_out->format->encoding = MMAL_ENCODING_I420; ++ mmal_log_dump_port(port_out); ++ if ((err = mmal_port_format_commit(port_out)) != MMAL_SUCCESS) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Failed to set ISP output port format\n"); ++ goto fail; ++ } ++ if ((err = mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING)) != MMAL_SUCCESS) { ++ av_log(NULL, AV_LOG_ERROR, "Failed to create connection\n"); ++ goto fail; ++ } ++ if ((err = mmal_connection_enable(de->conn)) != MMAL_SUCCESS) { ++ av_log(NULL, AV_LOG_ERROR, "Failed to enable connection\n"); ++ goto fail; ++ } ++ mmal_port_enable(de->isp->control,display_cb_control); ++ mmal_component_enable(de->isp); ++ } ++ ++ mmal_component_enable(de->display); ++ mmal_port_enable(de->display->control,display_cb_control); ++ de->avfmt = fmt; + + printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt); + -+ return display; ++ return de; ++ ++fail: ++ // **** Free stuff ++ return NULL; +} + -+static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr) ++static void display_frame(struct AVCodecContext * const s, rpi_display_env_t * const de, const AVFrame* const fr) +{ + MMAL_BUFFER_HEADER_T* buf; + -+ if (!display || !rpi_pool) ++ if (de == NULL) + return; + -+ if (rpi_display_count >= 3) { ++ if (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + av_log(s, AV_LOG_VERBOSE, "Frame dropped\n"); + return; + } + -+ buf = mmal_queue_get(rpi_pool->queue); ++ buf = mmal_queue_get(de->rpi_pool->queue); + if (!buf) { + // Running too fast so drop the frame + printf("Q alloc failure\n"); @@ -165,67 +243,64 @@ index 4b4dae4..9a7c29c 100644 + buf->cmd = 0; + buf->offset = 0; // Offset to 
valid data + buf->flags = 0; -+#ifdef RPI_ZERO_COPY -+{ -+ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1); -+ if (fr_buf == NULL) { -+ mmal_buffer_header_release(buf); -+ return; -+ } -+ -+ buf->user_data = fr_buf; -+ buf->data = av_rpi_zc_vc_handle(fr_buf); -+ buf->offset = av_rpi_zc_offset(fr_buf); -+ buf->length = av_rpi_zc_length(fr_buf); -+ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); -+#if 0 + { -+ unsigned int n; -+ for (n = 0; n < fr->width; n += 128) { -+ memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2); ++ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, de->avfmt, 1); ++ if (fr_buf == NULL) { ++ mmal_buffer_header_release(buf); ++ return; + } ++ ++ buf->user_data = fr_buf; ++ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal ++ buf->offset = av_rpi_zc_offset(fr_buf); ++ buf->length = av_rpi_zc_length(fr_buf); ++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); ++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, 1); + } -+#endif -+ ++rpi_display_count; -+} -+#else -+{ -+#error YYY -+ int w = fr->width; -+ int h = fr->height; -+ int w2 = (w+31)&~31; -+ int h2 = (h+15)&~15; -+ -+ buf->length = (w2 * h2 * 3)/2; -+ buf->user_data = NULL; -+ -+ //mmal_buffer_header_mem_lock(buf); -+ memcpy(buf->data, fr->data[0], w2 * h); -+ memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4); -+ memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4); -+ //mmal_buffer_header_mem_unlock(buf); -+} -+#endif -+ -+ while (rpi_display_count >= 3) { ++#if RPI_DISPLAY_ALL ++ while (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + usleep(5000); + } ++#endif + -+ if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS) ++ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS) + { -+ printf("** send failed: depth=%d\n", rpi_display_count); -+ display_cb_input(NULL, buf); ++ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count); ++ display_cb_input(de->port_in, buf); + } +} + -+static void display_exit(MMAL_COMPONENT_T* display) ++static void display_exit(rpi_display_env_t ** const pde) +{ ++ rpi_display_env_t * const de = *pde; ++ *pde = NULL; ++ ++ if (de != NULL) { +// sleep(120); -+ if (display) { -+ mmal_component_destroy(display); -+ } -+ if (rpi_pool) { -+ mmal_port_pool_destroy(display->input[0], rpi_pool); ++ ++ if (de->port_in != NULL) { ++ mmal_port_disable(de->port_in); ++ } ++ ++ // The above disable should kick out all buffers - check that ++ if (avpriv_atomic_int_get(&de->rpi_display_count) != 0) { ++ av_log(NULL, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", avpriv_atomic_int_get(&de->rpi_display_count)); ++ } ++ ++ if (de->conn != NULL) { ++ mmal_connection_destroy(de->conn); ++ } ++ if (de->isp != NULL) { ++ mmal_component_destroy(de->isp); ++ } ++ if (de->display != NULL) { ++ mmal_component_destroy(de->display); ++ } ++ if (de->rpi_pool != NULL) { ++ mmal_port_pool_destroy(de->display->input[0], de->rpi_pool); ++ } ++ ++ av_free(de); + } +} + @@ -235,29 +310,30 @@ index 4b4dae4..9a7c29c 100644 /* sub2video hack: Convert subtitles to video with alpha to insert them in filter graphs. This is a temporary solution until libavfilter gets real subtitles support. 
-@@ -576,6 +776,11 @@ static void ffmpeg_cleanup(int ret) +@@ -576,6 +836,11 @@ static void ffmpeg_cleanup(int ret) avformat_close_input(&input_files[i]->ctx); av_freep(&input_files[i]); } + +#ifdef RPI_DISPLAY -+ display_exit(rpi_display); ++ display_exit(&rpi_display_env); +#endif + for (i = 0; i < nb_input_streams; i++) { InputStream *ist = input_streams[i]; -@@ -588,6 +793,9 @@ static void ffmpeg_cleanup(int ret) +@@ -587,7 +852,9 @@ static void ffmpeg_cleanup(int ret) + av_freep(&ist->filters); av_freep(&ist->hwaccel_device); av_freep(&ist->dts_buffer); - -+#ifdef RPI_ZERO_COPY +- ++#ifdef RPI_DISPLAY + av_rpi_zc_uninit(ist->dec_ctx); +#endif avcodec_free_context(&ist->dec_ctx); av_freep(&input_streams[i]); -@@ -618,6 +826,7 @@ static void ffmpeg_cleanup(int ret) +@@ -618,6 +885,7 @@ static void ffmpeg_cleanup(int ret) } term_exit(); ffmpeg_exited = 1; @@ -265,28 +341,28 @@ index 4b4dae4..9a7c29c 100644 } void remove_avoptions(AVDictionary **a, AVDictionary *b) -@@ -1053,6 +1262,15 @@ static void do_video_out(OutputFile *of, +@@ -1053,6 +1321,15 @@ static void do_video_out(OutputFile *of, if (ost->source_index >= 0) ist = input_streams[ost->source_index]; +#ifdef RPI_DISPLAY + if (next_picture && ist != NULL) + { -+ if (!rpi_display) -+ rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); -+ display_frame(ist->dec_ctx, rpi_display, next_picture); ++ if (rpi_display_env == NULL) ++ rpi_display_env = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); ++ display_frame(ist->dec_ctx, rpi_display_env, next_picture); + } +#endif + frame_rate = av_buffersink_get_frame_rate(filter); if (frame_rate.num > 0 && frame_rate.den > 0) duration = 1/(av_q2d(frame_rate) * av_q2d(enc->time_base)); -@@ -2884,6 +3102,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) +@@ -2884,6 +3161,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ist->dec_ctx->opaque = ist; ist->dec_ctx->get_format = get_format; ist->dec_ctx->get_buffer2 = get_buffer; + -+#ifdef RPI_ZERO_COPY ++#ifdef RPI_DISPLAY + // Overrides the above get_buffer2 + av_rpi_zc_init(ist->dec_ctx); +#endif @@ -295,39 +371,44 @@ index 4b4dae4..9a7c29c 100644 av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0); diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 0dd0c7b..b9732c5 100644 +index 0dd0c7b1bb..99755a297e 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -5,6 +5,12 @@ NAME = avcodec +@@ -5,6 +5,15 @@ NAME = avcodec HEADERS = avcodec.h \ avdct.h \ avfft.h \ + rpi_qpu.h \ + rpi_shader.h \ -+ rpi_shader_cmd.h \ ++ rpi_shader_cmd.h \ ++ rpi_shader_template.h \ ++ rpi_shader_template_fn.h \ + rpi_mailbox.h \ -+ rpi_hevc_transform.h \ ++ rpi_hevc_transform8.h \ ++ rpi_hevc_transform10.h \ + rpi_zc.h \ d3d11va.h \ dirac.h \ dv_profile.h \ -@@ -47,6 +53,10 @@ OBJS = allcodecs.o \ +@@ -47,6 +56,11 @@ OBJS = allcodecs.o \ resample.o \ resample2.o \ utils.o \ + rpi_qpu.o \ + rpi_shader.o \ ++ rpi_shader_template.o \ + rpi_mailbox.o \ + rpi_zc.o \ vorbis_parser.o \ xiph.o \ -@@ -1103,3 +1113,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h +@@ -1103,3 +1117,30 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h endif + +QASM_PY := ../local/bin/qasm.py ++VASMVIDCORE := ../local/bin/vasmvidcore_std + +ifneq ("$(wildcard $(QASM_PY))","") +$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm @@ -337,9 +418,23 @@ 
index 0dd0c7b..b9732c5 100644 + $(QASM_PY) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ +endif + -+$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h ++ifneq ("$(wildcard $(VASMVIDCORE))","") ++$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@ ++$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@ ++ ++$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin ++ python pi-util/make_array.py $< ++$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin ++ python pi-util/make_array.py $< ++ ++endif ++ ++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h ++$(SUBDIR)hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c -index 4df4772..ca05158 100644 +index 4df4772e02..ca05158de8 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -696,6 +696,7 @@ static void register_all(void) @@ -351,10 +446,10 @@ index 4df4772..ca05158 100644 REGISTER_PARSER(MJPEG, mjpeg); REGISTER_PARSER(MLP, mlp); diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index 1eeac54..a94a240 100644 +index 1eeac5449e..7e23777f5d 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile -@@ -134,9 +134,13 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ +@@ -134,9 +134,14 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ @@ -364,13 +459,14 @@ index 1eeac54..a94a240 100644 arm/hevcdsp_idct_neon.o \ - arm/hevcdsp_qpel_neon.o + arm/hevcdsp_cres_neon.o \ ++ arm/hevcdsp_res16_neon.o \ + arm/hevcdsp_qpel_neon.o \ + arm/hevcdsp_sao_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h -index fdbf86b..0a3980a 100644 +index fdbf86b45e..0a3980a1ef 100644 --- a/libavcodec/arm/cabac.h +++ b/libavcodec/arm/cabac.h @@ -26,13 +26,34 @@ @@ -553,7 +649,7 @@ index fdbf86b..0a3980a 100644 #endif /* AVCODEC_ARM_CABAC_H */ diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h new file mode 100644 -index 0000000..31d3c59 +index 0000000000..31d3c59205 --- /dev/null +++ b/libavcodec/arm/hevc_cabac.h @@ -0,0 +1,491 @@ @@ -1048,9 +1144,239 @@ index 0000000..31d3c59 +#endif /* HAVE_ARMV6T2_INLINE */ + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ +diff --git a/libavcodec/arm/hevc_idct_fn_neon.S b/libavcodec/arm/hevc_idct_fn_neon.S +new file mode 100644 +index 0000000000..380d3c8d3b +--- /dev/null ++++ b/libavcodec/arm/hevc_idct_fn_neon.S +@@ -0,0 +1,224 @@ ++@ Included multiple times from hevc_idct_neon.S ++@ Macros defined there ++ ++#define DC_SHIFT (15 - BIT_DEPTH) ++#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) ++#define TRN_SHIFT (20 - BIT_DEPTH) ++ ++function JOIN(ff_hevc_idct_4x4_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q0, r1 ++ vdup.16 q1, r1 ++ vst1.16 {q0, q1}, [r0] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_8x8_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 
++ vmov.16 q15, q8 ++ vstm r0, {q8-q15} ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_16x16_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 ++ vmov.16 q15, q8 ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ vstm r0, {q8-q15} ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_32x32_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ mov r3, #16 ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 ++ vmov.16 q15, q8 ++1: subs r3, #1 ++ vstm r0!, {q8-q15} ++ bne 1b ++ bx lr ++endfunc ++ ++ ++function JOIN(ff_hevc_transform_4x4_neon_, BIT_DEPTH), export=1 ++ vpush {d8-d15} ++ vld1.16 {q14, q15}, [r0] // coeffs ++ ldr r3, =0x00240053 // 36 and 83 ++ vmov.32 d0[0], r3 ++ ++ tr4_shift d28, d29, d30, d31, #7 ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ tr4_shift d28, d29, d30, d31, #(TRN_SHIFT) ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ vst1.16 {q14, q15}, [r0] ++ vpop {d8-d15} ++ bx lr ++endfunc ++ ++ ++ ++function JOIN(ff_hevc_transform_luma_4x4_neon_, BIT_DEPTH), export=1 ++ vpush {d8-d15} ++ vld1.16 {q14, q15}, [r0] // coeffs ++ ldr r3, =0x4a // 74 ++ vmov.32 d0[0], r3 ++ ldr r3, =0x1d // 29 ++ vmov.32 d0[1], r3 ++ ldr r3, =0x37 // 55 ++ vmov.32 d1[0], r3 ++ ++ tr4_luma_shift d28, d29, d30, d31, #7 ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ tr4_luma_shift d28, d29, d30, d31, #(TRN_SHIFT) ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ vst1.16 {q14, q15}, [r0] ++ vpop {d8-d15} ++ bx lr ++endfunc ++ ++ ++ ++function JOIN(ff_hevc_transform_8x8_neon_, BIT_DEPTH), export=1 ++ push {r4-r8} ++ vpush {d8-d15} ++ mov r5, #16 ++ ++ adrl r3, tr4f ++ vld1.16 {d0, d1}, [r3] ++ ++ // left half ++ vld1.16 {d24}, [r0], r5 ++ vld1.16 {d25}, [r0], r5 ++ vld1.16 {d26}, [r0], r5 ++ vld1.16 {d27}, [r0], r5 ++ vld1.16 {d28}, [r0], r5 ++ vld1.16 {d29}, [r0], r5 ++ vld1.16 {d30}, [r0], r5 ++ vld1.16 {d31}, [r0], r5 ++ sub r0, #128 ++ tr8_begin d25, d27, d29, d31 ++ tr4 d24, d26, d28, d30 ++ tr8_end #7 ++ vst1.16 {d2}, [r0], r5 ++ vst1.16 {d3}, [r0], r5 ++ vst1.16 {d4}, [r0], r5 ++ vst1.16 {d5}, [r0], r5 ++ vst1.16 {d6}, [r0], r5 ++ vst1.16 {d7}, [r0], r5 ++ vst1.16 {d8}, [r0], r5 ++ vst1.16 {d9}, [r0], r5 ++ sub r0, #128 ++ //skip right half if col_limit in r1 is less than 4 ++ cmp r1, #4 ++ blt 1f ++ //right half ++ add r0, #8 ++ vld1.16 {d24}, [r0], r5 ++ vld1.16 {d25}, [r0], r5 ++ vld1.16 {d26}, [r0], r5 ++ vld1.16 {d27}, [r0], r5 ++ vld1.16 {d28}, [r0], r5 ++ vld1.16 {d29}, [r0], r5 ++ vld1.16 {d30}, [r0], r5 ++ vld1.16 {d31}, [r0], r5 ++ sub r0, #128 ++ tr8_begin d25, d27, d29, d31 ++ tr4 d24, d26, d28, d30 ++ tr8_end #7 ++ vst1.16 {d2}, [r0], r5 ++ vst1.16 {d3}, [r0], r5 ++ vst1.16 {d4}, [r0], r5 ++ vst1.16 {d5}, [r0], r5 ++ vst1.16 {d6}, [r0], r5 ++ vst1.16 {d7}, [r0], r5 ++ vst1.16 {d8}, [r0], r5 ++ vst1.16 {d9}, [r0], r5 ++ sub r0, #136 ++1: ++ // top half ++ vldm r0, {q12-q15} // coeffs ++ transpose_16b_4x4 d24, d26, d28, d30 ++ transpose_16b_4x4 d25, d27, d29, d31 ++ tr8_begin d26, d30, d27, d31 ++ tr4 d24, d28, d25, d29 ++ tr8_end #(TRN_SHIFT) ++ transpose_16b_4x4 d2, d3, d4, d5 ++ transpose_16b_4x4 d6, d7, d8, d9 ++ vswp d7, d5 ++ vswp d7, d8 ++ vswp d3, d6 ++ vswp d6, d4 ++ vstm r0!, {q1-q4} ++ ++ // 
bottom half ++ vldm r0, {q12-q15} // coeffs ++ transpose_16b_4x4 d24, d26, d28, d30 ++ transpose_16b_4x4 d25, d27, d29, d31 ++ tr8_begin d26, d30, d27, d31 ++ tr4 d24, d28, d25, d29 ++ tr8_end #(TRN_SHIFT) ++ transpose_16b_4x4 d2, d3, d4, d5 ++ transpose_16b_4x4 d6, d7, d8, d9 ++ vswp d7, d5 ++ vswp d7, d8 ++ vswp d3, d6 ++ vswp d6, d4 ++ //vstm r0, {q1-q4} ++ vst1.16 {q1-q2}, [r0] ++ add r0, #32 ++ vst1.16 {q3-q4}, [r0] ++ sub r0, #32 ++ vpop {d8-d15} ++ pop {r4-r8} ++ bx lr ++endfunc ++ ++#undef DC_SHIFT ++#undef DC_ADD ++#undef TRN_SHIFT ++ diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S new file mode 100644 -index 0000000..373576b +index 0000000000..373576b4cb --- /dev/null +++ b/libavcodec/arm/hevc_misc_neon.S @@ -0,0 +1,62 @@ @@ -1118,10 +1444,10 @@ index 0000000..373576b + diff --git a/libavcodec/arm/hevcdsp_cres_neon.S b/libavcodec/arm/hevcdsp_cres_neon.S new file mode 100644 -index 0000000..880b26e +index 0000000000..bafefd4318 --- /dev/null +++ b/libavcodec/arm/hevcdsp_cres_neon.S -@@ -0,0 +1,275 @@ +@@ -0,0 +1,296 @@ +#include "libavutil/arm/asm.S" +#include "neon.S" + @@ -1138,7 +1464,8 @@ index 0000000..880b26e +@ add_residual4x4_c( +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] ++@ ptrdiff_t stride, [r2] ++@ int dc_v) [r3] + +function ff_hevc_add_residual_4x4_u_neon_8, export=1 + vld1.8 {d16}, [r0, :64], r2 @@ -1146,8 +1473,8 @@ index 0000000..880b26e + vld1.8 {d18}, [r0, :64], r2 + vld1.8 {d19}, [r0, :64], r2 + vld1.16 {q0, q1}, [r1] -+ vmov.i64 q2, #0 -+ vmov.i64 q3, #0 ++ vdup.16 q2, r3 ++ vdup.16 q3, r3 + vmovl.u8 q10, d16 + sub r0, r0, r2, lsl #2 + vmovl.u8 q11, d17 @@ -1174,9 +1501,11 @@ index 0000000..880b26e +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] ++@ int dc_v) [r3] + +function ff_hevc_add_residual_8x8_u_neon_8, export=1 + mov r12, #4 ++ vdup.16 q15, r3 +1: + vld2.8 {d16, d17}, [r0, :128], r2 + vld2.8 {d18, d19}, [r0, :128] @@ -1186,9 +1515,13 @@ index 0000000..880b26e + sub r0, r2 + vmovl.u8 q11, d18 + vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d17 + vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d19 + vqmovun.s16 d16, q0 ++ vqmovun.s16 d17, q2 + vqmovun.s16 d18, q1 ++ vqmovun.s16 d19, q3 + vst2.8 {d16, d17}, [r0, :128], r2 + vst2.8 {d18, d19}, [r0, :128], r2 + bne 1b @@ -1199,9 +1532,11 @@ index 0000000..880b26e +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] ++@ int dc_v) [r3] + +function ff_hevc_add_residual_16x16_u_neon_8, export=1 + mov r12, #16 ++ vdup.16 q15, r3 +1: + vld2.8 {q8, q9}, [r0, :256] + vld1.16 {q0, q1}, [r1, :256]! 
@@ -1210,8 +1545,12 @@ index 0000000..880b26e + vmovl.u8 q11, d17 + vqadd.s16 q0, q10 + vqadd.s16 q1, q11 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 + vqmovun.s16 d16, q0 + vqmovun.s16 d17, q1 ++ vqmovun.s16 d18, q2 ++ vqmovun.s16 d19, q3 + vst2.8 {q8, q9}, [r0, :256], r2 + bne 1b + bx lr @@ -1231,8 +1570,8 @@ index 0000000..880b26e + vld1.8 {d18}, [r0, :64], r2 + vld1.8 {d19}, [r0, :64], r2 + vld1.16 {q2, q3}, [r1] -+ vmov.i64 q0, #0 -+ vmov.i64 q1, #0 ++ vdup.16 q0, r3 ++ vdup.16 q1, r3 + vmovl.u8 q10, d16 + sub r0, r0, r2, lsl #2 + vmovl.u8 q11, d17 @@ -1262,6 +1601,7 @@ index 0000000..880b26e + +function ff_hevc_add_residual_8x8_v_neon_8, export=1 + mov r12, #4 ++ vdup.16 q15, r3 +1: + vld2.8 {d16, d17}, [r0, :128], r2 + vld2.8 {d18, d19}, [r0, :128] @@ -1272,8 +1612,12 @@ index 0000000..880b26e + vmovl.u8 q11, d19 + vqadd.s16 q0, q10 + vqadd.s16 q1, q11 ++ vaddw.u8 q2, q15, d16 ++ vaddw.u8 q3, q15, d18 + vqmovun.s16 d17, q0 ++ vqmovun.s16 d16, q2 + vqmovun.s16 d19, q1 ++ vqmovun.s16 d18, q3 + vst2.8 {d16, d17}, [r0, :128], r2 + vst2.8 {d18, d19}, [r0, :128], r2 + bne 1b @@ -1287,14 +1631,19 @@ index 0000000..880b26e + +function ff_hevc_add_residual_16x16_v_neon_8, export=1 + mov r12, #16 ++ vdup.16 q15, r3 +1: + vld2.8 {q8, q9}, [r0, :256] + vld1.16 {q0, q1}, [r1, :256]! + subs r12, #1 + vmovl.u8 q10, d18 + vmovl.u8 q11, d19 ++ vaddw.u8 q2, q15, d16 ++ vaddw.u8 q3, q15, d17 + vqadd.s16 q0, q10 + vqadd.s16 q1, q11 ++ vqmovun.s16 d16, q2 ++ vqmovun.s16 d17, q3 + vqmovun.s16 d18, q0 + vqmovun.s16 d19, q1 + vst2.8 {q8, q9}, [r0, :256], r2 @@ -1395,10 +1744,8 @@ index 0000000..880b26e +@ 32x32 chroma never occurs so NIF + +@ ============================================================================ -+ -+ diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S -index 166bddb..9bd0a42 100644 +index 166bddb104..15c4329cdb 100644 --- a/libavcodec/arm/hevcdsp_deblock_neon.S +++ b/libavcodec/arm/hevcdsp_deblock_neon.S @@ -15,7 +15,7 @@ @@ -1410,66 +1757,204 @@ index 166bddb..9bd0a42 100644 */ -@@ -31,6 +31,9 @@ +@@ -24,70 +24,238 @@ + + .macro hevc_loop_filter_chroma_start + ldr r12, [r2] +- ldr r3, [r2, #4] +- add r2, r3, r12 +- cmp r2, #0 ++ ldr r2, [r2, #4] ++ orrs r2, r12, r2, lsl #16 + it eq bxeq lr .endm +-.macro hevc_loop_filter_chroma_body +- vsubl.u8 q3, d4, d2 +- vsubl.u8 q11, d18, d19 +- vshl.i16 q3, #2 +- vadd.i16 q11, q3 +- vdup.16 d0, r12 +- vdup.16 d1, r3 +- vrshr.s16 q11, q11, #3 +- vneg.s16 q12, q0 +@ Uses: d2, d4, d18, d19 +@ Returns: d2, d4 -+@ Modifies: d0-d7, d22-d25 - .macro hevc_loop_filter_chroma_body - vsubl.u8 q3, d4, d2 - vsubl.u8 q11, d18, d19 -@@ -49,6 +52,33 @@ - vqmovun.s16 d4, q2 - .endm - ++@ Modifies: d0-d7, d22-d25, r12 + -+@ Uses r2[0:7], r2[8:15] -+@ Modifies: d0-d7, d22-d25 -+.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1 -+ vsubl.u8 q3, \Q0, \P0 -+ vsubl.u8 q11, \P1, \Q1 -+ vshl.i16 q3, #2 -+ vadd.i16 q11, q3 ++.macro hevc_loop_filter_chroma_body P1, P0, Q0, Q1 ++ vsubl.u8 q0, \Q0, \P0 ++ vsubl.u8 q1, \P1, \Q1 ++ vdup.16 d4, r2 ++ lsr r2, r2, #16 ++ vshl.i16 q0, #2 ++ ldr r12, [sp, #0] @ r12 = &no_q ++ vadd.i16 q0, q1 ++ ldrh r3, [r3] @ r3[0:8] = no_p[0], r3[8:15] = no_p[1] ++ vdup.16 d5, r2 + -+ @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all) -+ vdup.16 d0, r2 -+ vmovl.u8 q0, d0 -+ vuzp.16 d0, d1 -+ -+ vrshr.s16 q11, q11, #3 -+ vneg.s16 q12, q0 ++ vrshr.s16 q0, q0, #3 ++ ldrh r12, [r12] ++ vneg.s16 q3, q2 ++ vmin.s16 q0, q0, q2 + vmovl.u8 q2, \Q0 -+ vmin.s16 q11, q11, q0 -+ vmax.s16 q11, q11, q12 -+ vaddw.u8 q1, 
q11, \P0 -+ vsub.i16 q2, q11 ++ vmax.s16 q0, q0, q3 ++ vaddw.u8 q1, q0, \P0 ++ vsub.i16 q2, q0 ++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] + vqmovun.s16 \P0, q1 + vqmovun.s16 \Q0, q2 +.endm + ++@ Uses r2 (tc a;b) ++@ Modifies: q0-q3 ++@ On exit ++@ r12 (and flags) contain no_p;no_q ++.macro hevc_loop_filter_chroma_body_16 P1, P0, Q0, Q1, bit_depth ++ vsub.i16 q0, \Q0, \P0 ++ lsl r12, r2, #(\bit_depth - 8) ++ vsub.i16 q1, \P1, \Q1 ++ vshl.i16 q0, #2 ++ vdup.16 d4, r12 ++ lsr r12, r12, #16 ++ vadd.i16 q0, q1 ++ ldrh r3, [r3] ++ vdup.16 d5, r12 ++ ++ vrshr.s16 q0, q0, #3 ++ vneg.s16 q3, q2 ++ movw r12, #(1 << \bit_depth) - 1 ++ vmin.s16 q0, q0, q2 ++ vmax.s16 q0, q0, q3 ++ vdup.i16 q3, r12 ++ ldr r12, [sp, #0] ++ ++ vadd.i16 \P0, q0, \P0 ++ vsub.i16 \Q0, q0 ++ ++ vmov.i64 q2, #0 ++ ldrh r12, [r12] ++ vmin.s16 \P0, q3 ++ vmin.s16 \Q0, q3 ++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] ++ vmax.s16 \P0, q2 ++ vmax.s16 \Q0, q2 ++.endm ++ ++ ++@ Preserves r12 ++@ Clobbers r2 ++.macro hevc_loop_filter_uv_body2 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v ++ vsubl.u8 q0, \Q0u, \P0u ++ vsubl.u8 q1, \Q0v, \P0v ++ vsubl.u8 q2, \P1u, \Q1u ++ vsubl.u8 q3, \P1v, \Q1v ++ vshl.i16 q0, #2 ++ vshl.i16 q1, #2 ++ vadd.i16 q0, q2 ++ vdup.16 d4, r2 ++ lsr r2, #16 ++ vadd.i16 q1, q3 ++ ++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all) ++ vrshr.s16 q0, #3 ++ vdup.16 d6, r2 + vmovl.u8 q2, d4 +- vmin.s16 q11, q11, q0 +- vmax.s16 q11, q11, q12 +- vaddw.u8 q1, q11, d2 +- vsub.i16 q2, q11 +- vqmovun.s16 d2, q1 +- vqmovun.s16 d4, q2 ++ vmovl.u8 q3, d6 ++ vuzp.16 d4, d5 ++ vrshr.s16 q1, #3 ++ vuzp.16 d6, d7 ++ ++ vmin.s16 q0, q2 ++ vneg.s16 q2, q2 ++ vmin.s16 q1, q3 ++ vneg.s16 q3, q3 ++ vmax.s16 q0, q2 ++ vaddw.u8 q2, q0, \P0u ++ vmax.s16 q1, q3 ++ vaddw.u8 q3, q1, \P0v ++ ++ vqmovun.s16 \P0u, q2 ++ vmovl.u8 q2, \Q0u ++ vqmovun.s16 \P0v, q3 ++ vmovl.u8 q3, \Q0v ++ vsub.i16 q2, q0 ++ vsub.i16 q3, q1 ++ ++ vqmovun.s16 \Q0u, q2 ++ vqmovun.s16 \Q0v, q3 + .endm + ++@ Preserves r12 ++@ Clobbers r2 ++.macro hevc_loop_filter_uv_body2_16 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v, bit_depth ++ vsub.i16 q0, \Q0u, \P0u ++ vsub.i16 q1, \Q0v, \P0v ++ vsub.i16 q2, \P1u, \Q1u ++ vsub.i16 q3, \P1v, \Q1v ++ vshl.i16 q0, #2 ++ vshl.i16 q1, #2 ++ vadd.i16 q0, q2 ++ vdup.16 d4, r2 ++ lsr r2, #16 ++ vadd.i16 q1, q3 ++ ++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all) ++ vrshr.s16 q0, #3 ++ vdup.16 d6, r2 ++ vshll.u8 q2, d4, #\bit_depth - 8 ++ vshll.u8 q3, d6, #\bit_depth - 8 ++ vuzp.16 d4, d5 ++ vrshr.s16 q1, #3 ++ vuzp.16 d6, d7 ++ ++ movw r2, #(1 << \bit_depth) - 1 ++ vmin.s16 q0, q2 ++ vneg.s16 q2, q2 ++ vmin.s16 q1, q3 ++ vneg.s16 q3, q3 ++ vmax.s16 q0, q2 ++ vmov.i64 q2, #0 ++ vmax.s16 q1, q3 ++ vdup.i16 q3, r2 ++ vadd.i16 \P0u, q0 ++ vsub.i16 \Q0u, q0 ++ vadd.i16 \P0v, q1 ++ vsub.i16 \Q0v, q1 ++ ++ vmax.s16 \P0u, q2 ++ vmax.s16 \Q0u, q2 ++ vmax.s16 \P0v, q2 ++ vmax.s16 \Q0v, q2 ++ vmin.s16 \P0u, q3 ++ vmin.s16 \Q0u, q3 ++ vmin.s16 \P0v, q3 ++ vmin.s16 \Q0v, q3 ++.endm ++ + + .macro hevc_loop_filter_luma_start ldr r12, [r3] ldr r3, [r3, #4] -@@ -60,15 +90,17 @@ - lsr r3, #16 +- lsl r3, #16 +- orr r3, r12 +- cmp r3, #0 ++ orrs r3, r12, r3, lsl #16 + it eq + bxeq lr +- lsr r3, #16 .endm -.macro hevc_loop_filter_luma_body -+@ Uses: r2, r3, r12 -+@ Modifies: r5, r6, r7, r8, r9 -+function hevc_loop_filter_luma_body -+ vmovl.u8 q15, d23 -+ vmovl.u8 q14, d22 -+ vmovl.u8 q13, d21 -+ vmovl.u8 q12, d20 -+ vmovl.u8 q11, d19 -+ vmovl.u8 q10, d18 -+ 
vmovl.u8 q9, d17 - vmovl.u8 q8, d16 +- vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 @@ -1477,46 +1962,103 @@ index 166bddb..9bd0a42 100644 - vmovl.u8 q13, d26 - vmovl.u8 q14, d28 - vmovl.u8 q15, d30 ++@ Uses: r2, r3, r12 ++@ Modifies: r5, r6, r7, r8, r9 ++ ++@ Input: ++@ r2 beta (raw: needs shift for bitdepth > 8) ++@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8) ++@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8) ++@ [sp,#96] &no_p[0] ++@ [sp,#100] &no_q[0] ++@ ++@ Input & output ++@ 8-bit: d16-d23 ++@ 16-bit: q8-q15 ++@ ++@ Output ++@ Z r10==0 ++@ r10[ 0:7 ] no_p[0] ++@ r10[ 8:15] no_p[1] ++@ r10[16:23] no_q[0] ++@ r10[24:31] no_q[1] ++ ++.macro m_filter_luma bit_depth ++.if \bit_depth == 8 ++ vmovl.u8 q15, d23 ++ vmovl.u8 q14, d22 ++ vmovl.u8 q13, d21 ++ vmovl.u8 q12, d20 ++ vmovl.u8 q11, d19 ++ vmovl.u8 q10, d18 ++ vmovl.u8 q9, d17 ++ vmovl.u8 q8, d16 ++.endif vadd.i16 q7, q9, q11 ++.if \bit_depth > 8 ++ lsl r2, r2, #(\bit_depth - 8) ++.endif vadd.i16 q6, q14, q12 -@@ -77,7 +109,6 @@ ++.if \bit_depth > 8 ++ lsl r3, r3, #(\bit_depth - 8) ++.endif + vsub.i16 q7, q10 ++ ldr r5, [sp, #96] @ Bolt no_x values together into r10 + vsub.i16 q6, q13 vabd.s16 q7, q7, q10 vabd.s16 q6, q6, q13 - - ++ ldrh r10, [r5] + vdup.16 q0, r2 vmov q4, q7 vmov q5, q6 -@@ -152,7 +183,7 @@ +- vdup.16 d4, r12 ++ ldr r5, [sp, #100] ++ vdup.16 d4, r3 ++ lsr r3, r3, #16 + vtrn.16 q7, q4 ++ ldrh r5, [r5] + vtrn.16 q6, q5 + + vshl.u64 q7, #32 + vshr.u64 q4, #32 + vshl.u64 q6, #32 ++ orr r10, r10, r5, lsl #16 + vshr.u64 q5, #32 + vshr.u64 q7, #32 + vshr.u64 q6, #32 +@@ -152,7 +320,7 @@ and r9, r8, r7 cmp r9, #0 - beq weakfilter_\@ -+ beq weakfilter_ ++ beq 1f vadd.i16 q2, q11, q12 vadd.i16 q4, q9, q8 -@@ -210,11 +241,11 @@ +@@ -210,11 +378,11 @@ vbit q13, q3, q5 vbit q14, q2, q5 -weakfilter_\@: -+weakfilter_: ++1: mvn r8, r8 and r9, r8, r7 cmp r9, #0 - beq ready_\@ -+ beq ready_ ++ beq 2f vdup.16 q4, r2 -@@ -275,75 +306,345 @@ weakfilter_\@: +@@ -275,111 +443,1041 @@ weakfilter_\@: vbit q11, q0, q5 vbit q12, q4, q5 -ready_\@: -+ready_: ++2: ++.if \bit_depth == 8 vqmovun.s16 d16, q8 - vqmovun.s16 d18, q9 - vqmovun.s16 d20, q10 @@ -1525,7 +2067,7 @@ index 166bddb..9bd0a42 100644 - vqmovun.s16 d26, q13 - vqmovun.s16 d28, q14 - vqmovun.s16 d30, q15 --.endm ++ cmp r10, #0 + vqmovun.s16 d17, q9 + vqmovun.s16 d18, q10 + vqmovun.s16 d19, q11 @@ -1533,7 +2075,30 @@ index 166bddb..9bd0a42 100644 + vqmovun.s16 d21, q13 + vqmovun.s16 d22, q14 + vqmovun.s16 d23, q15 ++.else ++ movw r12, #(1 << \bit_depth - 1) ++ vmov.i64 q0, #0 ++ vdup.i16 q1, r12 ++ @ q8 & q15 should be unaltered and so don't require clipping ++ vmax.s16 q9, q0 ++ cmp r10, #0 ++ vmax.s16 q10, q0 ++ vmax.s16 q11, q0 ++ vmax.s16 q12, q0 ++ vmax.s16 q13, q0 ++ vmax.s16 q14, q0 ++ vmin.s16 q9, q1 ++ vmin.s16 q10, q1 ++ vmin.s16 q11, q1 ++ vmin.s16 q12, q1 ++ vmin.s16 q13, q1 ++ vmin.s16 q14, q1 ++.endif + mov pc, lr + .endm + ++function hevc_loop_filter_luma_body ++ m_filter_luma 8 +endfunc + +@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), np_p (sp[0]), no_q (sp[4]), src2 (sp[8])) @@ -1545,7 +2110,16 @@ index 166bddb..9bd0a42 100644 + b v_loop_luma_common +endfunc + - ++ ++@ void ff_hevc_v_loop_filter_luma_neon( ++@ uint8_t *_pix, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int _beta, [r2] ++@ int *_tc, [r3] ++@ uint8_t *_no_p, [sp+0] ++@ uint8_t *_no_q) [sp+4] ++ ++ function ff_hevc_v_loop_filter_luma_neon, export=1 hevc_loop_filter_luma_start - push {r5-r11} @@ -1553,14 +2127,6 @@ index 
166bddb..9bd0a42 100644 + + sub r4, r0, #4 +v_loop_luma_common: -+ @ Why this isn't a bitmask to start with I have no idea... -+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 -+ ldr r5, [sp, #32] -+ ldrh r10, [r5] -+ ldr r5, [sp, #36] -+ ldrh r5, [r5] -+ orr r10, r10, r5, lsl #16 @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1] -+ vpush {d8-d15} - sub r0, #4 - vld1.8 {d16}, [r0], r1 @@ -1617,44 +2183,38 @@ index 166bddb..9bd0a42 100644 + + @ no_p[1] + tst r10, #0xff00 -+ itt ne -+ addne r4, r4, r1, lsl #2 ++ add r2, r4, r1, lsl #2 + bne 1f + vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 + vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 + vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 -+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 -+ ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32] ++1: ++ @ no_p[0] ++ tst r10, #0xff ++ bne 1f ++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r2:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r2:32], r1 ++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r2:32], r1 ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r2:32] +1: + @ no_q[1] + tst r10, #0xff000000 -+ itt ne -+ addne r0, r0, r1, lsl #2 -+ bne 2f ++ add r2, r0, r1, lsl #2 ++ bne 1f + vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 + vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 + vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 -+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 -+ -+2: -+ @ no_p[0] -+ tst r10, #0xff -+ bne 3f -+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 -+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 -+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 -+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32] -+ -+3: ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32] ++1: + @ no_q[0] + tst r10, #0xff0000 -+ bne 4f -+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 -+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 -+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 -+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32] -+ -+4: ++ bne 1f ++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r2:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 ++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r2:32], r1 ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] ++1: +bypasswrite: vpop {d8-d15} - pop {r5-r11} @@ -1662,6 +2222,81 @@ index 166bddb..9bd0a42 100644 + pop {r4-r10,pc} endfunc ++.macro m_filter_v_luma_common_16 bit_depth ++ vpush {d8-d15} ++ ++ @ Uses slightly fewer instructions to do laned loads than unlaned ++ @ and transpose. 
This also means that we can use the same code for ++ @ both split & unsplit deblock ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1 ++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 ++ ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 ++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 ++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 ++ ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 ++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 ++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4] ++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ neg r1, r1 ++ ++ @ p[1] ++ tst r10, #0xff00 ++ add r2, r4, r1, lsl #2 ++ bne 1f ++ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 ++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4] ++1: ++ @ p[0] ++ tst r10, #0xff ++ bne 1f ++ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r2], r1 ++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r2], r1 ++ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r2], r1 ++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r2] ++1: ++ @ q[1] ++ tst r10, #0xff000000 ++ add r2, r0, r1, lsl #2 ++ bne 1f ++ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 ++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0] ++1: ++ @ q[0] ++ tst r10, #0xff0000 ++ bne 1f ++ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r2], r1 ++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 ++ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r2], r1 ++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] ++1: ++ vpop {d8-d15} ++ pop {r4-r10,pc} ++.endm ++ ++ ++ ++ +@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] +@ ptrdiff_t stride, [r1] +@ int beta, [r2] @@ -1711,13 +2346,6 @@ index 166bddb..9bd0a42 100644 + neg r1, r1 + add r0, r0, r1 + -+ @ Why this isn't a bitmask to start with I have no idea... 
-+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 -+ ldr r5, [sp, #32] -+ ldrh r10, [r5] -+ ldr r5, [sp, #36] -+ ldrh r5, [r5] -+ orrs r10, r10, r5, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] + bne 1f + + vst1.8 {d22}, [r0], r1 @@ -1768,8 +2396,81 @@ index 166bddb..9bd0a42 100644 + + pop {r4-r10,pc} + - endfunc - ++endfunc ++ ++ ++.macro m_filter_h_luma_16 bit_depth ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} ++ ++ vpush {d8-d15} ++ sub r0, r0, r1, lsl #2 ++ ++ vld1.16 { q8}, [r0], r1 ++ vld1.16 { q9}, [r0], r1 ++ vld1.16 {q10}, [r0], r1 ++ vld1.16 {q11}, [r0], r1 ++ vld1.16 {q12}, [r0], r1 ++ vld1.16 {q13}, [r0], r1 ++ vld1.16 {q14}, [r0], r1 ++ vld1.16 {q15}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ vpop {d8-d15} ++ ++ sub r0, r1 ++ neg r1, r1 ++ bne 1f ++ ++ vst1.16 {q14}, [r0], r1 ++ vst1.16 {q13}, [r0], r1 ++ vst1.16 {q12}, [r0], r1 ++ vst1.16 {q11}, [r0], r1 ++ vst1.16 {q10}, [r0], r1 ++ vst1.16 { q9}, [r0] ++ pop {r4-r10,pc} ++ ++@ Partial write ++1: ++ tst r10, #0xff0000 ++ mov r2, r0 ++ bne 1f ++ vst1.16 {d28}, [r2], r1 ++ vst1.16 {d26}, [r2], r1 ++ vst1.16 {d24}, [r2] ++ ++1: ++ tst r10, #0xff000000 ++ add r2, r0, #8 ++ bne 1f ++ vst1.16 {d29}, [r2], r1 ++ vst1.16 {d27}, [r2], r1 ++ vst1.16 {d25}, [r2] ++ ++1: ++ tst r10, #0xff ++ @ r0 = r0 + r1 * 3 ++ add r0, r0, r1 ++ add r0, r0, r1, lsl # 1 ++ add r2, r0, #8 ++ bne 1f ++ vst1.16 {d22}, [r0], r1 ++ vst1.16 {d20}, [r0], r1 ++ vst1.16 {d18}, [r0] ++ ++1: ++ tst r10, #0xff00 ++ bne 1f ++ vst1.16 {d23}, [r2], r1 ++ vst1.16 {d21}, [r2], r1 ++ vst1.16 {d19}, [r2] ++ ++1: ++ pop {r4-r10,pc} ++.endm ++ ++ +@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 @@ -1783,9 +2484,7 @@ index 166bddb..9bd0a42 100644 + vld2.8 {d26,d27}, [r0], r1 + vld2.8 {d28,d29}, [r0] + sub r0, r0, r1, lsl #1 -+ hevc_loop_filter_uv_body d16, d18, d26, d28 -+ lsr r2, r2, #16 -+ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29 + cmp r3, #0 + bne 1f + vst2.8 {d18,d19}, [r0], r1 @@ -1795,122 +2494,509 @@ index 166bddb..9bd0a42 100644 + @ At least one no_f bit is set + @ Which means we need to break this apart in an ugly fashion +1: vzip.8 d18, d19 ++ lsls r2, r3, #31 @ b0 -> N, b1 -> C + vzip.8 d26, d27 + sub r1, r1, #8 + -+ tst r3, #1 -+ bne 1f ++ bmi 1f + vst1.8 {d18}, [r0] +1: add r0, r0, #8 -+ tst r3, #2 -+ bne 2f ++ bcs 2f + vst1.8 {d19}, [r0] -+2: add r0, r0, r1 ++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C ++ add r0, r0, r1 + -+ tst r3, #4 -+ bne 1f ++ bmi 1f + vst1.8 {d26}, [r0] -+1: add r0, r0, #8 -+ tst r3, #8 -+ it ne -+ bxne lr ++1: it cs ++ bxcs lr ++ add r0, r0, #8 + vst1.8 {d27}, [r0] + bx lr + +endfunc + + ++@ void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++@ ++@ Macro here actual function near bottom ++ ++.macro m_filter_h_uv_16 bit_depth ++ sub r0, r0, r1, lsl #1 ++ vld2.16 {q8, q9 }, [r0], r1 ++ vld2.16 {q10, q11}, [r0], r1 ++ vld2.16 {q12, q13}, [r0], r1 ++ vld2.16 {q14, q15}, [r0] ++ sub r0, r0, r1, lsl #1 ++ ++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth ++ ++ cmp r3, #0 ++ bne 1f ++ vst2.16 {q10, q11}, [r0], r1 ++ vst2.16 {q12, q13}, [r0] ++ bx lr ++ ++ @ At least one no_f bit is set ++ @ Which means we need to break this apart in an ugly fashion ++1: vzip.16 
q10, q11 ++ lsls r2, r3, #31 @ b0 -> N, b1 -> C ++ vzip.16 q12, q13 ++ sub r1, r1, #16 ++ ++ bmi 1f ++ vst1.16 {q10}, [r0] ++1: add r0, r0, #16 ++ bcs 2f ++ vst1.16 {q11}, [r0] ++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C ++ add r0, r0, r1 ++ ++ bmi 1f ++ vst1.16 {q12}, [r0] ++1: it cs ++ bxcs lr ++ add r0, r0, #16 ++ vst1.16 {q13}, [r0] ++ bx lr ++.endm ++ ++ +@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 +@ uint8_t * src_l, // r3 +@ unsigned int no_f); // sp[0] +@ -+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++ +function ff_hevc_v_loop_filter_uv2_neon_8, export=1 + vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1 -+ vld4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0], r1 ++ vld4.8 {d20[0], d21[0], d22[0], d23[0]}, [r0], r1 ++ sub r12, r0, r3 + + vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 -+ vld4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 ++ vld4.8 {d20[1], d21[1], d22[1], d23[1]}, [r0], r1 ++ cmp r12, #4 + + vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 -+ vld4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 ++ vld4.8 {d20[2], d21[2], d22[2], d23[2]}, [r0], r1 + + vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 -+ vld4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 ++ vld4.8 {d20[3], d21[3], d22[3], d23[3]}, [r0], r1 + + vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 -+ vld4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ vld4.8 {d20[4], d21[4], d22[4], d23[4]}, [r0], r1 + + vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 -+ vld4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 ++ vld4.8 {d20[5], d21[5], d22[5], d23[5]}, [r0], r1 + + vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 -+ vld4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 ++ vld4.8 {d20[6], d21[6], d22[6], d23[6]}, [r0], r1 + + vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3] -+ vld4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0] -+ -+ hevc_loop_filter_uv_body d16, d18, d26, d28 -+ lsr r2, r2, #16 -+ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ vld4.8 {d20[7], d21[7], d22[7], d23[7]}, [r0] ++ it eq ++ ldreq r12, [sp, #0] + ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 ++ cmp r12, #0 ++ add r3, #2 + neg r1, r1 -+ -+ ldr r2, [sp, #0] -+ -+ @ p[1] -+ tst r2, #2 -+ itt ne -+ addne r3, r3, r1, lsl #2 + bne 1f -+ vst4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3], r1 -+ vst4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 -+ vst4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 -+ vst4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 + ++@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ so it is worth having this special case ++ vst4.8 {d18[7], d19[7], d20[7], d21[7]}, [r3], r1 ++ vst4.8 {d18[6], d19[6], d20[6], d21[6]}, [r3], r1 ++ vst4.8 {d18[5], d19[5], d20[5], d21[5]}, [r3], r1 ++ vst4.8 {d18[4], d19[4], d20[4], d21[4]}, [r3], r1 ++ vst4.8 {d18[3], d19[3], d20[3], d21[3]}, [r3], r1 ++ vst4.8 {d18[2], d19[2], d20[2], d21[2]}, [r3], r1 ++ vst4.8 {d18[1], d19[1], d20[1], d21[1]}, [r3], r1 ++ vst4.8 {d18[0], d19[0], d20[0], d21[0]}, [r3] ++ bx lr ++ ++@ Either split or partial +1: -+ @ q[1] -+ tst r2, #8 -+ itt ne -+ addne r0, r0, r1, lsl #2 -+ bne 2f -+ vst4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0], r1 -+ vst4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 -+ vst4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 -+ vst4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ ldr r12, [sp, #0] ++ lsls r12, #29 @ b2 -> N, b3 -> C ++ add r2, r0, r1, lsl #2 ++ bcs 1f ++ vst2.8 {d20[7], d21[7]}, [r0], r1 
++ vst2.8 {d20[6], d21[6]}, [r0], r1 ++ vst2.8 {d20[5], d21[5]}, [r0], r1 ++ vst2.8 {d20[4], d21[4]}, [r0] ++1: ++ bmi 2f ++ vst2.8 {d20[3], d21[3]}, [r2], r1 ++ vst2.8 {d20[2], d21[2]}, [r2], r1 ++ vst2.8 {d20[1], d21[1]}, [r2], r1 ++ vst2.8 {d20[0], d21[0]}, [r2] + +2: -+ @ p[0] -+ tst r2, #1 -+ bne 3f -+ vst4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 -+ vst4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 -+ vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 -+ vst4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3] -+ ++ lsls r12, #2 ++ add r2, r3, r1, lsl #2 ++ bcs 3f ++ vst2.8 {d18[7], d19[7]}, [r3], r1 ++ vst2.8 {d18[6], d19[6]}, [r3], r1 ++ vst2.8 {d18[5], d19[5]}, [r3], r1 ++ vst2.8 {d18[4], d19[4]}, [r3] +3: -+ @ q[0] -+ tst r2, #4 -+ it ne -+ bxne lr -+ vst4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 -+ vst4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 -+ vst4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 -+ vst4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0] -+ ++ it mi ++ bxmi lr ++ vst2.8 {d18[3], d19[3]}, [r2], r1 ++ vst2.8 {d18[2], d19[2]}, [r2], r1 ++ vst2.8 {d18[1], d19[1]}, [r2], r1 ++ vst2.8 {d18[0], d19[0]}, [r2] + bx lr -+endfunc + endfunc + ++ ++@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++.macro m_filter_v_uv2_16 bit_depth ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r3], r1 ++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 ++ sub r12, r0, r3 ++ ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r3], r1 ++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ cmp r12, #8 ++ ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r3], r1 ++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 ++ ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r3], r1 ++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r3], r1 ++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r3], r1 ++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r3], r1 ++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r3] ++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] ++ it eq ++ ldreq r12, [sp, #0] ++ ++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth ++ cmp r12, #0 ++ add r3, #4 ++ neg r1, r1 ++ bne 1f ++ ++@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ so it is worth having this special case ++ vst4.16 {d21[3], d23[3],d25[3], d27[3]}, [r3], r1 ++ vst4.16 {d21[2], d23[2],d25[2], d27[2]}, [r3], r1 ++ vst4.16 {d21[1], d23[1],d25[1], d27[1]}, [r3], r1 ++ vst4.16 {d21[0], d23[0],d25[0], d27[0]}, [r3], r1 ++ vst4.16 {d20[3], d22[3],d24[3], d26[3]}, [r3], r1 ++ vst4.16 {d20[2], d22[2],d24[2], d26[2]}, [r3], r1 ++ vst4.16 {d20[1], d22[1],d24[1], d26[1]}, [r3], r1 ++ vst4.16 {d20[0], d22[0],d24[0], d26[0]}, [r3], r1 ++ bx lr ++ ++@ Either split or partial ++1: ++ ldr r12, [sp, #0] ++ lsls r12, #29 @ b2 -> N, b3 -> C ++ add r2, r0, r1, lsl #2 ++ bcs 1f ++ vst2.16 {d25[3], d27[3]}, [r0], r1 ++ vst2.16 {d25[2], d27[2]}, [r0], r1 ++ vst2.16 {d25[1], d27[1]}, [r0], r1 ++ vst2.16 {d25[0], d27[0]}, [r0] ++1: ++ bmi 2f ++ vst2.16 {d24[3], d26[3]}, [r2], r1 ++ vst2.16 {d24[2], d26[2]}, [r2], r1 ++ vst2.16 {d24[1], d26[1]}, [r2], r1 ++ vst2.16 {d24[0], d26[0]}, [r2] ++ ++2: ++ lsls r12, #2 ++ add r2, r3, r1, lsl #2 ++ 
bcs 3f ++ vst2.16 {d21[3], d23[3]}, [r3], r1 ++ vst2.16 {d21[2], d23[2]}, [r3], r1 ++ vst2.16 {d21[1], d23[1]}, [r3], r1 ++ vst2.16 {d21[0], d23[0]}, [r3] ++3: ++ it mi ++ bxmi lr ++ vst2.16 {d20[3], d22[3]}, [r2], r1 ++ vst2.16 {d20[2], d22[2]}, [r2], r1 ++ vst2.16 {d20[1], d22[1]}, [r2], r1 ++ vst2.16 {d20[0], d22[0]}, [r2] ++ bx lr ++.endm ++ + + function ff_hevc_v_loop_filter_chroma_neon, export=1 hevc_loop_filter_chroma_start ++ ++ sub r0, #2 ++ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r0], r1 ++ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0], r1 ++ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r0], r1 ++ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r0], r1 ++ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r0], r1 ++ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r0], r1 ++ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r0], r1 ++ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r0], r1 ++ ++ sub r0, r0, r1, lsl #3 ++ add r0, r0, #1 ++ hevc_loop_filter_chroma_body d16, d17, d18, d19 ++ bne 1f ++ ++ vst2.8 {d17[0], d18[0]}, [r0], r1 ++ vst2.8 {d17[1], d18[1]}, [r0], r1 ++ vst2.8 {d17[2], d18[2]}, [r0], r1 ++ vst2.8 {d17[3], d18[3]}, [r0], r1 ++ vst2.8 {d17[4], d18[4]}, [r0], r1 ++ vst2.8 {d17[5], d18[5]}, [r0], r1 ++ vst2.8 {d17[6], d18[6]}, [r0], r1 ++ vst2.8 {d17[7], d18[7]}, [r0], r1 ++ bx lr ++ ++1: ++ tst r12, #0xff @ P0a ++ bne 2f ++ ++ vst1.8 {d17[0]}, [r0], r1 ++ vst1.8 {d17[1]}, [r0], r1 ++ vst1.8 {d17[2]}, [r0], r1 ++ vst1.8 {d17[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++2: ++ tst r12, #0xff0000 @ Q0a ++ add r0, #1 ++ bne 3f ++ vst1.8 {d18[0]}, [r0], r1 ++ vst1.8 {d18[1]}, [r0], r1 ++ vst1.8 {d18[2]}, [r0], r1 ++ vst1.8 {d18[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++3: ++ tst r12, #0xff000000 @ Q0b ++ add r0, r0, r1, lsl #2 ++ bne 4f ++ vst1.8 {d18[4]}, [r0], r1 ++ vst1.8 {d18[5]}, [r0], r1 ++ vst1.8 {d18[6]}, [r0], r1 ++ vst1.8 {d18[7]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++4: ++ tst r12, #0xff00 @ P0b ++ it ne ++ bxne lr ++ ++ sub r0, #1 ++ vst1.8 {d17[4]}, [r0], r1 ++ vst1.8 {d17[5]}, [r0], r1 ++ vst1.8 {d17[6]}, [r0], r1 ++ vst1.8 {d17[7]}, [r0], r1 ++ bx lr ++ ++endfunc ++ ++ ++.macro m_filter_v_chroma_16 bit_depth ++ hevc_loop_filter_chroma_start ++ sub r0, #4 -@@ -383,3 +684,128 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1 - vst1.8 {d4}, [r0] ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r0], r1 ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0], r1 ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r0], r1 ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r0], r1 ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r0], r1 ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r0], r1 ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r0], r1 ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0], r1 ++ ++ sub r0, r0, r1, lsl #3 ++ add r0, r0, #2 ++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth ++ bne 1f ++ ++ vst2.16 {d18[0], d20[0]}, [r0], r1 ++ vst2.16 {d18[1], d20[1]}, [r0], r1 ++ vst2.16 {d18[2], d20[2]}, [r0], r1 ++ vst2.16 {d18[3], d20[3]}, [r0], r1 ++ vst2.16 {d19[0], d21[0]}, [r0], r1 ++ vst2.16 {d19[1], d21[1]}, [r0], r1 ++ vst2.16 {d19[2], d21[2]}, [r0], r1 ++ vst2.16 {d19[3], d21[3]}, [r0], r1 ++ bx lr ++ ++1: ++ tst r12, #0xff @ P0a ++ bne 2f ++ ++ vst1.16 {d18[0]}, [r0], r1 ++ vst1.16 {d18[1]}, [r0], r1 ++ vst1.16 {d18[2]}, [r0], r1 ++ vst1.16 {d18[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++2: ++ tst r12, #0xff0000 @ Q0a ++ add r0, #1 ++ bne 3f ++ vst1.16 {d20[0]}, [r0], r1 ++ vst1.16 {d20[1]}, [r0], r1 ++ vst1.16 {d20[2]}, [r0], r1 ++ vst1.16 {d20[3]}, [r0], r1 ++ sub r0, r0, r1, 
lsl #2 ++ ++3: ++ tst r12, #0xff000000 @ Q0b ++ add r0, r0, r1, lsl #2 ++ bne 4f ++ vst1.16 {d21[0]}, [r0], r1 ++ vst1.16 {d21[1]}, [r0], r1 ++ vst1.16 {d21[2]}, [r0], r1 ++ vst1.16 {d21[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++4: ++ tst r12, #0xff00 @ P0b ++ it ne ++ bxne lr ++ ++ sub r0, #1 ++ vst1.16 {d19[0]}, [r0], r1 ++ vst1.16 {d19[1]}, [r0], r1 ++ vst1.16 {d19[2]}, [r0], r1 ++ vst1.16 {d19[3]}, [r0], r1 ++ bx lr ++.endm ++ ++ ++@ void ff_hevc_h_loop_filter_chroma_neon( ++@ uint8_t *_pix, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int *_tc, [r2] ++@ uint8_t *_no_p, [r3] ++@ uint8_t *_no_q); [sp+0] ++ ++function ff_hevc_h_loop_filter_chroma_neon, export=1 ++ hevc_loop_filter_chroma_start ++ sub r0, r0, r1, lsl #1 + vld1.8 {d16}, [r0], r1 + vld1.8 {d17}, [r0], r1 + vld1.8 {d18}, [r0], r1 +- vld1.8 {d2}, [r0], r1 +- vld1.8 {d4}, [r0], r1 +- vld1.8 {d19}, [r0], r1 +- vld1.8 {d20}, [r0], r1 +- vld1.8 {d21}, [r0], r1 +- sub r0, r0, r1, lsl #3 +- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 +- hevc_loop_filter_chroma_body +- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 +- vst1.8 {d16}, [r0], r1 ++ vld1.8 {d19}, [r0] ++ sub r0, r0, r1, lsl #1 ++ hevc_loop_filter_chroma_body d16, d17, d18, d19 ++ bne 1f @ Partial write + vst1.8 {d17}, [r0], r1 +- vst1.8 {d18}, [r0], r1 +- vst1.8 {d2}, [r0], r1 +- vst1.8 {d4}, [r0], r1 +- vst1.8 {d19}, [r0], r1 +- vst1.8 {d20}, [r0], r1 +- vst1.8 {d21}, [r0] ++ vst1.8 {d18}, [r0] ++ bx lr ++1: ++ tst r12, #0xff ++ vmov r2, r3, d17 ++ it eq ++ streq r2, [r0] ++ tst r12, #0xff00 ++ it eq ++ streq r3, [r0, #4] ++ ++ add r0, r1 ++ tst r12, #0xff0000 ++ vmov r2, r3, d18 ++ it eq ++ streq r2, [r0] ++ tst r12, #0xff000000 ++ it eq ++ streq r3, [r0, #4] ++ bx lr endfunc + +-function ff_hevc_h_loop_filter_chroma_neon, export=1 ++.macro m_filter_h_chroma_16 bit_depth + hevc_loop_filter_chroma_start + sub r0, r0, r1, lsl #1 +- vld1.8 {d18}, [r0], r1 +- vld1.8 {d2}, [r0], r1 +- vld1.8 {d4}, [r0], r1 +- vld1.8 {d19}, [r0] ++ vld1.16 {q8}, [r0], r1 ++ vld1.16 {q9}, [r0], r1 ++ vld1.16 {q10}, [r0], r1 ++ vld1.16 {q11}, [r0] + sub r0, r0, r1, lsl #1 +- hevc_loop_filter_chroma_body +- vst1.8 {d2}, [r0], r1 +- vst1.8 {d4}, [r0] ++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth ++ bne 1f @ Partial write ++ vst1.16 {q9}, [r0], r1 ++ vst1.16 {q10}, [r0] ++ bx lr ++1: ++ tst r12, #0xff ++ bne 2f ++ vst1.16 {d18}, [r0] ++2: ++ tst r12, #0xff00 ++ bne 3f ++ add r0, #8 ++ vst1.16 {d19}, [r0] ++ sub r0, #8 ++3: ++ tst r12, #0xff0000 ++ add r0, r1 ++ bne 4f ++ vst1.16 {d20}, [r0] ++4: ++ tst r12, #0xff000000 ++ it ne ++ bxne lr ++ add r0, #8 ++ vst1.16 {d21}, [r0] ++ + bx lr ++.endm ++ + +/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_i + * int *curr_rpl0, int *curr_ @@ -2036,9 +3122,54 @@ index 166bddb..9bd0a42 100644 + b 11b +endfunc + ++@ ============================================================================= ++@ ++@ 10 bit ++ ++function hevc_loop_filter_luma_body_10 ++ m_filter_luma 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_luma_neon_10, export=1 ++ m_filter_h_luma_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_luma2_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} @ 8 regs = 32 bytes ++ ++ ldr r4, [sp, #40] ++ b v_loop_luma_common_10 ++endfunc ++ ++function ff_hevc_v_loop_filter_luma_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} ++ ++ sub r4, r0, #8 ++v_loop_luma_common_10: ++ m_filter_v_luma_common_16 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_uv_neon_10, export=1 ++ 
m_filter_h_uv_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_uv2_neon_10, export=1 ++ m_filter_v_uv2_16 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_chroma_neon_10, export=1 ++ m_filter_h_chroma_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_chroma_neon_10, export=1 ++ m_filter_v_chroma_16 10 + endfunc ++ diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S new file mode 100644 -index 0000000..00eab9e +index 0000000000..00eab9eeee --- /dev/null +++ b/libavcodec/arm/hevcdsp_epel_neon.S @@ -0,0 +1,337 @@ @@ -2379,11 +3510,399 @@ index 0000000..00eab9e + .byte 4, 28, 46, 6 + .byte 2, 16, 54, 4 + .byte 2, 10, 58, 2 +diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S +index e39d00634b..ee2111f9b2 100644 +--- a/libavcodec/arm/hevcdsp_idct_neon.S ++++ b/libavcodec/arm/hevcdsp_idct_neon.S +@@ -21,82 +21,6 @@ + #include "libavutil/arm/asm.S" + #include "neon.S" + +-function ff_hevc_idct_4x4_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q0, r1 +- vdup.16 q1, r1 +- vst1.16 {q0, q1}, [r0] +- bx lr +-endfunc +- +-function ff_hevc_idct_8x8_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +- vstm r0, {q8-q15} +- bx lr +-endfunc +- +-function ff_hevc_idct_16x16_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +- vstm r0!, {q8-q15} +- vstm r0!, {q8-q15} +- vstm r0!, {q8-q15} +- vstm r0, {q8-q15} +- bx lr +-endfunc +- +-function ff_hevc_idct_32x32_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- mov r3, #16 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +-1: subs r3, #1 +- vstm r0!, {q8-q15} +- bne 1b +- bx lr +-endfunc +- + function ff_hevc_add_residual_4x4_neon_8, export=1 + vldm r1, {q0-q1} + vld1.32 d4[0], [r0], r2 +@@ -168,6 +92,131 @@ function ff_hevc_add_residual_32x32_neon_8, export=1 + bx lr + endfunc + ++ ++@ ff_hevc_add_residual_4x4_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_4x4_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ ++ vld1.32 d4[0], [r0], r1 ++ vld1.32 d4[1], [r0], r1 ++ vld1.32 d5[0], [r0], r1 ++ vld1.32 d5[1], [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ vaddw.u8 q0, q15, d4 ++ vaddw.u8 q1, q15, d5 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.32 d0[0], [r0], r1 ++ vst1.32 d0[1], [r0], r1 ++ vst1.32 d1[0], [r0], r1 ++ vst1.32 d1[1], [r0], r1 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_4x4_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_4x4_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #4 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_8x8_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_8x8_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #8 ++ ++1: subs r3, #1 ++ vld1.8 d16, [r0] ++ vaddw.u8 q0, q15, d16 ++ 
vqmovun.s16 d0, q0 ++ vst1.32 d0, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_8x8_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_8x8_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #8 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_16x16_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_16x16_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #16 ++ ++1: subs r3, #1 ++ vld1.8 {q8}, [r0] ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.8 {q0}, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_16x16_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_16x16_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #16 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_32x32_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_32x32_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #32 ++ ++1: subs r3, #1 ++ vld1.8 {q8, q9}, [r0] ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d2, q2 ++ vqmovun.s16 d3, q3 ++ vst1.8 {q0, q1}, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++ + .macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.64 \r0, \r4 + vtrn.64 \r1, \r5 +@@ -263,55 +312,6 @@ endfunc + vqrshrn.s32 \r3, q3, \shift + .endm + +-function ff_hevc_transform_4x4_neon_8, export=1 +- vpush {d8-d15} +- vld1.16 {q14, q15}, [r0] // coeffs +- ldr r3, =0x00240053 // 36 and 83 +- vmov.32 d0[0], r3 +- +- tr4_shift d28, d29, d30, d31, #7 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- tr4_shift d28, d29, d30, d31, #12 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- vst1.16 {q14, q15}, [r0] +- vpop {d8-d15} +- bx lr +-endfunc +- +-function ff_hevc_transform_luma_4x4_neon_8, export=1 +- vpush {d8-d15} +- vld1.16 {q14, q15}, [r0] // coeffs +- ldr r3, =0x4a // 74 +- vmov.32 d0[0], r3 +- ldr r3, =0x1d // 29 +- vmov.32 d0[1], r3 +- ldr r3, =0x37 // 55 +- vmov.32 d1[0], r3 +- +- tr4_luma_shift d28, d29, d30, d31, #7 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- tr4_luma_shift d28, d29, d30, d31, #12 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- vst1.16 {q14, q15}, [r0] +- vpop {d8-d15} +- bx lr +-endfunc +- + .macro tr8_begin in0, in1, in2, in3 + vmull.s16 q7, \in0, d1[1] // 89 * src1 + vmull.s16 q8, \in0, d1[0] // 75 * src1 +@@ -356,100 +356,6 @@ endfunc + vqrshrn.s32 d8, q5, \shift + .endm + +-function ff_hevc_transform_8x8_neon_8, export=1 +- push {r4-r8} +- vpush {d8-d15} +- mov r5, #16 +- +- adr r3, tr4f +- vld1.16 {d0, d1}, [r3] +- +- // left half +- vld1.16 {d24}, [r0], r5 +- vld1.16 {d25}, [r0], r5 +- vld1.16 {d26}, [r0], r5 +- vld1.16 {d27}, [r0], r5 +- vld1.16 {d28}, [r0], r5 +- vld1.16 {d29}, [r0], r5 +- vld1.16 {d30}, [r0], r5 +- vld1.16 {d31}, [r0], r5 +- sub r0, #128 +- tr8_begin d25, d27, d29, d31 +- tr4 d24, d26, d28, d30 +- tr8_end #7 +- vst1.16 {d2}, [r0], r5 +- vst1.16 {d3}, [r0], r5 +- vst1.16 {d4}, [r0], r5 +- vst1.16 {d5}, [r0], r5 +- vst1.16 {d6}, [r0], r5 +- vst1.16 {d7}, [r0], r5 +- vst1.16 {d8}, [r0], r5 +- vst1.16 {d9}, [r0], r5 +- sub r0, #128 +- //skip right half if col_limit in r1 is less than 4 +- 
cmp r1, #4 +- blt 1f +- //right half +- add r0, #8 +- vld1.16 {d24}, [r0], r5 +- vld1.16 {d25}, [r0], r5 +- vld1.16 {d26}, [r0], r5 +- vld1.16 {d27}, [r0], r5 +- vld1.16 {d28}, [r0], r5 +- vld1.16 {d29}, [r0], r5 +- vld1.16 {d30}, [r0], r5 +- vld1.16 {d31}, [r0], r5 +- sub r0, #128 +- tr8_begin d25, d27, d29, d31 +- tr4 d24, d26, d28, d30 +- tr8_end #7 +- vst1.16 {d2}, [r0], r5 +- vst1.16 {d3}, [r0], r5 +- vst1.16 {d4}, [r0], r5 +- vst1.16 {d5}, [r0], r5 +- vst1.16 {d6}, [r0], r5 +- vst1.16 {d7}, [r0], r5 +- vst1.16 {d8}, [r0], r5 +- vst1.16 {d9}, [r0], r5 +- sub r0, #136 +-1: +- // top half +- vldm r0, {q12-q15} // coeffs +- transpose_16b_4x4 d24, d26, d28, d30 +- transpose_16b_4x4 d25, d27, d29, d31 +- tr8_begin d26, d30, d27, d31 +- tr4 d24, d28, d25, d29 +- tr8_end #12 +- transpose_16b_4x4 d2, d3, d4, d5 +- transpose_16b_4x4 d6, d7, d8, d9 +- vswp d7, d5 +- vswp d7, d8 +- vswp d3, d6 +- vswp d6, d4 +- vstm r0!, {q1-q4} +- +- // bottom half +- vldm r0, {q12-q15} // coeffs +- transpose_16b_4x4 d24, d26, d28, d30 +- transpose_16b_4x4 d25, d27, d29, d31 +- tr8_begin d26, d30, d27, d31 +- tr4 d24, d28, d25, d29 +- tr8_end #12 +- transpose_16b_4x4 d2, d3, d4, d5 +- transpose_16b_4x4 d6, d7, d8, d9 +- vswp d7, d5 +- vswp d7, d8 +- vswp d3, d6 +- vswp d6, d4 +- //vstm r0, {q1-q4} +- vst1.16 {q1-q2}, [r0] +- add r0, #32 +- vst1.16 {q3-q4}, [r0] +- sub r0, #32 +- vpop {d8-d15} +- pop {r4-r8} +- bx lr +-endfunc + + .align 4 + tr4f: +@@ -463,3 +369,11 @@ tr16: + .word 0x00500046 // 80, d2[2] = 70 + .word 0x0039002b // 57, d2[0] = 43 + .word 0x00190009 // 25, d2[2] = 9 ++ ++#define BIT_DEPTH 8 ++#include "hevc_idct_fn_neon.S" ++ ++#undef BIT_DEPTH ++#define BIT_DEPTH 10 ++#include "hevc_idct_fn_neon.S" ++ diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c -index 1a3912c..c87e9d3 100644 +index 1a3912c609..3b7e5bd148 100644 --- a/libavcodec/arm/hevcdsp_init_neon.c +++ b/libavcodec/arm/hevcdsp_init_neon.c -@@ -22,11 +22,26 @@ +@@ -22,11 +22,41 @@ #include "libavutil/arm/cpu.h" #include "libavcodec/hevcdsp.h" #include "hevcdsp_arm.h" @@ -2395,6 +3914,11 @@ index 1a3912c..c87e9d3 100644 void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + ++void ff_hevc_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_v_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_h_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ +#ifdef RPI +void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, + unsigned int _stride, unsigned int beta, const int32_t tc[2], @@ -2405,65 +3929,196 @@ index 1a3912c..c87e9d3 100644 +void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, + uint8_t * src_l, + unsigned int no_f); ++ ++void ff_hevc_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void 
ff_hevc_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); +#endif + void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit); void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit); void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs); -@@ -43,6 +58,52 @@ void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, +@@ -34,6 +64,15 @@ void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs); + void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs); + void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs); + void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs); ++ ++void ff_hevc_transform_4x4_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_transform_8x8_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_idct_4x4_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_8x8_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_16x16_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_32x32_dc_neon_10(int16_t *coeffs); ++void ff_hevc_transform_luma_4x4_neon_10(int16_t *coeffs); ++ + void ff_hevc_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); + void ff_hevc_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, +@@ -43,6 +82,157 @@ void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ ++void ff_hevc_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++ ++void ff_hevc_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ +#if RPI_HEVC_SAND +void ff_hevc_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); ++ ptrdiff_t stride, int dc_v); +void ff_hevc_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); ++ ptrdiff_t stride, int dc_v); +void ff_hevc_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); ++ ptrdiff_t stride, int dc_v); +void ff_hevc_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); ++ ptrdiff_t stride, int dc_u); +void ff_hevc_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); ++ ptrdiff_t stride, int dc_u); +void ff_hevc_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); ++ ptrdiff_t stride, int dc_u); +void ff_hevc_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); +void ff_hevc_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); +void 
ff_hevc_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++ ++ ++void ff_hevc_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); +#endif + -+void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); ++void ff_hevc_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); + -+void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); ++void ff_hevc_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int 
eo, int width, int height); ++void ff_hevc_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); + -+void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); ++#if RPI_HEVC_SAND ++void ff_hevc_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); + -+void ff_hevc_sao_edge_c_w64_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, -+ const int16_t *sao_offset_table_u, const int16_t *sao_offset_table_v, int eo); ++void ff_hevc_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); + -+void ff_hevc_sao_band_c_neon_8(uint8_t *_dst, const uint8_t *_src, ++void ff_hevc_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); + ++void ff_hevc_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t 
*sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++#endif ++ ++void ff_hevc_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ ++void ff_hevc_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ + #define PUT_PIXELS(name) \ void name(int16_t *dst, uint8_t *src, \ ptrdiff_t srcstride, int height, \ -@@ -58,6 +119,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); +@@ -58,6 +248,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8); #undef PUT_PIXELS @@ -2479,227 +4134,110 @@ index 1a3912c..c87e9d3 100644 static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int width); -@@ -142,14 +212,239 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t +@@ -142,14 +341,124 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); } -+static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, ++ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, ++ MvField *curr, MvField *neigh, uint8_t *bs); ++ ++ ++static void ff_hevc_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) +{ -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int8_t offset_table[32] = { 0 }; -+ int k, 
y, x; -+ int shift = 3; // BIT_DEPTH - 5 -+ int cwidth = 0; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ for (k = 0; k < 4; k++) -+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; -+ -+ if (height % 8 == 0) -+ cwidth = width; -+ -+ switch(cwidth){ -+ case 8: -+ ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 16: -+ ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 32: -+ ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 64: -+ ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ default: -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } ++ ff_hevc_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); ++} ++static void ff_hevc_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); +} + -+static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ++static void ff_hevc_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++static void ff_hevc_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++ ++#if SAO_FILTER_N == 6 ++static void ff_hevc_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); ++} ++static void ff_hevc_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); ++} ++ ++static void ff_hevc_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++} 
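++
++/* Widths of 24 and 48 are handled by splitting the block across the
++ * existing 16+8 and 32+16 wide kernels rather than by dedicated ones.
++ * Note that the pointer offsets double in the _10 variants (e.g.
++ * _dst + 32 rather than _dst + 16 for a 16 pixel step) because 10-bit
++ * pixels occupy two bytes each.
++ */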
++static void ff_hevc_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++} ++ ++#if RPI_HEVC_SAND ++static void ff_hevc_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++static void ff_hevc_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++ ++static void ff_hevc_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height) +{ -+ // Width 32 already dealt with -+ // width 16 code works in double lines -+ if (width == 16 && (height & 1) == 0) { -+ ff_hevc_sao_band_c_neon_8(_dst, _src, stride_src, stride_dst, -+ sao_offset_val_u, sao_left_class_u, -+ sao_offset_val_v, sao_left_class_v, -+ width, height); -+ } -+ else -+ { -+ const int shift = 3; // BIT_DEPTH - 5 -+ int k, y, x; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int8_t offset_table_u[32] = { 0 }; -+ int8_t offset_table_v[32] = { 0 }; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ for (k = 0; k < 4; k++) -+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; -+ for (k = 0; k < 4; k++) -+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width * 2; x += 2) -+ { -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); -+ } -+ dst += stride_dst; -+ src += stride_src; -+ -+ } -+ } ++ ff_hevc_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); +} -+ -+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 
0 : -1)) -+static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, -+ int16_t *_sao_offset_val, int eo, int width, int height) ++static void ff_hevc_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) +{ -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ int8_t sao_offset_val[8]; // padding of 3 for vld -+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE); -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ int cwidth = 0; -+ -+ for (x = 0; x < 5; x++) { -+ sao_offset_val[x] = _sao_offset_val[edge_idx[x]]; -+ } -+ -+ if (height % 8 == 0) -+ cwidth = width; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ switch (cwidth) { -+ case 32: -+ switch(eo) { -+ case 0: -+ ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 1: -+ ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 2: -+ ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 3: -+ ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ } -+ break; -+ case 64: -+ switch(eo) { -+ case 0: -+ ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 1: -+ ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 2: -+ ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 3: -+ ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ } -+ break; -+ default: -+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ int diff0 = CMP(src[x], src[x + a_stride]); -+ int diff1 = CMP(src[x], src[x + b_stride]); -+ int idx = diff0 + diff1; -+ if (idx) -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+ } ++ ff_hevc_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); +} ++#endif ++#endif + + -+static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height) -+{ -+ const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); + -+ if (width == 32 && (height & 7) == 0) { -+ ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height, _sao_offset_val_u, _sao_offset_val_v, eo); -+ } -+ else -+ { -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 
4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ int8_t sao_offset_val_u[8]; // padding of 3 for vld -+ int8_t sao_offset_val_v[8]; // padding of 3 for vld -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ -+ for (x = 0; x < 5; x++) { -+ sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]]; -+ sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]]; -+ } -+ -+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width * 2; x += 2) { -+ int diff0u = CMP(src[x], src[x + a_stride]); -+ int diff1u = CMP(src[x], src[x + b_stride]); -+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); -+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]); -+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+ } -+} -+#undef CMP -+ -+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, -+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, -+ MvField *curr, MvField *neigh, uint8_t *bs); ++#if (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) != 160 ++#error SAO edge src stride not 160 - value used in .S ++#endif + av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) { @@ -2710,7 +4248,9 @@ index 1a3912c..c87e9d3 100644 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; + c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon; c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; ++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; ++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon; +#ifdef RPI + c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8; + c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8; @@ -2719,10 +4259,14 @@ index 1a3912c..c87e9d3 100644 c->idct[0] = ff_hevc_transform_4x4_neon_8; c->idct[1] = ff_hevc_transform_8x8_neon_8; c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8; -@@ -160,7 +455,25 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -160,7 +469,53 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->add_residual[1] = ff_hevc_add_residual_8x8_neon_8; c->add_residual[2] = ff_hevc_add_residual_16x16_neon_8; c->add_residual[3] = ff_hevc_add_residual_32x32_neon_8; ++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_8; ++ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_8; ++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_8; ++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_8; +#if RPI_HEVC_SAND + c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_8; + c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_8; @@ -2733,19 +4277,43 @@ index 1a3912c..c87e9d3 100644 + c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_8; + c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_8; + c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_8; ++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_8; ++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_8; ++ 
c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_8; +#endif c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; -+ for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) { -+ c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper; -+ c->sao_band_filter_c[x] = ff_hevc_sao_band_c_neon_wrapper; -+ c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper; -+ c->sao_edge_filter_c[x] = ff_hevc_sao_edge_c_neon_wrapper; -+ } -+ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_neon_8; // width=32 ++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_8; ++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_8; ++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_8; ++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_8; ++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_8; ++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_8; ++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_8; ++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_8; ++ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_8; ++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_8; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_8; ++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_8; ++#endif ++#if RPI_HEVC_SAND ++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_8; ++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_8; ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_8; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_8; ++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_8; ++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_8; ++ ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_8; ++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_8; ++#endif ++#endif put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8; put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8; put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8; -@@ -201,7 +514,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -201,7 +556,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper; @@ -2767,22 +4335,711 @@ index 1a3912c..c87e9d3 100644 c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; -@@ -221,4 +548,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -221,4 +590,82 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8; c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8; } ++ else if (bit_depth == 10) { ++ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon_10; ++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon_10; ++ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon_10; ++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon_10; ++#ifdef RPI ++ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_10; 
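++ /* luma2/uv/uv2 are RPI-specific extended deblock entry points; this
++ 10-bit block mirrors the 8-bit RPI assignments earlier in the file. */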
++ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_10; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_10; ++#endif ++ c->idct[0] = ff_hevc_transform_4x4_neon_10; ++ c->idct[1] = ff_hevc_transform_8x8_neon_10; ++ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_10; ++ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_10; ++ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_10; ++ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_10; ++ c->add_residual[0] = ff_hevc_add_residual_4x4_neon_10; ++ c->add_residual[1] = ff_hevc_add_residual_8x8_neon_10; ++ c->add_residual[2] = ff_hevc_add_residual_16x16_neon_10; ++ c->add_residual[3] = ff_hevc_add_residual_32x32_neon_10; ++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_10; ++ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_10; ++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_10; ++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_10; ++#if RPI_HEVC_SAND ++ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_10; ++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_10; ++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_10; ++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_10; ++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_10; ++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_10; ++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_10; ++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_10; ++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_10; ++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_10; ++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_10; ++ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_10; ++#endif ++ c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_10; ++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_10; ++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_10; ++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_10; ++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_10; ++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_10; ++ ++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_10; ++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_10; ++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_10; ++ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_10; ++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_10; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_10; ++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_10; ++#endif ++#if RPI_HEVC_SAND ++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_10; ++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_10; ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_10; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_10; ++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_10; ++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_10; ++ ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_10; ++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_10; ++#endif ++#endif ++ } + + assert(offsetof(MvField, mv) == 0); + assert(offsetof(MvField, ref_idx) == 8); + assert(offsetof(MvField, pred_flag) == 10); + c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon; } +diff --git a/libavcodec/arm/hevcdsp_res16_neon.S b/libavcodec/arm/hevcdsp_res16_neon.S +new file mode 100644 +index 0000000000..7cc5cd5e5c +--- /dev/null ++++ b/libavcodec/arm/hevcdsp_res16_neon.S +@@ -0,0 +1,610 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++#define BIT_DEPTH 
10
++
++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
++ vmax.s16 \Q0, \Q_MIN
++ vmax.s16 \Q1, \Q_MIN
++ vmax.s16 \Q2, \Q_MIN
++ vmax.s16 \Q3, \Q_MIN
++ vmin.s16 \Q0, \Q_MAX
++ vmin.s16 \Q1, \Q_MAX
++ vmin.s16 \Q2, \Q_MAX
++ vmin.s16 \Q3, \Q_MAX
++.endm
++
++@ add_residual4x4(
++@ uint8_t *_dst, [r0]
++@ int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_add_residual_4x4_neon_, BIT_DEPTH), export=1
++ vld1.16 {q10, q11}, [r1]
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vld1.16 {d0}, [r0, :64], r2
++ vld1.16 {d1}, [r0, :64], r2
++ vld1.16 {d2}, [r0, :64], r2
++ vld1.16 {d3}, [r0, :64], r2
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++ vqadd.s16 q0, q10
++ vqadd.s16 q1, q11
++ sub r0, r0, r2, lsl #2
++ vmax.s16 q0, q0, q8
++ vmax.s16 q1, q1, q8
++ vmin.s16 q0, q0, q9
++ vmin.s16 q1, q1, q9
++ vst1.16 {d0}, [r0, :64], r2
++ vst1.16 {d1}, [r0, :64], r2
++ vst1.16 {d2}, [r0, :64], r2
++ vst1.16 {d3}, [r0, :64], r2
++ bx lr
++
++endfunc
++
++@ add_residual4x4_dc(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc) [r2]
++
++function JOIN(ff_hevc_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vld1.16 {d0}, [r0, :64], r1
++ vld1.16 {d1}, [r0, :64], r1
++ vdup.16 q15, r2
++ vld1.16 {d2}, [r0, :64], r1
++ vld1.16 {d3}, [r0, :64], r1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++ vqadd.s16 q0, q15
++ vqadd.s16 q1, q15
++ sub r0, r0, r1, lsl #2
++ vmax.s16 q0, q0, q8
++ vmax.s16 q1, q1, q8
++ vmin.s16 q0, q0, q9
++ vmin.s16 q1, q1, q9
++ vst1.16 {d0}, [r0, :64], r1
++ vst1.16 {d1}, [r0, :64], r1
++ vst1.16 {d2}, [r0, :64], r1
++ vst1.16 {d3}, [r0, :64], r1
++ bx lr
++
++endfunc
++
++
++@ add_residual8x8(
++@ uint8_t *_dst, [r0]
++@ int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_add_residual_8x8_neon_, BIT_DEPTH), export=1
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++ mov r12, #2
++1:
++ vldm r1!, {q10-q13}
++ vld1.16 {q0}, [r0, :128], r2
++ subs r12, #1
++ vld1.16 {q1}, [r0, :128], r2
++ vqadd.s16 q0, q10
++ vld1.16 {q2}, [r0, :128], r2
++ vqadd.s16 q1, q11
++ vld1.16 {q3}, [r0, :128], r2
++ vqadd.s16 q2, q12
++ vqadd.s16 q3, q13
++ sub r0, r0, r2, lsl #2
++ vmax.s16 q0, q0, q8
++ vmax.s16 q1, q1, q8
++ vmax.s16 q2, q2, q8
++ vmax.s16 q3, q3, q8
++ vmin.s16 q0, q0, q9
++ vmin.s16 q1, q1, q9
++ vst1.16 {q0}, [r0, :128], r2
++ vmin.s16 q2, q2, q9
++ vst1.16 {q1}, [r0, :128], r2
++ vmin.s16 q3, q3, q9
++ vst1.16 {q2}, [r0, :128], r2
++ vst1.16 {q3}, [r0, :128], r2
++ bne 1b
++ bx lr
++
++endfunc
++
++@ add_residual4x4_dc_c(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc_uv) [r2]
++
++function JOIN(ff_hevc_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1
++ mov r12, #1
++ vdup.32 q15, r2
++ b 9f
++endfunc
++
++@ add_residual8x8_dc(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc) [r2]
++
++function JOIN(ff_hevc_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1
++ mov r12, #2
++ vdup.16 q15, r2
++9:
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++1:
++ vld1.16 {q0}, [r0, :128], r1
++ subs r12, #1
++ vld1.16 {q1}, [r0, :128], r1
++ vqadd.s16 q0, q15
++ vld1.16 {q2}, [r0, :128], r1
++ vqadd.s16 q1, q15
++ vld1.16 {q3}, [r0, :128], r1
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q15
++ sub r0, r0, r1, lsl #2
++ vmax.s16 q0, q8
++ vmax.s16 q1, q8
++ vmax.s16 q2, q8
++ vmax.s16 q3, q8
++ vmin.s16 q0, q9
++ vmin.s16 q1, q9
++ vst1.16 {q0}, [r0, :128], r1
++ vmin.s16 q2, q9
++ vst1.16 {q1}, [r0, :128], r1
++ vmin.s16 q3, q9
++ vst1.16 {q2}, [r0,
:128], r1
++ vst1.16 {q3}, [r0, :128], r1
++ bne 1b
++ bx lr
++
++endfunc
++
++@ add_residual16x16(
++@ uint8_t *_dst, [r0]
++@ int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_add_residual_16x16_neon_, BIT_DEPTH), export=1
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++ mov r12, #8
++1:
++ vldm r1!, {q10-q13}
++ @ For RPI Sand we could guarantee :256 but not for general
++ @ non-RPI allocation. :128 is as good as we can claim
++ vld1.16 {q0, q1}, [r0, :128], r2
++ subs r12, #1
++ vld1.16 {q2, q3}, [r0, :128]
++ vqadd.s16 q0, q10
++ vqadd.s16 q1, q11
++ vqadd.s16 q2, q12
++ vqadd.s16 q3, q13
++ sub r0, r2
++ vmax.s16 q0, q0, q8
++ vmax.s16 q1, q1, q8
++ vmax.s16 q2, q2, q8
++ vmax.s16 q3, q3, q8
++ vmin.s16 q0, q0, q9
++ vmin.s16 q1, q1, q9
++ vmin.s16 q2, q2, q9
++ vmin.s16 q3, q3, q9
++ vst1.16 {q0, q1}, [r0, :128], r2
++ vst1.16 {q2, q3}, [r0, :128], r2
++ bne 1b
++ bx lr
++endfunc
++
++@ add_residual8x8_dc_c(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc_uv) [r2]
++
++function JOIN(ff_hevc_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1
++ mov r12, #4
++ vdup.32 q15, r2
++ b 9f
++endfunc
++
++@ add_residual16x16_dc(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc) [r2]
++
++function JOIN(ff_hevc_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1
++ vdup.i16 q15, r2
++ mov r12, #8
++9:
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++1:
++ @ For RPI Sand we could guarantee :256 but not for general
++ @ non-RPI allocation. :128 is as good as we can claim
++ vld1.16 {q0, q1}, [r0, :128], r1
++ subs r12, #1
++ vld1.16 {q2, q3}, [r0, :128]
++ vqadd.s16 q0, q15
++ vqadd.s16 q1, q15
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q15
++ sub r0, r1
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0, q1}, [r0, :128], r1
++ vst1.16 {q2, q3}, [r0, :128], r1
++ bne 1b
++ bx lr
++
++endfunc
++
++
++@ add_residual32x32(
++@ uint8_t *_dst, [r0]
++@ int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_add_residual_32x32_neon_, BIT_DEPTH), export=1
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++ mov r12, #32
++1:
++ vldm r1!, {q10-q13}
++ vldm r0, {q0-q3}
++ subs r12, #1
++ vqadd.s16 q0, q10
++ vqadd.s16 q1, q11
++ vqadd.s16 q2, q12
++ vqadd.s16 q3, q13
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vstm r0, {q0-q3}
++ add r0, r2
++ bne 1b
++ bx lr
++
++endfunc
++
++@ add_residual16x16_dc_c(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc_uv) [r2]
++
++function JOIN(ff_hevc_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1
++ mov r12, #16
++ vdup.32 q15, r2
++ b 9f
++endfunc
++
++@ add_residual32x32_dc(
++@ uint8_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc) [r2]
++
++function JOIN(ff_hevc_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1
++ vdup.i16 q15, r2
++ mov r12, #32
++9:
++ movw r3, #(1 << BIT_DEPTH) - 1
++ vmov.i64 q8, #0
++ vdup.i16 q9, r3
++1:
++ vldm r0, {q0-q3}
++ subs r12, #1
++ vqadd.s16 q0, q15
++ vqadd.s16 q1, q15
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q15
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vstm r0, {q0-q3}
++ add r0, r1
++ bne 1b
++ bx lr
++
++endfunc
++
++@ ============================================================================
++@ U add
++
++@ add_residual4x4_u(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc) [r3]
++
++function JOIN(ff_hevc_add_residual_4x4_u_neon_, BIT_DEPTH), export=1
++ vld1.16 {q10, q11}, [r1, :256]
++ vdup.16 q15, r3
++ movw r3, #(1 << BIT_DEPTH) - 1
++
vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, r0, r2, lsl #2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, r2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, #32 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256]! ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1, :256] ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, r0, r2, lsl #2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! 
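++ @ The vld2.16s above deinterleave the CbCr rows: U samples land in
++ @ q0/q2 and V samples in q1/q3, so the residual (q10/q11) is applied
++ @ to the V lanes below while the U lanes just get the dc bias in q15.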
++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, r2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, #32 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256]! ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 ++ vldm r1, {q10-q13} ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r0, r2, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++ add r3, r1, #(8*8*2) @ Offset to V ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ vld1.16 {q12, q13}, [r3, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ add r3, r1, #(16*16*2) @ Offset to V ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ vld1.16 {q12, q13}, [r3, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, #32 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst2.16 {q0, q1}, [r0, :256]! 
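++ @ First 32 bytes stored with writeback; the second store then adds r2,
++ @ which was reduced by 32 at entry, so r0 steps one full stride per row.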
++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S new file mode 100644 -index 0000000..08a021d +index 0000000000..30113d9c93 --- /dev/null +++ b/libavcodec/arm/hevcdsp_sao_neon.S -@@ -0,0 +1,862 @@ +@@ -0,0 +1,1882 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi + * @@ -2806,124 +5063,211 @@ index 0000000..08a021d +#include "libavutil/arm/asm.S" +#include "neon.S" + -+.macro init_sao_band -+ pld [r1] -+ vld1.8 {q0, q1}, [r2] // offset table -+ ldr r2, [sp, #0] // stride_dst -+ ldr r12, [sp, #4] // height -+ vmov.u8 q3, #128 -+.endm ++.set EDGE_SRC_STRIDE, 160 ++ ++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128 ++ vshr.u8 q12, q8, #3 ++ vadd.s8 q8, \Q_K128 ++ vshr.u8 q13, q9, #3 ++ vadd.s8 q9, \Q_K128 ++ ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 + -+// 128 in q3 -+// input q8 - q11 -+.macro sao_band_64 -+ vtbl.8 d24, {d0, d1, d2, d3}, d24 -+ vadd.s8 q8, q3 -+ vtbl.8 d25, {d0, d1, d2, d3}, d25 -+ vadd.s8 q9, q3 -+ vtbl.8 d26, {d0, d1, d2, d3}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d0, d1, d2, d3}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0, d1, d2, d3}, d28 + vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0, d1, d2, d3}, d29 ++ vshr.u8 q12, q10, #3 ++ vadd.s8 q10, \Q_K128 + vqadd.s8 q9, q13 -+ vtbl.8 d30, {d0, d1, d2, d3}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d0, d1, d2, d3}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 ++ vshr.u8 q13, q11, #3 ++ vadd.s8 q11, \Q_K128 ++ ++ vsub.s8 q8, \Q_K128 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vsub.s8 q9, \Q_K128 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vqadd.s8 q10, q12 ++ vqadd.s8 q11, q13 ++ vsub.s8 q10, \Q_K128 ++ vsub.s8 q11, \Q_K128 +.endm + -+function ff_hevc_sao_band_w8_neon_8, export=1 -+ init_sao_band -+1: subs r12, #8 -+ vld1.8 {d16}, [r1, :64], r3 -+ vld1.8 {d17}, [r1, :64], r3 -+ vshr.u8 q12, q8, #3 -+ vld1.8 {d18}, [r1, :64], r3 -+ vld1.8 {d19}, [r1, :64], r3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {d20}, [r1, :64], r3 -+ vld1.8 {d21}, [r1, :64], r3 -+ vshr.u8 q14, q10, #3 -+ vld1.8 {d22}, [r1, :64], r3 -+ vld1.8 {d23}, [r1, :64], r3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {d16}, [r0, :64], r2 -+ vst1.8 {d17}, [r0, :64], r2 -+ vst1.8 {d18}, [r0, :64], r2 -+ vst1.8 {d19}, [r0, :64], r2 -+ vst1.8 {d20}, [r0, :64], r2 -+ vst1.8 {d21}, [r0, :64], r2 -+ vst1.8 {d22}, [r0, :64], r2 -+ vst1.8 {d23}, [r0, :64], r2 -+ bne 1b ++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128 ++ vshr.u8 q12, q8, #3 ++ vadd.s8 q8, \Q_K128 + -+ bx lr ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ ++ vqadd.s8 q8, q12 ++ vsub.s8 q8, \Q_K128 ++.endm ++ ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ Clobbers q12, q13 ++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vshrn.i16 d26, \Q2, #(\bit_depth - 5) ++ vshrn.i16 d27, \Q3, #(\bit_depth - 5) ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vtbl.8 d26, \XLAT0, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vaddw.s8 \Q2, d26 ++ vaddw.s8 \Q3, d27 ++ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX ++.endm ++ ++@ Clobbers q12 
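++@ Same narrow/lookup/widen scheme as the 64-byte variant, on two quads:
++@ vshrn takes the band index from the top five bits, vtbl fetches the
++@ signed offset, vaddw adds it at 16-bit precision, and the result is
++@ clamped to [0, (1 << bit_depth) - 1] via Q_MIN/Q_MAX.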
++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++.endm ++ ++ ++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) ++@ so we are quite safe stuffing it into a byte array ++@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma ++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of ++@ precision ++ ++@ This, somewhat nasty, bit of code builds the {d0-d3} translation ++@ array via the stack ++@ Given that sao_left_class > 28 can cause wrap we can't just poke ++@ all 4 bytes in at once ++@ ++@ It also loads other common regs ++ ++function band_load_y ++ vmov.i64 q0, #0 ++ ldr r12, [sp, #8] @ &sao_offset_val[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ vmov.i64 q1, #0 ++ ldr r12, [sp, #12] @ sao_left_class ++ ++ mov r4, sp ++ sub sp, #32 ++ and sp, #~63 @ Align stack so we can wrap with a simple AND ++ vst1.8 {q0, q1}, [sp, :256] @ Put zero array on stack ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array ++ mov sp, r4 ++ ++ ldr r12, [sp, #20] @ height ++ pld [r1] ++ ++ sub r12, #1 ++ add r4, r1, r3 ++ bx lr +endfunc + -+function ff_hevc_sao_band_w16_neon_8, export=1 -+ init_sao_band -+1: subs r12, #4 -+ vld1.8 {q8}, [r1, :128], r3 -+ vshr.u8 q12, q8, #3 -+ vld1.8 {q9}, [r1, :128], r3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vld1.8 {q11}, [r1, :128], r3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {q8}, [r0, :128], r2 -+ vst1.8 {q9}, [r0, :128], r2 -+ vst1.8 {q10}, [r0, :128], r2 -+ vst1.8 {q11}, [r0, :128], r2 -+ bne 1b + -+ bx lr -+endfunc ++function band_load_c ++ vmov.i64 q2, #0 ++ ldr r12, [sp, #8] @ &sao_offset_val1[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ vmov.i64 q3, #0 ++ ldr r12, [sp, #12] @ sao_left_class + -+function ff_hevc_sao_band_w32_neon_8, export=1 -+ init_sao_band -+1: subs r12, #2 -+ vld1.8 {q8-q9}, [r1, :128], r3 -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {q8-q9}, [r0, :128], r2 -+ vst1.8 {q10-q11}, [r0, :128], r2 -+ bne 1b ++ mov r4, sp @ Remember SP ++ sub sp, #32 ++ and sp, #~63 @ Align stack so we can wrap with a simple AND + -+ bx lr -+endfunc ++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array + -+function ff_hevc_sao_band_w64_neon_8, export=1 -+ init_sao_band ++ @ And again for the 2nd set ++ ldr r12, [r4, #16] @ &sao_offset_val2[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ ldr r12, [r4, #20] @ sao_left_class2 ++ ++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack (again) ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! 
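++ @ The AND #~32 below wraps the write pointer back into the 32-byte
++ @ table (the stack was 64-byte aligned above), reproducing the
++ @ (k + sao_left_class) & 31 wrap of the generic C code.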
++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q2, q3}, [sp, :256] @ Pop modified array ++ ++ mov sp, r4 ++ ++ ldr r12, [sp, #28] @ height ++ pld [r1] + -+ push {r4, lr} + subs r12, #1 -+ mov r4, r1 -+ it ne -+ addne r4, r3 ++ add r4, r1, r3 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_sao_band_64_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_64_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 + +1: subs r12, #1 + vldm r1, {q8-q11} + pld [r4] -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 + add r1, r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ + it ne + addne r4, r3 + vstm r0, {q8-q11} @@ -2933,8 +5277,113 @@ index 0000000..08a021d + pop {r4, pc} +endfunc + ++@ ff_hevc_sao_band_32_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] + -+@ ff_hevc_sao_band_c_w64_neon_8( ++function ff_hevc_sao_band_32_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 ++ ++1: subs r12, #2 ++ vld1.8 { q8, q9 }, [r1, :128], r3 ++ vld1.8 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 { q8, q9 }, [r0, :128], r2 ++ vst1.8 {q10, q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_16_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_16_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 ++ ++1: subs r12, #4 ++ vld1.8 { q8}, [r1, :128], r3 ++ vld1.8 { q9}, [r1, :128], r3 ++ vld1.8 {q10}, [r1, :128], r3 ++ vld1.8 {q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 { q8}, [r0, :128], r2 ++ vst1.8 { q9}, [r0, :128], r2 ++ vst1.8 {q10}, [r0, :128], r2 ++ vst1.8 {q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_8_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_8_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ ldr lr, [sp, #16] @ width ++ vmov.u8 q15, #128 ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #2 ++ vld1.8 {d16}, [r1, :64], r3 ++ vld1.8 {d17}, [r1, :64], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 {d16}, [r0, :64], r2 ++ vst1.8 {d17}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #4 ++ vld1.32 {d16[0]}, [r1, :32], r3 ++ vld1.32 {d16[1]}, [r1, :32], r3 ++ vld1.32 {d17[0]}, [r1, :32], r3 ++ vld1.32 {d17[1]}, [r1, :32], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.32 {d16[0]}, [r0, :32], r2 ++ vst1.32 {d16[1]}, [r0, :32], r2 ++ vst1.32 {d17[0]}, [r0, :32], r2 ++ vst1.32 {d17[1]}, [r0, :32], r2 ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_32_neon_8( +@ uint8_t * dst [r0] +@ uint8_t 
* src [r1] +@ uint32_t dst_stride [r2] @@ -2946,707 +5395,1535 @@ index 0000000..08a021d +@ int width sp[16] +@ int height sp[20] + -+@ As this is often done in-place on the frame buffer it is worth preloading -+@ the pixel values but we want to beware of loading ouside our buffer to avoid -+@ loading stuff into the cache that should still be invalid (in use by QPU, VPU) ++function ff_hevc_sao_band_c_32_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c + -+function ff_hevc_sao_band_c_neon_8, export=1 -+ mov r12, sp -+ push {r4-r8, lr} // 24 bytes ++ vmov.i8 q15, #128 ++ sub r3, #32 ++ sub r2, #32 + -+ ldm r12, {r4-r7} ++1: subs r12, #1 ++ vld2.8 { q8, q9 }, [r1, :128]! ++ vld2.8 {q10, q11}, [r1, :128], r3 + -+ add r4, #2 -+ add r6, #2 -+ vld1.16 {d16}, [r4] @ Unaligned -+ lsl r5, r5, #3 -+ vld1.16 {d18}, [r6] -+ pld [r1] -+ vmov.i8 d17, #0 -+ mov r4, r1 -+ vmov.i8 d19, #0 -+ lsl r7, r7, #3 -+ vdup.8 q1, r5 -+ ldr r5, [r12, #16] @ width -+ vdup.8 q2, r7 -+ ldr r12, [r12, #20] -+ vqmovn.s16 d0, q8 -+ cmp r5, #16 @ At some point we may want a table lookup -+ vqmovn.s16 d1, q9 -+ vmov.i8 q3, #128 -+ beq 16f ++ pld [r4] + -+ @ d0 U lookup -+ @ d1 V lookup -+ @ q1 U raw offset -+ @ q2 V raw offset -+ @ q3 #128 ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 + -+ @ r4 = r1 = src - Inteded for preload pointer -+ @ r12 = height ++ vst2.8 { q8, q9 }, [r0, :128]! ++ vst2.8 {q10, q11}, [r0, :128], r2 ++ ++ itt ne ++ addne r4, r3 ++ addne r4, #32 ++ ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_16_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_sao_band_c_16_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c ++ vmov.i8 q15, #128 ++ ++1: subs r12, #2 ++ vld2.8 { q8, q9 }, [r1, :128], r3 ++ vld2.8 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vst2.8 { q8, q9 }, [r0, :128], r2 ++ vst2.8 {q10, q11}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_8_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_sao_band_c_8_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c ++ ldr lr, [sp, #16] @ width ++ vmov.u8 q15, #128 ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #1 ++ vld2.8 {d16, d17}, [r1, :128], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #1 ++ vld1.8 {d16}, [r1, :64], r3 ++ vld1.8 {d17}, [r1, :64], r3 ++ vuzp.8 d16, d17 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vzip.8 d16, d17 ++ vst1.8 {d16}, [r0, :64], r2 ++ vst1.8 {d17}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++ ++@ ff_hevc_sao_band_64_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_64_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q2, #0 ++ vdup.i16 q3, lr ++ bl band_load_y ++ vpush 
{q4-q7} ++ ++1: subs r12, #1 ++ vldm r1, {q4-q11} ++ add r1, r3 ++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ vstm r0, {q4-q11} ++ add r0, r2 ++ bpl 1b ++ ++ vpop {q4-q7} ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_64_neon_10, export=1 ++ band_64_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_32_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_32_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q2, #0 ++ vdup.i16 q3, lr ++ bl band_load_y ++ ++1: subs r12, #1 ++ vldm r1, {q8-q11} ++ add r1, r3 ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ vstm r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_32_neon_10, export=1 ++ band_32_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_16_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_16_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ bl band_load_y ++ ++1: subs r12, #2 ++ vld1.16 { q8, q9 }, [r1, :128], r3 ++ vld1.16 {q10, q11}, [r1, :128], r3 ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 { q8, q9 }, [r0, :128], r2 ++ vst1.16 {q10, q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_16_neon_10, export=1 ++ band_16_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_8_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_8_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ bl band_load_y ++ ldr lr, [sp, #16] ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #2 ++ vld1.16 { q8}, [r1, :128], r3 ++ vld1.16 { q9}, [r1, :128], r3 ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 { q8}, [r0, :128], r2 ++ vst1.16 { q9}, [r0, :128], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #4 ++ vld1.16 {d16}, [r1, :64], r3 ++ vld1.16 {d17}, [r1, :64], r3 ++ vld1.16 {d18}, [r1, :64], r3 ++ vld1.16 {d19}, [r1, :64], r3 ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 {d16}, [r0, :64], r2 ++ vst1.16 {d17}, [r0, :64], r2 ++ vst1.16 {d18}, [r0, :64], r2 ++ vst1.16 {d19}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_8_neon_10, export=1 ++ band_8_16 10 ++endfunc ++ ++ ++@ ff_hevc_sao_band_c_32_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_32_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ vpush {q4-q7} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ 
vdup.i16 q15, lr ++ sub r2, #96 ++ ++1: subs r12, #1 ++ ++ vld2.16 { q4, q5 }, [r1, :128]! ++ vld2.16 { q6, q7 }, [r1, :128]! ++ vld2.16 { q8, q9 }, [r1, :128]! ++ vld2.16 {q10, q11}, [r1, :128], r3 ++ ++ pld [r4] ++ sub r1, #96 ++ ++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth + -+ @ Might (unlikely) be called with height == 1 -+ subs r12, #1 + it ne + addne r4, r3 + -+1: -+ subs r12, #1 -+ vld2.8 {q8-q9}, [r1, :128]! -+ vsub.u8 q12, q8, q1 -+ vld2.8 {q10-q11}, [r1, :128], r3 -+ vsub.u8 q14, q10, q1 -+ vsub.u8 q13, q9, q2 -+ sub r1, #32 -+ vsub.u8 q15, q11, q2 -+ pld [r4] -+ vshr.u8 q12, #3 -+ vadd.s8 q8, q3 -+ vshr.u8 q13, #3 -+ vadd.s8 q9, q3 ++ vst2.16 { q4, q5 }, [r0, :128]! ++ vst2.16 { q6, q7 }, [r0, :128]! ++ vst2.16 { q8, q9 }, [r0, :128]! ++ vst2.16 {q10, q11}, [r0, :128], r2 + -+ vtbl.8 d24, {d0}, d24 -+ vshr.u8 q14, #3 -+ vtbl.8 d25, {d0}, d25 -+ vshr.u8 q15, #3 -+ vtbl.8 d26, {d1}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d1}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0}, d28 -+ vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0}, d29 -+ vqadd.s8 q9, q13 -+ vtbl.8 d30, {d1}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d1}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 -+ -+ it ne -+ addne r4, r3 @ Do not inc on final pass -+ vst2.8 {q8-q9}, [r0, :128]! -+ vst2.8 {q10-q11}, [r0, :128], r2 -+ sub r0, #32 + bpl 1b + -+ pop {r4-r8, pc} -+ -+@ -- width 16 (UV pairs) -- -+16: -+ subs r12, #2 -+ it ne -+ addne r4, r4, r3, lsl #1 -+ -+1: -+ subs r12, #2 -+ vld2.8 {q8-q9}, [r1, :128], r3 -+ vsub.u8 q12, q8, q1 -+ vld2.8 {q10-q11}, [r1, :128], r3 -+ vsub.u8 q14, q10, q1 -+ vsub.u8 q13, q9, q2 -+ pld [r4] -+ vsub.u8 q15, q11, q2 -+ pld [r4, r3] -+ vshr.u8 q12, #3 -+ vadd.s8 q8, q3 -+ vshr.u8 q13, #3 -+ vadd.s8 q9, q3 -+ -+ vtbl.8 d24, {d0}, d24 -+ vshr.u8 q14, #3 -+ vtbl.8 d25, {d0}, d25 -+ vshr.u8 q15, #3 -+ vtbl.8 d26, {d1}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d1}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0}, d28 -+ vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0}, d29 -+ vqadd.s8 q9, q13 -+ vtbl.8 d30, {d1}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d1}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 -+ -+ it ne -+ addne r4, r4, r3, lsl #1 -+ vst2.8 {q8-q9}, [r0, :128], r2 -+ vst2.8 {q10-q11}, [r0, :128], r2 -+ bpl 1b -+ -+ pop {r4-r8, pc} ++ vpop {q4-q7} ++ pop {r4, pc} ++.endm + ++function ff_hevc_sao_band_c_32_neon_10, export=1 ++ band_c_32_16 10 +endfunc + + -+.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3 -+ vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0 -+ vcgt.u8 \out1, \in3, \in1 // c > a -> -1 , otherwise 0 part 2 -+ vcgt.u8 \tmp1, \in1, \in3 // a > c -> -1 , otherwise 0 part 2 -+ vsub.s8 \out0, \tmp0, \out0 // diff0 -+ vsub.s8 \out1, \tmp1, \out1 // diff0 part 2 ++@ ff_hevc_sao_band_c_16_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_16_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ sub r2, #32 ++ sub r3, #32 ++ ++1: subs r12, #1 ++ ++ vld2.16 { q8, q9 }, [r1, :128]! 
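++ @ vld2.16 splits the CbCr pairs as they load: q8/q10 collect the U
++ @ samples and q9/q11 the V samples, letting sao_band_64b_16 apply the
++ @ U table in {d0-d3} and the V table in {d4-d7} per plane.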
++ vld2.16 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 { q8, q9 }, [r0, :128]! ++ vst2.16 {q10, q11}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} +.endm + ++function ff_hevc_sao_band_c_16_neon_10, export=1 ++ band_c_16_16 10 ++endfunc + -+// input -+// a in q0 - q3 -+// c in q4 - q7 -+// b in q8 - q11 -+// offset table r4,r5 and r6,r7 -+// r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C -+// output in q0 - q3 -+// clobbers q12 - q15 + -+@ a <- c <- b ++@ ff_hevc_sao_band_c_8_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_8_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ ldr lr, [sp, #24] @ width ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #1 ++ vld2.16 { q8, q9 }, [r1, :128], r3 ++ ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 { q8, q9 }, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #2 ++ vld2.16 {d16, d17}, [r1, :128], r3 ++ vld2.16 {d18, d19}, [r1, :128], r3 ++ ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 {d16, d17}, [r0, :128], r2 ++ vst2.16 {d18, d19}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_c_8_neon_10, export=1 ++ band_c_8_16 10 ++endfunc ++ ++ ++@ ============================================================================= ++@ SAO EDGE ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ [r5] translate values +@ -+@ It appears that Neon can stall if you try and use results too soon so we try to -+@ spread our instruction out ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 + -+.macro edgeidx64 ++function edge_64b_body_8 + -+ vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q1 -+ vcgt.u8 q14, q6, q2 -+ vcgt.u8 q15, q7, q3 ++ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q1 ++ vcgt.u8 q14, q6, q2 ++ vcgt.u8 q15, q7, q3 + -+ vcgt.u8 q0, q0, q4 // a > c -> -1 , otherwise 0 -+ vcgt.u8 q1, q1, q5 -+ vcgt.u8 q2, q2, q6 -+ vcgt.u8 q3, q3, q7 ++ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0 ++ vcgt.u8 q1, q5 ++ vcgt.u8 q2, q6 ++ vcgt.u8 q3, q7 + -+ vsub.s8 q0, q0, q12 // a = sign(c-a) -+ vsub.s8 q1, q1, q13 -+ vsub.s8 q2, q2, q14 -+ vsub.s8 q3, q3, q15 ++ vsub.s8 q0, q12 @ a = sign(c-a) ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 + -+ vcgt.u8 q12, q4, q8 // c > b -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q9 -+ vcgt.u8 q14, q6, q10 -+ vcgt.u8 q15, q7, q11 ++ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q9 ++ vcgt.u8 q14, q6, q10 ++ vcgt.u8 q15, q7, q11 + -+ vsub.s8 q0, q0, q12 -+ vsub.s8 q1, q1, q13 -+ vsub.s8 q2, q2, q14 -+ vsub.s8 q3, q3, q15 ++ vsub.s8 q0, q12 ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 + -+ vcgt.u8 q12, q8, q4 // c < b -> -1 , otherwise 0 -+ vcgt.u8 q13, q9, q5 -+ vcgt.u8 q14, q10, q6 -+ vcgt.u8 q15, q11, q7 ++ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0 ++ vcgt.u8 q13, q9, q5 ++ vcgt.u8 q14, q10, q6 ++ vcgt.u8 q15, q11, q7 + -+ vadd.s8 q0, q0, q12 // a = sign(c-a) + sign(c-b) -+ vadd.s8 q1, q1, q13 -+ vmov.u8 q12, #2 -+ vadd.s8 q2, q2, q14 -+ vadd.s8 q3, q3, q15 ++ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b) ++ vadd.s8 q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s8 q2, q14 ++ vadd.s8 q3, q15 + -+ vadd.s8 q0, q0, q12 -+ vadd.s8 q1, q1, q12 -+ @ whilst vmov dn, rm, rn exists it is a vfp instruction -+ @ and causes a stall till neon pipe empty - so don't do that! 
-+ vmov d26[0], r4 -+ vmov d26[1], r5 -+ vmov d27[0], r6 -+ vmov d27[1], r7 -+ vadd.s8 q2, q2, q12 -+ vuzp.8 q0, q1 -+ vmov.u8 q15, #128 -+ vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b) ++ vadd.s8 q0, q12 ++ vadd.s8 q1, q12 + -+ vtbl.8 d0, {d26}, d0 -+ vadd.s8 q12, q4, q15 // Add -128 so we can use saturating signed add ++ vld1.8 {d26, d27}, [r5] + -+ vtbl.8 d1, {d26}, d1 -+ vadd.s8 q14, q5, q15 ++ vadd.s8 q2, q12 ++ vuzp.8 q0, q1 ++ vmov.u8 q15, #128 ++ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b) + -+ vtbl.8 d2, {d27}, d2 -+ vuzp.8 q2, q3 ++ vtbl.8 d0, {d26}, d0 ++ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add + -+ vtbl.8 d3, {d27}, d3 ++ vtbl.8 d1, {d26}, d1 ++ vadd.s8 q14, q5, q15 + -+ vtbl.8 d4, {d26}, d4 -+ vzip.8 q0, q1 ++ vtbl.8 d2, {d27}, d2 ++ vuzp.8 q2, q3 + -+ vtbl.8 d5, {d26}, d5 -+ vqadd.s8 q0, q0, q12 -+ vqadd.s8 q1, q1, q14 -+ vadd.s8 q12, q6, q15 // Add -128 so we can use saturating signed add ++ vtbl.8 d3, {d27}, d3 + -+ vtbl.8 d6, {d27}, d6 -+ vadd.s8 q14, q7, q15 // Add -128 so we can use saturating signed add ++ vtbl.8 d4, {d26}, d4 ++ vzip.8 q0, q1 + -+ vtbl.8 d7, {d27}, d7 -+ vzip.8 q2, q3 ++ vtbl.8 d5, {d26}, d5 ++ vqadd.s8 q0, q12 ++ vqadd.s8 q1, q14 ++ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add + -+ vsub.s8 q0, q0, q15 -+ vqadd.s8 q2, q2, q12 -+ vqadd.s8 q3, q3, q14 -+ vsub.s8 q1, q1, q15 -+ vsub.s8 q2, q2, q15 -+ vsub.s8 q3, q3, q15 ++ vtbl.8 d6, {d27}, d6 ++ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add + -+.endm ++ vtbl.8 d7, {d27}, d7 ++ vzip.8 q2, q3 ++ ++ vsub.s8 q0, q15 ++ vqadd.s8 q2, q12 ++ vqadd.s8 q3, q14 ++ vsub.s8 q1, q15 ++ vsub.s8 q2, q15 ++ vsub.s8 q3, q15 ++ ++ bx lr ++endfunc ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ r4 upper clip value ++@ [r5] translate values ++@ ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 ++ ++function edge_64b_body_16 ++ ++ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q1 ++ vcgt.u16 q14, q6, q2 ++ vcgt.u16 q15, q7, q3 ++ ++ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 ++ vcgt.u16 q1, q1, q5 ++ vcgt.u16 q2, q2, q6 ++ vcgt.u16 q3, q3, q7 ++ ++ vsub.s16 q0, q0, q12 // a = sign(c-a) ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q9 ++ vcgt.u16 q14, q6, q10 ++ vcgt.u16 q15, q7, q11 ++ ++ vsub.s16 q0, q0, q12 ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 ++ vcgt.u16 q13, q9, q5 ++ vcgt.u16 q14, q10, q6 ++ vcgt.u16 q15, q11, q7 ++ ++ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) ++ vadd.s16 q1, q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s16 q2, q2, q14 ++ vadd.s16 q3, q3, q15 ++ ++ vmovn.s16 d0, q0 ++ vmovn.s16 d1, q1 ++ vmovn.s16 d2, q2 ++ vmovn.s16 d3, q3 ++ ++ vuzp.8 q0, q1 ++ ++ vld1.8 {d26, d27}, [r5] ++ ++ vadd.s8 q0, q0, q12 ++ vadd.s8 q1, q1, q12 ++ ++ vtbl.8 d0, {d26}, d0 ++ vtbl.8 d1, {d26}, d1 ++ vtbl.8 d2, {d27}, d2 ++ vtbl.8 d3, {d27}, d3 ++ ++ vmov.i64 q12, #0 ++ ++ vzip.8 q0, q1 ++ ++ vdup.i16 q13, r4 ++ ++ @ Avoid overwrite whilst widening ++ vaddw.s8 q2, q6, d2 ++ vaddw.s8 q3, q7, d3 ++ vaddw.s8 q1, q5, d1 ++ vaddw.s8 q0, q4, d0 ++ ++ @ now clip ++ clip16_4 q2, q3, q1, q0, q12, q13 + -+function edge_w64_body -+ edgeidx64 -+ vstm r0, {q0-q3} -+ add r0, r0, r2 + bx lr +endfunc + -+.macro init_edge_64 -+ push {r4-r8,lr} -+ ldr r12, [sp, #24] // height -+ ldr r5, [sp, #28] // sao_offset_val_table -+ ldrd r4, r5, [r5] -+ mov r6, r4 -+ mov r7, r5 -+.endm + -+function ff_hevc_sao_edge_eo0_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+ sub r1, #8 -+1: subs r12, #1 -+ vld1.64 {d7}, [r1, :64]! -+ vld1.64 {q4-q5}, [r1, :128]! // load c -+ vld1.64 {q6-q7}, [r1, :128]! 
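[Editor's note: the 16-bit body above performs the same classification in 16-bit lanes, narrows the small sign sums to bytes purely so vtbl can do the table lookup, then widens the looked-up offsets back with vaddw and clamps against the bit-depth maximum (q12 = #0, q13 = r4). A hedged scalar equivalent, names again invented for illustration:]

    /* 16-bit variant of the edge body: identical classification, but the
     * final clamp is [0, (1 << bit_depth) - 1] rather than the implicit
     * 8-bit saturation of the vqadd.s8 path. */
    static inline uint16_t sao_edge_pixel_16(uint16_t a, uint16_t c, uint16_t b,
                                             const int8_t xlat[5],
                                             int max_pel) /* (1 << bd) - 1 */
    {
        const int idx = 2 + ((c > a) - (a > c)) + ((c > b) - (b > c));
        const int v = (int)c + xlat[idx];
        return v < 0 ? 0 : v > max_pel ? (uint16_t)max_pel : (uint16_t)v;
    }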
-+ vld1.64 {d24}, [r1, :64], r3 -+ sub r1, #72 -+ // load a -+ vext.8 q0, q3, q4, #15 -+ vext.8 q1, q4, q5, #15 -+ vext.8 q2, q5, q6, #15 -+ vext.8 q3, q6, q7, #15 -+ // load b -+ vext.8 q8, q4, q5, #1 -+ vext.8 q9, q5, q6, #1 -+ vext.8 q10, q6, q7, #1 -+ vext.8 q11, q7, q12, #1 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3, q9, q10 ++@ ++@ d16, d17 (q8) xlat U, V ++@ q14.u8 #2 ++@ q15.u8 #128 ++ ++function edge_16b_body_8 ++ vcgt.u8 q3, q1, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u8 q0, q1 @ a > c -> -1 , otherwise 0 ++ vcgt.u8 q9, q1, q2 @ c > b -> -1 , otherwise 0 ++ vcgt.u8 q10, q2, q1 @ c < b -> -1 , otherwise 0 ++ ++ vsub.s8 q0, q3 ++ vsub.s8 q10, q9 ++ vadd.s8 q0, q10 @ a = sign(c-a) ++ ++ vadd.s8 q0, q14 ++ vuzp.8 d0, d1 ++ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ vqadd.s8 q0, q3 ++ vsub.s8 q0, q15 ++ ++ bx lr +endfunc + -+function ff_hevc_sao_edge_eo1_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+ sub r1, r3 ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3 ++@ ++@ q12, #0 ++@ d16, d17 xlat U, V ++@ q14.u8 #2 ++@ q15.u16 max ++function edge_16b_body_16 ++ vcgt.u16 q3, q1, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u16 q0, q1 @ a > c -> -1 , otherwise 0 ++ vsub.s16 q0, q3 @ a = sign(c-a) ++ vcgt.u16 q3, q1, q2 @ c > b -> -1 , otherwise 0 ++ vsub.s16 q0, q3 ++ vcgt.u16 q3, q2, q1 @ c < b -> -1 , otherwise 0 ++ vadd.s16 q0, q3 @ a = sign(c-a) + sign(c-b) ++ ++ vmovn.s16 d0, q0 ++ @ d1 will have random contents that we transform but ++ @ that doesn't matter as we then discard them ++ vuzp.8 d0, d1 ++ ++ vadd.s8 q0, q0, q14 ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ ++ vaddw.s8 q0, q1, d0 ++ ++ @ now clip ++ vmax.s16 q0, q12 ++ vmin.s16 q0, q15 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_sao_edge_[c_]xx_neon( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only ++@ int eo, [sp, #sp_base + 0] ++@ int width, [sp, #sp_base + 4] ++@ int height) [sp, #sp_base + 8] ++ ++.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0 ++ push {r4-r6, lr} @ 16 bytes ++.set sp_base, 16 ++ ++@ Build translate registers ++@ As translate values can only be 0-4 we don't care about junk in the rest ++@ of the register ++ mov r12, #2 ++.if \is_chroma ++ ldr r4, [sp, #16] ++.set sp_base, sp_base + 4 ++.endif ++ vld1.8 {d16[2]}, [r3], r12 ++ vld1.8 {d16[0]}, [r3], r12 ++ vld1.8 {d16[1]}, [r3], r12 ++ vld1.8 {d16[3]}, [r3], r12 ++ vld1.8 {d16[4]}, [r3] ++.if \is_chroma ++ vld1.8 {d17[2]}, [r4], r12 ++ vld1.8 {d17[0]}, [r4], r12 ++ vld1.8 {d17[1]}, [r4], r12 ++ vld1.8 {d17[3]}, [r4], r12 ++ vld1.8 {d17[4]}, [r4] ++.else ++ vmov d17, d16 ++.endif ++ ++@ Setup constant registers ++.if \bit_depth > 8 ++ movw r4, (1 << \bit_depth) - 1 ++.endif ++.if \setup_16b ++.if \bit_depth > 8 ++ vmov.i64 q12, #0 ++ vdup.16 q15, r4 ++.else ++ vmov.u8 q15, #128 ++.endif ++ vmov.u8 q14, #2 ++.endif ++ movw r3, EDGE_SRC_STRIDE ++ ++@ If setup_64b we need the xlat table on the stack and q4-q7 saved ++.if \setup_64b ++ sub r5, sp, #16 ++ vpush {q4-q8} @ 80 bytes, q8 pushed first ++.set sp_base, sp_base + 80 ++.endif ++ ++@ Get jump address ++@ We have a special case for width 4 as the calling code doesn't detect it ++@ If we may 
have w4 then we add a 2nd jump table after the 1st ++.if \check_w4 ++ ldr r12, [sp, #sp_base + 4] @ width ++ cmp r12, #8 ++.endif ++ ldr r12, [sp, #sp_base + 0] @ e0 ++ adr r6, \jump_tab ++.if \check_w4 ++ it lt ++ addlt r6, #16 ++.endif ++ ldr r6, [r6, r12, lsl #2] ++ ++ ldr r12, [sp, #sp_base + 8] @ height ++ ++@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes ++.if \do2 ++ push {r0, r1, r6, r12} ++ blx r6 ++ pop {r0, r1, r6, r12} ++ ++ add r0, #64 ++ add r1, #64 ++.endif ++ ++ blx r6 ++ ++@ Tidy up & return ++.if \setup_64b ++ vpop {q4-q8} @ spurious but harmless load of q8 ++.endif ++ pop {r4-r6, pc} ++.endm ++ ++ ++.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 ++.endm ++ ++.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1 ++.endm ++ ++ ++.macro edge_64b_e0, body_fn, pb ++ mov r6, lr ++ sub r1, #8 ++1: vldm r1, {d7-d16} ++ subs r12, #1 ++ add r1, r3 + // load a -+ vld1.8 {q0-q1}, [r1, :128]! -+ vld1.8 {q2-q3}, [r1, :128], r3 -+ sub r1, #32 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+1: subs r12, #1 ++ vext.8 q0, q3, q4, #(16 - \pb) ++ vext.8 q1, q4, q5, #(16 - \pb) ++ vext.8 q2, q5, q6, #(16 - \pb) ++ vext.8 q3, q6, q7, #(16 - \pb) + // load b -+ vld1.8 {q8-q9}, [r1, :128]! -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ sub r1, #32 -+ bl edge_w64_body ++ vext.8 q11, q7, q8, #\pb @ Avoid overwrite ++ vext.8 q8, q4, q5, #\pb ++ vext.8 q9, q5, q6, #\pb ++ vext.8 q10, q6, q7, #\pb ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_32bx2_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #2 ++ ++ vld1.8 {q4-q5}, [r1] ++ sub r1, #\pb ++ vld1.8 {q0-q1}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {q8-q9}, [r1], r3 ++ sub r1, #\pb ++ vld1.8 {q6-q7}, [r1] ++ sub r1, #\pb ++ vld1.8 {q2-q3}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {q10-q11}, [r1], r3 ++ sub r1, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {q0,q1}, [r0], r2 ++ vst1.8 {q2,q3}, [r0], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_16b_e0, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ sub r3, #\pb * 2 ++ ++1: subs r12, #1 ++ ++ vld1.64 {q0}, [r1] @ load a ++ add r1, #\pb ++ vld1.64 {q1}, [r1, :128] @ load c ++ add r1, #\pb ++ vld1.64 {q2}, [r1], r3 @ load b ++ ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_8bx2_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #2 ++ ++ vld1.8 {d2}, [r1, :64] ++ sub r1, #\pb ++ vld1.8 {d0}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {d4}, [r1], r3 ++ sub r1, #\pb ++ vld1.8 {d3}, [r1, :64] ++ sub r1, #\pb ++ vld1.8 {d1}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {d5}, [r1], r3 ++ sub r1, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #4 ++ ++ vld1.32 {d2[0]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d0[0]}, [r1] ++ add r1, #(\pb * 2) ++ vld1.32 {d4[0]}, [r1], r3 @ R ++ vld1.32 {d4[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d2[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d0[1]}, [r1], r3 @ L ++ vld1.32 {d1[0]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[0]}, [r1] ++ add r1, #\pb ++ vld1.32 {d5[0]}, [r1], r3 @ R ++ vld1.32 {d5[1]}, [r1] ++ sub r1, #(\pb * 2) ++ vld1.32 {d1[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[1]}, [r1], r3 @ M ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], 
r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++ ++.macro edge_64b_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {q0-q1}, [r1, :128]! ++ vld1.8 {q2-q3}, [r1, :128], r3 ++ sub r1, #32 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ sub r1, #32 ++1: subs r12, #1 ++ // load b ++ vld1.8 {q8-q9}, [r1, :128]! ++ vld1.8 {q10-q11}, [r1, :128], r3 ++ sub r1, #32 ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 + // copy c to a -+ vmov.64 q0, q4 -+ vmov.64 q1, q5 -+ vmov.64 q2, q6 -+ vmov.64 q3, q7 ++ vmov.64 q0, q4 ++ vmov.64 q1, q5 ++ vmov.64 q2, q6 ++ vmov.64 q3, q7 + // copy b to c -+ vmov.64 q4, q8 -+ vmov.64 q5, q9 -+ vmov.64 q6, q10 -+ vmov.64 q7, q11 -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++ vmov.64 q4, q8 ++ vmov.64 q5, q9 ++ vmov.64 q6, q10 ++ vmov.64 q7, q11 ++ bgt 1b ++ bx r6 ++.endm + -+function ff_hevc_sao_edge_eo2_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+1: sub r1, r3 ++.macro edge_32bx2_e1, body_fn ++ mov r6, lr ++ sub r1, r3 + // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ sub r1, #1 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #31 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ add r1, #1 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #33 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++ vld1.8 {q0-q1}, [r1, :128], r3 ++ vld1.8 {q4-q5}, [r1, :128], r3 + -+function ff_hevc_sao_edge_eo3_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+1: sub r1, r3 -+ // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ add r1, #1 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #33 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ sub r1, #1 -+ vld1.8 {q8-q9}, [r1]! 
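[Editor's note: the class-1 (vertical) loops avoid reloading rows. Three row windows live in registers and are rotated a <- c, c <- b each iteration, so only the new bottom row is fetched per line. As a scalar outline, reusing sao_edge_pixel_8 from the earlier sketch; the outline function itself is illustrative:]

    static void sao_edge_e1_outline(uint8_t *dst, ptrdiff_t stride_dst,
                                    const uint8_t *src, ptrdiff_t stride_src,
                                    unsigned int width, unsigned int height,
                                    const int8_t xlat[5])
    {
        const uint8_t *a = src - stride_src;    /* row above */
        const uint8_t *c = src;                 /* current row */
        for (unsigned int y = 0; y != height; y++) {
            const uint8_t *b = c + stride_src;  /* only new load per row */
            for (unsigned int x = 0; x != width; x++)
                dst[x] = sao_edge_pixel_8(a[x], c[x], b[x], xlat);
            a = c;                              /* rotate the window */
            c = b;
            dst += stride_dst;
        }
    }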
-+ vld1.8 {q10-q11}, [r1] -+ sub r1, #31 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++1: subs r12, #2 ++ @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vmov q2, q4 ++ vmov q3, q5 ++ vld1.8 {q8-q9}, [r1, :128], r3 ++ vld1.8 {q10-q11}, [r1, :128], r3 ++ vmov q6, q8 ++ vmov q7, q9 + ++ bl \body_fn + -+@ void ff_hevc_sao_edge_c_eo1_w64_neon_8( -+@ uint8_t *_dst, r0 -+@ uint8_t *_src, r1 -+@ ptrdiff_t stride_dst, r2 -+@ ptrdiff_t stride_src, r3 -+@ int height, sp[0] -+@ int16_t *sao_offset_table_u, sp[4] -+@ int16_t *sao_offset_table_v); sp[8] -+@ int eo sp[12] ++ vst1.8 {q0,q1}, [r0], r2 ++ vst1.8 {q2,q3}, [r0], r2 + -+function ff_hevc_sao_edge_c_w64_neon_8, export=1 -+ push {r4-r8,lr} // 6 reg = 24 -+ ldr r5, [sp, #28] // sao_offset_val_table_u -+ ldr r7, [sp, #32] // sao_offset_val_table_v -+ -+ @ Load and rearrange offsets -+ @ Also "convert" from 16bit to 8bit -+ ldrb r4, [r5, #2] -+ ldrb r8, [r5, #4] -+ ldrb r6, [r7, #2] -+ ldrb r12, [r7, #4] -+ orr r4, r4, r8, lsl #8 -+ orr r6, r6, r12, lsl #8 -+ ldrb r8, [r5, #6] -+ ldrb r12, [r7, #6] -+ orr r4, r4, r8, lsl #24 -+ orr r6, r6, r12, lsl #24 -+ ldrb r5, [r5, #8] -+ ldrb r7, [r7, #8] -+ -+ ldr r12, [sp, #36] // e0 -+ adr r8, edge_c_tbl_w64 -+ ldr r8, [r8, r12, lsl #2] -+ -+ ldr r12, [sp, #24] // height -+ vpush {d8-d15} -+ mov pc, r8 -+ -+edge_c_tbl_w64: -+ .word ff_hevc_sao_edge_c_eo0_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo1_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo2_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo3_w64_neon_8 -+ -+ff_hevc_sao_edge_c_eo0_w64_neon_8: -+ sub r1, #8 -+1: subs r12, #1 -+ vld1.64 {d7}, [r1, :64]! -+ vld1.64 {q4-q5}, [r1, :128]! // load c -+ vld1.64 {q6-q7}, [r1, :128]! 
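[Editor's note: a reference point while reading the e0/e1/e2 loaders — the four EO classes differ only in which neighbour pair is compared, with pb the byte step per pixel (1 for luma, 2 for interleaved chroma). A summary table, not code from the patch:]

    /* Neighbour steps per EO class; 'a' sits at -step and 'b' at +step
     * (dx is scaled by pb, dy by the source stride).  Class 3 is class 2
     * with the horizontal component negated, which is why the e3 macros
     * later simply instantiate the e2 macros with -pb. */
    static const struct { int dx, dy; } eo_step[4] = {
        {  1, 0 },  /* 0: horizontal */
        {  0, 1 },  /* 1: vertical */
        {  1, 1 },  /* 2: 135-degree diagonal (a up-left, b down-right) */
        { -1, 1 },  /* 3: 45-degree diagonal  (a up-right, b down-left) */
    };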
-+ vld1.64 {d24}, [r1, :64], r3 -+ sub r1, #72 -+ // load a -+ vext.8 q0, q3, q4, #14 -+ vext.8 q1, q4, q5, #14 -+ vext.8 q2, q5, q6, #14 -+ vext.8 q3, q6, q7, #14 -+ // load b -+ vext.8 q8, q4, q5, #2 -+ vext.8 q9, q5, q6, #2 -+ vext.8 q10, q6, q7, #2 -+ vext.8 q11, q7, q12, #2 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+ -+ff_hevc_sao_edge_c_eo1_w64_neon_8: -+ sub r1, r3 -+ // load a -+ vldm r1, {q0-q3} -+ add r1, r3 -+ // load c -+ vldm r1, {q4-q7} -+ add r1, r3 -+1: subs r12, #1 -+ // load b -+ vldm r1, {q8-q11} -+ add r1, r3 -+ bl edge_w64_body + // copy c to a -+ vmov.64 q0, q4 -+ vmov.64 q1, q5 -+ vmov.64 q2, q6 -+ vmov.64 q3, q7 ++ vmov.64 q0, q8 ++ vmov.64 q1, q9 ++ + // copy b to c -+ vmov.64 q4, q8 -+ vmov.64 q5, q9 -+ vmov.64 q6, q10 -+ vmov.64 q7, q11 -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++ vmov.64 q4, q10 ++ vmov.64 q5, q11 ++ bgt 1b ++ bx r6 ++.endm + -+ff_hevc_sao_edge_c_eo2_w64_neon_8: -+1: sub r1, r3 ++.macro edge_16b_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {q0}, [r1, :128], r3 ++ // load c ++ vld1.8 {q1}, [r1, :128], r3 ++1: subs r12, #1 ++ // load b ++ vld1.8 {q2}, [r1, :128], r3 ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ // copy c to a ++ vmov.64 q0, q1 ++ // copy b to c ++ vmov.64 q1, q2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_8bx2_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {d0}, [r1, :64], r3 ++ vld1.8 {d2}, [r1, :64], r3 ++ ++1: subs r12, #2 ++ @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vmov.64 d1, d2 ++ vld1.8 {d4}, [r1, :64], r3 ++ vld1.8 {d5}, [r1, :64], r3 ++ vmov.64 d3, d4 ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0], r2 ++ vst1.8 {d1}, [r0], r2 ++ ++ // copy c to a ++ vmov.64 d0, d4 ++ // copy b to c ++ vmov.64 d2, d5 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e1, body_fn ++ mov r6, lr ++debug_me: ++ sub r1, r3 ++ // load a ++ vld1.32 {d0[0]}, [r1], r3 ++ vld1.32 {d0[1]}, [r1], r3 ++ ++1: subs r12, #4 ++ @ Given the data duplication here we could probably do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vld1.32 {d4[0]}, [r1], r3 ++ vld1.32 {d4[1]}, [r1], r3 ++ vld1.32 {d5[0]}, [r1], r3 ++ vld1.32 {d5[1]}, [r1], r3 ++ ++ vmov.32 d1, d4 ++ vext.32 d2, d0, d4, #1 ++ vext.32 d3, d4, d5, #1 ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ ++ vmov.32 d0, d5 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_64b_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #32 ++ sub r3, #(32 - \pb) ++ ++1: sub r1, r3 + // load a + // TODO: fix unaligned load + // don't reload a like in eo1 -+ sub r1, #2 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #30 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ add r1, #2 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #34 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++ vld1.8 {q0-q1}, [r1]! ++ vld1.8 {q2-q3}, [r1], r3 ++ subs r12, #1 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ // load b ++ vld1.8 {q8-q9}, [r1]! ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #(64 + \pb) ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bgt 1b + -+ff_hevc_sao_edge_c_eo3_w64_neon_8: -+1: sub r1, r3 -+ // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ add r1, #2 -+ vld1.8 {q0-q1}, [r1]! 
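[Editor's note: the narrower variants pack several rows into one iteration rather than run a vector body half empty — an 8-pixel-wide block is processed two rows at a time (subs r12, #2), a 4-pixel one four rows at a time. A scalar sketch of the row pairing, illustrative only and reusing sao_edge_pixel_8:]

    /* 8-wide blocks: rows y and y+1 share one pass through the body, the
     * two rows occupying d0/d1 of the same q register in the assembler. */
    static void sao_edge_8wide_pair(uint8_t *dst, ptrdiff_t stride_dst,
                                    const uint8_t *a, const uint8_t *c,
                                    const uint8_t *b, ptrdiff_t stride_src,
                                    const int8_t xlat[5])
    {
        for (unsigned int row = 0; row != 2; row++)
            for (unsigned int x = 0; x != 8; x++)
                dst[row * stride_dst + x] =
                    sao_edge_pixel_8(a[row * stride_src + x],
                                     c[row * stride_src + x],
                                     b[row * stride_src + x], xlat);
    }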
-+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #34 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ sub r1, #2 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #30 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc -+ -+ -+.macro init_edge_32 -+ ldr r12, [sp, #4] // sao_offset_val_table -+ vld1.32 {d31}, [r12] -+ ldr r12, [sp] // height ++ add r3, #(32 - \pb) ++ bx r6 +.endm + -+.macro diff out0, tmp0, in0, in1 -+ vcgt.u8 \out0, \in1, \in0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 \tmp0, \in0, \in1 // a > c -> -1 , otherwise 0 -+ vsub.s8 \out0, \tmp0, \out0 // diff0 ++.macro edge_32bx2_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ ++1: sub r1, r3 ++ vld1.8 {q0-q1}, [r1], r3 ++ vld1.8 {q2-q3}, [r1] ++ subs r12, #2 ++ // load c ++ add r1, #\pb ++ vld1.8 {q4-q5}, [r1, :128], r3 ++ vld1.8 {q6-q7}, [r1, :128] ++ // load b ++ add r1, #\pb ++ vld1.8 {q8-q9}, [r1], r3 ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #(\pb * 2) ++ ++ bl \body_fn ++ ++ vst1.8 {q0-q1}, [r0], r2 ++ vst1.8 {q2-q3}, [r0], r2 ++ bgt 1b ++ ++ bx r6 +.endm + -+.macro table32 -+ vmov.s8 q10, #2 -+ vadd.s8 q0, q10 -+ vadd.s8 q1, q10 -+ vmov.s8 q10, #128 -+ vtbl.8 d0, {d31}, d0 -+ vadd.s8 q11, q2, q10 -+ vtbl.8 d1, {d31}, d1 -+ vadd.s8 q12, q3, q10 -+ vtbl.8 d2, {d31}, d2 -+ vqadd.s8 q11, q0 -+ vtbl.8 d3, {d31}, d3 -+ vqadd.s8 q12, q1 -+ vsub.s8 q0, q11, q10 -+ vsub.s8 q1, q12, q10 -+ vst1.8 {q0-q1}, [r0, :128], r2 ++.macro edge_16b_e2, body_fn, pb ++ mov r6, lr ++ add r3, #\pb ++ ++1: sub r1, r3 ++ // load a ++ vld1.8 {q0}, [r1], r3 ++ subs r12, #1 ++ // load c ++ vld1.8 {q1}, [r1, :128], r3 ++ // load b ++ vld1.8 {q2}, [r1] ++ sub r1, #\pb ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ bgt 1b ++ bx r6 +.endm + -+function ff_hevc_sao_edge_eo0_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {q4-q7} -+ sub r1, #4 -+1: subs r12, #1 -+ vld1.8 {q13-q14}, [r1]! 
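[Editor's note: the 4-pixel-wide path goes one step further — a whole 4x4 block fits in one 16-byte register, one 32-bit lane per row, so the loaders gather the needed source rows (six of them, y-1 through y+4, for the diagonal classes) a word at a time before calling the shared body. The gather in scalar form; the helper name is invented:]

    #include <stddef.h>
    #include <string.h>

    /* Pack four 4-pixel rows into one 16-byte "register". */
    static void gather_4x4(uint8_t out[16], const uint8_t *src,
                           ptrdiff_t stride)
    {
        for (int row = 0; row != 4; row++)   /* one 32-bit lane per row */
            memcpy(out + 4 * row, src + row * stride, 4);
    }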
-+ vld1.32 d30, [r1], r3 -+ sub r1, #32 -+ // a -+ vext.8 q0, q13, q14, #3 -+ vext.8 q1, q14, q15, #3 -+ vshr.u64 d24, d30, #24 -+ // c -+ vext.8 q2, q13, q14, #4 -+ vext.8 q3, q14, q15, #4 -+ vshr.u64 d16, d30, #32 -+ // diff0 -+ diff32 q13, q14, q4, q5, q0, q1, q2, q3 -+ diff d18, d25, d24, d16 -+ // -diff1 -+ vext.s8 q0, q13, q14, #1 -+ vext.s8 q1, q14, q9, #1 ++.macro edge_8bx2_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb + -+ vsub.s8 q0, q13, q0 //diff0 + diff1 -+ vsub.s8 q1, q14, q1 -+ table32 -+ bne 1b -+ vpop {q4-q7} ++1: sub r1, r3 ++ vld1.8 {d0}, [r1], r3 ++ vld1.8 {d1}, [r1] ++ subs r12, #2 ++ // load c ++ add r1, #\pb ++ vld1.8 {d2}, [r1, :64], r3 ++ vld1.8 {d3}, [r1, :64] ++ // load b ++ add r1, #\pb ++ vld1.8 {d4}, [r1], r3 ++ vld1.8 {d5}, [r1] ++ sub r1, #(\pb * 2) + -+ bx lr ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0], r2 ++ vst1.8 {d1}, [r0], r2 ++ bgt 1b ++ ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ ++1: sub r1, r3 ++ @ line 0 {d0[0], -, - } r1 lo ++ vld1.32 {d0[0]}, [r1], r3 ++ subs r12, #4 ++ @ Line 1 {d0[1], d2[0], - } r1 lo ++ vld1.32 {d0[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d2[0]}, [r1], r3 ++ @ Line 2 {d1[0], d2[1], d4[0]} r1 mid ++ vld1.32 {d2[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d1[0]}, [r1] ++ add r1, #\pb * 2 ++ vld1.32 {d4[0]}, [r1], r3 ++ @ Line 2 {d1[1], d3[0], d4[1]} r1 hi ++ vld1.32 {d4[1]}, [r1] ++ sub r1, #\pb * 2 ++ vld1.32 {d1[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[0]}, [r1], r3 ++ @ Line 3 {-, d3[1], d5[0]} r1 mid ++ vld1.32 {d3[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d5[0]}, [r1], r3 ++ @ Line 4 {-, -, d5[1]} r1 hi ++ vld1.32 {d5[1]}, [r1] ++ sub r1, #(\pb * 2) ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ bgt 1b ++ ++ bx r6 ++.endm ++ ++.macro edge_64b_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_64b_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_32bx2_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_32bx2_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_16b_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_16b_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_8bx2_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_8bx2_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_4bx4_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_4bx4_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_64b_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_64b_e0 \body_fn, \pb ++10: edge_64b_e1 \body_fn ++20: edge_64b_e2 \body_fn, \pb ++30: edge_64b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_32bx2_e0 \body_fn, \pb ++10: edge_32bx2_e1 \body_fn ++20: edge_32bx2_e2 \body_fn, \pb ++30: edge_32bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_16b_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_32bx2_e0 \body_fn_64b, \pb ++10: edge_32bx2_e1 \body_fn_64b ++20: edge_32bx2_e2 \body_fn_64b, \pb ++30: edge_32bx2_e3 \body_fn_64b, \pb ++5: edge_16b_e0 \body_fn_16b, \pb 
++15: edge_16b_e1 \body_fn_16b ++25: edge_16b_e2 \body_fn_16b, \pb ++35: edge_16b_e3 \body_fn_16b, \pb ++.endm ++ ++.macro edge_16b_8bx2_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++5: edge_8bx2_e0 \body_fn, \pb ++15: edge_8bx2_e1 \body_fn ++25: edge_8bx2_e2 \body_fn, \pb ++35: edge_8bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_8bx2_4bx4_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_8bx2_e0 \body_fn, \pb ++10: edge_8bx2_e1 \body_fn ++20: edge_8bx2_e2 \body_fn, \pb ++30: edge_8bx2_e3 \body_fn, \pb ++5: edge_4bx4_e0 \body_fn, \pb ++15: edge_4bx4_e1 \body_fn ++25: edge_4bx4_e2 \body_fn, \pb ++35: edge_4bx4_e3 \body_fn, \pb ++.endm ++ ++@ void ff_hevc_sao_edge_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_8_neon_8, export=1 ++ edge_16b_init 8, 0, 1, 99f ++99: ++ edge_8bx2_4bx4_bodies edge_16b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo1_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {q4-q7} -+ // load a -+ sub r1, r3 -+ vld1.8 {q0-q1}, [r1, :128], r3 -+ // load c -+ vld1.8 {q2-q3}, [r1, :128], r3 -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a ) -+1: subs r12, #1 -+ // load b -+ vld1.8 {q8-q9}, [r1, :128], r3 -+ diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b ) -+ vadd.s8 q0, q4, q12 //diff0 + diff1 -+ vadd.s8 q1, q5, q13 -+ table32 -+ // CMP ( c, a ) -+ vneg.s8 q12, q4 -+ vneg.s8 q13, q5 -+ // c -+ vmov.64 q2, q8 -+ vmov.64 q3, q9 -+ bne 1b -+ vpop {q4-q7} -+ bx lr ++@ void ff_hevc_sao_edge_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_16_neon_8, export=1 ++ edge_16b_init 8, 0, 0, 99f ++99: ++ edge_16b_bodies edge_16b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo2_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {d8-d15} -+ // load a -+ sub r1, r3 -+ sub r1, #8 -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {d24}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q0, q10, q11, #7 -+ vext.8 q1, q11, q12, #7 -+ // load c -+ vld1.8 {d9}, [r1, :64]! -+ vld1.8 {q2-q3}, [r1, :64], r3 -+ sub r1, #8 -+ vext.8 q4, q4, q2, #15 -+1: subs r12, #1 -+ // load b -+ vld1.8 {q10-q11}, [r1, :64]! 
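[Editor's note: the exported entry points are thin — the init macro loads and permutes the offset table, then indexes one of the inline .word jump tables with eo. When the caller may pass width 4 (check_w4), a second four-entry table sits immediately after the first and the dispatch skips 16 bytes forward. In C terms; the function-pointer shape is a paraphrase, not the actual assembler ABI:]

    typedef void (*edge_body_fn)(void);

    /* Dispatch as performed by edge_xxb_init: 'tab' mirrors the .word
     * table, eo selects the class body, and widths below 8 use the
     * second table ("cmp r12, #8 / addlt r6, #16" in the assembler). */
    static void sao_edge_dispatch(const edge_body_fn tab[8], int eo,
                                  int width, int check_w4)
    {
        const edge_body_fn *t = (check_w4 && width < 8) ? tab + 4 : tab;
        t[eo]();    /* eo in 0..3 */
    }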
-+ vld1.8 {q12}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q8, q10, q11, #9 -+ vext.8 q9, q11, q12, #9 -+ vext.8 q6, q10, q11, #8 -+ vext.8 q7, q11, q12, #8 -+ vext.8 q5, q10, q11, #7 -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 -+ table32 -+ // inputs for next loop iteration -+ // a -+ vmov.8 q0, q4 -+ vext.8 q1, q2, q3, #15 -+ // c -+ vmov.8 q2, q6 -+ vmov.8 q3, q7 -+ vmov.8 q4, q5 -+ bne 1b -+ vpop {d8-d15} -+ bx lr ++@ void ff_hevc_sao_edge_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_32_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo3_w32_neon_8, export=1 -+ init_edge_32 -+ sub r1, r3 -+ // load a -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {d24}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q0, q10, q11, #1 -+ vext.8 q1, q11, q12, #1 -+ // load c -+ vld1.8 {q2-q3}, [r1, :64]! -+ vld1.8 {d30}, [r1, :64], r3 -+ sub r1, #40 -+1: subs r12, #1 -+ // load b -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {q12}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q8, q10, q11, #7 -+ vext.8 q9, q11, q12, #7 -+ vext.8 q14, q12, q10, #7 ++@ void ff_hevc_sao_edge_64_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] + -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 ++function ff_hevc_sao_edge_64_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 1 ++endfunc + -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 -+ table32 ++@ ff_hevc_sao_edge_c_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] + -+ // inputs for next loop iteration -+ // a -+ vext.8 q0, q2, q3, #1 -+ vext.8 q1, q3, q15, #1 -+ // c -+ vext.8 q2, q8, q9, #1 -+ vext.8 q3, q9, q14, #1 -+ vext.8 d30, d28, d2, #1 -+ bne 1b -+ bx lr ++function ff_hevc_sao_edge_c_8_neon_8, export=1 ++ edge_16b_init 8, 1, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_8, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_16_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_32_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ void ff_hevc_sao_edge_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, 
#8] ++ ++function ff_hevc_sao_edge_8_neon_10, export=1 ++ edge_16b_init 10, 0, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_sao_edge_16_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_16_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_sao_edge_64_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++@ We simply split the 32 case into 2 vertical stripes ++@ and call the fns for w32 ++@ ++@ Calling code will always have src != dst so we don't have to worry ++@ about edge effects ++ ++function ff_hevc_sao_edge_64_neon_10, export=1 ++ edge_64b_init 10, 0, 1, 99f ++endfunc ++ ++@ void ff_hevc_sao_edge_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_32_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_8_neon_10, export=1 ++ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 ++99: ++ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4 ++endfunc ++ ++@ ff_hevc_sao_edge_c_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_32_neon_10, export=1 ++ edge_64b_init 10, 1, 1, 99f ++endfunc ++ ++ ++@ ff_hevc_sao_edge_c_16_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_16_neon_10, export=1 ++ edge_64b_init 10, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 4 +endfunc + diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h -index 57334df..7648294 100644 +index 57334df3fc..7648294965 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -443,6 +443,8 @@ enum AVCodecID { @@ -3692,7 +6969,7 @@ index 57334df..7648294 100644 * discarded by the caller from the end of the stream to get the original * audio without any trailing padding. 
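[Editor's note, before the C-side changes: the 10-bit width-64 and chroma width-32 entry points above deliberately have no bodies of their own — edge_xxb_init's do2 path runs the shared 64-byte body twice, 64 bytes apart, because those rows are 128 bytes wide. A sketch under that assumption; the body signature is invented:]

    /* do2: one 64-byte stripe, step forward 64 bytes, second stripe
     * (the push/pop of r0/r1 around the first blx in the assembler). */
    static void sao_edge_do2(uint8_t *dst, const uint8_t *src, int height,
                             void (*body)(uint8_t *dst, const uint8_t *src,
                                          int height))
    {
        body(dst, src, height);             /* left stripe */
        body(dst + 64, src + 64, height);   /* right stripe */
    }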
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h -index 1bf1c62..ccfa991 100644 +index 1bf1c620d6..ccfa991f60 100644 --- a/libavcodec/cabac.h +++ b/libavcodec/cabac.h @@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; @@ -3712,7 +6989,7 @@ index 1bf1c62..ccfa991 100644 const uint8_t *bytestream; const uint8_t *bytestream_end; diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c -index 9711019..9f99a2c 100644 +index 9711019e9d..9f99a2c927 100644 --- a/libavcodec/codec_desc.c +++ b/libavcodec/codec_desc.c @@ -1622,6 +1622,48 @@ static const AVCodecDescriptor codec_descriptors[] = { @@ -3765,7 +7042,7 @@ index 9711019..9f99a2c 100644 /* various PCM "codecs" */ { diff --git a/libavcodec/h264.h b/libavcodec/h264.h -index 86df5eb..22c4f1d 100644 +index 86df5eb9b3..22c4f1d82a 100644 --- a/libavcodec/h264.h +++ b/libavcodec/h264.h @@ -41,7 +41,9 @@ enum { @@ -3779,7 +7056,7 @@ index 86df5eb..22c4f1d 100644 #endif /* AVCODEC_H264_H */ diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c -index bc35a61..055828c 100644 +index 2564c6c6c3..f939fa3fc9 100644 --- a/libavcodec/h264_parser.c +++ b/libavcodec/h264_parser.c @@ -60,6 +60,8 @@ typedef struct H264ParseContext { @@ -3876,10 +7153,10 @@ index bc35a61..055828c 100644 + .split = h264_split, +}; diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h -index de77d2a..a63db2b 100644 +index de77d2ac43..2568fd88b3 100644 --- a/libavcodec/hevc.h +++ b/libavcodec/hevc.h -@@ -21,6 +21,45 @@ +@@ -21,6 +21,47 @@ #ifndef AVCODEC_HEVC_H #define AVCODEC_HEVC_H @@ -3895,8 +7172,6 @@ index de77d2a..a63db2b 100644 + #include "rpi_qpu.h" + #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU + -+ // Define RPI_WORKER to launch a worker thread for pixel processing tasks -+ #define RPI_WORKER + // By passing jobs to a worker thread we hope to be able to catch up during slow frames + // This has no effect unless RPI_WORKER is defined + // N.B. 
The extra thread count is effectively RPI_MAX_JOBS - 1 as @@ -3919,6 +7194,10 @@ index de77d2a..a63db2b 100644 + #define RPI_HEVC_SAND 0 + #endif + ++ ++ #define RPI_QPU_EMU_Y 0 ++ #define RPI_QPU_EMU_C 0 ++ + #define RPI_TSTATS 0 +#endif + @@ -3926,7 +7205,7 @@ index de77d2a..a63db2b 100644 * Table 7-3: NAL unit type codes */ diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c -index e27c54e..09727d9 100644 +index e27c54ed4b..925bccd188 100644 --- a/libavcodec/hevc_cabac.c +++ b/libavcodec/hevc_cabac.c @@ -21,6 +21,8 @@ @@ -3943,7 +7222,7 @@ index e27c54e..09727d9 100644 #include "hevcdec.h" +#ifdef RPI -+#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" +#endif + +// BY22 is probably faster than simple bypass if the processor has @@ -4288,7 +7567,7 @@ index e27c54e..09727d9 100644 { return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); } -@@ -968,90 +1229,395 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, +@@ -968,90 +1229,470 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); } @@ -4301,7 +7580,7 @@ index e27c54e..09727d9 100644 + +#ifndef coeff_abs_level_remaining_decode_bypass +static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param) -+{ + { + CABACContext * const c = &s->HEVClc->cc; + uint32_t y; + unsigned int prefix; @@ -4342,7 +7621,7 @@ index e27c54e..09727d9 100644 +#endif + +static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param) - { ++{ + CABACContext * const c = &s->HEVClc->cc; int prefix = 0; int suffix = 0; @@ -4418,7 +7697,7 @@ index e27c54e..09727d9 100644 + rv = (rv << 1) | b; + } + return rv; -+} + } +#endif + + @@ -4502,7 +7781,7 @@ index e27c54e..09727d9 100644 + (*stat_coeff)++; + else if (x == 0 && *stat_coeff > 0) + (*stat_coeff)--; - } ++} +#endif + + @@ -4559,22 +7838,21 @@ index e27c54e..09727d9 100644 + int * const pPrev_sig) +{ + while (--i >= 0) { -+ unsigned int x_cg = scan_x_cg[i]; -+ unsigned int y_cg = scan_y_cg[i]; ++ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag; ++ const unsigned int x_cg = scan_x_cg[i]; + + // For the flag decode we only care about Z/NZ but -+ // we use the full Right + Down * 2 when calculating -+ // significant coeff flags so we obtain it here -+ //. ++ // we use the full Right * 2 + Down when calculating ++ // significant coeff flags so we obtain it here. ++ // + // The group flag array is one longer than it needs to + // be so we don't need to check for y_cg limits -+ unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) | -+ (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1); ++ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); + + if (i == 0 || + significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig)) + { -+ significant_coeff_group_flag[y_cg] |= (1 << x_cg); ++ gf_y[0] |= (1 << x_cg); + *pPrev_sig = prev_sig; + break; + } @@ -4592,31 +7870,46 @@ index e27c54e..09727d9 100644 + unsigned int stride = frame->linesize[c_idx]; + unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; + unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; -+ const int is_sliced = rpi_sliced_frame(frame); ++ const int is_sliced = av_rpi_is_sand_frame(frame); + uint8_t * dst = !is_sliced ? + s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : + c_idx == 0 ? 
-+ rpi_sliced_frame_pos_y(frame, x, y) : -+ rpi_sliced_frame_pos_c(frame, x, y); ++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); + + if (s->enable_rpi) { -+ const unsigned int i = s->num_pred_cmds[s->pass0_job]; -+ HEVCPredCmd * const pc = s->univ_pred_cmds[s->pass0_job] + i - 1; ++ const unsigned int i = s->jb0->intra.n; ++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; + + if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && + pc->ta.dst == dst) + { -+ av_assert0(pc->size == log2_trafo_size && ++ av_assert1(pc->size == log2_trafo_size && + pc->c_idx == 1 && -+ pc->ta.buf + (1 << (log2_trafo_size * 2)) && + pc->ta.stride == stride); + + pc->type = RPI_PRED_ADD_RESIDUAL_C; + } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride); ++ ++ // Rewrite as add residual - must rewrite all fields as different union member ++ pc->type = RPI_PRED_ADD_RESIDUAL_V; ++ pc->c_idx = c_idx; ++ pc->ta.buf = coeffs; ++ pc->ta.dst = dst; ++ pc->ta.stride = stride; ++ pc->ta.dc = dc; ++ } + else + { + HEVCPredCmd * const cmd = pc + 1; -+ s->num_pred_cmds[s->pass0_job] = i + 1; ++ s->jb0->intra.n = i + 1; + + cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); + cmd->size = log2_trafo_size; @@ -4624,20 +7917,81 @@ index e27c54e..09727d9 100644 + cmd->ta.buf = coeffs; + cmd->ta.dst = dst; + cmd->ta.stride = stride; ++ cmd->ta.dc = 0; + } + } + else if (!is_sliced || c_idx == 0) { + s->hevcdsp.add_residual[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); + } +#if RPI_HEVC_SAND ++ // * These should probably never happen + else if (c_idx == 1) { -+ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); + } + else { -+ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); + } +#endif +} ++ ++ ++static void rpi_add_dc(HEVCContext * const s, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ const unsigned int stride = frame->linesize[c_idx]; ++ const unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; ++ const unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; ++ const int is_sliced = av_rpi_is_sand_frame(frame); ++ uint8_t * const dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? 
++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); ++ ++ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); ++ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); ++ ++ if (s->enable_rpi) { ++ const unsigned int i = s->jb0->intra.n; ++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->ta.dc = (int16_t)coeff; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride && ++ (pc->dc.dc & ~0xffff) == 0); ++ ++ pc->dc.dc |= (coeff << 16); ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ s->jb0->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_DC + c_idx; ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->dc.dst = dst; ++ cmd->dc.stride = stride; ++ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; ++ } ++ } ++} ++ ++ +#endif void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, @@ -4680,6 +8034,7 @@ index e27c54e..09727d9 100644 +#endif +#ifdef RPI + int use_vpu; ++ int use_dc = 0; +#endif + int16_t *coeffs; + uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero @@ -4701,7 +8056,6 @@ index e27c54e..09727d9 100644 + const int c_idx_nz = (c_idx != 0); + + int may_hide_sign; -+ // Derive QP for dequant if (!lc->cu.cu_transquant_bypass_flag) { @@ -4710,7 +8064,7 @@ index e27c54e..09727d9 100644 static const uint8_t rem6[51 + 4 * 6 + 1] = { 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, -@@ -1067,9 +1633,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1067,9 +1708,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, }; int qp_y = lc->qp_y; @@ -4731,7 +8085,7 @@ index e27c54e..09727d9 100644 } if (c_idx == 0) { -@@ -1102,39 +1678,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1102,39 +1753,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, qp += s->ps.sps->qp_bd_offset; } @@ -4822,7 +8176,7 @@ index e27c54e..09727d9 100644 &last_significant_coeff_x, &last_significant_coeff_y); if (last_significant_coeff_x > 3) { -@@ -1162,119 +1775,134 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1162,119 +1850,147 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int last_x_c = last_significant_coeff_x & 3; int last_y_c = last_significant_coeff_y & 3; @@ -4879,53 +8233,35 @@ index e27c54e..09727d9 100644 - for (i = num_last_subset; i >= 0; i--) { - int n, m; - int x_cg, y_cg, x_c, y_c, pos; -+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant -+ -+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; -+ -+ { -+ const unsigned int ccount = 1 << (log2_trafo_size * 2); -+#ifdef RPI -+ use_vpu = 0; -+ if (s->enable_rpi) { -+ use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4; -+ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 
0 : log2_trafo_size - 2, ccount); -+#if HAVE_NEON -+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); -+#else -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+#endif -+ } -+ else -+#endif -+ { -+ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+ } -+ } -+ -+ i = num_last_subset; -+ do { - int implicit_non_zero_coeff = 0; +- int implicit_non_zero_coeff = 0; - int64_t trans_coeff_level; - int prev_sig = 0; - int offset = i << 4; - int rice_init = 0; -+ int n_end; - - uint8_t significant_coeff_flag_idx[16]; -- uint8_t nb_significant_coeff_flag = 0; - +- uint8_t significant_coeff_flag_idx[16]; +- uint8_t nb_significant_coeff_flag = 0; ++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant + - x_cg = scan_x_cg[i]; - y_cg = scan_y_cg[i]; -- ++ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; + - if ((i < num_last_subset) && (i > 0)) { - int ctx_cg = 0; - if (x_cg < (1 << (log2_trafo_size - 2)) - 1) - ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg]; - if (y_cg < (1 << (log2_trafo_size - 2)) - 1) - ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1]; -- ++ { ++ const unsigned int ccount = 1 << (log2_trafo_size * 2); ++#ifdef RPI ++ use_vpu = 0; ++ if (s->enable_rpi) { ++ const int special = trans_skip_or_bypass || lc->tu.cross_pf; // These need special processinmg ++ use_dc = (num_coeff == 1) && !special && ++ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2); + - significant_coeff_group_flag[x_cg][y_cg] = - significant_coeff_group_flag_decode(s, c_idx, ctx_cg); - implicit_non_zero_coeff = 1; @@ -4933,9 +8269,37 @@ index e27c54e..09727d9 100644 - significant_coeff_group_flag[x_cg][y_cg] = - ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) || - (x_cg == 0 && y_cg == 0)); -- } -- ++ if (use_dc) { ++ // Just need a little empty space ++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++ // No need to clear ++ } ++ else ++ { ++ use_vpu = !special && log2_trafo_size >= 4; ++ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); ++#if HAVE_NEON ++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); ++#else ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++#endif ++ } + } ++ else ++#endif ++ { ++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++ } ++ } + - last_scan_pos = num_coeff - offset - 1; ++ i = num_last_subset; ++ do { ++ int implicit_non_zero_coeff = 0; ++ int n_end; ++ ++ uint8_t significant_coeff_flag_idx[16]; + unsigned int nb_significant_coeff_flag = 0; if (i == num_last_subset) { @@ -4967,23 +8331,24 @@ index e27c54e..09727d9 100644 + H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 + V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 + }; ++ // N.B. 
prev_sig = Right * 2 + Down + static const uint8_t ctx_idx_maps[3][4][16] = { + { + D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + }, + { + H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + }, + { + V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + } }; @@ -5021,7 +8386,7 @@ index e27c54e..09727d9 100644 if (log2_trafo_size == 3) { scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; } else { -@@ -1288,34 +1916,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1288,34 +2004,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } } @@ -5070,11 +8435,12 @@ index e27c54e..09727d9 100644 significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; nb_significant_coeff_flag++; } -@@ -1325,141 +1949,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1325,141 +2037,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } - n_end = nb_significant_coeff_flag; +- + if (nb_significant_coeff_flag != 0) { + const unsigned int gt1_idx_delta = (c_idx_nz << 2) | + ((i != 0 && !c_idx_nz) ? 2 : 0) | @@ -5122,9 +8488,6 @@ index e27c54e..09727d9 100644 + coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2); + } -+ // Probably not worth the overhead of starting by22 for just one value -+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); - - if (n_end) { - int first_nz_pos_in_cg; - int last_nz_pos_in_cg; @@ -5135,6 +8498,9 @@ index e27c54e..09727d9 100644 - int sum_abs = 0; - int sign_hidden; - int sb_type; ++ // Probably not worth the overhead of starting by22 for just one value ++ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); + + if (coded_val) + { + if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { @@ -5145,18 +8511,13 @@ index e27c54e..09727d9 100644 + const unsigned int c_rice_param = *stat_coeff >> 2; + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); +- // initialize first elem of coeff_bas_level_greater1_flag +- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; + trans_coeff_level = 3 + last_coeff_abs_level_remaining; + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + } + } -- // initialize first elem of coeff_bas_level_greater1_flag -- int ctx_set = (i > 0 && c_idx == 0) ? 
2 : 0; -+ { -+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; -+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; -+ const unsigned int scale_m = blk_scale[xy_off->scale]; - - if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { - if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) - sb_type = 2 * (c_idx == 0 ? 1 : 0); @@ -5164,7 +8525,11 @@ index e27c54e..09727d9 100644 - sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1; - c_rice_param = lc->stat_coeff[sb_type] / 4; - } -- ++ { ++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; ++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; ++ const unsigned int scale_m = blk_scale[xy_off->scale]; + - if (!(i == num_last_subset) && greater1_ctx == 0) - ctx_set++; - greater1_ctx = 1; @@ -5246,9 +8611,6 @@ index e27c54e..09727d9 100644 + { + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param); + const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; -+ -+ sum_abs += last_coeff_abs_level_remaining + 1; -+ *level = trans_coeff_level; - for (m = 0; m < n_end; m++) { - n = significant_coeff_flag_idx[m]; @@ -5269,6 +8631,9 @@ index e27c54e..09727d9 100644 - if (lc->stat_coeff[sb_type] > 0) - lc->stat_coeff[sb_type]--; - rice_init = 1; ++ sum_abs += last_coeff_abs_level_remaining + 1; ++ *level = trans_coeff_level; ++ + if (stat_coeff != NULL) + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + stat_coeff = NULL; @@ -5373,7 +8738,7 @@ index e27c54e..09727d9 100644 if (lc->cu.cu_transquant_bypass_flag) { if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1469,7 +2137,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1469,7 +2225,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); } } else { @@ -5382,7 +8747,7 @@ index e27c54e..09727d9 100644 int rot = s->ps.sps->transform_skip_rotation_enabled_flag && log2_trafo_size == 2 && lc->cu.pred_mode == MODE_INTRA; -@@ -1489,7 +2157,13 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1489,10 +2245,23 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { s->hevcdsp.transform_4x4_luma(coeffs); @@ -5396,13 +8761,27 @@ index e27c54e..09727d9 100644 + { int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); if (max_xy == 0) - s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); -@@ -1512,7 +2186,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); ++ { ++#ifdef RPI ++ if (use_dc) ++ rpi_add_dc(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ else ++#endif ++ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); ++ } + else { + int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; + if (max_xy < 4) +@@ -1512,7 +2281,14 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); } } +#ifdef RPI -+ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ if (!use_dc) ++ { ++ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ } +#else s->hevcdsp.add_residual[log2_trafo_size-2](dst, coeffs, stride); +#endif @@ -5410,7 +8789,7 @@ index e27c54e..09727d9 100644 void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int 
log2_cb_size) diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c -index 14e7c8d..0256b01 100644 +index b53f4cc721..b56f4d20f6 100644 --- a/libavcodec/hevc_filter.c +++ b/libavcodec/hevc_filter.c @@ -22,6 +22,12 @@ @@ -5426,26 +8805,31 @@ index 14e7c8d..0256b01 100644 #include "libavutil/common.h" #include "libavutil/internal.h" -@@ -30,6 +36,11 @@ +@@ -30,6 +36,16 @@ #include "bit_depth_template.c" +#ifdef RPI +#include "rpi_qpu.h" ++#endif ++#if RPI_HEVC_SAND +#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" ++#else ++#define RPI_ZC_SAND_8_IN_10_BUF 0 +#endif + #define LUMA 0 #define CB 1 #define CR 2 -@@ -138,6 +149,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) +@@ -138,6 +154,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; } +static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) +{ -+#ifdef RPI -+ return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift; ++#if RPI_HEVC_SAND ++ return c_idx != 0 && av_rpi_is_sand_frame(s->frame) ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; +#else + return s->ps.sps->pixel_shift; +#endif @@ -5454,7 +8838,75 @@ index 14e7c8d..0256b01 100644 static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, ptrdiff_t stride_dst, ptrdiff_t stride_src) { -@@ -192,7 +212,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, +@@ -160,12 +185,21 @@ int i, j; + } + } + ++// "DSP" these? + static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) + { +- if (pixel_shift) +- *(uint16_t *)dst = *(uint16_t *)src; +- else +- *dst = *src; ++ switch (pixel_shift) ++ { ++ case 2: ++ *(uint32_t *)dst = *(uint32_t *)src; ++ break; ++ case 1: ++ *(uint16_t *)dst = *(uint16_t *)src; ++ break; ++ default: ++ *dst = *src; ++ break; ++ } + } + + static void copy_vert(uint8_t *dst, const uint8_t *src, +@@ -173,18 +207,29 @@ static void copy_vert(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride_dst, ptrdiff_t stride_src) + { + int i; +- if (pixel_shift == 0) { +- for (i = 0; i < height; i++) { +- *dst = *src; +- dst += stride_dst; +- src += stride_src; +- } +- } else { +- for (i = 0; i < height; i++) { +- *(uint16_t *)dst = *(uint16_t *)src; +- dst += stride_dst; +- src += stride_src; +- } ++ switch (pixel_shift) ++ { ++ case 2: ++ for (i = 0; i < height; i++) { ++ *(uint32_t *)dst = *(uint32_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ case 1: ++ for (i = 0; i < height; i++) { ++ *(uint16_t *)dst = *(uint16_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ default: ++ for (i = 0; i < height; i++) { ++ *dst = *src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; + } + } + +@@ -192,7 +237,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, ptrdiff_t stride_src, int x, int y, int width, int height, int c_idx, int x_ctb, int y_ctb) { @@ -5463,7 +8915,7 @@ index 14e7c8d..0256b01 100644 int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; -@@ -223,13 +243,14 @@ static void restore_tqb_pixels(HEVCContext *s, +@@ -223,13 +268,14 @@ static void restore_tqb_pixels(HEVCContext *s, int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); @@ -5481,21 +8933,27 @@ index 14e7c8d..0256b01 100644 for (n = 0; n < (min_pu_size >> vshift); n++) { 
memcpy(src, dst, len); src += stride_src; -@@ -245,7 +266,7 @@ static void restore_tqb_pixels(HEVCContext *s, +@@ -245,7 +291,13 @@ static void restore_tqb_pixels(HEVCContext *s, static void sao_filter_CTB(HEVCContext *s, int x, int y) { - static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; ++#if SAO_FILTER_N == 5 + static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#elif SAO_FILTER_N == 6 ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#else ++#error Confused by size of sao fn array ++#endif HEVCLocalContext *lc = s->HEVClc; int c_idx; int edges[4]; // 0 left 1 top 2 right 3 bottom -@@ -266,12 +287,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -266,12 +318,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) uint8_t right_tile_edge = 0; uint8_t up_tile_edge = 0; uint8_t bottom_tile_edge = 0; -+#ifdef RPI -+ const int sliced = rpi_sliced_frame(s->frame); ++#if RPI_HEVC_SAND ++ const int sliced = av_rpi_is_sand_frame(s->frame); + const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); +#else + const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1); @@ -5513,7 +8971,7 @@ index 14e7c8d..0256b01 100644 if (restore) { if (!edges[0]) { left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -@@ -303,7 +334,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -303,7 +365,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) } } @@ -5522,7 +8980,7 @@ index 14e7c8d..0256b01 100644 int x0 = x >> s->ps.sps->hshift[c_idx]; int y0 = y >> s->ps.sps->vshift[c_idx]; ptrdiff_t stride_src = s->frame->linesize[c_idx]; -@@ -312,28 +343,82 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -312,28 +374,84 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; @@ -5530,24 +8988,24 @@ index 14e7c8d..0256b01 100644 ptrdiff_t stride_dst; uint8_t *dst; -+#ifdef RPI -+ const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift; ++#if RPI_HEVC_SAND ++ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); + const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; + uint8_t * const src = !sliced ? -+ &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] : ++ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : + c_idx == 0 ? -+ rpi_sliced_frame_pos_y(s->frame, x0, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0, y0); + const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : + !sliced ? src - (1 << sh) : + c_idx == 0 ? -+ rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); + const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : + !sliced ? src + (width << sh) : + c_idx == 0 ? 
-+ rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0 + width, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); + + + if (sliced && c_idx > 1) { @@ -5578,7 +9036,7 @@ index 14e7c8d..0256b01 100644 + dst = lc->edge_emu_buffer; + stride_dst = 2*MAX_PB_SIZE; + copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { + s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, @@ -5599,9 +9057,11 @@ index 14e7c8d..0256b01 100644 - s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, - sao->offset_val[c_idx], sao->band_position[c_idx], - width, height); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { ++// printf("x,y=%d,%d data[1]=%p, src=%p\n", x0, y0, s->frame->data[1], src); ++ + s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, + sao->offset_val[1], sao->band_position[1], + sao->offset_val[2], sao->band_position[2], @@ -5617,7 +9077,7 @@ index 14e7c8d..0256b01 100644 } sao->type_idx[c_idx] = SAO_APPLIED; break; -@@ -341,108 +426,117 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -341,108 +459,118 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) { int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; @@ -5756,7 +9216,7 @@ index 14e7c8d..0256b01 100644 - vert_edge, - horiz_edge, - diag_edge); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { + // Class always the same for both U & V (which is just as well :-)) @@ -5786,18 +9246,42 @@ index 14e7c8d..0256b01 100644 + horiz_edge, + diag_edge); + } ++ // ??? Does this actually work for chroma ??? restore_tqb_pixels(s, src, dst, stride_src, stride_dst, x, y, width, height, c_idx); sao->type_idx[c_idx] = SAO_APPLIED; -@@ -452,6 +546,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -450,8 +578,30 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + } + } } ++ ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && ++ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2])) ++ { ++ const unsigned int stride1 = s->frame->linesize[0]; ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); ++ const unsigned int xoff = (x >> 8) * stride2 * stride1; ++ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); ++ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; ++ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; ++ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; ++ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; ++ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); ++ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y; ++ ++// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); ++ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); ++ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); ++ } ++#endif } +// Returns 2 or 0. 
static int get_pcm(HEVCContext *s, int x, int y) { int log2_min_pu_size = s->ps.sps->log2_min_pu_size; -@@ -478,7 +573,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -478,7 +628,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) uint8_t *src; int x, y; int chroma, beta; @@ -5806,7 +9290,7 @@ index 14e7c8d..0256b01 100644 uint8_t no_p[2] = { 0 }; uint8_t no_q[2] = { 0 }; -@@ -495,6 +590,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -495,6 +645,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->ps.sps->pcm.loop_filter_disable_flag) || s->ps.pps->transquant_bypass_enable_flag; @@ -5822,7 +9306,7 @@ index 14e7c8d..0256b01 100644 if (x0) { left_tc_offset = s->deblock[ctb - 1].tc_offset; left_beta_offset = s->deblock[ctb - 1].beta_offset; -@@ -528,19 +632,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -528,19 +687,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; @@ -5840,14 +9324,14 @@ index 14e7c8d..0256b01 100644 - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); + } -+#ifdef RPI -+ if (rpi_sliced_frame(s->frame)) { ++#if RPI_HEVC_SAND ++ if (av_rpi_is_sand_frame(s->frame)) { + + // This copes properly with no_p/no_q -+ s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y), ++ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), + s->frame->linesize[LUMA], + beta, tc, no_p, no_q, -+ rpi_sliced_frame_pos_y(s->frame, x - 4, y)); ++ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); + } + else +#endif @@ -5882,21 +9366,21 @@ index 14e7c8d..0256b01 100644 } } -@@ -560,7 +696,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -560,7 +751,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; - src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_y(s->frame, x, y) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? ++ av_rpi_sand_frame_pos_y(s->frame, x, y) : +#endif + &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x, y - 1); no_p[1] = get_pcm(s, x + 4, y - 1); -@@ -570,6 +711,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -570,6 +766,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[LUMA], beta, tc, no_p, no_q); } else @@ -5916,17 +9400,19 @@ index 14e7c8d..0256b01 100644 s->hevcdsp.hevc_h_loop_filter_luma(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q); -@@ -578,6 +732,91 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -578,6 +787,96 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } if (s->ps.sps->chroma_format_idc) { -+#ifdef RPI -+ if (rpi_sliced_frame(s->frame)) { ++#if RPI_HEVC_SAND ++ if (av_rpi_is_sand_frame(s->frame)) { + const int v = 2; + const int h = 2; + + // vertical filtering chroma + for (y = y0; y < y_end; y += 8 * v) { ++// const int demi_y = y + 4 * v >= s->ps.sps->height; ++ const int demi_y = 0; + for (x = x0 ? 
x0 : 8 * h; x < x_end; x += 8 * h) { + const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; + const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; @@ -5934,7 +9420,7 @@ index 14e7c8d..0256b01 100644 + if ((bs0 == 2) || (bs1 == 2)) { + const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; + const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; -+ unsigned int no_f = 0; ++ unsigned int no_f = !demi_y ? 0 : 2 | 8; + + // tc_offset here should be set to cur_tc_offset I think + const uint32_t tc4 = @@ -5954,10 +9440,10 @@ index 14e7c8d..0256b01 100644 + continue; + } + -+ s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + s->frame->linesize[1], + tc4, -+ rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), + no_f); + } + } @@ -5972,6 +9458,9 @@ index 14e7c8d..0256b01 100644 + x_end2 = x_end - 8 * h; + + for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { ++// const int demi_x = x + 4 * v >= s->ps.sps->width; ++ const int demi_x = 0; ++ + const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; + const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; + if ((bs0 == 2) || (bs1 == 2)) { @@ -5980,7 +9469,7 @@ index 14e7c8d..0256b01 100644 + const uint32_t tc4 = + ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | + ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); -+ unsigned int no_f = 0; ++ unsigned int no_f = !demi_x ? 0 : 2 | 8; + + if (tc4 == 0) + continue; @@ -5996,7 +9485,7 @@ index 14e7c8d..0256b01 100644 + continue; + } + -+ s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + s->frame->linesize[1], + tc4, no_f); + } @@ -6008,21 +9497,21 @@ index 14e7c8d..0256b01 100644 for (chroma = 1; chroma <= 2; chroma++) { int h = 1 << s->ps.sps->hshift[chroma]; int v = 1 << s->ps.sps->vshift[chroma]; -@@ -594,7 +833,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -594,7 +893,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0; c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0; - src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? 
++ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +#endif + &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x - 1, y); no_p[1] = get_pcm(s, x - 1, y + (4 * v)); -@@ -604,9 +848,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -604,9 +908,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -6046,21 +9535,21 @@ index 14e7c8d..0256b01 100644 } } -@@ -627,7 +885,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -627,7 +945,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0; c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0; - src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? ++ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +#endif + &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x, y - 1); no_p[1] = get_pcm(s, x + (4 * h), y - 1); -@@ -637,6 +900,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -637,6 +960,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -6080,7 +9569,7 @@ index 14e7c8d..0256b01 100644 s->hevcdsp.hevc_h_loop_filter_chroma(src, s->frame->linesize[chroma], c_tc, no_p, no_q); -@@ -647,69 +923,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -647,69 +983,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } } @@ -6150,7 +9639,7 @@ index 14e7c8d..0256b01 100644 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_trafo_size) -@@ -720,10 +933,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -720,10 +993,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_min_tu_size = s->ps.sps->log2_min_tb_size; int min_pu_width = s->ps.sps->min_pu_width; int min_tu_width = s->ps.sps->min_tb_width; @@ -6176,7 +9665,7 @@ index 14e7c8d..0256b01 100644 boundary_upper = y0 > 0 && !(y0 & 7); if (boundary_upper && -@@ -735,34 +960,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -735,34 +1020,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_upper = 0; @@ -6253,7 +9742,7 @@ index 14e7c8d..0256b01 100644 boundary_left = x0 > 0 && !(x0 & 7); if (boundary_left && ((!s->sh.slice_loop_filter_across_slices_enabled_flag && -@@ -773,64 +1020,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -773,64 +1080,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_left = 0; @@ -6356,7 +9845,7 @@ index 14e7c8d..0256b01 100644 } } } -@@ 
-839,11 +1076,105 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -839,11 +1136,105 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, #undef CB #undef CR @@ -6425,7 +9914,7 @@ index 14e7c8d..0256b01 100644 + // Call VPU + { + const vpu_qpu_job_h vqj = vpu_qpu_job_new(); -+ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands ++ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(s->ps.sps->bit_depth), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands + vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id); + vpu_qpu_job_finish(vqj); + } @@ -6462,61 +9951,167 @@ index 14e7c8d..0256b01 100644 if (s->ps.sps->sao_enabled) { int y_end = y >= s->ps.sps->height - ctb_size; if (y && x) -@@ -852,16 +1183,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +@@ -852,16 +1243,45 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) sao_filter_CTB(s, x - ctb_size, y); if (y && x_end) { sao_filter_CTB(s, x, y - ctb_size); - if (s->threads_type & FF_THREAD_FRAME ) +- ff_thread_report_progress(&s->ref->tf, y, 0); + if (s->threads_type == FF_THREAD_FRAME ) { +#if RPI_INTER + rpi_flush_ref_frame_progress(s,&s->ref->tf, y); +#endif - ff_thread_report_progress(&s->ref->tf, y, 0); ++ ff_hevc_progress_signal_recon(s, y); + } } if (x_end && y_end) { sao_filter_CTB(s, x , y); - if (s->threads_type & FF_THREAD_FRAME ) +- ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); + if (s->threads_type == FF_THREAD_FRAME ) { +#if RPI_INTER + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size); +#endif - ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); ++ ff_hevc_progress_signal_recon(s, y + ctb_size); + } -+ } + } +- } else if (s->threads_type & FF_THREAD_FRAME && x_end) +- ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); + } else if (s->threads_type == FF_THREAD_FRAME && x_end) { + //int newh = y + ctb_size - 4; + //int currh = s->ref->tf.progress->data[0]; + //if (((y + ctb_size)&63)==0) +#ifdef RPI_DEBLOCK_VPU + if (s->enable_rpi_deblock) { -+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi -+ if (done_deblock) { -+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); -+ } ++ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi ++ if (done_deblock) { ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); ++ } + } else { +#if RPI_INTER -+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +#endif -+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); - } -- } else if (s->threads_type & FF_THREAD_FRAME && x_end) ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); ++ } +#else +#if RPI_INTER + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); -+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi +#endif - ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); +#endif + } } void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) +diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c +index a8f7876b59..1c6f15bde3 100644 +--- a/libavcodec/hevc_mvs.c ++++ b/libavcodec/hevc_mvs.c +@@ -112,7 +112,7 @@ static av_always_inline int compare_mv_ref_idx(struct MvField A, struct MvField + return 0; + } 
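/* For reference, a per-component sketch of the temporal MV scaling that
 * mv_scale() below implements (HEVC 8.5.3.1.8). td and tb are POC
 * distances (clipped to int8 range in the full function) and av_clip()/
 * av_clip_int16() are the usual libavutil helpers:
 *
 *     int tx    = (0x4000 + abs(td / 2)) / td;               // ~16384/td
 *     int scale = av_clip((tb * tx + 32) >> 6, -4096, 4095); // tb/td in Q8
 *     comp      = av_clip_int16((scale * v + 127 + (scale * v < 0)) >> 8);
 *
 * The constification in this hunk leaves that arithmetic unchanged. */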
+ +-static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) ++static av_always_inline void mv_scale(Mv * const dst, const Mv * const src, int td, int tb) + { + int tx, scale_factor; + +@@ -126,10 +126,10 @@ static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) + (scale_factor * src->y < 0)) >> 8); + } + +-static int check_mvset(Mv *mvLXCol, Mv *mvCol, +- int colPic, int poc, +- RefPicList *refPicList, int X, int refIdxLx, +- RefPicList *refPicList_col, int listCol, int refidxCol) ++static int check_mvset(Mv * const mvLXCol, const Mv * const mvCol, ++ const int colPic, const int poc, ++ const RefPicList * const refPicList, const int X, const int refIdxLx, ++ const RefPicList * const refPicList_col, const int listCol, const int refidxCol) + { + int cur_lt = refPicList[X].isLongTerm[refIdxLx]; + int col_lt = refPicList_col[listCol].isLongTerm[refidxCol]; +@@ -160,11 +160,11 @@ static int check_mvset(Mv *mvLXCol, Mv *mvCol, + refPicList_col, L ## l, temp_col.ref_idx[l]) + + // derive the motion vectors section 8.5.3.1.8 +-static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, +- int refIdxLx, Mv *mvLXCol, int X, +- int colPic, RefPicList *refPicList_col) ++static int derive_temporal_colocated_mvs(const HEVCContext * const s, const MvField temp_col, ++ const int refIdxLx, Mv * const mvLXCol, const int X, ++ const int colPic, const RefPicList * const refPicList_col) + { +- RefPicList *refPicList = s->ref->refPicList; ++ const RefPicList * const refPicList = s->ref->refPicList; + + if (temp_col.pred_flag == PF_INTRA) + return 0; +@@ -215,20 +215,20 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, + /* + * 8.5.3.1.7 temporal luma motion vector prediction + */ +-static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, +- int nPbW, int nPbH, int refIdxLx, +- Mv *mvLXCol, int X) ++static int temporal_luma_motion_vector(HEVCContext * const s, const int x0, const int y0, ++ const int nPbW, const int nPbH, const int refIdxLx, ++ Mv * const mvLXCol, const int X) + { + MvField *tab_mvf; + MvField temp_col; + int x, y, x_pu, y_pu; +- int min_pu_width = s->ps.sps->min_pu_width; ++ const int min_pu_width = s->ps.sps->min_pu_width; + int availableFlagLXCol = 0; + int colPic; + +- HEVCFrame *ref = s->ref->collocated_ref; ++ HEVCFrame * const ref = s->ref->collocated_ref; + +- if (!ref) { ++ if (ref == NULL || ref->tab_mvf == NULL) { + memset(mvLXCol, 0, sizeof(*mvLXCol)); + return 0; + } +@@ -240,14 +240,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, + x = x0 + nPbW; + y = y0 + nPbH; + +- if (tab_mvf && +- (y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && ++ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && + y < s->ps.sps->height && + x < s->ps.sps->width) { + x &= ~15; + y &= ~15; + if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_await_progress(&ref->tf, y, 0); ++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); + x_pu = x >> s->ps.sps->log2_min_pu_size; + y_pu = y >> s->ps.sps->log2_min_pu_size; + temp_col = TAB_MVF(x_pu, y_pu); +@@ -255,13 +254,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, + } + + // derive center collocated motion vector +- if (tab_mvf && !availableFlagLXCol) { ++ if (!availableFlagLXCol) { + x = x0 + (nPbW >> 1); + y = y0 + (nPbH >> 1); + x &= ~15; + y &= ~15; + if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_await_progress(&ref->tf, y, 0); ++ ff_hevc_progress_wait_mv(s, s->jb0, 
ref, y); + x_pu = x >> s->ps.sps->log2_min_pu_size; + y_pu = y >> s->ps.sps->log2_min_pu_size; + temp_col = TAB_MVF(x_pu, y_pu); diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c -index acd55cc..c1716c2 100644 +index f2c26c4598..74e152d4b3 100644 --- a/libavcodec/hevc_ps.c +++ b/libavcodec/hevc_ps.c -@@ -780,7 +780,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) +@@ -819,7 +819,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) switch (sps->bit_depth) { case 8: if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8; @@ -6529,19 +10124,66 @@ index acd55cc..c1716c2 100644 if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P; if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P; break; -@@ -1001,6 +1006,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, - sps->amp_enabled_flag = get_bits1(gb); - sps->sao_enabled = get_bits1(gb); +@@ -831,7 +836,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) + break; + case 10: + if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY10; ++#if RPI_HEVC_SAND ++ // *** Horrid kludge s.t. we start out with sand format ++ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND64_10 : AV_PIX_FMT_YUV420P10; ++#else + if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P10; ++#endif + if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P10; + if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P10; + break; +@@ -1097,7 +1107,6 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, + skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); + if (sps_extension_flag[0]) { + int extended_precision_processing_flag; +- int high_precision_offsets_enabled_flag; + int cabac_bypass_alignment_enabled_flag; -+ av_log(avctx, AV_LOG_INFO, "sao_enabled=%d\n", sps->sao_enabled); -+ - sps->pcm_enabled_flag = get_bits1(gb); - if (sps->pcm_enabled_flag) { - sps->pcm.bit_depth = get_bits(gb, 4) + 1; + sps->transform_skip_rotation_enabled_flag = get_bits1(gb); +@@ -1112,10 +1121,10 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, + "extended_precision_processing_flag not yet implemented\n"); + + sps->intra_smoothing_disabled_flag = get_bits1(gb); +- high_precision_offsets_enabled_flag = get_bits1(gb); +- if (high_precision_offsets_enabled_flag) ++ sps->high_precision_offsets_enabled_flag = get_bits1(gb); ++ if (sps->high_precision_offsets_enabled_flag) + av_log(avctx, AV_LOG_WARNING, +- "high_precision_offsets_enabled_flag not yet implemented\n"); ++ "high_precision_offsets_enabled_flag not fully implemented\n"); + + sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); + +diff --git a/libavcodec/hevc_ps.h b/libavcodec/hevc_ps.h +index 44de3980e1..f45d28cd80 100644 +--- a/libavcodec/hevc_ps.h ++++ b/libavcodec/hevc_ps.h +@@ -206,6 +206,7 @@ typedef struct HEVCSPS { + int implicit_rdpcm_enabled_flag; + int explicit_rdpcm_enabled_flag; + int intra_smoothing_disabled_flag; ++ int high_precision_offsets_enabled_flag; + int persistent_rice_adaptation_enabled_flag; + + ///< coded frame dimension in various units diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c -index 9103c84..eb26e7d 100644 +index d2759ba5f5..1dcc238c5a 100644 --- a/libavcodec/hevc_refs.c +++ b/libavcodec/hevc_refs.c +@@ -23,7 +23,7 @@ + + #include "libavutil/avassert.h" + #include "libavutil/pixdesc.h" +- 
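/* (rpi_sand_fns.h is added for av_rpi_is_sand_format(), used by
 * ff_hevc_output_frame() below: a sand frame cannot be cropped by
 * offsetting its plane pointers, so the conformance window is exported
 * as side data instead.) */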
++#include "libavutil/rpi_sand_fns.h" + #include "internal.h" + #include "thread.h" + #include "hevc.h" @@ -206,7 +206,8 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) HEVCFrame *frame = &s->DPB[min_idx]; AVFrame *dst = out; @@ -6562,7 +10204,7 @@ index 9103c84..eb26e7d 100644 - int off = ((frame->window.left_offset >> hshift) << pixel_shift) + - (frame->window.top_offset >> vshift) * dst->linesize[i]; - dst->data[i] += off; -+ if (fmt == AV_PIX_FMT_SAND128) ++ if (av_rpi_is_sand_format(fmt)) + { + // Sand cannot be windowed by offset so add side data if we have an offset + const HEVCWindow * const window = &frame->window; @@ -6588,11 +10230,21 @@ index 9103c84..eb26e7d 100644 } av_log(s->avctx, AV_LOG_DEBUG, "Output frame with POC %d.\n", frame->poc); +@@ -427,8 +445,7 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc) + frame->sequence = s->seq_decode; + frame->flags = 0; + +- if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_report_progress(&frame->tf, INT_MAX, 0); ++ ff_hevc_progress_set_all_done(frame); + + return frame; + } diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c -index f9e8ff0..8a3d874 100644 +index 5579a4df43..fd48468c6c 100644 --- a/libavcodec/hevcdec.c +++ b/libavcodec/hevcdec.c -@@ -42,8 +42,207 @@ +@@ -42,8 +42,346 @@ #include "hevcdec.h" #include "profiles.h" @@ -6600,26 +10252,17 @@ index f9e8ff0..8a3d874 100644 + #include "rpi_qpu.h" + #include "rpi_shader.h" + #include "rpi_shader_cmd.h" ++ #include "rpi_shader_template.h" + #include "rpi_zc.h" ++ #include "libavutil/rpi_sand_fns.h" + + // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory + #define RPI_CACHE_UNIF_MVS 1 + -+ // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*) -+ //#define RPI_SIMULATE_QPUS -+ #ifdef RPI_WORKER -+ #include "pthread.h" -+ #endif -+ ++ #include "pthread.h" + #include "libavutil/atomic.h" + + static void worker_core(HEVCContext * const s); -+ -+ // We can pred any block height, but caching may make some heights better than others -+ // Currently it doesn't seem to make a lot of difference -+ // 0 => any height -+ #define Y_P_MAX_H 0 -+ #define Y_B_MAX_H 0 +#endif + +#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards @@ -6644,14 +10287,15 @@ index f9e8ff0..8a3d874 100644 + +// UV still has min 4x4 pred +// Allow for even spread +1 for setup, +1 for rounding -+// If we have load sharingw e will want different (bigger) numbers and/or a non-constant chunk size ++// As we have load sharing this can (in theory) be exceeded so we have to ++// check after each CTU, but it is a good base size + +// Worst case (all 4x4) commands per CTU +#define QPU_Y_CMD_PER_CTU_MAX (8 * 8) +#define QPU_C_CMD_PER_CTU_MAX (4 * 4) + -+#define UV_COMMANDS_PER_QPU (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 / QPU_N_UV + 2) -+#define Y_COMMANDS_PER_QPU (((RPI_MAX_WIDTH * 64) / (4 * 4)) / QPU_N_Y + 2) ++#define QPU_C_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX) ++#define QPU_Y_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX) + +// The QPU code for UV blocks only works up to a block width of 8 +#define RPI_CHROMA_BLOCK_WIDTH 8 @@ -6679,35 +10323,127 @@ index f9e8ff0..8a3d874 100644 + mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn +}; + ++static const int * const inter_pred_setup_c10_qpu[12] = { ++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, 
mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn ++}; ++ +static const int * const inter_pred_setup_y_qpu[12] = { + mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, + mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, + mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn +}; + ++static const int * const inter_pred_setup_y10_qpu[12] = { ++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn ++}; ++ +static const int * const inter_pred_sync_qpu[12] = { + mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, + mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, + mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 +}; + ++static const int * const inter_pred_sync10_qpu[12] = { ++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, ++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, ++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 ++}; ++ +static const int * const inter_pred_exit_c_qpu[12] = { -+ mc_interrupt_exit12c, mc_exit_c, mc_exit_c, mc_exit_c, -+ mc_exit_c, mc_exit_c, mc_exit_c, mc_exit_c, -+ mc_exit_c, mc_exit_c, mc_exit_c, mc_exit_c ++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn ++}; ++ ++static const int * const inter_pred_exit_c10_qpu[12] = { ++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn +}; + +static const int * const inter_pred_exit_y_qpu[12] = { -+ mc_interrupt_exit12, mc_exit, mc_exit, mc_exit, -+ mc_exit, mc_exit, mc_exit, mc_exit, -+ mc_exit, mc_exit, mc_exit, mc_exit ++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn +}; + ++static const int * const inter_pred_exit_y10_qpu[12] = { ++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn ++}; ++ ++typedef struct ipe_chan_info_s ++{ ++ const unsigned int n; ++ const int * const * setup_fns; ++ const int * const * sync_fns; ++ const int * const * exit_fns; ++} ipe_chan_info_t; ++ ++typedef struct ipe_init_info_s ++{ ++ ipe_chan_info_t luma; ++ ipe_chan_info_t chroma; ++} ipe_init_info_t; ++ ++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 ++ { // 8 ++ .luma = {QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, ++ .chroma = {QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} ++ }, ++ { // 9 ++ .luma = {0}, ++ .chroma = {0} ++ }, ++ { // 10 ++ .luma = {QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu}, ++ .chroma = {QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} ++ } ++ ++}; ++ ++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici) ++{ ++ const unsigned int n = ici->n; ++ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word ++ ++ ipe->n = n; ++ ipe->max_fill = q1_size - ipe->min_gap; ++ for(unsigned int i = 0; i < n; i++) { ++ 
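/* (Queue layout: the single gptr allocation is cut into n equal
 * word-aligned slices of q1_size bytes, one command FIFO per QPU; the
 * loop body seeds each FIFO's write pointers and the GPU addresses of
 * that QPU's setup/sync/exit code fragments. max_fill keeps min_gap of
 * headroom because, as noted earlier, load sharing can overrun the even
 * split and is only checked once per CTU.) */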
HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base = ++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); ++ q->code_setup = qpu_fn(ici->setup_fns[i]); ++ q->code_sync = qpu_fn(ici->sync_fns[i]); ++ q->code_exit = qpu_fn(ici->exit_fns[i]); ++ } ++} ++ ++static void rpi_hevc_qpu_set_fns(HEVCContext * const s, const unsigned int bit_depth) ++{ ++ const ipe_init_info_t * const iii = ipe_init_infos + bit_depth - 8; ++ ++ av_assert0(bit_depth >= 8 && bit_depth <= 16); ++ ++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth); ++ ++ for (unsigned int i = 0; i != RPI_MAX_JOBS; ++i) { ++ HEVCRpiJob *const jb = s->jobs + i; ++ set_ipe_from_ici(&jb->chroma_ip, &iii->chroma); ++ set_ipe_from_ici(&jb->luma_ip, &iii->luma); ++ } ++} ++ + +#endif + + -+#ifdef RPI_WORKER ++#ifdef RPI + +//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); +//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); @@ -6715,108 +10451,154 @@ index f9e8ff0..8a3d874 100644 +#define LOG_ENTER +#define LOG_EXIT + ++#define USE_SEM 1 ++ +// Call this when we have completed pass0 and wish to trigger pass1 for the current job -+static void worker_submit_job(HEVCContext *s) ++static void worker_submit_job(HEVCContext * const s) +{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ s->worker_tail++; -+ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot -+ pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_ENTER ++ sem_post(&s->jb0->sem_in); ++ s->jb0->pending = 1; ++ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot ++ s->jb0 = s->jobs + s->pass0_job; ++ LOG_EXIT +} + +// Call this to say we have completed pass1 -+static void worker_complete_job(HEVCContext *s) ++static void worker_complete_job(HEVCContext * const s) +{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ s->worker_head++; -+ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot -+ pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_ENTER ++ sem_t * const sem = &s->jb1->sem_out; ++ // Must set job no before signalling as otherwise rpi_do_all_passes ++ // may call worker_core from the main thread with a bad job number ++ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot ++ s->jb1 = s->jobs + s->pass1_job; ++ sem_post(sem); ++ LOG_EXIT +} + -+// Call this to wait for all jobs to have completed at the end of a frame -+static void worker_wait(HEVCContext *s) -+{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ while( s->worker_head !=s->worker_tail) -+ { -+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); -+ } -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT -+} + +// Call worker_pass0_ready to wait until the s->pass0_job slot becomes +// available to receive the next job. 
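// (Pipeline sketch, as implied by worker_submit_job()/worker_start()
// above: pass 0 on the decode thread fills jobs[pass0_job] and posts
// sem_in; the worker thread runs worker_core() on jobs[pass1_job] and
// posts sem_out, which is what is waited on here before a slot is
// reused.)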
+static void worker_pass0_ready(HEVCContext *s) +{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ // tail is number of submitted jobs -+ // head is number of completed jobs -+ // tail-head is number of outstanding jobs in the queue -+ // we need to ensure there is at least 1 space left for us to use -+ while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS) -+ { -+ // Wait until another job is completed -+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); ++ LOG_ENTER ++ HEVCRpiJob * const jb = s->jb0; ++ if (jb->pending) { ++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) ++ /* Loop */; ++ jb->pending = 0; + } -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_EXIT ++} ++ ++// Call this to wait for all jobs to have completed at the end of a frame ++static void worker_wait(HEVCContext * const s) ++{ ++ LOG_ENTER ++ unsigned int i; ++ for (i = 0; i != RPI_MAX_JOBS; ++i) { ++ HEVCRpiJob * const jb = s->jobs + i; ++ if (jb->pending) { ++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) ++ /* Loop */; ++ jb->pending = 0; ++ } ++ } ++ LOG_EXIT +} + +static void *worker_start(void *arg) +{ -+ HEVCContext *s = (HEVCContext *)arg; -+ while(1) { -+ pthread_mutex_lock(&s->worker_mutex); ++ HEVCContext * const s = (HEVCContext *)arg; + -+ while( !s->kill_worker && s->worker_tail - s->worker_head <= 0) ++ for (;;) + { -+ pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex); -+ } -+ pthread_mutex_unlock(&s->worker_mutex); ++ HEVCRpiJob * const jb = s->jb1; ++ while (sem_wait(&jb->sem_in) == -1 && errno == EINTR) ++ /* Loop */; ++ if (jb->terminate) ++ break; + -+ if (s->kill_worker) { -+ break; ++ LOG_ENTER ++ worker_core(s); ++ worker_complete_job(s); ++ LOG_EXIT + } -+ LOG_ENTER -+ worker_core(s); -+ -+ worker_complete_job(s); -+ LOG_EXIT -+ } -+ return NULL; ++ return NULL; +} + ++static void worker_pic_free_all(HEVCContext * const s) ++{ ++ unsigned int i; ++ ++ // Free coeff stuff - allocation not the same for all buffers ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ { ++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs; ++ ++ if (cf->s[0].buf != NULL) ++ av_freep(&cf->mptr); ++ if (cf->s[2].buf != NULL) ++ gpu_free(&cf->gptr); ++ memset(cf, 0, sizeof(*cf)); ++ } ++} ++ ++static int worker_pic_alloc_all(HEVCContext * const s, const unsigned int coeff_count) ++{ ++ unsigned int i; ++ ++ // Free coeff stuff - allocation not the same for all buffers ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ { ++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs; ++ ++// av_assert0(cf->s[0].n == 0 && cf->s[0].buf == NULL); ++// av_assert0(cf->s[1].n == 0 && cf->s[1].buf == NULL); ++// av_assert0(cf->s[2].n == 0 && cf->s[2].buf == NULL); ++// av_assert0(cf->s[3].n == 0 && cf->s[3].buf == NULL); ++ ++ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0) ++ goto fail; ++ cf->s[2].buf = (int16_t *)cf->gptr.arm; ++ cf->s[3].buf = cf->s[2].buf + coeff_count; ++ ++ // Must be 64 byte aligned for our zero apping code so over-allocate & ++ // round ++ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0] + 63))) == NULL) ++ goto fail; ++ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63); ++ } ++ return 0; ++ ++fail: ++ printf("%s: **** Failed\n", __func__); ++ worker_pic_free_all(s); ++ return -1; ++} ++ ++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf) ++{ ++ unsigned int i; ++ for (i = 0; i != 4; ++i) { ++ cf->s[i].n = 0; ++ } ++} +#endif ++ + /** * NOTE: Each function hls_foo correspond to the function foo in the * 
specification (HLS stands for High Level Syntax). -@@ -56,6 +255,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 +@@ -56,6 +394,23 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 /* free everything allocated by pic_arrays_init() */ static void pic_arrays_free(HEVCContext *s) { +#ifdef RPI -+ int job; -+ for(job=0;jobcoeffs_buf_arm[job][0]) { -+ gpu_free(&s->coeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = 0; -+ } -+ if (s->coeffs_buf_arm[job][2]) { -+ gpu_free(&s->coeffs_buf_accelerated[job]); -+ s->coeffs_buf_arm[job][2] = 0; -+ } -+ } ++ worker_pic_free_all(s); +#endif ++ +#ifdef RPI_DEBLOCK_VPU + { + int i; @@ -6833,7 +10615,7 @@ index f9e8ff0..8a3d874 100644 av_freep(&s->sao); av_freep(&s->deblock); -@@ -92,6 +317,89 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) +@@ -92,6 +447,74 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) int ctb_count = sps->ctb_width * sps->ctb_height; int min_pu_size = sps->min_pu_width * sps->min_pu_height; @@ -6842,32 +10624,17 @@ index f9e8ff0..8a3d874 100644 + const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS; + const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; + const int coefs_per_row = coefs_per_luma + coefs_per_chroma; -+ int job; + + av_assert0(sps); -+// s->max_ctu_count = sps->ctb_width; -+// printf("CTB with=%d\n", sps->ctb_width); -+// s->max_ctu_count = coefs_per_luma / coefs_in_ctb; -+ s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width); -+ s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y; -+ s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV; ++ s->max_ctu_count = coefs_per_luma / coefs_in_ctb; ++#if RPI_ROUND_TO_LINES ++ // Round down to an integral quantity of lines ++ if (s->max_ctu_count > sps->ctb_width) ++ s->max_ctu_count -= s->max_ctu_count % sps->ctb_width; ++#endif + -+ for(job=0;jobcoeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; -+ if (!s->coeffs_buf_arm[job][0]) -+ goto fail; -+ -+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data -+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; -+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; -+ if (!s->coeffs_buf_arm[job][2]) -+ goto fail; -+ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. 
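/* (Old and new code share the same two-ended scheme: coefficient
 * stream 2 fills forward from the start of the GPU buffer while
 * stream 3 fills backward from buf + coeff_count, so both fit in one
 * allocation and meet in the middle. rpi_alloc_coeff_buf() later in
 * this patch allocates buf_no 3 at cfe->buf - (cfe->n + n) to match.) */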
-+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; -+ } -+ } ++ if (worker_pic_alloc_all(s, coefs_per_row) != 0) ++ goto fail; +#endif +#ifdef RPI_DEBLOCK_VPU + { @@ -6923,7 +10690,7 @@ index f9e8ff0..8a3d874 100644 s->bs_width = (width >> 2) + 1; s->bs_height = (height >> 2) + 1; -@@ -138,6 +446,29 @@ fail: +@@ -138,6 +561,29 @@ fail: return AVERROR(ENOMEM); } @@ -6950,19 +10717,21 @@ index f9e8ff0..8a3d874 100644 + } +} + - static void pred_weight_table(HEVCContext *s, GetBitContext *gb) + static int pred_weight_table(HEVCContext *s, GetBitContext *gb) { int i = 0; -@@ -332,7 +663,7 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, +@@ -351,8 +797,8 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt) { #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) - enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; +- int ret, i; + enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; - int ret, i; ++ int ret; pic_arrays_free(s); -@@ -351,6 +682,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + s->ps.sps = NULL; +@@ -370,6 +816,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm switch (sps->pix_fmt) { case AV_PIX_FMT_YUV420P: case AV_PIX_FMT_YUVJ420P: @@ -6975,7 +10744,20 @@ index f9e8ff0..8a3d874 100644 #if CONFIG_HEVC_DXVA2_HWACCEL *fmt++ = AV_PIX_FMT_DXVA2_VLD; #endif -@@ -384,6 +721,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +@@ -384,6 +836,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + #endif + break; + case AV_PIX_FMT_YUV420P10: ++#if RPI_HEVC_SAND ++ // Currently geometry calc is stuffed for big sizes ++ if (sps->width < 2048 && sps->height <= 1088) { ++ *fmt++ = AV_PIX_FMT_SAND64_10; ++ } ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -403,6 +861,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm ret = ff_thread_get_format(s->avctx, pix_fmts); if (ret < 0) goto fail; @@ -6983,24 +10765,58 @@ index f9e8ff0..8a3d874 100644 s->avctx->pix_fmt = ret; } else { -@@ -406,11 +744,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +@@ -412,26 +871,36 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + ff_hevc_pred_init(&s->hpc, sps->bit_depth); + ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth); + ff_videodsp_init (&s->vdsp, sps->bit_depth); ++#ifdef RPI ++ rpi_hevc_qpu_set_fns(s, sps->bit_depth); ++#endif + +- for (i = 0; i < 3; i++) { +- av_freep(&s->sao_pixel_buffer_h[i]); +- av_freep(&s->sao_pixel_buffer_v[i]); +- } ++ av_freep(&s->sao_pixel_buffer_h[0]); ++ av_freep(&s->sao_pixel_buffer_v[0]); + + if (sps->sao_enabled && !s->avctx->hwaccel) { +- int c_count = (sps->chroma_format_idc != 0) ? 3 : 1; +- int c_idx; ++ const unsigned int c_count = (sps->chroma_format_idc != 0) ? 
3 : 1; ++ unsigned int c_idx; ++ size_t vsize[3] = {0}; ++ size_t hsize[3] = {0}; + for(c_idx = 0; c_idx < c_count; c_idx++) { int w = sps->width >> sps->hshift[c_idx]; int h = sps->height >> sps->vshift[c_idx]; -+ // ******** Very very nasty allocation kludge for plaited Chroma - s->sao_pixel_buffer_h[c_idx] = +- s->sao_pixel_buffer_h[c_idx] = - av_malloc((w * 2 * sps->ctb_height) << -+ av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) << - sps->pixel_shift); - s->sao_pixel_buffer_v[c_idx] = +- sps->pixel_shift); +- s->sao_pixel_buffer_v[c_idx] = - av_malloc((h * 2 * sps->ctb_width) << -+ av_malloc((h * 2 * sps->ctb_width * (1 + (c_idx == 1))) << - sps->pixel_shift); +- sps->pixel_shift); ++ // ctb height & width are a min of 8 so this must a multiple of 16 ++ // so no point rounding up! ++ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; ++ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; } ++ ++ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] ++ // when we have plaited chroma ++ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); ++ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); ++ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; ++ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; ++ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; ++ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; } -@@ -678,6 +1017,11 @@ static int hls_slice_header(HEVCContext *s) - (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) { - pred_weight_table(s, gb); + + s->ps.sps = sps; +@@ -699,6 +1168,11 @@ static int hls_slice_header(HEVCContext *s) + if (ret < 0) + return ret; } + else + { @@ -7010,20 +10826,25 @@ index f9e8ff0..8a3d874 100644 sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { -@@ -933,6 +1277,34 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { +@@ -954,6 +1428,39 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { return 0; } +#ifdef RPI ++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCContext * const s) ++{ ++ return s->jb0->intra.cmds + s->jb0->intra.n++; ++} ++ +static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) +{ + // U & V done on U call in the case of sliced frames -+ if (rpi_sliced_frame(s->frame) && c_idx > 1) ++ if (av_rpi_is_sand_frame(s->frame) && c_idx > 1) + return; + + if (s->enable_rpi) { + HEVCLocalContext *lc = s->HEVClc; -+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ HEVCPredCmd *cmd = rpi_new_intra_cmd(s); + cmd->type = RPI_PRED_INTRA; + cmd->size = log2_trafo_size; + cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; @@ -7032,7 +10853,7 @@ index f9e8ff0..8a3d874 100644 + cmd->i_pred.y = y0; + cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; + } -+ else if (rpi_sliced_frame(s->frame) && c_idx != 0) { ++ else if (av_rpi_is_sand_frame(s->frame) && c_idx != 0) { + s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx); + } + else { @@ -7045,7 +10866,7 @@ index f9e8ff0..8a3d874 100644 static int hls_transform_unit(HEVCContext *s, int x0, int y0, int xBase, int yBase, int cb_xBase, int cb_yBase, int log2_cb_size, int log2_trafo_size, -@@ -945,8 +1317,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -966,8 +1473,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { int trafo_size = 1 << log2_trafo_size; ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); @@ -7058,7 +10879,7 @@ index f9e8ff0..8a3d874 100644 } if (cbf_luma || cbf_cb[0] || cbf_cr[0] || -@@ -1032,7 +1407,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1053,7 +1563,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -7070,7 +10891,7 @@ index f9e8ff0..8a3d874 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1061,7 +1440,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1082,7 +1596,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -7082,7 +10903,7 @@ index f9e8ff0..8a3d874 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1090,7 +1473,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1111,7 +1629,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -7094,7 +10915,7 @@ index f9e8ff0..8a3d874 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1100,7 +1487,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1121,7 +1643,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -7106,7 +10927,7 @@ index f9e8ff0..8a3d874 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1112,26 +1503,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1133,26 +1659,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); @@ -7153,7 +10974,7 @@ index f9e8ff0..8a3d874 100644 } } } -@@ -1277,47 +1688,120 @@ do { +@@ -1298,47 +1844,119 @@ do { return 0; } @@ -7189,12 +11010,12 @@ index f9e8ff0..8a3d874 100644 - if (s->ps.sps->chroma_format_idc) { - s->hevcdsp.put_pcm(dst1, stride1, +#if RPI_HEVC_SAND -+ if (rpi_sliced_frame(s->frame)) { -+ s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, 
y0), ++ if (av_rpi_is_sand_frame(s->frame)) { ++ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), + s->frame->linesize[0], + cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); + -+ s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), ++ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), + s->frame->linesize[1], cb_size >> s->ps.sps->hshift[1], cb_size >> s->ps.sps->vshift[1], @@ -7233,10 +11054,9 @@ index f9e8ff0..8a3d874 100644 +#ifdef RPI +int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n) +{ -+ int16_t * const coeffs = (buf_no != 3) ? -+ s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] : -+ s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n; -+ s->num_coeffs[s->pass0_job][buf_no] += n; ++ HEVCRpiCoeffEnv *const cfe = s->jb0->coeffs.s + buf_no; ++ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n); ++ cfe->n += n; + return coeffs; +} +#endif @@ -7281,7 +11101,7 @@ index f9e8ff0..8a3d874 100644 + + // Add command + { -+ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(s); + cmd->type = RPI_PRED_I_PCM; + cmd->size = log2_cb_size; + cmd->i_pcm.src = coeffs; @@ -7299,7 +11119,7 @@ index f9e8ff0..8a3d874 100644 /** * 8.5.3.2.2.1 Luma sample unidirectional interpolation process * -@@ -1349,6 +1833,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1370,6 +1988,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, (s->sh.slice_type == HEVC_SLICE_B && s->ps.pps->weighted_bipred_flag); int idx = ff_hevc_pel_weight[block_w]; @@ -7310,7 +11130,7 @@ index f9e8ff0..8a3d874 100644 x_off += mv->x >> 2; y_off += mv->y >> 2; src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1395,7 +1883,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1416,7 +2038,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, * @param mv1 motion vector1 (relative to block position) to get pixel data from * @param current_mv current motion vector structure */ @@ -7319,7 +11139,7 @@ index f9e8ff0..8a3d874 100644 AVFrame *ref0, const Mv *mv0, int x_off, int y_off, int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) { -@@ -1419,6 +1907,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1440,6 +2062,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); @@ -7330,7 +11150,7 @@ index f9e8ff0..8a3d874 100644 if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { -@@ -1504,6 +1996,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, +@@ -1525,6 +2151,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, intptr_t _mx = mx << (1 - hshift); intptr_t _my = my << (1 - vshift); @@ -7341,7 +11161,7 @@ index f9e8ff0..8a3d874 100644 x_off += mv->x >> (2 + hshift); y_off += mv->y >> (2 + vshift); src0 += y_off * srcstride + (x_off * (1 
<< s->ps.sps->pixel_shift)); -@@ -1568,6 +2064,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF +@@ -1589,6 +2219,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF int hshift = s->ps.sps->hshift[1]; int vshift = s->ps.sps->vshift[1]; @@ -7352,13 +11172,143 @@ index f9e8ff0..8a3d874 100644 intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); -@@ -1695,14 +2195,582 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, - } +@@ -1662,13 +2296,112 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF + _mx1, _my1, block_w); } --static void hls_prediction_unit(HEVCContext *s, int x0, int y0, -- int nPbW, int nPbH, -- int log2_cb_size, int partIdx, int idx) +-static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref, +- const Mv *mv, int y0, int height) ++#ifdef RPI ++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int val, const int field) ++{ ++ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { ++ HEVCContext *const fs = ref->tf.owner[field]->priv_data; ++ HEVCRPiFrameProgressState * const pstate = fs->progress_states + field; ++ sem_t * sem = NULL; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ if (((volatile int *)ref->tf.progress->data)[field] < val) { ++ HEVCRPiFrameProgressWait * const pwait = &jb->progress_wait; ++ ++ av_assert0(pwait->req == -1 && pwait->next == NULL); ++ ++ pwait->req = val; ++ pwait->next = NULL; ++ if (pstate->first == NULL) ++ pstate->first = pwait; ++ else ++ pstate->last->next = pwait; ++ pstate->last = pwait; ++ sem = &pwait->sem; ++ } ++ pthread_mutex_unlock(&pstate->lock); ++ ++ if (sem != NULL) { ++ while (sem_wait(sem) != 0) ++ av_assert0(errno == EINTR); ++ } ++ } ++} ++ ++void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field) ++{ ++ HEVCRPiFrameProgressState *const pstate = s->progress_states + field; ++ ++ ((int *)s->ref->tf.progress->data)[field] = val; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ { ++ HEVCRPiFrameProgressWait ** ppwait = &pstate->first; ++ HEVCRPiFrameProgressWait * pwait; ++ ++ while ((pwait = *ppwait) != NULL) { ++ if (pwait->req > val) ++ { ++ ppwait = &pwait->next; ++ pstate->last = pwait; ++ } ++ else ++ { ++ *ppwait = pwait->next; ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_post(&pwait->sem); ++ } ++ } ++ } ++ pthread_mutex_unlock(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_init_state(HEVCRPiFrameProgressState * const pstate) + { +- int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); ++ pstate->first = NULL; ++ pstate->last = NULL; ++ pthread_mutex_init(&pstate->lock, NULL); ++} + +- if (s->threads_type == FF_THREAD_FRAME ) +- ff_thread_await_progress(&ref->tf, y, 0); ++static void ff_hevc_rpi_progress_init_wait(HEVCRPiFrameProgressWait * const pwait) ++{ ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_init(&pwait->sem, 0, 0); ++} ++ ++static void ff_hevc_rpi_progress_kill_state(HEVCRPiFrameProgressState * const pstate) ++{ ++ av_assert0(pstate->first == NULL); ++ pthread_mutex_destroy(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_kill_wait(HEVCRPiFrameProgressWait * const pwait) ++{ ++ sem_destroy(&pwait->sem); ++} ++#endif ++ ++static void hevc_await_progress(HEVCContext *s, const HEVCFrame * const ref, 
++ const Mv * const mv, const int y0, const int height) ++{ ++ if (s->threads_type == FF_THREAD_FRAME) { ++ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); ++ ++#ifdef RPI ++ if (s->enable_rpi) { ++ int16_t *const pr = s->jb0->progress + ref->dpb_no; ++ if (*pr < y) { ++ *pr = y; ++ } ++ } ++ else ++#endif ++ // It is a const ThreadFrame but the prototype isn't ++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); ++ } + } + + static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +@@ -1707,23 +2440,551 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, + ff_hevc_hls_mvd_coding(s, x0, y0, 1); + } + +- mv->pred_flag += PF_L1; +- mvp_flag = ff_hevc_mvp_lx_flag_decode(s); +- ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size, +- part_idx, merge_idx, mv, mvp_flag, 1); +- mv->mv[1].x += lc->pu.mvd.x; +- mv->mv[1].y += lc->pu.mvd.y; ++ mv->pred_flag += PF_L1; ++ mvp_flag = ff_hevc_mvp_lx_flag_decode(s); ++ ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size, ++ part_idx, merge_idx, mv, mvp_flag, 1); ++ mv->mv[1].x += lc->pu.mvd.x; ++ mv->mv[1].y += lc->pu.mvd.y; ++ } ++} ++ + +#if RPI_INTER + @@ -7374,7 +11324,7 @@ index f9e8ff0..8a3d874 100644 + + yp->load += load_val; + ipe->used_grp = 1; -+ ((uint32_t *)yp->qpu_mc_curr)[-1] = fn; // Link is always last el of previous cmd ++ yp->qpu_mc_curr->data[-1] = fn; // Link is always last el of previous cmd + + return yp; +} @@ -7384,8 +11334,8 @@ index f9e8ff0..8a3d874 100644 +{ + for (unsigned int i = 0; i != ipe->n; ++i) { + HEVCRpiInterPredQ * const q = ipe->q + i; -+ ((uint32_t *)q->qpu_mc_curr)[-1] = q->code_sync; -+ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)((uint32_t *)q->qpu_mc_curr + 1); ++ q->qpu_mc_curr->data[-1] = q->code_sync; ++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1); + q->load = 0; + } +} @@ -7428,39 +11378,40 @@ index f9e8ff0..8a3d874 100644 + } +} + -+static void rpi_alloc_inter_pred(HEVCRpiInterPredEnv * const ipe, -+ const unsigned int n, const unsigned int n_grp, -+ const unsigned int q1_size, const unsigned int min_gap, -+ const int * const * const setup_fns, -+ const int * const * const sync_fns, -+ const int * const * const exit_fns) ++static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, ++ const unsigned int n_max, const unsigned int n_grp, ++ const unsigned int total_size, const unsigned int min_gap) +{ -+ unsigned int i; -+ + memset(ipe, 0, sizeof(*ipe)); -+ av_assert0((ipe->q = av_mallocz(n * sizeof(*ipe->q))) != NULL); -+ ipe->n = n; ++ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL); + ipe->n_grp = n_grp; -+ ipe->q1_size = q1_size; -+ ipe->max_fill = ipe->q1_size - min_gap; ++ ipe->min_gap = min_gap; + +#if RPI_CACHE_UNIF_MVS -+ gpu_malloc_cached(n * q1_size, &ipe->gptr); ++ gpu_malloc_cached(total_size, &ipe->gptr); +#else -+ gpu_malloc_uncached(n * q1_size, &ipe->gptr); ++ gpu_malloc_uncached(total_size, &ipe->gptr); +#endif -+ -+ for(i = 0; i < n; i++) { -+ HEVCRpiInterPredQ * const q = ipe->q + i; -+ q->qpu_mc_curr = q->qpu_mc_base = -+ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); -+ q->code_setup = qpu_fn(setup_fns[i]); -+ q->code_sync = qpu_fn(sync_fns[i]); -+ q->code_exit = qpu_fn(exit_fns[i]); -+ } +} + + ++#if RPI_QPU_EMU_Y ++#define get_mc_address_y(f) ((f)->data[0]) ++#else ++#define get_mc_address_y(f) get_vc_address_y(f) ++#endif ++#if RPI_QPU_EMU_C ++#define get_mc_address_u(f) ((f)->data[1]) ++#else ++#define get_mc_address_u(f) get_vc_address_u(f) ++#endif ++ ++static inline int 
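[Editor's note: the ff_hevc_rpi_progress_wait_field()/ff_hevc_rpi_progress_signal_field() pair added above replaces ffmpeg's ff_thread_await_progress() with a mutex-protected list of (row, semaphore) waiters per field. A minimal standalone sketch of that pattern, using illustrative names rather than the patch's own types:]

```c
#include <pthread.h>
#include <semaphore.h>

typedef struct prog_wait {
    int req;                    /* row this waiter needs            */
    struct prog_wait *next;
    sem_t sem;
} prog_wait;

typedef struct prog_state {
    int done;                   /* highest row published so far     */
    prog_wait *first;
    pthread_mutex_t lock;
} prog_state;

/* Block until at least `row` rows of the frame are complete. */
static void prog_await(prog_state *ps, int row)
{
    prog_wait w = { .req = row, .next = NULL };
    int queued = 0;

    sem_init(&w.sem, 0, 0);
    pthread_mutex_lock(&ps->lock);
    if (ps->done < row) {       /* re-test under the lock, as the patch does */
        w.next = ps->first;     /* LIFO here; the patch keeps FIFO order     */
        ps->first = &w;
        queued = 1;
    }
    pthread_mutex_unlock(&ps->lock);

    if (queued)
        while (sem_wait(&w.sem) != 0)
            ;                   /* restart if interrupted by a signal */
    sem_destroy(&w.sem);
}

/* Publish completion up to `row` and wake every satisfied waiter. */
static void prog_signal(prog_state *ps, int row)
{
    pthread_mutex_lock(&ps->lock);
    ps->done = row;
    for (prog_wait **pp = &ps->first; *pp != NULL; ) {
        prog_wait *w = *pp;
        if (w->req <= row) {
            *pp = w->next;      /* unlink before waking */
            sem_post(&w->sem);
        } else {
            pp = &w->next;
        }
    }
    pthread_mutex_unlock(&ps->lock);
}
```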
offset_depth_adj(const HEVCContext *const s, const int wt) ++{ ++ return s->ps.sps->high_precision_offsets_enabled_flag ? wt : ++ wt << (s->ps.sps->bit_depth - 8); ++} ++ +static void +rpi_pred_y(HEVCContext *const s, const int x0, const int y0, + const int nPbW, const int nPbH, @@ -7469,175 +11420,157 @@ index f9e8ff0..8a3d874 100644 + const int weight_offset, + AVFrame *const src_frame) +{ -+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); + const unsigned int mx = mv->x & 3; + const unsigned int my = mv->y & 3; + const unsigned int my_mx = (my << 8) | mx; + const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; -+ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); -+ uint32_t dst_addr = get_vc_address_y(s->frame) + y_off; -+ const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul); -+ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].luma_ip; ++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); ++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; ++ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); + + if (my_mx == 0) + { + const int x1 = x0 + (mv->x >> 2); + const int y1 = y0 + (mv->y >> 2); -+ -+#if Y_P_MAX_H == 0 + const int bh = nPbH; -+ const int start_y = 0; -+#else -+ for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * Y_P_MAX_H) -+ { -+ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); -+#endif + -+ for (int start_x = 0; start_x < nPbW; start_x += 16) -+ { -+ const int bw = FFMIN(nPbW - start_x, 16); -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu_filter_y_p00); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; ++ for (int start_x = 0; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; + +#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ ++ts->y_pred1_x0y0; ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred1_x0y0; + -+ if (nPbW > 8) -+ ++ts->y_pred1_wgt8; -+ else -+ ++ts->y_pred1_wle8; ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; + -+ if (nPbH > 16) -+ ++ts->y_pred1_hgt16; -+ else -+ ++ts->y_pred1_hle16; -+ } -+#endif -+ -+ src1->x = x1 + start_x; -+ src1->y = y1 + start_y; -+ src1->base = src_vc_address_y; -+ cmd_y->w = bw; -+ cmd_y->h = bh; -+ cmd_y->wo1 = wo; -+ cmd_y->dst_addr = dst_addr + start_x; -+ yp->last_l0 = &cmd_y->next_src1; -+ *(qpu_mc_pred_y_p00_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; + } -+#if Y_P_MAX_H != 0 -+ } +#endif ++ ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src_vc_address_y; ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->wo1 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } + } + else + { + const int x1_m3 = x0 + (mv->x >> 2) - 3; + const int y1_m3 = y0 + (mv->y >> 2) - 3; -+ -+#if Y_P_MAX_H == 0 -+ const int bh = nPbH; -+ const int start_y = 0; -+#else -+ for (int start_y = 0; start_y < nPbH; start_y += 
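[Editor's note: rpi_pred_y() above splits each quarter-pel luma MV into an integer offset (mv >> 2) and a 2-bit fraction (mv & 3), packs the fractions into a single QPU uniform, and pre-scales the weighted-prediction offset for >8-bit streams via offset_depth_adj(). A hedged sketch of the same arithmetic; PACK2 here is assumed to pack two halfwords:]

```c
#include <stdint.h>

#define PACK2(hi, lo) (((uint32_t)(hi) << 16) | ((uint32_t)(lo) & 0xffff))

/* Split a quarter-pel MV component into an integer pel step and a
 * 0..3 phase that selects the interpolation filter
 * (mirrors mv >> 2 / mv & 3 in the patch). */
static inline void mv_split_qpel(int mv, int *ipel, unsigned *frac)
{
    *ipel = mv >> 2;            /* arithmetic shift, rounds toward -inf */
    *frac = (unsigned)mv & 3;
}

/* Scale an offset to the stream bit depth (unless high-precision
 * offsets are enabled) and pack weight+offset into one uniform.
 * The *2+1 folds the rounding term in, as the patch does. */
static inline uint32_t pack_weight_offset(int offset, int weight,
                                          int bit_depth, int high_precision)
{
    if (!high_precision)
        offset <<= bit_depth - 8;
    return PACK2(offset * 2 + 1, weight);
}
```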
Y_P_MAX_H, dst_addr += s->frame->linesize[0] * Y_P_MAX_H) -+ { -+ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); -+#endif -+ const uint32_t src_yx_y = y1_m3 + start_y; -+ int start_x = 0; ++ const unsigned int bh = nPbH; ++ int start_x = 0; + +#if 1 -+ // As Y-pred operates on two independant 8-wide src blocks we can merge -+ // this pred with the previous one if it the previous one is 8 pel wide, -+ // the same height as the current block, immediately to the left of our -+ // current dest block and mono-pred. ++ // As Y-pred operates on two independant 8-wide src blocks we can merge ++ // this pred with the previous one if it the previous one is 8 pel wide, ++ // the same height as the current block, immediately to the left of our ++ // current dest block and mono-pred. + -+ qpu_mc_pred_y_p_t *const last_y8_p = s->last_y8_p; -+ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + 8 == dst_addr) -+ { -+ const int bw = FFMIN(nPbW, 8); -+ qpu_mc_src_t *const last_y8_src2 = s->last_y8_l1; ++ qpu_mc_pred_y_p_t *const last_y8_p = s->last_y8_p; ++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) ++ { ++ const int bw = FFMIN(nPbW, 8); ++ qpu_mc_src_t *const last_y8_src2 = s->last_y8_l1; + -+ last_y8_src2->x = x1_m3; -+ last_y8_src2->y = src_yx_y; -+ last_y8_src2->base = src_vc_address_y; -+ last_y8_p->w += bw; -+ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); -+ last_y8_p->wo2 = wo; ++ last_y8_src2->x = x1_m3; ++ last_y8_src2->y = y1_m3; ++ last_y8_src2->base = src_vc_address_y; ++ last_y8_p->w += bw; ++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); ++ last_y8_p->wo2 = wo; + -+ s->last_y8_p = NULL; -+ s->last_y8_l1 = NULL; -+ start_x = bw; ++ s->last_y8_p = NULL; ++ s->last_y8_l1 = NULL; ++ start_x = bw; +#if RPI_TSTATS -+ ++s->tstats.y_pred1_y8_merge; ++ ++s->tstats.y_pred1_y8_merge; +#endif -+ } -+#endif -+ -+ for (; start_x < nPbW; start_x += 16) -+ { -+ const int bw = FFMIN(nPbW - start_x, 16); -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu_filter); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ if (mx == 0 && my == 0) -+ ++ts->y_pred1_x0y0; -+ else if (mx == 0) -+ ++ts->y_pred1_x0; -+ else if (my == 0) -+ ++ts->y_pred1_y0; -+ else -+ ++ts->y_pred1_xy; -+ -+ if (nPbW > 8) -+ ++ts->y_pred1_wgt8; -+ else -+ ++ts->y_pred1_wle8; -+ -+ if (nPbH > 16) -+ ++ts->y_pred1_hgt16; -+ else -+ ++ts->y_pred1_hle16; -+ } -+#endif -+ src1->x = x1_m3 + start_x; -+ src1->y = src_yx_y; -+ src1->base = src_vc_address_y; -+ if (bw <= 8) -+ { -+ src2->x = MC_DUMMY_X; -+ src2->y = MC_DUMMY_Y; -+ src2->base = s->qpu_dummy_frame; -+ } -+ else -+ { -+ src2->x = x1_m3 + start_x + 8; -+ src2->y = src_yx_y; -+ src2->base = src_vc_address_y; -+ } -+ cmd_y->w = bw; -+ cmd_y->h = bh; -+ cmd_y->mymx21 = my2_mx2_my_mx; -+ cmd_y->wo1 = wo; -+ cmd_y->wo2 = wo; -+ cmd_y->dst_addr = dst_addr + start_x; -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ *(qpu_mc_pred_y_p_t **)&yp->qpu_mc_curr = cmd_y + 1; -+ -+ if (bw == 8) { -+ s->last_y8_l1 = src2; -+ s->last_y8_p = cmd_y; -+ } -+ } -+#if Y_P_MAX_H != 0 + } +#endif ++ ++ for (; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t 
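[Editor's note: the merge path above exploits the fact that a filtered luma command carries two independent 8-wide source descriptors, so an 8-wide mono-pred leaves the second slot free for the next block to its right. The eligibility test is roughly the predicate below, sketched with simplified command fields:]

```c
#include <stddef.h>
#include <stdint.h>

typedef struct y_cmd {
    uint32_t dst_addr;          /* destination of the queued command  */
    uint16_t w, h;
    int      second_src_used;   /* both 8-wide lanes already claimed? */
} y_cmd;

/* True if an up-to-8-wide block of height bh writing to dst can be
 * folded into the previous command: same height, free second lane,
 * and a destination exactly 8 pels (scaled by the frame's x shift,
 * xshl) to the right of the previous block. */
static int can_merge_y8(const y_cmd *prev, uint32_t dst,
                        unsigned bh, unsigned xshl)
{
    return prev != NULL &&
           !prev->second_src_used &&
           prev->h == bh &&
           prev->dst_addr + (8u << xshl) == dst;
}
```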
*const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ if (mx == 0 && my == 0) ++ ++ts->y_pred1_x0y0; ++ else if (mx == 0) ++ ++ts->y_pred1_x0; ++ else if (my == 0) ++ ++ts->y_pred1_y0; ++ else ++ ++ts->y_pred1_xy; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ src1->x = x1_m3 + start_x; ++ src1->y = y1_m3; ++ src1->base = src_vc_address_y; ++ if (bw <= 8) ++ { ++ src2->x = MC_DUMMY_X; ++ src2->y = MC_DUMMY_Y; ++#if RPI_QPU_EMU_Y ++ src2->base = s->qpu_dummy_frame_emu; ++#else ++ src2->base = s->qpu_dummy_frame_qpu; ++#endif ++ } ++ else ++ { ++ src2->x = x1_m3 + start_x + 8; ++ src2->y = y1_m3; ++ src2->base = src_vc_address_y; ++ } ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo; ++ cmd_y->wo2 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ ++ if (bw == 8) { ++ s->last_y8_l1 = src2; ++ s->last_y8_p = cmd_y; ++ } ++ } + } +} + @@ -7649,7 +11582,7 @@ index f9e8ff0..8a3d874 100644 + AVFrame *const src_frame, + AVFrame *const src_frame2) +{ -+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); + const Mv * const mv = mv_field->mv + 0; + const Mv * const mv2 = mv_field->mv + 1; + @@ -7662,15 +11595,16 @@ index f9e8ff0..8a3d874 100644 + const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; + const unsigned int ref_idx0 = mv_field->ref_idx[0]; + const unsigned int ref_idx1 = mv_field->ref_idx[1]; -+ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + -+ s->sh.luma_offset_l1[ref_idx1] + 1; ++ const uint32_t wt_offset = ++ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1; + const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); + const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); + -+ uint32_t dst = get_vc_address_y(s->frame) + y_off; -+ const uint32_t src1_base = get_vc_address_y(src_frame); -+ const uint32_t src2_base = get_vc_address_y(src_frame2); -+ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].luma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); ++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip; + + if (my2_mx2_my_mx == 0) + { @@ -7678,52 +11612,42 @@ index f9e8ff0..8a3d874 100644 + const int y1 = y0 + (mv->y >> 2); + const int x2 = x0 + (mv2->x >> 2); + const int y2 = y0 + (mv2->y >> 2); -+ -+#if Y_B_MAX_H == 0 + const int bh = nPbH; -+ const int start_y = 0; -+#else -+ for (int start_y = 0; start_y < nPbH; start_y += Y_B_MAX_H, dst += s->frame->linesize[0] * Y_B_MAX_H) -+ { -+ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); -+#endif -+ // Can do chunks a full 16 wide if we don't want the H filter -+ for (int start_x=0; start_x < nPbW; start_x += 16) -+ { -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu_filter_y_b00); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if 
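[Editor's note: in rpi_pred_y_b() the two lists' offsets collapse into a single wt_offset = o0 + o1 + 1 that is packed beside each weight. That matches the standard explicit weighted bi-prediction formula; for reference, the per-sample maths (final clipping to the sample range omitted), where log2Wd is the weight denominator adjusted for the intermediate precision:]

```c
/* HEVC explicit weighted bi-prediction of one sample. s0/s1 are the
 * two interpolated predictions at intermediate precision, w0/w1 the
 * per-list weights, o0/o1 the per-list offsets. The (o0 + o1 + 1)
 * term is exactly the wt_offset the patch packs into its uniforms. */
static inline int weighted_bipred_sample(int s0, int s1,
                                         int w0, int w1,
                                         int o0, int o1, int log2Wd)
{
    return (s0 * w0 + s1 * w1 + ((o0 + o1 + 1) << log2Wd)) >> (log2Wd + 1);
}
```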
RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ ++ts->y_pred2_x0y0; + -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } -+#endif -+ src1->x = x1 + start_x; -+ src1->y = y1 + start_y; -+ src1->base = src1_base; -+ src2->x = x2 + start_x; -+ src2->y = y2 + start_y; -+ src2->base = src2_base; -+ cmd_y->w = FFMIN(nPbW - start_x, 16); -+ cmd_y->h = bh; -+ cmd_y->mymx21 = 0; -+ cmd_y->wo1 = wo1; -+ cmd_y->wo2 = wo2; -+ cmd_y->dst_addr = dst + start_x; -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ *(qpu_mc_pred_y_p_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ // Can do chunks a full 16 wide if we don't want the H filter ++ for (int start_x=0; start_x < nPbW; start_x += 16) ++ { ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred2_x0y0; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; + } -+#if Y_P_MAX_H != 0 -+ } +#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 16); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = 0; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } + } + else + { @@ -7732,118 +11656,106 @@ index f9e8ff0..8a3d874 100644 + const int y1 = y0 + (mv->y >> 2) - 3; + const int x2 = x0 + (mv2->x >> 2) - 3; + const int y2 = y0 + (mv2->y >> 2) - 3; -+ -+#if Y_B_MAX_H == 0 + const int bh = nPbH; -+ const int start_y = 0; -+#else -+ for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H, dst += s->frame->linesize[0] * Y_B_MAX_H) -+ { -+ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); -+#endif -+ for (int start_x=0; start_x < nPbW; start_x += 8) -+ { // B blocks work 8 at a time -+ // B weights aren't doubled as the QPU code does the same -+ // amount of work as it does for P -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu_filter_b); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ const unsigned int mmx = mx | mx2; -+ const unsigned int mmy = my | my2; -+ if (mmx == 0 && mmy == 0) -+ ++ts->y_pred2_x0y0; -+ else if (mmx == 0) -+ ++ts->y_pred2_x0; -+ else if (mmy == 0) -+ ++ts->y_pred2_y0; -+ else -+ ++ts->y_pred2_xy; + -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } -+#endif -+ src1->x = x1 + start_x; -+ src1->y = y1 + start_y; -+ src1->base = src1_base; -+ src2->x = x2 + start_x; -+ src2->y = y2 + start_y; -+ src2->base = src2_base; -+ cmd_y->w = FFMIN(nPbW - start_x, 8); -+ cmd_y->h = bh; -+ cmd_y->mymx21 = my2_mx2_my_mx; -+ cmd_y->wo1 = wo1; -+ cmd_y->wo2 = wo2; -+ cmd_y->dst_addr = dst + start_x; -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ *(qpu_mc_pred_y_p_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ for (int start_x=0; start_x < nPbW; start_x += 8) ++ { // B blocks work 8 at a time ++ // B weights aren't doubled as the QPU code does the same ++ // amount of work as it does for P ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, 
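[Editor's note: the load figure handed to rpi_nxt_pred() is simply the number of source rows a command will fetch: bh for the unfiltered paths, bh + 7 once the 8-tap luma filter needs its support rows, bh + 3 for 4-tap chroma (and roughly double for bi-pred chroma). Generalised as a sketch:]

```c
/* Source rows fetched by a vertically filtered block: the block height
 * plus (taps - 1) support rows. Used as the relative cost estimate
 * when picking the least-loaded QPU queue, hence the bh + 7 and
 * bh + 3 constants seen in the calls above. */
static inline unsigned mc_fetch_rows(unsigned bh, unsigned taps)
{
    return bh + taps - 1;   /* 8-tap luma: bh + 7; 4-tap chroma: bh + 3 */
}
```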
bh + 7, s->qpu.y_bxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ const unsigned int mmx = mx | mx2; ++ const unsigned int mmy = my | my2; ++ if (mmx == 0 && mmy == 0) ++ ++ts->y_pred2_x0y0; ++ else if (mmx == 0) ++ ++ts->y_pred2_x0; ++ else if (mmy == 0) ++ ++ts->y_pred2_y0; ++ else ++ ++ts->y_pred2_xy; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; + } -+#if Y_B_MAX_H != 0 -+ } +#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 8); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } + } +} + -+ ++// h/v shifts fixed at one as that is all the qasm copes with +static void -+rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c, ++rpi_pred_c(HEVCContext * const s, const unsigned int lx, const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, + const Mv * const mv, + const int16_t * const c_weights, + const int16_t * const c_offsets, + AVFrame * const src_frame) +{ -+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // = s->ps.sps->hshift[1]; ++ const int vshift = 1; // = s->ps.sps->vshift[1]; + + const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; + const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; -+ const uint32_t src_base_u = get_vc_address_u(src_frame); ++ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); + const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; + const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; -+ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); -+ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].chroma_ip; ++ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]); ++ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]); ++ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; + -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) + { -+ const int bh = FFMIN(nPbH_c-start_y, 16); ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); ++ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; ++ qpu_mc_src_t ** const plast_lx = (lx == 0) ? 
&cp->last_l0 : &cp->last_l1; ++ qpu_mc_src_t * const last_lx = *plast_lx; ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) -+ { -+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, s->qpu_filter_uv); -+ qpu_mc_pred_c_p_t * const u = &cp->qpu_mc_curr->c.p; -+ qpu_mc_src_t * const last_l0 = cp->last_l0; -+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ last_l0->x = x1_c + start_x; -+ last_l0->y = y1_c + start_y; -+ last_l0->base = src_base_u; -+ u[0].h = bh; -+ u[0].w = bw; -+ u[0].coeffs_x = x_coeffs; -+ u[0].coeffs_y = y_coeffs; -+ u[0].wo_u = wo_u; -+ u[0].wo_v = wo_v; -+ u[0].dst_addr_c = dst_base_u + start_x * 2; -+ cp->last_l0 = &u->next_src; -+ *(qpu_mc_pred_c_p_t **)&cp->qpu_mc_curr = u + 1; -+ } -+ -+ dst_base_u += s->frame->linesize[1] * 16; ++ last_lx->x = x1_c + start_x; ++ last_lx->y = y1_c; ++ last_lx->base = src_base_u; ++ cmd_c->h = bh; ++ cmd_c->w = bw; ++ cmd_c->coeffs_x = x_coeffs; ++ cmd_c->coeffs_y = y_coeffs; ++ cmd_c->wo_u = wo_u; ++ cmd_c->wo_v = wo_v; ++ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); ++ *plast_lx = &cmd_c->next_src; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); + } + return; +} + ++// h/v shifts fixed at one as that is all the qasm copes with +static void +rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, @@ -7855,9 +11767,9 @@ index f9e8ff0..8a3d874 100644 + AVFrame * const src_frame, + AVFrame * const src_frame2) +{ -+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // s->ps.sps->hshift[1]; ++ const int vshift = 1; // s->ps.sps->vshift[1]; + const Mv * const mv = mv_field->mv + 0; + const Mv * const mv2 = mv_field->mv + 1; + @@ -7876,52 +11788,53 @@ index f9e8ff0..8a3d874 100644 + const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; + const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; + -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ const uint32_t src1_base = get_vc_address_u(src_frame); -+ const uint32_t src2_base = get_vc_address_u(src_frame2); -+ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].chroma_ip; ++ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]); ++ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]); + -+ for (int start_y = 0; start_y < nPbH_c; start_y += 16) ++ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) + { -+ const unsigned int bh = FFMIN(nPbH_c-start_y, 16); ++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) -+ { -+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); ++ qpu_mc_pred_c_b_t * const u = 
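[Editor's note: rpi_pred_c() hard-wires hshift/vshift to 1 because the QPU code only handles 4:2:0. At that subsampling a chroma MV has 1/8-pel resolution, so the low three bits give the filter phase used to index the packed 4-tap coefficient table, and the remaining bits the integer position, minus one for the filter's left/top tap. The index arithmetic, sketched:]

```c
/* Sub-pel phase of a chroma MV component: the low (2 + shift) bits.
 * For 4:2:0 (shift == 1) this is a 1/8-pel phase in 0..7
 * (mirrors av_mod_uintp2(mv, 2 + hshift) above). */
static inline unsigned chroma_phase(int mv, unsigned shift)
{
    return (unsigned)mv & ((1u << (2 + shift)) - 1);
}

/* Integer chroma position, with -1 for the 4-tap filter support
 * (mirrors the x1_c / y1_c computation above). */
static inline int chroma_int_pos(int pos0, int mv, unsigned shift)
{
    return pos0 + (mv >> (2 + shift)) - 1;
}
```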
&cp->qpu_mc_curr->c.b; ++ qpu_mc_src_t * const src_l0 = cp->last_l0; ++ qpu_mc_src_t * const src_l1 = cp->last_l1; + -+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu_filter_uv_b0); -+ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; -+ qpu_mc_src_t * const src_l0 = cp->last_l0; -+ qpu_mc_src_t * const src_l1 = cp->last_l1; ++ src_l0->x = x1_c + start_x; ++ src_l0->y = y1_c; ++ src_l0->base = src1_base; ++ src_l1->x = x2_c + start_x; ++ src_l1->y = y2_c; ++ src_l1->base = src2_base; + -+ src_l0->x = x1_c + start_x; -+ src_l0->y = y1_c + start_y; -+ src_l0->base = src1_base; -+ src_l1->x = x2_c + start_x; -+ src_l1->y = y2_c + start_y; -+ src_l1->base = src2_base; -+ -+ u[0].h = bh; -+ u[0].w = bw; -+ u[0].coeffs_x1 = coefs0_x; -+ u[0].coeffs_y1 = coefs0_y; -+ u[0].weight_u1 = c_weights[0]; // Weight L0 U -+ u[0].weight_v1 = c_weights[1]; // Weight L0 V -+ u[0].coeffs_x2 = coefs1_x; -+ u[0].coeffs_y2 = coefs1_y; -+ u[0].wo_u2 = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]); -+ u[0].wo_v2 = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]); -+ u[0].dst_addr_c = dst_base_u + start_x * 2; -+ -+ cp->last_l0 = &u[0].next_src1; -+ cp->last_l1 = &u[0].next_src2; -+ *(qpu_mc_pred_c_b_t **)&cp->qpu_mc_curr = u + 1; -+ } -+ -+ dst_base_u += s->frame->linesize[1] * 16; -+ } -+} ++ u[0].h = bh; ++ u[0].w = bw; ++ u[0].coeffs_x1 = coefs0_x; ++ u[0].coeffs_y1 = coefs0_y; ++ u[0].weight_u1 = c_weights[0]; // Weight L0 U ++ u[0].weight_v1 = c_weights[1]; // Weight L0 V ++ u[0].coeffs_x2 = coefs1_x; ++ u[0].coeffs_y2 = coefs1_y; ++ u[0].wo_u2 = wo_u2; ++ u[0].wo_v2 = wo_v2; ++ u[0].dst_addr_c = dst_base_u + (start_x << xshl); + ++ cp->last_l0 = &u[0].next_src1; ++ cp->last_l1 = &u[0].next_src2; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + } + } + +-static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +- int nPbW, int nPbH, +- int log2_cb_size, int partIdx, int idx) + +#endif + @@ -7939,7 +11852,7 @@ index f9e8ff0..8a3d874 100644 int merge_idx = 0; struct MvField current_mv = {{{ 0 }}}; -@@ -1720,8 +2788,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1741,8 +3002,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int y_cb = y0 >> log2_min_cb_size; int x_pu, y_pu; int i, j; @@ -7949,7 +11862,7 @@ index f9e8ff0..8a3d874 100644 if (!skip_flag) lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); -@@ -1765,12 +2832,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1786,12 +3046,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -7971,7 +11884,7 @@ index f9e8ff0..8a3d874 100644 if (s->ps.sps->chroma_format_idc) { +#if RPI_INTER + if (s->enable_rpi) { -+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, ++ rpi_pred_c(s, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, + s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], + ref0->frame); + return; @@ -7980,7 +11893,7 @@ index f9e8ff0..8a3d874 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); -@@ -1784,12 +2868,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1805,12 +3082,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> 
s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -8002,7 +11915,7 @@ index f9e8ff0..8a3d874 100644 if (s->ps.sps->chroma_format_idc) { +#if RPI_INTER + if (s->enable_rpi) { -+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, ++ rpi_pred_c(s, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, + s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], + ref1->frame); + return; @@ -8011,7 +11924,7 @@ index f9e8ff0..8a3d874 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); -@@ -1804,11 +2905,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1825,11 +3119,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -8044,7 +11957,7 @@ index f9e8ff0..8a3d874 100644 chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 0); -@@ -2083,7 +3204,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) +@@ -2104,7 +3418,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); ret = hls_pcm_sample(s, x0, y0, log2_cb_size); if (s->ps.sps->pcm.loop_filter_disable_flag) @@ -8054,21 +11967,22 @@ index f9e8ff0..8a3d874 100644 if (ret < 0) return ret; -@@ -2306,6 +3429,373 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, +@@ -2327,6 +3643,524 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]])); } +#ifdef RPI +static void rpi_execute_dblk_cmds(HEVCContext *s) +{ -+ int n; -+ int job = s->pass1_job; -+ int ctb_size = 1 << s->ps.sps->log2_ctb_size; -+ int (*p)[2] = s->dblk_cmds[job]; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) { -+ ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size); ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ HEVCRpiDeblkEnv *const de = &s->jb1->deblk; ++ unsigned int i; ++ ++ for (i = 0; i != de->n; ++i) ++ { ++ ff_hevc_hls_filters(s, de->blks[i].x_ctb, de->blks[i].y_ctb, ctb_size); + } -+ s->num_dblk_cmds[job] = 0; ++ de->n = 0; +} + +#if 0 @@ -8101,10 +12015,11 @@ index f9e8ff0..8a3d874 100644 +#endif + + -+// I-pred, transform_and_add for all blocks types done here -+// All ARM +#define RPI_OPT_SEP_PRED 0 + ++ ++// I-pred, transform_and_add for all blocks types done here ++// All ARM +#if RPI_OPT_SEP_PRED +static void rpi_execute_pred_cmds(HEVCContext * const s, const int do_luma, const int do_chroma) +#else @@ -8112,15 +12027,15 @@ index f9e8ff0..8a3d874 100644 +#endif +{ + int i; -+ int job = s->pass1_job; -+ const HEVCPredCmd *cmd = s->univ_pred_cmds[job]; -+#ifdef RPI_WORKER ++ HEVCRpiIntraPredEnv * iap = &s->jb1->intra; ++ const HEVCPredCmd *cmd = iap->cmds; ++#ifdef RPI + HEVCLocalContextIntra *lc = &s->HEVClcIntra; +#else + HEVCLocalContext *lc = s->HEVClc; +#endif + -+ for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) { ++ for(i = iap->n; i > 0; i--, cmd++) { +// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); +#if 
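[Editor's note: rpi_execute_dblk_cmds() above is the replay half of a record/replay pair: pass 0 only notes each CTB's coordinates, and pass 1 runs the deblocking filter over the whole batch. A cut-down sketch of that batching, where the filter callback stands in for ff_hevc_hls_filters():]

```c
typedef struct dblk_blk { unsigned short x_ctb, y_ctb; } dblk_blk;

typedef struct dblk_q {
    unsigned  n, cap;
    dblk_blk *blks;
} dblk_q;

/* Pass 0: record a CTB; a full queue tells the caller to submit the job. */
static int dblk_push(dblk_q *q, unsigned x, unsigned y)
{
    if (q->n == q->cap)
        return -1;
    q->blks[q->n].x_ctb   = (unsigned short)x;
    q->blks[q->n++].y_ctb = (unsigned short)y;
    return 0;
}

/* Pass 1: replay the filter over the recorded CTBs, then reset. */
static void dblk_replay(dblk_q *q,
                        void (*filter)(unsigned x_ctb, unsigned y_ctb))
{
    for (unsigned i = 0; i != q->n; ++i)
        filter(q->blks[i].x_ctb, q->blks[i].y_ctb);
    q->n = 0;
}
```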
RPI_OPT_SEP_PRED + if (!(cmd->c_idx == 0 ? do_luma : do_chroma)) { @@ -8137,7 +12052,7 @@ index f9e8ff0..8a3d874 100644 + lc->na.cand_up_left = (cmd->na >> 2) & 1; + lc->na.cand_up = (cmd->na >> 1) & 1; + lc->na.cand_up_right = (cmd->na >> 0) & 1; -+ if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0) ++ if (!av_rpi_is_sand_frame(s->frame) || cmd->c_idx == 0) + s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); + else + s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); @@ -8146,17 +12061,25 @@ index f9e8ff0..8a3d874 100644 + case RPI_PRED_ADD_RESIDUAL: + s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); + break; ++ case RPI_PRED_ADD_DC: ++ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; +#if RPI_HEVC_SAND + case RPI_PRED_ADD_RESIDUAL_U: -+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); + break; + case RPI_PRED_ADD_RESIDUAL_V: -+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); + break; + case RPI_PRED_ADD_RESIDUAL_C: + s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); + break; ++ case RPI_PRED_ADD_DC_U: ++ case RPI_PRED_ADD_DC_V: ++ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; +#endif ++ + case RPI_PRED_I_PCM: + pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); + break; @@ -8170,7 +12093,7 @@ index f9e8ff0..8a3d874 100644 + if (do_luma) +#endif + { -+ s->num_pred_cmds[job] = 0; ++ iap->n = 0; + } +} + @@ -8183,9 +12106,8 @@ index f9e8ff0..8a3d874 100644 +static void rpi_begin(HEVCContext *s) +{ +#if RPI_INTER -+ int job = s->pass0_job; -+ int i; -+ HEVCRpiJob * const jb = s->jobs + job; ++ unsigned int i; ++ HEVCRpiJob * const jb = s->jb0; + HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; + HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; + @@ -8196,7 +12118,7 @@ index f9e8ff0..8a3d874 100644 + const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1]; + + rpi_inter_pred_reset(cipe); -+ for(i=0; i < QPU_N_UV;i++) { ++ for (i = 0; i < cipe->n; i++) { + HEVCRpiInterPredQ * const cp = cipe->q + i; + qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; + @@ -8205,9 +12127,9 @@ index f9e8ff0..8a3d874 100644 + u->next_src1.base = 0; + u->pic_cw = pic_width_c; + u->pic_ch = pic_height_c; -+ u->stride2 = rpi_sliced_frame_stride2(s->frame); -+ u->stride1 = s->frame->linesize[1]; -+ u->wdenom = s->sh.chroma_log2_weight_denom + 6; ++ u->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ u->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ u->wdenom = s->sh.chroma_log2_weight_denom; + cp->last_l0 = &u->next_src1; + + u->next_fn = 0; @@ -8216,12 +12138,12 @@ index f9e8ff0..8a3d874 100644 + u->next_src2.base = 0; + cp->last_l1 = &u->next_src2; + -+ *(qpu_mc_pred_c_s_t **)&cp->qpu_mc_curr = u + 1; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + } + + rpi_inter_pred_reset(yipe); -+ for(i=0;i < QPU_N_Y;i++) { -+ HEVCRpiInterPredQ * const yp = s->jobs[job].luma_ip.q + i; ++ for (i = 0; i < yipe->n; i++) { ++ HEVCRpiInterPredQ * const yp = yipe->q + i; + qpu_mc_pred_y_s_t * const y = 
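[Editor's note: rpi_execute_pred_cmds() above walks a tagged-union command list: pass 0 appends one fixed-size record per intra prediction, residual add or PCM block, and pass 1 switches on the tag. The shape of that structure, heavily cut down (the real HEVCPredCmd union appears in the hevcdec.h part of this patch):]

```c
#include <stdint.h>

enum pred_cmd_type { CMD_INTRA, CMD_ADD_RESIDUAL, CMD_ADD_DC };

typedef struct pred_cmd {
    uint8_t type;               /* which union member is live */
    uint8_t size;               /* log2 of the block size     */
    union {
        struct { uint16_t x, y; }                 intra;
        struct { uint8_t *dst; int16_t *coeffs; } res;
        struct { uint8_t *dst; int dc; }          dc;
    };
} pred_cmd;

static void run_pred_cmds(const pred_cmd *c, unsigned n)
{
    for (; n != 0; --n, ++c) {
        switch (c->type) {
        case CMD_INTRA:        /* intra-predict the block at (x, y)  */
            break;
        case CMD_ADD_RESIDUAL: /* add the coefficient block onto dst */
            break;
        case CMD_ADD_DC:       /* add a constant DC term onto dst    */
            break;
        }
    }
}
```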
&yp->qpu_mc_base->y.s; + + y->next_src1.x = 0; @@ -8232,18 +12154,23 @@ index f9e8ff0..8a3d874 100644 + y->next_src2.base = 0; + y->pic_h = pic_height_y; + y->pic_w = pic_width_y; -+ y->stride2 = rpi_sliced_frame_stride2(s->frame); -+ y->stride1 = s->frame->linesize[0]; -+ y->wdenom = s->sh.luma_log2_weight_denom + 6; ++ y->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ y->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ y->wdenom = s->sh.luma_log2_weight_denom; + y->next_fn = 0; + yp->last_l0 = &y->next_src1; + yp->last_l1 = &y->next_src2; + -+ *(qpu_mc_pred_y_s_t **)&yp->qpu_mc_curr = y + 1; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); + } + + s->last_y8_p = NULL; + s->last_y8_l1 = NULL; ++ ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { ++ jb->progress[i] = -1; ++ } ++ +#endif + s->ctu_count = 0; +} @@ -8251,13 +12178,15 @@ index f9e8ff0..8a3d874 100644 + + +#if RPI_INTER -+static unsigned int mc_terminate_add(HEVCContext * const s, ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_qpu(HEVCContext * const s, + const vpu_qpu_job_h vqj, + rpi_cache_flush_env_t * const rfe, + HEVCRpiInterPredEnv * const ipe) +{ + unsigned int i; + uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; ++ unsigned int max_block = 0; + + if (!ipe->used) { + return 0; @@ -8272,18 +12201,20 @@ index f9e8ff0..8a3d874 100644 + HEVCRpiInterPredQ * const yp = ipe->q + i; + qpu_mc_src_t *const p0 = yp->last_l0; + qpu_mc_src_t *const p1 = yp->last_l1; ++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; + -+ ((uint32_t *)yp->qpu_mc_curr)[-1] = yp->code_exit; ++ if (block_size > max_block) ++ max_block = block_size; + -+ av_assert0((char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base <= ipe->q1_size); ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; + + // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched + p0->x = MC_DUMMY_X; + p0->y = MC_DUMMY_Y; -+ p0->base = s->qpu_dummy_frame; ++ p0->base = s->qpu_dummy_frame_qpu; + p1->x = MC_DUMMY_X; + p1->y = MC_DUMMY_Y; -+ p1->base = s->qpu_dummy_frame; ++ p1->base = s->qpu_dummy_frame_qpu; + + yp->last_l0 = NULL; + yp->last_l1 = NULL; @@ -8294,13 +12225,73 @@ index f9e8ff0..8a3d874 100644 + } + +#if RPI_CACHE_UNIF_MVS -+ rpi_cache_flush_add_gm_ptr(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++ // We don't need invalidate here as the uniforms aren't changed by the QPU ++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing ++ // new values which seems to give us a small performance advantage ++ // ++ // In most cases we will not have a completely packed set of uniforms and as ++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the ++ // fullest ++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, ++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, ++ ipe->n, ipe->max_fill + ipe->min_gap); +#endif -+ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, (uint32_t *)mail); ++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); + + return 1; +} ++#endif + ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_emu(HEVCContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ if (!ipe->used) { ++ return 0; ++ } ++ ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t 
*const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_emu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_emu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ } ++ ++ return 1; ++} ++#endif ++ ++ ++#if RPI_QPU_EMU_Y ++#define mc_terminate_add_y mc_terminate_add_emu ++#else ++#define mc_terminate_add_y mc_terminate_add_qpu ++#endif ++#if RPI_QPU_EMU_C ++#define mc_terminate_add_c mc_terminate_add_emu ++#else ++#define mc_terminate_add_c mc_terminate_add_qpu ++#endif +#endif + +#ifdef RPI @@ -8322,47 +12313,33 @@ index f9e8ff0..8a3d874 100644 +#endif + vpu_qpu_wait_h sync_y; + -+ const int job = s->pass1_job; -+ unsigned int flush_start = 0; -+ unsigned int flush_count = 0; ++ HEVCRpiJob * const jb = s->jb1; ++ int pred_y, pred_c; + + const vpu_qpu_job_h vqj = vpu_qpu_job_new(); + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); + -+ if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) { -+ vpu_qpu_job_add_vpu(vqj, -+ vpu_get_fn(), -+ vpu_get_constants(), -+ s->coeffs_buf_vc[job][2], -+ s->num_coeffs[job][2] >> 8, -+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], -+ s->num_coeffs[job][3] >> 10, -+ 0); -+ -+ rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+ } -+ -+ -+#if RPI_INTER + { -+ int (*d)[2] = s->dblk_cmds[job]; -+ unsigned int high=(*d)[1]; -+ int n; ++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ if (cf->s[3].n + cf->s[2].n != 0) ++ { ++ const unsigned int csize = sizeof(cf->s[3].buf[0]); ++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; ++ vpu_qpu_job_add_vpu(vqj, ++ vpu_get_fn(s->ps.sps->bit_depth), ++ vpu_get_constants(), ++ cf->gptr.vc, ++ cf->s[2].n >> 8, ++ cf->gptr.vc + offset32, ++ cf->s[3].n >> 10, ++ 0); + -+ flush_start = high; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) { -+ unsigned int y = (*d)[1]; -+ flush_start = FFMIN(flush_start, y); -+ high=FFMAX(high,y); ++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); ++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); + } -+ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->ps.sps->height) - flush_start; + } + -+ if (mc_terminate_add(s, vqj, rfe, &s->jobs[job].chroma_ip) != 0) -+ { -+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ 0, flush_start, s->ps.sps->width, flush_count, s->ps.sps->vshift[1], 0, 1); -+ } ++ pred_c = mc_terminate_add_c(s, vqj, rfe, &jb->chroma_ip); + +// We can take a sync here and try to locally overlap QPU processing with ARM +// but testing showed a slightly negative benefit with noticable extra complexity @@ -8370,25 +12347,109 @@ index f9e8ff0..8a3d874 100644 + vpu_qpu_job_add_sync_this(vqj, &sync_c); +#endif + -+ if (mc_terminate_add(s, vqj, rfe, &s->jobs[job].luma_ip) != 0) -+ { -+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ 0, flush_start, s->ps.sps->width, flush_count, s->ps.sps->vshift[1], 1, 0); -+ } -+#endif ++ pred_y = mc_terminate_add_y(s, vqj, rfe, &jb->luma_ip); + + vpu_qpu_job_add_sync_this(vqj, &sync_y); + ++ ++ // We are expecting a contiguous Z-shaped set of blocks ++ // So 
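[Editor's note: the offset32 computation above implies the two VPU transform sizes share one GPU buffer: 16x16 coefficient blocks fill it from the bottom while 32x32 blocks stack down from the top, so a single allocation serves both without guessing the split point. A sketch of such a two-ended allocator; this is my reading of the layout, not code from the patch:]

```c
#include <stddef.h>
#include <stdint.h>

typedef struct coeff_buf {
    int16_t *base;      /* start of the shared buffer   */
    size_t   cap;       /* capacity in coefficients     */
    size_t   n_lo;      /* used from the bottom (16x16) */
    size_t   n_hi;      /* used from the top (32x32)    */
} coeff_buf;

static int16_t *alloc_lo(coeff_buf *b, size_t n)
{
    if (b->n_lo + b->n_hi + n > b->cap)
        return NULL;                    /* full: flush the job first */
    int16_t *p = b->base + b->n_lo;
    b->n_lo += n;
    return p;
}

static int16_t *alloc_hi(coeff_buf *b, size_t n)
{
    if (b->n_lo + b->n_hi + n > b->cap)
        return NULL;
    b->n_hi += n;
    return b->base + b->cap - b->n_hi;  /* grows downward from the end */
}
```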
generate up to 3 blocks: ++ // 1st line ++ // body ++ // last line ++ // This will work even if we don't have the expected geometry ++ if (pred_y || pred_c) ++ { ++ const HEVCRpiDeblkEnv *const de = &jb->deblk; ++ const HEVCRpiDeblkBlk * db = de->blks + 0; ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ unsigned int x0 = db->x_ctb; ++ unsigned int xx = x0 + ctb_size; ++ unsigned int y0 = db->y_ctb; ++ ++ unsigned int blks_tlbr[3][4] = {{~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}}; ++ unsigned int b = 0; ++ unsigned int i; ++ ++ for (i = 1, ++db; i < de->n; ++i, ++db) ++ { ++ if (db->x_ctb == xx && db->y_ctb == y0) { ++ xx += ctb_size; ++ } ++ else ++ { ++ unsigned int * const tlbr = blks_tlbr[b]; ++ if (tlbr[0] > y0) ++ tlbr[0] = y0; ++ if (tlbr[1] > x0) ++ tlbr[1] = x0; ++ if (tlbr[2] < y0 + ctb_size) ++ tlbr[2] = y0 + ctb_size; ++ if (tlbr[3] < xx) ++ tlbr[3] = xx; ++ x0 = db->x_ctb; ++ xx = x0 + ctb_size; ++ y0 = db->y_ctb; ++ b = 1; ++ } ++ } ++ ++ if (blks_tlbr[b][0] != ~0U) ++ ++b; ++ ++ { ++ unsigned int * const tlbr = blks_tlbr[b]; ++ tlbr[0] = y0; ++ tlbr[1] = x0; ++ tlbr[2] = y0 + ctb_size; ++ tlbr[3] = xx; ++ } ++ ++ // ??? Coalesce blocks ??? ++ for (i = 0; i <= b; ++i) { ++ const unsigned int * const tlbr = blks_tlbr[i]; ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE, ++ tlbr[1], tlbr[0], tlbr[3] - tlbr[1], tlbr[2] - tlbr[0], s->ps.sps->vshift[1], pred_y, pred_c); ++ } ++ } ++ ++ + // Having accumulated some commands - do them + rpi_cache_flush_finish(rfe); ++ ++ // Await progress as required ++ { ++ unsigned int i; ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { ++ if (jb->progress[i] >= 0) { ++ ff_hevc_progress_wait_recon(s, jb, s->DPB + i, jb->progress[i]); ++ } ++ } ++ } ++ + vpu_qpu_job_finish(vqj); + -+ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); ++ worker_pic_reset(&jb->coeffs); + -+ // We would do ARM inter prediction here but no longer -+ // Look back in git if you find you want it back - As we have -+ // no arm/neon sand pred code there doesn't seem a lot of point -+ // keeping it around ++ // If we have emulated VPU ops - do it here ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ if (av_rpi_is_sand8_frame(s->frame)) ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ rpi_shader_c8(s, &jb->luma_ip, NULL); ++#else ++ rpi_shader_c8(s, NULL, &jb->chroma_ip); ++#endif ++ else ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ rpi_shader_c16(s, &jb->luma_ip, NULL); ++#else ++ rpi_shader_c16(s, NULL, &jb->chroma_ip); ++#endif ++#endif + +#if RPI_OPT_SEP_PRED + // Wait for transform completion @@ -8416,6 +12477,9 @@ index f9e8ff0..8a3d874 100644 + +static void rpi_do_all_passes(HEVCContext *s) +{ ++ // Called from main thread - must be no pending background jobs ++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); ++ + // Do the various passes - common with the worker code + worker_core(s); + // Prepare next batch @@ -8428,68 +12492,66 @@ index f9e8ff0..8a3d874 100644 static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) { HEVCContext *s = avctxt->priv_data; -@@ -2315,6 +3805,18 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2336,6 +4170,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) int y_ctb = 0; int ctb_addr_ts = 
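[Editor's note: the cache-invalidate pass above walks the recorded CTBs expecting a Z-shaped scan and closes a rectangle whenever the run stops continuing at x + ctb_size on the same row; the patch then folds the middle rows into one box, giving at most three. The core row-run accumulation, sketched (n must be at least 1):]

```c
typedef struct rect { unsigned top, left, bottom, right; } rect;

/* Scan (x, y) CTB coordinates in decode order; each contiguous
 * same-row run becomes one rectangle. Returns the number of rects
 * written (runs beyond max_out are dropped in this sketch). */
static unsigned coalesce_ctb_runs(const unsigned (*ctb)[2], unsigned n,
                                  unsigned ctb_size,
                                  rect *out, unsigned max_out)
{
    unsigned nr = 0;
    unsigned x0 = ctb[0][0], xx = x0 + ctb_size, y0 = ctb[0][1];

    for (unsigned i = 1; i != n; ++i) {
        if (ctb[i][0] == xx && ctb[i][1] == y0) {
            xx += ctb_size;                 /* run continues rightwards */
        } else {
            if (nr < max_out)               /* close the current strip  */
                out[nr++] = (rect){ y0, x0, y0 + ctb_size, xx };
            x0 = ctb[i][0];
            xx = x0 + ctb_size;
            y0 = ctb[i][1];
        }
    }
    if (nr < max_out)
        out[nr++] = (rect){ y0, x0, y0 + ctb_size, xx };
    return nr;
}
```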
s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; +#ifdef RPI -+ s->enable_rpi = s->ps.sps->bit_depth == 8 && -+ s->frame->format == AV_PIX_FMT_SAND128 && -+ !s->ps.pps->cross_component_prediction_enabled_flag; -+ -+ if (!s->enable_rpi) { -+ if (s->ps.pps->cross_component_prediction_enabled_flag) -+ printf("Cross component\n"); -+ } ++ // * We don't support cross_component_prediction_enabled_flag but as that ++ // must be 0 unless we have 4:4:4 there is no point testing for it as we ++ // only deal with sand which is never 4:4:4 ++ // [support wouldn't be hard] ++ s->enable_rpi = ++ ((s->ps.sps->bit_depth == 8 && s->frame->format == AV_PIX_FMT_SAND128) || ++ (s->ps.sps->bit_depth == 10 && s->frame->format == AV_PIX_FMT_SAND64_10)); +#endif + //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]); + if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); return AVERROR_INVALIDDATA; -@@ -2328,6 +3830,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2349,8 +4194,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) } } -+#ifdef RPI_WORKER -+ s->pass0_job = 0; -+ s->pass1_job = 0; -+#endif +#ifdef RPI ++ // Worker must be idle at start ++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); + rpi_begin(s); +#endif + while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { - int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; +- int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -@@ -2335,6 +3845,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + x_ctb = (ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; - hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); - -+ - ff_hevc_cabac_init(s, ctb_addr_ts); - - hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); -@@ -2344,6 +3855,49 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2365,6 +4216,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); + +#ifdef RPI ++ // Report progress so we can use our MVs in other frames ++ // If we are tiled then this isn't really optimal but given that tiling ++ // can change on a per pic basis (described in PPS) other schemes are ++ // quite a lot harder ++ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) { ++ ff_hevc_progress_signal_mv(s, y_ctb + ctb_size - 1); ++ } ++ + if (s->enable_rpi) { -+ int q_full = (s->ctu_count >= s->max_ctu_count); ++ int q_full = (++s->ctu_count >= s->max_ctu_count); + -+ if (rpi_inter_pred_next_ctu(&s->jobs[s->pass0_job].luma_ip) != 0) ++ if (rpi_inter_pred_next_ctu(&s->jb0->luma_ip) != 0) + q_full = 1; -+ if (rpi_inter_pred_next_ctu(&s->jobs[s->pass0_job].chroma_ip) != 0) ++ if (rpi_inter_pred_next_ctu(&s->jb0->chroma_ip) != 0) + q_full = 1; + -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; -+ s->ctu_count++; ++ 
s->jb0->deblk.blks[s->jb0->deblk.n].x_ctb = x_ctb; ++ s->jb0->deblk.blks[s->jb0->deblk.n++].y_ctb = y_ctb; + + if (q_full) { -+#ifdef RPI_WORKER + if (s->used_for_ref) + { +// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); @@ -8508,9 +12570,6 @@ index f9e8ff0..8a3d874 100644 + // Non-ref frame so do it all on this thread + rpi_do_all_passes(s); + } -+#else -+ rpi_do_all_passes(s); -+#endif + } + + } @@ -8520,7 +12579,7 @@ index f9e8ff0..8a3d874 100644 if (more_data < 0) { s->tab_slice_address[ctb_addr_rs] = -1; return more_data; -@@ -2352,9 +3906,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2373,9 +4270,40 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ctb_addr_ts++; ff_hevc_save_states(s, ctb_addr_ts); @@ -8533,12 +12592,10 @@ index f9e8ff0..8a3d874 100644 +#ifdef RPI + -+#ifdef RPI_WORKER + // Wait for the worker to finish all its jobs + if (s->enable_rpi) { + worker_wait(s); + } -+#endif + + // Finish off any half-completed rows + if (s->enable_rpi && s->ctu_count) { @@ -8563,7 +12620,7 @@ index f9e8ff0..8a3d874 100644 if (x_ctb + ctb_size >= s->ps.sps->width && y_ctb + ctb_size >= s->ps.sps->height) ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); -@@ -2389,6 +3976,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int +@@ -2410,6 +4338,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int s = s1->sList[self_id]; lc = s->HEVClc; @@ -8575,7 +12632,7 @@ index f9e8ff0..8a3d874 100644 if(ctb_row) { ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); -@@ -2771,6 +4363,33 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) +@@ -2792,6 +4725,33 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) if (ret < 0) return ret; @@ -8609,44 +12666,77 @@ index f9e8ff0..8a3d874 100644 if (s->sh.first_slice_in_pic_flag) { if (s->max_ra == INT_MAX) { if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { -@@ -2894,10 +4513,19 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) +@@ -2915,10 +4875,25 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) } } -fail: - if (s->ref && s->threads_type == FF_THREAD_FRAME) -+fail: // Also success path -+ if (s->ref && s->threads_type == FF_THREAD_FRAME) { -+#if RPI_INTER -+ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); -+#endif - ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); +- ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); - -+ } -+#if RPI_INTER -+ else if (s->ref && s->enable_rpi) { -+ // When running single threaded we need to flush the whole frame -+ flush_frame(s,s->frame); -+ } ++fail: // Also success path ++ if (s->ref != NULL) { ++ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) { ++#ifdef RPI ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); +#endif ++ ff_hevc_progress_signal_all_done(s); ++ } ++#ifdef RPI ++ // * Flush frame will become confused if we pass it something ++ // that doesn't have an expected number of planes (e.g. 400) ++ // So only flush if we are sure we can. 
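[Editor's note: the q_full submission path above is one side of a two-semaphore pipeline: the CABAC thread fills a job slot and posts its sem_in, the worker runs pass 1 and posts sem_out, and a slot cannot be refilled until its sem_out has been taken back. A self-contained sketch of that handshake; termination here uses a sentinel value where the patch uses a terminate flag:]

```c
#include <pthread.h>
#include <semaphore.h>

#define NSLOTS 2

typedef struct slot {
    sem_t in, out;
    int   work;                 /* stands in for the real job state */
} slot;

static slot slots[NSLOTS];

static void slots_init(void)
{
    for (int i = 0; i < NSLOTS; i++) {
        sem_init(&slots[i].in, 0, 0);
        sem_init(&slots[i].out, 0, 0);
        slots[i].work = 0;
    }
}

static void *worker(void *arg)
{
    (void)arg;
    for (unsigned i = 0;; i = (i + 1) % NSLOTS) {
        sem_wait(&slots[i].in);         /* wait for a submitted job */
        if (slots[i].work < 0)
            return NULL;                /* sentinel: shut down      */
        /* ... pass-1 pixel work on slots[i] would run here ...     */
        sem_post(&slots[i].out);        /* hand the slot back       */
    }
}

/* Producer side: submit into slot i; reclaim before reusing it. */
static void submit(unsigned i, int work)
{
    slots[i].work = work;
    sem_post(&slots[i].in);
}

static void reclaim(unsigned i)
{
    sem_wait(&slots[i].out);
}
```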
++ else if (s->enable_rpi) { ++ // Flush frame to real memory as we expect to be able to pass ++ // it straight on to mmal ++ flush_frame(s, s->frame); ++ } ++#endif ++ } return ret; } -@@ -3150,6 +4778,48 @@ fail: +@@ -3171,6 +5146,83 @@ fail: return AVERROR(ENOMEM); } -+#ifdef RPI_WORKER -+static av_cold void hevc_init_worker(HEVCContext *s) ++#ifdef RPI ++static av_cold void hevc_init_worker(HEVCContext * const s) +{ + int err; -+ pthread_cond_init(&s->worker_cond_head, NULL); -+ pthread_cond_init(&s->worker_cond_tail, NULL); -+ pthread_mutex_init(&s->worker_mutex, NULL); + -+ s->worker_tail=0; -+ s->worker_head=0; -+ s->kill_worker=0; ++ ++ memset(s->jobs, 0, sizeof(s->jobs)); ++ ++ for (unsigned int job = 0; job < RPI_MAX_JOBS; job++) { ++ HEVCRpiJob * const jb = s->jobs + job; ++ ++ sem_init(&jb->sem_in, 0, 0); ++ sem_init(&jb->sem_out, 0, 0); ++ ff_hevc_rpi_progress_init_wait(&jb->progress_wait); ++ ++ jb->intra.n = 0; ++ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS); ++ ++ // ** Sizeof the union structure might be overkill but at the moment it ++ // is correct (it certainly isn't going to be too small) ++ ++ rpi_inter_pred_alloc(&jb->chroma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t), ++ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t)); ++ rpi_inter_pred_alloc(&jb->luma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t), ++ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t)); ++ ++ jb->deblk.n = 0; ++ jb->deblk.blks = av_malloc(sizeof(jb->deblk.blks[0]) * RPI_MAX_DEBLOCK_CMDS); ++ } ++ s->pass0_job = 0; ++ s->pass1_job = 0; ++ s->jb0 = s->jobs + 0; ++ s->jb1 = s->jobs + 0; ++ + err = pthread_create(&s->worker_thread, NULL, worker_start, s); + if (err) { + printf("Failed to create worker thread\n"); + exit(-1); + } +} + -+static av_cold void hevc_exit_worker(HEVCContext *s) -+{ -+ void *res; -+ s->kill_worker=1; -+ pthread_cond_broadcast(&s->worker_cond_tail); -+ pthread_join(s->worker_thread, &res); -+ -+ pthread_cond_destroy(&s->worker_cond_head); -+ pthread_cond_destroy(&s->worker_cond_tail); -+ pthread_mutex_destroy(&s->worker_mutex); -+ -+ s->worker_tail=0; -+ s->worker_head=0; -+ s->kill_worker=0; -+} -+ +static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) +{ + av_freep(&ipe->q); + gpu_free(&ipe->gptr); +} + ++static av_cold void hevc_exit_worker(HEVCContext *s) ++{ ++ void *res; ++ unsigned int i; ++ ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ s->jobs[i].terminate = 1; ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ sem_post(&s->jobs[i].sem_in); ++ pthread_join(s->worker_thread, &res); ++ ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ { ++ HEVCRpiJob * const jb = s->jobs + i; ++ ++ sem_destroy(&jb->sem_in); ++ sem_destroy(&jb->sem_out); ++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); ++ av_freep(&jb->intra.cmds); ++ av_freep(&jb->deblk.blks); ++ rpi_free_inter_pred(&jb->chroma_ip); ++ rpi_free_inter_pred(&jb->luma_ip); ++ } ++} ++ +#endif + static av_cold int hevc_decode_free(AVCodecContext *avctx) { HEVCContext *s = avctx->priv_data; @@ -3182,10 +5234,19 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) av_freep(&s->cabac_state); +- for (i = 0; i < 3; i++) { +- av_freep(&s->sao_pixel_buffer_h[i]); +- av_freep(&s->sao_pixel_buffer_v[i]); +#ifdef RPI + -+#ifdef RPI_WORKER + hevc_exit_worker(s); -+#endif -+ -+ for(i=0;i<RPI_MAX_JOBS;i++) { -+ av_freep(&s->univ_pred_cmds[i]); -+ -+#if RPI_INTER -+ 
rpi_free_inter_pred(&s->jobs[i].chroma_ip); -+ rpi_free_inter_pred(&s->jobs[i].luma_ip); -+#endif -+ } -+ + vpu_qpu_term(); ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_kill_state(s->progress_states + i); + } + + av_rpi_zc_uninit(avctx); +#endif + - for (i = 0; i < 3; i++) { - av_freep(&s->sao_pixel_buffer_h[i]); - av_freep(&s->sao_pixel_buffer_v[i]); -@@ -3202,10 +4893,14 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) ++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] ++ av_freep(&s->sao_pixel_buffer_v[0]); + av_frame_free(&s->output_frame); + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { +@@ -3223,6 +5284,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) return 0; } @@ -8717,14 +12811,7 @@ index f9e8ff0..8a3d874 100644 static av_cold int hevc_init_context(AVCodecContext *avctx) { HEVCContext *s = avctx->priv_data; - int i; -+#ifdef RPI -+ unsigned int job; -+#endif - - s->avctx = avctx; - -@@ -3215,6 +4910,59 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) +@@ -3236,6 +5298,37 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) s->HEVClcList[0] = s->HEVClc; s->sList[0] = s; @@ -8738,53 +12825,39 @@ index f9e8ff0..8a3d874 100644 + if (vpu_qpu_init() != 0) + goto fail; + -+ for(job = 0; job < RPI_MAX_JOBS; job++) { -+ s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS); -+ if (!s->univ_pred_cmds[job]) -+ goto fail; -+ } -+ +#if RPI_INTER -+ -+ for (job = 0; job < RPI_MAX_JOBS; job++) { -+ HEVCRpiJob * const jb = s->jobs + job; -+ // ** Sizeof the union structure might be overkill but at the moment it -+ // is correct (it certainly isn't going to be too samll) -+ -+ rpi_alloc_inter_pred(&jb->chroma_ip, -+ QPU_N_UV, QPU_N_GRP_UV, -+ UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), -+ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t), -+ inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu); -+ rpi_alloc_inter_pred(&jb->luma_ip, -+ QPU_N_Y, QPU_N_GRP_Y, -+ Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), -+ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t), -+ inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu); ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ { ++ static const uint32_t dframe[1] = {0x80808080}; ++ s->qpu_dummy_frame_emu = (const uint8_t *)dframe; + } -+ -+ s->qpu_filter_uv = qpu_fn(mc_filter_uv); -+ s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0); -+ s->qpu_dummy_frame = qpu_fn(mc_start); // Use our code as a dummy frame -+ s->qpu_filter = qpu_fn(mc_filter); -+ s->qpu_filter_y_p00 = qpu_fn(mc_filter_y_p00); -+ s->qpu_filter_y_b00 = qpu_fn(mc_filter_y_b00); -+ s->qpu_filter_b = qpu_fn(mc_filter_b); ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ s->qpu_dummy_frame_qpu = qpu_fn(mc_start); // Use our code as a dummy frame ++#endif +#endif + //gpu_malloc_uncached(2048*64,&s->dummy); + + s->enable_rpi = 0; + -+#ifdef RPI_WORKER ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_init_state(s->progress_states + i); ++ } + hevc_init_worker(s); +#endif -+ -+#endif + s->cabac_state = av_malloc(HEVC_CONTEXTS); if (!s->cabac_state) goto fail; -@@ -3357,9 +5105,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) +@@ -3249,6 +5342,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) + if (!s->DPB[i].frame) + goto fail; + s->DPB[i].tf.f = s->DPB[i].frame; ++ s->DPB[i].dpb_no = i; + } + + s->max_ra = INT_MAX; +@@ -3378,9 +5472,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) } if((avctx->active_thread_type & FF_THREAD_FRAME) && 
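[Editor's note: hevc_init_context() above brings the RPi state up in layers (zero-copy AVContext hooks, then the VPU/QPU services, then the per-field progress state and the worker), and hevc_decode_free() tears them down in reverse. For illustration only, the usual ffmpeg-style goto-fail shape for layered init of this kind, with stubs standing in for av_rpi_zc_init(), vpu_qpu_init() and hevc_init_worker():]

```c
/* Illustrative stubs; each init returns 0 on success. */
static int  zc_init(void)     { return 0; }
static void zc_uninit(void)   {}
static int  vpu_init(void)    { return 0; }
static void vpu_term(void)    {}
static int  worker_init(void) { return 0; }

static int rpi_ctx_init(void)
{
    if (zc_init() < 0)
        goto fail0;
    if (vpu_init() != 0)
        goto fail1;
    if (worker_init() != 0)
        goto fail2;
    return 0;

fail2:
    vpu_term();                 /* unwind in reverse order */
fail1:
    zc_uninit();
fail0:
    return -1;
}
```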
avctx->thread_count > 1) @@ -8797,7 +12870,7 @@ index f9e8ff0..8a3d874 100644 return 0; } -@@ -3418,6 +5166,8 @@ AVCodec ff_hevc_decoder = { +@@ -3439,6 +5533,8 @@ AVCodec ff_hevc_decoder = { .update_thread_context = hevc_update_thread_context, .init_thread_copy = hevc_init_thread_copy, .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | @@ -8807,7 +12880,7 @@ index f9e8ff0..8a3d874 100644 .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE, .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), diff --git a/libavcodec/hevcdec.h b/libavcodec/hevcdec.h -index 0c78812..c268d39 100644 +index 0c7881286c..70394aab76 100644 --- a/libavcodec/hevcdec.h +++ b/libavcodec/hevcdec.h @@ -334,17 +334,6 @@ typedef struct CodingUnit { @@ -8828,11 +12901,17 @@ index 0c78812..c268d39 100644 typedef struct NeighbourAvailable { int cand_bottom_left; int cand_left; -@@ -421,7 +410,17 @@ typedef struct HEVCFrame { +@@ -419,9 +408,23 @@ typedef struct HEVCFrame { + * A combination of HEVC_FRAME_FLAG_* + */ uint8_t flags; ++ ++ // Entry no in DPB - can be used as a small unique ++ // frame identifier (within the current thread) ++ uint8_t dpb_no; } HEVCFrame; -+#ifdef RPI_WORKER ++#ifdef RPI +typedef struct HEVCLocalContextIntra { + TransformUnit tu; + NeighbourAvailable na; @@ -8846,7 +12925,7 @@ index 0c78812..c268d39 100644 uint8_t cabac_state[HEVC_CONTEXTS]; uint8_t stat_coeff[4]; -@@ -436,8 +435,6 @@ typedef struct HEVCLocalContext { +@@ -436,8 +439,6 @@ typedef struct HEVCLocalContext { int qPy_pred; @@ -8855,7 +12934,7 @@ index 0c78812..c268d39 100644 uint8_t ctb_left_flag; uint8_t ctb_up_flag; uint8_t ctb_up_right_flag; -@@ -453,7 +450,6 @@ typedef struct HEVCLocalContext { +@@ -453,7 +454,6 @@ typedef struct HEVCLocalContext { int ct_depth; CodingUnit cu; PredictionUnit pu; @@ -8863,7 +12942,7 @@ index 0c78812..c268d39 100644 #define BOUNDARY_LEFT_SLICE (1 << 0) #define BOUNDARY_LEFT_TILE (1 << 1) -@@ -464,6 +460,149 @@ typedef struct HEVCLocalContext { +@@ -464,6 +464,207 @@ typedef struct HEVCLocalContext { int boundary_flags; } HEVCLocalContext; @@ -8874,6 +12953,7 @@ index 0c78812..c268d39 100644 +// but allocate more memory and increase the latency before data in the next frame can be processed +#define RPI_NUM_CHUNKS 4 +#define RPI_CHUNK_SIZE 12 ++#define RPI_ROUND_TO_LINES 0 + +// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code +#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE) @@ -8920,6 +13000,9 @@ index 0c78812..c268d39 100644 + RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx + RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx + RPI_PRED_ADD_RESIDUAL_C, // Merged U+V ++ RPI_PRED_ADD_DC, ++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C ++ RPI_PRED_ADD_DC_V, + RPI_PRED_INTRA, + RPI_PRED_I_PCM, + RPI_PRED_CMD_MAX @@ -8934,8 +13017,14 @@ index 0c78812..c268d39 100644 + struct { // TRANSFORM_ADD + uint8_t * dst; + const int16_t * buf; -+ uint32_t stride; ++ uint16_t stride; // Should be good enough for all pic fmts we use ++ int16_t dc; + } ta; ++ struct { ++ uint8_t * dst; ++ uint32_t stride; ++ int dc; ++ } dc; + struct { // INTRA + uint16_t x; + uint16_t y; @@ -8953,6 +13042,7 @@ index 0c78812..c268d39 100644 +#endif + +#ifdef RPI ++#include + +union qpu_mc_pred_cmd_s; +struct qpu_mc_pred_y_p_s; @@ -8979,13 +13069,60 @@ index 0c78812..c268d39 100644 + int used; // 0 if nothing in any Q, 1 otherwise + int used_grp; // 0 if nothing in any Q in the current group + unsigned int max_fill; ++ unsigned int min_gap; + GPU_MEM_PTR_T gptr; -+ 
unsigned int q1_size; // size of 1 uniform Q +} HEVCRpiInterPredEnv; + ++typedef struct HEVCRpiIntraPredEnv { ++ unsigned int n; // Number of commands ++ HEVCPredCmd * cmds; ++} HEVCRpiIntraPredEnv; ++ ++typedef struct HEVCRpiCeoffEnv { ++ unsigned int n; ++ uint16_t * buf; ++} HEVCRpiCoeffEnv; ++ ++typedef struct HEVCRpiCeoffsEnv { ++ HEVCRpiCoeffEnv s[4]; ++ GPU_MEM_PTR_T gptr; ++ void * mptr; ++} HEVCRpiCoeffsEnv; ++ ++typedef struct HEVCRpiDeblkBlk { ++ uint16_t x_ctb; ++ uint16_t y_ctb; ++} HEVCRpiDeblkBlk; ++ ++typedef struct HEVCRpiDeblkEnv { ++ unsigned int n; ++ HEVCRpiDeblkBlk * blks; ++} HEVCRpiDeblkEnv; ++ ++typedef struct HEVCRPiFrameProgressWait { ++ int req; ++ struct HEVCRPiFrameProgressWait * next; ++ sem_t sem; ++} HEVCRPiFrameProgressWait; ++ ++typedef struct HEVCRPiFrameProgressState { ++ struct HEVCRPiFrameProgressWait * first; ++ struct HEVCRPiFrameProgressWait * last; ++ pthread_mutex_t lock; ++} HEVCRPiFrameProgressState; ++ +typedef struct HEVCRpiJob { ++ volatile int terminate; ++ int pending; ++ sem_t sem_in; // set by main ++ sem_t sem_out; // set by worker + HEVCRpiInterPredEnv chroma_ip; + HEVCRpiInterPredEnv luma_ip; ++ int16_t progress[32]; // index by dpb_no ++ HEVCRpiIntraPredEnv intra; ++ HEVCRpiCoeffsEnv coeffs; ++ HEVCRpiDeblkEnv deblk; ++ HEVCRPiFrameProgressWait progress_wait; +} HEVCRpiJob; + +#if RPI_TSTATS @@ -9013,43 +13150,20 @@ index 0c78812..c268d39 100644 typedef struct HEVCContext { const AVClass *c; // needed by private avoptions AVCodecContext *avctx; -@@ -472,6 +611,9 @@ typedef struct HEVCContext { - - HEVCLocalContext *HEVClcList[MAX_NB_THREADS]; - HEVCLocalContext *HEVClc; -+#ifdef RPI_WORKER -+ HEVCLocalContextIntra HEVClcIntra; -+#endif - - uint8_t threads_type; - uint8_t threads_number; -@@ -479,6 +621,90 @@ typedef struct HEVCContext { +@@ -479,6 +680,69 @@ typedef struct HEVCContext { int width; int height; + int used_for_ref; // rpi +#ifdef RPI + int enable_rpi; -+ HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS]; -+ int buf_width; -+ GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS]; -+ GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS]; -+ int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4]; -+ unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4]; -+ int num_coeffs[RPI_MAX_JOBS][4]; -+ int num_xfm_cmds[RPI_MAX_JOBS]; -+ int num_mv_cmds_y[RPI_MAX_JOBS]; -+ int num_mv_cmds_c[RPI_MAX_JOBS]; -+ int num_pred_cmds[RPI_MAX_JOBS]; -+ int num_dblk_cmds[RPI_MAX_JOBS]; -+ int vpu_id; -+ int pass0_job; // Pass0 does coefficient decode -+ int pass1_job; // Pass1 does pixel processing ++ unsigned int pass0_job; // Pass0 does coefficient decode ++ unsigned int pass1_job; // Pass1 does pixel processing + int ctu_count; // Number of CTUs done in pass0 so far + int max_ctu_count; // Number of CTUs when we trigger a round of processing -+ int ctu_per_y_chan; // Number of CTUs per luma QPU -+ int ctu_per_uv_chan; // Number of CTUs per chroma QPU + ++ HEVCRpiJob * jb0; ++ HEVCRpiJob * jb1; + HEVCRpiJob jobs[RPI_MAX_JOBS]; +#if RPI_TSTATS + HEVCRpiStats tstats; @@ -9059,29 +13173,19 @@ index 0c78812..c268d39 100644 + struct qpu_mc_src_s * last_y8_l1; + + // Function pointers -+ uint32_t qpu_filter_uv; -+ uint32_t qpu_filter_uv_b0; -+ uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory -+ uint32_t qpu_filter; -+ uint32_t qpu_filter_b; -+ uint32_t qpu_filter_y_p00; -+ uint32_t qpu_filter_y_b00; ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ const uint8_t * qpu_dummy_frame_emu; ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ uint32_t qpu_dummy_frame_qpu; // Not a frame - 
just a bit of memory ++#endif ++ HEVCRpiQpu qpu; +#endif + -+#ifdef RPI_WORKER + pthread_t worker_thread; -+ pthread_cond_t worker_cond_head; -+ pthread_cond_t worker_cond_tail; -+ pthread_mutex_t worker_mutex; -+ -+ int worker_tail; // Contains the number of posted jobs -+ int worker_head; // Contains the number of completed jobs -+ int kill_worker; // set to 1 to terminate the worker -+#endif -+ -+#define RPI_DEBLOCK_VPU_Q_COUNT 2 + +#ifdef RPI_DEBLOCK_VPU ++#define RPI_DEBLOCK_VPU_Q_COUNT 2 + int enable_rpi_deblock; + + int uv_setup_width; @@ -9109,22 +13213,22 @@ index 0c78812..c268d39 100644 + unsigned int dvq_n; + +#endif ++ HEVCLocalContextIntra HEVClcIntra; ++ HEVCRPiFrameProgressState progress_states[2]; +#endif + uint8_t *cabac_state; /** 1 if the independent slice segment header was successfully parsed */ -@@ -596,6 +822,9 @@ typedef struct HEVCContext { +@@ -595,7 +859,6 @@ typedef struct HEVCContext { + uint16_t white_point[2]; uint32_t max_mastering_luminance; uint32_t min_mastering_luminance; - -+#ifdef RPI -+ int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2]; -+#endif +- } HEVCContext; int ff_hevc_decode_nal_sei(HEVCContext *s); -@@ -703,6 +932,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -703,6 +966,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); @@ -9136,7 +13240,7 @@ index 0c78812..c268d39 100644 /** * Reset SEI values that are stored on the Context. * e.g. Caption data that was extracted during NAL -@@ -716,4 +950,15 @@ extern const uint8_t ff_hevc_qpel_extra_before[4]; +@@ -716,4 +984,89 @@ extern const uint8_t ff_hevc_qpel_extra_before[4]; extern const uint8_t ff_hevc_qpel_extra_after[4]; extern const uint8_t ff_hevc_qpel_extra[4]; @@ -9149,11 +13253,85 @@ index 0c78812..c268d39 100644 +extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); +#endif + ++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int val, const int field); ++ ++void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field); ++ ++// All of these expect that s->threads_type == FF_THREAD_FRAME ++ ++static inline void ff_hevc_progress_wait_mv(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int y) ++{ ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); ++ else ++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); ++} ++ ++static inline void ff_hevc_progress_signal_mv(HEVCContext * const s, const int y) ++{ ++ if (s->enable_rpi && s->used_for_ref) ++ ff_hevc_rpi_progress_signal_field(s, y, 1); ++} ++ ++static inline void ff_hevc_progress_wait_recon(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int y) ++{ ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); ++ else ++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); ++} ++ ++static inline void ff_hevc_progress_signal_recon(HEVCContext * const s, const int y) ++{ ++ if (s->used_for_ref) ++ { ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_signal_field(s, y, 0); ++ else ++ ff_thread_report_progress(&s->ref->tf, y, 0); ++ } ++} ++ ++static inline void ff_hevc_progress_signal_all_done(HEVCContext * const s) ++{ ++ if (s->enable_rpi) ++ { ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); ++ } ++ else ++ 
ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); ++} ++ ++#else ++ ++// Use #define as that allows us to discard "jb" which won't exist in non-RPI world ++#define ff_hevc_progress_wait_mv(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) ++#define ff_hevc_progress_wait_recon(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) ++#define ff_hevc_progress_signal_mv(s, y) ++#define ff_hevc_progress_signal_recon(s, y) ff_thread_report_progress(&s->ref->tf, y, 0) ++#define ff_hevc_progress_signal_all_done(s) ff_thread_report_progress(&s->ref->tf, INT_MAX, 0) ++ +#endif ++ ++// Set all done - signal nothing (used in missing refs) ++// Works for both rpi & non-rpi ++static inline void ff_hevc_progress_set_all_done(HEVCFrame * const ref) ++{ ++ if (ref->tf.progress != NULL) ++ { ++ int * const p = (int *)&ref->tf.progress->data; ++ p[0] = INT_MAX; ++ p[1] = INT_MAX; ++ } ++} + #endif /* AVCODEC_HEVCDEC_H */ diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c -index 23e923f..c4f1a6c 100644 +index 23e923f8e5..82009c4ed4 100644 --- a/libavcodec/hevcdsp.c +++ b/libavcodec/hevcdsp.c @@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { @@ -9277,13 +13455,14 @@ index 23e923f..c4f1a6c 100644 void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) { #undef FUNC -@@ -193,12 +307,38 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +@@ -193,12 +307,54 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) +#if !RPI_HEVC_SAND +#define SLICED_LOOP_FILTERS(depth) +#define SLICED_ADD_RESIDUAL(depth) ++#define SLICED_SAO(depth) +#else +#define SLICED_ADD_RESIDUAL(depth)\ + hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ @@ -9298,13 +13477,24 @@ index 23e923f..c4f1a6c 100644 + hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ + hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ + hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ -+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth); ++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ ++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) +#define SLICED_LOOP_FILTERS(depth)\ + hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ + hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ + hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) -+#endif ++#define SLICED_SAO(depth)\ ++ for (i = 0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ ++ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \ ++ } \ ++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ ++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) + ++#endif + #define HEVC_DSP(depth) \ hevcdsp->put_pcm = FUNC(put_pcm, depth); \ @@ -9312,31 +13502,39 @@ index 23e923f..c4f1a6c 100644 hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \ ++ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ ++ 
hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ ++ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ ++ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ + SLICED_ADD_RESIDUAL(depth); \ hevcdsp->dequant = FUNC(dequant, depth); \ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \ -@@ -225,6 +365,19 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +@@ -212,18 +368,13 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ + hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ + \ +- hevcdsp->sao_band_filter[0] = \ +- hevcdsp->sao_band_filter[1] = \ +- hevcdsp->sao_band_filter[2] = \ +- hevcdsp->sao_band_filter[3] = \ +- hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter, depth); \ +- hevcdsp->sao_edge_filter[0] = \ +- hevcdsp->sao_edge_filter[1] = \ +- hevcdsp->sao_edge_filter[2] = \ +- hevcdsp->sao_edge_filter[3] = \ +- hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth); \ ++ for (i = 0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ ++ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ ++ } \ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ ++ SLICED_SAO(depth); \ \ -+ hevcdsp->sao_band_filter_c[0] = \ -+ hevcdsp->sao_band_filter_c[1] = \ -+ hevcdsp->sao_band_filter_c[2] = \ -+ hevcdsp->sao_band_filter_c[3] = \ -+ hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth); \ -+ hevcdsp->sao_edge_filter_c[0] = \ -+ hevcdsp->sao_edge_filter_c[1] = \ -+ hevcdsp->sao_edge_filter_c[2] = \ -+ hevcdsp->sao_edge_filter_c[3] = \ -+ hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth); \ -+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ -+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth); \ -+ \ QPEL_FUNCS(depth); \ QPEL_UNI_FUNCS(depth); \ - QPEL_BI_FUNCS(depth); \ -@@ -232,6 +385,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +@@ -232,6 +383,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) EPEL_UNI_FUNCS(depth); \ EPEL_BI_FUNCS(depth); \ \ @@ -9344,7 +13542,7 @@ index 23e923f..c4f1a6c 100644 hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ -@@ -257,6 +411,8 @@ int i = 0; +@@ -257,6 +409,8 @@ int i = 0; break; } @@ -9354,7 +13552,7 @@ index 23e923f..c4f1a6c 100644 ff_hevc_dsp_init_x86(hevcdsp, bit_depth); if (ARCH_ARM) diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h -index eefb3cd..9e44e7f 100644 +index eefb3cd152..4b48055def 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h @@ -25,6 +25,7 @@ @@ -9365,7 +13563,7 @@ index eefb3cd..9e44e7f 100644 #include "get_bits.h" #define MAX_PB_SIZE 64 -@@ -42,11 +43,30 @@ typedef struct SAOParams { +@@ -42,11 +43,39 @@ typedef struct SAOParams { uint8_t type_idx[3]; ///< sao_type_idx } SAOParams; @@ -9379,48 +13577,67 @@ index eefb3cd..9e44e7f 100644 + int8_t ref_idx[2]; + int8_t pred_flag; +} MvField; ++ ++#ifdef RPI ++#define SAO_FILTER_N 6 ++#else ++#define SAO_FILTER_N 5 ++#endif ++ + typedef struct HEVCDSPContext { void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, struct GetBitContext *gb, int 
pcm_bit_depth); void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); ++ void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc); +#if RPI_HEVC_SAND -+ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); -+ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); ++ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v); ++ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u); + + void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); ++ void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv); + void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, + struct GetBitContext *gb, int pcm_bit_depth); +#endif void (*dequant)(int16_t *coeffs, int16_t log2_size); -@@ -60,14 +80,23 @@ typedef struct HEVCDSPContext { +@@ -58,16 +87,31 @@ typedef struct HEVCDSPContext { - void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + void (*idct_dc[4])(int16_t *coeffs); + +- void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, int16_t *sao_offset_val, int sao_left_class, int width, int height); -+ void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++#if RPI_HEVC_SAND ++ void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); ++#endif /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */ - void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, +- void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++ void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, int16_t *sao_offset_val, int sao_eo_class, int width, int height); -+ void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++#if RPI_HEVC_SAND ++ void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, + const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height); ++#endif void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++#if RPI_HEVC_SAND + void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, + uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++#endif void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, int height, intptr_t mx, intptr_t my, int width); -@@ -120,6 +149,22 @@ typedef struct HEVCDSPContext { +@@ -120,6 +164,22 @@ typedef struct HEVCDSPContext { void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, int32_t 
*tc, uint8_t *no_p, uint8_t *no_q); @@ -9444,24 +13661,23 @@ index eefb3cd..9e44e7f 100644 void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth); diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c -index 25f1a81..d475b3d 100644 +index 75763ce85e..60053d4a95 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c -@@ -26,6 +26,10 @@ +@@ -26,6 +26,8 @@ #include "bit_depth_template.c" #include "hevcdsp.h" -+#ifdef RPI -+#include "rpi_zc.h" -+#endif ++#include "rpi_shader_template.h" + static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, GetBitContext *gb, int pcm_bit_depth) { -@@ -41,6 +45,29 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height +@@ -41,6 +43,30 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height } } ++#if RPI_HEVC_SAND +static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height, + GetBitContext *gb, int pcm_bit_depth) +{ @@ -9483,18 +13699,34 @@ index 25f1a81..d475b3d 100644 + dst += stride; + } +} -+ ++#endif + static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res, ptrdiff_t stride, int size) { -@@ -58,6 +85,44 @@ static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res, +@@ -58,6 +84,106 @@ static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res, } } ++static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size; x++) { ++ dst[x] = av_clip_pixel(dst[x] + dc); ++ } ++ dst += stride; ++ } ++} ++ ++ +#if RPI_HEVC_SAND -+static av_always_inline void FUNC(add_residual_u_v)(uint8_t *_dst, const int16_t *res, -+ ptrdiff_t stride, int size) ++static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, const int dc_v, int size) +{ + int x, y; + pixel *dst = (pixel *)_dst; @@ -9504,6 +13736,25 @@ index 25f1a81..d475b3d 100644 + for (y = 0; y < size; y++) { + for (x = 0; x < size * 2; x += 2) { + dst[x] = av_clip_pixel(dst[x] + *res); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); ++ res++; ++ } ++ dst += stride; ++ } ++} ++ ++static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, const int dc_u, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + dc_u); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + *res); + res++; + } + dst += stride; @@ -9518,6 +13769,10 @@ index 25f1a81..d475b3d 100644 + const int16_t * ru = res; + const int16_t * rv = res + size * size; + ++// rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1); ++// rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0); ++// rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0); ++ + stride /= sizeof(pixel); + + for (y = 0; y < size; y++) { @@ -9527,39 +13782,82 @@ index 25f1a81..d475b3d 100644 + } + dst += stride; + } ++ ++// rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1); +} ++ ++ ++static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ const int dc_v = dc >> 16; ++ const int dc_u = (dc << 16) >> 16; ++ ++ stride 
/= sizeof(pixel);
++
++ for (y = 0; y < size; y++) {
++ for (x = 0; x < size * 2; x += 2) {
++ dst[x] = av_clip_pixel(dst[x] + dc_u);
++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
++ }
++ dst += stride;
++ }
++}
++
++
+#endif
+
 static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res, ptrdiff_t stride)
 {
@@ -82,6 +208,132 @@ static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
 FUNC(add_residual)(_dst, res, stride, 32);
 }
 
++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 16);
++}
++
++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 32);
++}
++
+#if RPI_HEVC_SAND
+// -- U -- (plaited)
+
+static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_v)
+{
-+ FUNC(add_residual_u_v)(_dst, res, stride, 4);
++ FUNC(add_residual_u)(_dst, res, stride, dc_v, 4);
+}
+
+static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_v)
+{
-+ FUNC(add_residual_u_v)(_dst, res, stride, 8);
++ FUNC(add_residual_u)(_dst, res, stride, dc_v, 8);
+}
+
+static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_v)
+{
-+ FUNC(add_residual_u_v)(_dst, res, stride, 16);
++ FUNC(add_residual_u)(_dst, res, stride, dc_v, 16);
+}
+
+static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_v)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+// -- V -- (plaited)
+
+static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_u)
+{
-+ FUNC(add_residual_u_v)(_dst + 1, res, stride, 4);
++ FUNC(add_residual_v)(_dst, res, stride, dc_u, 4);
+}
+
+static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_u)
+{
-+ FUNC(add_residual_u_v)(_dst + 1, res, stride, 8);
++ FUNC(add_residual_v)(_dst, res, stride, dc_u, 8);
+}
+
+static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_u)
+{
-+ FUNC(add_residual_u_v)(_dst + 1, res, stride, 16);
++ FUNC(add_residual_v)(_dst, res, stride, dc_u, 16);
+}
+
+static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
++ ptrdiff_t stride, int dc_u)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
+static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 4);
+}
+
+static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 8);
+}
+
+static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ FUNC(add_residual_c)(_dst, res, stride, 16);
+}
+
+static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
+ ptrdiff_t stride)
+{
+ // Should never occur for 420, which is all that sand supports
+ av_assert0(0);
+}
+
++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc_c)(uint8_t
*_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ +#endif + + static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) { int16_t *coeffs = (int16_t *) _coeffs; -@@ -361,7 +510,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, +@@ -352,6 +604,32 @@ static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride + } + } + ++ ++#if BIT_DEPTH == 10 ++#if RPI_HEVC_SAND ++// We need a 32 bit variation for the _c restores so hijack bit depth 10 ++#undef pixel ++#undef BIT_DEPTH ++#define pixel uint32_t ++#define BIT_DEPTH 32 ++#endif ++// All 16 bit variations are the same ++#define sao_edge_restore_0_10 sao_edge_restore_0_9 ++#define sao_edge_restore_1_10 sao_edge_restore_1_9 ++#define sao_edge_restore_0_11 sao_edge_restore_0_9 ++#define sao_edge_restore_1_11 sao_edge_restore_1_9 ++#define sao_edge_restore_0_12 sao_edge_restore_0_9 ++#define sao_edge_restore_1_12 sao_edge_restore_1_9 ++#define sao_edge_restore_0_13 sao_edge_restore_0_9 ++#define sao_edge_restore_1_13 sao_edge_restore_1_9 ++#define sao_edge_restore_0_14 sao_edge_restore_0_9 ++#define sao_edge_restore_1_14 sao_edge_restore_1_9 ++#define sao_edge_restore_0_15 sao_edge_restore_0_9 ++#define sao_edge_restore_1_15 sao_edge_restore_1_9 ++#define sao_edge_restore_0_16 sao_edge_restore_0_9 ++#define sao_edge_restore_1_16 sao_edge_restore_1_9 ++#endif ++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 + static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, + int *borders, int _width, int _height, +@@ -361,7 +639,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, int x, y; pixel *dst = (pixel *)_dst; pixel *src = (pixel *)_src; @@ -9632,7 +13985,7 @@ index 25f1a81..d475b3d 100644 int sao_eo_class = sao->eo_class[c_idx]; int init_x = 0, width = _width, height = _height; -@@ -370,33 +518,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, +@@ -370,33 +647,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, if (sao_eo_class != SAO_EO_VERT) { if (borders[0]) { @@ -9670,7 +14023,7 @@ index 25f1a81..d475b3d 100644 height--; } } -@@ -411,7 +555,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -411,7 +684,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, int x, y; pixel *dst = (pixel *)_dst; pixel *src = (pixel *)_src; @@ -9678,7 +14031,7 @@ index 25f1a81..d475b3d 100644 int sao_eo_class = sao->eo_class[c_idx]; int init_x = 0, init_y = 0, width = _width, height = _height; -@@ -420,34 +563,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -420,34 +692,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, if (sao_eo_class != SAO_EO_VERT) { if (borders[0]) { @@ -9717,24 +14070,22 @@ index 25f1a81..d475b3d 100644 height--; } } -@@ -488,6 +627,127 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -487,6 +755,121 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + } } - ++#endif ++#if BIT_DEPTH == 32 ++#undef BIT_DEPTH ++#undef pixel ++#define BIT_DEPTH 10 ++#define pixel uint16_t ++#endif + +// --- Plaited chroma versions + -+#if BIT_DEPTH != 8 -+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const 
uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height) -+{ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else ++#if RPI_HEVC_SAND ++ +static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, @@ -9760,23 +14111,17 @@ index 25f1a81..d475b3d 100644 + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 2) + { -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); ++// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); ++// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); ++ // *** & 31 shouldn't be wanted but just now we generate broken input that ++ // crashes us in 10-bit world ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); + } + dst += stride_dst; + src += stride_src; + } +} -+#endif -+ -+#if BIT_DEPTH != 8 -+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, -+ int eo, int width, int height) { -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else + +static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, @@ -9794,9 +14139,12 @@ index 25f1a81..d475b3d 100644 + int a_stride, b_stride; + int x, y; + ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ + stride_dst /= sizeof(pixel); + width *= 2; + ++ av_assert0(width <= 64); ++ + a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; + b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; + for (y = 0; y < height; y++) { @@ -9814,43 +14162,42 @@ index 25f1a81..d475b3d 100644 + dst += stride_dst; + } +} -+#endif + -+#if BIT_DEPTH != 8 -+static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else ++// Do once ++#if BIT_DEPTH == 8 +// Any old 2 byte 'normal' restore will work for these -+#define sao_edge_restore_c_0_8 sao_edge_restore_0_10 -+#define sao_edge_restore_c_1_8 sao_edge_restore_1_10 ++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 ++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 ++// We need 32 bit for 9 bit+ ++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 ++#define 
sao_edge_restore_c_1_11 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 +#endif + ++#endif // RPI_HEVC_SAND + + #undef CMP - //////////////////////////////////////////////////////////////////////////////// -@@ -1690,3 +1950,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, +@@ -1690,3 +2073,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, #undef TQ1 #undef TQ2 #undef TQ3 + -+#ifdef RPI ++#if RPI_HEVC_SAND + +// line zero +#define P3 pix_l[0 * xstride] @@ -10064,7 +14411,7 @@ index 25f1a81..d475b3d 100644 +#endif + diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c -index 7a86ed3..7d32c4a 100644 +index 7a86ed3d31..7d32c4ab14 100644 --- a/libavcodec/hevcpred.c +++ b/libavcodec/hevcpred.c @@ -24,6 +24,7 @@ @@ -10146,7 +14493,7 @@ index 7a86ed3..7d32c4a 100644 case 9: HEVC_PRED(9); diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h -index eb17663..00ba3f9 100644 +index eb17663683..00ba3f94c0 100644 --- a/libavcodec/hevcpred.h +++ b/libavcodec/hevcpred.h @@ -38,6 +38,17 @@ typedef struct HEVCPredContext { @@ -10168,10 +14515,10 @@ index eb17663..00ba3f9 100644 void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c -index 6ae87cc..c14dddd 100644 +index 6fe33546b1..2f9f5f2798 100644 --- a/libavcodec/hevcpred_template.c +++ b/libavcodec/hevcpred_template.c -@@ -20,13 +20,55 @@ +@@ -20,13 +20,110 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -10183,34 +14530,90 @@ index 6ae87cc..c14dddd 100644 #include "hevcpred.h" +#ifdef RPI -+#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" +#endif + +#define DUMP_PRED 0 + #define POS(x, y) src[(x) + stride * (y)] -+#if PRED_C -+ ++// REPEAT_INCLUDE defined at EOF ++#if defined(RPI) && !defined(INCLUDED_ONCE) +typedef uint8_t (* c8_dst_ptr_t)[2]; +typedef const uint8_t (* c8_src_ptr_t)[2]; ++typedef uint16_t (* c16_dst_ptr_t)[2]; ++typedef const uint16_t (* c16_src_ptr_t)[2]; ++ ++// *** On ARM make these NEON registers ++typedef struct pixel4_16 { ++ uint16_t x[4]; ++} pixel4_16; ++typedef struct pixel4_32 { ++ uint32_t x[4]; ++} pixel4_32; ++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) ++{ ++ pixel4_16 t = {{x, x, x, x}}; ++ return t; ++} ++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) ++{ ++ pixel4_32 t = {{x, x, x, x}}; ++ return t; ++} ++#endif ++ ++#if PRED_C ++// For chroma we double pixel size so we copy pairs ++#undef pixel ++#undef pixel2 ++#undef pixel4 ++#undef dctcoef ++#undef INIT_CLIP ++#undef no_rnd_avg_pixel4 ++#undef rnd_avg_pixel4 ++#undef AV_RN2P ++#undef AV_RN4P ++#undef AV_RN4PA ++#undef AV_WN2P ++#undef AV_WN4P ++#undef AV_WN4PA ++#undef CLIP ++#undef FUNC ++#undef FUNCC ++#undef av_clip_pixel ++#undef PIXEL_SPLAT_X4 + +#if BIT_DEPTH == 8 -+#undef BIT_DEPTH -+#define BIT_DEPTH 16 -+#include "bit_depth_template.c" -+#undef FUNC -+#define FUNC(a) FUNC3(a, 8, _c) ++#define pixel 
uint16_t ++#define pixel4 pixel4_16 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 ++#define cpel uint8_t ++#define c_src_ptr_t c8_src_ptr_t ++#define c_dst_ptr_t c8_dst_ptr_t +#else -+#undef FUNC -+#define FUNC FUNCC ++#define pixel uint32_t ++#define pixel4 pixel4_32 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 ++#define cpel uint16_t ++#define c_src_ptr_t c16_dst_ptr_t ++#define c_dst_ptr_t c16_dst_ptr_t ++#endif ++#define AV_RN4P(p) (*(pixel4*)(p)) ++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) ++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) +#endif + ++ ++// Get PW prior to horrid PRED_C trickery ++#if BIT_DEPTH == 8 ++#define PW 1 ++#else ++#define PW 2 +#endif + -+#if DUMP_PRED -+#ifndef DEBUG_ONCE -+#define DEBUG_ONCE ++ ++#if DUMP_PRED && !defined(INCLUDE_ONCE) +static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) +{ + for (unsigned int y = 0; y != size; y++, data += stride * 2) { @@ -10222,17 +14625,16 @@ index 6ae87cc..c14dddd 100644 + printf("\n"); +} +#endif -+#endif + static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0, int log2_size, int c_idx) { -@@ -69,8 +111,11 @@ do { \ +@@ -69,8 +166,11 @@ do { \ AV_WN4P(&ptr[i], a); \ else \ a = PIXEL_SPLAT_X4(ptr[i + 3]) - -+#ifdef RPI_WORKER ++#ifdef RPI + HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ; +#else HEVCLocalContext *lc = s->HEVClc; @@ -10240,7 +14642,7 @@ index 6ae87cc..c14dddd 100644 int i; int hshift = s->ps.sps->hshift[c_idx]; int vshift = s->ps.sps->vshift[c_idx]; -@@ -79,15 +124,23 @@ do { \ +@@ -79,15 +179,23 @@ do { \ int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; int size_in_luma_v = size << vshift; int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; @@ -10256,18 +14658,18 @@ index 6ae87cc..c14dddd 100644 - ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); + const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); +#if defined(RPI) -+ pixel *const src = s->frame->format != AV_PIX_FMT_SAND128 ? ++ pixel *const src = !av_rpi_is_sand_frame(s->frame) ? + (pixel*)s->frame->data[c_idx] + x + y * stride : + c_idx == 0 ? -+ (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) : -+ (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y); ++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : ++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); +#else pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; +#endif int min_pu_width = s->ps.sps->min_pu_width; -@@ -95,14 +148,20 @@ do { \ +@@ -95,14 +203,20 @@ do { \ lc->tu.intra_pred_mode; pixel4 a; pixel left_array[2 * MAX_TB_SIZE + 1]; @@ -10288,7 +14690,7 @@ index 6ae87cc..c14dddd 100644 int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); int cand_left = lc->na.cand_left; int cand_up_left = lc->na.cand_up_left; -@@ -114,6 +173,26 @@ do { \ +@@ -114,6 +228,27 @@ do { \ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - (x0 + size_in_luma_h)) >> hshift; @@ -10301,10 +14703,11 @@ index 6ae87cc..c14dddd 100644 +#endif + +#if defined(RPI) -+ if (s->frame->format == AV_PIX_FMT_SAND128) { ++ if (av_rpi_is_sand_frame(s->frame)) { ++ // N.B. 
stride is in pixels (not bytes) or in the case of chroma pixel-pairs + const AVFrame * const frame = s->frame; + const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 -+ const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride; ++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; + if ((x & mask) == 0) + src_l -= stripe_adj; + if (((x + size) & mask) == 0) @@ -10315,7 +14718,7 @@ index 6ae87cc..c14dddd 100644 if (s->ps.pps->constrained_intra_pred_flag == 1) { int size_in_luma_pu_v = PU(size_in_luma_v); int size_in_luma_pu_h = PU(size_in_luma_h); -@@ -163,23 +242,24 @@ do { \ +@@ -163,23 +298,24 @@ do { \ top[-1] = 128; } if (cand_up_left) { @@ -10347,29 +14750,29 @@ index 6ae87cc..c14dddd 100644 size - bottom_left_size); } -@@ -268,7 +348,11 @@ do { \ +@@ -268,7 +404,11 @@ do { \ cand_up_left = 1; cand_left = 1; } else { // No samples available -+#if PRED_C && BIT_DEPTH == 16 -+ left[-1] = 0x8080; ++#if PRED_C ++ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8)); +#else left[-1] = (1 << (BIT_DEPTH - 1)); +#endif EXTEND(top, left[-1], 2 * size); EXTEND(left, left[-1], 2 * size); } -@@ -287,6 +371,9 @@ do { \ +@@ -287,6 +427,9 @@ do { \ top[-1] = left[-1]; // Filtering process -+ // Sand128 can only apply to chroma_format_idc == 1 so we don't need to ++ // Sand can only apply to chroma_format_idc == 1 so we don't need to + // worry about chroma smoothing for that case +#if !PRED_C if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { if (mode != INTRA_DC && size != 4){ int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; -@@ -342,13 +429,46 @@ do { \ +@@ -342,6 +485,30 @@ do { \ mode); break; } @@ -10399,24 +14802,8 @@ index 6ae87cc..c14dddd 100644 +#endif } -+#if !PRED_C || BIT_DEPTH == 16 #define INTRA_PRED(size) \ - static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ - { \ - FUNC(intra_pred)(s, x0, y0, size, c_idx); \ - } -+#else -+#define INTRA_PRED(size) \ -+static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ -+{ \ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#endif - - INTRA_PRED(2) - INTRA_PRED(3) -@@ -357,6 +477,7 @@ INTRA_PRED(5) +@@ -357,6 +524,7 @@ INTRA_PRED(5) #undef INTRA_PRED @@ -10424,7 +14811,7 @@ index 6ae87cc..c14dddd 100644 static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int trafo_size) -@@ -371,13 +492,46 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to +@@ -371,6 +539,29 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); } @@ -10435,9 +14822,9 @@ index 6ae87cc..c14dddd 100644 +{ + int x, y; + int size = 1 << trafo_size; -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; + + for (y = 0; y < size; y++, src += stride) + { @@ -10452,26 +14839,9 @@ index 6ae87cc..c14dddd 100644 +} +#endif -+#if !PRED_C || BIT_DEPTH == 16 #define PRED_PLANAR(size)\ static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ - const uint8_t *left, ptrdiff_t 
stride) \ - { \ - FUNC(pred_planar)(src, top, left, stride, size + 2); \ - } -+#else -+#define PRED_PLANAR(size)\ -+static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ -+ const uint8_t *left, ptrdiff_t stride) \ -+{ \ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF", __func__); \ -+ abort(); \ -+} -+#endif - - PRED_PLANAR(0) - PRED_PLANAR(1) -@@ -386,6 +540,7 @@ PRED_PLANAR(3) +@@ -386,6 +577,7 @@ PRED_PLANAR(3) #undef PRED_PLANAR @@ -10479,7 +14849,7 @@ index 6ae87cc..c14dddd 100644 static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int log2_size, int c_idx) -@@ -416,7 +571,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, +@@ -416,7 +608,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, POS(0, y) = (left[y] + 3 * dc + 2) >> 2; } } @@ -10490,9 +14860,9 @@ index 6ae87cc..c14dddd 100644 +{ + unsigned int i, j; + const unsigned int size = (1 << log2_size); -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; + unsigned int dc0 = size; + unsigned int dc1 = size; + @@ -10533,7 +14903,7 @@ index 6ae87cc..c14dddd 100644 static av_always_inline void FUNC(pred_angular)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, -@@ -428,15 +629,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +@@ -428,15 +666,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, const pixel *top = (const pixel *)_top; const pixel *left = (const pixel *)_left; @@ -10549,7 +14919,7 @@ index 6ae87cc..c14dddd 100644 int angle = intra_pred_angle[mode - 2]; pixel ref_array[3 * MAX_TB_SIZE + 4]; pixel *ref_tmp = ref_array + size; -@@ -509,6 +701,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +@@ -509,6 +738,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, } } } @@ -10561,26 +14931,26 @@ index 6ae87cc..c14dddd 100644 + int mode, int size) +{ + int x, y; -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ c_src_ptr_t top = (c_src_ptr_t)_top; ++ c_src_ptr_t left = (c_src_ptr_t)_left; + + const int angle = intra_pred_angle[mode - 2]; -+ uint8_t ref_array[3 * MAX_TB_SIZE + 4][2]; -+ c8_dst_ptr_t ref_tmp = ref_array + size; -+ c8_src_ptr_t ref; ++ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; ++ c_dst_ptr_t ref_tmp = ref_array + size; ++ c_src_ptr_t ref; + const int last = (size * angle) >> 5; + + if (mode >= 18) { + ref = top - 1; + if (angle < 0 && last < -1) { -+ memcpy(ref_tmp, top - 1, (size + 1) * 2); ++ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW); + for (x = last; x <= -1; x++) + { + ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; + ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; + } -+ ref = (c8_src_ptr_t)ref_tmp; ++ ref = (c_src_ptr_t)ref_tmp; + } + + for (y = 0; y < size; y++, src += stride) { @@ -10594,19 +14964,19 @@ index 6ae87cc..c14dddd 100644 + fact * ref[x + idx + 2][1] + 16) >> 5; + } + } else { -+ memcpy(src, ref + idx + 1, size * 2); ++ memcpy(src, ref + idx + 1, size * 2 * PW); + } + } + } else { + ref = left - 1; + if (angle < 0 && last < -1) { -+ memcpy(ref_tmp, left - 1, (size + 1) * 2); ++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); + for (x = last; x 
<= -1; x++) + { + ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; + ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; + } -+ ref = (c8_src_ptr_t)ref_tmp; ++ ref = (c_src_ptr_t)ref_tmp; + } + + for (x = 0; x < size; x++, src++) { @@ -10633,8 +15003,29 @@ index 6ae87cc..c14dddd 100644 static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, const uint8_t *left, +@@ -538,6 +844,10 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, + FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5); + } + ++#undef cpel ++#undef c_src_ptr_t ++#undef c_dst_ptr_t ++ + #undef EXTEND_LEFT_CIP + #undef EXTEND_RIGHT_CIP + #undef EXTEND_UP_CIP +@@ -549,3 +859,9 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, + #undef EXTEND + #undef MIN_TB_ADDR_ZS + #undef POS ++#undef PW ++ ++#ifndef INCLUDED_ONCE ++#define INCLUDED_ONCE ++#endif ++ diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c -index 81fcebc..7858478 100644 +index 81fcebce23..7858478b5d 100644 --- a/libavcodec/mmaldec.c +++ b/libavcodec/mmaldec.c @@ -24,6 +24,9 @@ @@ -10656,10 +15047,10 @@ index 81fcebc..7858478 100644 #include "avcodec.h" diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c -index 54b7be1..894dcdc 100644 +index 8f85e9362d..23080e8910 100644 --- a/libavcodec/mpeg4videodec.c +++ b/libavcodec/mpeg4videodec.c -@@ -2247,6 +2247,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +@@ -2249,6 +2249,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) if (ctx->divx_version >= 0) s->workaround_bugs |= FF_BUG_HPEL_CHROMA; @@ -10669,7 +15060,7 @@ index 54b7be1..894dcdc 100644 } if (s->workaround_bugs & FF_BUG_STD_QPEL) { -@@ -2271,6 +2274,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +@@ -2273,6 +2276,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) s->workaround_bugs, ctx->lavc_build, ctx->xvid_build, ctx->divx_version, ctx->divx_build, s->divx_packed ? 
"p" : ""); @@ -10678,95 +15069,40 @@ index 54b7be1..894dcdc 100644 s->codec_id == AV_CODEC_ID_MPEG4 && avctx->idct_algo == FF_IDCT_AUTO) { diff --git a/libavcodec/raw.c b/libavcodec/raw.c -index 7146e3a..240b274 100644 +index 7146e3a0f8..a8dcb1c251 100644 --- a/libavcodec/raw.c +++ b/libavcodec/raw.c -@@ -273,6 +273,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { +@@ -273,6 +273,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, + /* RPI */ +#ifdef RPI + { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, +#endif + /* special */ { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c -index d181b74..84f8e8c 100644 +index d181b74570..3fe75711c1 100644 --- a/libavcodec/rawenc.c +++ b/libavcodec/rawenc.c -@@ -31,6 +31,7 @@ +@@ -31,6 +31,8 @@ #include "libavutil/intreadwrite.h" #include "libavutil/imgutils.h" #include "libavutil/internal.h" +#include "libavutil/avassert.h" ++#include "libavutil/rpi_sand_fns.h" static av_cold int raw_encode_init(AVCodecContext *avctx) { -@@ -49,6 +50,101 @@ FF_ENABLE_DEPRECATION_WARNINGS +@@ -49,6 +51,71 @@ FF_ENABLE_DEPRECATION_WARNINGS return 0; } -+// x0 & width in luma units (so chroma * 2) -+// x0 odd for v -+static uint8_t * sand_copy_line_u(uint8_t * dst, const uint8_t * src, -+ unsigned int x0, const unsigned int width, -+ const unsigned int stride1, const unsigned int stride2) -+{ -+ unsigned int xend; -+ -+ // Skip any empty slices -+ src += (x0 & ~(stride1 - 1)) * stride2; -+ x0 &= (stride1 - 1); -+ -+ xend = x0 + width; -+ for (unsigned int x = 0; x < xend; x += stride1) -+ { -+ const unsigned int w = FFMIN(stride1, xend - x) - x0; -+ for (unsigned int i = 0; i < w; i += 2) -+ *dst++ = src[x0 + i]; -+ src += stride1 * stride2; -+ x0 &= 1; -+ } -+ -+ return dst; -+} -+ -+static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int width, const unsigned int height) -+{ -+ for (unsigned int y = y0; y < height + y0; ++y) { -+ dst = sand_copy_line_u(dst, frame->data[1] + y * frame->linesize[1], x0, width, frame->linesize[1], frame->linesize[3]); -+ } -+ return dst; -+} -+ -+static uint8_t * sand_copy_line_y(uint8_t * dst, const uint8_t * src, -+ unsigned int x0, const unsigned int width, -+ const unsigned int stride1, const unsigned int stride2) -+{ -+ unsigned int xend; -+ -+ // Skip any empty slices -+ src += (x0 & ~(stride1 - 1)) * stride2; -+ x0 &= (stride1 - 1); -+ -+ xend = x0 + width; -+ for (unsigned int x = 0; x < xend; x += stride1) -+ { -+ const unsigned int w = FFMIN(stride1, xend - x) - x0; -+ memcpy(dst, src + x0, w); -+ dst += w; -+ src += stride1 * stride2; -+ x0 = 0; -+ } -+ return dst; -+} -+ -+static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) +{ + const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); @@ -10781,8 +15117,6 @@ index d181b74..84f8e8c 100644 + if (sd != NULL) { + const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; + -+// printf("PScan: h/w=%d/%d, off=%d,%d\n", pscan->height, pscan->width, pscan->position[0][0], pscan->position[0][0]); -+ + x0 = si->left_offset; + y0 
= si->top_offset; + } @@ -10793,26 +15127,55 @@ index d181b74..84f8e8c 100644 + + dst = pkt->data; + -+ // Luma is "easy" -+ for (int y = y0; y < height + y0; ++y) { -+ dst = sand_copy_line_y(dst, frame->data[0] + y * frame->linesize[0], x0, width, frame->linesize[0], frame->linesize[3]); -+ } -+ -+ // Chroma is dull -+ dst = cpy_sand_c(dst, frame, x0 & ~1, y0 / 2, width, height / 2); -+ dst = cpy_sand_c(dst, frame, x0 | 1, y0 / 2, width, height / 2); ++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); ++ dst += width * height; ++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); + return 0; +} ++ ++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); ++ int size; ++ int width = frame->width; ++ int height = frame->height; ++ int x0 = 0; ++ int y0 = 0; ++ uint8_t * dst; ++ int ret; ++ ++ if (sd != NULL) { ++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ ++ x0 = si->left_offset; ++ y0 = si->top_offset; ++ } ++ ++ size = width * height * 3; ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); ++ dst += width * height * 2; ++ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); ++ return 0; ++} ++ + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *frame, int *got_packet) { -@@ -58,6 +154,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, +@@ -58,6 +125,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, if (ret < 0) return ret; -+ if (frame->format == AV_PIX_FMT_SAND128) { -+ ret = raw_sand_as_yuv420(avctx, pkt, frame); ++ if (av_rpi_is_sand_frame(frame)) { ++ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame); + *got_packet = (ret == 0); + return ret; + } @@ -10820,13 +15183,4018 @@ index d181b74..84f8e8c 100644 if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) return ret; if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, -diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h +diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s new file mode 100644 -index 0000000..4309f1c +index 0000000000..391f761df9 --- /dev/null -+++ b/libavcodec/rpi_hevc_transform.h ++++ b/libavcodec/rpi_hevc_transform.s +@@ -0,0 +1,923 @@ ++# ****************************************************************************** ++# Argon Design Ltd. ++# (c) Copyright 2015 Argon Design Ltd. All rights reserved. 
++#
++# Module : HEVC
++# Author : Peter de Rivaz
++# ******************************************************************************
++
++# HEVC VPU Transform
++#
++# Transform matrix can be thought of as
++# output row vector = input row vector * transMatrix2
++#
++# The even rows of the matrix are symmetric
++# The odd rows of the matrix are antisymmetric
++#
++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
++#
++# EXAMPLE
++# (a b c d) (1 2 2 1)
++# (3 4 -4 -3)
++# (5 6 6 5)
++# (7 8 -8 -7)
++#
++# x=(a c)(1 2) = 1a+5c 2a+6c
++# (5 6)
++#
++# y=(b d)(3 4) = 3b+7d 4b+8d
++# (7 8)
++#
++# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
++# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
++#
++# Final results are (u , v[::-1])
++#
++#
++# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
++# Apply the even matrix first and stop before rounding
++# Then apply the odd matrix in a full manner:
++#
++# First step is to compute partial products with the first input (16 cycles)
++# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output
++# 2a 4b 6c 8d
++# 2a -4b 6c -8d
++# 1a -3b 5c -7d
++#
++# Second step is to sum partial products into final position (8 cycles)
++# 1a+3b+5c+7d
++# 2a+4b+6c+8d
++# 2a-4b+6c-8d
++# 1a-3b+5c-7d
++#
++# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
++#
++# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
++#
++# For 8x8 we could compute two in parallel.
++#
++#
++
++# Columns are transformed first
++#
++# Store top left half of transMatrix2 in HX(32,0)
++# Store bottom left half of transMatrix2 in HX(32,32)
++#
++# For 16x16
++# HX(0:15,0) contains input data before transform
++# HY(0:15,0) contains 32bit output data after transform
++# HX(32,0) contains even rows of left half of transMatrix2
++# HX(32,32) contains odd rows of left half of transMatrix2
++# HY(48,0) contains partial products ready for summing
++#
++
++
++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 16x16 transforms to be done
++# coeffs32
++# num32: number of 32x32 transforms
++# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
++#
++
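The even/odd split documented in the comments above is easy to sanity-check in scalar C. The following stand-alone sketch is illustrative only and is not part of the patch: it uses the toy 4-point coefficients from the EXAMPLE comment (not the real HEVC matrices), forms the x and y partial products from the even and odd inputs, and rebuilds the full product with the u = x + y, v = x - y butterfly, asserting that this matches a direct row-vector times matrix multiply.

#include <assert.h>
#include <stdio.h>

/* Toy 4-point version of the even/odd decomposition described above.
 * Rows 0 and 2 of the matrix are symmetric, rows 1 and 3 antisymmetric,
 * so out = in * M can be built from two half-size products plus a butterfly. */
static void trans4_butterfly(const int in[4], int out[4])
{
    const int even[2][2] = { {1, 2}, {5, 6} }; /* left halves of rows 0, 2 */
    const int odd[2][2]  = { {3, 4}, {7, 8} }; /* left halves of rows 1, 3 */
    int x[2], y[2];

    for (int j = 0; j < 2; j++) {
        x[j] = in[0] * even[0][j] + in[2] * even[1][j]; /* x = (a c) * even */
        y[j] = in[1] * odd[0][j]  + in[3] * odd[1][j];  /* y = (b d) * odd  */
    }
    out[0] = x[0] + y[0];   /* u = x + y            */
    out[1] = x[1] + y[1];
    out[2] = x[1] - y[1];   /* v = x - y, stored in */
    out[3] = x[0] - y[0];   /* reverse order        */
}

int main(void)
{
    const int m[4][4] = { { 1, 2,  2,  1 }, { 3, 4, -4, -3 },
                          { 5, 6,  6,  5 }, { 7, 8, -8, -7 } };
    const int in[4] = { 10, -3, 7, 1 };
    int out[4];

    trans4_butterfly(in, out);
    for (int j = 0; j < 4; j++) {            /* compare with direct product */
        int ref = 0;
        for (int i = 0; i < 4; i++)
            ref += in[i] * m[i][j];
        assert(ref == out[j]);
    }
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
    return 0;
}

This halving of the matrix work is what the 32x32 path below exploits: even rows go through col_trans_16, odd rows through col_trans_odd_16, and the vadd/vsub butterfly in trans32 combines them.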
++.equ TRANS_SHIFT, 20 - BIT_DEPTH
++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1)
++.equ TRANS_ASL2, 16 - TRANS_SHIFT
++
++
++hevc_trans_16x16:
++ cmp r5,1
++ beq memclear16
++ cmp r5,2
++ beq hevc_deblock_16x16
++ cmp r5,3
++ beq hevc_uv_deblock_16x16
++ cmp r5,4
++ beq hevc_uv_deblock_16x16_with_clear
++ cmp r5,5
++ beq hevc_run_command_list
++
++ push r6-r15, lr # TODO cut down number of used registers
++ mov r14,r3 # coeffs32
++ mov r15,r4 # num32
++ mov r3, 16*2 # Stride of transMatrix2 in bytes
++ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
++
++ add r0, 16*16*2 # For 32x32 transforms we also need this matrix
++ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++ # Now use r0 to describe which matrix we are working on.
++ # Allows us to prefetch the next block of coefficients for efficiency.
++ mov r0,0 # This describes the location where we read our coefficients from
++ mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
++ mov r7,16*16*2 # Total block size
++ mov r8,64*16 # Value used to swap from current to next VRF location
++ vldh HX(0++,0)+r0,(r1 += r3) REP 16
++ mov r4,64 # Constant used for rounding first pass
++ mov r5,TRANS_RND2 # Constant used for rounding second pass
++
++ # At start of block r0,r1 point to the current block (that has already been loaded)
++block_loop:
++ eor r0,r8
++ add r1,r7
++ # Prefetch the next block
++ vldh HX(0++,0)+r0,(r1 += r3) REP 16
++ eor r0,r8
++ sub r1,r7
++
++ # Transform the current block
++ bl col_trans_16
++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate
++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
++ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble?
++ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position
++
++ bl col_trans_16
++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate
++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag)
++
++ # Save results - note there has been a transposition during the processing so we save columns
++ vsth VX(0,32++)+r0, (r1 += r3) REP 16
++
++ # Move onto next block
++ eor r0,r8
++ add r1,r7
++
++ addcmpbgt r2,-1,0,block_loop
++
++ # Now go and do any 32x32 transforms
++ b hevc_trans_32x32
++
++ pop r6-r15, pc
++
++# r1,r2,r3 r7,r8 should be preserved
++# HX(0++,0)+r0 is the block to be transformed
++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
++# Use HY(48,0) for intermediate results
++# r0 can be used, but should be returned to its original value at the end
++col_trans_16:
++ add r6,r0,16 # Final value for this loop
++col_trans_16_loop:
++ # First compute partial products for a single column
++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
++ # Then sum up the results and place back
++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++ addcmpblt r0,1,r6,col_trans_16_loop
++ sub r0,16 # put r0 back to its original value
++ b lr
++
++col_trans_odd_16:
++ add r6,r0,16 # Final value for this loop
++col_trans_odd_16_loop:
++ # First compute partial products for a single column
++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
++ # Then sum up the results and place back
++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++ addcmpblt r0,1,r6,col_trans_odd_16_loop
++ sub r0,16 # put r0 back to its original value
++ b lr
++
++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 32x32 transforms to be done
++#
++hevc_trans_32x32:
++ mov r1,r14 # coeffs
++ mov r2,r15 # num
++
++ # Fetch odd transform matrix
++ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of
coefficients) ++ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix ++ #add r0, 16*16*2 ++ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer ++ mov r7, 16*16*2 # Total block size ++ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) ++ # set r8 to 32byte aligned stack pointer ++ add r8,sp,31 ++ lsr r8,5 ++ lsl r8,5 ++ mov r9,r8 # Backup of the temporary storage ++ mov r10,r1 # Backup of the coefficient buffer ++block_loop32: ++ ++ # COLUMN TRANSFORM ++ mov r4, 64 # Constant used for rounding first pass ++ mov r5, 9 # left shift used for rounding first pass ++ ++ # Transform the first 16 columns ++ mov r1,r10 # Input Coefficient buffer ++ mov r8,r9 # Output temporary storage ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ # ROW TRANSFORM ++ mov r4, TRANS_RND2 # Constant used for rounding second pass ++ mov r5, TRANS_ASL2 # left shift used for rounding second pass ++ ++ mov r1,r9 # Input temporary storage ++ mov r8,r10 # Output Coefficient buffer ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ add r10, 32*32*2 # move onto next block of coefficients ++ addcmpbgt r2,-1,0,block_loop32 ++ ++ add sp,sp,32*32*2+32 # Restore stack ++ ++ pop r6-r15, pc ++ ++trans32: ++ push lr ++ # We can no longer afford the VRF space to do prefetching when doing 32x32 ++ # Fetch the even rows ++ vldh HX(0++,0),(r1 += r3) REP 16 ++ # Fetch the odd rows ++ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 ++ ++ # Transform the even rows using even matrix ++ mov r0, 0 # Even rows ++ bl col_trans_16 ++ ++ # Now transform the odd rows using odd matrix ++ mov r0, 64*16 # Odd rows ++ bl col_trans_odd_16 ++ ++ # Now apply butterfly to compute the first 16 results ++ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ # 16bit results now in HX(48,32) ++ mov r0,r8 ++ mov r6,32*2 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ ++ # Now apply butterfly to compute the second 16 results (in reverse order) ++ vsub HY(63,0),HY(0 ,0),HY(16,0) ++ vsub HY(62,0),HY(1 ,0),HY(17,0) ++ vsub HY(61,0),HY(2 ,0),HY(18,0) ++ vsub HY(60,0),HY(3 ,0),HY(19,0) ++ vsub HY(59,0),HY(4 ,0),HY(20,0) ++ vsub HY(58,0),HY(5 ,0),HY(21,0) ++ vsub HY(57,0),HY(6 ,0),HY(22,0) ++ vsub HY(56,0),HY(7 ,0),HY(23,0) ++ vsub HY(55,0),HY(8 ,0),HY(24,0) ++ vsub HY(54,0),HY(9 ,0),HY(25,0) ++ vsub HY(53,0),HY(10,0),HY(26,0) ++ vsub HY(52,0),HY(11,0),HY(27,0) ++ vsub HY(51,0),HY(12,0),HY(28,0) ++ vsub HY(50,0),HY(13,0),HY(29,0) ++ vsub HY(49,0),HY(14,0),HY(30,0) ++ vsub HY(48,0),HY(15,0),HY(31,0) ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ add r0,r8,32 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ pop pc ++ ++memclear16: ++ # r0 is address ++ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified) ++ vmov HX(0++,0),0 REP 16 ++ mov r2,32 ++loop: ++ vsth HX(0++,0),(r0+=r2) REP 16 ++ add r0,16*16*2 ++ sub r1,16*16 ++ cmp r1,0 ++ bgt loop ++ b lr ++ ++ ++################################################################################ ++# HEVC VPU Deblock ++# ++# Vertical edges before horizontal ++# Decision 
can change every 4 pixels, but only 8 pixel boundaries are deblocked ++# ++# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge. ++# The VPU code works in units of 16x16 blocks. ++# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time). ++# One final horizontal filter is required at the end. ++# PCM is not allowed in this code. ++# ++# ++# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering) ++# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering. ++ ++.set P0,63 ++.set P1,62 ++.set P2,61 ++.set P3,60 ++.set Q0,59 ++.set Q1,58 ++.set Q2,57 ++.set Q3,56 ++ ++.set dp,32 ++.set dq,33 ++.set d,34 ++.set decision,35 ++.set beta,36 ++.set beta2,37 ++.set beta3,38 ++.set ptest,39 ++.set qtest,40 ++.set pqtest,41 ++.set thresh,42 ++.set deltatest, 44 ++.set deltap1, 45 ++.set tc25, 46 ++.set setup,47 ++.set tc,48 ++.set tc25,49 ++.set tc2, 50 ++.set do_filter, 51 ++.set delta, 52 ++.set tc10, 53 ++.set delta0, 54 ++.set delta1, 55 ++.set zeros, 0 ++.set setup_input, 1 ++.set deltaq1, 2 ++ ++ ++ ++# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image. ++# Row has num16 16x16 blocks across ++# Beta goes from 0 to 64 ++# tc goes from 0 to 24 ++# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number] ++# has 8 bytes per edge ++# has 16 bytes per direction ++# has 32 bytes per 16x16 block ++# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4)) ++hevc_deblock_16x16: ++ push r6-r15, lr ++ mov r9,r4 ++ mov r4,r3 ++ mov r13,r2 ++ mov r2,r0 ++ mov r10,r0 ++ subscale4 r0,r1 ++ mov r8,63 ++ mov r6,-3 ++ vmov H(zeros,0),0 ++# r7 is number of blocks still to load ++# r0 is location of current block - 4 * stride ++# r1 is stride ++# r2 is location of current block ++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical ++# r4 is setup ++# r5 is for temporary calculations ++# r8 holds 63 ++# r6 holds -3 ++# r9 holds the number of 16 high rows to process ++# r10 holds the original img base ++# r11 returns 0 if no filtering was done on the edge ++# r12 saves a copy of this ++# r13 is copy of width ++ ++process_row: ++ # First iteration does not do horizontal filtering on previous ++ mov r7, r13 ++ mov r3,0 ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) # We may wish to prefetch these ++ vstb H(zeros,0),(r4) ++ bl vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 ++ bl vert_filter ++ sub r3,8 ++ b start_deblock_loop ++deblock_loop: ++ # Middle iterations do vertical on current block and horizontal on preceding ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) ++ vstb H(zeros,0),(r4) ++ bl vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl vert_filter ++ sub r3,8 ++ vldb H(setup_input,0), -16(r4) ++ vstb H(zeros,0),-16(r4) ++ bl horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl horz_filter ++ sub r3,8*64 ++ addcmpbeq r12,0,0,skip_save_top ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels 
for the previous block ++skip_save_top: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++start_deblock_loop: ++ # move onto next 16x16 (could do this with circular buffer support instead) ++ add r3,16 ++ and r3,r8 ++ add r4,32 ++ # Perform loop counter operations (may work with an addcmpbgt as well?) ++ add r0,16 ++ add r2,16 ++ sub r7,1 ++ cmp r7,0 # Are there still more blocks to load ++ bgt deblock_loop ++ ++ # Final iteration needs to just do horizontal filtering ++ vldb H(setup_input,0), -16(r4) ++ vstb H(zeros,0),-16(r4) ++ bl horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl horz_filter ++ sub r3,64*8 ++ addcmpbeq r12,0,0,skip_save_top2 ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++skip_save_top2: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++ ++# Now look to see if we should do another row ++ sub r9,1 ++ cmp r9,0 ++ bgt start_again ++ pop r6-r15, pc ++start_again: ++ # Need to sort out r0,r2 to point to next row down ++ addscale16 r10,r1 ++ mov r2,r10 ++ subscale4 r0,r2,r1 ++ b process_row ++ ++ ++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered ++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations ++ ++vert_filter: ++ push lr ++ ++ vmov HX(P3,0), V(16,12)+r3 ++ vmov HX(P2,0), V(16,13)+r3 ++ vmov HX(P1,0), V(16,14)+r3 ++ vmov HX(P0,0), V(16,15)+r3 ++ vmov HX(Q0,0), V(16,16)+r3 ++ vmov HX(Q1,0), V(16,17)+r3 ++ vmov HX(Q2,0), V(16,18)+r3 ++ vmov HX(Q3,0), V(16,19)+r3 ++ ++ bl do_luma_filter ++ ++ vadds V(16,13)+r3, HX(P2,0), 0 ++ vadds V(16,14)+r3, HX(P1,0), 0 ++ vadds V(16,15)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds V(16,16)+r3, HX(Q0,0), 0 ++ vadds V(16,17)+r3, HX(Q1,0), 0 ++ vadds V(16,18)+r3, HX(Q2,0), 0 ++ ++ pop pc ++ ++# Filter edge at H(16,0)+r3 ++horz_filter: ++ push lr ++ ++ vmov HX(P3,0), H(12,0)+r3 ++ vmov HX(P2,0), H(13,0)+r3 ++ vmov HX(P1,0), H(14,0)+r3 ++ vmov HX(P0,0), H(15,0)+r3 ++ vmov HX(Q0,0), H(16,0)+r3 ++ vmov HX(Q1,0), H(17,0)+r3 ++ vmov HX(Q2,0), H(18,0)+r3 ++ vmov HX(Q3,0), H(19,0)+r3 ++ ++ bl do_luma_filter ++ ++ vadds H(13,0)+r3, HX(P2,0), 0 ++ vadds H(14,0)+r3, HX(P1,0), 0 ++ vadds H(15,0)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds H(16,0)+r3, HX(Q0,0), 0 ++ vadds H(17,0)+r3, HX(Q1,0), 0 ++ vadds H(18,0)+r3, HX(Q2,0), 0 ++ ++ pop pc ++ ++# r4 points to array of beta/tc for each 4 length edge ++do_luma_filter: ++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8 ++ valtl HX(beta,0),H(setup,0),H(setup,0) ++ valtu HX(tc,0),H(setup,0),H(setup,0) ++ vmul HX(tc25,0), HX(tc,0), 5 ++ vadd HX(tc25,0),HX(tc25,0), 1 ++ vasr HX(tc25,0), HX(tc25,0), 1 ++ ++ # Compute decision ++ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1 ++ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1 ++ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0 ++ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0 ++ ++ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1 ++ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1 ++ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0 ++ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0 ++ ++ vadd HX(d,0), HX(dp,0), HX(dq,0) ++ vasr HX(beta2,0),HX(beta,0),2 ++ vasr HX(beta3,0),HX(beta,0),3 ++ ++ # Compute flags that are negative if all conditions pass ++ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC ++ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC ++ vsub 
HX(decision,0), HX(decision,0), HX(beta3,0) SETF
++
++ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
++ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
++ vadd HX(decision,0), HX(d,0), HX(d,0) IFN
++ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
++ vmov HX(decision,0), 1 IFNN
++ vadd H(decision,0),H(decision,3),0 IFN
++ vadd H(decision,16),H(decision,19),0 IFN
++ vmov -,HX(decision,0) SETF # N marks strong filter
++ vmov HX(decision,0), 1 IFNN # NN marks normal filter
++
++ vadd HX(do_filter,0), HX(d,3), HX(d,0)
++ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
++ vmov HX(decision,0),0 IFNN # Z marks no filter
++
++ # Expand out decision (currently valid one every 4 pixels) 0...1...2...3
++ # First extract out even terms
++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3
++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123
++ # Now expand back
++ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
++ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
++
++ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
++
++ # Do a quick check to see if there is anything to do
++ mov r11, 0 # Signal no filtering
++ vmov -,1 IFNZ SUMS r5
++ cmp r5,0
++ beq filtering_done
++ mov r11, 1 # Signal some filtering
++ # And whether there is any strong filtering
++ vmov -,1 IFN SUMS r5
++ cmp r5,0
++ beq normal_filtering
++
++ ##############################################################################
++ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
++ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tc2 is tc/2, while here it is tc*2
++
++ # Take a copy of the original pixels for use in decision calculation
++ vmov HX(P0,32),HX(P0,0)
++ vmov HX(Q0,32),HX(Q0,0)
++ vmov HX(P1,32),HX(P1,0)
++ vmov HX(Q1,32),HX(Q1,0)
++ vmov HX(P2,32),HX(P2,0)
++ vmov HX(Q2,32),HX(Q2,0)
++
++ vadd -,HX(P2,32),4 CLRA SACC
++ vshl -,HX(P1,32),1 SACC
++ vshl -,HX(P0,32),1 SACC
++ vshl -,HX(Q0,32),1 SACC
++ vshl HX(delta,0),HX(Q1,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(P0,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
++
++ vadd -,HX(P2,32),2 CLRA SACC
++ vadd -,HX(P1,32),HX(P0,32) SACC
++ vshl HX(delta,0),HX(Q0,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 2
++ vsub HX(delta,0),HX(delta,0),HX(P1,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
++
++ vadd -,HX(Q0,32),4 CLRA SACC
++ vadd -,HX(P1,32),HX(P0,32) SACC
++ vmul -,HX(P2,32),3 SACC
++ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(P2,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
++ #vmov HX(P2,0),3 IFN
++
++ # Now reverse all P/Qs
++
++ vadd -,HX(Q2,32),4 CLRA SACC
++ vshl -,HX(Q1,32),1 SACC
++ vshl -,HX(Q0,32),1 SACC
++ vshl -,HX(P0,32),1 SACC
++ vshl HX(delta,0),HX(P1,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(Q0,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
++
++ vadd -,HX(Q2,32),2 CLRA SACC
++ vadd -,HX(Q1,32),HX(Q0,32) SACC
++ vshl HX(delta,0),HX(P0,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 2
++ vsub HX(delta,0),HX(delta,0),HX(Q1,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
++
++ vadd -,HX(P0,32),4 CLRA SACC
++ vadd -,HX(Q1,32),HX(Q0,32) SACC
++ vmul -,HX(Q2,32),3 SACC
++ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(Q2,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
++
++ ##############################################################################
++ # Normal filtering
++normal_filtering:
++ # Invert the decision flags
++ # (done as two instructions: the assembler has a bug that drops SETF from the combined form)
++ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
++ vmov -, HX(tc10,0) SETF # IFN means normal filtering
++
++ vmov -,1 IFN SUMS r5
++ cmp r5,0
++ beq filtering_done
++
++ vasr HX(tc2,0), HX(tc,0), 1
++ vmul HX(tc10,0), HX(tc,0), 10
++
++ vasr HX(thresh,0), HX(beta,0), 1
++ vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
++ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
++
++ vadd HX(ptest,0),HX(dp,3),HX(dp,0)
++ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
++ vadd HX(qtest,0),HX(dq,3),HX(dq,0)
++ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
++ # Expand ptest and qtest together
++ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q
++ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
++ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
++ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
++ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
++
++ vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
++ vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
++ vmov -,8 CLRA SACC
++ vmul -,HX(delta0,0), 9 SACC
++ vmul HX(delta0,0),HX(delta1,0), r6 SACC
++ vasr HX(delta0,0), HX(delta0,0), 4
++ vdist HX(deltatest,0), HX(delta0,0), 0
++ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
++ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
++
++ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
++
++ vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
++ vadd HX(deltap1,0), HX(deltap1,0), 1
++ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
++ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
++ vasr HX(deltap1,0), HX(deltap1,0), 1
++ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
++
++ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
++ vadd HX(deltaq1,0), HX(deltaq1,0), 1
++ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
++ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
++ vrsub -, HX(delta0,0), 0 SACC
++ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
++ vasr HX(deltaq1,0), HX(deltaq1,0), 1
++ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
++
++ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
++ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
++
++ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
++ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
++
++ vmov -,HX(deltatest,0) SETF
++ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
++ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
++
++ #vmov HX(P2,0),1 IFN
++
++filtering_done:
++ b lr
++
++
++hevc_uv_deblock_16x16:
++ push r6-r15, lr
++ mov r14,0
++ b hevc_uv_start
++hevc_uv_deblock_16x16_with_clear:
++ push r6-r15, lr
++ mov r14,1
++ b hevc_uv_start
++
++hevc_uv_start:
++ mov r9,r4
++ mov r4,r3
++ mov r13,r2
++ mov r2,r0
++ mov r10,r0
++ subscale4 r0,r1
++ mov r8,63
++ mov r6,-3
++ vmov H(zeros,0),0
++# r7 is number of blocks still to load
++# r0 is
location of current block - 4 * stride ++# r1 is stride ++# r2 is location of current block ++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical ++# r4 is setup ++# r5 is for temporary calculations ++# r8 holds 63 ++# r6 holds -3 ++# r9 holds the number of 16 high rows to process ++# r10 holds the original img base ++# r11 returns 0 if no filtering was done on the edge ++# r12 saves a copy of this ++# r13 is copy of width ++# r14 is 1 if we should clear the old contents, or 0 if not ++ ++uv_process_row: ++ # First iteration does not do horizontal filtering on previous ++ mov r7, r13 ++ mov r3,0 ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) # We may wish to prefetch these ++ cmp r14,1 ++ bne uv_skip0 ++ vstb H(zeros,0),(r4) ++uv_skip0: ++ bl uv_vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 ++ bl uv_vert_filter ++ sub r3,8 ++ b uv_start_deblock_loop ++uv_deblock_loop: ++ # Middle iterations do vertical on current block and horizontal on preceding ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) ++ cmp r14,1 ++ bne uv_skip1 ++ vstb H(zeros,0),(r4) ++uv_skip1: ++ bl uv_vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_vert_filter ++ sub r3,8 ++ vldb H(setup_input,0), -16(r4) ++ cmp r14,1 ++ bne uv_skip3 ++ vstb H(zeros,0),-16(r4) ++uv_skip3: ++ bl uv_horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_horz_filter ++ sub r3,8*64 ++ addcmpbeq r12,0,0,uv_skip_save_top ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++uv_skip_save_top: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++uv_start_deblock_loop: ++ # move onto next 16x16 (could do this with circular buffer support instead) ++ add r3,16 ++ and r3,r8 ++ add r4,32 ++ # Perform loop counter operations (may work with an addcmpbgt as well?) 
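An aside on the setup bytes consumed by this loop: they follow the layout documented above for hevc_deblock_16x16, setup[num16][2][2][2][4], i.e. 8 bytes per edge, 16 bytes per direction, 32 bytes per 16x16 block, which matches r4 advancing by 32 per block. A hypothetical C view of that indexing follows; the typedef and helper are invented for illustration and appear nowhere in the patch.

#include <stdint.h>

/* One 16x16 block's worth of deblock parameters, per the comments above:
 * [0=vert,1=horz][0=first edge,1=second edge][0=beta,1=tc][edge segment 0..3] */
typedef uint8_t hevc_deblock_setup_t[2][2][2][4];   /* 32 bytes per block */

/* Record beta/tc for one 4-pixel edge segment (hypothetical helper). */
static inline void set_edge_params(hevc_deblock_setup_t *setup, unsigned block,
                                   unsigned horz, unsigned second_edge,
                                   unsigned segment, uint8_t beta, uint8_t tc)
{
    setup[block][horz][second_edge][0][segment] = beta; /* beta: 0..64 */
    setup[block][horz][second_edge][1][segment] = tc;   /* tc:   0..24 */
}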
++ add r0,16 ++ add r2,16 ++ sub r7,1 ++ cmp r7,0 # Are there still more blocks to load ++ bgt uv_deblock_loop ++ ++ # Final iteration needs to just do horizontal filtering ++ vldb H(setup_input,0), -16(r4) ++ cmp r14,1 ++ bne uv_skip2 ++ vstb H(zeros,0),-16(r4) ++uv_skip2: ++ bl uv_horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_horz_filter ++ sub r3,64*8 ++ addcmpbeq r12,0,0,uv_skip_save_top2 ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++uv_skip_save_top2: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++ ++# Now look to see if we should do another row ++ sub r9,1 ++ cmp r9,0 ++ bgt uv_start_again ++ pop r6-r15, pc ++uv_start_again: ++ # Need to sort out r0,r2 to point to next row down ++ addscale16 r10,r1 ++ mov r2,r10 ++ subscale4 r0,r2,r1 ++ b uv_process_row ++ ++ ++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered ++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations ++ ++uv_vert_filter: ++ push lr ++ ++ vmov HX(P1,0), V(16,14)+r3 ++ vmov HX(P0,0), V(16,15)+r3 ++ vmov HX(Q0,0), V(16,16)+r3 ++ vmov HX(Q1,0), V(16,17)+r3 ++ ++ bl do_chroma_filter ++ ++ vadds V(16,15)+r3, HX(P0,0), 0 ++ vadds V(16,16)+r3, HX(Q0,0), 0 ++ ++ pop pc ++ ++# Filter edge at H(16,0)+r3 ++uv_horz_filter: ++ push lr ++ ++ vmov HX(P1,0), H(14,0)+r3 ++ vmov HX(P0,0), H(15,0)+r3 ++ vmov HX(Q0,0), H(16,0)+r3 ++ vmov HX(Q1,0), H(17,0)+r3 ++ ++ bl do_chroma_filter ++ ++ vadds H(15,0)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds H(16,0)+r3, HX(Q0,0), 0 ++ ++ pop pc ++ ++# r4 points to array of beta/tc for each 4 length edge ++do_chroma_filter: ++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8 ++ valtl HX(tc,0),H(setup,0),H(setup,0) ++ ++ vsub HX(delta,0),HX(Q0,0),HX(P0,0) ++ vshl HX(delta,0),HX(delta,0),2 CLRA SACC ++ vsub -,HX(P1,0),HX(Q1,0) SACC ++ vmov HX(delta,0),4 SACC ++ vasr HX(delta,0),HX(delta,0),3 ++ vclamps HX(delta,0), HX(delta,0), HX(tc,0) ++ vadd HX(P0,0),HX(P0,0),HX(delta,0) ++ vsub HX(Q0,0),HX(Q0,0),HX(delta,0) ++ b lr ++ ++# r0 = list ++# r1 = number ++hevc_run_command_list: ++ push r6-r7, lr ++ mov r6, r0 ++ mov r7, r1 ++loop_cmds: ++ ld r0,(r6) # How to encode r6++? 
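An aside on the data this loop walks: each command is six 32-bit words, loaded into r0..r5 by this run of ld/add pairs and dispatched through the cmp/beq chain at the top of hevc_trans_16x16 (r5 selects the operation: 0 transform, 1 memclear16, 2 luma deblock, 3 uv deblock, 4 uv deblock with clear, 5 a nested command list). A hypothetical ARM-side builder for such a stream might look like this; the struct and enum names are invented for the sketch and are not part of the patch.

#include <stdint.h>

enum vpu_op {                         /* values tested against r5 above */
    VPU_OP_TRANSFORM        = 0,
    VPU_OP_MEMCLEAR16       = 1,
    VPU_OP_DEBLOCK          = 2,
    VPU_OP_UV_DEBLOCK       = 3,
    VPU_OP_UV_DEBLOCK_CLEAR = 4,
    VPU_OP_RUN_COMMAND_LIST = 5,
};

struct vpu_cmd {
    uint32_t r0, r1, r2, r3, r4, r5;  /* loaded in this order by loop_cmds */
};

/* Append one command to the list; returns the next free slot. */
static struct vpu_cmd *vpu_cmd_push(struct vpu_cmd *p,
                                    uint32_t a0, uint32_t a1, uint32_t a2,
                                    uint32_t a3, uint32_t a4, enum vpu_op op)
{
    *p = (struct vpu_cmd){ a0, a1, a2, a3, a4, (uint32_t)op };
    return p + 1;
}

The remaining loads and the branch back to loop_cmds continue below.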
++ add r6,4 ++ ld r1,(r6) ++ add r6,4 ++ ld r2,(r6) ++ add r6,4 ++ ld r3,(r6) ++ add r6,4 ++ ld r4,(r6) ++ add r6,4 ++ ld r5,(r6) ++ add r6,4 ++ bl hevc_trans_16x16 ++ sub r7,1 ++ cmp r7,0 ++ bgt loop_cmds ++ ++ pop r6-r7, pc +diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h +new file mode 100644 +index 0000000000..b0e9902d82 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform10.h @@ -0,0 +1,3070 @@ -+unsigned char rpi_hevc_transform [] = { ++static const unsigned char rpi_hevc_transform10 [] = { ++21, ++106, ++0, ++144, ++47, ++1, ++37, ++106, ++0, ++144, ++66, ++1, ++53, ++106, ++0, ++144, ++192, ++4, ++69, ++106, ++0, ++144, ++192, ++4, ++85, ++106, ++0, ++144, ++220, ++5, ++169, ++3, ++62, ++64, ++79, ++64, ++3, ++232, ++32, ++0, ++0, ++0, ++12, ++248, ++0, ++136, ++0, ++0, ++192, ++248, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++12, ++248, ++0, ++168, ++0, ++0, ++192, ++248, ++0, ++0, ++0, ++96, ++3, ++232, ++32, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++8, ++232, ++0, ++4, ++0, ++0, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++4, ++232, ++64, ++0, ++0, ++0, ++5, ++232, ++0, ++2, ++0, ++0, ++128, ++69, ++113, ++66, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++128, ++69, ++113, ++70, ++128, ++144, ++40, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++16, ++0, ++76, ++254, ++48, ++192, ++9, ++4, ++32, ++8, ++0, ++0, ++4, ++254, ++0, ++144, ++128, ++2, ++0, ++8, ++2, ++0, ++128, ++144, ++23, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++20, ++0, ++76, ++254, ++48, ++192, ++6, ++4, ++32, ++8, ++0, ++0, ++140, ++248, ++44, ++0, ++0, ++0, ++32, ++48, ++4, ++0, ++128, ++69, ++113, ++66, ++242, ++140, ++211, ++192, ++34, ++31, ++41, ++3, ++70, ++192, ++80, ++7, ++164, ++255, ++36, ++204, ++96, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++70, ++192, ++80, ++7, ++164, ++255, ++36, ++204, ++224, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++225, ++64, ++242, ++64, ++3, ++232, ++128, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++57, ++239, ++224, ++247, ++255, ++255, ++72, ++192, ++95, ++207, ++88, ++122, ++88, ++124, ++137, ++64, ++26, ++64, ++4, ++232, ++64, ++0, ++0, ++0, ++149, ++96, ++161, ++64, ++152, ++64, ++128, ++144, ++35, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++27, ++0, ++4, ++232, ++0, ++2, ++0, ++0, ++101, ++96, ++145, ++64, ++168, ++64, ++128, ++144, ++19, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++11, ++0, ++74, ++232, ++0, ++8, ++0, ++0, ++242, ++140, ++221, ++192, ++57, ++239, ++32, ++8, ++0, ++0, ++41, ++3, ++239, ++3, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++248, ++4, ++0, ++12, ++248, ++0, ++132, ++64, ++0, ++192, ++248, ++4, ++0, ++0, ++96, ++255, ++159, ++154, ++255, ++0, ++232, ++0, ++4, ++0, ++0, ++255, ++159, ++165, ++255, ++4, ++255, ++48, ++204, ++16, ++3, ++224, ++251, ++62, ++0, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++128, ++64, ++6, ++232, ++64, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++32, ++247, ++240, ++207, ++16, ++3, ++32, ++247, ++176, ++207, ++17, ++19, ++32, ++247, ++112, ++207, ++18, ++35, ++32, ++247, ++48, ++207, ++19, ++51, ++32, ++247, ++240, ++206, ++20, ++67, ++32, 
++247, ++176, ++206, ++21, ++83, ++32, ++247, ++112, ++206, ++22, ++99, ++32, ++247, ++48, ++206, ++23, ++115, ++32, ++247, ++240, ++205, ++24, ++131, ++32, ++247, ++176, ++205, ++25, ++147, ++32, ++247, ++112, ++205, ++26, ++163, ++32, ++247, ++48, ++205, ++27, ++179, ++32, ++247, ++240, ++204, ++28, ++195, ++32, ++247, ++176, ++204, ++29, ++211, ++32, ++247, ++112, ++204, ++30, ++227, ++32, ++247, ++48, ++204, ++31, ++243, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++0, ++237, ++32, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++111, ++3, ++4, ++254, ++0, ++128, ++0, ++4, ++0, ++248, ++0, ++0, ++2, ++232, ++32, ++0, ++0, ++0, ++140, ++248, ++32, ++0, ++0, ++0, ++224, ++35, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++193, ++232, ++0, ++1, ++0, ++0, ++1, ++106, ++116, ++30, ++90, ++0, ++169, ++3, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++137, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++129, ++0, ++131, ++102, ++0, ++158, ++67, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++108, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++100, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++161, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++150, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++182, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++112, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++101, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++103, ++255, ++239, ++3, ++0, ++254, ++0, ++143, ++92, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++93, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++210, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++211, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++107, ++0, ++8, ++255, ++99, ++23, ++0, ++212, ++192, ++51, ++0, ++0, 
++8, ++255, ++163, ++23, ++0, ++228, ++192, ++51, ++0, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++52, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++52, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++0, ++143, ++12, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++13, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++18, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++19, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++33, ++0, ++8, ++255, ++99, ++3, ++0, ++212, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++3, ++0, ++228, ++192, ++51, ++0, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++4, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++4, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++137, ++47, ++240, ++40, ++246, ++2, ++140, ++47, ++240, ++128, ++245, ++99, ++140, ++5, ++4, ++0, ++247, ++99, ++140, ++1, ++20, ++88, ++246, ++99, ++140, ++1, ++20, ++0, ++247, ++35, ++136, ++62, ++226, ++32, ++247, ++35, ++136, ++32, ++210, ++0, ++247, ++34, ++136, ++63, ++2, ++208, ++246, ++34, ++136, ++0, ++4, ++0, ++247, ++99, ++136, ++58, ++162, ++32, ++247, ++99, ++136, ++33, ++146, ++0, ++247, ++98, ++136, ++59, ++18, ++208, ++246, ++98, ++136, ++0, ++20, ++0, ++247, ++162, ++136, ++33, ++2, ++88, ++246, ++98, ++137, ++2, ++68, ++88, ++246, ++162, ++137, ++3, ++68, ++208, ++254, ++227, ++136, ++60, ++242, ++192, ++243, ++188, ++11, ++208, ++254, ++227, ++136, ++56, ++178, ++192, ++243, ++188, ++10, ++32, ++255, ++226, ++136, ++38, ++58, ++192, ++243, ++60, ++0, ++208, ++254, ++227, ++136, ++59, ++242, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++49, ++58, ++192, ++243, ++60, ++128, ++0, ++255, ++226, ++136, ++34, ++34, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++37, ++58, ++192, ++243, ++60, ++128, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++194, ++8, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++255, ++202, ++40, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++254, ++0, ++240, ++35, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++226, ++140, ++34, ++34, ++195, ++243, ++60, ++0, ++32, ++255, ++227, ++140, ++36, ++58, ++192, ++243, ++60, ++0, ++0, ++254, ++192, ++136, ++0, ++4, ++0, ++240, ++0, ++160, ++16, ++246, ++226, ++136, ++35, ++50, ++16, ++246, ++226, ++136, ++35, ++50, ++32, ++246, ++226, ++136, ++35, ++50, ++32, ++254, ++226, ++136, ++35, ++58, ++192, ++243, ++60, ++0, ++11, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++115, ++5, ++106, ++0, ++144, ++173, ++1, ++27, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++227, ++0, ++64, ++246, ++163, ++140, ++1, ++4, ++0, ++246, ++192, ++175, ++63, ++2, ++0, ++246, ++192, ++174, ++59, ++2, ++0, ++246, ++128, ++175, ++62, ++2, ++0, ++246, ++128, ++174, ++58, ++2, ++0, ++246, ++64, ++175, ++61, ++2, ++0, ++246, ++64, ++174, ++57, ++2, ++0, ++255, ++43, ++240, ++4, ++212, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++228, ++192, ++243, ++128, 
++10, ++64, ++254, ++43, ++240, ++1, ++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++191, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++143, ++52, ++242, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++212, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++180, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, ++190, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++143, ++52, ++226, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++180, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++212, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++196, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++189, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++143, ++52, ++210, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++148, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++164, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++228, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++187, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++142, ++52, ++178, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++148, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++244, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, ++186, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++142, ++52, ++162, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++244, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++148, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++132, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++185, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++142, ++52, ++146, ++192, ++243, ++60, ++128, ++64, ++255, ++98, ++141, ++0, ++52, ++192, ++243, ++0, ++0, ++0, ++254, ++0, ++240, ++53, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++177, ++0, ++88, ++246, ++163, ++140, ++1, ++4, ++128, ++245, ++99, ++141, ++10, ++4, ++88, ++246, ++162, ++138, ++1, ++68, ++0, ++247, ++162, ++138, ++36, ++162, ++88, ++254, ++162, ++138, ++3, ++164, ++192, ++243, ++128, ++11, ++0, ++255, ++226, ++137, ++32, ++2, ++195, ++243, ++60, ++0, ++32, ++247, ++226, ++137, ++42, ++114, ++0, ++255, ++34, ++138, ++33, ++18, ++195, ++243, ++60, ++0, ++32, ++247, ++34, ++138, ++42, ++130, ++16, ++246, ++98, ++138, ++40, ++114, ++16, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++226, ++137, ++41, ++146, ++40, ++246, ++34, ++138, ++41, ++146, ++32, ++247, ++163, ++141, ++63, ++178, ++32, ++247, ++227, ++141, ++62, ++162, ++0, ++254, ++0, ++240, ++8, ++4, ++0, ++240, ++128, ++11, ++128, ++253, ++35, ++240, ++9, ++100, ++192, ++243, ++128, 
++10, ++128, ++253, ++163, ++141, ++128, ++115, ++192, ++243, ++152, ++10, ++88, ++246, ++163, ++141, ++4, ++100, ++208, ++246, ++35, ++139, ++0, ++100, ++32, ++255, ++34, ++139, ++53, ++202, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++139, ++0, ++4, ++0, ++240, ++0, ++160, ++240, ++246, ++163, ++141, ++48, ++98, ++0, ++247, ++99, ++139, ++63, ++210, ++0, ++247, ++98, ++139, ++1, ++212, ++88, ++254, ++98, ++139, ++1, ++212, ++192, ++243, ++128, ++11, ++32, ++255, ++99, ++139, ++62, ++98, ++192, ++243, ++188, ++10, ++88, ++246, ++98, ++139, ++1, ++212, ++240, ++246, ++98, ++139, ++50, ++210, ++0, ++247, ++163, ++128, ++59, ++146, ++0, ++247, ++160, ++128, ++1, ++36, ++88, ++254, ++160, ++128, ++1, ++36, ++192, ++243, ++128, ++11, ++0, ++247, ++163, ++128, ++58, ++98, ++64, ++255, ++35, ++240, ++0, ++100, ++192, ++243, ++128, ++10, ++64, ++255, ++163, ++128, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++160, ++128, ++1, ++36, ++240, ++246, ++160, ++128, ++50, ++34, ++8, ++255, ++227, ++143, ++54, ++242, ++192, ++243, ++60, ++128, ++40, ++255, ++227, ++142, ++54, ++178, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++39, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++143, ++45, ++226, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++44, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++40, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++142, ++2, ++162, ++192, ++243, ++60, ++128, ++90, ++0, ++169, ++3, ++14, ++96, ++4, ++31, ++169, ++3, ++30, ++96, ++1, ++31, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++132, ++24, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++143, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++135, ++0, ++131, ++102, ++0, ++158, ++71, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++132, ++24, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++112, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++104, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++134, ++24, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++123, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++112, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++178, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++134, ++24, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++72, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++61, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, 
++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++95, ++255, ++239, ++3, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++47, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++13, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++140, ++47, ++240, ++32, ++247, ++35, ++141, ++63, ++178, ++64, ++254, ++35, ++141, ++2, ++68, ++192, ++243, ++128, ++11, ++32, ++255, ++35, ++240, ++58, ++226, ++192, ++243, ++188, ++10, ++0, ++254, ++0, ++141, ++4, ++4, ++0, ++240, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++240, ++246, ++35, ++141, ++48, ++66, ++0, ++247, ++227, ++143, ++52, ++242, ++32, ++247, ++227, ++142, ++52, ++178, ++90, ++0, ++161, ++3, ++6, ++64, ++23, ++64, ++96, ++8, ++70, ++98, ++97, ++8, ++70, ++98, ++98, ++8, ++70, ++98, ++99, ++8, ++70, ++98, ++100, ++8, ++70, ++98, ++101, ++8, ++70, ++98, ++255, ++159, ++8, ++250, ++23, ++102, ++7, ++106, ++112, ++30, ++33, ++3, ++}; +diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h +new file mode 100644 +index 0000000000..2901b6568d +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform8.h +@@ -0,0 +1,3070 @@ ++static const unsigned char rpi_hevc_transform8 [] = { +21, +106, +0, @@ -13896,932 +22264,9 @@ index 0000000..4309f1c +33, +3, +}; -diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s -new file mode 100644 -index 0000000..5543093 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform.s -@@ -0,0 +1,917 @@ -+# ****************************************************************************** -+# Argon Design Ltd. -+# (c) Copyright 2015 Argon Design Ltd. All rights reserved. 
-+# -+# Module : HEVC -+# Author : Peter de Rivaz -+# ****************************************************************************** -+ -+# HEVC VPU Transform -+# -+# Transform matrix can be thought of as -+# output row vector = input row vector * transMatrix2 -+# -+# The even rows of the matrix are symmetric -+# The odd rows of the matrix are antisymmetric -+# -+# So only need to compute the first half of the results, then can compute the remainder with a butterfly -+# -+# EXAMPLE -+# (a b c d) (1 2 2 1) -+# (3 4 -4 -3) -+# (5 6 6 5) -+# (7 8 -8 -7) -+# -+# x=(a c)(1 2) = 1a+5c 2a+6c -+# (5 6) -+# -+# y=(b d)(3 4) = 3b+7d 4b+8d -+# (7 8) -+# -+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d -+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d -+# -+# Final results are (u , v[::-1]) -+# -+# -+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0) -+# Apply the even matrix first and stop before rounding -+# Then apply the odd matrix in a full manner: -+# -+# First step is to compute partial products with the first input (16 cycles) -+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output -+# 2a 4b 6c 8d -+# 2a -4b 6c -8d -+# 1a -3b 5c -7d -+# -+# Second step is to sum partial products into final position (8 cycles) -+# 1a+3b+5c+7d -+# 2a+4b+6c+8d -+# 2a-4b+6c-8d -+# 1a-3b+5c-7d -+# -+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format) -+# -+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds) -+# -+# For 8x8 we could compute two in parallel. -+# -+# -+ -+# Columns are transformed first -+# -+# Store top left half of transMatrix2 in -+# Store bottom left half of transMatrix2 in HX(32,32) -+# -+# For 16x16 -+# HX(0:15,0) contains input data before transform -+# HY(0:15,0) contains 32bit output data after transform -+# HX(32,0) contains even rows of left half of transMatrix2 -+# HX(32,32) contains odd rows of left half of transMatrix2 -+# HY(48,0) contains partial products ready for summing -+# -+ -+ -+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# coeffs32 -+# num32: number of 32x32 transforms -+# command 0 for transform, 1 for memclear16(int16_t *dst,num16) -+# -+hevc_trans_16x16: -+ cmp r5,1 -+ beq memclear16 -+ cmp r5,2 -+ beq hevc_deblock_16x16 -+ cmp r5,3 -+ beq hevc_uv_deblock_16x16 -+ cmp r5,4 -+ beq hevc_uv_deblock_16x16_with_clear -+ cmp r5,5 -+ beq hevc_run_command_list -+ -+ push r6-r15, lr # TODO cut down number of used registers -+ mov r14,r3 # coeffs32 -+ mov r15,r4 # num32 -+ mov r3, 16*2 # Stride of transMatrix2 in bytes -+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix -+ -+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix -+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ # Now use r0 to describe which matrix we are working on. -+ # Allows us to prefetch the next block of coefficients for efficiency. 
-+ mov r0,0 # This describes the location where we read our coefficients from -+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) -+ mov r7,16*16*2 # Total block size -+ mov r8,64*16 # Value used to swap from current to next VRF location -+ vldh HX(0++,0)+r0,(r1 += r3) REP 16 -+ mov r4,64 # Constant used for rounding first pass -+ mov r5,1<<11 # Constant used for rounding second pass -+ -+ # At start of block r0,r1 point to the current block (that has already been loaded) -+block_loop: -+ eor r0,r8 -+ add r1,r7 -+ # Prefetch the next block -+ vldh HX(0++,0)+r0,(r1 += r3) REP 16 -+ eor r0,r8 -+ sub r1,r7 -+ -+ # Transform the current block -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? -+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position -+ -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) -+ -+ # Save results - note there has been a transposition during the processing so we save columns -+ vsth VX(0,32++)+r0, (r1 += r3) REP 16 -+ -+ # Move onto next block -+ eor r0,r8 -+ add r1,r7 -+ -+ addcmpbgt r2,-1,0,block_loop -+ -+ # Now go and do any 32x32 transforms -+ b hevc_trans_32x32 -+ -+ pop r6-r15, pc -+ -+# r1,r2,r3 r7,r8 should be preserved -+# HX(0++,0)+r0 is the block to be transformed -+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients -+# Use HY(48,0) for intermediate results -+# r0 can be used, but should be returned to its original value at the end -+col_trans_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+col_trans_odd_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_odd_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_odd_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# -+hevc_trans_32x32: -+ mov r1,r14 # coeffs -+ mov r2,r15 # num -+ -+ # Fetch odd transform matrix -+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients) -+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix -+ #add 
r0, 16*16*2 -+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer -+ mov r7, 16*16*2 # Total block size -+ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) -+ # set r8 to 32byte aligned stack pointer -+ add r8,sp,31 -+ lsr r8,5 -+ lsl r8,5 -+ mov r9,r8 # Backup of the temporary storage -+ mov r10,r1 # Backup of the coefficient buffer -+block_loop32: -+ -+ # COLUMN TRANSFORM -+ mov r4, 64 # Constant used for rounding first pass -+ mov r5, 9 # left shift used for rounding first pass -+ -+ # Transform the first 16 columns -+ mov r1,r10 # Input Coefficient buffer -+ mov r8,r9 # Output temporary storage -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ # ROW TRANSFORM -+ mov r4, 1<<11 # Constant used for rounding second pass -+ mov r5, 4 # left shift used for rounding second pass -+ -+ mov r1,r9 # Input temporary storage -+ mov r8,r10 # Output Coefficient buffer -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ add r10, 32*32*2 # move onto next block of coefficients -+ addcmpbgt r2,-1,0,block_loop32 -+ -+ add sp,sp,32*32*2+32 # Restore stack -+ -+ pop r6-r15, pc -+ -+trans32: -+ push lr -+ # We can no longer afford the VRF space to do prefetching when doing 32x32 -+ # Fetch the even rows -+ vldh HX(0++,0),(r1 += r3) REP 16 -+ # Fetch the odd rows -+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 -+ -+ # Transform the even rows using even matrix -+ mov r0, 0 # Even rows -+ bl col_trans_16 -+ -+ # Now transform the odd rows using odd matrix -+ mov r0, 64*16 # Odd rows -+ bl col_trans_odd_16 -+ -+ # Now apply butterfly to compute the first 16 results -+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ # 16bit results now in HX(48,32) -+ mov r0,r8 -+ mov r6,32*2 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ -+ # Now apply butterfly to compute the second 16 results (in reverse order) -+ vsub HY(63,0),HY(0 ,0),HY(16,0) -+ vsub HY(62,0),HY(1 ,0),HY(17,0) -+ vsub HY(61,0),HY(2 ,0),HY(18,0) -+ vsub HY(60,0),HY(3 ,0),HY(19,0) -+ vsub HY(59,0),HY(4 ,0),HY(20,0) -+ vsub HY(58,0),HY(5 ,0),HY(21,0) -+ vsub HY(57,0),HY(6 ,0),HY(22,0) -+ vsub HY(56,0),HY(7 ,0),HY(23,0) -+ vsub HY(55,0),HY(8 ,0),HY(24,0) -+ vsub HY(54,0),HY(9 ,0),HY(25,0) -+ vsub HY(53,0),HY(10,0),HY(26,0) -+ vsub HY(52,0),HY(11,0),HY(27,0) -+ vsub HY(51,0),HY(12,0),HY(28,0) -+ vsub HY(50,0),HY(13,0),HY(29,0) -+ vsub HY(49,0),HY(14,0),HY(30,0) -+ vsub HY(48,0),HY(15,0),HY(31,0) -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ add r0,r8,32 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ pop pc -+ -+memclear16: -+ # r0 is address -+ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified) -+ vmov HX(0++,0),0 REP 16 -+ mov r2,32 -+loop: -+ vsth HX(0++,0),(r0+=r2) REP 16 -+ add r0,16*16*2 -+ sub r1,16*16 -+ cmp r1,0 -+ bgt loop -+ b lr -+ -+ -+################################################################################ -+# HEVC VPU Deblock -+# -+# Vertical edges before horizontal -+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked -+# -+# ARM is responsible for 
storing beta and tc for each 4 pixels horiz and vert edge. -+# The VPU code works in units of 16x16 blocks. -+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time). -+# One final horizontal filter is required at the end. -+# PCM is not allowed in this code. -+# -+# -+# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering) -+# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering. -+ -+.set P0,63 -+.set P1,62 -+.set P2,61 -+.set P3,60 -+.set Q0,59 -+.set Q1,58 -+.set Q2,57 -+.set Q3,56 -+ -+.set dp,32 -+.set dq,33 -+.set d,34 -+.set decision,35 -+.set beta,36 -+.set beta2,37 -+.set beta3,38 -+.set ptest,39 -+.set qtest,40 -+.set pqtest,41 -+.set thresh,42 -+.set deltatest, 44 -+.set deltap1, 45 -+.set tc25, 46 -+.set setup,47 -+.set tc,48 -+.set tc25,49 -+.set tc2, 50 -+.set do_filter, 51 -+.set delta, 52 -+.set tc10, 53 -+.set delta0, 54 -+.set delta1, 55 -+.set zeros, 0 -+.set setup_input, 1 -+.set deltaq1, 2 -+ -+ -+ -+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image. -+# Row has num16 16x16 blocks across -+# Beta goes from 0 to 64 -+# tc goes from 0 to 24 -+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number] -+# has 8 bytes per edge -+# has 16 bytes per direction -+# has 32 bytes per 16x16 block -+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4)) -+hevc_deblock_16x16: -+ push r6-r15, lr -+ mov r9,r4 -+ mov r4,r3 -+ mov r13,r2 -+ mov r2,r0 -+ mov r10,r0 -+ subscale4 r0,r1 -+ mov r8,63 -+ mov r6,-3 -+ vmov H(zeros,0),0 -+# r7 is number of blocks still to load -+# r0 is location of current block - 4 * stride -+# r1 is stride -+# r2 is location of current block -+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical -+# r4 is setup -+# r5 is for temporary calculations -+# r8 holds 63 -+# r6 holds -3 -+# r9 holds the number of 16 high rows to process -+# r10 holds the original img base -+# r11 returns 0 if no filtering was done on the edge -+# r12 saves a copy of this -+# r13 is copy of width -+ -+process_row: -+ # First iteration does not do horizontal filtering on previous -+ mov r7, r13 -+ mov r3,0 -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) # We may wish to prefetch these -+ vstb H(zeros,0),(r4) -+ bl vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 -+ bl vert_filter -+ sub r3,8 -+ b start_deblock_loop -+deblock_loop: -+ # Middle iterations do vertical on current block and horizontal on preceding -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) -+ vstb H(zeros,0),(r4) -+ bl vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl vert_filter -+ sub r3,8 -+ vldb H(setup_input,0), -16(r4) -+ vstb H(zeros,0),-16(r4) -+ bl horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl horz_filter -+ sub r3,8*64 -+ addcmpbeq r12,0,0,skip_save_top -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+skip_save_top: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 
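
The deblock loop above is easier to follow as scalar pseudo-code. Below is a minimal C sketch of the control flow only, with hypothetical helpers (load_block, vert_filter_pair, horz_filter_pair, store_block, store_rows_above) standing in for the VPU loads, the vert_filter/horz_filter calls and the conditional stores; this is for orientation and is not code from the patch:

    /* Sketch: vertical edges of block b are filtered as soon as it is
     * loaded; the horizontal edges of block b-1 are filtered one
     * iteration later, mirroring deblock_loop above. */
    void load_block(int b), vert_filter_pair(int b);
    int  horz_filter_pair(int b);            /* nonzero if any edge filtered */
    void store_block(int b), store_rows_above(int b);

    static void deblock_row_sketch(int num16w)
    {
        for (int b = 0; b < num16w; b++) {
            load_block(b);
            vert_filter_pair(b);              /* two 8-pixel vertical edges */
            if (b > 0) {                      /* horizontal pass lags by 1  */
                if (horz_filter_pair(b - 1))
                    store_rows_above(b - 1);  /* 4 context rows, if touched */
                store_block(b - 1);
            }
        }
        if (horz_filter_pair(num16w - 1))     /* final horizontal-only pass */
            store_rows_above(num16w - 1);
        store_block(num16w - 1);
    }

The one-block lag exists because vertically filtering the left edge of block b also rewrites the rightmost pixels of block b-1, so b-1's horizontal edges cannot be filtered until b has been loaded and vertically filtered.
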
-+start_deblock_loop: -+ # move onto next 16x16 (could do this with circular buffer support instead) -+ add r3,16 -+ and r3,r8 -+ add r4,32 -+ # Perform loop counter operations (may work with an addcmpbgt as well?) -+ add r0,16 -+ add r2,16 -+ sub r7,1 -+ cmp r7,0 # Are there still more blocks to load -+ bgt deblock_loop -+ -+ # Final iteration needs to just do horizontal filtering -+ vldb H(setup_input,0), -16(r4) -+ vstb H(zeros,0),-16(r4) -+ bl horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl horz_filter -+ sub r3,64*8 -+ addcmpbeq r12,0,0,skip_save_top2 -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+skip_save_top2: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+ -+# Now look to see if we should do another row -+ sub r9,1 -+ cmp r9,0 -+ bgt start_again -+ pop r6-r15, pc -+start_again: -+ # Need to sort out r0,r2 to point to next row down -+ addscale16 r10,r1 -+ mov r2,r10 -+ subscale4 r0,r2,r1 -+ b process_row -+ -+ -+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered -+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations -+ -+vert_filter: -+ push lr -+ -+ vmov HX(P3,0), V(16,12)+r3 -+ vmov HX(P2,0), V(16,13)+r3 -+ vmov HX(P1,0), V(16,14)+r3 -+ vmov HX(P0,0), V(16,15)+r3 -+ vmov HX(Q0,0), V(16,16)+r3 -+ vmov HX(Q1,0), V(16,17)+r3 -+ vmov HX(Q2,0), V(16,18)+r3 -+ vmov HX(Q3,0), V(16,19)+r3 -+ -+ bl do_luma_filter -+ -+ vadds V(16,13)+r3, HX(P2,0), 0 -+ vadds V(16,14)+r3, HX(P1,0), 0 -+ vadds V(16,15)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds V(16,16)+r3, HX(Q0,0), 0 -+ vadds V(16,17)+r3, HX(Q1,0), 0 -+ vadds V(16,18)+r3, HX(Q2,0), 0 -+ -+ pop pc -+ -+# Filter edge at H(16,0)+r3 -+horz_filter: -+ push lr -+ -+ vmov HX(P3,0), H(12,0)+r3 -+ vmov HX(P2,0), H(13,0)+r3 -+ vmov HX(P1,0), H(14,0)+r3 -+ vmov HX(P0,0), H(15,0)+r3 -+ vmov HX(Q0,0), H(16,0)+r3 -+ vmov HX(Q1,0), H(17,0)+r3 -+ vmov HX(Q2,0), H(18,0)+r3 -+ vmov HX(Q3,0), H(19,0)+r3 -+ -+ bl do_luma_filter -+ -+ vadds H(13,0)+r3, HX(P2,0), 0 -+ vadds H(14,0)+r3, HX(P1,0), 0 -+ vadds H(15,0)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds H(16,0)+r3, HX(Q0,0), 0 -+ vadds H(17,0)+r3, HX(Q1,0), 0 -+ vadds H(18,0)+r3, HX(Q2,0), 0 -+ -+ pop pc -+ -+# r4 points to array of beta/tc for each 4 length edge -+do_luma_filter: -+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8 -+ valtl HX(beta,0),H(setup,0),H(setup,0) -+ valtu HX(tc,0),H(setup,0),H(setup,0) -+ vmul HX(tc25,0), HX(tc,0), 5 -+ vadd HX(tc25,0),HX(tc25,0), 1 -+ vasr HX(tc25,0), HX(tc25,0), 1 -+ -+ # Compute decision -+ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1 -+ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1 -+ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0 -+ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0 -+ -+ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1 -+ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1 -+ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0 -+ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0 -+ -+ vadd HX(d,0), HX(dp,0), HX(dq,0) -+ vasr HX(beta2,0),HX(beta,0),2 -+ vasr HX(beta3,0),HX(beta,0),3 -+ -+ # Compute flags that are negative if all conditions pass -+ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC -+ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC -+ vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF -+ -+ vdist HX(decision,0), HX(P0,0), 
HX(Q0,0) IFN -+ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF -+ vadd HX(decision,0), HX(d,0), HX(d,0) IFN -+ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF -+ vmov HX(decision,0), 1 IFNN -+ vadd H(decision,0),H(decision,3),0 IFN -+ vadd H(decision,16),H(decision,19),0 IFN -+ vmov -,HX(decision,0) SETF # N marks strong filter -+ vmov HX(decision,0), 1 IFNN # NN marks normal filter -+ -+ vadd HX(do_filter,0), HX(d,3), HX(d,0) -+ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter -+ vmov HX(decision,0),0 IFNN # Z marks no filter -+ -+ # Expand out decision (currently valid one every 4 pixels) 0...1...2...3 -+ # First extract out even terms -+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3 -+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123 -+ # Now expand back -+ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233 -+ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333 -+ -+ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering -+ -+ # Do a quick check to see if there is anything to do -+ mov r11, 0 # Signal no filtering -+ vmov -,1 IFNZ SUMS r5 -+ cmp r5,0 -+ beq filtering_done -+ mov r11, 1 # Signal some filtering -+ # And whether there is any strong filtering -+ vmov -,1 IFN SUMS r5 -+ cmp r5,0 -+ beq normal_filtering -+ -+ ############################################################################## -+ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!) -+ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tx2 is tc/2, while here it is tc*2 -+ -+ # Take a copy of the original pixels for use in decision calculation -+ vmov HX(P0,32),HX(P0,0) -+ vmov HX(Q0,32),HX(Q0,0) -+ vmov HX(P1,32),HX(P1,0) -+ vmov HX(Q1,32),HX(Q1,0) -+ vmov HX(P2,32),HX(P2,0) -+ vmov HX(Q2,32),HX(Q2,0) -+ -+ vadd -,HX(P2,32),4 CLRA SACC -+ vshl -,HX(P1,32),1 SACC -+ vshl -,HX(P0,32),1 SACC -+ vshl -,HX(Q0,32),1 SACC -+ vshl HX(delta,0),HX(Q1,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(P0,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN -+ -+ vadd -,HX(P2,32),2 CLRA SACC -+ vadd -,HX(P1,32),HX(P0,32) SACC -+ vshl HX(delta,0),HX(Q0,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 2 -+ vsub HX(delta,0),HX(delta,0),HX(P1,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN -+ -+ vadd -,HX(Q0,32),4 CLRA SACC -+ vadd -,HX(P1,32),HX(P0,32) SACC -+ vmul -,HX(P2,32),3 SACC -+ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(P2,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN -+ #vmov HX(P2,0),3 IFN -+ -+ # Now reverse all P/Qs -+ -+ vadd -,HX(Q2,32),4 CLRA SACC -+ vshl -,HX(Q1,32),1 SACC -+ vshl -,HX(Q0,32),1 SACC -+ vshl -,HX(P0,32),1 SACC -+ vshl HX(delta,0),HX(P1,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(Q0,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN -+ -+ vadd -,HX(Q2,32),2 CLRA SACC -+ vadd -,HX(Q1,32),HX(Q0,32) SACC -+ vshl HX(delta,0),HX(P0,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 2 -+ vsub HX(delta,0),HX(delta,0),HX(Q1,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN -+ -+ vadd -,HX(P0,32),4 CLRA 
SACC -+ vadd -,HX(Q1,32),HX(Q0,32) SACC -+ vmul -,HX(Q2,32),3 SACC -+ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(Q2,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN -+ -+ ############################################################################## -+ # Normal filtering -+normal_filtering: -+ # Invert the decision flags -+ # make instruction more complicated as assembler has error and loses SETF -+ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering -+ vmov -, HX(tc10,0) SETF # IFN means normal filtering -+ -+ vmov -,1 IFN SUMS r5 -+ cmp r5,0 -+ beq filtering_done -+ -+ vasr HX(tc2,0), HX(tc,0), 1 -+ vmul HX(tc10,0), HX(tc,0), 10 -+ -+ vasr HX(thresh,0), HX(beta,0), 1 -+ vadd HX(thresh,0), HX(thresh,0), HX(beta,0) -+ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC -+ -+ vadd HX(ptest,0),HX(dp,3),HX(dp,0) -+ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel -+ vadd HX(qtest,0),HX(dq,3),HX(dq,0) -+ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel -+ # Expand ptest and qtest together -+ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q -+ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........ -+ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq -+ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0) -+ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0) -+ -+ vsub HX(delta0,0), HX(Q0,0), HX(P0,0) -+ vsub HX(delta1,0), HX(Q1,0), HX(P1,0) -+ vmov -,8 CLRA SACC -+ vmul -,HX(delta0,0), 9 SACC -+ vmul HX(delta0,0),HX(delta1,0), r6 SACC -+ vasr HX(delta0,0), HX(delta0,0), 4 -+ vdist HX(deltatest,0), HX(delta0,0), 0 -+ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something -+ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later -+ -+ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0) -+ -+ vadd HX(deltap1,0), HX(P2,0), HX(P0,0) -+ vadd HX(deltap1,0), HX(deltap1,0), 1 -+ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC -+ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC -+ vasr HX(deltap1,0), HX(deltap1,0), 1 -+ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0) -+ -+ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0) -+ vadd HX(deltaq1,0), HX(deltaq1,0), 1 -+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC -+ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0) -+ vrsub -, HX(delta0,0), 0 SACC -+ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC -+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 -+ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0) -+ -+ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN -+ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN -+ -+ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1 -+ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN -+ -+ vmov -,HX(deltatest,0) SETF -+ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1 -+ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN -+ -+ #vmov HX(P2,0),1 IFN -+ -+filtering_done: -+ b lr -+ -+ -+hevc_uv_deblock_16x16: -+ push r6-r15, lr -+ mov r14,0 -+ b hevc_uv_start -+hevc_uv_deblock_16x16_with_clear: -+ push r6-r15, lr -+ mov r14,1 -+ b hevc_uv_start -+ -+hevc_uv_start: -+ mov r9,r4 -+ mov r4,r3 -+ mov r13,r2 -+ mov r2,r0 -+ mov r10,r0 -+ subscale4 r0,r1 -+ mov r8,63 -+ mov r6,-3 -+ vmov H(zeros,0),0 -+# r7 is number of blocks still to load -+# r0 is location of current block - 4 * stride -+# r1 is stride -+# r2 is location of current 
block -+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical -+# r4 is setup -+# r5 is for temporary calculations -+# r8 holds 63 -+# r6 holds -3 -+# r9 holds the number of 16 high rows to process -+# r10 holds the original img base -+# r11 returns 0 if no filtering was done on the edge -+# r12 saves a copy of this -+# r13 is copy of width -+# r14 is 1 if we should clear the old contents, or 0 if not -+ -+uv_process_row: -+ # First iteration does not do horizontal filtering on previous -+ mov r7, r13 -+ mov r3,0 -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) # We may wish to prefetch these -+ cmp r14,1 -+ bne uv_skip0 -+ vstb H(zeros,0),(r4) -+uv_skip0: -+ bl uv_vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 -+ bl uv_vert_filter -+ sub r3,8 -+ b uv_start_deblock_loop -+uv_deblock_loop: -+ # Middle iterations do vertical on current block and horizontal on preceding -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) -+ cmp r14,1 -+ bne uv_skip1 -+ vstb H(zeros,0),(r4) -+uv_skip1: -+ bl uv_vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_vert_filter -+ sub r3,8 -+ vldb H(setup_input,0), -16(r4) -+ cmp r14,1 -+ bne uv_skip3 -+ vstb H(zeros,0),-16(r4) -+uv_skip3: -+ bl uv_horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_horz_filter -+ sub r3,8*64 -+ addcmpbeq r12,0,0,uv_skip_save_top -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+uv_skip_save_top: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+uv_start_deblock_loop: -+ # move onto next 16x16 (could do this with circular buffer support instead) -+ add r3,16 -+ and r3,r8 -+ add r4,32 -+ # Perform loop counter operations (may work with an addcmpbgt as well?) 
-+ add r0,16 -+ add r2,16 -+ sub r7,1 -+ cmp r7,0 # Are there still more blocks to load -+ bgt uv_deblock_loop -+ -+ # Final iteration needs to just do horizontal filtering -+ vldb H(setup_input,0), -16(r4) -+ cmp r14,1 -+ bne uv_skip2 -+ vstb H(zeros,0),-16(r4) -+uv_skip2: -+ bl uv_horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_horz_filter -+ sub r3,64*8 -+ addcmpbeq r12,0,0,uv_skip_save_top2 -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+uv_skip_save_top2: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+ -+# Now look to see if we should do another row -+ sub r9,1 -+ cmp r9,0 -+ bgt uv_start_again -+ pop r6-r15, pc -+uv_start_again: -+ # Need to sort out r0,r2 to point to next row down -+ addscale16 r10,r1 -+ mov r2,r10 -+ subscale4 r0,r2,r1 -+ b uv_process_row -+ -+ -+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered -+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations -+ -+uv_vert_filter: -+ push lr -+ -+ vmov HX(P1,0), V(16,14)+r3 -+ vmov HX(P0,0), V(16,15)+r3 -+ vmov HX(Q0,0), V(16,16)+r3 -+ vmov HX(Q1,0), V(16,17)+r3 -+ -+ bl do_chroma_filter -+ -+ vadds V(16,15)+r3, HX(P0,0), 0 -+ vadds V(16,16)+r3, HX(Q0,0), 0 -+ -+ pop pc -+ -+# Filter edge at H(16,0)+r3 -+uv_horz_filter: -+ push lr -+ -+ vmov HX(P1,0), H(14,0)+r3 -+ vmov HX(P0,0), H(15,0)+r3 -+ vmov HX(Q0,0), H(16,0)+r3 -+ vmov HX(Q1,0), H(17,0)+r3 -+ -+ bl do_chroma_filter -+ -+ vadds H(15,0)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds H(16,0)+r3, HX(Q0,0), 0 -+ -+ pop pc -+ -+# r4 points to array of beta/tc for each 4 length edge -+do_chroma_filter: -+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8 -+ valtl HX(tc,0),H(setup,0),H(setup,0) -+ -+ vsub HX(delta,0),HX(Q0,0),HX(P0,0) -+ vshl HX(delta,0),HX(delta,0),2 CLRA SACC -+ vsub -,HX(P1,0),HX(Q1,0) SACC -+ vmov HX(delta,0),4 SACC -+ vasr HX(delta,0),HX(delta,0),3 -+ vclamps HX(delta,0), HX(delta,0), HX(tc,0) -+ vadd HX(P0,0),HX(P0,0),HX(delta,0) -+ vsub HX(Q0,0),HX(Q0,0),HX(delta,0) -+ b lr -+ -+# r0 = list -+# r1 = number -+hevc_run_command_list: -+ push r6-r7, lr -+ mov r6, r0 -+ mov r7, r1 -+loop_cmds: -+ ld r0,(r6) # How to encode r6++? 
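
The sequence of loads around this point defines the command-stream format: hevc_run_command_list walks a flat array of six-word records, loading each into r0-r5 and calling hevc_trans_16x16. A hypothetical C-side view of the same walk (struct and function names invented for illustration):

    /* Sketch of the list consumed by hevc_run_command_list: six 32-bit
     * words per command, passed straight through as r0..r5. */
    #include <stdint.h>

    typedef struct vpu_cmd {
        uint32_t r[6];   /* arguments for one hevc_trans_16x16 call */
    } vpu_cmd_t;

    static void run_command_list_sketch(const vpu_cmd_t *cmd, int number,
                                        void (*trans16)(uint32_t, uint32_t,
                                                        uint32_t, uint32_t,
                                                        uint32_t, uint32_t))
    {
        for (int i = 0; i < number; i++, cmd++)
            trans16(cmd->r[0], cmd->r[1], cmd->r[2],
                    cmd->r[3], cmd->r[4], cmd->r[5]);
    }

The "How to encode r6++?" comment is asking whether the VPU load has a post-increment addressing form that would fold away the separate add r6,4 instructions; the sketch sidesteps that with plain pointer arithmetic.
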
-+ add r6,4 -+ ld r1,(r6) -+ add r6,4 -+ ld r2,(r6) -+ add r6,4 -+ ld r3,(r6) -+ add r6,4 -+ ld r4,(r6) -+ add r6,4 -+ ld r5,(r6) -+ add r6,4 -+ bl hevc_trans_16x16 -+ sub r7,1 -+ cmp r7,0 -+ bgt loop_cmds -+ -+ pop r6-r7, pc diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c new file mode 100644 -index 0000000..0255f5d +index 0000000000..0255f5dd44 --- /dev/null +++ b/libavcodec/rpi_mailbox.c @@ -0,0 +1,149 @@ @@ -14976,7 +22421,7 @@ index 0000000..0255f5d + diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h new file mode 100644 -index 0000000..b316878 +index 0000000000..b3168788d2 --- /dev/null +++ b/libavcodec/rpi_mailbox.h @@ -0,0 +1,58 @@ @@ -15040,10 +22485,10 @@ index 0000000..b316878 +#endif diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c new file mode 100644 -index 0000000..36c8ab6 +index 0000000000..e872b855b7 --- /dev/null +++ b/libavcodec/rpi_qpu.c -@@ -0,0 +1,878 @@ +@@ -0,0 +1,935 @@ +#ifdef RPI +#include +#include @@ -15062,8 +22507,9 @@ index 0000000..36c8ab6 +#include "rpi_mailbox.h" +#include "rpi_qpu.h" +#include "rpi_shader.h" -+#include "rpi_hevc_transform.h" -+#include "rpi_zc.h" ++#include "rpi_hevc_transform8.h" ++#include "rpi_hevc_transform10.h" ++#include "libavutil/rpi_sand_fns.h" + +#pragma GCC diagnostic push +// Many many redundant decls in the header files @@ -15090,7 +22536,7 @@ index 0000000..36c8ab6 +#define vcos_verify_ge0(x) ((x)>=0) + +// Size in 32bit words -+#define QPU_CODE_SIZE 2048 ++#define QPU_CODE_SIZE 4098 +#define VPU_CODE_SIZE 2048 + +static const short rpi_transMatrix2even[32][16] = { // Even rows first @@ -15133,7 +22579,8 @@ index 0000000..36c8ab6 +struct GPU +{ + unsigned int qpu_code[QPU_CODE_SIZE]; -+ unsigned int vpu_code[VPU_CODE_SIZE]; ++ unsigned int vpu_code8[VPU_CODE_SIZE]; ++ unsigned int vpu_code10[VPU_CODE_SIZE]; + short transMatrix2even[16*16*2]; +}; + @@ -15145,8 +22592,9 @@ index 0000000..36c8ab6 +#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) + +struct rpi_cache_flush_env_s { -+ unsigned int n; -+ struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++// unsigned int n; ++// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++ struct vcsm_user_clean_invalid2_s v; +}; + +#define WAIT_COUNT_MAX 16 @@ -15188,6 +22636,7 @@ index 0000000..36c8ab6 + int open_count; + int init_count; + int mb; ++ int vpu_i_cache_flushed; + GPU_MEM_PTR_T code_gm_ptr; + vq_wait_pool_t wait_pool; +#if RPI_TRACE_TIME_VPU_QPU_WAIT @@ -15260,8 +22709,8 @@ index 0000000..36c8ab6 + +// GPU_MEM_PTR_T alloc fns +static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { -+ p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); ++ p->numbytes = (numbytes + 255) & ~255; // Round up ++ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); @@ -15272,12 +22721,14 @@ index 0000000..36c8ab6 + av_assert0(p->arm); + p->vc = mbox_mem_lock(mb, p->vc_handle); + av_assert0(p->vc); ++// printf("***** %s, %d\n", __func__, numbytes); ++ + return 0; +} + +static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { + p->numbytes = numbytes; -+ p->vcsm_handle = 
vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); ++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" ); + av_assert0(p->vcsm_handle); + p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); + av_assert0(p->vc_handle); @@ -15285,6 +22736,7 @@ index 0000000..36c8ab6 + av_assert0(p->arm); + p->vc = mbox_mem_lock(mb, p->vc_handle); + av_assert0(p->vc); ++// printf("***** %s, %d\n", __func__, numbytes); + return 0; +} + @@ -15293,6 +22745,7 @@ index 0000000..36c8ab6 + vcsm_unlock_ptr(p->arm); + vcsm_free(p->vcsm_handle); + memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again ++// printf("***** %s\n", __func__); +} + + @@ -15349,9 +22802,14 @@ index 0000000..36c8ab6 + } + // And the VPU code + { -+ int num_bytes = sizeof(rpi_hevc_transform); ++ int num_bytes = sizeof(rpi_hevc_transform8); + av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes); ++ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes); ++ } ++ { ++ int num_bytes = sizeof(rpi_hevc_transform10); ++ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); ++ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes); + } + // And the transform coefficients + memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); @@ -15442,10 +22900,18 @@ index 0000000..36c8ab6 + gpu_unlock_unref(ge); +} + -+unsigned int vpu_get_fn(void) { ++unsigned int vpu_get_fn(const unsigned int bit_depth) { + // Make sure that the gpu is initialized + av_assert0(gpu != NULL); -+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code); ++ switch (bit_depth){ ++ case 8: ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); ++ case 10: ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); ++ default: ++ av_assert0(0); ++ } ++ return 0; +} + +unsigned int vpu_get_constants(void) { @@ -15475,95 +22941,75 @@ index 0000000..36c8ab6 +// +// Cache flush functions + ++#define CACHE_EL_MAX 16 + +rpi_cache_flush_env_t * rpi_cache_flush_init() +{ -+ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t)); -+ if (rfe == NULL) -+ return NULL; ++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) + ++ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX); ++ if (rfe == NULL) ++ return NULL; + -+ rfe->n = 0; -+ return rfe; ++ rfe->v.op_count = 0; ++ return rfe; +} + +void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) +{ -+ if (rfe != NULL) -+ free(rfe); ++ if (rfe != NULL) ++ free(rfe); +} + +int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) +{ -+ int rc = 0; -+ unsigned int na; -+ unsigned int nr; ++ int rc = 0; + -+ // Clear any reamaining ents in the final block -+ if ((nr = rfe->n % CFE_ENTS_PER_A) != 0) -+ memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0])); ++ if (vcsm_clean_invalid2(&rfe->v) != 0) ++ rc = -1; + -+ for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na) -+ { -+ if (vcsm_clean_invalid(rfe->a + na) != 0) -+ rc = -1; -+ } ++ free(rfe); + -+ free(rfe); ++ if (rc == 0) ++ return 0; + -+ if (rc == 0) -+ return 0; -+ -+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); -+ return rc; ++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); ++ return rc; +} + -+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const 
unsigned int mode) ++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) +{ -+ // Deal with empty pointer trivially -+ if (gm == NULL || gm->numbytes == 0) -+ return; ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; + -+ { -+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); -+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; ++ av_assert0(rfe->v.op_count <= CACHE_EL_MAX); + -+ av_assert0(rfe->n < CFE_ENT_COUNT); -+ -+ a->s[n].cmd = mode; -+ a->s[n].handle = gm->vcsm_handle; -+ a->s[n].addr = (unsigned int)gm->arm; -+ a->s[n].size = gm->numbytes; -+ ++rfe->n; -+ } ++ b->invalidate_mode = mode; ++ b->block_count = blocks; ++ b->start_address = gm->arm + offset0; ++ b->block_size = block_size; ++ b->inter_block_stride = block_stride; +} + +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, + const unsigned int offset, const unsigned int size) +{ -+ // Deal with empty pointer trivially -+ if (gm == NULL || size == 0) -+ return; ++ // Deal with empty pointer trivially ++ if (gm == NULL || size == 0) ++ return; + -+// printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes); ++ av_assert0(offset <= gm->numbytes); ++ av_assert0(size <= gm->numbytes); ++ av_assert0(offset + size <= gm->numbytes); + -+ av_assert0(offset <= gm->numbytes); -+ av_assert0(size <= gm->numbytes); -+ av_assert0(offset + size <= gm->numbytes); -+ -+ { -+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); -+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; -+ -+ av_assert0(rfe->n < CFE_ENT_COUNT); -+ -+ a->s[n].cmd = mode; -+ a->s[n].handle = gm->vcsm_handle; -+ a->s[n].addr = (unsigned int)gm->arm + offset; -+ a->s[n].size = size; -+ ++rfe->n; -+ } ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); +} + ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++{ ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); ++} ++ ++ +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) +{ +#if !RPI_ONE_BUF @@ -15580,6 +23026,8 @@ index 0000000..36c8ab6 + } +} + ++// Flush an area of a frame ++// Width, height, x0, y0 in luma pels +void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, + const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, + const unsigned int uv_shift, const int do_luma, const int do_chroma) @@ -15610,7 +23058,7 @@ index 0000000..36c8ab6 + rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); + } + } -+ else if (!rpi_sliced_frame(frame)) ++ else if (!av_rpi_is_sand_frame(frame)) + { + const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); + if (do_luma) { @@ -15623,17 +23071,30 @@ index 0000000..36c8ab6 + } + else + { -+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); -+// printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 'c' : ' '); -+ // **** Use x0! 
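
Taken together, the rpi_cache_flush_* functions in this region form a small batching API: ranges are queued into the env and a single vcsm_clean_invalid2 call flushes them all when finish is called. A hedged usage sketch follows; RPI_CACHE_FLUSH_MODE_WB_INVALIDATE is an assumed enumerator of rpi_cache_flush_mode_t, whose actual values are declared elsewhere in rpi_qpu.h:

    /* Sketch: queue one range, flush in one ioctl. finish() frees the
     * env whether or not the flush succeeds; abort() frees it unused. */
    #include "rpi_qpu.h"   /* GPU_MEM_PTR_T, rpi_cache_flush_* */

    static int flush_gm_sketch(const GPU_MEM_PTR_T *gm,
                               unsigned int offset, unsigned int size)
    {
        rpi_cache_flush_env_t *const rfe = rpi_cache_flush_init();
        if (rfe == NULL)
            return -1;
        rpi_cache_flush_add_gm_range(rfe, gm,
                                     RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, /* assumed name */
                                     offset, size);
        return rpi_cache_flush_finish(rfe);
    }

Batching matters here because each flush is a kernel round trip; queueing all of a frame's ranges and issuing one vcsm_clean_invalid2 keeps the per-slice overhead down.
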
-+ for (int x = 0; x < x0 + width; x += frame->linesize[0]) { -+ if (do_luma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, rpi_sliced_frame_off_y(frame, x, y0), y_size); -+ } -+ if (do_chroma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, -+ (frame->data[1] - gm->arm) + rpi_sliced_frame_off_c(frame, x >> 1, y0 >> 1), uv_size); -+ } ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); ++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); ++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C ++ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); ++ ++ if (do_chroma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); ++ b->block_size = uv_size; ++ b->inter_block_stride = stride1 * stride2; ++ } ++ if (do_luma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); ++ b->block_size = y_size; ++ b->inter_block_stride = stride1 * stride2; + } + } +} @@ -15788,13 +23249,17 @@ index 0000000..36c8ab6 + vqj->mask |= VPU_QPU_MASK_VPU; + + j->command = EXECUTE_VPU; -+ j->u.v.q[0] = vpu_code; ++ // The bottom two bits of the execute address contain no-flush flags ++ // b0 will flush the VPU I-cache if unset so we nearly always want that set ++ // as we never reload code ++ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed; + j->u.v.q[1] = r0; + j->u.v.q[2] = r1; + j->u.v.q[3] = r2; + j->u.v.q[4] = r3; + j->u.v.q[5] = r4; + j->u.v.q[6] = r5; ++ gpu->vpu_i_cache_flushed = 1; + } +} + @@ -15921,13 +23386,50 @@ index 0000000..36c8ab6 + return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code); +} + ++ ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth) ++{ ++ // Dummy values we can catch with emulation ++ qf->y_pxx = ~1U; ++ qf->y_bxx = ~2U; ++ qf->y_p00 = ~3U; ++ qf->y_b00 = ~4U; ++ qf->c_pxx = ~5U; ++ qf->c_bxx = ~6U; ++ ++ switch (bit_depth) { ++ case 8: ++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); ++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); ++ qf->y_bxx = qpu_fn(mc_filter_y_bxx); ++ qf->y_p00 = qpu_fn(mc_filter_y_p00); ++ qf->y_b00 = qpu_fn(mc_filter_y_b00); ++ qf->c_pxx = qpu_fn(mc_filter_c_p); ++ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1); ++ qf->c_bxx = qpu_fn(mc_filter_c_b); ++ break; ++ case 10: ++ qf->c_pxx = qpu_fn(mc_filter_c10_p); ++ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1); ++ qf->c_bxx = qpu_fn(mc_filter_c10_b); ++ qf->y_pxx = qpu_fn(mc_filter_y10_pxx); ++ qf->y_bxx = qpu_fn(mc_filter_y10_bxx); ++ qf->y_p00 = qpu_fn(mc_filter_y10_p00); ++ qf->y_b00 = qpu_fn(mc_filter_y10_b00); ++ break; ++ default: ++ return -1; ++ } ++ return 0; ++} ++ +#endif // RPI diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h new file mode 100644 -index 0000000..636e420 +index 0000000000..485a08f8ba --- /dev/null +++ b/libavcodec/rpi_qpu.h -@@ -0,0 +1,201 @@ +@@ -0,0 +1,206 @@ +#ifndef RPI_QPU_H +#define RPI_QPU_H + @@ -16072,6 +23574,8 @@ index 0000000..636e420 +void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const 
rpi_cache_flush_mode_t mode); +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, + const unsigned int offset, const unsigned int size); ++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode); +void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, + const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, @@ -16082,12 +23586,22 @@ index 0000000..636e420 + + +// QPU specific functions ++ ++typedef struct HEVCRpiQpu { ++ uint32_t c_pxx; ++ uint32_t c_pxx_l1; ++ uint32_t c_bxx; ++ uint32_t y_pxx; ++ uint32_t y_bxx; ++ uint32_t y_p00; ++ uint32_t y_b00; ++} HEVCRpiQpu; ++ ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); ++ +uint32_t qpu_fn(const int * const mc_fn); + -+#define QPU_N_GRP_UV 4 -+#define QPU_N_UV 12 -+#define QPU_N_GRP_Y 4 // 4 QPUs per TMU -+#define QPU_N_Y 12 ++#define QPU_N_GRP 4 +#define QPU_N_MAX 12 + +#define QPU_MAIL_EL_VALS 2 @@ -16109,8 +23623,7 @@ index 0000000..636e420 +int vpu_qpu_job_start(const vpu_qpu_job_h vqj); +int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); + -+ -+extern unsigned int vpu_get_fn(void); ++extern unsigned int vpu_get_fn(const unsigned int bit_depth); +extern unsigned int vpu_get_constants(void); + +// Waits for previous post_codee to complete and Will null out *wait_h after use @@ -16118,12 +23631,6 @@ index 0000000..636e420 +int vpu_qpu_init(void); +void vpu_qpu_term(void); + -+// Simple test of shader code -+extern int rpi_test_shader(void); -+ -+extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst); -+extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch); -+ +extern int gpu_get_mailbox(void); +void gpu_ref(void); +void gpu_unref(void); @@ -16131,10 +23638,10 @@ index 0000000..636e420 +#endif diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c new file mode 100644 -index 0000000..f2842b6 +index 0000000000..2c6541a8fb --- /dev/null +++ b/libavcodec/rpi_shader.c -@@ -0,0 +1,734 @@ +@@ -0,0 +1,1570 @@ +#include "rpi_shader.h" + +#ifdef _MSC_VER @@ -16164,706 +23671,1542 @@ index 0000000..f2842b6 +// ::mc_setup_c_qn +/* [0x00000008] */ 0x00000001, 0xe0020927, // mov tmurs, 1 +/* [0x00000010] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_base, unif -+/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 -+/* [0x00000028] */ 0x0c9e7000, 0x10021667, // add rb_max_x, r0, r0 -+/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 -+/* [0x00000038] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 -+/* [0x00000040] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000048] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000050] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif ; mov ra12, 0 -+/* [0x00000058] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif ; mov ra13, 0 -+/* [0x00000060] */ 0x00000000, 0xe00059ce, // nop ; mov ra14, 0 -+/* 
[0x00000068] */ 0x8c5103f6, 0x1802560f, // add rb_dma1_base, r1, rb_pitch ; mov ra15, ra_k0 -+/* [0x00000070] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num -+/* [0x00000078] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 -+/* [0x00000080] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num -+/* [0x00000088] */ 0x0c027d80, 0x14020827, // add r0, ra0.16b, ra0.16b -+/* [0x00000090] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x -+/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a -+/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000000a8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x000000b0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x000000b8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x000000c0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x000000c8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000000d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x000000d8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 -+/* [0x000000e0] */ 0x0c809f80, 0xd0021367, // add rb_wt_den_p15, 9, unif -+/* [0x000000e8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x000000f0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000100] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000108] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000110] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000118] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000138] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x00000140] */ 0x15827d80, 0x10020667, // mov ra_base2, unif -+/* [0x00000148] */ 0x0c027d80, 0x14020827, // add r0, ra0.16b, ra0.16b -+/* [0x00000150] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a -+/* [0x00000158] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000160] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000168] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000170] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000178] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x00000180] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x00000188] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000190] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 -+/* [0x00000198] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 -+/* [0x000001a0] */ 0x95442ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y -+// :c_preload -+/* [0x000001a8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000001b0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x000001b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000001c0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000001c8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x000001d0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x000001d8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:c_preload -+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000001e8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch 
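
Both the :c_preload loop here and the :1 loop in the replacement dump further down prime the texture units the same way before the filter loop starts: PREREAD rows are requested per TMU, with the row index clamped to the picture so off-screen fetches replicate the edge rows. A rough scalar model (sketch only; tmu_fetch is a stand-in for writing the t0s/t1s address registers, and preread corresponds to the assembler constant PREREAD):

    /* Scalar model of the TMU priming loop (:c_preload / :1). */
    #include "libavutil/common.h"               /* av_clip() */

    void tmu_fetch(int unit, unsigned int addr); /* hypothetical stand-in */

    static void preload_sketch(unsigned int base, unsigned int base2,
                               int y, int y2, int pitch, int max_y,
                               int preread)
    {
        for (int i = 0; i < preread; i++) {
            tmu_fetch(0, base  + av_clip(y++,  0, max_y) * pitch); /* t0s */
            tmu_fetch(1, base2 + av_clip(y2++, 0, max_y) * pitch); /* t1s */
        }
    }
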
-+/* [0x000001f0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 -+/* [0x000001f8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000200] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x00000208] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000210] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x00000218] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x00000220] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 -+// ::mc_filter_uv -+/* [0x00000228] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000230] */ 0x14981dc0, 0xd00229e7, // and.setf -, elem_num, 1 -+/* [0x00000238] */ 0xec0a7d89, 0x14024821, // add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 -+/* [0x00000240] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x -+/* [0x00000248] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x00000250] */ 0x935401f6, 0xd4024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next -+/* [0x00000258] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif -+/* [0x00000260] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00000268] */ 0x9481c1f6, 0xd0025800, // and r0, r0, -4 ; mov ra0, unif -+/* [0x00000270] */ 0x800a7036, 0x122059d3, // nop ; mov ra_y_next, ra2.16a -+/* [0x00000278] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 -+/* [0x00000280] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000288] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a -+/* [0x00000290] */ 0x0c9e7600, 0x100206a7, // add ra_base_next, r3, r0 -+/* [0x00000298] */ 0x119c73c0, 0xd0020827, // shl r0, r1, 7 -+/* [0x000002a0] */ 0x8d818eb6, 0x10025743, // sub rb_dma1, rb_dma1_base, r2 ; mov ra3, unif -+/* [0x000002a8] */ 0x8c8013f6, 0xd0025456, // add rb_i_tmu, r1, 3 - PREREAD ; mov ra_wt_off_mul_l0, unif -+/* [0x000002b0] */ 0x8c8033f6, 0xd002d496, // add rb_lcount, r1, 3 ; mov.ifnz ra_wt_off_mul_l0, unif -+/* [0x000002b8] */ 0x8c0e70b6, 0x18024808, // add r0, r0, r2 ; mov rb8, ra3.8a -+/* [0x000002c0] */ 0x910d01f6, 0xda024809, // shl r0, r0, i_shift16 ; mov rb9, ra3.8b -+/* [0x000002c8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x000002d0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y -+/* [0x000002d8] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb_wt_den_p15 ; mov rb10, ra3.8c -+/* [0x000002e0] */ 0x950c0ff6, 0xde02494b, // mov r5quad, 0 ; mov rb11, ra3.8d -+/* [0x000002e8] */ 0x8f8013f6, 0xd002531e, // asr rb_wt_off, r1, 1 ; mov ra_link, unif -+/* [0x000002f0] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 -+/* [0x000002f8] */ 0x0000ff00, 0xe20210e7, // mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -+// :uvloop -+/* [0x00000300] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 -+/* [0x00000308] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+/* [0x00000310] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, 8 ; mov.ifnz r3, ra_y -+/* [0x00000318] */ 0x8c6817f6, 0xd0029818, // add r0, r3, 1 ; mov.ifz ra_base, ra_base_next -+/* [0x00000320] */ 0x94981f80, 0xd02279d1, // and.setf -, 1, elem_num ; mov ra_y, r0 -+/* [0x00000328] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 -+/* [0x00000330] */ 0x9227f792, 0xd00288e1, // min r3, r3, ra9 ; mov.ifz r1, r2 << 1 -+/* [0x00000338] */ 0x559d049f, 0x10044822, // mov.ifz r0, r2 ; mul24 r2, 
r3, rb_pitch -+/* [0x00000340] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000348] */ 0x95143ff6, 0x100279c4, // mov.setf -, rb3 ; mov ra4, ra5 -+/* [0x00000350] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+/* [0x00000358] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000360] */ 0x40034031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000368] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000370] */ 0x40032031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000378] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d , r1 -+/* [0x00000380] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x00000388] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 -+/* [0x00000390] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 -+/* [0x00000398] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 -+/* [0x000003a0] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 -+/* [0x000003a8] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x000003b0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x000003b8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000003c8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 -+/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb_wt_off -+/* [0x000003e0] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x000003e8] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb_wt_den_p15 -+/* [0x000003f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000003f8] */ 0x15067d80, 0x18020c27, // mov vpm, ra1.8a -+/* [0x00000400] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000408] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb_dma0 -+/* [0x00000410] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb_dma1 -+/* [0x00000418] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest -+// ::mc_filter_uv_b0 -+/* [0x00000420] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000428] */ 0x14981dc0, 0xd00229e7, // and.setf -, elem_num, 1 -+/* [0x00000430] */ 0xec0a7d89, 0x14024821, // add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 -+/* [0x00000438] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+/* [0x00000440] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x00000448] */ 0x935401f6, 0xd4125815, // max r0, r0, 0 ; mov ra_xshift, ra_xshift_next -+/* [0x00000450] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif -+/* [0x00000458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00000460] */ 0x9481c1f6, 0xd0025800, // and r0, r0, -4 ; mov ra0, unif -+/* [0x00000468] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 -+/* [0x00000470] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000478] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a -+/* [0x00000480] */ 0x0c9e7600, 0x100206a7, // add ra_base_next, r3, r0 -+/* [0x00000488] */ 0x918073f6, 0xd0025802, // shl r0, r1, 7 ; mov ra2, unif -+/* [0x00000490] */ 0x0d9d8e80, 0x10021767, // sub rb_dma1, rb_dma1_base, r2 -+/* [0x00000498] 
*/ 0x0c9c13c0, 0xd0021467, // add rb_i_tmu, r1, 3 - PREREAD -+/* [0x000004a0] */ 0x0c9c33c0, 0xd00214a7, // add rb_lcount, r1, 3 -+/* [0x000004a8] */ 0x8c8270b6, 0x10125816, // add r0, r0, r2 ; mov ra_wt_mul_l0, unif -+/* [0x000004b0] */ 0x915201bf, 0x1c12d816, // shl r0, r0, ra_k16 ; mov.ifnz ra_wt_mul_l0, unif -+/* [0x000004b8] */ 0x8c81b1f6, 0x10025683, // add rb_dma0, r0, rb_dma0_base ; mov ra3, unif -+/* [0x000004c0] */ 0x159defc0, 0x10020267, // mov ra9, rb_max_y -+/* [0x000004c8] */ 0xec0e7d89, 0x14024821, // add r0, ra3.16b, ra3.16b ; v8subs r1, r1, r1 -+/* [0x000004d0] */ 0x8c0c21f6, 0x12125813, // add r0, r0, rb_elem_x ; mov ra_y2_next, ra3.16a -+/* [0x000004d8] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x000004e0] */ 0x935011bf, 0x18024800, // max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next -+/* [0x000004e8] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif -+/* [0x000004f0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x000004f8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000500] */ 0x94827076, 0x10025843, // and r1, r0, r1 ; mov ra3, unif -+/* [0x00000508] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000510] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1 ; mov rb8, ra3.8a -+/* [0x00000518] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 -+/* [0x00000520] */ 0x950e0ff6, 0x1a024489, // mov ra_wt_off_mul_l1, unif ; mov rb9, ra3.8b -+/* [0x00000528] */ 0x950e0ff6, 0x1c06448a, // mov.ifnz ra_wt_off_mul_l1, unif ; mov rb10, ra3.8c -+/* [0x00000530] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif -+/* [0x00000538] */ 0x950c0ff6, 0xde02494b, // mov r5quad,0 ; mov rb11, ra3.8d -+/* [0x00000540] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 -+/* [0x00000548] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif -+/* [0x00000550] */ 0x0000ff00, 0xe20210e7, // mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -+// :uvloop_b -+/* [0x00000558] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 -+/* [0x00000560] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next -+/* [0x00000568] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, 8 ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00000570] */ 0x95685ff6, 0x10029118, // mov rb4, rb5 ; mov.ifz ra_base, ra_base_next -+/* [0x00000578] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y -+/* [0x00000580] */ 0x14981f80, 0xd00229e7, // and.setf -, 1, elem_num -+/* [0x00000588] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 -+/* [0x00000590] */ 0x9227f792, 0xd00288e1, // min r3, r3, ra9 ; mov.ifz r1, r2 << 1 -+/* [0x00000598] */ 0x559d049f, 0x10044823, // mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x000005a0] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_k255 -+/* [0x000005a8] */ 0x95143ff6, 0x100279c4, // mov.setf -, rb3 ; mov ra4, ra5 -+/* [0x000005b0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+/* [0x000005b8] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000005c0] */ 0x40034031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x000005c8] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000005d0] */ 0x40032031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x000005d8] */ 0x4c0274f1, 
0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1
-+/* [0x000005e0] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
-+/* [0x000005e8] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6
-+/* [0x000005f0] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, 8 ; mov r3, ra_y2
-+/* [0x000005f8] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7
-+/* [0x00000600] */ 0x14981f80, 0xd00229e7, // and.setf -, 1, elem_num
-+/* [0x00000608] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
-+/* [0x00000610] */ 0x9227f792, 0xd00288e1, // min r3, r3, ra9 ; mov.ifz r1, r2 << 1
-+/* [0x00000618] */ 0x559d049f, 0x10044823, // mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch
-+/* [0x00000620] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_k255
-+/* [0x00000628] */ 0x950c3ff6, 0x100269c7, // mov.setf -, rb3 ; mov rb7, ra3
-+/* [0x00000630] */ 0x540563f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra1.8a, r0
-+/* [0x00000638] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00000640] */ 0x40074031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000648] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00000650] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00000658] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
-+/* [0x00000660] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000668] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
-+/* [0x00000670] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10
-+/* [0x00000678] */ 0x4d08443e, 0x180241e0, // sub ra7, r2, r0 ; mul24 r0, rb4, ra2.8a
-+/* [0x00000680] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+/* [0x00000688] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
-+/* [0x00000690] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8
-+/* [0x00000698] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9
-+/* [0x000006a0] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
-+/* [0x000006a8] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256
-+/* [0x000006b0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256
-+/* [0x000006b8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
-+/* [0x000006c0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1
-+/* [0x000006c8] */ 0x0c9e7280, 0x10020867, // add r1, r1, r2
-+/* [0x000006d0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
-+/* [0x000006d8] */ 0xfffffe60, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000006e0] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15
-+/* [0x000006e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000006f0] */ 0x150e7d80, 0x18020c27, // mov vpm, ra3.8a
-+/* [0x000006f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000700] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb_dma0
-+/* [0x00000708] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb_dma1
-+/* [0x00000710] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest
++/* [0x00000018] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00000020] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
++/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_base, unif
++/* [0x00000030] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
++/* [0x00000038] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift
++/* [0x00000040] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
++/* [0x00000048] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
++/* [0x00000050] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask
++/* [0x00000058] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00000060] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
++/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000078] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch
++/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
++/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num
++/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
++/* [0x000000b0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000000e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
++/* [0x000000f0] */ 0x0c80ff80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif
++/* [0x000000f8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000100] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000110] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000118] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00000120] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000128] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000130] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000138] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000140] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000148] */ 0x15827d80, 0x10020027, // mov ra0, unif
++/* [0x00000150] */ 0x15827d80, 0x10020667, // mov ra_base2, unif
++/* [0x00000158] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00000160] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
++/* [0x00000168] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000170] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000178] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000180] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000188] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x00000190] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x00000198] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000001a0] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
++/* [0x000001a8] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0
++/* [0x000001b0] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y
++// :1
++/* [0x000001b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000001c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000001d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000001d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x000001e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x000001e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000001f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00000200] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
++/* [0x00000208] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000210] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
++/* [0x00000218] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000220] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
++/* [0x00000228] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
++/* [0x00000230] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
++// ::mc_filter_c_p
++/* [0x00000238] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000240] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000248] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00000250] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00000258] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00000260] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00000268] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00000270] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3
++/* [0x00000278] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000280] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000288] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000290] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00000298] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x000002a0] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x000002a8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x000002b0] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x000002b8] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
++/* [0x000002c0] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
++/* [0x000002c8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x000002d0] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000002d8] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
++/* [0x000002e0] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
++/* [0x000002e8] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
++/* [0x000002f0] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
++// :1
++/* [0x000002f8] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
++/* [0x00000300] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++/* [0x00000308] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x00000310] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++/* [0x00000318] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00000320] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00000328] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
++/* [0x00000330] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
++/* [0x00000338] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x00000340] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x00000348] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000350] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000358] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000360] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000368] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
++/* [0x00000370] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000378] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
++/* [0x00000380] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
++/* [0x00000388] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8
++/* [0x00000390] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
++/* [0x00000398] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x000003a0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x000003a8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x000003b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x000003b8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
++/* [0x000003c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x000003c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x000003d0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000003d8] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
++/* [0x000003e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000003e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000003f0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x000003f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00000400] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00000408] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00000410] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00000418] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000420] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b
++/* [0x00000428] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00000430] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00000438] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++// ::mc_filter_c_p_l1
++/* [0x00000440] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000448] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000450] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00000458] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00000460] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00000468] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00000470] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00000478] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3
++/* [0x00000480] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000488] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000490] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000498] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x000004a0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x000004a8] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x000004b0] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x000004b8] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x000004c0] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
++/* [0x000004c8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
++/* [0x000004d0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x000004d8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000004e0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
++/* [0x000004e8] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
++/* [0x000004f0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
++/* [0x000004f8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
++// :1
++/* [0x00000500] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++/* [0x00000508] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++/* [0x00000510] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x00000518] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++/* [0x00000520] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00000528] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00000530] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
++/* [0x00000538] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
++/* [0x00000540] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x00000548] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x00000550] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000558] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000560] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000568] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000570] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
++/* [0x00000578] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000580] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
++/* [0x00000588] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
++/* [0x00000590] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8
++/* [0x00000598] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
++/* [0x000005a0] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x000005a8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x000005b0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x000005b8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x000005c0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
++/* [0x000005c8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x000005d0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x000005d8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000005e0] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
++/* [0x000005e8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000005f0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000005f8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x00000600] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00000608] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00000610] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00000618] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00000620] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000628] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b
++/* [0x00000630] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00000638] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00000640] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++// ::mc_filter_c_b
++/* [0x00000648] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000650] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000658] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
++/* [0x00000660] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++/* [0x00000668] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
++/* [0x00000670] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00000678] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
++/* [0x00000680] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000688] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
++/* [0x00000690] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000698] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000006a0] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
++/* [0x000006a8] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
++/* [0x000006b0] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif
++/* [0x000006b8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x000006c0] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif
++/* [0x000006c8] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
++/* [0x000006d0] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
++/* [0x000006d8] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
++/* [0x000006e0] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif
++/* [0x000006e8] */ 0x110c1dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift
++/* [0x000006f0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
++/* [0x000006f8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
++/* [0x00000700] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a
++/* [0x00000708] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b
++/* [0x00000710] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000718] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
++/* [0x00000720] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c
++/* [0x00000728] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000730] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif
++/* [0x00000738] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0
++/* [0x00000740] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d
++/* [0x00000748] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15
++/* [0x00000750] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif
++// :1
++/* [0x00000758] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
++/* [0x00000760] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
++/* [0x00000768] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00000770] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
++/* [0x00000778] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
++/* [0x00000780] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
++/* [0x00000788] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++/* [0x00000790] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00000798] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x000007a0] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5
++/* [0x000007a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x000007b0] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x000007b8] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x000007c0] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x000007c8] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000007d0] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1
++/* [0x000007d8] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
++/* [0x000007e0] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6
++/* [0x000007e8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
++/* [0x000007f0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7
++/* [0x000007f8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
++/* [0x00000800] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++/* [0x00000808] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00000810] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask
++/* [0x00000818] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
++/* [0x00000820] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000828] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000830] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000838] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000840] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
++/* [0x00000848] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x00000850] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000858] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10
++/* [0x00000860] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
++/* [0x00000868] */ 0x8f0c05f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
++/* [0x00000870] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00000878] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00000880] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8
++/* [0x00000888] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9
++/* [0x00000890] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00000898] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256
++/* [0x000008a0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256
++/* [0x000008a8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
++/* [0x000008b0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1
++/* [0x000008b8] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height
++/* [0x000008c0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x000008c8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000008d0] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
++/* [0x000008d8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000008e0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000008e8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x000008f0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000008f8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00000900] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00000908] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00000910] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000918] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
++/* [0x00000920] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00000928] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00000930] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
 +// ::mc_sync_q0
-+/* [0x00000718] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000720] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000728] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000730] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000738] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000740] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000748] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000750] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000758] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000938] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000948] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000950] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000958] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000960] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000968] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000970] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000978] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q1
-+/* [0x00000760] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000770] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000778] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000780] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000788] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000980] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000990] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000998] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x000009a0] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000009a8] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q2
-+/* [0x00000790] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000798] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000007a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000007a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000007b0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000007b8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
++/* [0x000009b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000009b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000009c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000009c8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x000009d0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000009d8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q3
-+/* [0x000007c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000007c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000007d0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000007d8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000007e0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000007e8] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000009e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000009f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000009f8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a00] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a08] */ 0x009e7000, 0x100009e7, // nop
 +// ::mc_sync_q4
-+/* [0x000007f0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000007f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000800] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000808] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000810] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000818] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000820] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000828] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000830] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a10] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a18] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a20] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a28] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a30] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a38] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a40] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a48] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a50] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q5
-+/* [0x00000838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000850] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000858] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000860] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a58] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a68] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a70] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a78] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a80] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q6
-+/* [0x00000868] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000878] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000880] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000888] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000890] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a88] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a90] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000aa0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000aa8] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ab0] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q7
-+/* [0x00000898] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000008a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000008a8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000008b0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000008b8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000008c0] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00000ab8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000ac8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000ad0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000ad8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop
 +// ::mc_sync_q8
-+/* [0x000008c8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000008d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000008d8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000008e0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000008e8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000008f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000008f8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000900] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000908] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000ae8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000af0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000af8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b00] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b08] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b18] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b20] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b28] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q9
-+/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000920] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000928] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000930] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000938] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b48] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b50] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b58] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q10
-+/* [0x00000940] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000948] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000950] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000958] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000960] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000968] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b78] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b80] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b88] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_sync_q11
-+/* [0x00000970] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000980] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000988] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000990] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000998] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit
-+// ::mc_exit_c
-+/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x000009a8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
-+/* [0x000009b0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x000009b8] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1
-+/* [0x000009c0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x000009c8] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x000009d0] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_interrupt_exit12
-+// ::mc_interrupt_exit12c
-+/* [0x000009d8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x000009e0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
-+/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x000009f0] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1
-+/* [0x000009f8] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a00] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x00000a08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
-+/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00000b90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000ba0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000ba8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000bb0] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000bb8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c_qn
++// ::mc_exit_y_qn
++/* [0x00000bc0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00000bc8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000bd0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00000bd8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00000be0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000be8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000bf0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x00000bf8] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00000c00] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c_q0
++// ::mc_exit_y_q0
++/* [0x00000c08] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00000c10] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00000c20] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00000c28] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000c30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000c38] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000c40] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x00000c48] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
++/* [0x00000c50] */ 0x009e7000, 0x100009e7, // nop
 +// ::mc_setup_y_q0
-+/* [0x00000a18] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000c58] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
 +// ::mc_setup_y_qn
-+/* [0x00000a20] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
-+/* [0x00000a28] */ 0x15827d80, 0x10020267, // mov ra9, unif
-+/* [0x00000a30] */ 0x15827d80, 0x10020067, // mov ra1, unif
-+/* [0x00000a38] */ 0x15827d80, 0x100202e7, // mov ra11, unif
-+/* [0x00000a40] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
-+/* [0x00000a48] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
-+/* [0x00000a50] */ 0x15827d80, 0x100200e7, // mov ra3, unif
-+/* [0x00000a58] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
-+/* [0x00000a60] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1
-+/* [0x00000a68] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
-+/* [0x00000a70] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
-+/* [0x00000a78] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000a80] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch
-+/* [0x00000a88] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
-+/* [0x00000a90] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
-+/* [0x00000a98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000aa0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000aa8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00000ab0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
-+/* [0x00000ab8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
-+/* [0x00000ac0] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00000ac8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000ad0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000ad8] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
-+/* [0x00000ae0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x00000ae8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000af0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000af8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00000b00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000b08] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00000b10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000b18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000b20] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
-+/* [0x00000b28] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
-+/* [0x00000b30] */ 0x95042ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
-+// :y_preload
-+/* [0x00000b38] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x00000b40] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
-+/* [0x00000b48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00000b50] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x00000b58] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
-+/* [0x00000b60] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
-+/* [0x00000b68] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:y_preload
-+/* [0x00000b70] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00000b78] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x00000b80] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
-+/* [0x00000b88] */ 0x0c809dc0, 0xd0021367, // add rb_wt_den_p15, unif, 9
-+/* [0x00000b90] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000b98] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
-+/* [0x00000ba0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000ba8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
-+/* [0x00000bb0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
-+/* [0x00000bb8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000bc0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
-+/* [0x00000bc8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000bd0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000bd8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
-+/* [0x00000be0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000be8] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
-+/* [0x00000bf0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000bf8] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
-+/* [0x00000c00] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
-+/* [0x00000c08] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
-+// :per_block_setup
-+/* [0x00000c10] */ 0x935401f6, 0xd4125815, // max r0, r0, 0 ; mov ra_xshift, ra_xshift_next
-+/* [0x00000c18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000c20] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00000c28] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
-+/* [0x00000c30] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif
-+/* [0x00000c38] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
-+/* [0x00000c40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000c48] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
-+/* [0x00000c50] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
-+/* [0x00000c58] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x00000c60] */ 0x930401f6, 0xd2125813, // max r0, r0, 0 ; mov ra_y2_next, ra1.16a
-+/* [0x00000c68] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
-+/* [0x00000c70] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00000c78] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
-+/* [0x00000c80] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00000c88] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000c90] */ 0x8c9dc07f, 0x10024831, // add r0, r0, r1 ; mov vw_setup, rb_vpm_init
-+/* [0x00000c98] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
-+/* [0x00000ca0] */ 0x0d418f80, 0x14021767, // sub rb_dma1, rb_dma1_base, ra_width
-+/* [0x00000ca8] */ 0x8c405df6, 0xd2025460, // add rb_i_tmu, ra_height, 7 - PREREAD ; mov r0, ra_height
-+/* [0x00000cb0] */ 0x12527180, 0x1c020827, // min r0, r0, ra_k16
-+/* [0x00000cb8] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7
-+/* [0x00000cc0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000cc8] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width
-+/* [0x00000cd0] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
-+/* [0x00000cd8] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif
-+/* [0x00000ce0] */ 0x918101f6, 0xd0045816, // shl.ifz r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00000ce8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3
-+/* [0x00000cf0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
-+/* [0x00000cf8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
-+/* [0x00000d00] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
-+/* [0x00000d08] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
-+/* [0x00000d10] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
-+/* [0x00000d18] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
-+/* [0x00000d20] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
-+/* [0x00000d28] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
-+/* [0x00000d30] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
-+/* [0x00000d38] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
-+/* [0x00000d40] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
-+/* [0x00000d48] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
-+/* [0x00000d50] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
-+/* [0x00000d58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
-+/* [0x00000d60] */ 0x90216387, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, rb_k255
-+/* [0x00000d68] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
-+/* [0x00000d70] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
-+/* [0x00000d78] */ 0x90216387, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, rb_k255
-+/* [0x00000d80] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
-+/* [0x00000d88] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
-+/* [0x00000d90] */ 0x90216387, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, rb_k255
-+/* [0x00000d98] */ 0x954a0dbf, 0x10064597, // mov.ifnz ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif
-+/* [0x00000da0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
-+/* [0x00000da8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
-+/* [0x00000db0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000db8] */ 0x90216387, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, rb_k255
-+/* [0x00000dc0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3
-+/* [0x00000dc8] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif
-+// ::mc_filter
-+/* [0x00000dd0] */ 0xfffffe20, 0xf0f807a7, // brr ra_link, r:per_block_setup
-+/* [0x00000dd8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x00000de0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000de8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00000df0] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1
-+// :yloop
-+/* [0x00000df8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
-+/* [0x00000e00] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+/* [0x00000e08] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x00000e10] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x00000e18] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x00000e20] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00000e28] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
-+/* [0x00000e30] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00000e38] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
-+/* [0x00000e40] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+/* [0x00000e48] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255
-+/* [0x00000e50] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000e58] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0
-+/* [0x00000e60] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+/* [0x00000e68] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+/* [0x00000e70] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+/* [0x00000e78] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00000e80] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x00000e88] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+/* [0x00000e90] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x00000e98] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00000ea0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000ea8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+/* [0x00000eb0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x00000eb8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+/* [0x00000ec0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00000ec8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+/* [0x00000ed0] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x00000ed8] */ 0x8d208bf6, 0xd00269e1, // sub.setf -, r5, 8 ; mov r1, ra8
-+/* [0x00000ee0] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
-+/* [0x00000ee8] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00000ef0] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
-+/* [0x00000ef8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
-+/* [0x00000f00] */ 0x8d9e74c9, 0x100242cb, // sub ra11, r2, r3 ; mov rb11, r1
-+/* [0x00000f08] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
-+/* [0x00000f10] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
-+/* [0x00000f18] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
-+/* [0x00000f20] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
-+/* [0x00000f28] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
-+/* [0x00000f30] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
-+/* [0x00000f38] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
-+/* [0x00000f40] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
-+/* [0x00000f48] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
-+/* [0x00000f50] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
-+/* [0x00000f58] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000f60] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
-+/* [0x00000f68] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb_wt_off
-+/* [0x00000f70] */ 0x914083f6, 0xd2024860, // shl r1, r1, 8 ; mov r0, ra_height
-+/* [0x00000f78] */ 0xfffffe60, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00000f80] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15
-+/* [0x00000f88] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait
-+/* [0x00000f90] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a
-+/* [0x00000f98] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0
-+/* [0x00000fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00000fa8] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0
-+/* [0x00000fb0] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1
-+/* [0x00000fb8] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest
-+/* [0x00000fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00000fc8] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23
-+/* [0x00000fd0] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0
-+/* [0x00000fd8] */ 0xfffffe00, 0xf0f809e7, // brr -, r:yloop
-+/* [0x00000fe0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
-+/* [0x00000fe8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
-+/* [0x00000ff0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
-+// ::mc_filter_b
-+/* [0x00000ff8] */ 0xfffffbf8, 0xf0f807a7, // brr ra_link, r:per_block_setup
-+/* [0x00001000] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x00001008] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00001010] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+// :yloopb
-+/* [0x00001018] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
-+/* [0x00001020] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+/* [0x00001028] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x00001030] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x00001038] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x00001040] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00001048] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
-+/* [0x00001050] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00001058] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
-+/* [0x00001060] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+/* [0x00001068] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255
-+/* [0x00001070] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00001078] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0
-+/* [0x00001080] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+/* [0x00001088] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+/* [0x00001090] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+/* [0x00001098] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+/* [0x000010a0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x000010a8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+/* [0x000010b0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x000010b8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+/* [0x000010c0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x000010c8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+/* [0x000010d0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x000010d8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+/* [0x000010e0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x000010e8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+/* [0x000010f0] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x000010f8] */ 0x8d208bf6, 0xd00269e1, // sub.setf -, r5, 8 ; mov r1, ra8
-+/* [0x00001100] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
-+/* [0x00001108] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001110] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
-+/* [0x00001118] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
-+/* [0x00001120] */ 0x8d9e74c9, 0x100242cb, // sub ra11, r2, r3 ; mov rb11, r1
-+/* [0x00001128] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
-+/* [0x00001130] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
-+/* [0x00001138] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
-+/* [0x00001140] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
-+/* [0x00001148] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
-+/* [0x00001150] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
-+/* [0x00001158] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
-+/* [0x00001160] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
-+/* [0x00001168] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
-+/* [0x00001170] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
-+/* [0x00001178] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00001180] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001188] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
-+/* [0x00001190] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+/* [0x00001198] */ 0x914083f6, 0xd2024860, // shl r1, r1, 8 ; mov r0, ra_height
-+/* [0x000011a0] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x000011a8] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15
-+/* [0x000011b0] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait
-+/* [0x000011b8] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a
-+/* [0x000011c0] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0
-+/* [0x000011c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000011d0] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0
-+/* [0x000011d8] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1
-+/* [0x000011e0] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest
-+/* [0x000011e8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x000011f0] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23
-+/* [0x000011f8] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0
-+/* [0x00001200] */ 0xfffffdf8, 0xf0f809e7, // brr -, r:yloopb
-+/* [0x00001208] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
-+/* [0x00001210] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
-+/* [0x00001218] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++/* [0x00000c60] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00000c68] */ 0x15827d80, 0x10020267, // mov ra9, unif
++/* [0x00000c70] */ 0x15827d80, 0x10020067, // mov ra1, unif
++/* [0x00000c78] */ 0x15827d80, 0x100202e7, // mov ra11, unif
++/* [0x00000c80] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00000c88] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
++/* [0x00000c90] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
++/* [0x00000c98] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask
++/* [0x00000ca0] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00000ca8] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x00000cb0] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1
++/* [0x00000cc0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
++/* [0x00000cc8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00000cd0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000cd8] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch
++/* [0x00000ce0] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
++/* [0x00000ce8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
++/* [0x00000cf0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000cf8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000d00] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000d08] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
++/* [0x00000d10] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
++/* [0x00000d18] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000d20] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000d28] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000d30] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
++/* [0x00000d38] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00000d40] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000d48] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000d50] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000d58] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000d60] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000d68] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000d70] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000d78] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
++/* [0x00000d80] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
++/* [0x00000d88] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
++// :1
++/* [0x00000d90] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000d98] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x00000da0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000da8] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00000db0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x00000db8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x00000dc0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000dc8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000dd0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00000dd8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
++/* [0x00000de0] */ 0x0c80fdc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth
++/* [0x00000de8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000df0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000df8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000e00] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000e08] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00000e10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000e18] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000e20] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000e28] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000e30] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000e38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000e40] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
++/* [0x00000e48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000e50] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
++/* [0x00000e58] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
++/* [0x00000e60] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
++// :per_block_setup_8
++/* [0x00000e68] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00000e70] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000e78] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000e80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000e88] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
++/* [0x00000e90] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
++/* [0x00000e98] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000ea0] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
++/* [0x00000ea8] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
++/* [0x00000eb0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00000eb8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
++/* [0x00000ec0] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
++/* [0x00000ec8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000ed0] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
++/* [0x00000ed8] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
++/* [0x00000ee0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000ee8] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
++/* [0x00000ef0] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
++/* [0x00000ef8] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00000f00] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00000f08] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7
++/* [0x00000f10] */ 0x119c71c0, 0xd0020827, // shl r0, r0, v_dma_h_shift
++/* [0x00000f18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000f20] */ 0x119d01c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift
++/* [0x00000f28] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif
++/* [0x00000f30] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000f38] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255
++/* [0x00000f40] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
++/* [0x00000f48] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
++/* [0x00000f50] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
++/* [0x00000f58] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
++/* [0x00000f60] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
++/* [0x00000f68] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
++/* [0x00000f70] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
++/* [0x00000f78] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
++/* [0x00000f80] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
++/* [0x00000f88] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
++/* [0x00000f90] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
++/* [0x00000f98] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
++/* [0x00000fa0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
++/* [0x00000fa8] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
++/* [0x00000fb0] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3
++/* [0x00000fb8] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
++/* [0x00000fc0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
++/* [0x00000fc8] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3
++/* [0x00000fd0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
++/* [0x00000fd8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
++/* [0x00000fe0] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3
++/* [0x00000fe8] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif
++/* [0x00000ff0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
++/* [0x00000ff8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
++/* [0x00001000] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001008] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3
++/* [0x00001010] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15
++/* [0x00001018] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif
++// ::mc_filter_y_pxx
++/* [0x00001020] */ 0xfffffe28, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x00001028] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00001030] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
++/* [0x00001038] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00001040] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1
++// :1
++/* [0x00001048] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++/* [0x00001050] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++/* [0x00001058] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x00001060] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x00001068] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x00001070] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00001078] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++/* [0x00001080] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00001088] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
++/* [0x00001090] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++/* [0x00001098] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
++/* [0x000010a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++/* [0x000010a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x000010b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x000010b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x000010c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x000010c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x000010d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x000010d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x000010e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000010e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x000010f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x000010f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x00001100] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x00001108] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x00001110] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001118] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x00001120] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00001128] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00001130] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00001138] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001140] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00001148] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00001150] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00001158] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001160] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001168] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001170] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001178] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001180] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001188] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001190] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001198] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000011a0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x000011a8] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height ++/* [0x000011b0] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x000011b8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000011c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x000011c8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000011d0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000011d8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000011e0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000011e8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000011f0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x000011f8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001200] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001208] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x00001210] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001218] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001220] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_bxx ++/* [0x00001228] */ 0xfffffc20, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00001230] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001238] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00001240] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++// :1 ++/* [0x00001248] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00001250] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001258] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001260] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001268] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* 
++/* [0x00001270] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00001278] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++/* [0x00001280] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00001288] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
++/* [0x00001290] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++/* [0x00001298] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
++/* [0x000012a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++/* [0x000012a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x000012b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x000012b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x000012c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x000012c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x000012d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x000012d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x000012e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000012e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x000012f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x000012f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x00001300] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x00001308] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x00001310] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001318] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x00001320] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x00001328] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10
++/* [0x00001330] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
++/* [0x00001338] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001340] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
++/* [0x00001348] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
++/* [0x00001350] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
++/* [0x00001358] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
++/* [0x00001360] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
++/* [0x00001368] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
++/* [0x00001370] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
++/* [0x00001378] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
++/* [0x00001380] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
++/* [0x00001388] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
++/* [0x00001390] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x00001398] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x000013a0] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x000013a8] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
++/* [0x000013b0] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
++/* [0x000013b8] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3
++/* [0x000013c0] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000013c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
++/* [0x000013d0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000013d8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000013e0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x000013e8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000013f0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000013f8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00001400] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001408] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001410] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b
++/* [0x00001418] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00001420] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00001428] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y_p00
-+/* [0x00001220] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x00001228] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next
-+/* [0x00001230] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
-+/* [0x00001238] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00001240] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00001248] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00001250] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
-+/* [0x00001258] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif
-+/* [0x00001260] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
-+/* [0x00001268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00001270] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif
-+/* [0x00001278] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
-+/* [0x00001280] */ 0x0d418f80, 0x14021767, // sub rb_dma1, rb_dma1_base, ra_width
-+/* [0x00001288] */ 0x8d402df6, 0xd2025460, // sub rb_i_tmu, ra_height, PREREAD ; mov r0, ra_height
-+/* [0x00001290] */ 0x12527180, 0x1c020827, // min r0, r0, ra_k16
-+/* [0x00001298] */ 0x8c8001f6, 0xd0025496, // add rb_lcount, r0, 0 ; mov ra_wt_off_mul_l0, unif
-+/* [0x000012a0] */ 0x918071f6, 0xd0024817, // shl r0, r0, 7 ; mov rb_dest, unif
-+/* [0x000012a8] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width
-+/* [0x000012b0] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
-+/* [0x000012b8] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base
-+/* [0x000012c0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3
-+/* [0x000012c8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif
-+// :yloop_p00
-+/* [0x000012d0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
-+/* [0x000012d8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
-+/* [0x000012e0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x000012e8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x000012f0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x000012f8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00001300] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255
-+/* [0x00001308] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
-+/* [0x00001310] */ 0x9140f3f6, 0xd2024860, // shl r1, r1, 15 ; mov r0, ra_height
-+/* [0x00001318] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb_wt_off
-+/* [0x00001320] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:yloop_p00
-+/* [0x00001328] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15
-+/* [0x00001330] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait
-+/* [0x00001338] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a
-+/* [0x00001340] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0
-+/* [0x00001348] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001350] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0
-+/* [0x00001358] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1
-+/* [0x00001360] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest
-+/* [0x00001368] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001370] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23
-+/* [0x00001378] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0
-+/* [0x00001380] */ 0xffffff30, 0xf0f809e7, // brr -, r:yloop_p00
-+/* [0x00001388] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
-+/* [0x00001390] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
-+/* [0x00001398] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++/* [0x00001430] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00001438] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next
++/* [0x00001440] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
++/* [0x00001448] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00001450] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00001458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00001460] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
++/* [0x00001468] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif
++/* [0x00001470] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
++/* [0x00001478] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001480] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif
++/* [0x00001488] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
++/* [0x00001490] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
++/* [0x00001498] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x000014a0] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x000014a8] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
++/* [0x000014b0] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif
++/* [0x000014b8] */ 0x918101f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif
++/* [0x000014c0] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base
++/* [0x000014c8] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3
++/* [0x000014d0] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif
++// :1
++/* [0x000014d8] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
++/* [0x000014e0] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
++/* [0x000014e8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x000014f0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x000014f8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x00001500] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00001508] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x00001510] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x00001518] */ 0x915cf3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height
++/* [0x00001520] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00001528] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001530] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
++/* [0x00001538] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001540] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001548] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x00001550] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001558] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001560] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00001568] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001570] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001578] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b
++/* [0x00001580] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00001588] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00001590] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_filter_y_b00
-+/* [0x000013a0] */ 0xfffff850, 0xf0f807a7, // brr ra_link, r:per_block_setup
-+/* [0x000013a8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x000013b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000013b8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x000013c0] */ 0x00000007, 0xe0020827, // mov r0, 7
-+/* [0x000013c8] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0
-+/* [0x000013d0] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0
-+/* [0x000013d8] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0
-+/* [0x000013e0] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0
-+/* [0x000013e8] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
-+// :yloop_b00
-+/* [0x000013f0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
-+/* [0x000013f8] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+/* [0x00001400] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x00001408] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x00001410] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x00001418] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00001420] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
-+/* [0x00001428] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00001430] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
-+/* [0x00001438] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+/* [0x00001440] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255
-+/* [0x00001448] */ 0x545963c6, 0x12024860, // and r1, r1, rb_k255 ; mul24 r0, r0, ra_wt_mul_l0
-+/* [0x00001450] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
-+/* [0x00001458] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1
-+/* [0x00001460] */ 0x119ce3c0, 0xd0020867, // shl r1, r1, 14
-+/* [0x00001468] */ 0x8c40c3f6, 0x12024860, // add r1, r1, rb_wt_off ; mov r0, ra_height
-+/* [0x00001470] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:yloop_b00
-+/* [0x00001478] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15
-+/* [0x00001480] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait
-+/* [0x00001488] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a
-+/* [0x00001490] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0
-+/* [0x00001498] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000014a0] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0
-+/* [0x000014a8] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1
-+/* [0x000014b0] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest
-+/* [0x000014b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x000014c0] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23
-+/* [0x000014c8] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0
-+/* [0x000014d0] */ 0xffffff00, 0xf0f809e7, // brr -, r:yloop_b00
-+/* [0x000014d8] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
-+/* [0x000014e0] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
-+/* [0x000014e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++/* [0x00001598] */ 0xfffff8b0, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x000015a0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x000015a8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
++/* [0x000015b0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x000015b8] */ 0x00000007, 0xe0020827, // mov r0, 7
++/* [0x000015c0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0
++/* [0x000015c8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0
++/* [0x000015d0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0
++/* [0x000015d8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0
++/* [0x000015e0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++// :1
++/* [0x000015e8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++/* [0x000015f0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++/* [0x000015f8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x00001600] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x00001608] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x00001610] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00001618] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++/* [0x00001620] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00001628] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x00001630] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++/* [0x00001638] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
++/* [0x00001640] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
++/* [0x00001648] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
++/* [0x00001650] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1
++/* [0x00001658] */ 0x915ce3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height
++/* [0x00001660] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00001668] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001670] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
++/* [0x00001678] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001680] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001688] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x00001690] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001698] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000016a0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x000016a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000016b0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000016b8] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b
++/* [0x000016c0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x000016c8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x000016d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++// ::mc_setup_c10_q0
++/* [0x000016d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_c10_qn
++/* [0x000016e0] */ 0x00000001, 0xe0020927, // mov tmurs, 1
++/* [0x000016e8] */ 0x15827d80, 0x10020027, // mov ra0, unif
++/* [0x000016f0] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x000016f8] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
++/* [0x00001700] */ 0x15827d80, 0x10020627, // mov ra_base, unif
++/* [0x00001708] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
++/* [0x00001710] */ 0x119c21c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift
++/* [0x00001718] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
++/* [0x00001720] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
++/* [0x00001728] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask
++/* [0x00001730] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00001738] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
++/* [0x00001740] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00001748] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00001750] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch
++/* [0x00001758] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
++/* [0x00001760] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
++/* [0x00001768] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
++/* [0x00001770] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
++/* [0x00001778] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00001780] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
++/* [0x00001788] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
++/* [0x00001790] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00001798] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
++/* [0x000017a0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x000017a8] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x000017b0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000017b8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x000017c0] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
++/* [0x000017c8] */ 0x0c80df80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif
++/* [0x000017d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x000017d8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
++/* [0x000017e0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
++/* [0x000017e8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
++/* [0x000017f0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x000017f8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
++/* [0x00001800] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00001808] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
++/* [0x00001810] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
++/* [0x00001818] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00001820] */ 0x15827d80, 0x10020027, // mov ra0, unif
++/* [0x00001828] */ 0x15827d80, 0x10020667, // mov ra_base2, unif
++/* [0x00001830] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00001838] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
++/* [0x00001840] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00001848] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00001850] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x00001858] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x00001860] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001868] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
++/* [0x00001870] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0
++/* [0x00001878] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y
++// :1
++/* [0x00001880] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00001888] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x00001890] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00001898] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000018a0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x000018a8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x000018b0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000018b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000018c0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000018c8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
++/* [0x000018d0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000018d8] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
++/* [0x000018e0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000018e8] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
++/* [0x000018f0] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
++/* [0x000018f8] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
++// ::mc_filter_c10_p
++/* [0x00001900] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001908] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001910] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00001918] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00001920] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00001928] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00001930] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00001938] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00001940] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001948] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00001950] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x00001958] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00001960] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00001968] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00001970] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
++/* [0x00001978] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
++/* [0x00001980] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x00001988] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x00001990] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
++/* [0x00001998] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
++/* [0x000019a0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
++/* [0x000019a8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
++// :1
++/* [0x000019b0] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
++/* [0x000019b8] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++/* [0x000019c0] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x000019c8] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++/* [0x000019d0] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x000019d8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x000019e0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
++/* [0x000019e8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
++/* [0x000019f0] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x000019f8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x00001a00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001a08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001a10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001a18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001a20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
++/* [0x00001a28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
++/* [0x00001a30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001a38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
++/* [0x00001a40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8
++/* [0x00001a48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8
++/* [0x00001a50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
++/* [0x00001a58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00001a60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00001a68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x00001a70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x00001a78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
++/* [0x00001a80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x00001a88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00001a90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001a98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
++/* [0x00001aa0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001aa8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001ab0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x00001ab8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001ac0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001ac8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00001ad0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001ad8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001ae0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b
++/* [0x00001ae8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00001af0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00001af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++// ::mc_filter_c10_p_l1
++/* [0x00001b00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001b08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001b10] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00001b18] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00001b20] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00001b28] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00001b30] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00001b38] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00001b40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001b48] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00001b50] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x00001b58] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00001b60] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00001b68] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00001b70] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
++/* [0x00001b78] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
++/* [0x00001b80] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x00001b88] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x00001b90] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
++/* [0x00001b98] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
++/* [0x00001ba0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
++/* [0x00001ba8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
++// :1
++/* [0x00001bb0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++/* [0x00001bb8] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++/* [0x00001bc0] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x00001bc8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++/* [0x00001bd0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00001bd8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00001be0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
++/* [0x00001be8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
++/* [0x00001bf0] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x00001bf8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x00001c00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001c08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001c10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001c18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001c20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
++/* [0x00001c28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
++/* [0x00001c30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001c38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
++/* [0x00001c40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8
++/* [0x00001c48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8
++/* [0x00001c50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
++/* [0x00001c58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00001c60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00001c68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x00001c70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x00001c78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
++/* [0x00001c80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x00001c88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00001c90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001c98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
++/* [0x00001ca0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001ca8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001cb0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x00001cb8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001cc0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001cc8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00001cd0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001cd8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001ce0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b
++/* [0x00001ce8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00001cf0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00001cf8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++// ::mc_filter_c10_b
++/* [0x00001d00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001d08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001d10] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
++/* [0x00001d18] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++/* [0x00001d20] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
++/* [0x00001d28] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00001d30] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
++/* [0x00001d38] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
++/* [0x00001d40] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00001d48] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001d50] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
++/* [0x00001d58] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00001d60] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif
++/* [0x00001d68] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00001d70] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif
++/* [0x00001d78] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
++/* [0x00001d80] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
++/* [0x00001d88] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
++/* [0x00001d90] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif
++/* [0x00001d98] */ 0x110c2dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift
++/* [0x00001da0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
++/* [0x00001da8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
++/* [0x00001db0] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a
++/* [0x00001db8] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b
++/* [0x00001dc0] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
++/* [0x00001dc8] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c
++/* [0x00001dd0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001dd8] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif
++/* [0x00001de0] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0
++/* [0x00001de8] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d
++/* [0x00001df0] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15
++/* [0x00001df8] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif
++// :1
++/* [0x00001e00] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
++/* [0x00001e08] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
++/* [0x00001e10] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00001e18] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
++/* [0x00001e20] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
++/* [0x00001e28] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
++/* [0x00001e30] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++/* [0x00001e38] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00001e40] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x00001e48] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5
++/* [0x00001e50] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++/* [0x00001e58] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001e60] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001e68] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001e70] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001e78] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1
++/* [0x00001e80] */ 0x8d9c64ff, 0xb0024885, // sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
++/* [0x00001e88] */ 0x0f9c25c0, 0xd00200e7, // asr ra3, r2, (v_bit_depth - 8)
++/* [0x00001e90] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6
++/* [0x00001e98] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
++/* [0x00001ea0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7
++/* [0x00001ea8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
++/* [0x00001eb0] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++/* [0x00001eb8] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00001ec0] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask
++/* [0x00001ec8] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
++/* [0x00001ed0] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001ed8] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001ee0] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001ee8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001ef0] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
++/* [0x00001ef8] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x00001f00] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001f08] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10
++/* [0x00001f10] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
++/* [0x00001f18] */ 0x8f0c25f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
++/* [0x00001f20] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00001f28] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00001f30] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8
++/* [0x00001f38] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9
++/* [0x00001f40] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00001f48] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256
++/* [0x00001f50] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256
++/* [0x00001f58] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
++/* [0x00001f60] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1
++/* [0x00001f68] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height
++/* [0x00001f70] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++/* [0x00001f78] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001f80] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
++/* [0x00001f88] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001f90] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001f98] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
++/* [0x00001fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001fa8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001fb0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
++/* [0x00001fb8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001fc8] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b
++/* [0x00001fd0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
++/* [0x00001fd8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
++/* [0x00001fe0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
++// ::mc_sync10_q0
++/* [0x00001fe8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001ff0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001ff8] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002000] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002008] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002010] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002018] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002020] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002028] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q1
++/* [0x00002030] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002038] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002040] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002048] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002050] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002058] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q2
++/* [0x00002060] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002068] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002070] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002078] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002080] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002088] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q3
++/* [0x00002090] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002098] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000020a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000020a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x000020b0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020b8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync10_q4
++/* [0x000020c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000020c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000020d0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020d8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020e0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020e8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000020f0] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020f8] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002100] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q5
++/* [0x00002108] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002110] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002118] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002120] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002128] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002130] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q6
++/* [0x00002138] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002140] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002148] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002150] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002158] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002160] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q7
++/* [0x00002168] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002170] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002178] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002180] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002188] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002190] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync10_q8
++/* [0x00002198] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000021a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000021a8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000021b0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000021b8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000021c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000021c8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000021d0] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
++/* [0x000021d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q9
++/* [0x000021e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000021e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000021f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000021f8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002200] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002208] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q10
++/* [0x00002210] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002218] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002220] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002228] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002230] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002238] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q11
++/* [0x00002240] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002248] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002250] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002258] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002260] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002268] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c10_q0
++// ::mc_exit_y10_q0
++/* [0x00002270] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00002278] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00002280] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00002288] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00002290] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00002298] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000022a0] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000022a8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x000022b0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
++/* [0x000022b8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c10_qn
++// ::mc_exit_y10_qn
++/* [0x000022c0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x000022c8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000022d0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x000022d8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x000022e0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000022e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000022f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x000022f8] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00002300] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_setup_y10_q0
++/* [0x00002308] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_y10_qn
++/* [0x00002310] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00002318] */ 0x15827d80, 0x10020267, // mov ra9, unif
++/* [0x00002320] */ 0x15827d80, 0x10020067, // mov ra1, unif
++/* [0x00002328] */ 0x15827d80, 0x100202e7, // mov ra11, unif
++/* [0x00002330] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00002338] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
++/* [0x00002340] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
++/* [0x00002348] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask
++/* [0x00002350] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00002358] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x00002360] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
++/* [0x00002368] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1
++/* [0x00002370] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift
++/* [0x00002378] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
++/* [0x00002380] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00002388] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00002390] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch
++/* [0x00002398] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
++/* [0x000023a0] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
++/* [0x000023a8] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x000023b0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x000023b8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000023c0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x000023c8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
++/* [0x000023d0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
++/* [0x000023d8] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x000023e0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000023e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x000023f0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
++/* [0x000023f8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00002400] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002408] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00002410] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00002418] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00002420] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00002428] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00002430] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00002438] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00002440] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
++/* [0x00002448] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
++/* [0x00002450] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
++// :1
++/* [0x00002458] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00002460] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x00002468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00002470] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00002478] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x00002480] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x00002488] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00002490] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00002498] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000024a0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
++/* [0x000024a8] */ 0x0c80ddc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth
++/* [0x000024b0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x000024b8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
++/* [0x000024c0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
++/* [0x000024c8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
++/* [0x000024d0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x000024d8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
++/* [0x000024e0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x000024e8] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
++/* [0x000024f0] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
++/* [0x000024f8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00002500] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002508] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
++/* [0x00002510] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002518] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
++/* [0x00002520] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
++/* [0x00002528] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
++// :per_block_setup_10
++/* [0x00002530] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002538] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00002540] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00002548] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00002550] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00002558] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
++/* [0x00002560] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
++/* [0x00002568] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00002570] */
0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x00002578] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x00002580] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00002588] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002590] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x00002598] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x000025a0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x000025a8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x000025b0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x000025b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000025c0] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x000025c8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x000025d0] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x000025d8] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x000025e0] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 ++/* [0x000025e8] */ 0x119c81c0, 0xd0020827, // shl r0, r0, v_dma_h_shift ++/* [0x000025f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000025f8] */ 0x119cf1c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift ++/* [0x00002600] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00002608] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002610] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255 ++/* [0x00002618] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00002620] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d ++/* [0x00002628] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c ++/* [0x00002630] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00002638] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00002640] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00002648] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 ++/* [0x00002650] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d ++/* [0x00002658] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c ++/* [0x00002660] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 ++/* [0x00002668] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d ++/* [0x00002670] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c ++/* [0x00002678] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00002680] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x00002688] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 ++/* [0x00002690] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 ++/* [0x00002698] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026a0] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 ++/* [0x000026a8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x000026b0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026b8] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 ++/* [0x000026c0] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif ++/* [0x000026c8] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 ++/* [0x000026d0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026d8] */ 
0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000026e0] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 ++/* [0x000026e8] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ++/* [0x000026f0] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif ++// ::mc_filter_y10_pxx ++/* [0x000026f8] */ 0xfffffe18, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002700] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002708] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002710] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002718] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++// :1 ++/* [0x00002720] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002728] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002730] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002738] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002740] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002748] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002750] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002758] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002760] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00002768] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002770] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002778] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x00002780] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00002788] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002790] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002798] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000027a0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000027a8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000027b0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000027b8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000027c0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000027c8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000027d0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000027d8] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000027e0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000027e8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000027f0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x000027f8] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, 
r1 << 15 @ "mul_used", 0 ++/* [0x00002800] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00002808] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00002810] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002818] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00002820] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00002828] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00002830] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00002838] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00002840] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00002848] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00002850] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00002858] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00002860] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00002868] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00002870] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00002878] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x00002880] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height ++/* [0x00002888] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x00002890] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002898] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x000028a0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000028a8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000028b0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000028b8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000028c0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000028c8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x000028d0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000028d8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000028e0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x000028e8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x000028f0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x000028f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_p00 ++/* [0x00002900] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002908] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next ++/* [0x00002910] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00002918] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002920] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002928] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002930] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002938] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00002940] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif ++/* [0x00002948] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00002950] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002958] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov 
ra_width_height, unif ++/* [0x00002960] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00002968] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00002970] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00002978] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00002980] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x00002988] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002990] */ 0x9180f1f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif ++/* [0x00002998] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base ++/* [0x000029a0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 ++/* [0x000029a8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif ++// :1 ++/* [0x000029b0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000029b8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x000029c0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000029c8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000029d0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000029d8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000029e0] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000029e8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x000029f0] */ 0x915cd3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x000029f8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002a00] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002a08] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002a10] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002a18] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002a20] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002a28] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002a30] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002a38] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002a40] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002a48] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002a50] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b ++/* [0x00002a58] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002a60] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002a68] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_bxx ++/* [0x00002a70] */ 0xfffffaa0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002a78] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002a80] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002a88] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++// :1 ++/* [0x00002a90] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002a98] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz 
ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002aa0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002aa8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002ab0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002ab8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002ac0] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002ac8] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002ad0] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00002ad8] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002ae0] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002ae8] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x00002af0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00002af8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002b00] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002b08] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00002b10] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00002b18] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00002b20] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00002b28] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00002b30] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00002b38] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00002b40] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00002b48] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00002b50] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00002b58] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00002b60] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00002b68] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002b70] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00002b78] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00002b80] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002b88] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00002b90] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00002b98] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00002ba0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00002ba8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00002bb0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00002bb8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00002bc0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00002bc8] */ 
0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00002bd0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00002bd8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00002be0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00002be8] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00002bf0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 ++/* [0x00002bf8] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x00002c00] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x00002c08] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002c10] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002c18] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002c20] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002c28] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002c30] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002c38] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002c40] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002c48] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002c50] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002c58] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b ++/* [0x00002c60] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002c68] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002c70] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_b00 ++/* [0x00002c78] */ 0xfffff898, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002c80] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002c88] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002c90] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002c98] */ 0x00000007, 0xe0020827, // mov r0, 7 ++/* [0x00002ca0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 ++/* [0x00002ca8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 ++/* [0x00002cb0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++/* [0x00002cb8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 ++/* [0x00002cc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :1 ++/* [0x00002cc8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002cd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002cd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002ce0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002ce8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002cf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002cf8] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002d00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002d08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00002d10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002d18] */ 0x8c656c87, 0x10024f20, // add 
t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002d20] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x00002d28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00002d30] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 ++/* [0x00002d38] */ 0x915cc3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x00002d40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002d48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002d50] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002d58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002d60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002d68] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002d70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002d78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002d80] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002d88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002d90] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002d98] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b ++/* [0x00002da0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002da8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002db0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +// ::mc_end +}; +#ifdef __HIGHC__ @@ -16871,10 +25214,10 @@ index 0000000..f2842b6 +#endif diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h new file mode 100644 -index 0000000..a44bce9 +index 0000000000..82bf380eb4 --- /dev/null +++ b/libavcodec/rpi_shader.h -@@ -0,0 +1,35 @@ +@@ -0,0 +1,63 @@ +#ifndef rpi_shader_H +#define rpi_shader_H + @@ -16883,39 +25226,67 @@ index 0000000..a44bce9 +#define mc_setup_c_q0 (rpi_shader + 0) +#define mc_start (rpi_shader + 0) +#define mc_setup_c_qn (rpi_shader + 2) -+#define mc_filter_uv (rpi_shader + 138) -+#define mc_filter_uv_b0 (rpi_shader + 264) -+#define mc_sync_q0 (rpi_shader + 454) -+#define mc_sync_q1 (rpi_shader + 472) -+#define mc_sync_q2 (rpi_shader + 484) -+#define mc_sync_q3 (rpi_shader + 496) -+#define mc_sync_q4 (rpi_shader + 508) -+#define mc_sync_q5 (rpi_shader + 526) -+#define mc_sync_q6 (rpi_shader + 538) -+#define mc_sync_q7 (rpi_shader + 550) -+#define mc_sync_q8 (rpi_shader + 562) -+#define mc_sync_q9 (rpi_shader + 580) -+#define mc_sync_q10 (rpi_shader + 592) -+#define mc_sync_q11 (rpi_shader + 604) -+#define mc_exit (rpi_shader + 616) -+#define mc_exit_c (rpi_shader + 616) -+#define mc_interrupt_exit12 (rpi_shader + 630) -+#define mc_interrupt_exit12c (rpi_shader + 630) -+#define mc_setup_y_q0 (rpi_shader + 646) -+#define mc_setup_y_qn (rpi_shader + 648) -+#define mc_filter (rpi_shader + 884) -+#define mc_filter_b (rpi_shader + 1022) -+#define mc_filter_y_p00 (rpi_shader + 1160) -+#define mc_filter_y_b00 (rpi_shader + 1256) -+#define mc_end (rpi_shader + 1340) ++#define mc_filter_c_p (rpi_shader + 142) ++#define mc_filter_c_p_l1 (rpi_shader + 272) ++#define mc_filter_c_b (rpi_shader + 402) ++#define mc_sync_q0 (rpi_shader + 590) ++#define mc_sync_q1 (rpi_shader + 608) ++#define mc_sync_q2 (rpi_shader + 620) ++#define mc_sync_q3 (rpi_shader + 632) ++#define mc_sync_q4 (rpi_shader + 644) ++#define mc_sync_q5 (rpi_shader + 662) ++#define mc_sync_q6 (rpi_shader + 
674) ++#define mc_sync_q7 (rpi_shader + 686) ++#define mc_sync_q8 (rpi_shader + 698) ++#define mc_sync_q9 (rpi_shader + 716) ++#define mc_sync_q10 (rpi_shader + 728) ++#define mc_sync_q11 (rpi_shader + 740) ++#define mc_exit_c_qn (rpi_shader + 752) ++#define mc_exit_y_qn (rpi_shader + 752) ++#define mc_exit_c_q0 (rpi_shader + 770) ++#define mc_exit_y_q0 (rpi_shader + 770) ++#define mc_setup_y_q0 (rpi_shader + 790) ++#define mc_setup_y_qn (rpi_shader + 792) ++#define mc_filter_y_pxx (rpi_shader + 1032) ++#define mc_filter_y_bxx (rpi_shader + 1162) ++#define mc_filter_y_p00 (rpi_shader + 1292) ++#define mc_filter_y_b00 (rpi_shader + 1382) ++#define mc_setup_c10_q0 (rpi_shader + 1462) ++#define mc_setup_c10_qn (rpi_shader + 1464) ++#define mc_filter_c10_p (rpi_shader + 1600) ++#define mc_filter_c10_p_l1 (rpi_shader + 1728) ++#define mc_filter_c10_b (rpi_shader + 1856) ++#define mc_sync10_q0 (rpi_shader + 2042) ++#define mc_sync10_q1 (rpi_shader + 2060) ++#define mc_sync10_q2 (rpi_shader + 2072) ++#define mc_sync10_q3 (rpi_shader + 2084) ++#define mc_sync10_q4 (rpi_shader + 2096) ++#define mc_sync10_q5 (rpi_shader + 2114) ++#define mc_sync10_q6 (rpi_shader + 2126) ++#define mc_sync10_q7 (rpi_shader + 2138) ++#define mc_sync10_q8 (rpi_shader + 2150) ++#define mc_sync10_q9 (rpi_shader + 2168) ++#define mc_sync10_q10 (rpi_shader + 2180) ++#define mc_sync10_q11 (rpi_shader + 2192) ++#define mc_exit_c10_q0 (rpi_shader + 2204) ++#define mc_exit_y10_q0 (rpi_shader + 2204) ++#define mc_exit_c10_qn (rpi_shader + 2224) ++#define mc_exit_y10_qn (rpi_shader + 2224) ++#define mc_setup_y10_q0 (rpi_shader + 2242) ++#define mc_setup_y10_qn (rpi_shader + 2244) ++#define mc_filter_y10_pxx (rpi_shader + 2494) ++#define mc_filter_y10_p00 (rpi_shader + 2624) ++#define mc_filter_y10_bxx (rpi_shader + 2716) ++#define mc_filter_y10_b00 (rpi_shader + 2846) ++#define mc_end (rpi_shader + 2926) + +#endif diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm new file mode 100644 -index 0000000..58fd911 +index 0000000000..ba6cc13a95 --- /dev/null +++ b/libavcodec/rpi_shader.qasm -@@ -0,0 +1,1349 @@ +@@ -0,0 +1,1741 @@ + +# The @ "mul_used", 0 annotations that occur by various mul blocks suppress +# the warning that we are using rotation & ra/rb registers. r0..3 can be @@ -16935,8 +25306,22 @@ index 0000000..58fd911 +# However in the current world there seems to be no benefit (and a small +# overhead) in setting this bigger than 2. + -+.set PREREAD, 2 ++.set PREREAD, 4 + ++# Block heights - 8 & 16 are the only numbers we currently support ++ ++.set C_BLK_HEIGHT_8, 16 ++.set C_BLK_HEIGHT_16, 8 ++.set Y_BLK_HEIGHT_8, 16 ++.set Y_BLK_HEIGHT_16, 8 ++ ++# QPU counts - depend on block size ++# If we have a 2-byte format & block_size > 8 then can only afford ++# 8 QPUs ++# These numbers must match the numbers in rpi_shader_cmd.h ++ ++.set N_QPU_8, 12 ++.set N_QPU_16, 12 + +# register allocation +# @@ -16995,7 +25380,13 @@ index 0000000..58fd911 +.set ra_wt_mul_l0, ra22.16a +.set ra_wt_off_l0, ra22.16b + -+# -- free -- ra23 ++# Max pel value (for 8 bit we can get away with sat ops but not 9+) ++# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the ++# 2nd byte but as the source should never be > 3 there 0x3ff should do ++.set ra_blk_height_pmax, ra23 ++.set ra_pmax, ra23.16a ++.set ra_blk_height, ra23.8c ++# -- free -- ra23.8d + +# Loop: src frame base (L0) +.set ra_base, ra24 @@ -17021,9 +25412,9 @@ index 0000000..58fd911 +# C: (elem & 1) == 0 ? 
elem * 2 : (elem + 4) * 2 +.set rb_elem_x, rb2 + -+# rb3 -+# C: Temp (U/V flag) -+# Y: free ++# El Flags ++# After adding to self we have el even/odd on nc/c and lo/hi on nn/n ++.set rb_ef, rb3 + +# rb4-7 +# C-B: L1 H filter out FIFO +# Y:   vertical filter coefficient registers + -- free -- rb21 + -+# Setup: 255 -+.set rb_k255, rb22 ++# Setup: 0xff (8-bit) / 0xffff (9+ bit) ++.set rb_pmask, rb22 + +# Loop: destination address +.set rb_dest, rb23 + +# Setup: vdw_setup_1(dst_pitch) +.set rb_dma1_base, rb24 + +# Setup: pic width - 1 -+# In the case of chroma it is in bytes so 2 * (pic_width_c - 1) ++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width - 1)*4 etc. +.set rb_max_x, rb25 + +# Loop: height<<23 + width<<16 + vdw_setup_0 +# Macros that express this - obviously these can't be overlapped +# so are probably unsuitable for loop code + +.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma + mov r2, qpu_num +.if v_bit_depth <= 8 + # 8 bit version + asr r1, r2, 2 + shl r1, r1, 6 + and r0, r2, 3 + or  r0, r0, r1 + + mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit + add r_vpm, r0, r1  # VPM 8bit storage + + mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later + shl r0, r0, 5 + +.else + # 16 bit version + # Limited to 8 QPUs if blk height > 8 + asr r1, r2, 1 +.if v_blk_height <= 8 + shl r1, r1, 4 +.else + shl r1, r1, 5 +.endif + and r0, r2, 1 + or r0, r0, r1 + + mov r1, vpm_setup(0, 2, h16p(0, 0))   # 2 is stride - stride acts on ADDR + add r_vpm, r0, r1 + + # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into + # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) + mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later + shl r0, r0, 6 +.endif + add r_dma, r0, r1  # DMA out +.endm + + +.macro m_setup_q0 + srel -, 12 +.endm + +# Code start label +::mc_start + +################################################################################ +# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) + +.macro m_setup_c, v_bit_depth + +# Cannot use mul24 on x as x might be -ve, so must use shift +.if v_bit_depth <= 8 +.set v_x_shift, 1 +.set v_pmask, 0xff +.set v_blk_height, C_BLK_HEIGHT_8 +.else +.set v_x_shift, 2 +.set v_pmask, 0xffff +.set v_blk_height, C_BLK_HEIGHT_16 +.endif + + mov tmurs, 1 # No swap TMUs + +# Load first request location + mov ra0, unif # next_x_y + + mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] + shl rb_ef, r0, i_shift30 + + mov ra_base, unif # Store frame c base + +# Read image dimensions + sub r0, unif, 1 # pic c width + shl rb_max_x, r0, v_x_shift # rb_max_x in bytes + sub rb_max_y, unif, 1 # pic c height + +# load constants + mov ra_kff100100, 0xff100100 + mov rb_pmask, v_pmask + mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) + +# get source pitch + mov rb_xpitch, 
unif # stride2 ++ mov rb_pitch, unif # stride1 ++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly ++ add rb_dma1_base, r1, rb_pitch # vdw_setup_1 + + and r0, 1, elem_num + nop ; mul24 r0, r0, 5 ++.if v_bit_depth <= 8 + add rb_elem_x, r0, elem_num ++.else ++ add r0, r0, elem_num ++ add rb_elem_x, r0, r0 ++.endif + +# Compute base address for first and second access +# ra_base ends up with t0s base +# ra_base2 ends up with t1s base + -+ add r0, ra0.16b, ra0.16b # [rb_elem_x delay] ++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] + add r0, r0, rb_elem_x # Add elem no to x to get X for this slice + max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y + min r0, r0, rb_max_x + +# Get shift ++# Shift will always calculate as 0 for 9+ bit ++# Ideally we can optimize the shift out of the code in these cases but for now ++# it is tidier to leave it in ++.if v_bit_depth <= 8 + shl ra_xshift_next, r0, 3 ++.else ++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++.endif + -+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to + ++.if v_bit_depth <= 8 + and r0, r0, -4 ++.endif + sub r1, ra_k0, rb_pitch + and r1, r0, r1 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 + add ra_base, ra_base, r0 + -+ add rb_wt_den_p15, 9, unif # denominator ++ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator + +# Compute part of VPM to use for DMA output -+ m_calc_dma_regs rb_vpm_init, rb_dma0_base ++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? ++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + +# And again for L1, but only worrying about frame2 stuff + @@ -17201,17 +25640,21 @@ index 0000000..58fd911 +# ra_base ends up with t0s base +# ra_base2 ends up with t1s base + -+ add r0, ra0.16b, ra0.16b # Load x ++ shl r0, ra0.16b, v_x_shift + add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset + max r0, r0, 0 + min r0, r0, rb_max_x + -+# Get shift ++# Get shift (already zero if 9+ bit so ignore) ++.if v_bit_depth <= 8 + shl rb_xshift2_next, r0, 3 ++.endif + +# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs + ++.if v_bit_depth <= 8 + and r0, r0, -4 ++.endif + sub r1, ra_k0, rb_pitch + and r1, r0, r1 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch @@ -17222,7 +25665,7 @@ index 0000000..58fd911 +# r0 = ra_y, r2 = ra_y2 + mov r3, PREREAD ; mov r0, ra_y + -+:c_preload ++:1 + sub.setf r3, r3, 1 + max r1, r0, 0 + min r1, r1, rb_max_y @@ -17230,11 +25673,11 @@ index 0000000..58fd911 + add t0s, ra_base, r1 ; mov ra_y, r0 + + max r1, r2, 0 -+ brr.anynz -, r:c_preload ++ brr.anynz -, r:1b + min r1, r1, rb_max_y + add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch + add t1s, ra_base2, r1 ; mov ra_y2, r2 -+# >>> .anynz c_preload ++# >>> .anynz 1b + + mov ra_link, unif # link +# touch registers to keep simulator happy @@ -17245,6 +25688,12 @@ index 0000000..58fd911 + mov ra6, 0 ; mov rb6, 0 + mov ra7, 0 ; mov rb7, 0 +# >>> ra_link ++.endm ++ ++::mc_setup_c_q0 ++ m_setup_q0 ++::mc_setup_c_qn ++ m_setup_c 8 + +################################################################################ + @@ -17252,85 +25701,116 @@ index 0000000..58fd911 + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block -+::mc_filter_uv -+# per-channel shifts were calculated on the *previous* invocation + ++.macro m_filter_c_p, v_tmu, v_bit_depth ++ 
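++# A worked example of the packed constant loaded into ra_blk_height_pmax by
++# m_setup_c above, assuming the .set values in this file: for 8-bit chroma
++# v_blk_height is C_BLK_HEIGHT_8 = 16, so ((1 << 8) - 1) | (16 << 16) =
++# 0x001000ff, and the 10-bit luma setup gets ((1 << 10) - 1) | (8 << 16) =
++# 0x000803ff - the immediate visible in the mc_setup_y10 dump. ra_pmax is
++# then the low 16 bits (the clamp limit) and ra_blk_height is byte 2 (the
++# block height).
++#
++# m_filter_c_p below is expanded twice for 8 bit (m_filter_c_p 0, 8 as
++# ::mc_filter_c_p and m_filter_c_p 1, 8 as ::mc_filter_c_p_l1): v_tmu selects
++# the TMU and the register-file side the kernel runs on, so the L0 and L1
++# single-prediction chroma kernels can share one body, while v_bit_depth
++# selects the 8-bit or 10-bit flavour of the shift and address arithmetic.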
++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_x_mul, 4 ++.set v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_tmu == 0 ++.set vrx_xshift, rb_xshift2 # b side more convenient ++.set vrx_xshift_next, ra_xshift_next ++.set vra_y_next, ra_y_next ++.set vrx_base_next, ra_base_next ++.set vra_y, ra_y ++.set vra_base, ra_base ++.set vr_txs, t0s ++.else ++.set vrx_xshift, ra_xshift # a side more convenient ++.set vrx_xshift_next, rb_xshift2_next ++.set vra_y_next, ra_y2_next ++.set vrx_base_next, rb_base2_next ++.set vra_y, ra_y2 ++.set vra_base, ra_base2 ++.set vr_txs, t1s ++.endif ++ ++# per-channel shifts were calculated on the *previous* invocation +# get base addresses and per-channel shifts for *next* invocation + mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base + ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 ++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs ++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a + ++.if v_bit_depth <= 8 ++ shl vrx_xshift_next, r0, 3 ++ and r0, r0, -4 ++.endif ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced!
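++# The and/xor/mul24 sequence at this point splits the byte x offset into an
++# offset within the current vertical stripe (the -rb_pitch value built in r1
++# acts as the mask, so r0 ^ (r0 & -rb_pitch) is x modulo the stripe width)
++# and the stripe-selecting high bits, which are rescaled by rb_xpitch before
++# being added back onto the frame base. The exact scaling of rb_xpitch
++# (stride2) is taken on trust from the register comments earlier in the file.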
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ add vrx_base_next, r3, r0 ; mov r1, ra_height + +# set up VPM write ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight ++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight + +# ; unpack filter coefficients + ++ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2) ++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register + add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight + ++ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y + ++ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d + ++ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link ++ sub ra3, rb_wt_den_p15, ra_k1 + ++# r5 = 0 (loop counter) ++# ra9 = alias for rb_max_y ++# ra_wt_mul_l0 = weight L0 ++# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19] ++# rb_wt_off = (offset * 2 + 1) << (ra3 - 1) + +# We want (r0r1) +# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ... +# We fetch (after shift) +# C0 : C3 : C1 : C4 : C2 : C5 : ... 
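++# Loop bookkeeping, as far as it can be read from the setup above: a 4-tap
++# vertical filter producing r1 rows needs r1 + 3 source rows, hence
++# rb_lcount = r1 + 3 iterations, and once the counter reaches
++# rb_i_tmu = r1 + 3 - PREREAD the addresses being queued already belong to
++# the next invocation (PREREAD requests are always kept in flight). For the
++# weighting, with v_bit_depth = 8 and a denom uniform of 7, rb_wt_den_p15 =
++# 23 - 8 + 7 = 22 and ra3 = 21, so the weighted sum is shifted down by ra3
++# and clamped to [0, ra_pmax] (255 here, 0x3ff in the 10-bit variants) on
++# its way into the VPM.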
+ -+ mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -+ -+# r5 = 0 (loop counter) -+:uvloop ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment -+ shr r2, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+ shr r1, r2, 8 ; mov.ifnz r3, ra_y -+ add r0, r3, 1 ; mov.ifz ra_base, ra_base_next ++.if v_tmu == 0 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment ++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++.else ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment ++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++.endif + -+ and.setf -, 1, elem_num ; mov ra_y, r0 -+ max r3, r3, ra_k0 ; mov r0, r1 << 15 -+ min r3, r3, ra9 ; mov.ifz r1, r2 << 1 ++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++ min r3, r3, ra9 ; mov.ifnc r0, r2 + -+ mov.ifz r0, r2 ; mul24 r2, r3, rb_pitch -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte -+ -+# ra4 not really needed; this could be a mul24 rather than a mov but current -+# register usage means this wouldn't help -+ mov.setf -, rb3 ; mov ra4, ra5 ++ mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + +# apply horizontal filter +# The filter coeffs for the two halves of this are the same (unlike in the @@ -17338,18 +25818,29 @@ index 0000000..58fd911 +# Also as the two halves are locked together we don't need to separate the 1st +# r0 mul or the last r1 mul as they are vaild for all QPUs + -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 + nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+ sub.setf -, r5, 4 ; mul24 r0, ra0.8d , r1 -+ brr.anyn -, r:uvloop -+ add r2, r2, r3 ; mov ra5, ra6 ++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++ +# V filter =- ra4 * rb8-+ ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift) ++# Have to dup block as we need to move the brr - code is more common than it ++# looks at first glance ++.if v_bit_depth <= 8 ++ brr.anyn -, r:1b ++ add r2, r2, r3 ; mov ra5, ra6 + mov ra6, ra7 ; mul24 r1, ra7, rb10 + sub ra7, r2, r0 ; mul24 r0, ra4, rb8 -+# >>> .anyn uvloop ++.else ++ add r2, r2, r3 ; mov ra5, ra6 ++ brr.anyn -, r:1b ++ mov ra6, ra7 ; mul24 r1, ra7, rb10 ++ sub r2, r2, r0 ; mul24 r0, ra4, rb8 ++ asr ra7, r2, v_bit_depth - 8 ++.endif ++# >>> .anyn 1b + + sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay] + add r1, r1, r0 ; mul24 r0, ra7, rb11 @@ -17357,84 +25848,146 @@ index 0000000..58fd911 + sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 + asr r1, r1, 14 + nop ; mul24 r1, r1, ra_wt_mul_l0 -+ shl r1, r1, 8 ++ shl r1, r1, 8 ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, ra3 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch 
++# >>> .anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDW write pointer ++# >>> 1b ++.endm ++ ++# At 10 bits ++# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 = 0x49ed (15 bits) ++# Worst case -ve after 1st filter = -10 * 0x3ff = -10230 (kept unshifted as a conservative bound) ++# after 2nd (really we can't get this) = (74 * 18925 + 10 * 10230) >> 6 = 23480 = 0x5bb8 (15 bits) ++# (P) ++# * weight (255) = 5987400 = 0x5b5c48 (23 bits) ++# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits) ++# ... should be OK ++# ++# (B) ++# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits) ++# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits) ++# So signed overflow if we sign extend here :-( ++# ++# In practice this doesn't happen (we need a maximal offset and a very unlucky ++# filter). ++# ++# This could be fixed by offsetting the filters such that 
they are unsigned until ++# weight mul and then removing the offset with the weighting offset (I think ++# this should work) or splitting the rounding & offsetting ++ ++::mc_filter_c_p ++ m_filter_c_p 0, 8 ++ ++::mc_filter_c_p_l1 ++ m_filter_c_p 1, 8 + +################################################################################ + -+# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) ++# mc_filter_c_b + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block -+::mc_filter_uv_b0 ++ ++.macro m_filter_c_b, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++.set v_x_mul, (1 << v_x_shift) ++ +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation + mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + -+ and.setf -, elem_num, 1 # Also acts as delay slot for ra2 ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base + -+ add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 # x ; r1=0 ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 + add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base -+ max r0, r0, 0 ; mov ra_xshift, ra_xshift_next -+ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height ++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs + ++.if v_bit_depth <= 8 + shl ra_xshift_next, r0, 3 ++.endif + -+ and r0, r0, -4 ; mov ra0, unif # L0 H filter coeffs -+ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) ++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height -+ add ra_base_next, r3, r0 -+ shl r0, r1, 7 ; mov ra2, unif # ; L0 V filter coeffs ++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B + +# set up VPM write + -+ sub rb_dma1, rb_dma1_base, r2 # Compute vdw_setup1(dst_pitch-width) -+ add rb_i_tmu, r1, 3 - PREREAD -+ add rb_lcount, r1, 3 ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight ++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight + -+ add r0, r0, r2 ; mov ra_wt_mul_l0, unif # ; U weight -+ shl r0, r0, ra_k16 ; mov.ifnz ra_wt_mul_l0, unif # Shift into bits 16 upwards of the vdw_setup0 register ; V weight -+ add rb_dma0, r0, rb_dma0_base ; mov ra3, unif # ; x2_y2 ++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 ++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base ++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, rb_dma0_base ; mov 
ra1, unif # ; H filter coeffs + +# L1 - uniform layout could possibly be optimized + -+ mov ra9, rb_max_y # [ra3 delay] -+ -+ add r0, ra3.16b, ra3.16b ; v8subs r1, r1, r1 # r0=x*2 ; r1=0 -+ add r0, r0, rb_elem_x ; mov ra_y2_next, ra3.16a -+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base -+ max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B -+ min r0, r0, rb_max_x ; mov ra1, unif # H filter coeffs ++ shl r0, ra3.16b, v_x_shift # r0=x*2 ++ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs ++ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight ++ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs ++ min r0, r0, rb_max_x ; mov rb9, ra3.8b + ++.if v_bit_depth <= 8 + shl rb_xshift2_next, r0, 3 ++.endif + -+ and r0, r0, -4 -+ and r1, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight ++ and r1, r0, r1 ; mov rb10, ra3.8c + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov rb8, ra3.8a # Add stripe offsets ; start unpacking filter coeffs ++ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr + add rb_base2_next, r3, r0 + -+ mov ra_wt_off_mul_l1, unif ; mov rb9, ra3.8b # U offset/weight -+ mov.ifnz ra_wt_off_mul_l1, unif ; mov rb10, ra3.8c # V offset/weight -+ -+ mov rb_dest, unif # dst_addr -+ mov r5quad,0 ; mov rb11, ra3.8d ++ mov ra9, rb_max_y ; mov rb11, ra3.8d + shl r1, ra_wt_off_l1, rb_wt_den_p15 + asr rb_wt_off, r1, 9 ; mov ra_link, unif # link + @@ -17448,64 +26001,66 @@ index 0000000..58fd911 +# rb8-rb11 V coeffs L1 +# ra9 rb_max_y alias + -+ mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -+ -+:uvloop_b ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests + sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment + shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next -+ shr r1, r2, 8 ; mov.ifz ra_y_y2, ra_y_y2_next -+ mov rb4, rb5 ; mov.ifz ra_base, ra_base_next ++ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next + add ra_y, 1, ra_y ; mov r3, ra_y + -+ and.setf -, 1, elem_num + max r3, r3, ra_k0 ; mov r0, r1 << 15 -+ min r3, r3, ra9 ; mov.ifz r1, r2 << 1 ++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 + -+ mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch -+ add t0s, ra_base, r3 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + +# L0 H-filter +# H FIFO scrolls are spread all over this loop -+ mov.setf -, rb3 ; mov ra4, ra5 ++ mov rb4, rb5 ; mov ra4, ra5 # ? 
Just moves
+
-+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0
++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+ add r2, r2, r3 ; mul24 r3, ra0.8d, r1
++.if v_bit_depth <= 8
+ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
++.else
++ sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
++ asr ra3, r2, (v_bit_depth - 8)
++.endif
+
+ shr r2, r4, rb_xshift2 ; mov ra5, ra6
-+ shr r1, r2, 8 ; mov r3, ra_y2
++ shr r1, r2, v_v_shift ; mov r3, ra_y2
+ add ra_y2, r3, ra_k1 ; mov rb6, rb7
+
-+ and.setf -, 1, elem_num
+ max r3, r3, ra_k0 ; mov r0, r1 << 15
-+ min r3, r3, ra9 ; mov.ifz r1, r2 << 1
++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+
-+ mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch
-+ add t1s, ra_base2, r3 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte
++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++ add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
+
+# L1 H-filter
-+ mov.setf -, rb3 ; mov rb7, ra3
+
-+ and r1, r1, rb_k255 ; mul24 r3, ra1.8a, r0
++ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
+ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifnz r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
-+ brr.anyn -, r:uvloop_b
+# V filters - start in branch delay slots of H
++# Final asr not needed for 8-bit but we can't (currently) save a whole instruction
+ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++ brr.anyn -, r:1b
+ mov ra6, ra7 ; mul24 r3, ra7, rb10
-+ sub ra7, r2, r0 ; mul24 r0, rb4, ra2.8a
-+# >>> .anyn uvloop_b0
++ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
++ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
++# >>> .anyn 1b
+
-+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay]
+ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+ sub r2, r1, r0 ; mul24 r0, ra4, rb8
+ sub r1, r3, r0 ; mul24 r0, ra5, rb9
@@ -17516,22 +26071,46 @@
+ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
+
+ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9)
-+ add r1, r1, r2
++ add r1, r1, r2 ; mov r3, ra_blk_height
+
+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
+
-+ brr.anyn -, r:uvloop_b
-+ asr ra3.8as, r1, rb_wt_den_p15
-+ mov -, vw_wait
-+ mov vpm, ra3.8a
-+# >>> .anyn uvloop_b
++ brr.anyn -, r:1b
++ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
+
+# DMA out
-+ bra -, ra_link
-+ mov vw_setup, rb_dma0
-+ mov vw_setup, rb_dma1 -+ mov vw_addr, rb_dest -+# >>> ra_link ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_c_b ++ m_filter_c_b 8 + +################################################################################ +# Exit code used by both Luma & Chroma so place between them to avoid I-cache @@ -17570,9 +26149,11 @@ index 0000000..58fd911 +# The code stalled when I had many waiters on a single sem so we have a +# "ripple" of srels to restart. Unsure why, may have been bug, but this works +# and we currently have both the memory & sems to support it. -+.macro m_sync_q, n_qpu -+ mov ra_link, unif -+ mov -, vw_wait ++.macro m_sync_q, n_qpu, n_quads ++# Do not generate code for qpu >= quads * 4 - fns should never be called ++.if n_qpu < n_quads * 4 ++ mov ra_link, unif # Can only branch to an a reg (not r0) ++ mov -, vw_wait # [ra_link delay] + +.set n_sem_sync, n_qpu - (n_qpu % 4) +.set n_sem_in, n_qpu @@ -17581,7 +26162,7 @@ index 0000000..58fd911 +.if n_qpu % 4 == 0 + +.set n_sem_quad_in, 12 + n_qpu / 4 -+.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % 3) ++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) + + sacq -, n_sem_sync + sacq -, n_sem_sync @@ -17601,51 +26182,67 @@ index 0000000..58fd911 + nop +.endif +.endif ++.endif +.endm + ++.set v_quads8, N_QPU_8 / 4 ++ +::mc_sync_q0 -+ m_sync_q 0 ++ m_sync_q 0, v_quads8 +::mc_sync_q1 -+ m_sync_q 1 ++ m_sync_q 1, v_quads8 +::mc_sync_q2 -+ m_sync_q 2 ++ m_sync_q 2, v_quads8 +::mc_sync_q3 -+ m_sync_q 3 ++ m_sync_q 3, v_quads8 +::mc_sync_q4 -+ m_sync_q 4 ++ m_sync_q 4, v_quads8 +::mc_sync_q5 -+ m_sync_q 5 ++ m_sync_q 5, v_quads8 +::mc_sync_q6 -+ m_sync_q 6 ++ m_sync_q 6, v_quads8 +::mc_sync_q7 -+ m_sync_q 7 ++ m_sync_q 7, v_quads8 +::mc_sync_q8 -+ m_sync_q 8 ++ m_sync_q 8, v_quads8 +::mc_sync_q9 -+ m_sync_q 9 ++ m_sync_q 9, v_quads8 +::mc_sync_q10 -+ m_sync_q 10 ++ m_sync_q 10, v_quads8 +::mc_sync_q11 -+ m_sync_q 11 ++ m_sync_q 11, v_quads8 + +# mc_exit() +# Chroma & Luma the same now -+::mc_exit_c -+::mc_exit ++ ++.macro m_exit_qn + m_exit_drain + nop ; nop ; thrend + nop + nop ++# >>> thrend <<< ++.endm ++ ++::mc_exit_c_qn ++::mc_exit_y_qn ++ m_exit_qn ++ ++ + +# mc_interrupt_exit12() -+::mc_interrupt_exit12c -+::mc_interrupt_exit12 ++ ++.macro m_exit_q0 + m_exit_drain + sacq -, 12 + nop ; nop ; thrend + mov interrupt, 1 + nop +# >>> thrend <<< ++.endm ++ ++::mc_exit_c_q0 ++::mc_exit_y_q0 ++ m_exit_q0 + +# LUMA CODE + @@ -17667,9 +26264,20 @@ index 0000000..58fd911 +# uint32_t next_fn; +# } qpu_mc_pred_y_s_t; + -+::mc_setup_y_q0 -+ m_setup_q0 -+::mc_setup_y_qn ++.macro m_setup_y, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_pmask, 0xff ++.set v_blk_height, Y_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 1 ++.set v_pmask, 0xffff ++.set v_blk_height, Y_BLK_HEIGHT_16 ++.endif ++ ++ + # Need to save these because we need to know the frame dimensions before computing texture coordinates + mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y + mov ra9, unif # ref_y_base @@ -17677,18 +26285,27 @@ index 
0000000..58fd911 + mov ra11, unif # ref_y2_base + +# load constants ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 ++ + + mov ra_kff100100, 0xff100100 -+ mov rb_k255, 255 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) + +# Compute part of VPM to use + +# Read image dimensions -+ mov ra3, unif # width_height -+ mov rb_xpitch, unif # stride2 ++ mov ra3, unif # width_height ++ mov rb_xpitch, unif # stride2 ++.if v_x_shift == 0 + sub rb_max_x, ra3.16b, 1 ++.else ++ sub r0, ra3.16b, 1 ++ shl rb_max_x, r0, v_x_shift ++.endif + sub rb_max_y, ra3.16a, 1 -+ mov rb_pitch, unif # stride1 ++ mov rb_pitch, unif # stride1 + +# get destination pitch + mov r1, vdw_setup_1(0) @@ -17696,38 +26313,44 @@ index 0000000..58fd911 + +# Compute base address for first and second access + mov r3, elem_num -+ add r0, ra0.16b, r3 # Load x + elem_num ++ add r0, ra0.16b, r3 # Load x + elem_num ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif + max r0, r0, 0 + min r0, r0, rb_max_x + shl ra_xshift_next, r0, 3 # Compute shifts + -+# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs ++# X is byte offset - we can only load words - mask + + and r0, r0, -4 ; v8subs r2, r2, r2 + sub r2, r2, rb_pitch + and r1, r0, r2 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets ++ add r0, r0, r1 # Add stripe offsets + add ra_base, ra9, r0 + + # r3 still contains elem_num -+ add r0, ra1.16b, r3 # Load x ++ add r0, ra1.16b, r3 # Load x ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif + max r0, r0, 0 + min r0, r0, rb_max_x -+ shl rb_xshift2_next, r0, 3 # Compute shifts ++ shl rb_xshift2_next, r0, 3 # Compute shifts + + # r2 still contains mask + and r0, r0, -4 + and r1, r0, r2 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets ++ add r0, r0, r1 # Add stripe offsets + add ra_base2, ra11, r0 + +# Do preloads + nop ; mov r0, ra0.16a # ; r0 = y + mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 + -+:y_preload ++:1 + sub.setf r3, r3, 1 + max r1, r0, 0 + min r1, r1, rb_max_y @@ -17735,15 +26358,15 @@ index 0000000..58fd911 + add t0s, ra_base, r1 ; mov ra_y, r0 + + max r1, r2, 0 -+ brr.anynz -, r:y_preload ++ brr.anynz -, r:1b + min r1, r1, rb_max_y + add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch + add t1s, ra_base2, r1 ; mov ra_y2, r2 -+# >>> .anynz y_preload ++# >>> .anynz 1b + -+ add rb_wt_den_p15, unif, 9 # weight denom + 6 ++ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom + -+ m_calc_dma_regs rb_vpm_init, rb_dma0_base ++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + + mov ra_link, unif # Next fn + @@ -17754,6 +26377,12 @@ index 0000000..58fd911 + mov ra10, 0 ; mov rb10, 0 + mov ra11, 0 ; mov rb11, 0 +# >>> ra_link ++.endm ++ ++::mc_setup_y_q0 ++ m_setup_q0 ++::mc_setup_y_qn ++ m_setup_y 8 + +################################################################################ +# @@ -17780,48 +26409,73 @@ index 0000000..58fd911 +# } qpu_mc_pred_y_p_t; +# + -+.macro luma_setup -+ brr ra_link, r:per_block_setup -+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] # [ra0 delay] ++.macro m_luma_setup, v_bit_depth ++# Hack - QASM may well have have label pasting but I have no idea how... 
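++# (A hedged C sketch of what this expansion-time dispatch amounts to -
++# names are illustrative, not code from this patch. The branch target is
++# fixed when the macro is expanded, so each instantiation pays no
++# run-time test on the bit depth:
++#
++#   static void luma_setup(void)      /* one copy built per bit depth */
++#   {
++#   #if V_BIT_DEPTH == 8
++#       per_block_setup_8();          /* x in pels == x in bytes */
++#   #else
++#       per_block_setup_10();         /* x doubled: 10-bit pels in 16 bits */
++#   #endif
++#   }
++# )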
++.if v_bit_depth == 8 ++ brr ra_link, r:per_block_setup_8 ++.elif v_bit_depth == 10 ++ brr ra_link, r:per_block_setup_10 ++.endif ++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? ++ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 + add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +.endm + -+:per_block_setup -+ max r0, r0, 0 ; mov ra_xshift, ra_xshift_next ++.macro m_per_block_setup, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next + min r0, r0, rb_max_x + + shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 ; v8subs r2, r2, r2 -+ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base ++ and r0, r0, -4 ++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base + and r1, r0, r2 ; mov ra_y_next, ra0.16a + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y + add ra_base_next, ra_base_next, r0 # [ra1 delay] + + add r0, ra1.16b, r3 # Load x2 -+ max r0, r0, 0 ; mov ra_y2_next, ra1.16a ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a + min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base + shl rb_xshift2_next, r0, 3 # Compute shifts + and r0, r0, -4 ; mov ra_width_height, unif # ; width_height -+ and r1, r0, r2 ++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov vw_setup, rb_vpm_init # Add stripe offsets ; set up VPM write ++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes + add rb_base2_next, rb_base2_next, r0 + -+# get width,height of block (unif load above) -+ sub rb_dma1, rb_dma1_base, ra_width # Compute vdw_setup1(dst_pitch-width) -+ add rb_i_tmu, ra_height, 7 - PREREAD ; mov r0, ra_height -+ min r0, r0, ra_k16 ++# get width,height of block (unif load above), r1 = width * pel_size ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) ++ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height + add rb_lcount, r0, 7 -+ shl r0, r0, 7 -+ add r0, r0, ra_width # Combine width and height of destination area -+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register ++ shl r0, r0, v_dma_h_shift ++ add r0, r0, r1 # Combine width and height of destination area ++ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register + add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets + +# get filter coefficients and discard unused B frame values -+ shl.ifz r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight -+ shl ra8, r0, 3 ++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight ++ shl ra8, r0, 3 ; mov r3, ra_k255 + +# Pack the 1st 4 filter coefs for H & V tightly +# Coeffs are all abs values here as that means mul24 works (no sign extend from .8) @@ -17845,35 +26499,41 @@ index 0000000..58fd911 +# In the 2nd vertical half we use b registers due to using a-side fifo regs + + mov r1,0x3a281100 -+ ror r0, r1, ra8.8d ; mov 
ra_wt_off_mul_l1, unif -+ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, rb_k255 ++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 + + mov r1,0x0a0b0500 # -ve + ror r0, r1, ra8.8d -+ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, rb_k255 ++ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 + + mov r1,0x04040100 + ror r0, r1, ra8.8d -+ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, rb_k255 ++ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 + -+ mov.ifnz ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address ++ mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address + + mov r1,0x01010000 # -ve + ror r0, r1, ra8.8d -+ bra -, ra_link -+ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, rb_k255 + -+ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 ++ bra -, ra_link ++ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 ++ ++ shl r0, ra_wt_off_l0, rb_wt_den_p15 # Offset calc + # For B l1 & L0 offsets should be identical so it doesn't matter which we use + asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val +# >>> branch ra_link + -+# r3 = 0 ++# r5 = 0 +# ra_wt_mul_l1 = weight L1 +# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) +# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1) +# rb_wt_den_p15 = weight denom + 6 + 9 +# rb_wt_mul_l0 = weight L0 ++.endm ++ ++:per_block_setup_8 ++ m_per_block_setup 8 ++ + + +################################################################################ @@ -17881,14 +26541,14 @@ index 0000000..58fd911 +# In a P block, y2_x2 should be y_x+8 +# At this point we have already issued two pairs of texture requests for the current block + -+::mc_filter -+ luma_setup ++.macro m_filter_y_pxx, v_bit_depth ++ m_luma_setup v_bit_depth + + shl ra_wt_mul_l0, ra_wt_mul_l0, 1 + +# r5 = 0 (loop count) + -+:yloop ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests + @@ -17906,45 +26566,39 @@ index 0000000..58fd911 + add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + + max r2, ra_y2, 0 -+ min r2, r2, rb_max_y ++ min r2, r2, rb_max_y ; mov ra7, ra8 + add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 + +# apply horizontal filter -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 + nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ 
"mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + -+ sub.setf -, r5, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 -+ brr.anyn -, r:yloop -+ mov ra9, ra10 ; mov rb9, rb10 ++ sub.setf -, r5, 8 ; mov ra9, ra10 ++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++ brr.anyn -, r:1b ++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b + mov ra10, ra11 ; mov rb10, rb11 -+ sub ra11, r2, r3 ; mov rb11, r1 -+ # >>> .anyn yloop ++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++ # >>> .anyn 1b + + # apply vertical filter and write to VPM -+ -+ nop ; mul24 r0, rb8, ra2.8a -+ nop ; mul24 r1, rb9, ra2.8b + sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c + sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d + add r1, r1, r0 ; mul24 r0, ra8, rb4 @@ -17959,38 +26613,46 @@ index 0000000..58fd911 + sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish + asr r1, r1, 14 + nop ; mul24 r1, r1, ra_wt_mul_l0 -+ add r1, r1, rb_wt_off ++ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop ++ ++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch + -+ shl r1, r1, 8 ; mov r0, ra_height -+ brr.anyn -, r:yloop -+ asr ra3.8as, r1, rb_wt_den_p15 -+ mov r1, ra_k16 ; mov -, vw_wait -+ sub r0, r0, r1 ; mov vpm, ra3.8a +# >>> branch.anyn yloop + -+# If looping again the we consumed 16 height last loop -+ # rb_dma1 (stride) remains constant -+ # rb_i_tmu remains const (based on total height) -+ # recalc rb_dma0, rb_lcount based on new segment height -+ # N.B. 
r3 is loop counter still ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link -+ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0 -+ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride -+ nop ; mov vw_addr, rb_dest # start the VDW ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 +# >>> .anyz ra_link + ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block + add rb_lcount, rb_lcount, r0 -+ shl r0, r2, i_shift23 -+ add rb_dma0, rb_dma0, r0 -+ brr -, r:yloop -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 + mov vw_setup, rb_vpm_init # Reset our VDM write pointer -+# >>> yloop ++# >>> 1b ++.endm ++ ++::mc_filter_y_pxx ++ m_filter_y_pxx 8 + + +################################################################################ @@ -17998,25 +26660,15 @@ index 0000000..58fd911 +# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +# In a P block, only the first half of coefficients contain used information. +# At this point we have already issued two pairs of texture requests for the current block -+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?) -+# Can fill in the coefficients so only -+# Can also assume default weighted prediction for B frames. +# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time? +# Or possibly by taking advantage of symmetry? -+# From 19->7 32bits per command. + -+::mc_filter_b -+ luma_setup ++.macro m_filter_y_bxx, v_bit_depth ++ m_luma_setup v_bit_depth + -+:yloopb -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+# If we knew there was no clipping then this code would get simpler. -+# Perhaps we could add on the pitch and clip using larger values? 
-+ -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 + shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y @@ -18025,44 +26677,39 @@ index 0000000..58fd911 + add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + + max r2, ra_y2, 0 -+ min r2, r2, rb_max_y ++ min r2, r2, rb_max_y ; mov ra7, ra8 + add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 + +# apply horizontal filter -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 + nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + -+ sub.setf -, r5, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 -+ brr.anyn -, r:yloopb -+ mov ra9, ra10 ; mov rb9, rb10 ++ sub.setf -, r5, 8 ; mov ra9, ra10 ++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++ brr.anyn -, r:1b ++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b + mov ra10, ra11 ; mov rb10, rb11 -+ sub ra11, r2, r3 ; mov rb11, r1 -+ # >>> .anyn yloopb ++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++ # >>> .anyn 1b + + # apply vertical filter and write to VPM -+ nop ; mul24 r0, rb8, ra2.8a -+ nop ; mul24 r1, rb9, ra2.8b + sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c + sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d + add r1, r1, r0 ; mul24 r0, ra8, rb4 @@ -18078,37 +26725,44 @@ index 0000000..58fd911 + nop ; mul24 r0, r1, ra_wt_mul_l0 + add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 + -+ add r1, r1, r0 -+ shl r1, r1, 8 ; mov r0, ra_height -+ brr.anyn -, r:yloopb -+ asr ra3.8as, r1, rb_wt_den_p15 -+ mov r1, ra_k16 ; mov -, vw_wait -+ sub r0, r0, r1 ; mov vpm, ra3.8a -+# >>> branch.anyn yloop ++ add r1, r1, r0 ; mov r3, 
ra_blk_height ++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b + -+# If looping again the we consumed 16 height last loop -+ # rb_dma1 (stride) remains constant -+ # rb_i_tmu remains const (based on total height) -+ # recalc rb_dma0, rb_lcount based on new segment height -+ # N.B. r5 is loop counter still ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link -+ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0 -+ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride -+ nop ; mov vw_addr, rb_dest # start the VDW ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 +# >>> .anyz ra_link + ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block + add rb_lcount, rb_lcount, r0 -+ shl r0, r2, i_shift23 -+ add rb_dma0, rb_dma0, r0 -+ brr -, r:yloopb -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 + mov vw_setup, rb_vpm_init # Reset our VDM write pointer -+# >>> yloopb ++# >>> 1b ++.endm ++ ++::mc_filter_y_bxx ++ m_filter_y_bxx 8 + +################################################################################ +# @@ -18121,10 +26775,28 @@ index 0000000..58fd911 +# uint32_t next_fn; +# } qpu_mc_pred_y_p00_t; + -+::mc_filter_y_p00 -+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? 
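++# A rough host-side sketch of filling this command (field names from
++# rpi_shader_cmd.h; the wo1 packing - weight in the top 16 bits,
++# offset * 2 + 1 in the low 16 - is inferred from the "wo[offset] =
++# offset*2+1" note in the shader template, so treat it as an assumption
++# rather than a spec):
++#
++#   qpu_mc_pred_y_p00_t *const c = next_cmd_slot;      /* illustrative */
++#   c->next_src1.x = x; c->next_src1.y = y; c->next_src1.base = src_base;
++#   c->w = pb_w; c->h = pb_h;
++#   c->wo1 = ((uint32_t)weight << 16) | (uint16_t)(offset * 2 + 1);
++#   c->dst_addr = dst;
++#   c->next_fn = s->qpu.y_p00;        /* code address of this kernel */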
++.macro m_filter_y_p00, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++ mov ra0, unif ; mov r3, elem_num # y_x + mov ra_xshift, ra_xshift_next # [ra0 delay] + add r0, ra0.16b, r3 ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif + + max r0, r0, 0 + min r0, r0, rb_max_x @@ -18135,23 +26807,23 @@ index 0000000..58fd911 + and r1, r0, r2 ; mov ra_y_next, ra0.16a + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height -+ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # ; set up VPM write ++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write + +# get width,height of block (unif load above) -+ sub rb_dma1, rb_dma1_base, ra_width # Compute vdw_setup1(dst_pitch-width) -+ sub rb_i_tmu, ra_height, PREREAD ; mov r0, ra_height -+ min r0, r0, ra_k16 -+ add rb_lcount, r0, 0 ; mov ra_wt_off_mul_l0, unif -+ shl r0, r0, 7 ; mov rb_dest, unif # Destination address -+ add r0, r0, ra_width # Combine width and height of destination area -+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register ++# Compute vdw_setup1(dst_pitch-width) ++ shl r1, ra_width, v_x_shift ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset ++ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr + add rb_dma0, r0, rb_dma0_base + + shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 + # For B l1 & L0 offsets should be identical so it doesn't matter which we use + asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link + -+:yloop_p00 ++:1 + sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 + nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 + shr r0, r4, ra_xshift ; mov r3, rb_pitch @@ -18159,48 +26831,55 @@ index 0000000..58fd911 + max r2, ra_y, 0 # y + min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next + add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask + + sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 -+ shl r1, r1, 15 ; mov r0, ra_height -+ add r1, r1, rb_wt_off ++ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 + -+ brr.anyn -, r:yloop_p00 -+ asr ra3.8as, r1, rb_wt_den_p15 -+ mov r1, ra_k16 ; mov -, vw_wait -+ sub r0, r0, r1 ; mov vpm, ra3.8a -+# >>> branch.anyn yloop_p00 ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b + -+# If looping again the we consumed 16 height last loop -+ # rb_dma1 (stride) remains constant -+ # rb_i_tmu remains const (based on total height) -+ # recalc rb_dma0, rb_lcount based on new segment height -+ # N.B. 
r5 is loop counter still ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link -+ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0 -+ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride -+ nop ; mov vw_addr, rb_dest # start the VDW ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 +# >>> .anyz ra_link + ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block + add rb_lcount, rb_lcount, r0 -+ shl r0, r2, i_shift23 -+ add rb_dma0, rb_dma0, r0 -+ brr -, r:yloop_p00 -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 + mov vw_setup, rb_vpm_init # Reset our VDM write pointer -+# >>> yloop_p00 ++# >>> 1b ++.endm ++ ++::mc_filter_y_p00 ++ m_filter_y_p00 8 + +################################################################################ + -+::mc_filter_y_b00 ++.macro m_filter_y_b00, v_bit_depth +# luma setup does a fair bit more than we need calculating filter coeffs +# that we will never use but it saves I-cache to use it (also simple!) -+ luma_setup ++ m_luma_setup v_bit_depth + +# Fix up vals that were expecting a filter (somewhat icky) + mov r0, 7 @@ -18210,7 +26889,7 @@ index 0000000..58fd911 + shl rb_wt_off, rb_wt_off, r0 + nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 + -+:yloop_b00 ++:1 + sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 + shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 + shr r0, r4, ra_xshift ; mov r3, rb_pitch @@ -18223,64 +26902,157 @@ index 0000000..58fd911 + max r2, ra_y2, 0 + min r2, r2, rb_max_y + add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte -+ and r1, r1, rb_k255 ; mul24 r0, r0, ra_wt_mul_l0 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte ++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 + + sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 + add r1, r0, r1 -+ shl r1, r1, 14 -+ add r1, r1, rb_wt_off ; mov r0, ra_height ++ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 + -+ brr.anyn -, r:yloop_b00 -+ asr ra3.8as, r1, rb_wt_den_p15 -+ mov r1, ra_k16 ; mov -, vw_wait -+ sub r0, r0, r1 ; mov vpm, ra3.8a -+# >>> branch.anyn yloop ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b + -+# If looping again the we consumed 16 height last loop -+ # rb_dma1 (stride) remains constant -+ # rb_i_tmu remains const (based on total height) -+ # recalc rb_dma0, rb_lcount based on new segment height -+ # N.B. 
r5 is loop counter still ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link -+ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0 -+ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride -+ nop ; mov vw_addr, rb_dest # start the VDW ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 +# >>> .anyz ra_link + ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block + add rb_lcount, rb_lcount, r0 -+ shl r0, r2, i_shift23 -+ add rb_dma0, rb_dma0, r0 -+ brr -, r:yloop_b00 -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 + mov vw_setup, rb_vpm_init # Reset our VDM write pointer -+# >>> yloopb00 ++# >>> 1b ++.endm ++ ++::mc_filter_y_b00 ++ m_filter_y_b00 8 + +################################################################################ ++################################################################################ ++# 10 BIT ++ ++::mc_setup_c10_q0 ++ m_setup_q0 ++::mc_setup_c10_qn ++ m_setup_c 10 ++ ++::mc_filter_c10_p ++ m_filter_c_p 0, 10 ++ ++::mc_filter_c10_p_l1 ++ m_filter_c_p 1, 10 ++ ++ ++::mc_filter_c10_b ++ m_filter_c_b 10 ++ ++# Even if these fns are the same as for other bit depths we want our own copy ++# to keep the code we are using in a single lump to avoid (direct map) cache ++# thrashing ++.set v_quads10, N_QPU_16 / 4 ++ ++::mc_sync10_q0 ++ m_sync_q 0, v_quads10 ++::mc_sync10_q1 ++ m_sync_q 1, v_quads10 ++::mc_sync10_q2 ++ m_sync_q 2, v_quads10 ++::mc_sync10_q3 ++ m_sync_q 3, v_quads10 ++::mc_sync10_q4 ++ m_sync_q 4, v_quads10 ++::mc_sync10_q5 ++ m_sync_q 5, v_quads10 ++::mc_sync10_q6 ++ m_sync_q 6, v_quads10 ++::mc_sync10_q7 ++ m_sync_q 7, v_quads10 ++::mc_sync10_q8 ++ m_sync_q 8, v_quads10 ++::mc_sync10_q9 ++ m_sync_q 9, v_quads10 ++::mc_sync10_q10 ++ m_sync_q 10, v_quads10 ++::mc_sync10_q11 ++ m_sync_q 11, v_quads10 ++ ++::mc_exit_y10_q0 ++::mc_exit_c10_q0 ++ m_exit_q0 ++ ++::mc_exit_y10_qn ++::mc_exit_c10_qn ++ m_exit_qn ++ ++::mc_setup_y10_q0 ++ m_setup_q0 ++::mc_setup_y10_qn ++ m_setup_y 10 ++ ++:per_block_setup_10 ++ m_per_block_setup 10 ++ ++::mc_filter_y10_pxx ++ m_filter_y_pxx 10 ++ ++::mc_filter_y10_p00 ++ m_filter_y_p00 10 ++ ++::mc_filter_y10_bxx ++ m_filter_y_bxx 10 ++ ++::mc_filter_y10_b00 ++ m_filter_y_b00 10 ++ ++ + +::mc_end +# Do not add code here because mc_end must appear after all other code. diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h new file mode 100644 -index 0000000..838b6bd +index 0000000000..9f8983da52 --- /dev/null +++ b/libavcodec/rpi_shader_cmd.h -@@ -0,0 +1,112 @@ +@@ -0,0 +1,128 @@ +#ifndef RPI_SHADER_CMD_H +#define RPI_SHADER_CMD_H + +#pragma pack(push, 4) + ++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y ++// If mixed then we are just confused and get a lot of warnings.... 
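++// The emulated build stores real host pointers in the command stream so
++// the C shader template can dereference them; the QPU build stores 32-bit
++// VideoCore bus addresses in the same fields. A hedged usage sketch
++// (vc_bus_addr_of() is an assumed helper, not an API in this patch):
++//
++//   #if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
++//       src.base = (const uint8_t *)frame->data[0];  // host pointer
++//   #else
++//       src.base = vc_bus_addr_of(frame->buf[0]);    // bus address
++//   #endif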
++typedef const uint8_t * qpu_mc_src_addr_t; ++typedef uint8_t * qpu_mc_dst_addr_t; ++#else ++typedef uint32_t qpu_mc_src_addr_t; ++typedef uint32_t qpu_mc_dst_addr_t; ++#endif ++ +typedef struct qpu_mc_src_s +{ + int16_t y; + int16_t x; -+ uint32_t base; ++ qpu_mc_src_addr_t base; +} qpu_mc_src_t; + + @@ -18292,7 +27064,7 @@ index 0000000..838b6bd + uint32_t coeffs_y; + uint32_t wo_u; + uint32_t wo_v; -+ uint32_t dst_addr_c; ++ qpu_mc_dst_addr_t dst_addr_c; + uint32_t next_fn; +} qpu_mc_pred_c_p_t; + @@ -18309,7 +27081,7 @@ index 0000000..838b6bd + uint32_t coeffs_y2; + uint32_t wo_u2; + uint32_t wo_v2; -+ uint32_t dst_addr_c; ++ qpu_mc_dst_addr_t dst_addr_c; + uint32_t next_fn; +} qpu_mc_pred_c_b_t; + @@ -18341,7 +27113,7 @@ index 0000000..838b6bd + uint32_t mymx21; + uint32_t wo1; + uint32_t wo2; -+ uint32_t dst_addr; ++ qpu_mc_dst_addr_t dst_addr; + uint32_t next_fn; +} qpu_mc_pred_y_p_t; + @@ -18350,7 +27122,7 @@ index 0000000..838b6bd + uint16_t h; + uint16_t w; + uint32_t wo1; -+ uint32_t dst_addr; ++ qpu_mc_dst_addr_t dst_addr; + uint32_t next_fn; +} qpu_mc_pred_y_p00_t; + @@ -18377,24 +27149,618 @@ index 0000000..838b6bd +typedef union qpu_mc_pred_cmd_u { + qpu_mc_pred_y_t y; + qpu_mc_pred_c_t c; ++ uint32_t data[1]; +} qpu_mc_pred_cmd_t; + ++#define QPU_MC_PRED_N_Y8 12 ++#define QPU_MC_PRED_N_C8 12 ++ ++#define QPU_MC_PRED_N_Y10 12 ++#define QPU_MC_PRED_N_C10 12 ++ +#pragma pack(pop) + +#endif + +diff --git a/libavcodec/rpi_shader_template.c b/libavcodec/rpi_shader_template.c +new file mode 100644 +index 0000000000..2d763f54ef +--- /dev/null ++++ b/libavcodec/rpi_shader_template.c +@@ -0,0 +1,66 @@ ++#ifdef RPI ++ ++#include "hevc.h" ++#include "hevcdec.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "rpi_shader_cmd.h" ++#include "rpi_shader_template.h" ++ ++typedef struct shader_track_s ++{ ++ const union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ const struct qpu_mc_src_s *last_l0; ++ const struct qpu_mc_src_s *last_l1; ++ uint32_t width; // pic_width * PW ++ uint32_t height; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++} shader_track_t; ++ ++static int wtoidx(const unsigned int w) ++{ ++ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++ return pel_weight[w]; ++} ++ ++static const int fctom(uint32_t x) ++{ ++ int rv; ++ // As it happens we can take the 2nd filter term & divide it by 8 ++ // (dropping fractions) to get the fractional move ++ rv = 8 - ((x >> 11) & 0xf); ++ av_assert2(rv >= 0 && rv <= 7); ++ return rv; ++} ++ ++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) ++{ ++ return (x << shl) >> shr; ++} ++ ++static inline int woff_p(HEVCContext *const s, int32_t x) ++{ ++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int woff_b(HEVCContext *const s, int32_t x) ++{ ++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int wweight(int32_t x) ++{ ++ return ext(x, 16, 16); ++} ++ ++ ++#define PW 1 ++#include "rpi_shader_template_fn.h" ++ ++#undef PW ++#define PW 2 ++#include "rpi_shader_template_fn.h" ++ ++#endif ++ +diff --git a/libavcodec/rpi_shader_template.h b/libavcodec/rpi_shader_template.h +new file mode 100644 +index 0000000000..ecf5b8185a +--- /dev/null ++++ b/libavcodec/rpi_shader_template.h +@@ -0,0 +1,24 @@ ++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++ ++#ifdef RPI ++struct HEVCContext; ++struct HEVCRpiInterPredEnv; ++ ++void 
rpi_shader_c8(struct HEVCContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void rpi_shader_c16(struct HEVCContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void rpi_sand_dump8(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++void rpi_sand_dump16(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++#endif ++#endif ++ +diff --git a/libavcodec/rpi_shader_template_fn.h b/libavcodec/rpi_shader_template_fn.h +new file mode 100644 +index 0000000000..b5ac2ceed6 +--- /dev/null ++++ b/libavcodec/rpi_shader_template_fn.h +@@ -0,0 +1,477 @@ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++#define PATCH_STRIDE (16 * PW) ++ ++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { ++ const pixel s = *(const pixel *)src; ++ pixel * d = (pixel *)dst; ++ for (unsigned int j = 0; j < w; j += PW) { ++ *d++ = s; ++ } ++ } ++} ++ ++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride) { ++ memcpy(dst, src, w); ++ } ++} ++ ++static void FUNC(get_patch_y)(const shader_track_t * const st, ++ uint8_t * dst, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > st->width) { ++ if (x >= st->width) ++ x = st->width - PW; ++ dr = (x + w) - st->width; ++ w = st->width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > st->height) { ++ if (y >= st->height) ++ y = st->height - 1; ++ db = (y + h) - st->height; ++ h = st->height - y; ++ } ++ ++ dst += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup ++ if (dl != 0) ++ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); ++ if (dr != 0) ++ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); ++ w += dl + dr; ++ dst -= dl; ++ ++ if (dt != 0) ++ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride); ++ if (db != 0) ++ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); ++} ++ ++ ++ ++static void FUNC(get_patch_c)(const shader_track_t * const st, ++ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ const int width = st->width; ++ const int height = st->height; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > width) { ++ if (x >= width) ++ x = width - PW; ++ dr = (x + w) - 
width; ++ w = width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > height) { ++ if (y >= height) ++ y = height - 1; ++ db = (y + h) - height; ++ h = height - y; ++ } ++ ++ dst_u += dl + dt * dst_stride; ++ dst_v += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup ++ if (dl != 0) ++ { ++ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); ++ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); ++ } ++ if (dr != 0) ++ { ++ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); ++ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); ++ } ++ w += dl + dr; ++ dst_u -= dl; ++ dst_v -= dl; ++ ++ if (dt != 0) ++ { ++ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); ++ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); ++ } ++ if (db != 0) ++ { ++ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); ++ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); ++ } ++} ++ ++// w, y, w, h in pixels ++// stride1, stride2 in bytes ++void FUNC(rpi_sand_dump)(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c) ++{ ++ const int mask = stride2 == 0 ? ~0 : stride1 - 1; ++ ++ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h); ++ ++ if (is_c) { ++ x *= 2; ++ w *= 2; ++ } ++ ++ for (int i = y; i != y + h; ++i) { ++ for (int j = x; j != x + w; ++j) { ++ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; ++ char sep = is_c && (j & 1) == 0 ? ':' : ' '; ++#if PW == 1 ++ if (j < 0 || i < 0) ++ printf("..%c", sep); ++ else ++ printf("%02x%c", *(const pixel*)p, sep); ++#else ++ if (j < 0 || i < 0) ++ printf("...%c", sep); ++ else ++ printf("%03x%c", *(const pixel*)p, sep); ++#endif ++ } ++ printf("\n"); ++ } ++} ++ ++ ++void FUNC(rpi_shader_c)(HEVCContext *const s, ++ const HEVCRpiInterPredEnv *const ipe_y, ++ const HEVCRpiInterPredEnv *const ipe_c) ++{ ++ for (int c_idx = 0; c_idx < 2; ++c_idx) ++ { ++ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c; ++ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; ++ unsigned int exit_n = 0; ++ ++ if (ipe == NULL || !ipe->used) { ++ continue; ++ } ++ ++ do { ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ const HEVCRpiInterPredQ * const q = ipe->q + i; ++ shader_track_t * const st = tracka + i; ++ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; ++ ++ for (;;) { ++ const uint32_t link = (cmd == q->qpu_mc_base) ? 
q->code_setup : ((uint32_t *)cmd)[-1]; ++ ++ if (link == q->code_setup) { ++ if (c_idx == 0) { ++ // Luma ++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; ++ ++ st->height = c->pic_h; ++ st->width = c->pic_w * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->wdenom = c->wdenom; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else { ++ // Chroma ++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; ++ ++ st->height = c->pic_ch; ++ st->width = c->pic_cw * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->wdenom = c->wdenom; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ } ++ else if (link == s->qpu.y_pxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ const int w1 = FFMIN(c->w, 8); ++ const int w2 = c->w - w1; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ if (w2 > 0) { ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ } ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); ++ if (w2 > 0) { ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); ++ } ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_bxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, ++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_p00) { ++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( ++ 
(uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); ++ ++ st->last_l0 = &c->next_src1; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_b00) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ av_assert0(c->w <= 16 && c->h <= 64); ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( ++ patch_y3, patch_y1, PATCH_STRIDE, ++ c->h, 0, 0, c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, ++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), 0, 0, c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx_l1) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l1 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_bxx) { ++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; ++ const int mx1 = fctom(c->coeffs_x1); ++ 
const int my1 = fctom(c->coeffs_y1); ++ const int mx2 = fctom(c->coeffs_x2); ++ const int my2 = fctom(c->coeffs_y2); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; ++ uint8_t patch_v1[PATCH_STRIDE * 72]; ++ uint8_t patch_u2[PATCH_STRIDE * 72]; ++ uint8_t patch_v2[PATCH_STRIDE * 72]; ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, ++ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2), ++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, ++ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2), ++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == q->code_sync) { ++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); ++ break; ++ } ++ else if (link == q->code_exit) { ++ // We expect exit to occur without other sync ++ av_assert0(i == exit_n); ++ ++exit_n; ++ break; ++ } ++ else { ++ av_assert0(0); ++ } ++ } ++ ++ st->qpu_mc_curr = cmd; ++ } ++ } while (exit_n == 0); ++ } ++} ++ ++#undef FUNC ++#undef pixel ++ diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c new file mode 100644 -index 0000000..b061fe0 +index 0000000000..b502de0a2c --- /dev/null +++ b/libavcodec/rpi_zc.c -@@ -0,0 +1,581 @@ +@@ -0,0 +1,745 @@ +#include "config.h" +#ifdef RPI ++#include "libavcodec/avcodec.h" +#include "rpi_qpu.h" +#include "rpi_mailbox.h" +#include "rpi_zc.h" +#include "libavutil/avassert.h" ++#include "libavutil/rpi_sand_fns.h" +#include + +#include "libavutil/buffer_internal.h" @@ -18421,21 +27787,11 @@ index 0000000..b061fe0 + struct ZcPool * pool; +} ZcPoolEnt; + -+#if 1 -+//#define ALLOC_PAD 0x1000 -+#define ALLOC_PAD 0 -+#define ALLOC_ROUND 0x1000 -+//#define ALLOC_N_OFFSET 0x100 -+#define ALLOC_N_OFFSET 0 -+#define STRIDE_ROUND 0x80 -+#define STRIDE_OR 0x80 -+#else +#define ALLOC_PAD 0 +#define ALLOC_ROUND 0x1000 +#define ALLOC_N_OFFSET 0 -+#define STRIDE_ROUND 32 ++#define STRIDE_ROUND 64 +#define STRIDE_OR 0 -+#endif + +#define DEBUG_ZAP0_BUFFERS 0 + @@ -18612,13 +27968,22 @@ index 0000000..b061fe0 + { + case AV_PIX_FMT_YUV420P: + geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; -+ // geo.stride_y = ((video_width + 32 + 31) & ~31); + geo.stride_c = geo.stride_y / 2; -+ // geo.height_y = (video_height + 15) & ~15; + geo.height_y = (video_height + 32 + 31) & ~31; + geo.height_c = geo.height_y / 2; + geo.planes_c = 2; + geo.stripes = 1; ++ geo.bytes_per_pel = 1; ++ break; ++ ++ case AV_PIX_FMT_YUV420P10: ++ geo.stride_y = ((video_width * 
2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++ geo.stride_c = geo.stride_y / 2; ++ geo.height_y = (video_height + 32 + 31) & ~31; ++ geo.height_c = geo.height_y / 2; ++ geo.planes_c = 2; ++ geo.stripes = 1; ++ geo.bytes_per_pel = 2; + break; + + case AV_PIX_FMT_SAND128: @@ -18653,6 +28018,7 @@ index 0000000..b061fe0 + geo.height_c = img.pitch / stripe_w - geo.height_y; + geo.planes_c = 1; + geo.stripes = (video_width + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 1; + + pthread_mutex_unlock(&sand_lock); + @@ -18661,6 +28027,45 @@ index 0000000..b061fe0 + break; + } + ++ case AV_PIX_FMT_SAND64_16: ++ case AV_PIX_FMT_SAND64_10: ++ { ++ const unsigned int stripe_w = 128; // bytes ++ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV_UV_16, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ gpu_ref(); ++ mbox_get_image_params(gpu_get_mailbox(), &new_img); ++ gpu_unref(); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.planes_c = 1; ++ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 2; ++ ++ pthread_mutex_unlock(&sand_lock); ++ break; ++ } ++ + default: + memset(&geo, 0, sizeof(geo)); + break; @@ -18733,8 +28138,12 @@ index 0000000..b061fe0 + frame->linesize[0] = geo.stride_y; + frame->linesize[1] = geo.stride_c; + frame->linesize[2] = geo.stride_c; ++ // abuse: linesize[3] = "stripe stride" ++ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). 
++ // In a general case this makes the calculation an xor and multiply rather ++ // than a divide and multiply + if (geo.stripes > 1) -+ frame->linesize[3] = geo.height_y + geo.height_c; // abuse: linesize[3] = stripe stride ++ frame->linesize[3] = geo.height_y + geo.height_c; + + frame->data[0] = buf->data; + frame->data[1] = frame->data[0] + size_y; @@ -18744,6 +28153,11 @@ index 0000000..b061fe0 + frame->extended_data = frame->data; + // Leave extended buf alone + ++#if RPI_ZC_SAND_8_IN_10_BUF != 0 ++ // *** If we intend to use this for real we will want a 2nd buffer pool ++ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = rpi_buf_pool_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge ++#endif ++ + return 0; +} + @@ -18762,7 +28176,7 @@ index 0000000..b061fe0 + rv = avcodec_default_get_buffer2(s, frame, flags); + } + else if (frame->format == AV_PIX_FMT_YUV420P || -+ frame->format == AV_PIX_FMT_SAND128) ++ av_rpi_is_sand_frame(frame)) + { + rv = rpi_get_display_buffer(s->get_buffer_context, frame); + } @@ -18792,6 +28206,7 @@ index 0000000..b061fe0 + unsigned int i; + uint8_t * psrc, * pdest; + ++ dest->format = src->format; + dest->width = src->width; + dest->height = src->height; + @@ -18823,29 +28238,142 @@ index 0000000..b061fe0 +} + + ++static AVBufferRef * zc_420p10_to_sand128(struct AVCodecContext * const s, ++ const AVFrame * const src) ++{ ++ AVFrame dest_frame; ++ AVFrame * const dest = &dest_frame; ++ unsigned int i; ++ uint8_t * psrc, * psrc2, * pdest; ++ ++ memset(dest, 0, sizeof(*dest)); ++ dest->format = AV_PIX_FMT_SAND128; ++ dest->width = src->width; ++ dest->height = src->height; ++ ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) ++ { ++ return NULL; ++ } ++ ++ // Y ++ for (i = 0, psrc = src->data[0], pdest = dest->data[0]; ++ i != dest->height; ++ ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) ++ { ++ uint16_t * s = (uint16_t*)psrc; ++ uint8_t * d = pdest; ++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[0]) ++ { ++ const unsigned int n = FFMIN(dest->linesize[0], dest->width - k); ++ for (unsigned int j = 0; j != n; ++j) ++ *d++ = (uint8_t)(*s++ >> 2); ++ d += (dest->linesize[3] - 1) * dest->linesize[0]; ++ } ++ } ++ ++ // C ++ for (i = 0, psrc = src->data[1], psrc2 = src->data[2], pdest = dest->data[1]; ++ i != dest->height / 2; ++ ++i, psrc += src->linesize[1], psrc2 += src->linesize[2], pdest += dest->linesize[1]) ++ { ++ const uint16_t * su = (uint16_t*)psrc; ++ const uint16_t * sv = (uint16_t*)psrc2; ++ uint8_t * d = pdest; ++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[1]) ++ { ++ const unsigned int n = FFMIN(dest->linesize[1], dest->width - k) / 2; ++ for (unsigned int j = 0; j != n; ++j) ++ { ++ *d++ = (uint8_t)(*su++ >> 2); ++ *d++ = (uint8_t)(*sv++ >> 2); ++ } ++ d += (dest->linesize[3] - 1) * dest->linesize[1]; ++ } ++ } ++ ++ return dest->buf[0]; ++} ++ ++ ++static AVBufferRef * zc_sand64_16_to_sand128(struct AVCodecContext * const s, ++ const AVFrame * const src, const unsigned int src_bits) ++{ ++ AVFrame dest_frame = { ++ .format = AV_PIX_FMT_SAND128, ++ .width = src->width, ++ .height = src->height ++ }; ++ AVFrame * const dest = &dest_frame; ++ const unsigned int shr = src_bits - 8; ++ ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) ++ { ++ return NULL; ++ } ++ ++ // Y ++ av_rpi_sand16_to_sand8(dest->data[0], dest->linesize[0], av_rpi_sand_frame_stride2(dest), ++ src->data[0], src->linesize[0], av_rpi_sand_frame_stride2(dest), ++ src->width, src->height, shr); ++ // C ++ 
av_rpi_sand16_to_sand8(dest->data[1], dest->linesize[1], av_rpi_sand_frame_stride2(dest), ++ src->data[1], src->linesize[1], av_rpi_sand_frame_stride2(dest), ++ src->width, src->height / 2, shr); ++ ++ return dest->buf[0]; ++} ++ ++ ++ +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, -+ const AVFrame * const frame, const int maycopy) ++ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy) +{ + assert(s != NULL); + + if (frame->format != AV_PIX_FMT_YUV420P && -+ frame->format != AV_PIX_FMT_SAND128) ++ frame->format != AV_PIX_FMT_YUV420P10 && ++ !av_rpi_is_sand_frame(frame)) + { + av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); + return NULL; + } + -+ if (frame->buf[1] != NULL) ++ if (frame->buf[1] != NULL || frame->format != expected_format) + { -+ av_assert0(frame->format == AV_PIX_FMT_YUV420P); ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL) ++ { ++// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__); ++ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]); ++ } ++#endif ++ + if (maycopy) + { -+ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); -+ return zc_copy(s, frame); ++ if (frame->buf[1] != NULL) ++ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); ++ else ++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format); ++ ++ switch (frame->format) ++ { ++ case AV_PIX_FMT_YUV420P10: ++ return zc_420p10_to_sand128(s, frame); ++ ++ case AV_PIX_FMT_SAND64_10: ++ return zc_sand64_16_to_sand128(s, frame, 10); ++ ++ default: ++ return zc_copy(s, frame); ++ } + } + else + { -+ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: NULL\n", __func__); ++ if (frame->buf[1] != NULL) ++ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__); ++ else ++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format); + return NULL; + } + } @@ -18972,10 +28500,10 @@ index 0000000..b061fe0 + diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h new file mode 100644 -index 0000000..f4aeb78 +index 0000000000..26fb3be999 --- /dev/null +++ b/libavcodec/rpi_zc.h -@@ -0,0 +1,137 @@ +@@ -0,0 +1,105 @@ +#ifndef LIBAVCODEC_RPI_ZC_H +#define LIBAVCODEC_RPI_ZC_H + @@ -18986,23 +28514,33 @@ index 0000000..f4aeb78 +// bit of memory for the frame when can then be reference counted until +// display has finished with it. 
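The conversion fallbacks above mean a caller only ever has to deal with one pattern: request a single reference in the format the display pipeline expects, and let av_rpi_zc_ref() copy or down-convert when the decoder handed over something else. A minimal calling sketch follows; display_submit() is a hypothetical sink standing in for the real display path (which is not part of this patch), av_rpi_zc_unref() is assumed to be the matching release call declared later in this header, and error handling is trimmed:

    // Hedged sketch, not patch code: display_submit() is an assumed
    // placeholder for whatever hands the GPU buffer to the renderer.
    static int send_frame_to_display(struct AVCodecContext * const avctx,
                                     const struct AVFrame * const frame)
    {
        // One ref covers the whole picture; maycopy = 1 enables the
        // 420P10 -> SAND128 and SAND64_10 -> SAND128 fallbacks above.
        AVRpiZcRefPtr ref = av_rpi_zc_ref(avctx, frame, AV_PIX_FMT_SAND128, 1);
        if (ref == NULL)
            return -1;

        // The sink must keep the buffer referenced until the hardware has
        // finished displaying it, then drop it with av_rpi_zc_unref(ref).
        display_submit(av_rpi_zc_vc_handle(ref), ref);
        return 0;
    }
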
+ -+#include "libavutil/frame.h" -+#include "libavcodec/avcodec.h" ++// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame ++// 0 disables ++// *** This option still in development ++// Only works if SAO active ++// Allocates buffers that are twice the required size ++#define RPI_ZC_SAND_8_IN_10_BUF 0 ++ ++struct AVBufferRef; ++struct AVFrame; ++struct AVCodecContext; ++enum AVPixelFormat; + +// "Opaque" pointer to whatever we are using as a buffer reference -+typedef AVBufferRef * AVRpiZcRefPtr; ++typedef struct AVBufferRef * AVRpiZcRefPtr; + +struct AVZcEnv; +typedef struct AVZcEnv * AVZcEnvPtr; + +typedef struct AVRpiZcFrameGeometry +{ -+ unsigned int stride_y; -+ unsigned int height_y; -+ unsigned int stride_c; -+ unsigned int height_c; -+ unsigned int planes_c; -+ unsigned int stripes; ++ unsigned int stride_y; // Luma stride (bytes) ++ unsigned int height_y; // Luma height (lines) ++ unsigned int stride_c; // Chroma stride (bytes) ++ unsigned int height_c; // Chroma stride (lines) ++ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1) ++ unsigned int stripes; // Number of stripes (sand) ++ unsigned int bytes_per_pel; +} AVRpiZcFrameGeometry; + + @@ -19028,7 +28566,7 @@ index 0000000..f4aeb78 +// the data, then allocate a new buffer and copy the data into it +// Otherwise return NULL +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, -+ const AVFrame * const frame, const int maycopy); ++ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy); + +// Get the vc_handle from the frame ref +// Returns -1 if ref doesn't look valid @@ -19069,52 +28607,10 @@ index 0000000..f4aeb78 + + + -+static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame) -+{ -+ return frame->linesize[3]; -+} -+ -+static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ const unsigned int stride1 = frame->linesize[0]; -+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y + stride2 * x2; -+} -+ -+static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) -+{ -+ const unsigned int stride1 = frame->linesize[0]; -+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); -+ const unsigned int x = x_c * 2; -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y_c + stride2 * x2; -+} -+ -+static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y); -+} -+ -+static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y); -+} -+ -+static inline int rpi_sliced_frame(const AVFrame * const frame) -+{ -+ return frame->format == AV_PIX_FMT_SAND128; -+} -+ -+ +#endif + diff --git a/libavcodec/utils.c b/libavcodec/utils.c -index 0c68836..b8139f5 100644 +index 9363026695..8a8b13f0df 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -26,6 +26,12 @@ @@ -19130,7 +28626,15 @@ index 0c68836..b8139f5 100644 #include "libavutil/atomic.h" #include "libavutil/attributes.h" #include "libavutil/avassert.h" -@@ -64,6 +70,10 @@ +@@ -39,6 +45,7 @@ + #include 
"libavutil/mathematics.h" + #include "libavutil/mem_internal.h" + #include "libavutil/pixdesc.h" ++#include "libavutil/rpi_sand_fns.h" + #include "libavutil/imgutils.h" + #include "libavutil/samplefmt.h" + #include "libavutil/dict.h" +@@ -64,6 +71,10 @@ #include "libavutil/ffversion.h" const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION; @@ -19141,7 +28645,7 @@ index 0c68836..b8139f5 100644 #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS static int default_lockmgr_cb(void **arg, enum AVLockOp op) { -@@ -508,6 +518,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, +@@ -508,6 +519,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, return ret; } @@ -19189,7 +28693,7 @@ index 0c68836..b8139f5 100644 static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) { FramePool *pool = avctx->internal->pool; -@@ -555,6 +606,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) +@@ -555,6 +607,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) av_buffer_pool_uninit(&pool->pools[i]); pool->linesize[i] = linesize[i]; if (size[i]) { @@ -19204,20 +28708,20 @@ index 0c68836..b8139f5 100644 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1, CONFIG_MEMORY_POISONING ? NULL : -@@ -729,6 +788,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags +@@ -729,6 +789,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags { int ret; +#ifdef RPI + // This is going to end badly if we let it continue -+ av_assert0(frame->format != AV_PIX_FMT_SAND128); ++ av_assert0(!av_rpi_is_sand_frame(frame)); +#endif + if (avctx->hw_frames_ctx) return av_hwframe_get_buffer(avctx->hw_frames_ctx, frame, 0); diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c -index ecfb872..5fa099f 100644 +index 7d306a5c33..20eeda97aa 100644 --- a/libavfilter/avfilter.c +++ b/libavfilter/avfilter.c @@ -969,6 +969,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args) @@ -19229,7 +28733,7 @@ index ecfb872..5fa099f 100644 #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR if ( !strcmp(filter->filter->name, "format") || diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c -index 3eff152..30dfb14 100644 +index 3eff1522bd..30dfb14946 100644 --- a/libavformat/mpegts.c +++ b/libavformat/mpegts.c @@ -701,7 +701,7 @@ static const StreamType ISO_types[] = { @@ -19242,7 +28746,7 @@ index 3eff152..30dfb14 100644 { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, diff --git a/libavformat/utils.c b/libavformat/utils.c -index a82bbc7..4bf5574 100644 +index ff55fc8d97..c233f57bbd 100644 --- a/libavformat/utils.c +++ b/libavformat/utils.c @@ -748,7 +748,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in @@ -19254,8 +28758,84 @@ index a82bbc7..4bf5574 100644 continue; s->streams[i]->pts_wrap_reference = pts_wrap_reference; s->streams[i]->pts_wrap_behavior = pts_wrap_behavior; +diff --git a/libavutil/Makefile b/libavutil/Makefile +index 15d95dec67..3be954257b 100644 +--- a/libavutil/Makefile ++++ b/libavutil/Makefile +@@ -60,6 +60,8 @@ HEADERS = adler32.h \ + rational.h \ + replaygain.h \ + ripemd.h \ ++ rpi_sand_fns.h \ ++ rpi_sand_fn_pw.h \ + samplefmt.h \ + sha.h \ + sha512.h \ +@@ -138,6 +140,7 @@ OBJS = adler32.o \ + reverse.o \ + rc4.o \ + ripemd.o \ ++ rpi_sand_fns.o \ + samplefmt.o \ + sha.o \ + sha512.o \ +diff --git a/libavutil/arm/Makefile 
b/libavutil/arm/Makefile +index 5da44b0542..b74b7c4e2f 100644 +--- a/libavutil/arm/Makefile ++++ b/libavutil/arm/Makefile +@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ + + NEON-OBJS += arm/float_dsp_init_neon.o \ + arm/float_dsp_neon.o \ ++ arm/rpi_sand_neon.o \ +diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S +new file mode 100644 +index 0000000000..dbffdaefa4 +--- /dev/null ++++ b/libavutil/arm/rpi_sand_neon.S +@@ -0,0 +1,40 @@ ++#include "libavutil/arm/asm.S" ++ ++@ void rpi_sand128b_stripe_to_8_10( ++@ uint8_t * dest, [r0] ++@ const uint8_t * src1, [r1] ++@ const uint8_t * src2, [r2] ++@ unsigned int lines); [r3] ++ ++.macro stripe2_to_8, bit_depth ++ vpush {q4-q7} ++1: ++ vldm r1!, {q0-q7} ++ subs r3, #1 ++ vldm r2!, {q8-q15} ++ vqrshrn.u16 d0, q0, #\bit_depth - 8 ++ vqrshrn.u16 d1, q1, #\bit_depth - 8 ++ vqrshrn.u16 d2, q2, #\bit_depth - 8 ++ vqrshrn.u16 d3, q3, #\bit_depth - 8 ++ vqrshrn.u16 d4, q4, #\bit_depth - 8 ++ vqrshrn.u16 d5, q5, #\bit_depth - 8 ++ vqrshrn.u16 d6, q6, #\bit_depth - 8 ++ vqrshrn.u16 d7, q7, #\bit_depth - 8 ++ vqrshrn.u16 d8, q8, #\bit_depth - 8 ++ vqrshrn.u16 d9, q9, #\bit_depth - 8 ++ vqrshrn.u16 d10, q10, #\bit_depth - 8 ++ vqrshrn.u16 d11, q11, #\bit_depth - 8 ++ vqrshrn.u16 d12, q12, #\bit_depth - 8 ++ vqrshrn.u16 d13, q13, #\bit_depth - 8 ++ vqrshrn.u16 d14, q14, #\bit_depth - 8 ++ vqrshrn.u16 d15, q15, #\bit_depth - 8 ++ vstm r0!, {q0-q7} ++ bne 1b ++ vpop {q4-q7} ++ bx lr ++.endm ++ ++function rpi_sand128b_stripe_to_8_10, export=1 ++ stripe2_to_8 10 ++endfunc ++ diff --git a/libavutil/buffer.c b/libavutil/buffer.c -index 8d1aa5f..649876d 100644 +index 8d1aa5fa84..649876db77 100644 --- a/libavutil/buffer.c +++ b/libavutil/buffer.c @@ -355,3 +355,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool) @@ -19269,7 +28849,7 @@ index 8d1aa5f..649876d 100644 + return buf->opaque; +} diff --git a/libavutil/buffer.h b/libavutil/buffer.h -index 73b6bd0..d907de3 100644 +index 73b6bd0b14..d907de3f1c 100644 --- a/libavutil/buffer.h +++ b/libavutil/buffer.h @@ -284,6 +284,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool); @@ -19283,7 +28863,7 @@ index 73b6bd0..d907de3 100644 * @} */ diff --git a/libavutil/frame.h b/libavutil/frame.h -index 7cb78a1..b94a635 100644 +index 7cb78a1a44..b94a63565f 100644 --- a/libavutil/frame.h +++ b/libavutil/frame.h @@ -127,6 +127,13 @@ enum AVFrameSideDataType { @@ -19315,10 +28895,10 @@ index 7cb78a1..b94a635 100644 /** * Structure to hold side data for an AVFrame. 
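For readers not fluent in NEON: the stripe2_to_8 macro above is just a rounding, saturating right shift done 128 bytes at a time. Each iteration loads one 128-byte row (64 uint16 samples) from each of two adjacent 16-bit stripes, narrows everything with vqrshrn.u16 #2 (add the rounding constant, shift right by two, clamp to 255) and stores a single 128-byte row of the 8-bit destination stripe. A plain-C sketch of the assumed-equivalent computation, for illustration only:

    #include <stdint.h>

    // Assumed C equivalent of rpi_sand128b_stripe_to_8_10 (not part of the
    // patch). src1/src2: two adjacent 10-in-16-bit sand stripes, 128 bytes
    // (64 samples) per row; dest: one 128-byte-wide 8-bit stripe.
    static void sand128b_stripe_to_8_10_ref(uint8_t *dest,
                                            const uint8_t *src1,
                                            const uint8_t *src2,
                                            unsigned int lines)
    {
        while (lines-- != 0) {
            const uint16_t *s1 = (const uint16_t *)src1;
            const uint16_t *s2 = (const uint16_t *)src2;
            for (unsigned int i = 0; i != 64; ++i) {
                const unsigned int v = (s1[i] + 2) >> 2;  // round to nearest
                dest[i] = v > 255 ? 255 : (uint8_t)v;     // saturate, as vqrshrn does
            }
            for (unsigned int i = 0; i != 64; ++i) {
                const unsigned int v = (s2[i] + 2) >> 2;
                dest[64 + i] = v > 255 ? 255 : (uint8_t)v;
            }
            dest += 128;  // vstm r0! steps 128 bytes per line
            src1 += 128;  // vldm r1!/r2! likewise
            src2 += 128;
        }
    }
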
diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c -index d4a7a8b..92a01a4 100644 +index d4a7a8ba3b..bf7e402373 100644 --- a/libavutil/pixdesc.c +++ b/libavutil/pixdesc.c -@@ -2158,6 +2158,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { +@@ -2158,6 +2158,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA, }, @@ -19333,26 +28913,463 @@ index d4a7a8b..92a01a4 100644 + { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ + }, + .flags = 0, -+ } ++ }, ++ [AV_PIX_FMT_SAND64_10] = { ++ .name = "sand64_10", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ ++ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */ ++ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */ ++ }, ++ .flags = 0, ++ }, }; #if FF_API_PLUS1_MINUS1 FF_ENABLE_DEPRECATION_WARNINGS diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h -index 5dafc34..0895b69 100644 +index 5dafc341a1..9af4c3e610 100644 --- a/libavutil/pixfmt.h +++ b/libavutil/pixfmt.h -@@ -314,6 +314,9 @@ enum AVPixelFormat { +@@ -314,6 +314,11 @@ enum AVPixelFormat { AV_PIX_FMT_P016LE, ///< like NV12, with 16bpp per component, little-endian AV_PIX_FMT_P016BE, ///< like NV12, with 16bpp per component, big-endian +// RPI - not on ifdef so can be got at by calling progs -+ AV_PIX_FMT_SAND128, ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions }; +diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h +new file mode 100644 +index 0000000000..52d52a2a83 +--- /dev/null ++++ b/libavutil/rpi_sand_fn_pw.h +@@ -0,0 +1,182 @@ ++// * Included twice from rpi_sand_fn with different PW ++ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// unclipped ++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x; ++ const unsigned int w = _w; ++ const unsigned int mask = stride1 - 1; ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { ++ memcpy(dst, p, w); ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst += 
dst_stride, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const uint8_t * p = p2; ++ uint8_t * d = dst; ++ memcpy(d, p1, w1); ++ d += w1; ++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { ++ memcpy(d, p, stride1); ++ } ++ memcpy(d, p, w3); ++ } ++ } ++} ++ ++// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V) ++ ++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ const pixel * p = (const pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * p = (const pixel *)p1; ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++} ++ ++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i 
= 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++} ++ ++ ++#undef pixel ++#undef STRCAT ++#undef FUNC ++ +diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c +new file mode 100644 +index 0000000000..b8bfad915e +--- /dev/null ++++ b/libavutil/rpi_sand_fns.c +@@ -0,0 +1,96 @@ ++#include "config.h" ++#include ++#include ++#include "rpi_sand_fns.h" ++#include "avassert.h" ++ ++#define PW 1 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#define PW 2 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#if HAVE_NEON ++void rpi_sand128b_stripe_to_8_10(uint8_t * dest, const uint8_t * src1, const uint8_t * src2, unsigned int lines); ++#endif ++ ++#if 1 ++// Simple round ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ const unsigned int rnd = (1 << shr) >> 1; ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n) { ++ *dst++ = (*src++ + rnd) >> shr; ++ } ++} ++#else ++// Dithered variation ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ unsigned int rnd = (1 << shr) >> 1; ++ const unsigned int mask = ((1 << shr) - 1); ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n) { ++ rnd = *src++ + (rnd & mask); ++ *dst++ = rnd >> shr; ++ } ++} ++#endif ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr) ++{ ++ const unsigned int n = dst_stride1 / 2; ++ unsigned int j; ++ ++ // This is true for our current layouts ++ av_assert0(dst_stride1 == src_stride1); ++ ++ // As we have the same stride1 for src & dest and src is wider than dest ++ // then if we loop on src we can always write contiguously to dest ++ // We make no effort to copy an exact width - round up to nearest src stripe ++ // as we will always have storage in dest for that ++ ++#if HAVE_NEON ++ if (shr == 3 && src_stride1 == 128) { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ rpi_sand128b_stripe_to_8_10(d, s1, s2, h); ++ } ++ } ++ else ++#endif ++ { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ cpy16_to_8(d + n, s2, n, shr); ++ } ++ } ++ } ++ ++ // Fix up a trailing dest half stripe ++ if (j < w) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, 
shr); ++ } ++ } ++} ++ +diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h +new file mode 100644 +index 0000000000..48948ecb47 +--- /dev/null ++++ b/libavutil/rpi_sand_fns.h +@@ -0,0 +1,127 @@ ++#ifndef AVUTIL_RPI_SAND_FNS ++#define AVUTIL_RPI_SAND_FNS ++ ++#include "libavutil/frame.h" ++ ++// For all these fns _x & _w are measured as coord * PW ++// For the C fns coords are in chroma pels (so luma / 2) ++// Strides are in bytes ++ ++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_planar_to_sand_c8(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_planar_to_sand_c16(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr); ++ ++ ++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) ++{ ++ // * We could repl;ace thios with a fixed 128 whic would allow the compiler ++ // to optimize a whole lot better ++ return frame->linesize[0]; ++} ++ ++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame) ++{ ++ return frame->linesize[3]; ++} ++ ++ ++static inline int av_rpi_is_sand_format(const int format) ++{ ++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16); ++} ++ ++static inline int av_rpi_is_sand_frame(const AVFrame * const frame) ++{ ++ return av_rpi_is_sand_format(frame->format); ++} ++ ++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame) ++{ ++ return (frame->format == AV_PIX_FMT_SAND128); ++} ++ ++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame) ++{ ++ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16); ++} ++ ++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame) ++{ ++ return av_rpi_is_sand8_frame(frame) ? 
0 : 1; ++} ++ ++// If x is measured in bytes (not pixels) then this works for sand64_16 as ++// well as sand128 - but in the general case we work that out ++ ++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y) ++{ ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame); ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y + stride2 * x2; ++} ++ ++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) ++{ ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1); ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y_c + stride2 * x2; ++} ++ ++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y); ++} ++ ++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y); ++} ++ ++#endif ++ diff --git a/libswscale/input.c b/libswscale/input.c -index 04a5190..837f633 100644 +index 04a5190711..0a188ba267 100644 --- a/libswscale/input.c +++ b/libswscale/input.c @@ -741,6 +741,13 @@ static void p016BEToUV_c(uint8_t *dstU, uint8_t *dstV, @@ -19369,36 +29386,38 @@ index 04a5190..837f633 100644 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos)) static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, -@@ -1124,6 +1131,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) +@@ -1124,6 +1131,10 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_P016BE: c->chrToYV12 = p016BEToUV_c; break; + case AV_PIX_FMT_SAND128: -+ c->chrToYV12 = sand128ToUV_c; ++ case AV_PIX_FMT_SAND64_10: ++ c->chrToYV12 = sand128ToUV_c; // NIF + break; } if (c->chrSrcHSubSample) { switch (srcFormat) { diff --git a/libswscale/utils.c b/libswscale/utils.c -index 4c9b53b..835f3aa 100644 +index 4c9b53bbeb..df8a793770 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c -@@ -254,6 +254,9 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { +@@ -254,6 +254,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { [AV_PIX_FMT_P010BE] = { 1, 1 }, [AV_PIX_FMT_P016LE] = { 1, 0 }, [AV_PIX_FMT_P016BE] = { 1, 0 }, +#ifdef RPI + [AV_PIX_FMT_SAND128] = { 1, 0 }, ++ [AV_PIX_FMT_SAND64_10] = { 1, 0 }, +#endif }; int sws_isSupportedInput(enum AVPixelFormat pix_fmt) diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt new file mode 100644 -index 0000000..2d45679 +index 0000000000..b1e99a6a89 --- /dev/null +++ b/pi-util/BUILD.txt -@@ -0,0 +1,24 @@ +@@ -0,0 +1,25 @@ +Building Pi FFmpeg +================== + @@ -19416,16 +29435,216 @@ index 0000000..2d45679 +in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a +lot of history you don't want. + -+If you have a copy of qasm.py in ../local then the .qasm sources will be ++If you have a copy of qasm.py in ../local/bin then the .qasm sources will be +rebuilt. 
Otherwise the prebuilt .c & .h files will be used. ++Likewise ../local/bin/vasmvidcore_std will enable VPU code rebuild + +pi-util/conf_p1.sh should configure for Pi1. Beware that as of this time +H265 QPU acceleration is broken on Pi1 and so it is disabled. + + +diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv +new file mode 100644 +index 0000000000..f05b7753f7 +--- /dev/null ++++ b/pi-util/conf_h265.2016.csv +@@ -0,0 +1,193 @@ ++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 ++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 ++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 ++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 ++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 ++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 ++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 ++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 ++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 ++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 ++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 ++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 ++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 ++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 ++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 ++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 ++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 ++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 ++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 ++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 ++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 ++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 ++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 
++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 ++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 ++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 ++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 ++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 ++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 
++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 ++2,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++2,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt ++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 ++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 ++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 ++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 ++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 ++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 ++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth ++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? 
++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 ++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5 ++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt ++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt ++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt ++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt ++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt ++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt ++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5 ++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5 
++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5 ++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5 ++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5 ++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5 ++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5 ++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5 ++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5 ++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5 ++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5 ++2,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5 ++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5 ++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt ++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt ++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5 ++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5 ++1,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5 ++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5 ++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5 ++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5 ++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5 ++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5 ++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5 ++2,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5 diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv new file mode 100644 -index 0000000..6082641 +index 0000000000..6082641271 --- /dev/null +++ b/pi-util/conf_h265.2016_HEVC_v1.csv @@ -0,0 +1,147 @@ @@ -19578,7 +29797,7 @@ index 0000000..6082641 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv new file mode 100644 -index 0000000..fc14f2a +index 0000000000..fc14f2a3c2 --- /dev/null +++ b/pi-util/conf_h265.csv @@ -0,0 +1,144 @@ @@ -19728,7 +29947,7 @@ index 0000000..fc14f2a +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh new file mode 100755 -index 0000000..ec25b81 +index 
0000000000..ec25b81c31 --- /dev/null +++ b/pi-util/conf_pi1.sh @@ -0,0 +1,31 @@ @@ -19765,7 +29984,7 @@ index 0000000..ec25b81 +# -Wa,-ahls diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh new file mode 100755 -index 0000000..f8e5e75 +index 0000000000..f8e5e75375 --- /dev/null +++ b/pi-util/conf_pi2.sh @@ -0,0 +1,30 @@ @@ -19801,12 +30020,13 @@ index 0000000..f8e5e75 +# -Wa,-ahls diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py new file mode 100755 -index 0000000..e96bad2 +index 0000000000..70f7be22bb --- /dev/null +++ b/pi-util/ffconf.py -@@ -0,0 +1,164 @@ +@@ -0,0 +1,174 @@ +#!/usr/bin/env python + ++import string +import os +import subprocess +import re @@ -19817,9 +30037,18 @@ index 0000000..e96bad2 + +ffmpeg_exec = "./ffmpeg" + -+def testone(fileroot, name, es_file, md5_file): ++def testone(fileroot, srcname, es_file, md5_file): + tmp_root = "/tmp" + ++ names = srcname.split('/') ++ while len(names) > 1: ++ tmp_root = os.path.join(tmp_root, names[0]) ++ del names[0] ++ name = names[0] ++ ++ if not os.path.exists(tmp_root): ++ os.makedirs(tmp_root) ++ + dec_file = os.path.join(tmp_root, name + ".dec.md5") + try: + os.remove(dec_file) @@ -19878,7 +30107,7 @@ index 0000000..e96bad2 + pass + elif ext == ".bit" or ext == ".bin": + es_file = f -+ elif ext == ".md5" or (ext == ".txt" and base[-4:] == "_md5"): ++ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")): + if md5_file == "?": + md5_file = f + elif base[-3:] == "yuv": @@ -19890,9 +30119,9 @@ index 0000000..e96bad2 + if not tests: + return True + for t in tests: -+ if name[0:len(t)] == t: ++ if name[0:len(t)] == t or name.find("/" + t) != -1: + return True -+ return False ++ return False + +def doconf(csva, tests, test_root): + unx_failures = [] @@ -19954,9 +30183,9 @@ index 0000000..e96bad2 + + argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester") + argp.add_argument("tests", nargs='*') -+ argp.add_argument("--test_root", default="/opt/conform/h265", help="Root dir for test") ++ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test") + argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") -+ argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename") ++ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename") + args = argp.parse_args() + + if args.csvgen: @@ -19969,14 +30198,169 @@ index 0000000..e96bad2 + + doconf(csva, args.tests, args.test_root) + +diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py +new file mode 100755 +index 0000000000..27cc453963 +--- /dev/null ++++ b/pi-util/ffperf.py +@@ -0,0 +1,124 @@ ++#!/usr/bin/env python3 ++ ++import time ++import string ++import os ++import tempfile ++import subprocess ++import re ++import argparse ++import sys ++import csv ++from stat import * ++ ++class tstats: ++ close_threshold = 0.01 ++ ++ def __init__(self, stats_dict=None): ++ if stats_dict != None: ++ self.name = stats_dict["name"] ++ self.elapsed = float(stats_dict["elapsed"]) ++ self.user = float(stats_dict["user"]) ++ self.sys = float(stats_dict["sys"]) ++ ++ def times_str(self): ++ ctime = self.sys + self.user ++ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed) ++ ++ def dict(self): ++ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys} ++ ++ def is_close(self, other): ++ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold ++ ++ def 
__lt__(self, other): ++ return self.elapsed < other.elapsed ++ def __gt__(self, other): ++ return self.elapsed > other.elapsed ++ ++ def time_file(name, prefix): ++ stats = tstats() ++ stats.name = name ++ start_time = time.clock_gettime(time.CLOCK_MONOTONIC); ++ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name, ++ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog); ++ pinfo = os.wait4(cproc.pid, 0) ++ end_time = time.clock_gettime(time.CLOCK_MONOTONIC); ++ stats.elapsed = end_time - start_time ++ stats.user = pinfo[2].ru_utime ++ stats.sys = pinfo[2].ru_stime ++ return stats ++ ++ ++def common_prefix(s1, s2): ++ for i in range(min(len(s1),len(s2))): ++ if s1[i] != s2[i]: ++ return s1[:i] ++ return s1[:i+1] ++ ++def main(): ++ global flog ++ ++ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog=""" ++To blank the screen before starting use "xdg-screensaver activate" ++(For some reason this doesn't seem to work from within python). ++""") ++ ++ argp.add_argument("streams", nargs='*') ++ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename") ++ argp.add_argument("--csv_in", help="CSV input filename") ++ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).") ++ ++ args = argp.parse_args() ++ ++ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"]) ++ csv_out.writeheader() ++ ++ stats_in = {} ++ if args.csv_in != None: ++ with open(args.csv_in, 'r', newline='') as f_in: ++ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} ++ ++ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt") ++ ++ streams = args.streams ++ if not streams: ++ if not stats_in: ++ print ("No source streams specified") ++ return 1 ++ prefix = "" if args.prefix == None else args.prefix ++ streams = [k for k in stats_in] ++ elif args.prefix != None: ++ prefix = args.prefix ++ else: ++ prefix = streams[0] ++ for f in streams[1:]: ++ prefix = common_prefix(prefix, f) ++ pp = prefix.rpartition(os.sep) ++ prefix = pp[0] + pp[1] ++ streams = [s[len(prefix):] for s in streams] ++ ++ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()): ++ print ("====", f) ++ ++ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999}) ++ for i in range(3): ++ t = tstats.time_file(f, prefix) ++ print ("...", t.times_str()) ++ if t0 > t: ++ t0 = t ++ ++ if t0.name in stats_in: ++ pstat = stats_in[t0.name] ++ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str()) ++ ++ csv_out.writerow(t0.dict()) ++ ++ print () ++ ++ return 0 ++ ++ ++if __name__ == '__main__': ++ exit(main()) ++ +diff --git a/pi-util/make_array.py b/pi-util/make_array.py +new file mode 100755 +index 0000000000..864fa5e704 +--- /dev/null ++++ b/pi-util/make_array.py +@@ -0,0 +1,19 @@ ++#!/usr/bin/env python ++ ++# Usage ++# make_array file.bin ++# Produces file.h with array of bytes. 
++# ++import sys ++for file in sys.argv[1:]: ++ prefix,suffix = file.split('.') ++ assert suffix=='bin' ++ name=prefix.split('/')[-1] ++ print 'Converting',file ++ with open(prefix+'.h','wb') as out: ++ print >>out, 'static const unsigned char',name,'[] = {' ++ with open(file,'rb') as fd: ++ for byte in fd.read(): ++ print >>out, '%d,' % ord(byte) ++ print >>out,'};' ++ diff --git a/pi-util/qem.sh b/pi-util/qem.sh new file mode 100755 -index 0000000..47dd071 +index 0000000000..5ce2eeaf72 --- /dev/null +++ b/pi-util/qem.sh @@ -0,0 +1,9 @@ +TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex -+QASM=python\ pi-util/qasm.py ++QASM=python\ ../local/bin/qasm.py +SRC_FILE=libavcodec/rpi_shader.qasm +DST_BASE=shader + @@ -19986,7 +30370,7 @@ index 0000000..47dd071 + diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py new file mode 100755 -index 0000000..5935a11 +index 0000000000..5935a11ca5 --- /dev/null +++ b/pi-util/v3dusage.py @@ -0,0 +1,128 @@ @@ -20118,4 +30502,3 @@ index 0000000..5935a11 + + do_logparse(args.logfile) + -
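Stepping back from the utility scripts, one detail of the sand helpers earlier in this patch deserves a worked example: the stripe-stride comment in rpi_zc.c notes that keeping linesize[3] as lines-per-stripe turns the stripe split into masks and an extra multiply instead of a divide, and av_rpi_sand_frame_off_y() in rpi_sand_fns.h is exactly that. A self-contained demonstration with assumed, illustrative numbers:

    #include <stdio.h>

    // Worked example of sand addressing (illustrative numbers, not from the
    // patch): stride1 = 128 bytes (stripe width), stride2 = 272 lines per
    // stripe (height_y + height_c, stashed in linesize[3]).
    int main(void)
    {
        const unsigned int stride1 = 128;
        const unsigned int stride2 = 272;
        const unsigned int x = 300, y = 7;  // byte coordinate in the plane

        // stride1 is a power of two, so no division is needed:
        const unsigned int x1 = x & (stride1 - 1);  // offset within the stripe
        const unsigned int x2 = x ^ x1;             // byte-x of the stripe start
        const unsigned int off = x1 + stride1 * y + stride2 * x2;

        // The same offset computed the slow way, with a divide:
        const unsigned int ref = (x / stride1) * (stride1 * stride2)
                               + y * stride1 + x % stride1;

        printf("off=%u ref=%u\n", off, ref);  // both print 70572
        return 0;
    }
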