From 4b849b5f420e0524ba2d6be209ed429250d228c6 Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Wed, 10 May 2017 05:49:19 +0100 Subject: [PATCH 1/5] kodi: update to kodi-61e98fd --- packages/mediacenter/kodi/package.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk index 894d78f3a7..8d27e5f2ac 100644 --- a/packages/mediacenter/kodi/package.mk +++ b/packages/mediacenter/kodi/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="kodi" -PKG_VERSION="91a9066" +PKG_VERSION="61e98fd" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="http://www.kodi.tv" From 98359156bda987c1b6085514048c3e9be36355b5 Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Wed, 10 May 2017 05:49:19 +0100 Subject: [PATCH 2/5] kodi-binary-addons: update to latest --- .../mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk | 2 +- .../mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk | 2 +- .../kodi-binary-addons/audiodecoder.fluidsynth/package.mk | 2 +- .../mediacenter/kodi-binary-addons/audiodecoder.gme/package.mk | 2 +- .../mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk | 2 +- .../kodi-binary-addons/audiodecoder.modplug/package.mk | 2 +- .../mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk | 2 +- .../kodi-binary-addons/audiodecoder.nosefart/package.mk | 2 +- .../kodi-binary-addons/audiodecoder.openmpt/package.mk | 2 +- .../kodi-binary-addons/audiodecoder.organya/package.mk | 2 +- .../mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk | 2 +- .../kodi-binary-addons/audiodecoder.sidplay/package.mk | 2 +- .../kodi-binary-addons/audiodecoder.snesapu/package.mk | 2 +- .../mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk | 2 +- .../kodi-binary-addons/audiodecoder.stsound/package.mk | 2 +- .../kodi-binary-addons/audiodecoder.timidity/package.mk | 2 +- 
.../kodi-binary-addons/audiodecoder.vgmstream/package.mk | 2 +- .../mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk | 2 +- .../kodi-binary-addons/inputstream.adaptive/package.mk | 2 +- .../mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk | 2 +- .../kodi-binary-addons/peripheral.joystick/package.mk | 2 +- packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk | 2 +- packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk | 2 +- packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk | 2 +- .../mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk | 2 +- packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk | 2 +- .../mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk | 2 +- packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk | 2 +- .../mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk | 2 +- .../kodi-binary-addons/pvr.mediaportal.tvserver/package.mk | 2 +- packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk | 2 +- packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk | 2 +- packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk | 2 +- packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk | 2 +- packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk | 2 +- packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk | 2 +- packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk | 2 +- packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk | 2 +- .../kodi-binary-addons/screensaver.asteroids/package.mk | 2 +- .../kodi-binary-addons/screensaver.asterwave/package.mk | 2 +- .../kodi-binary-addons/screensaver.biogenesis/package.mk | 2 +- .../kodi-binary-addons/screensaver.cpblobs/package.mk | 2 +- .../kodi-binary-addons/screensaver.greynetic/package.mk | 2 +- .../kodi-binary-addons/screensaver.matrixtrails/package.mk | 2 +- .../kodi-binary-addons/screensaver.pingpong/package.mk | 2 +- .../mediacenter/kodi-binary-addons/screensaver.pyro/package.mk | 2 +- 
.../kodi-binary-addons/screensaver.shadertoy/package.mk | 2 +- .../mediacenter/kodi-binary-addons/screensaver.stars/package.mk | 2 +- .../mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk | 2 +- .../kodi-binary-addons/visualization.fishbmc/package.mk | 2 +- .../kodi-binary-addons/visualization.goom/package.mk | 2 +- .../kodi-binary-addons/visualization.projectm/package.mk | 2 +- .../kodi-binary-addons/visualization.shadertoy/package.mk | 2 +- .../kodi-binary-addons/visualization.spectrum/package.mk | 2 +- .../kodi-binary-addons/visualization.waveform/package.mk | 2 +- 55 files changed, 55 insertions(+), 55 deletions(-) diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk index 7a7d2b8c7d..48c556becb 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.2sf" -PKG_VERSION="05fa941" +PKG_VERSION="c9de26d" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk index 6553771742..e8c3ec66dd 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.dumb/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.dumb" -PKG_VERSION="6c15ef8" +PKG_VERSION="54fba3d" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.fluidsynth/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.fluidsynth/package.mk index 635219b69a..790031232a 100644 --- 
a/packages/mediacenter/kodi-binary-addons/audiodecoder.fluidsynth/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.fluidsynth/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.fluidsynth" -PKG_VERSION="67fd270" +PKG_VERSION="e0f1809" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.gme/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.gme/package.mk index 50b427edc3..a8869dea91 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.gme/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.gme/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.gme" -PKG_VERSION="4f8c49a" +PKG_VERSION="8328bf2" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk index 01672584be..ad4fca57bc 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.gsf" -PKG_VERSION="122ff46" +PKG_VERSION="acf4998" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk index 6fa3cf4d59..171eb826cc 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.modplug" -PKG_VERSION="ae0b214" 
+PKG_VERSION="950682e" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk index 287742f463..1a89833f4e 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.ncsf" -PKG_VERSION="8835a04" +PKG_VERSION="f914839" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.nosefart/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.nosefart/package.mk index a863abb86c..abc1b28a72 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.nosefart/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.nosefart/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.nosefart" -PKG_VERSION="1d3de76" +PKG_VERSION="1a9f949" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk index 8b0c74fc01..4ecea39638 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.openmpt" -PKG_VERSION="ceaffa1" +PKG_VERSION="fbcbfda" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.organya/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.organya/package.mk index 7fa1de2a89..949ebf33ed 100644 --- 
a/packages/mediacenter/kodi-binary-addons/audiodecoder.organya/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.organya/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.organya" -PKG_VERSION="8573890" +PKG_VERSION="bacd0ab" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk index 5f9499dc00..e56eef901b 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.qsf" -PKG_VERSION="5edc117" +PKG_VERSION="e581a67" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk index 8c3e46e370..d02c09d184 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.sidplay/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.sidplay" -PKG_VERSION="3e8a22e" +PKG_VERSION="4083bc5" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.snesapu/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.snesapu/package.mk index e5c6355d13..2ed9a187d1 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.snesapu/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.snesapu/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.snesapu" 
-PKG_VERSION="84b7104" +PKG_VERSION="b151c13" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk index 9db3a5b9f4..24b2492662 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.ssf" -PKG_VERSION="b12c97d" +PKG_VERSION="62750ac" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.stsound/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.stsound/package.mk index 4ca18b373d..5c936c07f7 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.stsound/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.stsound/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.stsound" -PKG_VERSION="a306bf6" +PKG_VERSION="89ed4f3" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk index a36019f2b0..9b49a85c98 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.timidity" -PKG_VERSION="20823d2" +PKG_VERSION="8bd7092" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk index 051388a700..000968ae18 100644 --- 
a/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.vgmstream/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.vgmstream" -PKG_VERSION="a7c6153" +PKG_VERSION="de21bab" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk b/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk index b3f9d7628f..b6311b92ff 100644 --- a/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk +++ b/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="imagedecoder.raw" -PKG_VERSION="e7e2c2d" +PKG_VERSION="37ef22e" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk index 376066d34c..520a823a7b 100644 --- a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk +++ b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="inputstream.adaptive" -PKG_VERSION="f23ba39" +PKG_VERSION="7bde41f" PKG_LICENSE="GPL" PKG_SITE="http://www.kodi.tv" PKG_URL="https://github.com/liberty-developer/inputstream.adaptive/archive/$PKG_VERSION.tar.gz" diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk index c32210797f..8d127b6ca2 100644 --- a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk +++ b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk @@ -17,7 +17,7 @@ 
################################################################################ PKG_NAME="inputstream.rtmp" -PKG_VERSION="6c1af46" +PKG_VERSION="1052cd3" PKG_LICENSE="GPL" PKG_SITE="http://www.kodi.tv" PKG_URL="https://github.com/notspiff/inputstream.rtmp/archive/$PKG_VERSION.tar.gz" diff --git a/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk b/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk index bd41ada7a4..54b056a9fd 100644 --- a/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk +++ b/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="peripheral.joystick" -PKG_VERSION="3c7ea59" +PKG_VERSION="07aa1fe" PKG_REV="0" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk index 46f59751a4..81437f0690 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.argustv" -PKG_VERSION="7135b27" +PKG_VERSION="8f89814" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk index bd09eabe29..4246a7373a 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.demo" -PKG_VERSION="1606b61" +PKG_VERSION="978f428" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk 
b/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk index 9ba09b8ddf..f636a14cf2 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.dvblink" -PKG_VERSION="2634f6f" +PKG_VERSION="b7d887c" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk index 19145a8d20..8dc3656030 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.dvbviewer" -PKG_VERSION="13c6e5e" +PKG_VERSION="d099cfa" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk index d412e2e83e..769c12fc71 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.filmon" -PKG_VERSION="e026519" +PKG_VERSION="0f1d34d" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk index 40821e7945..1e830288ab 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.hdhomerun" -PKG_VERSION="98cb8d4" 
+PKG_VERSION="ab91169" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk index 6e4e7e9b0d..760637cdc7 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.hts" -PKG_VERSION="2993f43" +PKG_VERSION="3911c7f" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk index d5c01284f2..cededa128e 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.iptvsimple" -PKG_VERSION="53d63cc" +PKG_VERSION="f83990a" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk index bd074c7eeb..bb4c31a819 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.mediaportal.tvserver" -PKG_VERSION="d4dad61" +PKG_VERSION="367b128" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk index e0069b66b8..e1d141cee4 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk +++ 
b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.nextpvr" -PKG_VERSION="bb21826" +PKG_VERSION="e6ece9f" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk index 81b84d221f..dfa32a2a30 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.njoy" -PKG_VERSION="1ce9aba" +PKG_VERSION="296f558" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk index cea1924c41..3adfdff286 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.pctv" -PKG_VERSION="e3b2b84" +PKG_VERSION="eab5f85" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk index 861e7b75e5..f4d25e2c91 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.stalker" -PKG_VERSION="0466af9" +PKG_VERSION="62b7908" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk index f262f26244..08879bb604 
100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.vbox" -PKG_VERSION="d61c501" +PKG_VERSION="6001735" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk index 3150befcd5..61de07c987 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.vdr.vnsi" -PKG_VERSION="9ede401" +PKG_VERSION="b7c3f3b" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk index e93797d356..55a9b0d0aa 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.vuplus" -PKG_VERSION="d7fdd1e" +PKG_VERSION="78df030" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk index c91c248235..0e57b13566 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.wmc" -PKG_VERSION="5aa3b1c" +PKG_VERSION="27a88ca" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk 
b/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk index 3a81c78ebb..1cd361fbf2 100644 --- a/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk +++ b/packages/mediacenter/kodi-binary-addons/screensaver.asteroids/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="screensaver.asteroids" -PKG_VERSION="111881d" +PKG_VERSION="5d6fd4e" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk index 469c31c557..3e15639ce8 100644 --- a/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk +++ b/packages/mediacenter/kodi-binary-addons/screensaver.asterwave/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="screensaver.asterwave" -PKG_VERSION="2c82b03" +PKG_VERSION="8e6428c" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk index 897a07266a..e0c1b43078 100644 --- a/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk +++ b/packages/mediacenter/kodi-binary-addons/screensaver.biogenesis/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="screensaver.biogenesis" -PKG_VERSION="8cf0d12" +PKG_VERSION="8d1ef04" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk index 8f7e7820e4..dac8c78fff 100644 --- a/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk +++ b/packages/mediacenter/kodi-binary-addons/screensaver.cpblobs/package.mk 
@@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="screensaver.cpblobs" -PKG_VERSION="585c25b" +PKG_VERSION="1922717" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk index 9d4f937209..451aae9115 100644 --- a/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk +++ b/packages/mediacenter/kodi-binary-addons/screensaver.greynetic/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="screensaver.greynetic" -PKG_VERSION="2c103d0" +PKG_VERSION="e4dc6eb" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk index 9654b7c08f..5bb332bcbc 100644 --- a/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk +++ b/packages/mediacenter/kodi-binary-addons/screensaver.matrixtrails/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="screensaver.matrixtrails" -PKG_VERSION="84ca058" +PKG_VERSION="b5a245f" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk index d871d059e6..cf80fcc26d 100644 --- a/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk +++ b/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="screensaver.pingpong" -PKG_VERSION="88c7fed" +PKG_VERSION="21ae78d" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git 
a/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk index 8310d5e589..c402795381 100644 --- a/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk +++ b/packages/mediacenter/kodi-binary-addons/screensaver.pyro/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="screensaver.pyro" -PKG_VERSION="91a863a" +PKG_VERSION="1477bd4" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk index 5689a8d44a..ceb1f6801b 100644 --- a/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk +++ b/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="screensaver.shadertoy" -PKG_VERSION="f576d4b" +PKG_VERSION="434f5ce" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk index 42489b22e2..6f990f313c 100644 --- a/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk +++ b/packages/mediacenter/kodi-binary-addons/screensaver.stars/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="screensaver.stars" -PKG_VERSION="8ff5ad1" +PKG_VERSION="6c62026" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk b/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk index cc064de4e8..0515ed5dd3 100644 --- a/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk +++ 
b/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="screensavers.rsxs" -PKG_VERSION="b68a652" +PKG_VERSION="579ec13" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk index 9b48bfcef5..e93cc0e120 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.fishbmc/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="visualization.fishbmc" -PKG_VERSION="9704420" +PKG_VERSION="611e9a9" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk index 4f222578ef..0935f7b344 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.goom/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="visualization.goom" -PKG_VERSION="745d8c9" +PKG_VERSION="6bfc884" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk index e7acb8a2f6..872c7b672f 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.projectm/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="visualization.projectm" -PKG_VERSION="5450aa2" +PKG_VERSION="5bb3897" PKG_REV="1" PKG_ARCH="any" 
PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk index f534521dfa..1c877680fc 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.shadertoy/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="visualization.shadertoy" -PKG_VERSION="6db9a48" +PKG_VERSION="ae677ac" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk index 83f470dd26..5badd52b0b 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.spectrum/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="visualization.spectrum" -PKG_VERSION="73c8786" +PKG_VERSION="d7d9c14" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk index 0715bb3055..d12ed3bc5f 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.waveform/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="visualization.waveform" -PKG_VERSION="ede2fd6" +PKG_VERSION="2a71ba0" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" From dd4bba891c815923d9ddd571a8047831499f336e Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Wed, 10 May 2017 05:49:19 +0100 Subject: [PATCH 3/5] LibreELEC-settings: update to LibreELEC-settings-463a99b --- 
packages/mediacenter/LibreELEC-settings/package.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/mediacenter/LibreELEC-settings/package.mk b/packages/mediacenter/LibreELEC-settings/package.mk index bc41f502dc..b13c7d9537 100644 --- a/packages/mediacenter/LibreELEC-settings/package.mk +++ b/packages/mediacenter/LibreELEC-settings/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="LibreELEC-settings" -PKG_VERSION="ca96ddd" +PKG_VERSION="463a99b" PKG_ARCH="any" PKG_LICENSE="prop." PKG_SITE="https://libreelec.tv" From e8fda252ee4473e406ac5352f167be78527d94dd Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Wed, 10 May 2017 05:49:19 +0100 Subject: [PATCH 4/5] vfs.rar: new package --- .../kodi-binary-addons/vfs.rar/package.mk | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk diff --git a/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk b/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk new file mode 100644 index 0000000000..1c39495fa7 --- /dev/null +++ b/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk @@ -0,0 +1,41 @@ +################################################################################ +# This file is part of LibreELEC - http://www.libreelec.tv +# Copyright (C) 2017-present Team LibreELEC +# +# LibreELEC is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# LibreELEC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with LibreELEC. If not, see <http://www.gnu.org/licenses/>. +################################################################################ + +PKG_NAME="vfs.rar" +PKG_VERSION="26800eb" +PKG_REV="1" +PKG_ARCH="any" +PKG_LICENSE="GPL" +PKG_SITE="http://www.kodi.tv" +PKG_URL="https://github.com/notspiff/vfs.rar/archive/$PKG_VERSION.tar.gz" +PKG_DEPENDS_TARGET="toolchain kodi-platform" +PKG_SECTION="" +PKG_SHORTDESC="vfs.rar" +PKG_LONGDESC="vfs.rar" +PKG_AUTORECONF="no" + +PKG_IS_ADDON="yes" +PKG_ADDON_TYPE="kodi.vfs" + +addon() { + mkdir -p $ADDON_BUILD/$PKG_ADDON_ID/ + cp -R $PKG_BUILD/.install_pkg/usr/share/$MEDIACENTER/addons/$PKG_NAME/* $ADDON_BUILD/$PKG_ADDON_ID/ + + ADDONSO=$(xmlstarlet sel -t -v "/addon/extension/@library_linux" $ADDON_BUILD/$PKG_ADDON_ID/addon.xml) + cp -L $PKG_BUILD/.install_pkg/usr/lib/$MEDIACENTER/addons/$PKG_NAME/$ADDONSO $ADDON_BUILD/$PKG_ADDON_ID/ +} From 896dfca8b16250a59692e9668641e82b2783a05d Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Wed, 10 May 2017 05:49:19 +0100 Subject: [PATCH 5/5] ffmpeg: update to ffmpeg-3.3-Leia-Alpha --- packages/multimedia/ffmpeg/package.mk | 6 +- ...mpeg-99.1001-pfcd_hevc_optimisations.patch | 10531 ++++++++-------- ...e6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch | 1 - ...format-to-fix-an-issue-with-MMAL-ren.patch | 1 - .../patches/ffmpeg-99.1010-tls-1.2.patch | 17 - 5 files changed, 5190 insertions(+), 5366 deletions(-) delete mode 100644 packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-tls-1.2.patch diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index 0ee857cdfc..b169d807ac 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="ffmpeg" -# Current branch is: release/3.1-xbmc -PKG_VERSION="33c167d" +# Current branch is: release/3.3-kodi +PKG_VERSION="eb0819c"
PKG_ARCH="any" PKG_LICENSE="LGPLv2.1+" PKG_SITE="https://ffmpeg.org" @@ -159,7 +159,6 @@ configure_target() { --disable-dxva2 \ --enable-runtime-cpudetect \ $FFMPEG_TABLES \ - --disable-memalign-hack \ --disable-encoders \ --enable-encoder=ac3 \ --enable-encoder=aac \ @@ -188,7 +187,6 @@ configure_target() { --disable-libopencore-amrwb \ --disable-libopencv \ --disable-libdc1394 \ - --disable-libfaac \ --disable-libfreetype \ --disable-libgsm \ --disable-libmp3lame \ diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch index 3623a64577..f9b7f1bd34 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch @@ -1,7 +1,7 @@ -diff --git a/.gitignore b/.gitignore +diff --git b/.gitignore a/.gitignore index 524fb73..305632b 100644 ---- a/.gitignore -+++ b/.gitignore +--- b/.gitignore ++++ a/.gitignore @@ -23,6 +23,7 @@ .\#* /.config @@ -10,10 +10,81 @@ index 524fb73..305632b 100644 /ffmpeg /ffplay /ffprobe -diff --git a/ffmpeg.c b/ffmpeg.c -index 9ffd833..7a86d7e 100644 ---- a/ffmpeg.c -+++ b/ffmpeg.c +diff --git b/Changelog a/Changelog +index 6f023a9..ad53c9d 100644 +--- b/Changelog ++++ a/Changelog +@@ -1,7 +1,7 @@ + Entries are sorted chronologically from oldest to youngest within each release, + releases are sorted from youngest to oldest. 
+ +-version 3.3: ++version : + - CrystalHD decoder moved to new decode API + - add internal ebur128 library, remove external libebur128 dependency + - Pro-MPEG CoP #3-R2 FEC protocol +@@ -22,7 +22,6 @@ version 3.3: + - threshold filter + - midequalizer filter + - Optimal Huffman tables for (M)JPEG encoding +-- VAAPI-accelerated MPEG-2 and VP8 encoding + - FM Screen Capture Codec decoder + - native Opus encoder + - ScreenPressor decoder +@@ -33,7 +32,6 @@ version 3.3: + - Removed the legacy X11 screen grabber, use XCB instead + - MPEG-7 Video Signature filter + - Removed asyncts filter (use af_aresample instead) +-- Intel QSV-accelerated VP8 video decoding + + + version 3.2: +@@ -121,6 +119,7 @@ version 3.1: + - libutvideo wrapper removed + - YUY2 Lossless Codec decoder + - VideoToolbox H.264 encoder ++- VAAPI-accelerated MPEG-2 and VP8 encoding + + + version 3.0: +diff --git b/RELEASE_NOTES a/RELEASE_NOTES +new file mode 100644 +index 0000000..c3ec010 +--- /dev/null ++++ a/RELEASE_NOTES +@@ -0,0 +1,15 @@ ++ ++ ┌────────────────────────────────────────┐ ++ │ RELEASE NOTES for FFmpeg 3.2 "Hypatia" │ ++ └────────────────────────────────────────┘ ++ ++ The FFmpeg Project proudly presents FFmpeg 3.2 "Hypatia", about 4 ++ months after the release of FFmpeg 3.1. ++ ++ A complete Changelog is available at the root of the project, and the ++ complete Git history on http://source.ffmpeg.org. ++ ++ We hope you will like this release as much as we enjoyed working on it, and ++ as usual, if you have any questions about it, or any FFmpeg related topic, ++ feel free to join us on the #ffmpeg IRC channel (on irc.freenode.net) or ask ++ on the mailing-lists. +diff --git b/doc/Doxyfile a/doc/Doxyfile +index 0891899..8f855f8 100644 +--- b/doc/Doxyfile ++++ a/doc/Doxyfile +@@ -38,7 +38,7 @@ PROJECT_NAME = FFmpeg + # could be handy for archiving the generated documentation or if some version + # control system is used. 
+ +-PROJECT_NUMBER = ++PROJECT_NUMBER = 3.2 + + # Using the PROJECT_BRIEF tag one can provide an optional one line description + # for a project that appears at the top of each page and should give viewer a +diff --git b/ffmpeg.c a/ffmpeg.c +index 11faf0d..494c23d 100644 +--- b/ffmpeg.c ++++ a/ffmpeg.c @@ -23,6 +23,11 @@ * multimedia converter based on the FFmpeg libraries */ @@ -26,7 +97,7 @@ index 9ffd833..7a86d7e 100644 #include "config.h" #include #include -@@ -66,6 +71,25 @@ +@@ -68,6 +73,25 @@ # include "libavfilter/buffersrc.h" # include "libavfilter/buffersink.h" @@ -52,7 +123,7 @@ index 9ffd833..7a86d7e 100644 #if HAVE_SYS_RESOURCE_H #include #include -@@ -158,6 +182,169 @@ static int restore_tty; +@@ -164,6 +188,174 @@ static int restore_tty; static void free_input_threads(void); #endif @@ -168,11 +239,16 @@ index 9ffd833..7a86d7e 100644 +#ifdef RPI_ZERO_COPY +{ + const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1); ++ if (fr_buf == NULL) { ++ mmal_buffer_header_release(buf); ++ return; ++ } + + buf->user_data = fr_buf; + buf->data = av_rpi_zc_vc_handle(fr_buf); -+ buf->alloc_size = -+ buf->length = av_rpi_zc_numbytes(fr_buf); ++ buf->offset = av_rpi_zc_offset(fr_buf); ++ buf->length = av_rpi_zc_length(fr_buf); ++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); + + ++rpi_display_count; +} @@ -222,7 +298,7 @@ index 9ffd833..7a86d7e 100644 /* sub2video hack: Convert subtitles to video with alpha to insert them in filter graphs. This is a temporary solution until libavfilter gets real subtitles support. 
-@@ -540,6 +727,11 @@ static void ffmpeg_cleanup(int ret) +@@ -575,6 +767,11 @@ static void ffmpeg_cleanup(int ret) avformat_close_input(&input_files[i]->ctx); av_freep(&input_files[i]); } @@ -234,9 +310,9 @@ index 9ffd833..7a86d7e 100644 for (i = 0; i < nb_input_streams; i++) { InputStream *ist = input_streams[i]; -@@ -551,6 +743,9 @@ static void ffmpeg_cleanup(int ret) - av_freep(&ist->filters); +@@ -587,6 +784,9 @@ static void ffmpeg_cleanup(int ret) av_freep(&ist->hwaccel_device); + av_freep(&ist->dts_buffer); +#ifdef RPI_ZERO_COPY + av_rpi_zc_uninit(ist->dec_ctx); @@ -244,7 +320,7 @@ index 9ffd833..7a86d7e 100644 avcodec_free_context(&ist->dec_ctx); av_freep(&input_streams[i]); -@@ -581,6 +776,7 @@ static void ffmpeg_cleanup(int ret) +@@ -617,6 +817,7 @@ static void ffmpeg_cleanup(int ret) } term_exit(); ffmpeg_exited = 1; @@ -252,7 +328,7 @@ index 9ffd833..7a86d7e 100644 } void remove_avoptions(AVDictionary **a, AVDictionary *b) -@@ -944,6 +1140,15 @@ static void do_video_out(AVFormatContext *s, +@@ -1050,6 +1251,15 @@ static void do_video_out(OutputFile *of, if (ost->source_index >= 0) ist = input_streams[ost->source_index]; @@ -265,10 +341,10 @@ index 9ffd833..7a86d7e 100644 + } +#endif + - if (filter->inputs[0]->frame_rate.num > 0 && - filter->inputs[0]->frame_rate.den > 0) - duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base)); -@@ -2549,6 +2754,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) + frame_rate = av_buffersink_get_frame_rate(filter); + if (frame_rate.num > 0 && frame_rate.den > 0) + duration = 1/(av_q2d(frame_rate) * av_q2d(enc->time_base)); +@@ -2873,6 +3083,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ist->dec_ctx->opaque = ist; ist->dec_ctx->get_format = get_format; ist->dec_ctx->get_buffer2 = get_buffer; @@ -281,10 +357,10 @@ index 9ffd833..7a86d7e 100644 ist->dec_ctx->thread_safe_callbacks = 1; av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0); 
-diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index fd0d1f0..40d22d2 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile +diff --git b/libavcodec/Makefile a/libavcodec/Makefile +index 0dd0c7b..d2eb014 100644 +--- b/libavcodec/Makefile ++++ a/libavcodec/Makefile @@ -5,6 +5,11 @@ NAME = avcodec HEADERS = avcodec.h \ avdct.h \ @@ -297,7 +373,7 @@ index fd0d1f0..40d22d2 100644 d3d11va.h \ dirac.h \ dv_profile.h \ -@@ -43,6 +48,10 @@ OBJS = allcodecs.o \ +@@ -47,6 +52,10 @@ OBJS = allcodecs.o \ resample.o \ resample2.o \ utils.o \ @@ -308,23 +384,37 @@ index fd0d1f0..40d22d2 100644 vorbis_parser.o \ xiph.o \ -@@ -1078,3 +1087,11 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h +@@ -973,8 +982,7 @@ OBJS-$(CONFIG_AAC_ADTSTOASC_BSF) += aac_adtstoasc_bsf.o aacadtsdec.o \ + OBJS-$(CONFIG_CHOMP_BSF) += chomp_bsf.o + OBJS-$(CONFIG_DUMP_EXTRADATA_BSF) += dump_extradata_bsf.o + OBJS-$(CONFIG_DCA_CORE_BSF) += dca_core_bsf.o +-OBJS-$(CONFIG_EXTRACT_EXTRADATA_BSF) += extract_extradata_bsf.o \ +- h2645_parse.o ++OBJS-$(CONFIG_EXTRACT_EXTRADATA_BSF) += extract_extradata_bsf.o + OBJS-$(CONFIG_H264_MP4TOANNEXB_BSF) += h264_mp4toannexb_bsf.o + OBJS-$(CONFIG_HEVC_MP4TOANNEXB_BSF) += hevc_mp4toannexb_bsf.o + OBJS-$(CONFIG_IMX_DUMP_HEADER_BSF) += imx_dump_header_bsf.o +@@ -1103,3 +1111,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h endif + ++QASM := $(SUBDIR)../pi-util/qasm.py ++ ++ifneq ("$(wildcard $(QASM))","") +$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm -+ python $(SUBDIR)../pi-util/qasm.py -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ + +$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm -+ python $(SUBDIR)../pi-util/qasm.py -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ ++endif + +$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h -diff --git 
a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c -index 54efaad..02a89c3 100644 ---- a/libavcodec/allcodecs.c -+++ b/libavcodec/allcodecs.c -@@ -667,6 +667,7 @@ void avcodec_register_all(void) +diff --git b/libavcodec/allcodecs.c a/libavcodec/allcodecs.c +index 4df4772..ca05158 100644 +--- b/libavcodec/allcodecs.c ++++ a/libavcodec/allcodecs.c +@@ -696,6 +696,7 @@ static void register_all(void) REGISTER_PARSER(H261, h261); REGISTER_PARSER(H263, h263); REGISTER_PARSER(H264, h264); @@ -332,11 +422,11 @@ index 54efaad..02a89c3 100644 REGISTER_PARSER(HEVC, hevc); REGISTER_PARSER(MJPEG, mjpeg); REGISTER_PARSER(MLP, mlp); -diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index a4ceca7..1354c14 100644 ---- a/libavcodec/arm/Makefile -+++ b/libavcodec/arm/Makefile -@@ -132,8 +132,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o +diff --git b/libavcodec/arm/Makefile a/libavcodec/arm/Makefile +index 1eeac54..f96f93b 100644 +--- b/libavcodec/arm/Makefile ++++ a/libavcodec/arm/Makefile +@@ -135,8 +135,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ arm/hevcdsp_deblock_neon.o \ @@ -348,10 +438,10 @@ index a4ceca7..1354c14 100644 NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o -diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h +diff --git b/libavcodec/arm/cabac.h a/libavcodec/arm/cabac.h index fdbf86b..0a3980a 100644 ---- a/libavcodec/arm/cabac.h -+++ b/libavcodec/arm/cabac.h +--- b/libavcodec/arm/cabac.h ++++ a/libavcodec/arm/cabac.h @@ -26,13 +26,34 @@ #include "libavutil/internal.h" #include "libavcodec/cabac.h" @@ -530,11 +620,11 @@ index fdbf86b..0a3980a 100644 #endif /* HAVE_ARMV6T2_INLINE */ #endif /* AVCODEC_ARM_CABAC_H */ -diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h +diff --git 
b/libavcodec/arm/hevc_cabac.h a/libavcodec/arm/hevc_cabac.h new file mode 100644 index 0000000..31d3c59 --- /dev/null -+++ b/libavcodec/arm/hevc_cabac.h ++++ a/libavcodec/arm/hevc_cabac.h @@ -0,0 +1,491 @@ +/* + * This file is part of FFmpeg. @@ -1027,10 +1117,10 @@ index 0000000..31d3c59 +#endif /* HAVE_ARMV6T2_INLINE */ + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ -diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S +diff --git b/libavcodec/arm/hevcdsp_deblock_neon.S a/libavcodec/arm/hevcdsp_deblock_neon.S index 166bddb..a088cc3 100644 ---- a/libavcodec/arm/hevcdsp_deblock_neon.S -+++ b/libavcodec/arm/hevcdsp_deblock_neon.S +--- b/libavcodec/arm/hevcdsp_deblock_neon.S ++++ a/libavcodec/arm/hevcdsp_deblock_neon.S @@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1 vst1.8 {d4}, [r0] bx lr @@ -1159,11 +1249,11 @@ index 166bddb..a088cc3 100644 +90: mov a3, #1 + b 11b +endfunc -diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S +diff --git b/libavcodec/arm/hevcdsp_epel_neon.S a/libavcodec/arm/hevcdsp_epel_neon.S new file mode 100644 index 0000000..00eab9e --- /dev/null -+++ b/libavcodec/arm/hevcdsp_epel_neon.S ++++ a/libavcodec/arm/hevcdsp_epel_neon.S @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi @@ -1502,10 +1592,10 @@ index 0000000..00eab9e + .byte 4, 28, 46, 6 + .byte 2, 16, 54, 4 + .byte 2, 10, 58, 2 -diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c -index 5591807..49c70dd 100644 ---- a/libavcodec/arm/hevcdsp_init_neon.c -+++ b/libavcodec/arm/hevcdsp_init_neon.c +diff --git b/libavcodec/arm/hevcdsp_init_neon.c a/libavcodec/arm/hevcdsp_init_neon.c +index 1a3912c..5c72e1d 100644 +--- b/libavcodec/arm/hevcdsp_init_neon.c ++++ a/libavcodec/arm/hevcdsp_init_neon.c @@ -22,6 +22,8 @@ #include "libavutil/arm/cpu.h" #include "libavcodec/hevcdsp.h" @@ -1515,9 +1605,9 @@ index 5591807..49c70dd 100644 void 
ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -@@ -43,6 +45,21 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, - void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); +@@ -43,6 +45,21 @@ void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, + void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); +void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); @@ -1687,9 +1777,9 @@ index 5591807..49c70dd 100644 { if (bit_depth == 8) { @@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) - c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; - c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; - c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; + c->add_residual[2] = ff_hevc_add_residual_16x16_neon_8; + c->add_residual[3] = ff_hevc_add_residual_32x32_neon_8; + c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; + for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) { + c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper; + c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper; @@ -1729,11 +1819,11 @@ index 5591807..49c70dd 100644 + assert(offsetof(MvField, pred_flag) == 10); + c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon; } -diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S +diff --git b/libavcodec/arm/hevcdsp_sao_neon.S a/libavcodec/arm/hevcdsp_sao_neon.S new file mode 100644 
index 0000000..9c7808d --- /dev/null -+++ b/libavcodec/arm/hevcdsp_sao_neon.S ++++ a/libavcodec/arm/hevcdsp_sao_neon.S @@ -0,0 +1,510 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi @@ -2245,28 +2335,28 @@ index 0000000..9c7808d + bx lr +endfunc + -diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h -index 39713ed..25eb52b 100644 ---- a/libavcodec/avcodec.h -+++ b/libavcodec/avcodec.h -@@ -410,6 +410,8 @@ enum AVCodecID { - AV_CODEC_ID_SHEERVIDEO, - AV_CODEC_ID_YLC, +diff --git b/libavcodec/avcodec.h a/libavcodec/avcodec.h +index d780477..5807e1b 100644 +--- b/libavcodec/avcodec.h ++++ a/libavcodec/avcodec.h +@@ -443,6 +443,8 @@ enum AVCodecID { + AV_CODEC_ID_XPM, + AV_CODEC_ID_AV1, + AV_CODEC_ID_H264_MVC, + /* various PCM "codecs" */ AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs AV_CODEC_ID_PCM_S16LE = 0x10000, -@@ -2850,6 +2852,7 @@ typedef struct AVCodecContext { - #define FF_BUG_DC_CLIP 4096 +@@ -2925,6 +2927,7 @@ typedef struct AVCodecContext { #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders. #define FF_BUG_TRUNCATED 16384 -+#define FF_BUG_GMC_UNSUPPORTED 32768 + #define FF_BUG_IEDGE 32768 ++#define FF_BUG_GMC_UNSUPPORTED (1 << 16) /** * strictly follow the standard (MPEG-4, ...). -@@ -3195,6 +3198,9 @@ typedef struct AVCodecContext { +@@ -3276,6 +3279,9 @@ typedef struct AVCodecContext { #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244 #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA) #define FF_PROFILE_H264_CAVLC_444 44 @@ -2276,23 +2366,25 @@ index 39713ed..25eb52b 100644 #define FF_PROFILE_VC1_SIMPLE 0 #define FF_PROFILE_VC1_MAIN 1 -@@ -3505,6 +3511,12 @@ typedef struct AVCodecContext { - #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1 +@@ -3586,7 +3592,13 @@ typedef struct AVCodecContext { #endif -+ /** + /** +- * Audio only. 
The amount of padding (in samples) appended by the encoder to + * Opaque pointer for use by replacement get_buffer2 code + * + * @author jc (08/02/2016) + */ + void * get_buffer_context; - } AVCodecContext; - - AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx); -diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h ++ ++ /* Audio only. The amount of padding (in samples) appended by the encoder to + * the end of the audio. I.e. this number of decoded samples must be + * discarded by the caller from the end of the stream to get the original + * audio without any trailing padding. +diff --git b/libavcodec/cabac.h a/libavcodec/cabac.h index 1bf1c62..ccfa991 100644 ---- a/libavcodec/cabac.h -+++ b/libavcodec/cabac.h +--- b/libavcodec/cabac.h ++++ a/libavcodec/cabac.h @@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; typedef struct CABACContext{ int low; @@ -2309,15 +2401,50 @@ index 1bf1c62..ccfa991 100644 const uint8_t *bytestream_start; const uint8_t *bytestream; const uint8_t *bytestream_end; -diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c -index 9d94b72..535ebf0 100644 ---- a/libavcodec/codec_desc.c -+++ b/libavcodec/codec_desc.c -@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = { - .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"), - .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, +diff --git b/libavcodec/codec_desc.c a/libavcodec/codec_desc.c +index 9711019..9f99a2c 100644 +--- b/libavcodec/codec_desc.c ++++ a/libavcodec/codec_desc.c +@@ -1622,6 +1622,48 @@ static const AVCodecDescriptor codec_descriptors[] = { + .props = AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/png"), }, + { ++ .id = AV_CODEC_ID_CFHD, ++ .type = AVMEDIA_TYPE_VIDEO, ++ .name = "cfhd", ++ .long_name = NULL_IF_CONFIG_SMALL("Cineform HD"), ++ .props = AV_CODEC_PROP_LOSSY, ++ }, ++ { ++ .id = AV_CODEC_ID_TRUEMOTION2RT, ++ .type = AVMEDIA_TYPE_VIDEO, ++ .name = "truemotion2rt", ++ 
.long_name = NULL_IF_CONFIG_SMALL("Duck TrueMotion 2.0 Real Time"), ++ .props = AV_CODEC_PROP_LOSSY, ++ }, ++ { ++ .id = AV_CODEC_ID_MAGICYUV, ++ .type = AVMEDIA_TYPE_VIDEO, ++ .name = "magicyuv", ++ .long_name = NULL_IF_CONFIG_SMALL("MagicYUV Lossless Video"), ++ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, ++ }, ++ { ++ .id = AV_CODEC_ID_SHEERVIDEO, ++ .type = AVMEDIA_TYPE_VIDEO, ++ .name = "sheervideo", ++ .long_name = NULL_IF_CONFIG_SMALL("BitJazz SheerVideo"), ++ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, ++ }, ++ { ++ .id = AV_CODEC_ID_YLC, ++ .type = AVMEDIA_TYPE_VIDEO, ++ .name = "ylc", ++ .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"), ++ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, ++ }, ++ { + .id = AV_CODEC_ID_H264_MVC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "h264_mvc", @@ -2327,50 +2454,103 @@ index 9d94b72..535ebf0 100644 /* various PCM "codecs" */ { -diff --git a/libavcodec/h264.h b/libavcodec/h264.h -index efe3555..16358aa 100644 ---- a/libavcodec/h264.h -+++ b/libavcodec/h264.h -@@ -126,7 +126,9 @@ enum { - NAL_END_STREAM = 11, - NAL_FILLER_DATA = 12, - NAL_SPS_EXT = 13, -+ NAL_SPS_SUBSET = 15, - NAL_AUXILIARY_SLICE = 19, -+ NAL_SLICE_EXT = 20, - NAL_FF_IGNORE = 0xff0f001, +diff --git b/libavcodec/dvdsubdec.c a/libavcodec/dvdsubdec.c +index 4e9c058..22ce728 100644 +--- b/libavcodec/dvdsubdec.c ++++ a/libavcodec/dvdsubdec.c +@@ -189,12 +189,12 @@ static void guess_palette(DVDSubContext* ctx, + r = (((subtitle_color >> 16) & 0xff) * level) >> 8; + g = (((subtitle_color >> 8) & 0xff) * level) >> 8; + b = (((subtitle_color >> 0) & 0xff) * level) >> 8; +- rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17U) << 24); ++ rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17) << 24); + color_used[colormap[i]] = (i + 1); + j++; + } else { + rgba_palette[i] = (rgba_palette[color_used[colormap[i]] - 1] & 0x00ffffff) | +- ((alpha[i] * 17U) << 24); ++ ((alpha[i] * 17) << 24); + } + } + } 
+diff --git b/libavcodec/h264.h a/libavcodec/h264.h +index 86df5eb..22c4f1d 100644 +--- b/libavcodec/h264.h ++++ a/libavcodec/h264.h +@@ -41,7 +41,9 @@ enum { + H264_NAL_END_STREAM = 11, + H264_NAL_FILLER_DATA = 12, + H264_NAL_SPS_EXT = 13, ++ H264_NAL_SPS_SUBSET = 15, + H264_NAL_AUXILIARY_SLICE = 19, ++ H264_NAL_SLICE_EXT = 20, }; -diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c -index ce4bab2..b9b0c78 100644 ---- a/libavcodec/h264_parser.c -+++ b/libavcodec/h264_parser.c -@@ -58,6 +58,8 @@ typedef struct H264ParseContext { + #endif /* AVCODEC_H264_H */ +diff --git b/libavcodec/h264_parse.c a/libavcodec/h264_parse.c +index ea202e7..0c87319 100644 +--- b/libavcodec/h264_parse.c ++++ a/libavcodec/h264_parse.c +@@ -59,9 +59,6 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps, + if (luma_weight_flag) { + pwt->luma_weight[i][list][0] = get_se_golomb(gb); + pwt->luma_weight[i][list][1] = get_se_golomb(gb); +- if ((int8_t)pwt->luma_weight[i][list][0] != pwt->luma_weight[i][list][0] || +- (int8_t)pwt->luma_weight[i][list][1] != pwt->luma_weight[i][list][1]) +- goto out_range_weight; + if (pwt->luma_weight[i][list][0] != luma_def || + pwt->luma_weight[i][list][1] != 0) { + pwt->use_weight = 1; +@@ -79,9 +76,6 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps, + for (j = 0; j < 2; j++) { + pwt->chroma_weight[i][list][j][0] = get_se_golomb(gb); + pwt->chroma_weight[i][list][j][1] = get_se_golomb(gb); +- if ((int8_t)pwt->chroma_weight[i][list][j][0] != pwt->chroma_weight[i][list][j][0] || +- (int8_t)pwt->chroma_weight[i][list][j][1] != pwt->chroma_weight[i][list][j][1]) +- goto out_range_weight; + if (pwt->chroma_weight[i][list][j][0] != chroma_def || + pwt->chroma_weight[i][list][j][1] != 0) { + pwt->use_weight_chroma = 1; +@@ -110,9 +104,6 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps, + } + pwt->use_weight = pwt->use_weight || pwt->use_weight_chroma; + return 0; +-out_range_weight: +- 
avpriv_request_sample(logctx, "Out of range weight\n"); +- return AVERROR_INVALIDDATA; + } + + /** +diff --git b/libavcodec/h264_parser.c a/libavcodec/h264_parser.c +index bc35a61..055828c 100644 +--- b/libavcodec/h264_parser.c ++++ a/libavcodec/h264_parser.c +@@ -60,6 +60,8 @@ typedef struct H264ParseContext { uint8_t parse_history[6]; int parse_history_count; int parse_last_mb; + int is_mvc; + int slice_ext; + int64_t reference_dts; + int last_frame_num, last_picture_structure; } H264ParseContext; - - -@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf, +@@ -109,24 +111,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf, } else if (state <= 5) { int nalu_type = buf[i] & 0x1F; - if (nalu_type == NAL_SEI || nalu_type == NAL_SPS || -- nalu_type == NAL_PPS || nalu_type == NAL_AUD) { -+ nalu_type == NAL_PPS || nalu_type == NAL_AUD || -+ nalu_type == NAL_SPS_SUBSET) { + if (nalu_type == H264_NAL_SEI || nalu_type == H264_NAL_SPS || +- nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD) { ++ nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD || ++ nalu_type == H264_NAL_SPS_SUBSET) { if (pc->frame_start_found) { i++; goto found; } - } else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA || -- nalu_type == NAL_IDR_SLICE) { -+ nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) { + } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA || +- nalu_type == H264_NAL_IDR_SLICE) { ++ nalu_type == H264_NAL_IDR_SLICE || (p->is_mvc && nalu_type == H264_NAL_SLICE_EXT)) { state += 8; + -+ p->slice_ext = (nalu_type == NAL_SLICE_EXT); ++ p->slice_ext = (nalu_type == H264_NAL_SLICE_EXT); continue; } state = 7; @@ -2386,7 +2566,7 @@ index ce4bab2..b9b0c78 100644 p->parse_history_count = 0; mb= get_ue_golomb_long(&gb); p->parse_last_mb = mb; -@@ -145,7 +150,7 @@ found: +@@ -149,7 +154,7 @@ found: pc->frame_start_found = 0; if (p->is_avc) return next_avc; @@ -2395,7 +2575,7 @@ 
index ce4bab2..b9b0c78 100644 } static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb, -@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s, +@@ -594,7 +599,8 @@ static int h264_parse(AVCodecParserContext *s, } } @@ -2405,16 +2585,16 @@ index ce4bab2..b9b0c78 100644 if (avctx->framerate.num) avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1})); -@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx, +@@ -651,7 +657,7 @@ static int h264_split(AVCodecContext *avctx, if ((state & 0xFFFFFF00) != 0x100) break; nalu_type = state & 0x1F; -- if (nalu_type == NAL_SPS) { -+ if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) { +- if (nalu_type == H264_NAL_SPS) { ++ if (nalu_type == H264_NAL_SPS || nalu_type == H264_NAL_SPS_SUBSET) { has_sps = 1; - } else if (nalu_type == NAL_PPS) + } else if (nalu_type == H264_NAL_PPS) has_pps = 1; -@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = { +@@ -703,3 +709,23 @@ AVCodecParser ff_h264_parser = { .parser_close = h264_close, .split = h264_split, }; @@ -2438,2373 +2618,69 @@ index ce4bab2..b9b0c78 100644 + .parser_close = h264_close, + .split = h264_split, +}; -diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c -index b478065..88dd40b 100644 ---- a/libavcodec/hevc.c -+++ b/libavcodec/hevc.c -@@ -41,8 +41,186 @@ - #include "hevc.h" - #include "profiles.h" - -+#ifdef RPI -+ #include "rpi_qpu.h" -+ #include "rpi_user_vcsm.h" -+ // Move Inter prediction into separate pass -+ #define RPI_INTER -+ -+ #ifdef RPI_INTER_QPU -+ // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU -+ #define RPI_MULTI_MAILBOX -+ #endif -+ -+ // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory -+ // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined. 
-+ -+ // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*) -+ //#define RPI_SIMULATE_QPUS -+ #ifdef RPI_WORKER -+ #include "pthread.h" -+ #endif -+ -+ static void rpi_execute_dblk_cmds(HEVCContext *s); -+ static void rpi_execute_transform(HEVCContext *s); -+ static void rpi_launch_vpu_qpu(HEVCContext *s); -+ static void rpi_execute_pred_cmds(HEVCContext *s); -+ static void rpi_execute_inter_cmds(HEVCContext *s); -+ static void rpi_begin(HEVCContext *s); -+ static void flush_frame(HEVCContext *s,AVFrame *frame); -+ static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job); -+ -+#endif -+ -+// #define DISABLE_MC -+ -+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) -+ -+#ifndef av_mod_uintp2 -+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) -+{ -+ return a & ((1 << p) - 1); -+} -+# define av_mod_uintp2 av_mod_uintp2_c -+#endif -+ - const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; - -+ -+#ifdef RPI_INTER_QPU -+ -+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks -+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks -+// For each block of 64*64 the smallest block size is 8x4 -+// We also need an extra command for the setup information -+ -+#define RPI_CHROMA_COMMAND_WORDS 12 -+#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS) -+// The QPU code for UV blocks only works up to a block width of 8 -+#define RPI_CHROMA_BLOCK_WIDTH 8 -+ -+#define RPI_LUMA_COMMAND_WORDS 10 -+#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS) -+ -+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) -+ -+// TODO Chroma only needs 4 taps -+ -+// Actual filter goes -ve, +ve, +ve, 
-ve using these values -+static const uint32_t rpi_filter_coefs[8][1] = { -+ { ENCODE_COEFFS( 0, 64, 0, 0) }, -+ { ENCODE_COEFFS( 2, 58, 10, 2) }, -+ { ENCODE_COEFFS( 4, 54, 16, 2) }, -+ { ENCODE_COEFFS( 6, 46, 28, 4) }, -+ { ENCODE_COEFFS( 4, 36, 36, 4) }, -+ { ENCODE_COEFFS( 4, 28, 46, 6) }, -+ { ENCODE_COEFFS( 2, 16, 54, 4) }, -+ { ENCODE_COEFFS( 2, 10, 58, 2) } -+}; -+ -+#endif -+ -+ -+#ifdef RPI_WORKER -+ -+//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); -+//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); -+ -+#define LOG_ENTER -+#define LOG_EXIT -+ -+// Call this when we have completed pass0 and wish to trigger pass1 for the current job -+static void worker_submit_job(HEVCContext *s) -+{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ s->worker_tail++; -+ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot -+ pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT -+} -+ -+// Call this to say we have completed pass1 -+static void worker_complete_job(HEVCContext *s) -+{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ s->worker_head++; -+ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot -+ pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT -+} -+ -+// Call this to wait for all jobs to have completed at the end of a frame -+static void worker_wait(HEVCContext *s) -+{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ while( s->worker_head !=s->worker_tail) -+ { -+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); -+ } -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT -+} -+ -+// Call worker_pass0_ready to wait until the 
s->pass0_job slot becomes -+// available to receive the next job. -+static void worker_pass0_ready(HEVCContext *s) -+{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ // tail is number of submitted jobs -+ // head is number of completed jobs -+ // tail-head is number of outstanding jobs in the queue -+ // we need to ensure there is at least 1 space left for us to use -+ while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS) -+ { -+ // Wait until another job is completed -+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); -+ } -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT -+} -+ -+static void *worker_start(void *arg) -+{ -+ HEVCContext *s = (HEVCContext *)arg; -+ while(1) { -+ pthread_mutex_lock(&s->worker_mutex); -+ -+ while( !s->kill_worker && s->worker_tail - s->worker_head <= 0) -+ { -+ pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex); -+ } -+ pthread_mutex_unlock(&s->worker_mutex); -+ -+ if (s->kill_worker) { -+ break; -+ } -+ LOG_ENTER -+ // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10); -+ rpi_launch_vpu_qpu(s); -+ // Perform inter prediction -+ rpi_execute_inter_cmds(s); -+ // Wait for transform completion -+ vpu_wait(s->vpu_id); -+ -+ // Perform intra prediction and residual reconstruction -+ rpi_execute_pred_cmds(s); -+ // Perform deblocking for CTBs in this row -+ rpi_execute_dblk_cmds(s); -+ -+ worker_complete_job(s); -+ LOG_EXIT -+ } -+ return NULL; -+} -+ -+#endif -+ - /** - * NOTE: Each function hls_foo correspond to the function foo in the - * specification (HLS stands for High Level Syntax). 
-@@ -55,6 +233,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 - /* free everything allocated by pic_arrays_init() */ - static void pic_arrays_free(HEVCContext *s) - { -+#ifdef RPI -+ int job; -+ for(job=0;jobcoeffs_buf_arm[job][0]) { -+ gpu_free(&s->coeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = 0; -+ } -+ if (s->coeffs_buf_arm[job][2]) { -+ gpu_free(&s->coeffs_buf_accelerated[job]); -+ s->coeffs_buf_arm[job][2] = 0; -+ } -+ } -+#endif -+#ifdef RPI_DEBLOCK_VPU -+ { -+ int i; -+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) { -+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; -+ -+ if (dvq->vpu_cmds_arm) { -+ gpu_free(&dvq->deblock_vpu_gmem); -+ dvq->vpu_cmds_arm = 0; -+ } -+ } -+ } -+#endif - av_freep(&s->sao); - av_freep(&s->deblock); - -@@ -91,6 +295,87 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) - int ctb_count = sps->ctb_width * sps->ctb_height; - int min_pu_size = sps->min_pu_width * sps->min_pu_height; - -+#ifdef RPI -+ int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size); -+ int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS; -+ int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; -+ int coefs_per_row = coefs_per_luma + coefs_per_chroma; -+ int job; -+ -+ av_assert0(sps); -+ s->max_ctu_count = coefs_per_luma / coefs_in_ctb; -+ s->ctu_per_y_chan = s->max_ctu_count / 12; -+ s->ctu_per_uv_chan = s->max_ctu_count / 8; -+ for(job=0;jobcoeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; -+ if (!s->coeffs_buf_arm[job][0]) -+ goto fail; -+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data -+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; -+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; -+ if (!s->coeffs_buf_arm[job][2]) -+ goto fail; -+ 
s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. -+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; -+ } -+ } -+#endif -+#ifdef RPI_DEBLOCK_VPU -+ { -+ int i; -+ s->enable_rpi_deblock = !sps->sao_enabled; -+ s->setup_width = (sps->width+15) / 16; -+ s->setup_height = (sps->height+15) / 16; -+ s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16; -+ s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16; -+ -+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) -+ { -+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; -+ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15; -+ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15; -+ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15; -+ const unsigned int total_size =- cmd_size + y_size + uv_size; -+ int p_vc; -+ uint8_t * p_arm; -+ #if RPI_VPU_DEBLOCK_CACHED -+ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem); -+ #else -+ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem); -+ #endif -+ p_vc = dvq->deblock_vpu_gmem.vc; -+ p_arm = dvq->deblock_vpu_gmem.arm; -+ -+ // Zap all -+ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes); -+ -+ // Subdivide -+ dvq->vpu_cmds_arm = (void*)p_arm; -+ dvq->vpu_cmds_vc = p_vc; -+ -+ p_arm += cmd_size; -+ p_vc += cmd_size; -+ -+ dvq->y_setup_arm = (void*)p_arm; -+ dvq->y_setup_vc = (void*)p_vc; -+ -+ p_arm += y_size; -+ p_vc += y_size; -+ -+ dvq->uv_setup_arm = (void*)p_arm; -+ dvq->uv_setup_vc = (void*)p_vc; -+ -+ dvq->cmd_id = -1; -+ } -+ -+ s->dvq_n = 0; -+ s->dvq = s->dvq_ents + s->dvq_n; -+ } -+#endif -+ - s->bs_width = (width >> 2) + 1; - s->bs_height = (height >> 2) + 1; - -@@ -137,6 +422,29 @@ fail: - return AVERROR(ENOMEM); - } - -+static void default_pred_weight_table(HEVCContext 
* const s) -+{ -+ unsigned int i; -+ s->sh.luma_log2_weight_denom = 0; -+ s->sh.chroma_log2_weight_denom = 0; -+ for (i = 0; i < s->sh.nb_refs[L0]; i++) { -+ s->sh.luma_weight_l0[i] = 1; -+ s->sh.luma_offset_l0[i] = 0; -+ s->sh.chroma_weight_l0[i][0] = 1; -+ s->sh.chroma_offset_l0[i][0] = 0; -+ s->sh.chroma_weight_l0[i][1] = 1; -+ s->sh.chroma_offset_l0[i][1] = 0; -+ } -+ for (i = 0; i < s->sh.nb_refs[L1]; i++) { -+ s->sh.luma_weight_l1[i] = 1; -+ s->sh.luma_offset_l1[i] = 0; -+ s->sh.chroma_weight_l1[i][0] = 1; -+ s->sh.chroma_offset_l1[i][0] = 0; -+ s->sh.chroma_weight_l1[i][1] = 1; -+ s->sh.chroma_offset_l1[i][1] = 0; -+ } -+} -+ - static void pred_weight_table(HEVCContext *s, GetBitContext *gb) - { - int i = 0; -@@ -674,6 +982,11 @@ static int hls_slice_header(HEVCContext *s) - (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) { - pred_weight_table(s, gb); - } -+ else -+ { -+ // Give us unit weights -+ default_pred_weight_table(s); -+ } - - sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); - if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { -@@ -931,6 +1244,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { - return 0; - } - -+#ifdef RPI -+static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) -+{ -+ if (s->enable_rpi) { -+ HEVCLocalContext *lc = s->HEVClc; -+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; -+ cmd->type = RPI_PRED_INTRA; -+ cmd->size = log2_trafo_size; -+ cmd->c_idx = c_idx; -+ cmd->x = x0; -+ cmd->y = y0; -+ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; -+ cmd->mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; -+ } else { -+ s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx); -+ } -+} -+#endif -+ - static int hls_transform_unit(HEVCContext *s, int x0, int y0, - int xBase, int yBase, int cb_xBase, int cb_yBase, - int log2_cb_size, int log2_trafo_size, -@@ -943,8 +1275,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - if (lc->cu.pred_mode == MODE_INTRA) { - int trafo_size = 1 << log2_trafo_size; - ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); -- -+#ifdef RPI -+ rpi_intra_pred(s, log2_trafo_size, x0, y0, 0); -+#else - s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0); -+#endif +diff --git b/libavcodec/h264_slice.c a/libavcodec/h264_slice.c +index 44a0b9f..fa1e9ae 100644 +--- b/libavcodec/h264_slice.c ++++ a/libavcodec/h264_slice.c +@@ -1778,12 +1778,9 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, } + if ((pps->weighted_pred && sl->slice_type_nos == AV_PICTURE_TYPE_P) || + (pps->weighted_bipred_idc == 1 && +- sl->slice_type_nos == AV_PICTURE_TYPE_B)) { +- ret = ff_h264_pred_weight_table(&sl->gb, sps, sl->ref_count, ++ sl->slice_type_nos == AV_PICTURE_TYPE_B)) ++ ff_h264_pred_weight_table(&sl->gb, sps, sl->ref_count, + sl->slice_type_nos, &sl->pwt, h->avctx); +- if (ret < 0) +- return ret; +- } - if (cbf_luma || cbf_cb[0] || cbf_cr[0] || -@@ -1030,7 +1365,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 
2 : 1); i++) { - if (lc->cu.pred_mode == MODE_INTRA) { - ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); -+#ifdef RPI -+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1); -+#else - s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1); -+#endif - } - if (cbf_cb[i]) - ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1059,7 +1398,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { - if (lc->cu.pred_mode == MODE_INTRA) { - ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); -+#ifdef RPI -+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2); -+#else - s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2); -+#endif - } - if (cbf_cr[i]) - ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1088,7 +1431,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - if (lc->cu.pred_mode == MODE_INTRA) { - ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), - trafo_size_h, trafo_size_v); -+#ifdef RPI -+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1); -+#else - s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1); -+#endif - } - if (cbf_cb[i]) - ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1098,7 +1445,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - if (lc->cu.pred_mode == MODE_INTRA) { - ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), - trafo_size_h, trafo_size_v); -+#ifdef RPI -+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2); -+#else - s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2); -+#endif - } - if (cbf_cr[i]) - 
ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1110,26 +1461,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); - int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); - ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); -+#ifdef RPI -+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1); -+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2); -+#else - s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1); - s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2); -+#endif - if (s->ps.sps->chroma_format_idc == 2) { - ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c), - trafo_size_h, trafo_size_v); -+#ifdef RPI -+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1); -+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2); -+#else - s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1); - s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2); -+#endif - } - } else if (blk_idx == 3) { - int trafo_size_h = 1 << (log2_trafo_size + 1); - int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]); - ff_hevc_set_neighbour_available(s, xBase, yBase, - trafo_size_h, trafo_size_v); -+#ifdef RPI -+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1); -+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2); -+#else - s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1); - s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2); -+#endif - if (s->ps.sps->chroma_format_idc == 2) { - ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)), - trafo_size_h, trafo_size_v); -+#ifdef RPI -+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1); -+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2); -+#else - s->hpc.intra_pred[log2_trafo_size - 
2](s, xBase, yBase + (1 << (log2_trafo_size)), 1); - s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2); -+#endif - } - } - } -@@ -1332,6 +1703,93 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) - * @param luma_offset additive offset applied to the luma prediction value - */ - -+#ifdef RPI_INTER -+#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn) -+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+ AVFrame *ref, const Mv *mv, int x_off, int y_off, -+ int block_w, int block_h, int luma_weight, int luma_offset) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_LUMA_UNI; -+ cmd->dst = dst; -+ cmd->dststride = dststride; -+ cmd->src = ref->data[0]; -+ cmd->srcstride = ref->linesize[0]; -+ cmd->mv = *mv; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->weight = luma_weight; -+ cmd->offset = luma_offset; -+} -+ -+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+ AVFrame *ref0, const Mv *mv0, int x_off, int y_off, -+ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_LUMA_BI; -+ cmd->dst = dst; -+ cmd->dststride = dststride; -+ cmd->src = ref0->data[0]; -+ cmd->srcstride = ref0->linesize[0]; -+ cmd->mv = *mv0; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->src1 = ref1->data[0]; -+ cmd->srcstride1 = ref1->linesize[0]; -+ cmd->mv1 = *mv1; -+ cmd->ref_idx[0] = current_mv->ref_idx[0]; -+ cmd->ref_idx[1] = current_mv->ref_idx[1]; -+} -+ -+static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0, -+ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist, -+ int x_off, int y_off, int block_w, int block_h, 
struct MvField *current_mv, int chroma_weight, int chroma_offset) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_CHROMA_UNI; -+ cmd->dst = dst0; -+ cmd->dststride = dststride; -+ cmd->src = src0; -+ cmd->srcstride = srcstride; -+ cmd->mv = current_mv->mv[reflist]; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->weight = chroma_weight; -+ cmd->offset = chroma_offset; -+} -+ -+static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, -+ int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_CHROMA_BI+cidx; -+ cmd->dst = dst0; -+ cmd->dststride = dststride; -+ cmd->src = ref0->data[cidx+1]; -+ cmd->srcstride = ref0->linesize[cidx+1]; -+ cmd->mv = current_mv->mv[0]; -+ cmd->mv1 = current_mv->mv[1]; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->src1 = ref1->data[cidx+1]; -+ cmd->srcstride1 = ref1->linesize[cidx+1]; -+ cmd->ref_idx[0] = current_mv->ref_idx[0]; -+ cmd->ref_idx[1] = current_mv->ref_idx[1]; -+} -+ -+#else -+#define RPI_REDIRECT(fn) fn -+#endif -+ - static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, - AVFrame *ref, const Mv *mv, int x_off, int y_off, - int block_w, int block_h, int luma_weight, int luma_offset) -@@ -1347,6 +1805,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, - (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag); - int idx = ff_hevc_pel_weight[block_w]; - -+#ifdef DISABLE_MC -+ return; -+#endif -+ - x_off += mv->x >> 2; - y_off += mv->y >> 2; - src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1393,7 +1855,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t 
dststride, - * @param mv1 motion vector1 (relative to block position) to get pixel data from - * @param current_mv current motion vector structure - */ -- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, - AVFrame *ref0, const Mv *mv0, int x_off, int y_off, - int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) - { -@@ -1417,6 +1879,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, - uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); - uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); - -+#ifdef DISABLE_MC -+ return; -+#endif -+ - if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || - x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || - y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { -@@ -1502,6 +1968,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, - intptr_t _mx = mx << (1 - hshift); - intptr_t _my = my << (1 - vshift); - -+#ifdef DISABLE_MC -+ return; -+#endif -+ - x_off += mv->x >> (2 + hshift); - y_off += mv->y >> (2 + vshift); - src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1566,6 +2036,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF - int hshift = s->ps.sps->hshift[1]; - int vshift = s->ps.sps->vshift[1]; - -+#ifdef DISABLE_MC -+ return; -+#endif -+ - intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); - intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); - intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); -@@ -1693,14 +2167,14 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, - } - } - --static void hls_prediction_unit(HEVCContext *s, int x0, int y0, -- int nPbW, int nPbH, -- int log2_cb_size, int partIdx, int idx) -+static void hls_prediction_unit(HEVCContext * const s, 
const int x0, const int y0, -+ const int nPbW, const int nPbH, -+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) - { - #define POS(c_idx, x, y) \ - &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \ - (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)] -- HEVCLocalContext *lc = s->HEVClc; -+ HEVCLocalContext * const lc = s->HEVClc; - int merge_idx = 0; - struct MvField current_mv = {{{ 0 }}}; - -@@ -1718,8 +2192,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, - int y_cb = y0 >> log2_min_cb_size; - int x_pu, y_pu; - int i, j; -- -- int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb); -+ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb); - - if (!skip_flag) - lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); -@@ -1763,16 +2236,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, - int nPbW_c = nPbW >> s->ps.sps->hshift[1]; - int nPbH_c = nPbH >> s->ps.sps->vshift[1]; - -- luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, -+#ifdef RPI_LUMA_QPU -+ if (s->enable_rpi) { -+ const Mv * const mv = ¤t_mv.mv[0]; -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my<<8) | mx; -+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; -+ const int x1_m3 = x0 + (mv->x >> 2) - 3; -+ const int y1_m3 = y0 + (mv->y >> 2) - 3; -+ const uint32_t src_vc_address_y = get_vc_address_y(ref0->frame); -+ uint32_t * y = s->curr_y_mvs; -+ -+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go -+ const uint32_t src_yx_hi = ((y1_m3 + start_y) << 16); -+ -+ for(int start_x=0;start_x < nPbW;start_x+=16) { -+ const int bw = nPbW-start_x; -+ const int bh = nPbH-start_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = 
src_yx_hi | ((x1_m3 + 8 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y; -+ *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16); -+ *y++ = my2_mx2_my_mx; -+ *y++ = s->sh.luma_weight_l0[current_mv.ref_idx[0]]; -+ *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] * 2 + 1; -+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]); -+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter; -+ } -+ } -+ s->curr_y_mvs = y; -+ } else -+#endif -+ { -+ RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame, - ¤t_mv.mv[0], x0, y0, nPbW, nPbH, - s->sh.luma_weight_l0[current_mv.ref_idx[0]], - s->sh.luma_offset_l0[current_mv.ref_idx[0]]); -+ } - - if (s->ps.sps->chroma_format_idc) { -- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], -+#ifdef RPI_INTER_QPU -+ if (s->enable_rpi) { -+ int hshift = s->ps.sps->hshift[1]; -+ int vshift = s->ps.sps->vshift[1]; -+ const Mv *mv = ¤t_mv.mv[0]; -+ intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift); -+ intptr_t my = av_mod_uintp2(mv->y, 2 + vshift); -+ intptr_t _mx = mx << (1 - hshift); -+ intptr_t _my = my << (1 - vshift); // Fractional part of motion vector -+ -+ int x1_c = x0_c + (mv->x >> (2 + hshift)); -+ int y1_c = y0_c + (mv->y >> (2 + hshift)); -+ -+ uint32_t *u = s->curr_u_mvs; -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) { -+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) { -+ int bw = nPbW_c-start_x; -+ int bh = nPbH_c-start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame); -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame); -+ *u++ = ( (bwsh.chroma_offset_l0[current_mv.ref_idx[0]][0] * 2 + 1, -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]); -+ *u++ = 
PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] * 2 + 1, -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]); -+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); -+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]); -+ } -+ } -+ s->curr_u_mvs = u; -+ return; -+ } -+#endif -+ RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], - 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, - s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); -- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2], -+ RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2], - 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, - s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]); - } -@@ -1782,17 +2328,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, - int nPbW_c = nPbW >> s->ps.sps->hshift[1]; - int nPbH_c = nPbH >> s->ps.sps->vshift[1]; - -- luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, -+#ifdef RPI_LUMA_QPU -+ if (s->enable_rpi) { -+ const int reflist = 1; -+ const Mv *mv = ¤t_mv.mv[reflist]; -+ int mx = mv->x & 3; -+ int my = mv->y & 3; -+ int my_mx = (my<<8) + mx; -+ int my2_mx2_my_mx = (my_mx << 16) + my_mx; -+ int x1 = x0 + (mv->x >> 2); -+ int y1 = y0 + (mv->y >> 2); -+ uint32_t *y = s->curr_y_mvs; -+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go -+ for(int start_x=0;start_x < nPbW;start_x+=16) { -+ int bw = nPbW-start_x; -+ int bh = nPbH-start_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame); -+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) 
<< 16) + ( (x1 - 3 + 8 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame); -+ *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16); -+ *y++ = my2_mx2_my_mx; -+ *y++ = s->sh.luma_weight_l1[current_mv.ref_idx[reflist]]; -+ *y++ = s->sh.luma_offset_l1[current_mv.ref_idx[reflist]] * 2 + 1; -+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]); -+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter; -+ } -+ } -+ s->curr_y_mvs = y; -+ } else -+#endif -+ -+ { -+ RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame, - ¤t_mv.mv[1], x0, y0, nPbW, nPbH, - s->sh.luma_weight_l1[current_mv.ref_idx[1]], - s->sh.luma_offset_l1[current_mv.ref_idx[1]]); -+ } - - if (s->ps.sps->chroma_format_idc) { -- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], -+#ifdef RPI_INTER_QPU -+ if (s->enable_rpi) { -+ const int reflist = 1; -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; -+ const Mv * const mv = ¤t_mv.mv[reflist]; -+ const intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift); -+ const intptr_t my = av_mod_uintp2(mv->y, 2 + vshift); -+ const intptr_t _mx = mx << (1 - hshift); -+ const intptr_t _my = my << (1 - vshift); // Fractional part of motion vector -+ -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)); -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)); -+ -+ uint32_t * u = s->curr_u_mvs; -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) { -+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) { -+ const int bw = nPbW_c-start_x; -+ const int bh = nPbH_c-start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame); -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame); -+ *u++ = ( 
(bwsh.chroma_offset_l1[current_mv.ref_idx[reflist]][0] * 2 + 1, -+ s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][0]); -+ *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][1] * 2 + 1, -+ s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][1]); -+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); -+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]); -+ } -+ } -+ s->curr_u_mvs = u; -+ return; -+ } -+#endif -+ RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], - 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, - s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); - -- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2], -+ RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2], - 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, - s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]); - } -@@ -1802,15 +2420,118 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, - int nPbW_c = nPbW >> s->ps.sps->hshift[1]; - int nPbH_c = nPbH >> s->ps.sps->vshift[1]; - -- luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, -+#ifdef RPI_LUMA_QPU -+ if (s->enable_rpi && 0) { -+ const Mv *mv = ¤t_mv.mv[0]; -+ int mx = mv->x & 3; -+ int my = mv->y & 3; -+ int my_mx = (my<<8) + mx; -+ const Mv *mv2 = ¤t_mv.mv[1]; -+ int mx2 = mv2->x & 3; -+ int my2 = mv2->y & 3; -+ int my2_mx2 = (my2<<8) + mx2; -+ int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx; -+ int x1 = x0 + (mv->x >> 2); -+ int y1 = y0 + (mv->y >> 2); -+ int x2 = x0 + (mv2->x >> 2); -+ int y2 = y0 + (mv2->y >> 2); -+ uint32_t *y = s->curr_y_mvs; -+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go -+ for(int 
start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time -+ int bw = nPbW-start_x; -+ int bh = nPbH-start_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame); -+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1 -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame); -+ *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16); -+ *y++ = my2_mx2_my_mx; -+ -+ *y++ = PACK2(s->sh.luma_weight_l1[current_mv.ref_idx[1]], -+ s->sh.luma_weight_l0[current_mv.ref_idx[0]]); -+ *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] + -+ s->sh.luma_offset_l1[current_mv.ref_idx[1]] + 1; -+ -+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]); -+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b; -+ } -+ } -+ s->curr_y_mvs = y; -+ } else -+#endif -+ { -+ RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame, - ¤t_mv.mv[0], x0, y0, nPbW, nPbH, - ref1->frame, ¤t_mv.mv[1], ¤t_mv); -+ } - - if (s->ps.sps->chroma_format_idc) { -- chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, -+#ifdef RPI_INTER_QPU -+ if (s->enable_rpi) { -+ int hshift = s->ps.sps->hshift[1]; -+ int vshift = s->ps.sps->vshift[1]; -+ const Mv *mv = ¤t_mv.mv[0]; -+ intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift); -+ intptr_t my = av_mod_uintp2(mv->y, 2 + vshift); -+ intptr_t _mx = mx << (1 - hshift); -+ intptr_t _my = my << (1 - vshift); // Fractional part of motion vector -+ int x1_c = x0_c + (mv->x >> (2 + hshift)); -+ int y1_c = y0_c + (mv->y >> (2 + hshift)); -+ -+ const Mv *mv2 = ¤t_mv.mv[1]; -+ intptr_t mx2 = av_mod_uintp2(mv2->x, 2 + hshift); -+ intptr_t my2 = av_mod_uintp2(mv2->y, 2 + vshift); -+ intptr_t _mx2 = mx2 << (1 - hshift); -+ intptr_t _my2 = my2 << (1 - vshift); // Fractional part of motion vector -+ -+ int x2_c = x0_c + (mv2->x >> (2 + hshift)); 
-+ int y2_c = y0_c + (mv2->y >> (2 + hshift)); -+ -+ -+ uint32_t *u = s->curr_u_mvs; -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) { -+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) { -+ int bw = nPbW_c-start_x; -+ int bh = nPbH_c-start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame); -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame); -+ *u++ = ( (bwsh.chroma_weight_l0[current_mv.ref_idx[0]][0]; // Weight L0 U -+ *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]; // Weight L0 V -+ *u++ = 0; // Intermediate results are not written back in first pass of B filtering -+ *u++ = 0; -+ -+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame); -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame); -+ *u++ = ( (bwsh.chroma_offset_l0[current_mv.ref_idx[0]][0] + -+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0] + 1, -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0]); -+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] + -+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1] + 1, -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1]); -+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); -+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]); -+ } -+ } -+ s->curr_u_mvs = u; -+ return; -+ } -+#endif -+ RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, - x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 0); - -- chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame, -+ RPI_REDIRECT(chroma_mc_bi)(s, dst2, 
s->frame->linesize[2], ref0->frame, ref1->frame, - x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 1); - } - } -@@ -2304,6 +3025,734 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, - lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]])); - } - -+#ifdef RPI -+static void rpi_execute_dblk_cmds(HEVCContext *s) -+{ -+ int n; -+ int job = s->pass1_job; -+ int ctb_size = 1 << s->ps.sps->log2_ctb_size; -+ int (*p)[2] = s->dblk_cmds[job]; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) { -+ ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size); -+ } -+ s->num_dblk_cmds[job] = 0; -+} -+ -+static void rpi_execute_transform(HEVCContext *s) -+{ -+ int i=2; -+ int job = s->pass1_job; -+ /*int j; -+ int16_t *coeffs = s->coeffs_buf_arm[job][i]; -+ for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) { -+ s->hevcdsp.idct[4-2](coeffs, 16); -+ } -+ i=3; -+ coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i]; -+ for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) { -+ s->hevcdsp.idct[5-2](coeffs, 32); -+ }*/ -+ -+ gpu_cache_flush(&s->coeffs_buf_accelerated[job]); -+ s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], -+ s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], -+ s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]); -+ //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0); -+ //gpu_cache_flush(&s->coeffs_buf_accelerated); -+ //vpu_wait(s->vpu_id); -+ -+ for(i=0;i<4;i++) -+ s->num_coeffs[job][i] = 0; -+} -+ -+static void rpi_execute_pred_cmds(HEVCContext *s) -+{ -+ int i; -+ int job = s->pass1_job; -+ HEVCPredCmd *cmd = s->univ_pred_cmds[job]; -+#ifdef RPI_WORKER -+ 
HEVCLocalContextIntra *lc = &s->HEVClcIntra; -+#else -+ HEVCLocalContext *lc = s->HEVClc; -+#endif -+ -+ for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) { -+ //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); -+ if (cmd->type == RPI_PRED_INTRA) { -+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode; -+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; -+ lc->na.cand_left = (cmd->na >> 3) & 1; -+ lc->na.cand_up_left = (cmd->na >> 2) & 1; -+ lc->na.cand_up = (cmd->na >> 1) & 1; -+ lc->na.cand_up_right = (cmd->na >> 0) & 1; -+ s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx); -+ } else { -+#ifdef RPI_PRECLEAR -+ int trafo_size = 1 << cmd->size; -+#endif -+ s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride); -+#ifdef RPI_PRECLEAR -+ memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache -+#endif -+ } -+ } -+ s->num_pred_cmds[job] = 0; -+} -+ -+static void rpi_execute_inter_cmds(HEVCContext *s) -+{ -+ int job = s->pass1_job; -+ HEVCMvCmd *cmd = s->unif_mv_cmds[job]; -+ int n,cidx; -+ AVFrame myref; -+ AVFrame myref1; -+ struct MvField mymv; -+ if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) { -+ printf("Overflow inter_cmds\n"); -+ exit(-1); -+ } -+ for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) { -+ switch(cmd->cmd) { -+ case RPI_CMD_LUMA_UNI: -+ myref.data[0] = cmd->src; -+ myref.linesize[0] = cmd->srcstride; -+ luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset); -+ break; -+ case RPI_CMD_LUMA_BI: -+ myref.data[0] = cmd->src; -+ myref.linesize[0] = cmd->srcstride; -+ myref1.data[0] = cmd->src1; -+ myref1.linesize[0] = cmd->srcstride1; -+ mymv.ref_idx[0] = cmd->ref_idx[0]; -+ mymv.ref_idx[1] = cmd->ref_idx[1]; -+ luma_mc_bi(s, cmd->dst, cmd->dststride, -+ &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, -+ &myref1, &cmd->mv1, &mymv); -+ 
break; -+ case RPI_CMD_CHROMA_UNI: -+ mymv.mv[0] = cmd->mv; -+ chroma_mc_uni(s, cmd->dst, -+ cmd->dststride, cmd->src, cmd->srcstride, 0, -+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset); -+ break; -+ case RPI_CMD_CHROMA_BI: -+ case RPI_CMD_CHROMA_BI+1: -+ cidx = cmd->cmd - RPI_CMD_CHROMA_BI; -+ myref.data[cidx+1] = cmd->src; -+ myref.linesize[cidx+1] = cmd->srcstride; -+ myref1.data[cidx+1] = cmd->src1; -+ myref1.linesize[cidx+1] = cmd->srcstride1; -+ mymv.ref_idx[0] = cmd->ref_idx[0]; -+ mymv.ref_idx[1] = cmd->ref_idx[1]; -+ mymv.mv[0] = cmd->mv; -+ mymv.mv[1] = cmd->mv1; -+ chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1, -+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx); -+ break; -+ } -+ } -+ s->num_mv_cmds[job] = 0; -+} -+ -+static void rpi_do_all_passes(HEVCContext *s) -+{ -+ // Kick off QPUs and VPUs -+ rpi_launch_vpu_qpu(s); -+ // Perform luma inter prediction -+ rpi_execute_inter_cmds(s); -+ // Wait for transform completion -+ vpu_wait(s->vpu_id); -+ // Perform intra prediction and residual reconstruction -+ rpi_execute_pred_cmds(s); -+ // Perform deblocking for CTBs in this row -+ rpi_execute_dblk_cmds(s); -+ // Prepare next batch -+ rpi_begin(s); -+} -+ -+#endif -+ -+#ifdef RPI -+static void rpi_begin(HEVCContext *s) -+{ -+ int job = s->pass0_job; -+ int i; -+#ifdef RPI_INTER_QPU -+ int pic_width = s->ps.sps->width >> s->ps.sps->hshift[1]; -+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[1]; -+ -+ for(i=0;i<8;i++) { -+ s->u_mvs[job][i] = s->mvs_base[job][i]; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = pic_width; -+ *s->u_mvs[job][i]++ = pic_height; -+ *s->u_mvs[job][i]++ = s->frame->linesize[1]; -+ *s->u_mvs[job][i]++ = s->frame->linesize[2]; -+ *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = i; 
// Select section of VPM (avoid collisions with 3d unit) -+ } -+ s->curr_u_mvs = s->u_mvs[job][0]; -+#endif -+ -+#ifdef RPI_LUMA_QPU -+ for(i=0;i<12;i++) { -+ // This needs to have a generally similar structure to the -+ // actual filter code as various pipelined bits need to land correctly -+ // when inserted by the filter requests -+ s->y_mvs[job][i] = s->y_mvs_base[job][i]; -+ *s->y_mvs[job][i]++ = 0; // y_x -+ *s->y_mvs[job][i]++ = 0; // ref_y_base -+ *s->y_mvs[job][i]++ = 0; // y2_x2 -+ *s->y_mvs[job][i]++ = 0; // ref_y2_base -+ *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height; -+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch -+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch -+ *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6; // weight demon + 6 -+ *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block -+ *s->y_mvs[job][i]++ = 0; // Next kernel -+ } -+ s->curr_y_mvs = s->y_mvs[job][0]; -+#endif -+ s->ctu_count = 0; -+} -+#endif -+ -+#ifdef RPI_SIMULATE_QPUS -+ -+static int32_t clipx(int x,int FRAME_WIDTH) -+{ -+ if (x<=0) return 0; -+ if (x>=FRAME_WIDTH) return FRAME_WIDTH-1; -+ return x; -+} -+ -+static int32_t clipy(int y,int FRAME_HEIGHT) -+{ -+ if (y<=0) return 0; -+ if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1; -+ return y; -+} -+ -+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ -+ for (y = 0; y < 8; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 8; x++) -+ hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch]; -+ -+ vsum += lumaFilter[my][y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+round)>>denom)+offset; -+ -+ return av_clip_uint8( vsum ); -+}*/ -+ -+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) -+{ -+ int32_t vsum = 0; -+ int x, y; 
-+ int chromaFilterH[4]; -+ int chromaFilterV[4]; -+ int i; -+ int offset_after = offset_weight>>16; -+ int weight = (offset_weight<<16)>>16; -+ for(i=0;i<4;i++) { -+ chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24; -+ chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24; -+ } -+ -+ for (y = 0; y < 4; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 4; x++) -+ hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; -+ -+ vsum += chromaFilterV[y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; -+ -+ return vsum; -+} -+ -+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} }; -+ -+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ int i; -+ int offset_after = offset_weight>>16; -+ int weight = (offset_weight<<16)>>16; -+ -+ for (y = 0; y < 8; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 8; x++) -+ hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; -+ -+ vsum += lumaFilter[(my_mx>>8)&3][y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; -+ -+ return vsum; -+} -+ -+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx) -+{ -+ //int pic_width = s->ps.sps->width >> s->ps.sps->hshift[cIdx]; -+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[cIdx]; -+ int pitch = frame->linesize[cIdx]; -+ uint32_t base = cIdx == 0 ? get_vc_address_y(frame) : -+ cIdx == 1 ? 
get_vc_address_u(frame) : get_vc_address_v(frame); -+ if (p>=base && pdata[cIdx] + (p-base); -+ } -+ return NULL; -+} -+ -+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx) -+{ -+ SliceHeader *sh = &s->sh; -+ uint8_t *arm = test_frame(s,p,s->frame,cIdx); -+ int i; -+ if (arm) return arm; -+ if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE) -+ { -+ for(i=0;inb_refs[L0];i++) { -+ arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx); -+ if (arm) return arm; -+ } -+ } -+ if (sh->slice_type == B_SLICE) { -+ for(i=0;inb_refs[L1];i++) { -+ arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx); -+ if (arm) return arm; -+ } -+ } -+ printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT)); -+ exit(-1); -+ return NULL; -+} -+ -+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p) -+{ -+ uint32_t next_kernel; -+ uint32_t x0; -+ uint32_t y0; -+ uint8_t *ref_u_base; -+ uint8_t *ref_v_base; -+ uint32_t frame_width = p[5]; -+ uint32_t frame_height = p[6]; -+ uint32_t pitch = p[7]; -+ uint32_t dst_pitch = p[8]; -+ int32_t offset_before = p[9]; -+ int32_t denom = p[10]; -+ uint32_t vpm_id = p[11]; -+ uint32_t tmp_u_dst[256]; -+ uint32_t tmp_v_dst[256]; -+ while(1) { -+ p += 12; -+ next_kernel = p[0-12]; -+ x0 = p[1-12]; -+ y0 = p[2-12]; -+ if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) { -+ int x,y; -+ uint32_t width_height = p[5]; -+ uint32_t hcoeffs = p[6]; -+ uint32_t vcoeffs = p[7]; -+ uint32_t offset_weight_u = p[8]; -+ uint32_t offset_weight_v = p[9]; -+ uint8_t *this_u_dst; -+ uint8_t *this_v_dst; -+ uint32_t width = width_height >> 16; -+ uint32_t height = (width_height << 16) >> 16; -+ ref_u_base = compute_arm_addr(s,p[3-12],1); -+ ref_v_base = compute_arm_addr(s,p[4-12],2); -+ if (next_kernel!=s->mc_filter_uv_b0) -+ { -+ this_u_dst = compute_arm_addr(s,p[10],1); -+ this_v_dst = compute_arm_addr(s,p[11],2); -+ } -+ for (y=0; ymc_filter_uv) { -+ 
int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height); -+ int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height); -+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa); -+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb); -+ } else if (next_kernel==s->mc_filter_uv_b0) { -+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height); -+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height); -+ tmp_u_dst[x+y*16] = refa; -+ tmp_v_dst[x+y*16] = refb; -+ } else { -+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height); -+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height); -+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa); -+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb); -+ } -+ } -+ } -+ } else { -+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) ); -+ break; -+ } -+ } -+} -+ -+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel) -+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan) -+{ -+ uint32_t next_kernel; -+ int y_x,y2_x2; -+ int x0; -+ int y0; -+ int x2; -+ int y2; -+ uint32_t *p0 = p; -+ uint8_t *ref_y_base; -+ uint8_t *ref_y2_base; -+ uint32_t frame_width_height = p[4]; -+ uint32_t frame_width = frame_width_height>>16; -+ uint32_t frame_height = (frame_width_height<<16)>>16; -+ uint32_t pitch = p[5]; -+ uint32_t dst_pitch = p[6]; -+ int offset_shift = p[7]; -+ int32_t offset_before = offset_shift>>16; -+ int32_t denom = (offset_shift<<16)>>16; -+ while(1) { -+ p += 9; -+ next_kernel = 
p[8-9]; -+ y_x = p[0-9]; -+ x0 = (y_x<<16)>>16; -+ y0 = y_x>>16; -+ y2_x2 = p[2-9]; -+ x2 = (y2_x2<<16)>>16; -+ y2 = y2_x2>>16; -+ -+ if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) { -+ // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) -+ int x,y; -+ uint32_t width_height = p[4]; -+ uint32_t my2_mx2_my_mx = p[5]; -+ uint32_t offset_weight = p[6]; -+ uint8_t *this_dst = compute_arm_addr(s,p[7],0); -+ uint32_t width = width_height >> 16; -+ uint32_t height = (width_height << 16) >> 16; -+ uint8_t *dst_base = s->frame->data[0]; -+ ref_y_base = compute_arm_addr(s,p[1-9],0); -+ ref_y2_base = compute_arm_addr(s,p[3-9],0); -+ for (y=0; ymc_filter) { -+ int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height); -+ refa = av_clip_uint8(refa); -+ this_dst[x+y*dst_pitch] = refa; -+ } -+ else { -+ int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height); -+ int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height); -+ this_dst[x+y*dst_pitch] = av_clip_uint8(refb); -+ } -+ } -+ } -+ } else { -+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) ); -+ break; -+ } -+ } -+} -+ -+static void rpi_simulate_inter_qpu(HEVCContext *s) -+{ -+ // First run the transform as normal -+ int i; -+ rpi_execute_transform(s); -+ for(i=0;i<8;i++) -+ { -+ rpi_simulate_inter_chroma(s,s->mvs_base[i]); -+ } -+ for(i=0;i<12;i++) -+ { -+ rpi_simulate_inter_luma(s,s->y_mvs_base[i],i); -+ } -+} -+ -+#endif -+ -+#ifdef RPI_INTER_QPU -+ -+static void rpi_launch_vpu_qpu(HEVCContext *s) -+{ -+ int k; -+ int job = s->pass1_job; -+ int i; -+ uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc; -+#ifdef RPI_LUMA_QPU -+ uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc; -+#endif -+ if 
(s->sh.slice_type == I_SLICE) { -+#ifdef RPI_MULTI_MAILBOX -+ rpi_execute_transform(s); -+ return; -+#endif -+ } -+ for(k=0;k<8;k++) { -+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command -+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined -+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V -+ av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU); -+ } -+ -+ s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore -+ -+#ifdef RPI_LUMA_QPU -+ for(k=0;k<12;k++) { -+ s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined -+ s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request -+ s->y_mvs[job][k][-1] = qpu_get_fn(QPU_MC_EXIT); // Add exit command (Final uniform) -+ av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU); -+ } -+ s->y_mvs[job][12-1][-1] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore -+#endif -+ -+#ifdef RPI_SIMULATE_QPUS -+ rpi_simulate_inter_qpu(s); -+ return; -+#endif -+ -+#ifdef RPI_MULTI_MAILBOX -+#ifdef RPI_CACHE_UNIF_MVS -+ flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job); -+#else -+ flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job); -+#endif -+ -+#if 1 -+ { -+ unsigned int i; -+ uint32_t * p; -+ uint32_t code = qpu_get_fn(QPU_MC_SETUP_UV); -+ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS]; -+ uint32_t mail_y[QPU_N_Y * 
QPU_MAIL_EL_VALS]; -+ -+ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) { -+ *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm)); -+ *p++ = code; -+ } -+ -+ code = qpu_get_fn(QPU_MC_SETUP); -+ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) { -+ *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)); -+ *p++ = code; -+ } -+ -+ s->vpu_id = vpu_qpu_post_code2(vpu_get_fn(), -+ vpu_get_constants(), -+ s->coeffs_buf_vc[job][2], -+ s->num_coeffs[job][2] >> 8, -+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], -+ s->num_coeffs[job][3] >> 10, -+ 0, -+ // QPU job 1 -+ QPU_N_UV, -+ mail_uv, -+ // QPU job 2 -+ QPU_N_Y, -+ mail_y -+ ); -+ } -+ -+#else -+ s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, -+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0, -+ qpu_get_fn(QPU_MC_SETUP_UV), -+ (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+#ifdef RPI_LUMA_QPU -+ qpu_get_fn(QPU_MC_SETUP), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), 
-+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)) -+#else -+ 0, -+ 0,0,0,0, -+ 0,0,0,0, -+ 0,0,0,0 -+#endif -+ ); -+#endif -+ for(i=0;i<4;i++) -+ s->num_coeffs[job][i] = 0; -+#else -+#error Code rotted here -+ qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV), -+ (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)) -+ ); -+#endif -+ -+ -+} -+#else -+ -+#ifdef RPI -+static void rpi_launch_vpu_qpu(HEVCContext *s) -+{ -+ rpi_execute_transform(s); -+} -+#endif -+ -+#endif -+ -+#ifdef RPI -+ -+#ifndef RPI_FAST_CACHEFLUSH -+#error RPI_FAST_CACHEFLUSH is broken -+static void flush_buffer(AVBufferRef *bref) { -+ 
GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref); -+ gpu_cache_flush(p); -+} -+#endif -+ -+static void flush_frame(HEVCContext *s,AVFrame *frame) -+{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame); -+ int n = s->ps.sps->height; -+ int curr_y = 0; -+ int curr_uv = 0; -+ int n_uv = n >> s->ps.sps->vshift[1]; -+ int sz,base; -+ sz = s->frame->linesize[1] * (n_uv-curr_uv); -+ base = s->frame->linesize[1] * curr_uv; -+ iocache.s[0].handle = p.vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int)(p.arm) + base; -+ iocache.s[0].size = sz; -+ p = get_gpu_mem_ptr_v(s->frame); -+ iocache.s[1].handle = p.vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int)(p.arm) + base; -+ iocache.s[1].size = sz; -+ p = get_gpu_mem_ptr_y(s->frame); -+ sz = s->frame->linesize[0] * (n-curr_y); -+ base = s->frame->linesize[0] * curr_y; -+ iocache.s[2].handle = p.vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int)(p.arm) + base; -+ iocache.s[2].size = sz; -+ vcsm_clean_invalid( &iocache ); -+#else -+ flush_buffer(frame->buf[0]); -+ flush_buffer(frame->buf[1]); -+ flush_buffer(frame->buf[2]); -+#endif -+} -+ -+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job) -+{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ int n; -+ int curr_y; -+ int curr_uv; -+ int n_uv; -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame); -+ int sz,base; -+ int (*d)[2] = s->dblk_cmds[job]; -+ int low=(*d)[1]; -+ int high=(*d)[1]; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) { -+ int y = (*d)[1]; -+ low=FFMIN(low,y); -+ high=FFMAX(high,y); -+ } -+ curr_y = low; -+ n = high+(1 << s->ps.sps->log2_ctb_size); -+ curr_uv = curr_y >> s->ps.sps->vshift[1]; -+ n_uv = n >> s->ps.sps->vshift[1]; -+ -+ sz = s->frame->linesize[1] * (n_uv-curr_uv); -+ base = 
s->frame->linesize[1] * curr_uv; -+ iocache.s[0].handle = p.vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int)(p.arm) + base; -+ iocache.s[0].size = sz; -+ p = get_gpu_mem_ptr_v(s->frame); -+ iocache.s[1].handle = p.vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int)(p.arm) + base; -+ iocache.s[1].size = sz; -+ p = get_gpu_mem_ptr_y(s->frame); -+ sz = s->frame->linesize[0] * (n-curr_y); -+ base = s->frame->linesize[0] * curr_y; -+ iocache.s[2].handle = p.vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int)(p.arm) + base; -+ iocache.s[2].size = sz; -+ -+ iocache.s[3].handle = p0->vcsm_handle; -+ iocache.s[3].cmd = 3; // clean+invalidate -+ iocache.s[3].addr = (int) p0->arm; -+ iocache.s[3].size = p0->numbytes; -+ if (p1) { -+ iocache.s[4].handle = p1->vcsm_handle; -+ iocache.s[4].cmd = 3; // clean+invalidate -+ iocache.s[4].addr = (int) p1->arm; -+ iocache.s[4].size = p1->numbytes; -+ } -+ if (p2) { -+ iocache.s[5].handle = p2->vcsm_handle; -+ iocache.s[5].cmd = 3; // clean+invalidate -+ iocache.s[5].addr = (int) p2->arm; -+ iocache.s[5].size = p2->numbytes; -+ } -+ vcsm_clean_invalid( &iocache ); -+#else -+ flush_buffer(frame->buf[0]); -+ flush_buffer(frame->buf[1]); -+ flush_buffer(frame->buf[2]); -+ gpu_cache_flush3(p0, p1, p2); -+#endif -+} -+ -+#endif -+ - static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) - { - HEVCContext *s = avctxt->priv_data; -@@ -2313,6 +3762,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) - int y_ctb = 0; - int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; - -+#ifdef RPI -+ s->enable_rpi = s->ps.sps->bit_depth == 8 -+ && !s->ps.pps->cross_component_prediction_enabled_flag; -+ -+ if (!s->enable_rpi) { -+ if (s->ps.pps->cross_component_prediction_enabled_flag) -+ printf("Cross component\n"); -+ } -+#endif -+ //printf("L0=%d 
L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]); -+ - if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { - av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); - return AVERROR_INVALIDDATA; -@@ -2326,6 +3786,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) - } - } - -+#ifdef RPI_WORKER -+ s->pass0_job = 0; -+ s->pass1_job = 0; -+#endif -+#ifdef RPI -+ rpi_begin(s); -+#endif -+ - while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { - int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; - -@@ -2341,7 +3809,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) - s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; - s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; - -+#ifdef RPI_INTER_QPU -+ s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8]; -+#endif -+#ifdef RPI_LUMA_QPU -+ s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12]; -+#endif -+ - more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); -+ -+#ifdef RPI_INTER_QPU -+ s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs; -+#endif -+#ifdef RPI_LUMA_QPU -+ s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs; -+#endif -+ -+#ifdef RPI -+ if (s->enable_rpi) { -+ //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0); -+ //av_assert0(s->num_dblk_cmds[s->pass0_job]pass0_jobpass0_job>=0); -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; -+ s->ctu_count++; -+ //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job); -+ -+ if ( s->ctu_count >= s->max_ctu_count ) { -+#ifdef RPI_WORKER -+ if (s->used_for_ref) { -+ // Split work load onto separate threads so we make as rapid progress as possible with this frame -+ // Pass on this job to worker thread -+ worker_submit_job(s); -+ // Make sure we have space to prepare 
the next job -+ worker_pass0_ready(s); -+ -+ // Prepare the next batch of commands -+ rpi_begin(s); -+ } else { -+ // Non-ref frame so do it all on this thread -+ rpi_do_all_passes(s); -+ } -+#else -+ rpi_do_all_passes(s); -+#endif -+ } -+ -+ } -+#endif -+ -+ - if (more_data < 0) { - s->tab_slice_address[ctb_addr_rs] = -1; - return more_data; -@@ -2350,9 +3868,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) - - ctb_addr_ts++; - ff_hevc_save_states(s, ctb_addr_ts); -+#ifdef RPI -+ if (s->enable_rpi) -+ continue; -+#endif - ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size); - } - -+#ifdef RPI -+ -+#ifdef RPI_WORKER -+ // Wait for the worker to finish all its jobs -+ if (s->enable_rpi) { -+ worker_wait(s); -+ } -+#endif -+ -+ // Finish off any half-completed rows -+ if (s->enable_rpi && s->ctu_count) { -+ rpi_do_all_passes(s); -+ } -+ -+#endif -+ - if (x_ctb + ctb_size >= s->ps.sps->width && - y_ctb + ctb_size >= s->ps.sps->height) - ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); -@@ -2387,6 +3925,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int - s = s1->sList[self_id]; - lc = s->HEVClc; - -+#ifdef RPI -+ s->enable_rpi = 0; -+ //printf("Wavefront\n"); -+#endif -+ - if(ctb_row) { - ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); - -@@ -2767,6 +4310,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) - if (ret < 0) - return ret; - -+ s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N || -+ s->nal_unit_type == NAL_TSA_N || -+ s->nal_unit_type == NAL_STSA_N || -+ s->nal_unit_type == NAL_RADL_N || -+ s->nal_unit_type == NAL_RASL_N); -+ -+ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { -+ s->is_decoded = 0; -+ break; -+ } - if (s->max_ra == INT_MAX) { - if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { - s->max_ra = s->poc; -@@ -2891,9 +4444,17 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) 
- } - - fail: -- if (s->ref && s->threads_type == FF_THREAD_FRAME) -+ if (s->ref && s->threads_type == FF_THREAD_FRAME) { -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height); -+#endif - ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); -- -+ } else if (s->ref) { -+#ifdef RPI_INTER_QPU -+ // When running single threaded we need to flush the whole frame -+ flush_frame(s,s->frame); -+#endif -+ } - return ret; - } - -@@ -3064,6 +4625,41 @@ fail: - return AVERROR(ENOMEM); - } - -+#ifdef RPI_WORKER -+static av_cold void hevc_init_worker(HEVCContext *s) -+{ -+ int err; -+ pthread_cond_init(&s->worker_cond_head, NULL); -+ pthread_cond_init(&s->worker_cond_tail, NULL); -+ pthread_mutex_init(&s->worker_mutex, NULL); -+ -+ s->worker_tail=0; -+ s->worker_head=0; -+ s->kill_worker=0; -+ err = pthread_create(&s->worker_thread, NULL, worker_start, s); -+ if (err) { -+ printf("Failed to create worker thread\n"); -+ exit(-1); -+ } -+} -+ -+static av_cold void hevc_exit_worker(HEVCContext *s) -+{ -+ void *res; -+ s->kill_worker=1; -+ pthread_cond_broadcast(&s->worker_cond_tail); -+ pthread_join(s->worker_thread, &res); -+ -+ pthread_cond_destroy(&s->worker_cond_head); -+ pthread_cond_destroy(&s->worker_cond_tail); -+ pthread_mutex_destroy(&s->worker_mutex); -+ -+ s->worker_tail=0; -+ s->worker_head=0; -+ s->kill_worker=0; -+} -+#endif -+ - static av_cold int hevc_decode_free(AVCodecContext *avctx) - { - HEVCContext *s = avctx->priv_data; -@@ -3075,6 +4671,32 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) - - av_freep(&s->cabac_state); - -+#ifdef RPI -+ -+#ifdef RPI_WORKER -+ hevc_exit_worker(s); -+#endif -+ -+ for(i=0;iunif_mv_cmds[i]); -+ av_freep(&s->univ_pred_cmds[i]); -+ -+#ifdef RPI_INTER_QPU -+ if (s->unif_mvs[i]) { -+ gpu_free( &s->unif_mvs_ptr[i] ); -+ s->unif_mvs[i] = 0; -+ } -+#endif -+#ifdef RPI_LUMA_QPU -+ if (s->y_unif_mvs[i]) { -+ gpu_free( &s->y_unif_mvs_ptr[i] ); -+ s->y_unif_mvs[i] = 0; -+ } -+#endif -+ } -+ -+#endif -+ 
- for (i = 0; i < 3; i++) { - av_freep(&s->sao_pixel_buffer_h[i]); - av_freep(&s->sao_pixel_buffer_v[i]); -@@ -3116,10 +4738,23 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) - return 0; - } - -+#ifdef RPI -+#ifdef RPI_PRECLEAR -+static av_cold void memclear16(int16_t *p, int n) -+{ -+ vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1); -+ //int i; -+ //for(i=0;ipriv_data; - int i; -+ int job; - - s->avctx = avctx; - -@@ -3129,6 +4764,78 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) - s->HEVClcList[0] = s->HEVClc; - s->sList[0] = s; - -+#ifdef RPI -+ for(job=0;jobunif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS); -+ if (!s->unif_mv_cmds[job]) -+ goto fail; -+ s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS); -+ if (!s->univ_pred_cmds[job]) -+ goto fail; -+ } -+ -+#ifdef RPI_INTER_QPU -+ // We divide the image into blocks 256 wide and 64 high -+ // We support up to 2048 widths -+ // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted -+ // Also add space for the startup command for each stream. 
-+ -+ { -+ int uv_commands_per_qpu = UV_COMMANDS_PER_QPU; -+ uint32_t *p; -+ for(job=0;jobunif_mvs_ptr[job] ); -+#else -+ gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] ); -+#endif -+ s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm; -+ -+ // Set up initial locations for uniform streams -+ p = s->unif_mvs[job]; -+ for(i = 0; i < 8; i++) { -+ s->mvs_base[job][i] = p; -+ p += uv_commands_per_qpu; -+ } -+ } -+ s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV); -+ s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0); -+ s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B); -+ } -+ -+#endif -+#ifdef RPI_LUMA_QPU -+ for(job=0;joby_unif_mvs_ptr[job] ); -+#else -+ gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] ); -+#endif -+ s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm; -+ -+ // Set up initial locations for uniform streams -+ p = s->y_unif_mvs[job]; -+ for(i = 0; i < 12; i++) { -+ s->y_mvs_base[job][i] = p; -+ p += y_commands_per_qpu; -+ } -+ } -+ s->mc_filter = qpu_get_fn(QPU_MC_FILTER); -+ s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B); -+#endif -+ //gpu_malloc_uncached(2048*64,&s->dummy); -+ -+ s->enable_rpi = 0; -+ -+#ifdef RPI_WORKER -+ hevc_init_worker(s); -+#endif -+ -+#endif -+ - s->cabac_state = av_malloc(HEVC_CONTEXTS); - if (!s->cabac_state) - goto fail; -diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h -index be91010..6b03ea8 100644 ---- a/libavcodec/hevc.h -+++ b/libavcodec/hevc.h -@@ -23,6 +23,9 @@ + sl->explicit_ref_marking = 0; + if (nal->ref_idc) { +diff --git b/libavcodec/hevc.h a/libavcodec/hevc.h +index de77d2a..494ca48 100644 +--- b/libavcodec/hevc.h ++++ a/libavcodec/hevc.h +@@ -21,6 +21,34 @@ #ifndef AVCODEC_HEVC_H #define AVCODEC_HEVC_H +// define RPI to split the CABAC/prediction/transform into separate stages -+#include "config.h" ++#ifndef RPI + - #include "libavutil/buffer.h" - #include "libavutil/md5.h" - -@@ -37,6 +40,29 @@ - #include 
"thread.h" - #include "videodsp.h" - -+// define RPI to split the CABAC/prediction/transform into separate stages -+#ifdef RPI ++ #define RPI_INTER 0 ++ ++#else + + #include "rpi_qpu.h" -+ // Define RPI_INTER_QPU to use QPU for chroma inter prediction -+ #define RPI_INTER_QPU ++ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU + -+ #ifdef RPI_INTER_QPU -+ // Define RPI_LUMA_QPU to also use QPU for luma inter prediction -+ #define RPI_LUMA_QPU -+ #endif -+ -+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames -+ #define RPI_MAX_JOBS 2 + // Define RPI_WORKER to launch a worker thread for pixel processing tasks + #define RPI_WORKER ++ // By passing jobs to a worker thread we hope to be able to catch up during slow frames ++ // This has no effect unless RPI_WORKER is defined ++ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as ++ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one ++ // free for the foreground to fill in. ++ #define RPI_MAX_JOBS 2 ++ + // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs ++ // As it stands there is something mildy broken in VPU deblock - looks mostly OK ++ // but reliably fails some conformance tests (e.g. 
DBLK_A/B/C_) ++ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM +// #define RPI_DEBLOCK_VPU + ++ #define RPI_VPU_DEBLOCK_CACHED 1 +#endif + -+#define RPI_VPU_DEBLOCK_CACHED 1 -+ - #define MAX_DPB_SIZE 16 // A.4.1 - #define MAX_REFS 16 - -@@ -660,17 +686,6 @@ typedef struct CodingUnit { - uint8_t cu_transquant_bypass_flag; - } CodingUnit; - --typedef struct Mv { -- int16_t x; ///< horizontal component of motion vector -- int16_t y; ///< vertical component of motion vector --} Mv; -- --typedef struct MvField { -- DECLARE_ALIGNED(4, Mv, mv)[2]; -- int8_t ref_idx[2]; -- int8_t pred_flag; --} MvField; -- - typedef struct NeighbourAvailable { - int cand_bottom_left; - int cand_left; -@@ -747,7 +762,17 @@ typedef struct HEVCFrame { - uint8_t flags; - } HEVCFrame; - -+#ifdef RPI_WORKER -+typedef struct HEVCLocalContextIntra { -+ TransformUnit tu; -+ NeighbourAvailable na; -+} HEVCLocalContextIntra; -+#endif -+ - typedef struct HEVCLocalContext { -+ TransformUnit tu; -+ NeighbourAvailable na; // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra -+ - uint8_t cabac_state[HEVC_CONTEXTS]; - - uint8_t stat_coeff[4]; -@@ -762,7 +787,6 @@ typedef struct HEVCLocalContext { - - int qPy_pred; - -- TransformUnit tu; - - uint8_t ctb_left_flag; - uint8_t ctb_up_flag; -@@ -779,7 +803,6 @@ typedef struct HEVCLocalContext { - int ct_depth; - CodingUnit cu; - PredictionUnit pu; -- NeighbourAvailable na; - - #define BOUNDARY_LEFT_SLICE (1 << 0) - #define BOUNDARY_LEFT_TILE (1 << 1) -@@ -790,6 +813,80 @@ typedef struct HEVCLocalContext { - int boundary_flags; - } HEVCLocalContext; - -+ -+#ifdef RPI -+ -+// The processing is done in chunks -+// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma) -+// This is a distance of 1536 pixels across the screen -+// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing, -+// but allocate more memory and increase the 
latency before data in the next frame can be processed -+#define RPI_NUM_CHUNKS 1 -+ -+// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code -+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24) -+ -+// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi -+#define RPI_MAX_MV_CMDS (2*16*3*(RPI_MAX_WIDTH/4)) -+// Each block can have an intra prediction and a transform_add command -+#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4)) -+// Worst case is 16x16 CTUs -+#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16) -+ -+#define RPI_CMD_LUMA_UNI 0 -+#define RPI_CMD_CHROMA_UNI 1 -+#define RPI_CMD_LUMA_BI 2 -+#define RPI_CMD_CHROMA_BI 3 -+#define RPI_CMD_V_BI 4 -+ -+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed? -+// #define RPI_PRECLEAR -+ -+// Command for inter prediction -+typedef struct HEVCMvCmd { -+ int cmd; -+ uint8_t *dst; -+ ptrdiff_t dststride; -+ uint8_t *src; -+ ptrdiff_t srcstride; -+ Mv mv; -+ int x_off; -+ int y_off; -+ int block_w; -+ int block_h; -+ int weight; -+ int offset; -+ uint8_t *src1; -+ ptrdiff_t srcstride1; -+ Mv mv1; -+ int8_t ref_idx[2]; -+} HEVCMvCmd; -+ -+ -+// Command for intra prediction and transform_add of predictions to coefficients -+#define RPI_PRED_TRANSFORM_ADD 0 -+#define RPI_PRED_INTRA 1 -+typedef struct HEVCPredCmd { -+ uint8_t size; -+ uint8_t type; -+ uint8_t na; -+ uint8_t c_idx; -+ union { -+ uint8_t *dst; // RPI_PRED_TRANSFORM_ADD -+ uint32_t x; // RPI_PRED_INTRA -+ }; -+ union { -+ int16_t *buf; // RPI_PRED_TRANSFORM_ADD -+ uint32_t y; // RPI_PRED_INTRA -+ }; -+ union { -+ enum IntraPredMode mode; // RPI_PRED_TRANSFORM_ADD -+ uint32_t stride; // RPI_PRED_INTRA -+ }; -+} HEVCPredCmd; -+ -+#endif -+ - typedef struct HEVCContext { - const AVClass *c; // needed by private avoptions - AVCodecContext *avctx; -@@ -798,13 +895,107 @@ typedef struct HEVCContext { - - HEVCLocalContext 
*HEVClcList[MAX_NB_THREADS]; - HEVCLocalContext *HEVClc; -- -+#ifdef RPI_WORKER -+ HEVCLocalContextIntra HEVClcIntra; -+#endif - uint8_t threads_type; - uint8_t threads_number; - - int width; - int height; - -+ int used_for_ref; -+ -+#ifdef RPI -+ int enable_rpi; -+ HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS]; -+ HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS]; -+ int buf_width; -+ GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS]; -+ GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS]; -+ int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4]; -+ unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4]; -+ int num_coeffs[RPI_MAX_JOBS][4]; -+ int num_xfm_cmds[RPI_MAX_JOBS]; -+ int num_mv_cmds[RPI_MAX_JOBS]; -+ int num_pred_cmds[RPI_MAX_JOBS]; -+ int num_dblk_cmds[RPI_MAX_JOBS]; -+ int vpu_id; -+ int pass0_job; // Pass0 does coefficient decode -+ int pass1_job; // Pass1 does pixel processing -+ int ctu_count; // Number of CTUs done in pass0 so far -+ int max_ctu_count; // Number of CTUs when we trigger a round of processing -+ int ctu_per_y_chan; // Number of CTUs per luma QPU -+ int ctu_per_uv_chan; // Number of CTUs per chroma QPU -+#ifdef RPI_INTER_QPU -+ GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS]; -+ uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands -+ -+ // _base pointers are to the start of the row -+ uint32_t *mvs_base[RPI_MAX_JOBS][8]; -+ // these pointers are to the next free space -+ uint32_t *u_mvs[RPI_MAX_JOBS][8]; -+ uint32_t *curr_u_mvs; // Current uniform stream to use for chroma -+ // Function pointers -+ uint32_t mc_filter_uv; -+ uint32_t mc_filter_uv_b0; -+ uint32_t mc_filter_uv_b; -+#endif -+#ifdef RPI_LUMA_QPU -+ GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS]; -+ uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands -+ uint32_t *y_mvs_base[RPI_MAX_JOBS][12]; -+ uint32_t *y_mvs[RPI_MAX_JOBS][12]; -+ uint32_t *curr_y_mvs; // Current uniform stream for luma -+ // Function pointers -+ uint32_t mc_filter; -+ uint32_t mc_filter_b; -+#endif -+ 
-+#ifdef RPI_WORKER -+ pthread_t worker_thread; -+ pthread_cond_t worker_cond_head; -+ pthread_cond_t worker_cond_tail; -+ pthread_mutex_t worker_mutex; -+ -+ int worker_tail; // Contains the number of posted jobs -+ int worker_head; // Contains the number of completed jobs -+ int kill_worker; // set to 1 to terminate the worker -+#endif -+ -+#define RPI_DEBLOCK_VPU_Q_COUNT 2 -+ -+#ifdef RPI_DEBLOCK_VPU -+ int enable_rpi_deblock; -+ -+ int uv_setup_width; -+ int uv_setup_height; -+ int setup_width; // Number of 16x16 blocks across the image -+ int setup_height; // Number of 16x16 blocks down the image -+ -+ struct dblk_vpu_q_s -+ { -+ GPU_MEM_PTR_T deblock_vpu_gmem; -+ -+ uint8_t (*y_setup_arm)[2][2][2][4]; -+ uint8_t (*y_setup_vc)[2][2][2][4]; -+ -+ uint8_t (*uv_setup_arm)[2][2][2][4]; // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned -+ uint8_t (*uv_setup_vc)[2][2][2][4]; -+ -+ int (*vpu_cmds_arm)[6]; // r0-r5 for each command -+ int vpu_cmds_vc; -+ -+ int cmd_id; -+ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT]; -+ -+ struct dblk_vpu_q_s * dvq; -+ unsigned int dvq_n; -+ -+#endif -+ -+#endif -+ - uint8_t *cabac_state; - - /** 1 if the independent slice segment header was successfully parsed */ -@@ -922,6 +1113,9 @@ typedef struct HEVCContext { - uint32_t max_mastering_luminance; - uint32_t min_mastering_luminance; - -+#ifdef RPI -+ int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2]; -+#endif - } HEVCContext; - - int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, -@@ -1048,6 +1242,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - int log2_trafo_size, enum ScanType scan_idx, - int c_idx); - -+#ifdef RPI_INTER_QPU -+extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n); -+#endif -+ - void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); - - -diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c -index 
05b2821..e2f1f4e 100644 ---- a/libavcodec/hevc_cabac.c -+++ b/libavcodec/hevc_cabac.c -@@ -21,14 +21,72 @@ + /** + * Table 7-3: NAL unit type codes + */ +diff --git b/libavcodec/hevc_cabac.c a/libavcodec/hevc_cabac.c +index e27c54e..1dbbb16 100644 +--- b/libavcodec/hevc_cabac.c ++++ a/libavcodec/hevc_cabac.c +@@ -21,6 +21,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -4813,10 +2689,10 @@ index 05b2821..e2f1f4e 100644 #include "libavutil/attributes.h" #include "libavutil/common.h" --#include "cabac_functions.h" +@@ -29,8 +31,64 @@ #include "hevc.h" -+#include "cabac_functions.h" -+ + #include "hevcdec.h" + +// BY22 is probably faster than simple bypass if the processor has +// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction +// x86 has fast int divide @@ -4836,7 +2712,7 @@ index 05b2821..e2f1f4e 100644 +#if ARCH_ARM +#include "arm/hevc_cabac.h" +#endif - ++ #define CABAC_MAX_BIN 31 + @@ -4878,7 +2754,7 @@ index 05b2821..e2f1f4e 100644 /** * number of bin by SyntaxElement. 
*/ -@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { +@@ -447,6 +505,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { { 28, 36, 43, 49, 54, 58, 61, 63, }, }; @@ -5090,7 +2966,7 @@ index 05b2821..e2f1f4e 100644 void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts) { if (s->ps.pps->entropy_coding_sync_enabled_flag && -@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) +@@ -865,19 +1128,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth); } @@ -5116,7 +2992,7 @@ index 05b2821..e2f1f4e 100644 } int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) { -@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { +@@ -893,14 +1156,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx); } @@ -5133,7 +3009,7 @@ index 05b2821..e2f1f4e 100644 ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); ctx_shift = (log2_size + 1) >> 2; } else { -@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, +@@ -931,22 +1194,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, return value; } @@ -5159,7 +3035,7 @@ index 05b2821..e2f1f4e 100644 { return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); } -@@ -966,90 +1223,366 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, +@@ -968,90 +1225,337 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); } @@ -5488,12 +3364,12 @@ index 05b2821..e2f1f4e 100644 int vshift = s->ps.sps->vshift[c_idx]; uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + ((x0 >> hshift) << s->ps.sps->pixel_shift)]; -+#ifdef RPI -+ //***** transform_skip_flag decoded later! 
-+ int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4; -+#endif - int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); +- int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); - uint8_t significant_coeff_group_flag[8][8] = {{0}}; ++#ifdef RPI ++ int use_vpu; ++#endif ++ int16_t *coeffs; + uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero int explicit_rdpcm_flag = 0; int explicit_rdpcm_dir_flag; @@ -5508,38 +3384,11 @@ index 05b2821..e2f1f4e 100644 int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode : lc->tu.intra_pred_mode_c; +- memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); + int prev_sig = 0; + const int c_idx_nz = (c_idx != 0); + + int may_hide_sign; -+ -+#ifdef RPI -+ if (s->enable_rpi) { -+ int n = trafo_size * trafo_size; -+ if (use_vpu) { -+ // We support size 4 and size 5. -+ // Size 4 grows from the front (Coeffs_buf_arm[2] points to start of buf) -+ // Size 5 grows from the back (Coeffs_buf_arm[3] points to end of buf) -+ // num_coeffs is indexed by log2_trafo_size-2 -+ if (log2_trafo_size == 4) -+ coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2]; -+ else -+ coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n; -+ s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n; -+ } else { -+ coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0]; -+ s->num_coeffs[s->pass0_job][0] += n; -+ } -+ } -+ // We now do the memset after transform_add while we know the data is cached. 
-+ #ifdef RPI_PRECLEAR -+ #else -+ memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); -+ #endif -+#else - memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); -+#endif -+ + // Derive QP for dequant @@ -5549,7 +3398,7 @@ index 05b2821..e2f1f4e 100644 static const uint8_t rem6[51 + 4 * 6 + 1] = { 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, -@@ -1065,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1067,9 +1571,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, }; int qp_y = lc->qp_y; @@ -5570,7 +3419,7 @@ index 05b2821..e2f1f4e 100644 } if (c_idx == 0) { -@@ -1100,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1102,39 +1616,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, qp += s->ps.sps->qp_bd_offset; } @@ -5641,6 +3490,9 @@ index 05b2821..e2f1f4e 100644 + may_hide_sign = 0; } ++ ++ ++ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && - (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { - explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx); @@ -5658,7 +3510,7 @@ index 05b2821..e2f1f4e 100644 &last_significant_coeff_x, &last_significant_coeff_y); if (last_significant_coeff_x > 3) { -@@ -1160,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1162,119 +1713,133 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int last_x_c = last_significant_coeff_x & 3; int last_y_c = last_significant_coeff_y & 3; @@ -5715,14 +3567,40 @@ index 05b2821..e2f1f4e 100644 - for (i = num_last_subset; i >= 0; i--) { - int n, m; - int x_cg, y_cg, x_c, y_c, pos; -- int implicit_non_zero_coeff = 0; ++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant ++ ++ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; ++ ++ { ++ const 
unsigned int ccount = 1 << (log2_trafo_size * 2); ++#ifdef RPI ++ use_vpu = 0; ++ if (s->enable_rpi) { ++ use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4; ++ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); ++#ifndef RPI_PRECLEAR ++ // We now do the memset after transform_add while we know the data is cached. ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++#endif ++ } ++ else ++#endif ++ { ++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++ } ++ } ++ ++ i = num_last_subset; ++ do { + int implicit_non_zero_coeff = 0; - int64_t trans_coeff_level; - int prev_sig = 0; - int offset = i << 4; - int rice_init = 0; -+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant ++ int n_end; -- uint8_t significant_coeff_flag_idx[16]; + uint8_t significant_coeff_flag_idx[16]; - uint8_t nb_significant_coeff_flag = 0; - - x_cg = scan_x_cg[i]; @@ -5734,8 +3612,7 @@ index 05b2821..e2f1f4e 100644 - ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg]; - if (y_cg < (1 << (log2_trafo_size - 2)) - 1) - ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1]; -+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; - +- - significant_coeff_group_flag[x_cg][y_cg] = - significant_coeff_group_flag_decode(s, c_idx, ctx_cg); - implicit_non_zero_coeff = 1; @@ -5744,13 +3621,8 @@ index 05b2821..e2f1f4e 100644 - ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) || - (x_cg == 0 && y_cg == 0)); - } -+ i = num_last_subset; -+ do { -+ int implicit_non_zero_coeff = 0; -+ int n_end; - +- - last_scan_pos = num_coeff - offset - 1; -+ uint8_t significant_coeff_flag_idx[16]; + unsigned int nb_significant_coeff_flag = 0; if (i == num_last_subset) { @@ -5836,7 +3708,7 @@ index 05b2821..e2f1f4e 100644 if (log2_trafo_size == 3) { scf_offset += (scan_idx == SCAN_DIAG) ? 
9 : 15; } else { -@@ -1286,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1288,34 +1853,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } } @@ -5885,7 +3757,7 @@ index 05b2821..e2f1f4e 100644 significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; nb_significant_coeff_flag++; } -@@ -1323,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1325,141 +1886,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } @@ -6188,7 +4060,7 @@ index 05b2821..e2f1f4e 100644 if (lc->cu.cu_transquant_bypass_flag) { if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1467,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1469,7 +2074,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); } } else { @@ -6197,20 +4069,9 @@ index 05b2821..e2f1f4e 100644 int rot = s->ps.sps->transform_skip_rotation_enabled_flag && log2_trafo_size == 2 && lc->cu.pred_mode == MODE_INTRA; -@@ -1475,7 +2086,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - for (i = 0; i < 8; i++) - FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); - } -- - s->hevcdsp.transform_skip(coeffs, log2_trafo_size); - - if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1486,8 +2096,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); - } +@@ -1490,6 +2095,24 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { -- s->hevcdsp.idct_4x4_luma(coeffs); -+ s->hevcdsp.idct_4x4_luma(coeffs); + s->hevcdsp.transform_4x4_luma(coeffs); } else { +#ifdef RPI + if (!use_vpu) { @@ -6232,16 +4093,16 @@ index 05b2821..e2f1f4e 100644 +#else int max_xy = FFMAX(last_significant_coeff_x, 
last_significant_coeff_y); if (max_xy == 0) - s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); -@@ -1501,6 +2129,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, + s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); +@@ -1503,6 +2126,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, col_limit = FFMIN(24, col_limit); - s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); + s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit); } +#endif } } if (lc->tu.cross_pf) { -@@ -1510,6 +2139,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1512,6 +2136,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); } } @@ -6250,19 +4111,19 @@ index 05b2821..e2f1f4e 100644 + HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; + cmd->type = RPI_PRED_TRANSFORM_ADD; + cmd->size = log2_trafo_size; -+ cmd->buf = coeffs; -+ cmd->dst = dst; -+ cmd->stride = stride; ++ cmd->ta.buf = coeffs; ++ cmd->ta.dst = dst; ++ cmd->ta.stride = stride; + return; + } +#endif - s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride); + s->hevcdsp.add_residual[log2_trafo_size-2](dst, coeffs, stride); } -diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c -index 1f33b0c..55a0315 100644 ---- a/libavcodec/hevc_filter.c -+++ b/libavcodec/hevc_filter.c +diff --git b/libavcodec/hevc_filter.c a/libavcodec/hevc_filter.c +index 14e7c8d..e4ffd87 100644 +--- b/libavcodec/hevc_filter.c ++++ a/libavcodec/hevc_filter.c @@ -22,6 +22,12 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -6276,19 +4137,18 @@ index 1f33b0c..55a0315 100644 #include "libavutil/common.h" #include "libavutil/internal.h" -@@ -31,6 +37,11 @@ +@@ -30,6 +36,10 @@ #include "bit_depth_template.c" +#ifdef RPI -+#include "rpi_user_vcsm.h" +#include "rpi_qpu.h" +#endif + #define LUMA 0 #define CB 1 #define CR 2 -@@ 
-273,6 +284,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -272,6 +282,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) edges[2] = x_ctb == s->ps.sps->ctb_width - 1; edges[3] = y_ctb == s->ps.sps->ctb_height - 1; @@ -6299,7 +4159,7 @@ index 1f33b0c..55a0315 100644 if (restore) { if (!edges[0]) { left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -@@ -496,6 +511,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -495,6 +509,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->ps.sps->pcm.loop_filter_disable_flag) || s->ps.pps->transquant_bypass_enable_flag; @@ -6315,7 +4175,7 @@ index 1f33b0c..55a0315 100644 if (x0) { left_tc_offset = s->deblock[ctb - 1].tc_offset; left_beta_offset = s->deblock[ctb - 1].beta_offset; -@@ -539,6 +563,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -538,6 +561,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[LUMA], beta, tc, no_p, no_q); } else @@ -6335,7 +4195,7 @@ index 1f33b0c..55a0315 100644 s->hevcdsp.hevc_v_loop_filter_luma(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q); -@@ -571,6 +608,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -570,6 +606,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[LUMA], beta, tc, no_p, no_q); } else @@ -6355,7 +4215,7 @@ index 1f33b0c..55a0315 100644 s->hevcdsp.hevc_h_loop_filter_luma(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q); -@@ -605,9 +655,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -604,9 +653,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -6379,7 +4239,7 @@ index 1f33b0c..55a0315 100644 } } -@@ -638,6 +702,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int 
x0, int y0) +@@ -637,6 +700,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -6399,7 +4259,7 @@ index 1f33b0c..55a0315 100644 s->hevcdsp.hevc_h_loop_filter_chroma(src, s->frame->linesize[chroma], c_tc, no_p, no_q); -@@ -648,69 +725,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -647,69 +723,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } } @@ -6469,7 +4329,7 @@ index 1f33b0c..55a0315 100644 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_trafo_size) -@@ -721,10 +735,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -720,10 +733,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_min_tu_size = s->ps.sps->log2_min_tb_size; int min_pu_width = s->ps.sps->min_pu_width; int min_tu_width = s->ps.sps->min_tb_width; @@ -6494,7 +4354,7 @@ index 1f33b0c..55a0315 100644 boundary_upper = y0 > 0 && !(y0 & 7); if (boundary_upper && -@@ -736,34 +761,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -735,34 +759,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_upper = 0; @@ -6571,7 +4431,7 @@ index 1f33b0c..55a0315 100644 boundary_left = x0 > 0 && !(x0 & 7); if (boundary_left && ((!s->sh.slice_loop_filter_across_slices_enabled_flag && -@@ -774,64 +821,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -773,64 +819,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_left = 0; @@ -6586,9 +4446,7 @@ index 1f33b0c..55a0315 100644 - int xq_pu = x0 >> log2_min_pu_size; - int xp_tu = (x0 - 1) >> log2_min_tu_size; - int xq_tu = x0 >> log2_min_tu_size; -+ rpl; -+ MvField *left = curr - 1; - +- - for (i = 0; i < (1 
<< log2_trafo_size); i += 4) { - int y_pu = (y0 + i) >> log2_min_pu_size; - int y_tu = (y0 + i) >> log2_min_tu_size; @@ -6606,10 +4464,7 @@ index 1f33b0c..55a0315 100644 - s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs; - } - } -+ if (is_intra) { -+ for (j = 0; j < (1 << log2_trafo_size); j += 4) -+ bs[j * s->bs_width >> 2] = 2; - +- - if (log2_trafo_size > log2_min_pu_size && !is_intra) { - RefPicList *rpl = s->ref->refPicList; - @@ -6617,12 +4472,17 @@ index 1f33b0c..55a0315 100644 - for (j = 8; j < (1 << log2_trafo_size); j += 8) { - int yp_pu = (y0 + j - 1) >> log2_min_pu_size; - int yq_pu = (y0 + j) >> log2_min_pu_size; -- ++ rpl; ++ MvField *left = curr - 1; + - for (i = 0; i < (1 << log2_trafo_size); i += 4) { - int x_pu = (x0 + i) >> log2_min_pu_size; - MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu]; - MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu]; -- ++ if (is_intra) { ++ for (j = 0; j < (1 << log2_trafo_size); j += 4) ++ bs[j * s->bs_width >> 2] = 2; + - bs = boundary_strength(s, curr, top, rpl); - s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs; + } else { @@ -6674,137 +4534,42 @@ index 1f33b0c..55a0315 100644 } } } -@@ -840,11 +877,196 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -839,11 +875,104 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, #undef CB #undef CR -+#if !defined(RPI_FAST_CACHEFLUSH) -+#if defined(RPI_LUMA_QPU) || defined(RPI_DEBLOCK_VPU) -+static void flush_buffer_y(const AVFrame * const frame) { -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame); -+ gpu_cache_flush(&p); -+} -+ -+static void flush_buffer_u(const AVFrame * const frame) { -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame); -+ gpu_cache_flush(&p); -+} -+ -+static void flush_buffer_v(const AVFrame * const frame) { -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame); -+ gpu_cache_flush(&p); -+} -+#endif -+#endif -+ -+ +#ifdef RPI_DEBLOCK_VPU -+#error Not fixed yet -+ +// 
ff_hevc_flush_buffer_lines +// flushes and invalidates all pixel rows in [start,end-1] +static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma) +{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ int curr_y = start; -+ int n = end; -+ int curr_uv = curr_y >> s->ps.sps->vshift[1]; -+ int n_uv = n >> s->ps.sps->vshift[1]; -+ int sz,base; -+ GPU_MEM_PTR_T p; -+ if (curr_uv < 0) curr_uv = 0; -+ if (n_uv<=curr_uv) { return; } -+ sz = s->frame->linesize[1] * (n_uv-curr_uv); -+ base = s->frame->linesize[1] * curr_uv; -+ if (flush_chroma) { -+ p = get_gpu_mem_ptr_u(s->frame); -+ iocache.s[0].handle = p.vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int)p.arm + base; -+ iocache.s[0].size = sz; -+ p = get_gpu_mem_ptr_v(s->frame); -+ iocache.s[1].handle = p.vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int)p.arm + base; -+ iocache.s[1].size = sz; -+ } -+ if (flush_luma) { -+ p = get_gpu_mem_ptr_y(s->frame); -+ sz = s->frame->linesize[0] * (n-curr_y); -+ base = s->frame->linesize[0] * curr_y; -+ iocache.s[2].handle = p.vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int)p.arm + base; -+ iocache.s[2].size = sz; -+ } -+ vcsm_clean_invalid( &iocache ); -+#else -+ if (flush_chroma) { -+ flush_buffer_u(s->frame); -+ flush_buffer_v(s->frame); -+ } -+ if (flush_luma) { -+ flush_buffer_y(s->frame); -+ } -+#endif ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); ++ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma); ++ rpi_cache_flush_finish(rfe); +} +#endif + -+#ifdef RPI_INTER_QPU -+void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n) ++#if RPI_INTER ++ ++// Flush some lines of a reference frames ++void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * 
const f, const unsigned int n) +{ + if (s->enable_rpi && s->used_for_ref) { -+ // TODO make this use ff_hevc_flush_buffer_lines -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ int curr_y = ((int *)f->progress->data)[0]; -+ int curr_uv = curr_y >> s->ps.sps->vshift[1]; -+ int n_uv = n >> s->ps.sps->vshift[1]; -+ int sz,base; -+ GPU_MEM_PTR_T p; -+ if (curr_uv < 0) curr_uv = 0; -+ if (n_uv<=curr_uv) { return; } -+ sz = s->frame->linesize[1] * (n_uv-curr_uv); -+ base = s->frame->linesize[1] * curr_uv; -+ p = get_gpu_mem_ptr_u(s->frame); -+ iocache.s[0].handle = p.vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int)p.arm + base; -+ iocache.s[0].size = sz; -+ p = get_gpu_mem_ptr_v(s->frame); -+ iocache.s[1].handle = p.vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int)p.arm + base; -+ iocache.s[1].size = sz; ++ const int d0 = ((int *)f->progress->data)[0]; ++ const unsigned int curr_y = d0 == -1 ? 
0 : d0; // At start of time progress is -1 + -+#ifdef RPI_LUMA_QPU -+ p = get_gpu_mem_ptr_y(s->frame); -+ sz = s->frame->linesize[0] * (n-curr_y); -+ base = s->frame->linesize[0] * curr_y; -+ iocache.s[2].handle = p.vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int)p.arm + base; -+ iocache.s[2].size = sz; -+#endif -+ vcsm_clean_invalid( &iocache ); -+#else -+ flush_buffer_u(s->frame); -+ flush_buffer_v(s->frame); -+#ifdef RPI_LUMA_QPU -+ flush_buffer_y(s->frame); -+#endif -+ -+#endif -+ //memcpy(s->dummy.arm,s->frame->data[0],2048*64); -+ //memcpy(s->dummy.arm,s->frame->data[1],1024*32); -+ //memcpy(s->dummy.arm,s->frame->data[2],1024*32); ++ if (curr_y < (unsigned int)f->f->height) { ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); ++ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1); ++ rpi_cache_flush_finish(rfe); ++ } + } +} +#endif + +#ifdef RPI_DEBLOCK_VPU -+#error XXX +/* rpi_deblock deblocks an entire row of ctbs using the VPU */ +static void rpi_deblock(HEVCContext *s, int y, int ctb_size) +{ @@ -6833,16 +4598,19 @@ index 1f33b0c..55a0315 100644 + s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) ); + s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1]; + s->dvq->vpu_cmds_arm[2][5] = 4; ++ + // Call VPU -+ s->dvq->cmd_id = vpu_post_code2( vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5, 0); // 5 means to do all the commands ++ { ++ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); ++ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands ++ vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id); ++ vpu_qpu_job_finish(vqj); ++ } + + s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1); + s->dvq = s->dvq_ents + s->dvq_n; + -+ if (s->dvq->cmd_id != -1) { -+ 
vpu_wait(s->dvq->cmd_id); -+ s->dvq->cmd_id = -1; -+ } ++ vpu_qpu_wait(&s->dvq->cmd_id); +} + +#endif @@ -6871,14 +4639,14 @@ index 1f33b0c..55a0315 100644 if (s->ps.sps->sao_enabled) { int y_end = y >= s->ps.sps->height - ctb_size; if (y && x) -@@ -853,16 +1075,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +@@ -852,16 +981,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) sao_filter_CTB(s, x - ctb_size, y); if (y && x_end) { sao_filter_CTB(s, x, y - ctb_size); - if (s->threads_type & FF_THREAD_FRAME ) -+ if (s->threads_type & FF_THREAD_FRAME ) { -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s,&s->ref->tf, y); ++ if (s->threads_type == FF_THREAD_FRAME ) { ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s,&s->ref->tf, y); +#endif ff_thread_report_progress(&s->ref->tf, y, 0); + } @@ -6886,14 +4654,15 @@ index 1f33b0c..55a0315 100644 if (x_end && y_end) { sao_filter_CTB(s, x , y); - if (s->threads_type & FF_THREAD_FRAME ) -+ if (s->threads_type & FF_THREAD_FRAME ) { -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size); ++ if (s->threads_type == FF_THREAD_FRAME ) { ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size); +#endif ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); + } -+ } -+ } else if (s->threads_type & FF_THREAD_FRAME && x_end) { + } +- } else if (s->threads_type & FF_THREAD_FRAME && x_end) ++ } else if (s->threads_type == FF_THREAD_FRAME && x_end) { + //int newh = y + ctb_size - 4; + //int currh = s->ref->tf.progress->data[0]; + //if (((y + ctb_size)&63)==0) @@ -6904,15 +4673,14 @@ index 1f33b0c..55a0315 100644 + ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); + } + } else { -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4); ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +#endif + ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); - } -- } else if 
(s->threads_type & FF_THREAD_FRAME && x_end) ++ } +#else -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4); ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); + // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi +#endif ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); @@ -6921,11 +4689,11 @@ index 1f33b0c..55a0315 100644 } void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) -diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c -index 83f2ec2..6882a8d 100644 ---- a/libavcodec/hevc_ps.c -+++ b/libavcodec/hevc_ps.c -@@ -989,6 +989,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, +diff --git b/libavcodec/hevc_ps.c a/libavcodec/hevc_ps.c +index acd55cc..0a465d4 100644 +--- b/libavcodec/hevc_ps.c ++++ a/libavcodec/hevc_ps.c +@@ -1001,6 +1001,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, sps->amp_enabled_flag = get_bits1(gb); sps->sao_enabled = get_bits1(gb); @@ -6934,10 +4702,2640 @@ index 83f2ec2..6882a8d 100644 sps->pcm_enabled_flag = get_bits1(gb); if (sps->pcm_enabled_flag) { sps->pcm.bit_depth = get_bits(gb, 4) + 1; -diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c -index 9d773d9..a6534a9 100644 ---- a/libavcodec/hevcdsp.c -+++ b/libavcodec/hevcdsp.c +diff --git b/libavcodec/hevcdec.c a/libavcodec/hevcdec.c +index ef21595..b36e840 100644 +--- b/libavcodec/hevcdec.c ++++ a/libavcodec/hevcdec.c +@@ -42,8 +42,233 @@ + #include "hevcdec.h" + #include "profiles.h" + ++#ifdef RPI ++ #include "rpi_qpu.h" ++ #include "rpi_shader.h" ++ ++ // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory ++ #define RPI_CACHE_UNIF_MVS 1 ++ ++ // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*) ++ //#define RPI_SIMULATE_QPUS ++ #ifdef RPI_WORKER ++ #include "pthread.h" ++ #endif ++ ++ #include 
"libavutil/atomic.h" ++ ++ static void worker_core(HEVCContext * const s); ++#endif ++ ++// #define DISABLE_MC ++ ++ ++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) ++ ++#ifndef av_mod_uintp2 ++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) ++{ ++ return a & ((1 << p) - 1); ++} ++# define av_mod_uintp2 av_mod_uintp2_c ++#endif ++ ++#define Y_B_ONLY 1 ++ + const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; + ++ ++#if RPI_INTER ++ ++// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks ++// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks ++// For each block of 64*64 the smallest block size is 8x4 ++// We also need an extra command for the setup information ++ ++#define RPI_CHROMA_COMMAND_WORDS 11 ++#define UV_COMMANDS_PER_QPU ((1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS) ++// The QPU code for UV blocks only works up to a block width of 8 ++#define RPI_CHROMA_BLOCK_WIDTH 8 ++ ++typedef struct qpu_mc_pred_c_s { ++ uint32_t next_fn; ++ int16_t next_src_y; ++ int16_t next_src_x; ++ uint32_t next_src_base_u; ++ uint32_t next_src_base_v; ++ union { ++ struct { ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ uint32_t dst_addr_u; ++ uint32_t dst_addr_v; ++ } p; ++ struct { ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t weight_u; ++ uint32_t weight_v; ++ uint32_t dummy0; ++ uint32_t dummy1; ++ } b0; ++ struct { ++ uint32_t dummy0; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ uint32_t dst_addr_u; ++ uint32_t dst_addr_v; ++ } b1; ++ struct { ++ uint32_t pic_w; ++ uint32_t pic_h; ++ uint32_t src_stride; ++ uint32_t dst_stride; ++ uint32_t wdenom; ++ uint32_t dummy0; ++ uint32_t dummy1; ++ } s; ++ }; ++} qpu_mc_pred_c_t; ++ ++ ++static const 
char static_assert_qpu_mc_pred[sizeof(qpu_mc_pred_c_t) != RPI_CHROMA_COMMAND_WORDS * 4 ? -1 : 1] = {0}; ++ ++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) ++ ++// TODO Chroma only needs 4 taps ++ ++// Actual filter goes -ve, +ve, +ve, -ve using these values ++static const uint32_t rpi_filter_coefs[8] = { ++ ENCODE_COEFFS( 0, 64, 0, 0), ++ ENCODE_COEFFS( 2, 58, 10, 2), ++ ENCODE_COEFFS( 4, 54, 16, 2), ++ ENCODE_COEFFS( 6, 46, 28, 4), ++ ENCODE_COEFFS( 4, 36, 36, 4), ++ ENCODE_COEFFS( 4, 28, 46, 6), ++ ENCODE_COEFFS( 2, 16, 54, 4), ++ ENCODE_COEFFS( 2, 10, 58, 2) ++}; ++ ++#define RPI_LUMA_COMMAND_WORDS 10 ++#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS) ++#endif ++ ++ ++#ifdef RPI_WORKER ++ ++typedef struct worker_global_env_s ++{ ++ volatile int arm_load; ++ pthread_mutex_t lock; ++ ++ unsigned int arm_y; ++ unsigned int arm_c; ++ unsigned int gpu_y; ++ unsigned int gpu_c; ++} worker_global_env_t; ++ ++static worker_global_env_t worker_global_env = ++{ ++ .lock = PTHREAD_MUTEX_INITIALIZER ++}; ++ ++ ++//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); ++//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); ++ ++#define LOG_ENTER ++#define LOG_EXIT ++ ++// Call this when we have completed pass0 and wish to trigger pass1 for the current job ++static void worker_submit_job(HEVCContext *s) ++{ ++ LOG_ENTER ++ pthread_mutex_lock(&s->worker_mutex); ++ s->worker_tail++; ++ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot ++ pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved ++ pthread_mutex_unlock(&s->worker_mutex); ++ LOG_EXIT ++} ++ ++// Call this to say we have completed pass1 ++static void worker_complete_job(HEVCContext *s) ++{ 
++ LOG_ENTER ++ pthread_mutex_lock(&s->worker_mutex); ++ s->worker_head++; ++ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot ++ pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved ++ pthread_mutex_unlock(&s->worker_mutex); ++ LOG_EXIT ++} ++ ++// Call this to wait for all jobs to have completed at the end of a frame ++static void worker_wait(HEVCContext *s) ++{ ++ LOG_ENTER ++ pthread_mutex_lock(&s->worker_mutex); ++ while( s->worker_head !=s->worker_tail) ++ { ++ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); ++ } ++ pthread_mutex_unlock(&s->worker_mutex); ++ LOG_EXIT ++} ++ ++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes ++// available to receive the next job. ++static void worker_pass0_ready(HEVCContext *s) ++{ ++ LOG_ENTER ++ pthread_mutex_lock(&s->worker_mutex); ++ // tail is number of submitted jobs ++ // head is number of completed jobs ++ // tail-head is number of outstanding jobs in the queue ++ // we need to ensure there is at least 1 space left for us to use ++ while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS) ++ { ++ // Wait until another job is completed ++ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); ++ } ++ pthread_mutex_unlock(&s->worker_mutex); ++ LOG_EXIT ++} ++ ++static void *worker_start(void *arg) ++{ ++ HEVCContext *s = (HEVCContext *)arg; ++ while(1) { ++ pthread_mutex_lock(&s->worker_mutex); ++ ++ while( !s->kill_worker && s->worker_tail - s->worker_head <= 0) ++ { ++ pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex); ++ } ++ pthread_mutex_unlock(&s->worker_mutex); ++ ++ if (s->kill_worker) { ++ break; ++ } ++ LOG_ENTER ++ worker_core(s); ++ ++ worker_complete_job(s); ++ LOG_EXIT ++ } ++ return NULL; ++} ++ ++#endif ++ + /** + * NOTE: Each function hls_foo correspond to the function foo in the + * specification (HLS stands for High Level Syntax). 
+@@ -56,6 +281,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 + /* free everything allocated by pic_arrays_init() */ + static void pic_arrays_free(HEVCContext *s) + { ++#ifdef RPI ++ int job; ++ for(job=0;jobcoeffs_buf_arm[job][0]) { ++ gpu_free(&s->coeffs_buf_default[job]); ++ s->coeffs_buf_arm[job][0] = 0; ++ } ++ if (s->coeffs_buf_arm[job][2]) { ++ gpu_free(&s->coeffs_buf_accelerated[job]); ++ s->coeffs_buf_arm[job][2] = 0; ++ } ++ } ++#endif ++#ifdef RPI_DEBLOCK_VPU ++ { ++ int i; ++ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) { ++ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; ++ ++ if (dvq->vpu_cmds_arm) { ++ gpu_free(&dvq->deblock_vpu_gmem); ++ dvq->vpu_cmds_arm = 0; ++ } ++ } ++ } ++#endif + av_freep(&s->sao); + av_freep(&s->deblock); + +@@ -92,6 +343,88 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) + int ctb_count = sps->ctb_width * sps->ctb_height; + int min_pu_size = sps->min_pu_width * sps->min_pu_height; + ++#ifdef RPI ++ const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size); ++ const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS; ++ const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; ++ const int coefs_per_row = coefs_per_luma + coefs_per_chroma; ++ int job; ++ ++ av_assert0(sps); ++// s->max_ctu_count = sps->ctb_width; ++// printf("CTB with=%d\n", sps->ctb_width); ++// s->max_ctu_count = coefs_per_luma / coefs_in_ctb; ++ s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width); ++ s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y; ++ s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV; ++ ++ for(job=0;jobcoeffs_buf_default[job]); ++ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; ++ if (!s->coeffs_buf_arm[job][0]) ++ goto fail; ++ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth 
of data ++ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; ++ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; ++ if (!s->coeffs_buf_arm[job][2]) ++ goto fail; ++ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. ++ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; ++ } ++ } ++#endif ++#ifdef RPI_DEBLOCK_VPU ++ { ++ int i; ++ s->enable_rpi_deblock = !sps->sao_enabled; ++ s->setup_width = (sps->width+15) / 16; ++ s->setup_height = (sps->height+15) / 16; ++ s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16; ++ s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16; ++ ++ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) ++ { ++ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; ++ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15; ++ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15; ++ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15; ++ const unsigned int total_size =- cmd_size + y_size + uv_size; ++ int p_vc; ++ uint8_t * p_arm; ++ #if RPI_VPU_DEBLOCK_CACHED ++ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem); ++ #else ++ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem); ++ #endif ++ p_vc = dvq->deblock_vpu_gmem.vc; ++ p_arm = dvq->deblock_vpu_gmem.arm; ++ ++ // Zap all ++ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes); ++ ++ // Subdivide ++ dvq->vpu_cmds_arm = (void*)p_arm; ++ dvq->vpu_cmds_vc = p_vc; ++ ++ p_arm += cmd_size; ++ p_vc += cmd_size; ++ ++ dvq->y_setup_arm = (void*)p_arm; ++ dvq->y_setup_vc = (void*)p_vc; ++ ++ p_arm += y_size; ++ p_vc += y_size; ++ ++ dvq->uv_setup_arm = (void*)p_arm; ++ dvq->uv_setup_vc = (void*)p_vc; ++ } ++ ++ s->dvq_n = 0; ++ s->dvq = s->dvq_ents + s->dvq_n; ++ } ++#endif ++ + 
s->bs_width = (width >> 2) + 1; + s->bs_height = (height >> 2) + 1; + +@@ -138,6 +471,29 @@ fail: + return AVERROR(ENOMEM); + } + ++static void default_pred_weight_table(HEVCContext * const s) ++{ ++ unsigned int i; ++ s->sh.luma_log2_weight_denom = 0; ++ s->sh.chroma_log2_weight_denom = 0; ++ for (i = 0; i < s->sh.nb_refs[L0]; i++) { ++ s->sh.luma_weight_l0[i] = 1; ++ s->sh.luma_offset_l0[i] = 0; ++ s->sh.chroma_weight_l0[i][0] = 1; ++ s->sh.chroma_offset_l0[i][0] = 0; ++ s->sh.chroma_weight_l0[i][1] = 1; ++ s->sh.chroma_offset_l0[i][1] = 0; ++ } ++ for (i = 0; i < s->sh.nb_refs[L1]; i++) { ++ s->sh.luma_weight_l1[i] = 1; ++ s->sh.luma_offset_l1[i] = 0; ++ s->sh.chroma_weight_l1[i][0] = 1; ++ s->sh.chroma_offset_l1[i][0] = 0; ++ s->sh.chroma_weight_l1[i][1] = 1; ++ s->sh.chroma_offset_l1[i][1] = 0; ++ } ++} ++ + static void pred_weight_table(HEVCContext *s, GetBitContext *gb) + { + int i = 0; +@@ -678,6 +1034,11 @@ static int hls_slice_header(HEVCContext *s) + (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) { + pred_weight_table(s, gb); + } ++ else ++ { ++ // Give us unit weights ++ default_pred_weight_table(s); ++ } + + sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); + if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { +@@ -933,6 +1294,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { + return 0; + } + ++#ifdef RPI ++static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) ++{ ++ if (s->enable_rpi) { ++ HEVCLocalContext *lc = s->HEVClc; ++ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ cmd->type = RPI_PRED_INTRA; ++ cmd->size = log2_trafo_size; ++ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; ++ cmd->c_idx = c_idx; ++ cmd->i_pred.x = x0; ++ cmd->i_pred.y = y0; ++ cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ } else { ++ s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx); ++ } ++} ++#endif ++ + static int hls_transform_unit(HEVCContext *s, int x0, int y0, + int xBase, int yBase, int cb_xBase, int cb_yBase, + int log2_cb_size, int log2_trafo_size, +@@ -945,8 +1325,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, + if (lc->cu.pred_mode == MODE_INTRA) { + int trafo_size = 1 << log2_trafo_size; + ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); +- ++#ifdef RPI ++ rpi_intra_pred(s, log2_trafo_size, x0, y0, 0); ++#else + s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0); ++#endif + } + + if (cbf_luma || cbf_cb[0] || cbf_cr[0] || +@@ -1032,7 +1415,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, + for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { + if (lc->cu.pred_mode == MODE_INTRA) { + ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); ++#ifdef RPI ++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1); ++#else + s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1); ++#endif + } + if (cbf_cb[i]) + ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), +@@ -1061,7 +1448,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, + for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 
2 : 1); i++) { + if (lc->cu.pred_mode == MODE_INTRA) { + ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); ++#ifdef RPI ++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2); ++#else + s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2); ++#endif + } + if (cbf_cr[i]) + ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), +@@ -1090,7 +1481,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, + if (lc->cu.pred_mode == MODE_INTRA) { + ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), + trafo_size_h, trafo_size_v); ++#ifdef RPI ++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1); ++#else + s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1); ++#endif + } + if (cbf_cb[i]) + ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), +@@ -1100,7 +1495,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, + if (lc->cu.pred_mode == MODE_INTRA) { + ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), + trafo_size_h, trafo_size_v); ++#ifdef RPI ++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2); ++#else + s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2); ++#endif + } + if (cbf_cr[i]) + ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), +@@ -1112,26 +1511,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, + int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); + int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); + ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); ++#ifdef RPI ++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1); ++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2); ++#else + s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1); + 
s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2); ++#endif + if (s->ps.sps->chroma_format_idc == 2) { + ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c), + trafo_size_h, trafo_size_v); ++#ifdef RPI ++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1); ++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2); ++#else + s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1); + s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2); ++#endif + } + } else if (blk_idx == 3) { + int trafo_size_h = 1 << (log2_trafo_size + 1); + int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]); + ff_hevc_set_neighbour_available(s, xBase, yBase, + trafo_size_h, trafo_size_v); ++#ifdef RPI ++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1); ++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2); ++#else + s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1); + s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2); ++#endif + if (s->ps.sps->chroma_format_idc == 2) { + ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)), + trafo_size_h, trafo_size_v); ++#ifdef RPI ++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1); ++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2); ++#else + s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1); + s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2); ++#endif + } + } + } +@@ -1277,33 +1696,23 @@ do { + return 0; + } + +-static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) ++ ++static int pcm_extract(HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) + { +- HEVCLocalContext *lc = s->HEVClc; + GetBitContext gb; +- int cb_size = 1 << log2_cb_size; + ptrdiff_t stride0 = 
s->frame->linesize[0]; + ptrdiff_t stride1 = s->frame->linesize[1]; + ptrdiff_t stride2 = s->frame->linesize[2]; + uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; + uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; + uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; +- +- int length = cb_size * cb_size * s->ps.sps->pcm.bit_depth + +- (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) + +- ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) * +- s->ps.sps->pcm.bit_depth_chroma; +- const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3); + int ret; + +- if (!s->sh.disable_deblocking_filter_flag) +- ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); +- + ret = init_get_bits(&gb, pcm, length); + if (ret < 0) + return ret; + +- s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); + if (s->ps.sps->chroma_format_idc) { + s->hevcdsp.put_pcm(dst1, stride1, + cb_size >> s->ps.sps->hshift[1], +@@ -1318,6 +1727,59 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) + return 0; + } + ++#ifdef RPI ++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n) ++{ ++ int16_t * const coeffs = (buf_no != 3) ? 
++ s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] : ++ s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n; ++ s->num_coeffs[s->pass0_job][buf_no] += n; ++ return coeffs; ++} ++#endif ++ ++// x * 2^(y*2) ++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) ++{ ++ return x << (y * 2); ++} ++ ++static int hls_pcm_sample(HEVCContext * const s, const int x0, const int y0, unsigned int log2_cb_size) ++{ ++ // Length in bits ++ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + ++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[1]) + ++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[2]); ++ ++ const uint8_t * const pcm = skip_bytes(&s->HEVClc->cc, (length + 7) >> 3); ++ ++ if (!s->sh.disable_deblocking_filter_flag) ++ ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); ++ ++#ifdef RPI ++ if (s->enable_rpi) { ++ // Copy coeffs ++ const int blen = (length + 7) >> 3; ++ int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, (blen + 1) >> 1); ++ memcpy(coeffs, pcm, blen); ++ ++ // Add command ++ { ++ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ cmd->type = RPI_PRED_I_PCM; ++ cmd->size = log2_cb_size; ++ cmd->i_pcm.src = coeffs; ++ cmd->i_pcm.x = x0; ++ cmd->i_pcm.y = y0; ++ cmd->i_pcm.src_len = length; ++ } ++ return 0; ++ } ++#endif ++ ++ return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size); ++} ++ + /** + * 8.5.3.2.2.1 Luma sample unidirectional interpolation process + * +@@ -1334,6 +1796,91 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) + * @param luma_offset additive offset applied to the luma prediction value + */ + ++#if RPI_INTER ++static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, ++ AVFrame *ref, const Mv *mv, int x_off, int y_off, ++ int block_w, int block_h, int luma_weight, 
int luma_offset) ++{ ++ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; ++ cmd->cmd = RPI_CMD_LUMA_UNI; ++ cmd->dst = dst; ++ cmd->dststride = dststride; ++ cmd->src = ref->data[0]; ++ cmd->srcstride = ref->linesize[0]; ++ cmd->mv = *mv; ++ cmd->x_off = x_off; ++ cmd->y_off = y_off; ++ cmd->block_w = block_w; ++ cmd->block_h = block_h; ++ cmd->weight = luma_weight; ++ cmd->offset = luma_offset; ++} ++ ++static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, ++ AVFrame *ref0, const Mv *mv0, int x_off, int y_off, ++ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, ++ const struct MvField * const current_mv) ++{ ++ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; ++ cmd->cmd = RPI_CMD_LUMA_BI; ++ cmd->dst = dst; ++ cmd->dststride = dststride; ++ cmd->src = ref0->data[0]; ++ cmd->srcstride = ref0->linesize[0]; ++ cmd->mv = *mv0; ++ cmd->x_off = x_off; ++ cmd->y_off = y_off; ++ cmd->block_w = block_w; ++ cmd->block_h = block_h; ++ cmd->src1 = ref1->data[0]; ++ cmd->srcstride1 = ref1->linesize[0]; ++ cmd->mv1 = *mv1; ++ cmd->ref_idx[0] = current_mv->ref_idx[0]; ++ cmd->ref_idx[1] = current_mv->ref_idx[1]; ++} ++ ++static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0, ++ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, ++ int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset) ++{ ++ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; ++ cmd->cmd = RPI_CMD_CHROMA_UNI; ++ cmd->dst = dst0; ++ cmd->dststride = dststride; ++ cmd->src = src0; ++ cmd->srcstride = srcstride; ++ cmd->mv = *mv; ++ cmd->x_off = x_off; ++ cmd->y_off = y_off; ++ cmd->block_w = block_w; ++ cmd->block_h = block_h; ++ cmd->weight = chroma_weight; ++ cmd->offset = chroma_offset; ++} ++ ++static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame 
*ref1, ++ int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx) ++{ ++ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; ++ cmd->cmd = RPI_CMD_CHROMA_BI+cidx; ++ cmd->dst = dst0; ++ cmd->dststride = dststride; ++ cmd->src = ref0->data[cidx+1]; ++ cmd->srcstride = ref0->linesize[cidx+1]; ++ cmd->mv = current_mv->mv[0]; ++ cmd->mv1 = current_mv->mv[1]; ++ cmd->x_off = x_off; ++ cmd->y_off = y_off; ++ cmd->block_w = block_w; ++ cmd->block_h = block_h; ++ cmd->src1 = ref1->data[cidx+1]; ++ cmd->srcstride1 = ref1->linesize[cidx+1]; ++ cmd->ref_idx[0] = current_mv->ref_idx[0]; ++ cmd->ref_idx[1] = current_mv->ref_idx[1]; ++} ++ ++#endif ++ + static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, + AVFrame *ref, const Mv *mv, int x_off, int y_off, + int block_w, int block_h, int luma_weight, int luma_offset) +@@ -1349,6 +1896,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, + (s->sh.slice_type == HEVC_SLICE_B && s->ps.pps->weighted_bipred_flag); + int idx = ff_hevc_pel_weight[block_w]; + ++#ifdef DISABLE_MC ++ return; ++#endif ++ + x_off += mv->x >> 2; + y_off += mv->y >> 2; + src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); +@@ -1395,7 +1946,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, + * @param mv1 motion vector1 (relative to block position) to get pixel data from + * @param current_mv current motion vector structure + */ +- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, ++static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, + AVFrame *ref0, const Mv *mv0, int x_off, int y_off, + int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) + { +@@ -1419,6 +1970,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, + uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + 
(int)((unsigned)x_off0 << s->ps.sps->pixel_shift); + uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); + ++#ifdef DISABLE_MC ++ return; ++#endif ++ + if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || + x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || + y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { +@@ -1504,6 +2059,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, + intptr_t _mx = mx << (1 - hshift); + intptr_t _my = my << (1 - vshift); + ++#ifdef DISABLE_MC ++ return; ++#endif ++ + x_off += mv->x >> (2 + hshift); + y_off += mv->y >> (2 + vshift); + src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); +@@ -1568,6 +2127,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF + int hshift = s->ps.sps->hshift[1]; + int vshift = s->ps.sps->vshift[1]; + ++#ifdef DISABLE_MC ++ return; ++#endif ++ + intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); + intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); + intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); +@@ -1695,14 +2258,312 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, + } + } + +-static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +- int nPbW, int nPbH, +- int log2_cb_size, int partIdx, int idx) ++ ++#if RPI_INTER ++static void ++rpi_pred_y(HEVCContext *const s, const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const Mv *const mv, ++ const int weight_mul, ++ const int weight_offset, ++ AVFrame *const src_frame) ++{ ++ const unsigned int y_off = x0 + y0 * s->frame->linesize[0]; ++ ++ rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, ++ mv, x0, y0, nPbW, nPbH, ++ weight_mul, weight_offset); ++ ++ { ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my << 8) | mx; ++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; ++ const int x1_m3 = x0 + 
(mv->x >> 2) - 3; ++ const int y1_m3 = y0 + (mv->y >> 2) - 3; ++ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); ++ uint32_t *y = s->curr_y_mvs; ++ uint32_t dst_base = get_vc_address_y(s->frame) + y_off; ++ const uint32_t wo_0 = PACK2(weight_offset * 2 + 1, weight_mul); ++ ++ // Potentially we could change the assembly code to support taller sizes in one go ++ for (int start_y = 0; start_y < nPbH; start_y += 16, dst_base += s->frame->linesize[0] * 16) { ++ const uint32_t src_yx_y = y1_m3 + start_y; ++ int start_x = 0; ++ const int bh = FFMIN(nPbH - start_y, 16); ++ uint32_t *const py = y - RPI_LUMA_COMMAND_WORDS; ++ uint32_t *const ppy = y - RPI_LUMA_COMMAND_WORDS * 2; ++ ++ // As Y-pred operates on two independant 8-wide src blocks we can merge ++ // this pred with the previous one if it the previous one is 8 pel wide, ++ // the same height as the current block, immediately to the left of our ++ // current dest block and mono-pred. ++ // ++ // In the init (1st) block w/h is pic width height so given ++ // that no pic will ever be 8 pixels wide the first test here ++ // should fail if this is the first pred (i.e. 
after that test ++ // ppy is valid) ++ if (py[4] == ((8 << 16) | bh) && py[8] + 8 == dst_base && ppy[9] == s->qpu_filter) { ++ const int bw = FFMIN(nPbW, 8); ++ ++ ppy[2] = PACK2(src_yx_y, x1_m3); ++ ppy[3] = src_vc_address_y; ++ py[4] += bw << 16; ++ py[5] = PACK2(my2_mx2_my_mx, py[5]); ++ // py[6] stays the same ++ py[7] = wo_0; ++ ++ start_x = bw; ++ } ++ ++ for (; start_x < nPbW; start_x += 16) { ++ const int bw = FFMIN(nPbW - start_x, 16);; ++ y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(src_yx_y, x1_m3 + start_x); ++ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y; ++ y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(src_yx_y, x1_m3 + 8 + start_x); ++ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y; ++ *y++ = PACK2(bw, bh); ++ *y++ = my2_mx2_my_mx; ++ *y++ = wo_0; ++ *y++ = wo_0; ++ *y++ = dst_base + start_x; ++ y++[-RPI_LUMA_COMMAND_WORDS] = s->qpu_filter; ++ } ++ } ++ s->curr_y_mvs = y; ++ } ++} ++ ++static void ++rpi_pred_y_b(HEVCContext * const s, ++ const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const struct MvField *const mv_field, ++ AVFrame *const src_frame, ++ AVFrame *const src_frame2) ++{ ++ const unsigned int y_off = x0 + y0 * s->frame->linesize[0]; ++ const Mv * const mv = mv_field->mv + 0; ++ const Mv * const mv2 = mv_field->mv + 1; ++ ++ rpi_luma_mc_bi(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, ++ mv, x0, y0, nPbW, nPbH, ++ src_frame2, mv2, mv_field); ++#if !Y_B_ONLY ++ { ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my<<8) | mx; ++ const unsigned int mx2 = mv2->x & 3; ++ const unsigned int my2 = mv2->y & 3; ++ const unsigned int my2_mx2 = (my2<<8) | mx2; ++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const int x1 = x0 + (mv->x >> 2) - 3; ++ const int y1 = y0 + (mv->y >> 2) - 3; ++ const int x2 = x0 + (mv2->x >> 2) - 3; ++ const int y2 = y0 + (mv2->y >> 2) - 3; ++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; ++ const unsigned int ref_idx1 = 
mv_field->ref_idx[1]; ++ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + ++ s->sh.luma_offset_l1[ref_idx1] + 1; ++ const uint32_t wo_0 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); ++ const uint32_t wo_1 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++ ++ uint32_t * y = s->curr_y_mvs; ++ uint32_t dst = get_vc_address_y(s->frame) + y_off; ++ ++ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go ++ for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time ++ int bw = nPbW-start_x; ++ int bh = nPbH-start_y; ++ y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(y1 + start_y, x1 + start_x); ++ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(src_frame); ++ y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(y2 + start_y, x2 + start_x); ++ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(src_frame2); ++ *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16); ++ *y++ = my2_mx2_my_mx; ++ ++ *y++ = wo_0; ++ *y++ = wo_1; ++ ++ *y++ = dst + start_x; ++ y++[-RPI_LUMA_COMMAND_WORDS] = s->qpu_filter_b; ++ } ++ dst += s->frame->linesize[0] * 16; ++ } ++ s->curr_y_mvs = y; ++ } ++#endif ++} ++ ++ ++static void ++rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const Mv * const mv, ++ const int16_t * const c_weights, ++ const int16_t * const c_offsets, ++ AVFrame * const src_frame) ++{ ++ ++ const unsigned int c_off = x0_c + y0_c * s->frame->linesize[1]; ++ av_assert0(s->frame->linesize[1] == s->frame->linesize[2]); ++ ++ rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1], ++ x0_c, y0_c, nPbW_c, nPbH_c, mv, ++ c_weights[0], c_offsets[0]); ++ ++ rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2], ++ x0_c, y0_c, nPbW_c, nPbH_c, mv, ++ c_weights[1], c_offsets[1]); ++ ++ { ++ const int hshift = s->ps.sps->hshift[1]; ++ const 
int vshift = s->ps.sps->vshift[1]; ++ ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1; // vertical MV component uses the vertical chroma shift, as y_coeffs does ++ const uint32_t src_base_u = get_vc_address_u(src_frame); ++ const uint32_t src_base_v = get_vc_address_v(src_frame); ++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; ++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; ++ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); ++ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); ++ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; ++ uint32_t dst_base_v = get_vc_address_v(s->frame) + c_off; ++ ++ qpu_mc_pred_c_t * u = (qpu_mc_pred_c_t *)s->curr_u_mvs; ++ ++ for(int start_y=0;start_y < nPbH_c;start_y+=16) ++ { ++ const int bh = FFMIN(nPbH_c-start_y, 16); ++ // We are allowed 3/4 powers of two as well as powers of 2 ++ av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2); ++ ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH, ++u) ++ { ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ u[-1].next_fn = s->qpu_filter_uv; ++ u[-1].next_src_x = x1_c + start_x; ++ u[-1].next_src_y = y1_c + start_y; ++ u[-1].next_src_base_u = src_base_u; ++ u[-1].next_src_base_v = src_base_v; ++ u[0].p.h = bh; ++ u[0].p.w = bw; ++ u[0].p.coeffs_x = x_coeffs; ++ u[0].p.coeffs_y = y_coeffs; ++ u[0].p.wo_u = wo_u; ++ u[0].p.wo_v = wo_v; ++ u[0].p.dst_addr_u = dst_base_u + start_x; ++ u[0].p.dst_addr_v = dst_base_v + start_x; ++ } ++ ++ dst_base_u += s->frame->linesize[1] * 16; ++ dst_base_v += s->frame->linesize[2] * 16; ++ } ++ s->curr_u_mvs = (uint32_t *)u; ++ } ++ return; ++} ++ ++static void ++rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const struct MvField * const mv_field, ++ const int16_t * const c_weights, ++ const int16_t * const 
c_offsets, ++ const int16_t * const c_weights2, ++ const int16_t * const c_offsets2, ++ AVFrame * const src_frame, ++ AVFrame * const src_frame2) ++{ ++ const unsigned int c_off = x0_c + y0_c * s->frame->linesize[1]; ++ av_assert0(s->frame->linesize[1] == s->frame->linesize[2]); ++ ++ rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2, ++ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 0); ++ ++ rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame, src_frame2, ++ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1); ++ ++ { ++ const int hshift = s->ps.sps->hshift[1]; ++ const int vshift = s->ps.sps->vshift[1]; ++ const Mv * const mv = mv_field->mv + 0; ++ const Mv * const mv2 = mv_field->mv + 1; ++ ++ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); ++ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); ++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; ++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1; // vertical MV component uses the vertical chroma shift, as my does ++ ++ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); ++ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); ++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; ++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector ++ ++ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; ++ const int y2_c = y0_c + (mv2->y >> (2 + vshift)) - 1; // vertical MV component uses the vertical chroma shift, as my2 does ++ ++ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; ++ uint32_t dst_base_v = get_vc_address_v(s->frame) + c_off; ++ qpu_mc_pred_c_t * u = (qpu_mc_pred_c_t *)s->curr_u_mvs; ++ ++ for (int start_y = 0; start_y < nPbH_c; start_y += 16) { ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH, u += 2) { ++ int bw = nPbW_c-start_x; ++ int bh = nPbH_c-start_y; ++ u[-1].next_fn = 
s->qpu_filter_uv_b0; // In fact ignored ++ u[-1].next_src_x = x1_c + start_x; ++ u[-1].next_src_y = y1_c + start_y; ++ u[-1].next_src_base_u = get_vc_address_u(src_frame); ++ u[-1].next_src_base_v = get_vc_address_v(src_frame); ++ ++ u[0].next_fn = s->qpu_filter_uv_b; ++ u[0].next_src_x = x2_c + start_x; ++ u[0].next_src_y = y2_c + start_y; ++ u[0].next_src_base_u = get_vc_address_u(src_frame2); ++ u[0].next_src_base_v = get_vc_address_v(src_frame2); ++ ++ u[0].b0.h = (bh<16 ? bh : 16); ++ u[0].b0.w = (bwframe->linesize[1] * 16; ++ dst_base_v += s->frame->linesize[2] * 16; ++ } ++ ++ s->curr_u_mvs = (uint32_t *)u; ++ } ++} ++#endif ++ ++ ++ ++static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) + { + #define POS(c_idx, x, y) \ + &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \ + (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)] +- HEVCLocalContext *lc = s->HEVClc; ++ HEVCLocalContext * const lc = s->HEVClc; + int merge_idx = 0; + struct MvField current_mv = {{{ 0 }}}; + +@@ -1720,8 +2581,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, + int y_cb = y0 >> log2_min_cb_size; + int x_pu, y_pu; + int i, j; +- +- int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb); ++ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb); + + if (!skip_flag) + lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); +@@ -1765,12 +2625,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, + int nPbW_c = nPbW >> s->ps.sps->hshift[1]; + int nPbH_c = nPbH >> s->ps.sps->vshift[1]; + +- luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, ++#if RPI_INTER ++ if (s->enable_rpi) { ++ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 0, ++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); ++ } else 
++#endif ++ { ++ luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, + &current_mv.mv[0], x0, y0, nPbW, nPbH, + s->sh.luma_weight_l0[current_mv.ref_idx[0]], + s->sh.luma_offset_l0[current_mv.ref_idx[0]]); ++ } + + if (s->ps.sps->chroma_format_idc) { ++#if RPI_INTER ++ if (s->enable_rpi) { ++ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); ++ return; // chroma handled by rpi_pred_c; skip the chroma_mc_uni calls below ++ } ++#endif + chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], + 0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, + s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); +@@ -1784,12 +2661,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, + int nPbW_c = nPbW >> s->ps.sps->hshift[1]; + int nPbH_c = nPbH >> s->ps.sps->vshift[1]; + +- luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, ++#if RPI_INTER ++ if (s->enable_rpi) { ++ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 1, ++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); ++ } else ++#endif ++ { ++ luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, + &current_mv.mv[1], x0, y0, nPbW, nPbH, + s->sh.luma_weight_l1[current_mv.ref_idx[1]], + s->sh.luma_offset_l1[current_mv.ref_idx[1]]); ++ } + + if (s->ps.sps->chroma_format_idc) { ++#if RPI_INTER ++ if (s->enable_rpi) { ++ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); ++ return; // chroma handled by rpi_pred_c; skip the chroma_mc_uni calls below ++ } ++#endif + chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], + 1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, + s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); +@@ -1804,11 +2698,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, 
int y0, + int nPbW_c = nPbW >> s->ps.sps->hshift[1]; + int nPbH_c = nPbH >> s->ps.sps->vshift[1]; + +- luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, ++#if RPI_INTER ++ if (s->enable_rpi) { ++ rpi_pred_y_b(s, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame); ++ } else ++#endif ++ { ++ luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, + &current_mv.mv[0], x0, y0, nPbW, nPbH, + ref1->frame, &current_mv.mv[1], &current_mv); ++ } + + if (s->ps.sps->chroma_format_idc) { ++#if RPI_INTER ++ if (s->enable_rpi) { ++ rpi_pred_c_b(s, x0_c, y0_c, nPbW_c, nPbH_c, ++ &current_mv, ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], ++ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref0->frame, ++ ref1->frame); ++ return; // chroma handled by rpi_pred_c_b; skip the chroma_mc_bi calls below ++ } ++#endif + chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, + x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0); + +@@ -2083,7 +2997,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) + intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); + ret = hls_pcm_sample(s, x0, y0, log2_cb_size); + if (s->ps.sps->pcm.loop_filter_disable_flag) ++ { + set_deblocking_bypass(s, x0, y0, log2_cb_size); ++ } + + if (ret < 0) + return ret; +@@ -2306,6 +3222,741 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, + lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]])); + } + ++#ifdef RPI ++static void rpi_execute_dblk_cmds(HEVCContext *s) ++{ ++ int n; ++ int job = s->pass1_job; ++ int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ int (*p)[2] = s->dblk_cmds[job]; ++ for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) { ++ ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size); ++ } ++ s->num_dblk_cmds[job] = 0; ++} ++ ++#if 0 ++static void 
rpi_execute_transform(HEVCContext *s) ++{ ++ int i=2; ++ int job = s->pass1_job; ++ /*int j; ++ int16_t *coeffs = s->coeffs_buf_arm[job][i]; ++ for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) { ++ s->hevcdsp.idct[4-2](coeffs, 16); ++ } ++ i=3; ++ coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i]; ++ for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) { ++ s->hevcdsp.idct[5-2](coeffs, 32); ++ }*/ ++ ++ rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++ s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], ++ s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], ++ s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]); ++ //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0); ++ //gpu_cache_flush(&s->coeffs_buf_accelerated); ++ //vpu_wait(s->vpu_id); ++ ++ for(i=0;i<4;i++) ++ s->num_coeffs[job][i] = 0; ++} ++#endif ++ ++// I-pred, transform_and_add for all blocks types done here ++// All ARM ++static void rpi_execute_pred_cmds(HEVCContext * const s) ++{ ++ int i; ++ int job = s->pass1_job; ++ const HEVCPredCmd *cmd = s->univ_pred_cmds[job]; ++#ifdef RPI_WORKER ++ HEVCLocalContextIntra *lc = &s->HEVClcIntra; ++#else ++ HEVCLocalContext *lc = s->HEVClc; ++#endif ++ ++ for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) { ++ //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); ++ ++ switch (cmd->type) ++ { ++ case RPI_PRED_INTRA: ++ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode; ++ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; ++ lc->na.cand_left = (cmd->na >> 3) & 1; ++ lc->na.cand_up_left = (cmd->na >> 2) & 1; ++ lc->na.cand_up = (cmd->na >> 1) & 1; ++ lc->na.cand_up_right = (cmd->na >> 0) & 1; ++ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, 
cmd->c_idx); ++ break; ++ ++ case RPI_PRED_TRANSFORM_ADD: ++ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++#ifdef RPI_PRECLEAR ++ memset(cmd->buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache ++#endif ++ break; ++ ++ case RPI_PRED_I_PCM: ++ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); ++ break; ++ ++ default: ++ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); ++ abort(); ++ } ++ } ++ s->num_pred_cmds[job] = 0; ++} ++ ++// Do any inter-pred that we want to do in software ++// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here ++// All ARM ++static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only) ++{ ++ unsigned int cidx; ++ AVFrame myref; ++ AVFrame myref1; ++ struct MvField mymv; ++ ++ for(; n>0 ; n--, cmd++) { ++ switch(cmd->cmd) { ++ case RPI_CMD_LUMA_UNI: ++ if (b_only) ++ break; ++ myref.data[0] = cmd->src; ++ myref.linesize[0] = cmd->srcstride; ++ luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset); ++ break; ++ case RPI_CMD_LUMA_BI: ++ myref.data[0] = cmd->src; ++ myref.linesize[0] = cmd->srcstride; ++ myref1.data[0] = cmd->src1; ++ myref1.linesize[0] = cmd->srcstride1; ++ mymv.ref_idx[0] = cmd->ref_idx[0]; ++ mymv.ref_idx[1] = cmd->ref_idx[1]; ++ luma_mc_bi(s, cmd->dst, cmd->dststride, ++ &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, ++ &myref1, &cmd->mv1, &mymv); ++ break; ++ case RPI_CMD_CHROMA_UNI: ++ if (b_only) ++ break; ++ mymv.mv[0] = cmd->mv; ++ chroma_mc_uni(s, cmd->dst, ++ cmd->dststride, cmd->src, cmd->srcstride, 0, ++ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset); ++ break; ++ case RPI_CMD_CHROMA_BI: ++ case RPI_CMD_CHROMA_BI+1: ++ cidx = cmd->cmd - 
RPI_CMD_CHROMA_BI; ++ myref.data[cidx+1] = cmd->src; ++ myref.linesize[cidx+1] = cmd->srcstride; ++ myref1.data[cidx+1] = cmd->src1; ++ myref1.linesize[cidx+1] = cmd->srcstride1; ++ mymv.ref_idx[0] = cmd->ref_idx[0]; ++ mymv.ref_idx[1] = cmd->ref_idx[1]; ++ mymv.mv[0] = cmd->mv; ++ mymv.mv[1] = cmd->mv1; ++ chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1, ++ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx); ++ break; ++ } ++ } ++} ++ ++static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only) ++{ ++ const int job = s->pass1_job; ++ ++ if (!qpu_luma || luma_b_only) ++ do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma); ++ s->num_mv_cmds_y[job] = 0; ++ if (!qpu_chroma || chroma_b_only) ++ do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma); ++ s->num_mv_cmds_c[job] = 0; ++} ++ ++#endif ++ ++#ifdef RPI ++// Set initial uniform job values & zero ctu_count ++static void rpi_begin(HEVCContext *s) ++{ ++#if RPI_INTER ++ int job = s->pass0_job; ++ int i; ++ ++ int pic_width = s->ps.sps->width >> s->ps.sps->hshift[1]; ++ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[1]; ++ ++ for(i=0; i < QPU_N_UV;i++) { ++ qpu_mc_pred_c_t * const u = (qpu_mc_pred_c_t *)s->mvs_base[job][i]; ++ ++ u->next_fn = 0; ++ u->next_src_x = 0; ++ u->next_src_y = 0; ++ u->next_src_base_u = 0; ++ u->next_src_base_v = 0; ++ u->s.pic_w = pic_width; ++ u->s.pic_h = pic_height; ++ u->s.src_stride = s->frame->linesize[1]; ++ u->s.dst_stride = s->frame->linesize[1]; ++ u->s.wdenom = s->sh.chroma_log2_weight_denom + 6; ++ u->s.dummy0 = 0; ++ u->s.dummy1 = 0; ++ ++ s->u_mvs[job][i] = (uint32_t *)(u + 1); ++ } ++ s->curr_u_mvs = s->u_mvs[job][0]; ++ ++ for(i=0;i < QPU_N_Y;i++) { ++ // This needs to have a generally similar structure to the ++ // actual filter code as various pipelined bits need to land correctly ++ // when inserted by the 
filter requests ++ s->y_mvs[job][i] = s->y_mvs_base[job][i]; ++ *s->y_mvs[job][i]++ = 0; // y_x ++ *s->y_mvs[job][i]++ = 0; // ref_y_base ++ *s->y_mvs[job][i]++ = 0; // y2_x2 ++ *s->y_mvs[job][i]++ = 0; // ref_y2_base ++ *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height; ++ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch ++ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch ++ *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6; // weight demon + 6 ++ *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block ++ *s->y_mvs[job][i]++ = 0; // Next kernel ++ } ++ s->curr_y_mvs = s->y_mvs[job][0]; ++#endif ++ s->ctu_count = 0; ++} ++#endif ++ ++#ifdef RPI_SIMULATE_QPUS ++#error Rotted ++ ++static int32_t clipx(int x,int FRAME_WIDTH) ++{ ++ if (x<=0) return 0; ++ if (x>=FRAME_WIDTH) return FRAME_WIDTH-1; ++ return x; ++} ++ ++static int32_t clipy(int y,int FRAME_HEIGHT) ++{ ++ if (y<=0) return 0; ++ if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1; ++ return y; ++} ++ ++/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset) ++{ ++ int32_t vsum = 0; ++ int x, y; ++ ++ for (y = 0; y < 8; y++) { ++ int32_t hsum = 0; ++ ++ for (x = 0; x < 8; x++) ++ hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch]; ++ ++ vsum += lumaFilter[my][y]*hsum; ++ } ++ vsum >>= 6; ++ vsum = (((vsum*weight)+round)>>denom)+offset; ++ ++ return av_clip_uint8( vsum ); ++}*/ ++ ++static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) ++{ ++ int32_t vsum = 0; ++ int x, y; ++ int chromaFilterH[4]; ++ int chromaFilterV[4]; ++ int i; ++ int offset_after = offset_weight>>16; ++ int weight = (offset_weight<<16)>>16; ++ for(i=0;i<4;i++) { ++ chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24; ++ chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24; ++ } ++ ++ for (y = 0; y < 4; y++) { ++ 
int32_t hsum = 0; ++ ++ for (x = 0; x < 4; x++) ++ hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; ++ ++ vsum += chromaFilterV[y]*hsum; ++ } ++ vsum >>= 6; ++ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; ++ ++ return vsum; ++} ++ ++int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} }; ++ ++static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) ++{ ++ int32_t vsum = 0; ++ int x, y; ++ int i; ++ int offset_after = offset_weight>>16; ++ int weight = (offset_weight<<16)>>16; ++ ++ for (y = 0; y < 8; y++) { ++ int32_t hsum = 0; ++ ++ for (x = 0; x < 8; x++) ++ hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; ++ ++ vsum += lumaFilter[(my_mx>>8)&3][y]*hsum; ++ } ++ vsum >>= 6; ++ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; ++ ++ return vsum; ++} ++ ++static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx) ++{ ++ //int pic_width = s->ps.sps->width >> s->ps.sps->hshift[cIdx]; ++ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[cIdx]; ++ int pitch = frame->linesize[cIdx]; ++ uint32_t base = cIdx == 0 ? get_vc_address_y(frame) : ++ cIdx == 1 ? 
get_vc_address_u(frame) : get_vc_address_v(frame); ++ if (p>=base && pdata[cIdx] + (p-base); ++ } ++ return NULL; ++} ++ ++static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx) ++{ ++ SliceHeader *sh = &s->sh; ++ uint8_t *arm = test_frame(s,p,s->frame,cIdx); ++ int i; ++ if (arm) return arm; ++ if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE) ++ { ++ for(i=0;inb_refs[L0];i++) { ++ arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx); ++ if (arm) return arm; ++ } ++ } ++ if (sh->slice_type == B_SLICE) { ++ for(i=0;inb_refs[L1];i++) { ++ arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx); ++ if (arm) return arm; ++ } ++ } ++ printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT)); ++ exit(-1); ++ return NULL; ++} ++ ++static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p) ++{ ++ uint32_t next_kernel; ++ uint32_t x0; ++ uint32_t y0; ++ uint8_t *ref_u_base; ++ uint8_t *ref_v_base; ++ uint32_t frame_width = p[5]; ++ uint32_t frame_height = p[6]; ++ uint32_t pitch = p[7]; ++ uint32_t dst_pitch = p[8]; ++ int32_t offset_before = p[9]; ++ int32_t denom = p[10]; ++ uint32_t vpm_id = p[11]; ++ uint32_t tmp_u_dst[256]; ++ uint32_t tmp_v_dst[256]; ++ while(1) { ++ p += 12; ++ next_kernel = p[0-12]; ++ x0 = p[1-12]; ++ y0 = p[2-12]; ++ if (next_kernel==s->qpu_filter_uv || next_kernel==s->qpu_filter_uv_b0 || next_kernel==s->qpu_filter_uv_b) { ++ int x,y; ++ uint32_t width_height = p[5]; ++ uint32_t hcoeffs = p[6]; ++ uint32_t vcoeffs = p[7]; ++ uint32_t offset_weight_u = p[8]; ++ uint32_t offset_weight_v = p[9]; ++ uint8_t *this_u_dst; ++ uint8_t *this_v_dst; ++ uint32_t width = width_height >> 16; ++ uint32_t height = (width_height << 16) >> 16; ++ ref_u_base = compute_arm_addr(s,p[3-12],1); ++ ref_v_base = compute_arm_addr(s,p[4-12],2); ++ if (next_kernel!=s->qpu_filter_uv_b0) ++ { ++ this_u_dst = compute_arm_addr(s,p[10],1); ++ this_v_dst = compute_arm_addr(s,p[11],2); ++ } ++ for (y=0; yqpu_filter_uv) { 
++ int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height); ++ int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height); ++ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa); ++ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb); ++ } else if (next_kernel==s->qpu_filter_uv_b0) { ++ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height); ++ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height); ++ tmp_u_dst[x+y*16] = refa; ++ tmp_v_dst[x+y*16] = refb; ++ } else { ++ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height); ++ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height); ++ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa); ++ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb); ++ } ++ } ++ } ++ } else { ++ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) ); ++ break; ++ } ++ } ++} ++ ++// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel) ++static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan) ++{ ++ uint32_t next_kernel; ++ int y_x,y2_x2; ++ int x0; ++ int y0; ++ int x2; ++ int y2; ++ uint32_t *p0 = p; ++ uint8_t *ref_y_base; ++ uint8_t *ref_y2_base; ++ uint32_t frame_width_height = p[4]; ++ uint32_t frame_width = frame_width_height>>16; ++ uint32_t frame_height = (frame_width_height<<16)>>16; ++ uint32_t pitch = p[5]; ++ uint32_t dst_pitch = p[6]; ++ int offset_shift = p[7]; ++ int32_t offset_before = offset_shift>>16; ++ int32_t denom = (offset_shift<<16)>>16; ++ while(1) { ++ p += 9; ++ next_kernel = 
p[8-9]; ++ y_x = p[0-9]; ++ x0 = (y_x<<16)>>16; ++ y0 = y_x>>16; ++ y2_x2 = p[2-9]; ++ x2 = (y2_x2<<16)>>16; ++ y2 = y2_x2>>16; ++ ++ if (next_kernel==s->qpu_filter || next_kernel==s->qpu_filter_b) { ++ // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) ++ int x,y; ++ uint32_t width_height = p[4]; ++ uint32_t my2_mx2_my_mx = p[5]; ++ uint32_t offset_weight = p[6]; ++ uint8_t *this_dst = compute_arm_addr(s,p[7],0); ++ uint32_t width = width_height >> 16; ++ uint32_t height = (width_height << 16) >> 16; ++ uint8_t *dst_base = s->frame->data[0]; ++ ref_y_base = compute_arm_addr(s,p[1-9],0); ++ ref_y2_base = compute_arm_addr(s,p[3-9],0); ++ for (y=0; yqpu_filter) { ++ int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height); ++ refa = av_clip_uint8(refa); ++ this_dst[x+y*dst_pitch] = refa; ++ } ++ else { ++ int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height); ++ int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height); ++ this_dst[x+y*dst_pitch] = av_clip_uint8(refb); ++ } ++ } ++ } ++ } else { ++ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) ); ++ break; ++ } ++ } ++} ++ ++static void rpi_simulate_inter_qpu(HEVCContext *s) ++{ ++ // First run the transform as normal ++ int i; ++ rpi_execute_transform(s); ++ for(i=0;i<8;i++) ++ { ++ rpi_simulate_inter_chroma(s,s->mvs_base[i]); ++ } ++ for(i=0;i<12;i++) ++ { ++ rpi_simulate_inter_luma(s,s->y_mvs_base[i],i); ++ } ++} ++ ++#endif ++ ++ ++#if RPI_INTER ++static unsigned int mc_terminate_y(HEVCContext * const s, const int job) ++{ ++ unsigned int i; ++ const uint32_t exit_fn = qpu_fn(mc_exit); ++ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12); ++ const uint32_t dummy_texture = qpu_fn(mc_setup_uv); ++ unsigned int 
tc = 0; ++ ++ // Add final commands to Q ++ for(i = 0; i != QPU_N_Y; ++i) { ++ uint32_t * const pu = s->y_mvs[job][i] - RPI_LUMA_COMMAND_WORDS; ++ const int cmd_count = pu - s->y_mvs_base[job][i]; ++ tc += cmd_count; ++ ++ av_assert0(cmd_count < Y_COMMANDS_PER_QPU - 1); ++ ++ // We use this code as a dummy texture - safe? ++ pu[0] = 0; // x,y ++ pu[1] = dummy_texture; ++ pu[2] = 0; ++ pu[3] = dummy_texture; ++ pu[RPI_LUMA_COMMAND_WORDS - 1] = (i != QPU_N_Y - 1) ? exit_fn : exit_fn2; // Actual fn ptr ++ } ++ ++ return tc; ++} ++ ++static unsigned int mc_terminate_uv(HEVCContext * const s, const int job) ++{ ++ unsigned int i; ++ const uint32_t exit_fn = qpu_fn(mc_exit_c); ++#if QPU_N_UV == 8 ++ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit8c); ++#elif QPU_N_UV == 12 ++ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12c); ++#else ++#error Need appropriate exit code ++#endif ++ const uint32_t dummy_texture = qpu_fn(mc_setup_uv); ++ unsigned int tc = 0; ++ ++ // Add final commands to Q ++ for(i = 0; i != QPU_N_UV; ++i) { ++ qpu_mc_pred_c_t * const pu = (qpu_mc_pred_c_t *)s->u_mvs[job][i] - 1; ++ const int cmd_count = (uint32_t *)pu - s->mvs_base[job][i]; ++ tc += cmd_count; ++ ++ pu->next_fn = (i != QPU_N_UV - 1) ? exit_fn : exit_fn2; // Actual fn ptr ++ // Need to set the src to something that can be (pointlessly) prefetched ++ pu->next_src_x = 0; ++ pu->next_src_y = 0; ++ // We use this code as a dummy texture - safe? 
++ pu->next_src_base_u = dummy_texture; ++ pu->next_src_base_v = dummy_texture; ++ } ++ ++ return tc; ++} ++#endif ++ ++#ifdef RPI ++ ++ ++static void flush_frame(HEVCContext *s,AVFrame *frame) ++{ ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); ++ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++ rpi_cache_flush_finish(rfe); ++} ++ ++ ++// Core execution tasks ++static void worker_core(HEVCContext * const s) ++{ ++ worker_global_env_t * const wg = &worker_global_env; ++ int arm_cost = 0; ++// vpu_qpu_wait_h sync_c; ++ vpu_qpu_wait_h sync_y; ++ int qpu_luma = 0; ++ int qpu_chroma = 0; ++ int gpu_load; ++ int arm_load; ++ static const int arm_const_cost = 2; ++ ++// static int z = 0; ++ ++ const int job = s->pass1_job; ++ unsigned int flush_start = 0; ++ unsigned int flush_count = 0; ++ ++ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); ++ ++ if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) { ++ vpu_qpu_job_add_vpu(vqj, ++ vpu_get_fn(), ++ vpu_get_constants(), ++ s->coeffs_buf_vc[job][2], ++ s->num_coeffs[job][2] >> 8, ++ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], ++ s->num_coeffs[job][3] >> 10, ++ 0); ++ ++ rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++ } ++ ++ ++#if RPI_INTER ++ pthread_mutex_lock(&wg->lock); ++ ++// ++z; ++ gpu_load = vpu_qpu_current_load(); ++ arm_load = avpriv_atomic_int_get(&wg->arm_load); ++#if !Y_B_ONLY ++ qpu_luma = gpu_load + 2 < arm_load; ++ qpu_chroma = gpu_load < arm_load + 8; ++#elif 1 ++ qpu_luma = gpu_load < arm_load + 2; ++ qpu_chroma = gpu_load < arm_load + 8; ++#else ++ qpu_chroma = 1; ++ qpu_luma = 1; ++#endif ++ ++ arm_cost = !qpu_chroma * 2 + !qpu_luma * 3; ++ avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost); ++ ++ wg->gpu_c += qpu_chroma; ++ wg->gpu_y += qpu_luma; ++ wg->arm_c += !qpu_chroma; ++ wg->arm_y += !qpu_luma; 
++ ++ ++// if ((z & 511) == 0) { ++// printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y); ++// } ++ ++ ++ { ++ int (*d)[2] = s->dblk_cmds[job]; ++ unsigned int high=(*d)[1]; ++ int n; ++ ++ flush_start = high; ++ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) { ++ unsigned int y = (*d)[1]; ++ flush_start = FFMIN(flush_start, y); ++ high=FFMAX(high,y); ++ } ++ // Avoid flushing past end of frame ++ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start; ++ } ++ ++ if (qpu_chroma && mc_terminate_uv(s, job) != 0) ++ { ++ uint32_t * const unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc; ++ const uint32_t code = qpu_fn(mc_setup_uv); ++ uint32_t * p; ++ unsigned int i; ++ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS]; ++ ++ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) { ++ *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm)); ++ *p++ = code; ++ } ++ ++ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv); ++ ++#if RPI_CACHE_UNIF_MVS ++ rpi_cache_flush_add_gm_ptr(rfe, s->unif_mvs_ptr + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++#endif ++ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ flush_start, flush_count, s->ps.sps->vshift[1], 0, 1); ++ } ++ ++// We can take a sync here and try to locally overlap QPU processing with ARM ++// but testing showed a slightly negative benefit with noticable extra complexity ++// vpu_qpu_job_add_sync_this(vqj, &sync_c); ++ ++ if (qpu_luma && mc_terminate_y(s, job) != 0) ++ { ++ uint32_t * const y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc; ++ const uint32_t code = qpu_fn(mc_setup); ++ uint32_t * p; ++ unsigned int i; ++ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS]; ++ ++ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) { ++ *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)); ++ *p++ = code; ++ } ++ ++ vpu_qpu_job_add_qpu(vqj, 
QPU_N_Y, 4, mail_y); ++ ++#if RPI_CACHE_UNIF_MVS ++ rpi_cache_flush_add_gm_ptr(rfe, s->y_unif_mvs_ptr + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++#endif ++ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ flush_start, flush_count, s->ps.sps->vshift[1], 1, 0); ++ } ++ ++ pthread_mutex_unlock(&wg->lock); ++ ++#endif ++ ++ vpu_qpu_job_add_sync_this(vqj, &sync_y); ++ ++ // Having accumulated some commands - do them ++ rpi_cache_flush_finish(rfe); ++ vpu_qpu_job_finish(vqj); ++ ++ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); //???? Surely we haven't done the smaller ++ ++#if Y_B_ONLY ++ if (qpu_luma) ++ vpu_qpu_wait(&sync_y); ++#endif ++ // Perform inter prediction ++ rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0); ++ ++ // Wait for transform completion ++ ++ // Perform intra prediction and residual reconstruction ++ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost); ++#if Y_B_ONLY ++ if (!qpu_luma) ++ vpu_qpu_wait(&sync_y); ++#else ++ vpu_qpu_wait(&sync_y); ++#endif ++ rpi_execute_pred_cmds(s); ++ ++ // Perform deblocking for CTBs in this row ++ rpi_execute_dblk_cmds(s); ++ ++ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost); ++} ++ ++static void rpi_do_all_passes(HEVCContext *s) ++{ ++ // Do the various passes - common with the worker code ++ worker_core(s); ++ // Prepare next batch ++ rpi_begin(s); ++} ++ ++ ++ ++#endif ++ + static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + { + HEVCContext *s = avctxt->priv_data; +@@ -2315,6 +3966,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + int y_ctb = 0; + int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; + ++#ifdef RPI ++ s->enable_rpi = s->ps.sps->bit_depth == 8 ++ && !s->ps.pps->cross_component_prediction_enabled_flag; ++ ++ if (!s->enable_rpi) { ++ if (s->ps.pps->cross_component_prediction_enabled_flag) ++ printf("Cross component\n"); ++ } ++#endif ++ 
//printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]); ++ + if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { + av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); + return AVERROR_INVALIDDATA; +@@ -2328,6 +3990,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + } + } + ++#ifdef RPI_WORKER ++ s->pass0_job = 0; ++ s->pass1_job = 0; ++#endif ++#ifdef RPI ++ rpi_begin(s); ++#endif ++ + while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { + int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; + +@@ -2335,6 +4005,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; + hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); + ++ + ff_hevc_cabac_init(s, ctb_addr_ts); + + hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); +@@ -2343,7 +4014,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; + s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; + ++#if RPI_INTER ++ s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % QPU_N_UV]; ++ s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % QPU_N_Y]; ++#endif ++ + more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); ++ ++#ifdef RPI ++#if RPI_INTER ++ s->u_mvs[s->pass0_job][s->ctu_count % QPU_N_UV]= s->curr_u_mvs; ++ s->y_mvs[s->pass0_job][s->ctu_count % QPU_N_Y] = s->curr_y_mvs; ++#endif ++ ++ if (s->enable_rpi) { ++ //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0); ++ //av_assert0(s->num_dblk_cmds[s->pass0_job]pass0_jobpass0_job>=0); ++ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; ++ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; ++ s->ctu_count++; ++ ++ if ( s->ctu_count >= s->max_ctu_count ) 
{ ++#ifdef RPI_WORKER ++ if (s->used_for_ref) ++ { ++// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); ++ ++// worker_wait(s); ++ // Split work load onto separate threads so we make as rapid progress as possible with this frame ++ // Pass on this job to worker thread ++ worker_submit_job(s); ++ ++ // Make sure we have space to prepare the next job ++ worker_pass0_ready(s); ++ ++ // Prepare the next batch of commands ++ rpi_begin(s); ++ } else { ++ // Non-ref frame so do it all on this thread ++ rpi_do_all_passes(s); ++ } ++#else ++ rpi_do_all_passes(s); ++#endif ++ } ++ ++ } ++#endif ++ ++ + if (more_data < 0) { + s->tab_slice_address[ctb_addr_rs] = -1; + return more_data; +@@ -2352,9 +4073,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + + ctb_addr_ts++; + ff_hevc_save_states(s, ctb_addr_ts); ++#ifdef RPI ++ if (s->enable_rpi) ++ continue; ++#endif + ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size); + } + ++#ifdef RPI ++ ++#ifdef RPI_WORKER ++ // Wait for the worker to finish all its jobs ++ if (s->enable_rpi) { ++ worker_wait(s); ++ } ++#endif ++ ++ // Finish off any half-completed rows ++ if (s->enable_rpi && s->ctu_count) { ++ rpi_do_all_passes(s); ++ } ++ ++#endif ++ + if (x_ctb + ctb_size >= s->ps.sps->width && + y_ctb + ctb_size >= s->ps.sps->height) + ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); +@@ -2389,6 +4130,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int + s = s1->sList[self_id]; + lc = s->HEVClc; + ++#ifdef RPI ++ s->enable_rpi = 0; ++ //printf("Wavefront\n"); ++#endif ++ + if(ctb_row) { + ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); + +@@ -2771,6 +4517,20 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) + if (ret < 0) + return ret; + ++ // The definition of _N unit types is "non-reference for other frames ++ // with the same 
temporal_id" so they may/will be ref frames for pics ++ // with a higher temporal_id. ++ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || ++ !(s->nal_unit_type == HEVC_NAL_TRAIL_N || ++ s->nal_unit_type == HEVC_NAL_TSA_N || ++ s->nal_unit_type == HEVC_NAL_STSA_N || ++ s->nal_unit_type == HEVC_NAL_RADL_N || ++ s->nal_unit_type == HEVC_NAL_RASL_N); ++ ++ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { ++ s->is_decoded = 0; ++ break; ++ } + if (s->max_ra == INT_MAX) { + if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { + s->max_ra = s->poc; +@@ -2894,10 +4654,18 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) + } + } + +-fail: +- if (s->ref && s->threads_type == FF_THREAD_FRAME) ++fail: // Also success path ++ if (s->ref && s->threads_type == FF_THREAD_FRAME) { ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); ++#endif + ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); +- ++ } else if (s->ref) { ++#if RPI_INTER ++ // When running single threaded we need to flush the whole frame ++ flush_frame(s,s->frame); ++#endif ++ } + return ret; + } + +@@ -3150,6 +4918,41 @@ fail: + return AVERROR(ENOMEM); + } + ++#ifdef RPI_WORKER ++static av_cold void hevc_init_worker(HEVCContext *s) ++{ ++ int err; ++ pthread_cond_init(&s->worker_cond_head, NULL); ++ pthread_cond_init(&s->worker_cond_tail, NULL); ++ pthread_mutex_init(&s->worker_mutex, NULL); ++ ++ s->worker_tail=0; ++ s->worker_head=0; ++ s->kill_worker=0; ++ err = pthread_create(&s->worker_thread, NULL, worker_start, s); ++ if (err) { ++ printf("Failed to create worker thread\n"); ++ exit(-1); ++ } ++} ++ ++static av_cold void hevc_exit_worker(HEVCContext *s) ++{ ++ void *res; ++ s->kill_worker=1; ++ pthread_cond_broadcast(&s->worker_cond_tail); ++ pthread_join(s->worker_thread, &res); ++ ++ pthread_cond_destroy(&s->worker_cond_head); ++ pthread_cond_destroy(&s->worker_cond_tail); ++ 
pthread_mutex_destroy(&s->worker_mutex); ++ ++ s->worker_tail=0; ++ s->worker_head=0; ++ s->kill_worker=0; ++} ++#endif ++ + static av_cold int hevc_decode_free(AVCodecContext *avctx) + { + HEVCContext *s = avctx->priv_data; +@@ -3161,6 +4964,33 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) + + av_freep(&s->cabac_state); + ++#ifdef RPI ++ ++#ifdef RPI_WORKER ++ hevc_exit_worker(s); ++#endif ++ ++ for(i=0;iunif_mv_cmds_y[i]); ++ av_freep(&s->unif_mv_cmds_c[i]); ++ av_freep(&s->univ_pred_cmds[i]); ++ ++#if RPI_INTER ++ if (s->unif_mvs[i]) { ++ gpu_free( &s->unif_mvs_ptr[i] ); ++ s->unif_mvs[i] = 0; ++ } ++ if (s->y_unif_mvs[i]) { ++ gpu_free( &s->y_unif_mvs_ptr[i] ); ++ s->y_unif_mvs[i] = 0; ++ } ++#endif ++ } ++ ++ vpu_qpu_term(); ++ ++#endif ++ + for (i = 0; i < 3; i++) { + av_freep(&s->sao_pixel_buffer_h[i]); + av_freep(&s->sao_pixel_buffer_v[i]); +@@ -3202,10 +5032,25 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) + return 0; + } + ++#ifdef RPI ++#ifdef RPI_PRECLEAR ++static av_cold void memclear16(int16_t *p, int n) ++{ ++ vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1); ++ //int i; ++ //for(i=0;ipriv_data; + int i; ++#ifdef RPI ++ unsigned int job; ++#endif + + s->avctx = avctx; + +@@ -3215,6 +5060,82 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) + s->HEVClcList[0] = s->HEVClc; + s->sList[0] = s; + ++#ifdef RPI ++ // Whilst FFmpegs init fn is only called once the close fn is called as ++ // many times as we have threads (init_thread_copy is called for the ++ // threads). 
So to match init & term put the init here where it will be ++ // called by both init & copy ++ if (vpu_qpu_init() != 0) ++ goto fail; ++ ++ for(job = 0; job < RPI_MAX_JOBS; job++) { ++ s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y); ++ if (!s->unif_mv_cmds_y[job]) ++ goto fail; ++ s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C); ++ if (!s->unif_mv_cmds_c[job]) ++ goto fail; ++ s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS); ++ if (!s->univ_pred_cmds[job]) ++ goto fail; ++ } ++ ++#if RPI_INTER ++ // We divide the image into blocks 256 wide and 64 high ++ // We support up to 2048 widths ++ // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted ++ // Also add space for the startup command for each stream. ++ ++ for (job = 0; job < RPI_MAX_JOBS; job++) { ++ uint32_t *p; ++#if RPI_CACHE_UNIF_MVS ++ gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(uint32_t), &s->unif_mvs_ptr[job] ); ++#else ++ gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(uint32_t), &s->unif_mvs_ptr[job] ); ++#endif ++ s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm; ++ ++ // Set up initial locations for uniform streams ++ p = s->unif_mvs[job]; ++ for(i = 0; i < QPU_N_UV; i++) { ++ s->mvs_base[job][i] = p; ++ p += UV_COMMANDS_PER_QPU; ++ } ++ } ++ s->qpu_filter_uv = qpu_fn(mc_filter_uv); ++ s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0); ++ s->qpu_filter_uv_b = qpu_fn(mc_filter_uv_b); ++ ++ for (job=0; job < RPI_MAX_JOBS; job++) ++ { ++ uint32_t *p; ++#if RPI_CACHE_UNIF_MVS ++ gpu_malloc_cached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] ); ++#else ++ gpu_malloc_uncached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] ); ++#endif ++ s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm; ++ ++ // Set up initial locations for uniform streams ++ p = 
s->y_unif_mvs[job]; ++ for(i = 0; i < QPU_N_Y; i++) { ++ s->y_mvs_base[job][i] = p; ++ p += Y_COMMANDS_PER_QPU; ++ } ++ } ++ s->qpu_filter = qpu_fn(mc_filter); ++ s->qpu_filter_b = qpu_fn(mc_filter_b); ++#endif ++ //gpu_malloc_uncached(2048*64,&s->dummy); ++ ++ s->enable_rpi = 0; ++ ++#ifdef RPI_WORKER ++ hevc_init_worker(s); ++#endif ++ ++#endif ++ + s->cabac_state = av_malloc(HEVC_CONTEXTS); + if (!s->cabac_state) + goto fail; +@@ -3357,9 +5278,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) + } + + if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) +- s->threads_type = FF_THREAD_FRAME; +- else +- s->threads_type = FF_THREAD_SLICE; ++ s->threads_type = FF_THREAD_FRAME; ++ else ++ s->threads_type = FF_THREAD_SLICE; + + return 0; + } +@@ -3418,6 +5339,8 @@ AVCodec ff_hevc_decoder = { + .update_thread_context = hevc_update_thread_context, + .init_thread_copy = hevc_init_thread_copy, + .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | ++// 0, ++// AV_CODEC_CAP_FRAME_THREADS, + AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS, + .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE, + .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), +diff --git b/libavcodec/hevcdec.h a/libavcodec/hevcdec.h +index 0c78812..e068936 100644 +--- b/libavcodec/hevcdec.h ++++ a/libavcodec/hevcdec.h +@@ -334,17 +334,6 @@ typedef struct CodingUnit { + uint8_t cu_transquant_bypass_flag; + } CodingUnit; + +-typedef struct Mv { +- int16_t x; ///< horizontal component of motion vector +- int16_t y; ///< vertical component of motion vector +-} Mv; +- +-typedef struct MvField { +- DECLARE_ALIGNED(4, Mv, mv)[2]; +- int8_t ref_idx[2]; +- int8_t pred_flag; +-} MvField; +- + typedef struct NeighbourAvailable { + int cand_bottom_left; + int cand_left; +@@ -421,7 +410,17 @@ typedef struct HEVCFrame { + uint8_t flags; + } HEVCFrame; + ++#ifdef RPI_WORKER ++typedef struct HEVCLocalContextIntra { ++ TransformUnit tu; ++ NeighbourAvailable na; ++} 
HEVCLocalContextIntra; ++#endif ++ + typedef struct HEVCLocalContext { ++ TransformUnit tu; // Moved to start to match HEVCLocalContextIntra (yuk!) ++ NeighbourAvailable na; ++ + uint8_t cabac_state[HEVC_CONTEXTS]; + + uint8_t stat_coeff[4]; +@@ -436,8 +435,6 @@ typedef struct HEVCLocalContext { + + int qPy_pred; + +- TransformUnit tu; +- + uint8_t ctb_left_flag; + uint8_t ctb_up_flag; + uint8_t ctb_up_right_flag; +@@ -453,7 +450,6 @@ typedef struct HEVCLocalContext { + int ct_depth; + CodingUnit cu; + PredictionUnit pu; +- NeighbourAvailable na; + + #define BOUNDARY_LEFT_SLICE (1 << 0) + #define BOUNDARY_LEFT_TILE (1 << 1) +@@ -464,6 +460,89 @@ typedef struct HEVCLocalContext { + int boundary_flags; + } HEVCLocalContext; + ++#ifdef RPI ++ ++// The processing is done in chunks ++// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma) ++// This is a distance of 1536 pixels across the screen ++// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing, ++// but allocate more memory and increase the latency before data in the next frame can be processed ++#define RPI_NUM_CHUNKS 4 ++#define RPI_CHUNK_SIZE 12 ++ ++// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code ++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE) ++ ++// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi ++#define RPI_MAX_MV_CMDS_Y (2*16*1*(RPI_MAX_WIDTH/4)) ++#define RPI_MAX_MV_CMDS_C (2*16*2*(RPI_MAX_WIDTH/4)) ++// Each block can have an intra prediction and a transform_add command ++#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4)) ++// Worst case is 16x16 CTUs ++#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16) ++ ++#define RPI_CMD_LUMA_UNI 0 ++#define RPI_CMD_CHROMA_UNI 1 ++#define RPI_CMD_LUMA_BI 2 ++#define RPI_CMD_CHROMA_BI 3 ++#define RPI_CMD_V_BI 4 ++ ++// RPI_PRECLEAR is not working yet - 
perhaps clearing on VPUs is flawed? ++// #define RPI_PRECLEAR ++ ++// Command for inter prediction ++typedef struct HEVCMvCmd { ++ uint8_t cmd; ++ uint8_t block_w; ++ uint8_t block_h; ++ int8_t ref_idx[2]; ++ uint16_t dststride; ++ uint16_t srcstride; ++ uint16_t srcstride1; ++ int16_t weight; ++ int16_t offset; ++ int16_t x_off; ++ int16_t y_off; ++ uint8_t *src; ++ uint8_t *src1; ++ uint8_t *dst; ++ Mv mv; ++ Mv mv1; ++} HEVCMvCmd; ++ ++ ++// Command for intra prediction and transform_add of predictions to coefficients ++#define RPI_PRED_TRANSFORM_ADD 0 ++#define RPI_PRED_INTRA 1 ++#define RPI_PRED_I_PCM 2 ++ ++typedef struct HEVCPredCmd { ++ uint8_t type; ++ uint8_t size; // log2 "size" used by all variants ++ uint8_t na; // i_pred - but left here as they pack well ++ uint8_t c_idx; // i_pred ++ union { ++ struct { // TRANSFORM_ADD ++ uint8_t * dst; ++ const int16_t * buf; ++ uint32_t stride; ++ } ta; ++ struct { // INTRA ++ uint16_t x; ++ uint16_t y; ++ enum IntraPredMode mode; ++ } i_pred; ++ struct { // I_PCM ++ uint16_t x; ++ uint16_t y; ++ const void * src; ++ uint32_t src_len; ++ } i_pcm; ++ }; ++} HEVCPredCmd; ++ ++#endif ++ + typedef struct HEVCContext { + const AVClass *c; // needed by private avoptions + AVCodecContext *avctx; +@@ -472,6 +551,9 @@ typedef struct HEVCContext { + + HEVCLocalContext *HEVClcList[MAX_NB_THREADS]; + HEVCLocalContext *HEVClc; ++#ifdef RPI_WORKER ++ HEVCLocalContextIntra HEVClcIntra; ++#endif + + uint8_t threads_type; + uint8_t threads_number; +@@ -479,6 +561,98 @@ typedef struct HEVCContext { + int width; + int height; + ++ int used_for_ref; // rpi ++#ifdef RPI ++ int enable_rpi; ++ HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS]; ++ HEVCMvCmd *unif_mv_cmds_c[RPI_MAX_JOBS]; ++ HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS]; ++ int buf_width; ++ GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS]; ++ GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS]; ++ int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4]; ++ unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4]; 
++ int num_coeffs[RPI_MAX_JOBS][4]; ++ int num_xfm_cmds[RPI_MAX_JOBS]; ++ int num_mv_cmds_y[RPI_MAX_JOBS]; ++ int num_mv_cmds_c[RPI_MAX_JOBS]; ++ int num_pred_cmds[RPI_MAX_JOBS]; ++ int num_dblk_cmds[RPI_MAX_JOBS]; ++ int vpu_id; ++ int pass0_job; // Pass0 does coefficient decode ++ int pass1_job; // Pass1 does pixel processing ++ int ctu_count; // Number of CTUs done in pass0 so far ++ int max_ctu_count; // Number of CTUs when we trigger a round of processing ++ int ctu_per_y_chan; // Number of CTUs per luma QPU ++ int ctu_per_uv_chan; // Number of CTUs per chroma QPU ++ ++#if RPI_INTER ++ GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS]; ++ uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands ++ ++ // _base pointers are to the start of the row ++ uint32_t *mvs_base[RPI_MAX_JOBS][QPU_N_UV]; ++ // these pointers are to the next free space ++ uint32_t *u_mvs[RPI_MAX_JOBS][QPU_N_UV]; ++ uint32_t *curr_u_mvs; // Current uniform stream to use for chroma ++ // Function pointers ++ uint32_t qpu_filter_uv; ++ uint32_t qpu_filter_uv_b0; ++ uint32_t qpu_filter_uv_b; ++ ++ GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS]; ++ uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands ++ uint32_t *y_mvs_base[RPI_MAX_JOBS][QPU_N_Y]; ++ uint32_t *y_mvs[RPI_MAX_JOBS][QPU_N_Y]; ++ uint32_t *curr_y_mvs; // Current uniform stream for luma ++ // Function pointers ++ uint32_t qpu_filter; ++ uint32_t qpu_filter_b; ++#endif ++ ++#ifdef RPI_WORKER ++ pthread_t worker_thread; ++ pthread_cond_t worker_cond_head; ++ pthread_cond_t worker_cond_tail; ++ pthread_mutex_t worker_mutex; ++ ++ int worker_tail; // Contains the number of posted jobs ++ int worker_head; // Contains the number of completed jobs ++ int kill_worker; // set to 1 to terminate the worker ++#endif ++ ++#define RPI_DEBLOCK_VPU_Q_COUNT 2 ++ ++#ifdef RPI_DEBLOCK_VPU ++ int enable_rpi_deblock; ++ ++ int uv_setup_width; ++ int uv_setup_height; ++ int setup_width; // Number of 16x16 blocks across 
the image ++ int setup_height; // Number of 16x16 blocks down the image ++ ++ struct dblk_vpu_q_s ++ { ++ GPU_MEM_PTR_T deblock_vpu_gmem; ++ ++ uint8_t (*y_setup_arm)[2][2][2][4]; ++ uint8_t (*y_setup_vc)[2][2][2][4]; ++ ++ uint8_t (*uv_setup_arm)[2][2][2][4]; // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned ++ uint8_t (*uv_setup_vc)[2][2][2][4]; ++ ++ int (*vpu_cmds_arm)[6]; // r0-r5 for each command ++ int vpu_cmds_vc; ++ ++ vpu_qpu_wait_h cmd_id; ++ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT]; ++ ++ struct dblk_vpu_q_s * dvq; ++ unsigned int dvq_n; ++ ++#endif ++#endif ++ + uint8_t *cabac_state; + + /** 1 if the independent slice segment header was successfully parsed */ +@@ -596,6 +770,9 @@ typedef struct HEVCContext { + uint32_t max_mastering_luminance; + uint32_t min_mastering_luminance; + ++#ifdef RPI ++ int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2]; ++#endif + } HEVCContext; + + int ff_hevc_decode_nal_sei(HEVCContext *s); +@@ -703,6 +880,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, + + void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); + ++#if RPI_INTER ++extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n); ++#endif ++ ++ + /** + * Reset SEI values that are stored on the Context. + * e.g. 
Caption data that was extracted during NAL +@@ -716,4 +898,8 @@ extern const uint8_t ff_hevc_qpel_extra_before[4]; + extern const uint8_t ff_hevc_qpel_extra_after[4]; + extern const uint8_t ff_hevc_qpel_extra[4]; + ++#ifdef RPI ++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n); ++#endif ++ + #endif /* AVCODEC_HEVCDEC_H */ +diff --git b/libavcodec/hevcdsp.c a/libavcodec/hevcdsp.c +index 23e923f..a985f02 100644 +--- b/libavcodec/hevcdsp.c ++++ a/libavcodec/hevcdsp.c @@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { #include "hevcdsp_template.c" #undef BIT_DEPTH @@ -7068,10 +7466,10 @@ index 9d773d9..a6534a9 100644 if (ARCH_X86) ff_hevc_dsp_init_x86(hevcdsp, bit_depth); if (ARCH_ARM) -diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h -index 9f1f6dd..e221e54 100644 ---- a/libavcodec/hevcdsp.h -+++ b/libavcodec/hevcdsp.h +diff --git b/libavcodec/hevcdsp.h a/libavcodec/hevcdsp.h +index eefb3cd..a41aa09 100644 +--- b/libavcodec/hevcdsp.h ++++ a/libavcodec/hevcdsp.h @@ -42,6 +42,17 @@ typedef struct SAOParams { uint8_t type_idx[3]; ///< sao_type_idx } SAOParams; @@ -7100,10 +7498,10 @@ index 9f1f6dd..e221e54 100644 } HEVCDSPContext; void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth); -diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c +diff --git b/libavcodec/hevcpred_template.c a/libavcodec/hevcpred_template.c index 6ae87cc..28d2653 100644 ---- a/libavcodec/hevcpred_template.c -+++ b/libavcodec/hevcpred_template.c +--- b/libavcodec/hevcpred_template.c ++++ a/libavcodec/hevcpred_template.c @@ -20,6 +20,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -7137,10 +7535,43 @@ index 6ae87cc..28d2653 100644 if (s->ps.pps->constrained_intra_pred_flag == 1) { int size_in_luma_pu_v = PU(size_in_luma_v); int size_in_luma_pu_h = PU(size_in_luma_h); -diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c -index 099a8c5..bdff2d2 
100644 ---- a/libavcodec/mmaldec.c -+++ b/libavcodec/mmaldec.c +diff --git b/libavcodec/mjpegenc_common.c a/libavcodec/mjpegenc_common.c +index 6d9c982..83a9e95 100644 +--- b/libavcodec/mjpegenc_common.c ++++ a/libavcodec/mjpegenc_common.c +@@ -91,17 +91,13 @@ static void jpeg_table_header(AVCodecContext *avctx, PutBitContext *p, + { + int i, j, size; + uint8_t *ptr; +- MpegEncContext *s = NULL; +- +- /* Since avctx->priv_data will point to LJpegEncContext in this case */ +- if (avctx->codec_id != AV_CODEC_ID_LJPEG) +- s = avctx->priv_data; ++ MpegEncContext *s = avctx->priv_data; + + if (avctx->codec_id != AV_CODEC_ID_LJPEG) { + int matrix_count = 1 + !!memcmp(luma_intra_matrix, + chroma_intra_matrix, + sizeof(luma_intra_matrix[0]) * 64); +- if (s && s->force_duplicated_matrix) ++ if (s->force_duplicated_matrix) + matrix_count = 2; + /* quant matrixes */ + put_marker(p, DQT); +@@ -138,7 +134,7 @@ static void jpeg_table_header(AVCodecContext *avctx, PutBitContext *p, + + // Only MJPEG can have a variable Huffman variable. All other + // formats use the default Huffman table. 
+- if (s && s->huffman == HUFFMAN_TABLE_OPTIMAL) { ++ if (s->out_format == FMT_MJPEG && s->huffman == HUFFMAN_TABLE_OPTIMAL) { + size += put_huffman_table(p, 0, 0, s->mjpeg_ctx->bits_dc_luminance, + s->mjpeg_ctx->val_dc_luminance); + size += put_huffman_table(p, 0, 1, s->mjpeg_ctx->bits_dc_chrominance, +diff --git b/libavcodec/mmaldec.c a/libavcodec/mmaldec.c +index 81fcebc..7858478 100644 +--- b/libavcodec/mmaldec.c ++++ a/libavcodec/mmaldec.c @@ -24,6 +24,9 @@ * MMAL Video Decoder */ @@ -7156,14 +7587,14 @@ index 099a8c5..bdff2d2 100644 #include #include +#pragma GCC diagnostic pop + #include #include "avcodec.h" - #include "internal.h" -diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c -index 3adf28d..2f9195f 100644 ---- a/libavcodec/mpeg4videodec.c -+++ b/libavcodec/mpeg4videodec.c -@@ -2205,6 +2205,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +diff --git b/libavcodec/mpeg4videodec.c a/libavcodec/mpeg4videodec.c +index 791a07b..502c21f 100644 +--- b/libavcodec/mpeg4videodec.c ++++ a/libavcodec/mpeg4videodec.c +@@ -2249,6 +2249,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) if (ctx->divx_version >= 0) s->workaround_bugs |= FF_BUG_HPEL_CHROMA; @@ -7173,7 +7604,7 @@ index 3adf28d..2f9195f 100644 } if (s->workaround_bugs & FF_BUG_STD_QPEL) { -@@ -2229,6 +2232,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +@@ -2273,6 +2276,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) s->workaround_bugs, ctx->lavc_build, ctx->xvid_build, ctx->divx_version, ctx->divx_build, s->divx_packed ? 
"p" : ""); @@ -7181,11 +7612,25 @@ index 3adf28d..2f9195f 100644 if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && s->codec_id == AV_CODEC_ID_MPEG4 && avctx->idct_algo == FF_IDCT_AUTO) { -diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h +diff --git b/libavcodec/mpegvideo_enc.c a/libavcodec/mpegvideo_enc.c +index 882cf09..71a858f 100644 +--- b/libavcodec/mpegvideo_enc.c ++++ a/libavcodec/mpegvideo_enc.c +@@ -399,9 +399,6 @@ FF_ENABLE_DEPRECATION_WARNINGS + return AVERROR(EINVAL); + } + +- if (s->huffman && avctx->codec_id == AV_CODEC_ID_AMV) +- s->huffman = 0; +- + if (s->intra_dc_precision > (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO ? 3 : 0)) { + av_log(avctx, AV_LOG_ERROR, "intra dc precision too large\n"); + return AVERROR(EINVAL); +diff --git b/libavcodec/rpi_hevc_transform.h a/libavcodec/rpi_hevc_transform.h new file mode 100644 index 0000000..4309f1c --- /dev/null -+++ b/libavcodec/rpi_hevc_transform.h ++++ a/libavcodec/rpi_hevc_transform.h @@ -0,0 +1,3070 @@ +unsigned char rpi_hevc_transform [] = { +21, @@ -10257,11 +10702,11 @@ index 0000000..4309f1c +33, +3, +}; -diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s +diff --git b/libavcodec/rpi_hevc_transform.s a/libavcodec/rpi_hevc_transform.s new file mode 100644 index 0000000..5543093 --- /dev/null -+++ b/libavcodec/rpi_hevc_transform.s ++++ a/libavcodec/rpi_hevc_transform.s @@ -0,0 +1,917 @@ +# ****************************************************************************** +# Argon Design Ltd. @@ -11180,12 +11625,12 @@ index 0000000..5543093 + bgt loop_cmds + + pop r6-r7, pc -diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c +diff --git b/libavcodec/rpi_mailbox.c a/libavcodec/rpi_mailbox.c new file mode 100644 -index 0000000..3904efc +index 0000000..8d8a20d --- /dev/null -+++ b/libavcodec/rpi_mailbox.c -@@ -0,0 +1,340 @@ ++++ a/libavcodec/rpi_mailbox.c +@@ -0,0 +1,118 @@ +/* +Copyright (c) 2012, Broadcom Europe Ltd. 
+All rights reserved. @@ -11220,7 +11665,6 @@ index 0000000..3904efc +#include +#include +#include -+#include +#include + +#include @@ -11231,75 +11675,6 @@ index 0000000..3904efc + +#include "rpi_mailbox.h" + -+#define PAGE_SIZE (4*1024) -+ -+// Shared memory will not be cached in ARM cache -+void *mapmem_shared(unsigned base, unsigned size) -+{ -+ int mem_fd; -+ unsigned offset = base % PAGE_SIZE; -+ base = base - offset; -+ /* open /dev/mem */ -+ if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) { -+ printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n"); -+ return NULL; -+ } -+ void *mem = mmap( -+ 0, -+ size, -+ PROT_READ|PROT_WRITE, -+ MAP_SHARED/*|MAP_FIXED*/, -+ mem_fd, -+ base); -+#ifdef DEBUG -+ printf("base=0x%x, mem=%p\n", base, mem); -+#endif -+ if (mem == MAP_FAILED) { -+ printf("mmap error %d\n", (int)mem); -+ return NULL; -+ } -+ close(mem_fd); -+ return (char *)mem + offset; -+} -+ -+// Unshared memory will be faster as lives in ARM cache, but requires cache flushing -+void *mapmem_private(unsigned base, unsigned size) -+{ -+ int mem_fd; -+ unsigned offset = base % PAGE_SIZE; -+ base = base - offset; -+ /* open /dev/mem */ -+ if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) { -+ printf("can't open /dev/mem\nThis program should be run as root. 
Try prefixing command with: sudo\n"); -+ return NULL; -+ } -+ void *mem = mmap( -+ 0, -+ size, -+ PROT_READ|PROT_WRITE, -+ MAP_PRIVATE/*|MAP_FIXED*/, -+ mem_fd, -+ base); -+#ifdef DEBUG -+ printf("base=0x%x, mem=%p\n", base, mem); -+#endif -+ if (mem == MAP_FAILED) { -+ printf("mmap error %d\n", (int)mem); -+ return NULL; -+ } -+ close(mem_fd); -+ return (char *)mem + offset; -+} -+ -+void unmapmem(void *addr, unsigned size) -+{ -+ int s = munmap(addr, size); -+ if (s != 0) { -+ printf("munmap error %d\n", s); -+ exit (-1); -+ } -+} -+ +/* + * use ioctl to send mbox property message + */ @@ -11320,47 +11695,7 @@ index 0000000..3904efc + return ret_val; +} + -+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags) -+{ -+ int i=0; -+ unsigned p[32]; -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ -+ p[i++] = 0x3000c; // (the tag id) -+ p[i++] = 12; // (size of the buffer) -+ p[i++] = 12; // (size of the data) -+ p[i++] = size; // (num bytes? or pages?) 
-+ p[i++] = align; // (alignment) -+ p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING) -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+unsigned mem_free(int file_desc, unsigned handle) -+{ -+ int i=0; -+ unsigned p[32]; -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ -+ p[i++] = 0x3000f; // (the tag id) -+ p[i++] = 4; // (size of the buffer) -+ p[i++] = 4; // (size of the data) -+ p[i++] = handle; -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+unsigned mem_lock(int file_desc, unsigned handle) ++unsigned mbox_mem_lock(int file_desc, unsigned handle) +{ + int i=0; + unsigned p[32]; @@ -11379,7 +11714,7 @@ index 0000000..3904efc + return p[5]; +} + -+unsigned mem_unlock(int file_desc, unsigned handle) ++unsigned mbox_mem_unlock(int file_desc, unsigned handle) +{ + int i=0; + unsigned p[32]; @@ -11398,118 +11733,6 @@ index 0000000..3904efc + return p[5]; +} + -+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5) -+{ -+ int i=0; -+ unsigned p[32]; -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ -+ p[i++] = 0x30010; // (the tag id) -+ p[i++] = 28; // (size of the buffer) -+ p[i++] = 28; // (size of the data) -+ p[i++] = code; -+ p[i++] = r0; -+ p[i++] = r1; -+ p[i++] = r2; -+ p[i++] = r3; -+ p[i++] = r4; -+ p[i++] = r5; -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+unsigned qpu_enable(int file_desc, unsigned enable) -+{ -+ int i=0; -+ unsigned p[32]; -+ -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ -+ p[i++] = 0x30012; // (the tag id) -+ p[i++] = 4; // (size of the buffer) -+ p[i++] = 4; // (size of the data) -+ p[i++] = enable; -+ -+ p[i++] = 0x00000000; // end tag -+ 
p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) { -+ int i=0; -+ unsigned p[32]; -+ -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ p[i++] = 0x30011; // (the tag id) -+ p[i++] = 16; // (size of the buffer) -+ p[i++] = 16; // (size of the data) -+ p[i++] = num_qpus; -+ p[i++] = control; -+ p[i++] = noflush; -+ p[i++] = timeout; // ms -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+void execute_multi(int file_desc, -+ unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout, -+ unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2, -+ unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, -+ unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) { -+ int i=0; -+ unsigned p[32]; -+ -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ p[i++] = 0x30018; // (the tag id) -+ p[i++] = 88; // (size of the buffer) -+ p[i++] = 88; // (size of the data) -+ -+ p[i++] = num_qpus; -+ p[i++] = control; -+ p[i++] = noflush; -+ p[i++] = timeout; // ms -+ -+ p[i++] = num_qpus_2; -+ p[i++] = control_2; -+ p[i++] = noflush_2; -+ p[i++] = timeout_2; // ms -+ -+ p[i++] = code; -+ p[i++] = r0; -+ p[i++] = r1; -+ p[i++] = r2; -+ p[i++] = r3; -+ p[i++] = r4; -+ p[i++] = r5; -+ -+ p[i++] = code_2; -+ p[i++] = r0_2; -+ p[i++] = r1_2; -+ p[i++] = r2_2; -+ p[i++] = r3_2; -+ p[i++] = r4_2; -+ p[i++] = r5_2; -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return; -+} + +int mbox_open() { + int file_desc; @@ -11526,55 +11749,29 @@ index 0000000..3904efc +void mbox_close(int file_desc) { + close(file_desc); +} -diff --git 
a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h +diff --git b/libavcodec/rpi_mailbox.h a/libavcodec/rpi_mailbox.h new file mode 100644 -index 0000000..5898102 +index 0000000..b51303b --- /dev/null -+++ b/libavcodec/rpi_mailbox.h -@@ -0,0 +1,25 @@ ++++ a/libavcodec/rpi_mailbox.h +@@ -0,0 +1,10 @@ +#ifndef RPI_MAILBOX_H +#define RPI_MAILBOX_H + +extern int mbox_open(void); +extern void mbox_close(int file_desc); + -+extern unsigned get_version(int file_desc); -+extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags); -+extern unsigned mem_free(int file_desc, unsigned handle); -+extern unsigned mem_lock(int file_desc, unsigned handle); -+extern unsigned mem_unlock(int file_desc, unsigned handle); -+extern void *mapmem_shared(unsigned base, unsigned size); -+extern void *mapmem_private(unsigned base, unsigned size); -+extern void unmapmem(void *addr, unsigned size); -+ -+extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5); -+extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout); -+extern void execute_multi(int file_desc, -+ unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout, -+ unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2, -+ unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, -+ unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2); -+extern unsigned qpu_enable(int file_desc, unsigned enable); ++extern unsigned mbox_mem_lock(int file_desc, unsigned handle); ++extern unsigned mbox_mem_unlock(int file_desc, unsigned handle); + +#endif -diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c +diff --git b/libavcodec/rpi_qpu.c a/libavcodec/rpi_qpu.c new file mode 100644 -index 0000000..365f4a6 +index 0000000..be58458 --- /dev/null -+++ 
b/libavcodec/rpi_qpu.c -@@ -0,0 +1,993 @@ ++++ a/libavcodec/rpi_qpu.c +@@ -0,0 +1,827 @@ +#ifdef RPI -+// Use vchiq service for submitting jobs -+#define GPUSERVICE -+ -+// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device. -+// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code -+//#define RPI_TIME_TOTAL_QPU -+// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code -+//#define RPI_TIME_TOTAL_VPU -+// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined -+#define RPI_TIME_TOTAL_POSTED -+ +#include +#include +#include @@ -11592,22 +11789,23 @@ index 0000000..365f4a6 +#include "rpi_shader.h" +#include "rpi_hevc_transform.h" + -+#include "rpi_user_vcsm.h" -+#ifdef GPUSERVICE +#pragma GCC diagnostic push +// Many many redundant decls in the header files +#pragma GCC diagnostic ignored "-Wredundant-decls" +#include "interface/vmcs_host/vc_vchi_gpuserv.h" +#pragma GCC diagnostic pop -+#endif + -+// QPU profile flags -+#define NO_FLUSH 1 -+#define CLEAR_PROFILE 2 -+#define OUTPUT_COUNTS 4 ++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No) ++#define RPI_TRACE_TIME_VPU_QPU_WAIT 0 + -+#define FLAGS_FOR_PROFILING (NO_FLUSH) ++// QPU "noflush" flags ++// a mixture of flushing & profiling + ++#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed ++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers ++#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results ++#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independent of the profiling ++#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) + +// On Pi2 there is no way to access the VPU L2 cache +// GPU_MEM_FLG should be 4 for uncached memory. 
(Or C for alias to allocate in the VPU L2 cache) @@ -11664,65 +11862,212 @@ index 0000000..365f4a6 +{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90} +}; + ++// Code/constants on GPU +struct GPU +{ + unsigned int qpu_code[QPU_CODE_SIZE]; + unsigned int vpu_code[VPU_CODE_SIZE]; + short transMatrix2even[16*16*2]; -+ int open_count; // Number of allocated video buffers -+ int mb; // Mailbox handle -+ int vc; // Address in GPU memory -+ int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task -+ int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task +}; + ++ ++#define WAIT_COUNT_MAX 16 ++ ++typedef struct trace_time_one_s ++{ ++ int count; ++ int64_t start[WAIT_COUNT_MAX]; ++ int64_t total[WAIT_COUNT_MAX]; ++} trace_time_one_t; ++ ++typedef struct trace_time_wait_s ++{ ++ unsigned int jcount; ++ int64_t start0; ++ int64_t last_update; ++ trace_time_one_t active; ++ trace_time_one_t wait; ++} trace_time_wait_t; ++ ++typedef struct vq_wait_s ++{ ++ sem_t sem; ++ unsigned int cost; ++ struct vq_wait_s * next; ++} vq_wait_t; ++ ++#define VQ_WAIT_POOL_SIZE 16 ++typedef struct vq_wait_pool_s ++{ ++ vq_wait_t * head; ++ vq_wait_t pool[VQ_WAIT_POOL_SIZE]; ++} vq_wait_pool_t; ++ ++static void vq_wait_pool_init(vq_wait_pool_t * const pool); ++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool); ++ ++typedef struct gpu_env_s ++{ ++ int open_count; ++ int init_count; ++ int mb; ++ unsigned int current_load; ++ GPU_MEM_PTR_T code_gm_ptr; ++ vq_wait_pool_t wait_pool; ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ trace_time_wait_t ttw; ++#endif ++} gpu_env_t; ++ +// Stop more than one thread trying to allocate memory or use the processing resources at once +static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER; -+static volatile struct GPU* gpu = NULL; -+static GPU_MEM_PTR_T gpu_mem_ptr; ++static gpu_env_t * gpu = NULL; + -+#if defined(RPI_TIME_TOTAL_QPU) || 
defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED) -+static unsigned int Microseconds(void) { ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ ++static int64_t ns_time(void) ++{ + struct timespec ts; -+ unsigned int x; -+ static unsigned int base = 0; -+ clock_gettime(CLOCK_REALTIME, &ts); -+ x = ts.tv_sec*1000000 + ts.tv_nsec/1000; -+ if (base==0) base=x; -+ return x-base; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec; +} ++ ++ ++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000 ++ ++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U) ++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000) ++#define T_ARG(t) T_SEC(t), T_MS(t) ++#define T_FMT "%u.%03u" ++ ++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix) ++{ ++ // Update totals for levels that are still pending ++ for (int i = 0; i < tto->count; ++i) { ++ tto->total[i] += now - tto->start[i]; ++ tto->start[i] = now; ++ } ++ ++ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n", ++ prefix, ++ T_ARG(now - start0 - tto->total[0]), ++ T_ARG(tto->total[0]), ++ T_ARG(tto->total[1]), ++ T_ARG(tto->total[2]), ++ T_ARG(tto->total[3])); ++} ++ ++ ++static void tto_start(trace_time_one_t * const tto, const int64_t now) ++{ ++ av_assert0(tto->count < WAIT_COUNT_MAX); ++ tto->start[tto->count++] = now; ++} ++ ++static void tto_end(trace_time_one_t * const tto, const int64_t now) ++{ ++ const int n = --tto->count; ++ av_assert0(n >= 0); ++ tto->total[n] += now - tto->start[n]; ++} ++ ++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now) ++{ ++ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0)); ++ tto_print(&ttw->active, now, ttw->start0, "Active"); ++ tto_print(&ttw->wait, now, ttw->start0, " Wait"); ++} ++ +#endif + -+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb); -+static void 
gpu_free_internal(GPU_MEM_PTR_T *p); ++// GPU memory alloc fns (internal) ++ ++// GPU_MEM_PTR_T alloc fns ++static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { ++ p->numbytes = numbytes; ++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); ++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); ++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); ++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); ++ av_assert0(p->vcsm_handle); ++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); ++ av_assert0(p->vc_handle); ++ p->arm = vcsm_lock(p->vcsm_handle); ++ av_assert0(p->arm); ++ p->vc = mbox_mem_lock(mb, p->vc_handle); ++ av_assert0(p->vc); ++ return 0; ++} ++ ++static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { ++ p->numbytes = numbytes; ++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); ++ av_assert0(p->vcsm_handle); ++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); ++ av_assert0(p->vc_handle); ++ p->arm = vcsm_lock(p->vcsm_handle); ++ av_assert0(p->arm); ++ p->vc = mbox_mem_lock(mb, p->vc_handle); ++ av_assert0(p->vc); ++ return 0; ++} ++ ++static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) { ++ mbox_mem_unlock(mb, p->vc_handle); ++ vcsm_unlock_ptr(p->arm); ++ vcsm_free(p->vcsm_handle); ++ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again ++} ++ ++ ++// GPU init, free, lock, unlock ++ ++static void gpu_term(void) ++{ ++ gpu_env_t * const ge = gpu; ++ ++ // We have to hope that eveything has terminated... 
++ gpu = NULL; ++ ++ vc_gpuserv_deinit(); ++ ++ gpu_free_internal(ge->mb, &ge->code_gm_ptr); ++ ++ vcsm_exit(); ++ ++ mbox_close(ge->mb); ++ ++ vq_wait_pool_deinit(&ge->wait_pool); ++ ++ free(ge); ++} ++ + +// Connect to QPU, returns 0 on success. -+static int gpu_init(volatile struct GPU **gpu) { -+ int mb = mbox_open(); -+ int vc; ++static int gpu_init(gpu_env_t ** const gpu) { + volatile struct GPU* ptr; -+ if (mb < 0) -+ return -1; -+#ifndef RPI_ASYNC -+ if (qpu_enable(mb, 1)) return -2; -+#endif ++ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t)); ++ *gpu = NULL; ++ ++ if (ge == NULL) ++ return -1; ++ ++ if ((ge->mb = mbox_open()) < 0) ++ return -1; ++ ++ vq_wait_pool_init(&ge->wait_pool); ++ + vcsm_init(); -+ vc_gpuserv_init(); -+ gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb); -+ ptr = (volatile struct GPU*)gpu_mem_ptr.arm; -+ memset((void*)ptr, 0, sizeof *ptr); -+ vc = gpu_mem_ptr.vc; + -+ ptr->mb = mb; -+ ptr->vc = vc; ++ gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr); ++ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm; + -+ printf("GPU allocated at 0x%x\n",vc); -+ -+ *gpu = ptr; ++ // Zero everything so we have zeros between the code bits ++ memset((void *)ptr, 0, sizeof(*ptr)); + + // Now copy over the QPU code into GPU memory + { -+ int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV); ++ int num_bytes = (char *)mc_end - (char *)rpi_shader; + av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int)); + memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes); + } @@ -11735,106 +12080,56 @@ index 0000000..365f4a6 + // And the transform coefficients + memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); + -+#ifdef RPI_ASYNC -+ { -+ int err; -+ vpu_async_tail = 0; -+ vpu_async_head = 0; -+ err = pthread_create(&vpu_thread, NULL, vpu_start, NULL); -+ //printf("Created thread\n"); -+ if (err) { -+ av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n"); -+ 
return -4; -+ } -+ -+ { -+ struct sched_param param = {0}; -+ int policy = 0; -+ -+ if (pthread_getschedparam(vpu_thread, &policy, ¶m) != 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n"); -+ } -+ else -+ { -+ av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n", -+ policy, -+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" , -+ param.sched_priority); -+ -+ policy = SCHED_FIFO; -+ param.sched_priority = sched_get_priority_max(SCHED_FIFO); -+ -+ av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n", -+ policy, -+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" , -+ param.sched_priority); -+ -+ if (pthread_setschedparam(vpu_thread, policy, ¶m) != 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n"); -+ } -+ else -+ { -+ if (pthread_getschedparam(vpu_thread, &policy, ¶m) != 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n"); -+ } -+ else -+ { -+ av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n", -+ policy, -+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" , -+ param.sched_priority); -+ } -+ } -+ } -+ -+ } -+ -+ } -+#endif -+ ++ *gpu = ge; + return 0; +} + -+// Returns 1 if the gpu is currently idle -+static int gpu_idle(void) -+{ -+ int ret = pthread_mutex_trylock(&gpu_mutex); -+ if (ret==0) { -+ pthread_mutex_unlock(&gpu_mutex); -+ return 1; -+ } -+ return 0; -+} + -+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary. 
-+static void gpu_lock(void) { -+ pthread_mutex_lock(&gpu_mutex); -+ -+ if (gpu==NULL) { -+ gpu_init(&gpu); -+ } -+} + +static void gpu_unlock(void) { + pthread_mutex_unlock(&gpu_mutex); +} + -+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) { -+ p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); -+ av_assert0(p->vcsm_handle); -+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); -+ av_assert0(p->vc_handle); -+ p->arm = vcsm_lock(p->vcsm_handle); -+ av_assert0(p->arm); -+ p->vc = mem_lock(mb, p->vc_handle); -+ av_assert0(p->vc); -+ return 0; ++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary. ++static gpu_env_t * gpu_lock(void) { ++ pthread_mutex_lock(&gpu_mutex); ++ ++ av_assert0(gpu != NULL); ++ return gpu; +} + ++static gpu_env_t * gpu_lock_ref(void) ++{ ++ pthread_mutex_lock(&gpu_mutex); ++ ++ if (gpu == NULL) { ++ int rv = gpu_init(&gpu); ++ if (rv != 0) { ++ gpu_unlock(); ++ return NULL; ++ } ++ } ++ ++ ++gpu->open_count; ++ return gpu; ++} ++ ++static void gpu_unlock_unref(gpu_env_t * const ge) ++{ ++ if (--ge->open_count == 0) ++ gpu_term(); ++ ++ gpu_unlock(); ++} ++ ++static inline gpu_env_t * gpu_ptr(void) ++{ ++ av_assert0(gpu != NULL); ++ return gpu; ++} ++ ++// Public gpu fns ++ +// Allocate memory on GPU +// Fills in structure

containing ARM pointer, videocore handle, videocore memory address, numbytes +// Returns 0 on success. @@ -11843,731 +12138,476 @@ index 0000000..365f4a6 +int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) +{ + int r; -+ gpu_lock(); -+ r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb); -+ gpu->open_count++; ++ gpu_env_t * const ge = gpu_lock_ref(); ++ if (ge == NULL) ++ return -1; ++ r = gpu_malloc_uncached_internal(ge->mb, numbytes, p); + gpu_unlock(); + return r; +} + -+int gpu_get_mailbox(void) -+{ -+ av_assert0(gpu); -+ return gpu->mb; -+} -+ -+// Call this to clean and invalidate a region of memory -+void gpu_cache_flush(const GPU_MEM_PTR_T * const p) -+{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ iocache.s[0].handle = p->vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int) p->arm; -+ iocache.s[0].size = p->numbytes; -+ vcsm_clean_invalid( &iocache ); -+#else -+ void *tmp = vcsm_lock(p->vcsm_handle); -+ vcsm_unlock_ptr(tmp); -+#endif -+} -+ -+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2) -+{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ iocache.s[0].handle = p0->vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int) p0->arm; -+ iocache.s[0].size = p0->numbytes; -+ iocache.s[1].handle = p1->vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int) p1->arm; -+ iocache.s[1].size = p1->numbytes; -+ iocache.s[2].handle = p2->vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int) p2->arm; -+ iocache.s[2].size = p2->numbytes; -+ vcsm_clean_invalid( &iocache ); -+#else -+ void *tmp; -+ tmp = vcsm_lock(p0->vcsm_handle); -+ vcsm_unlock_ptr(tmp); -+ tmp = vcsm_lock(p1->vcsm_handle); -+ vcsm_unlock_ptr(tmp); -+ tmp = vcsm_lock(p2->vcsm_handle); -+ vcsm_unlock_ptr(tmp); -+#endif -+} -+ -+static int gpu_malloc_cached_internal(int 
numbytes, GPU_MEM_PTR_T *p) { -+ p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); -+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); -+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); -+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); -+ av_assert0(p->vcsm_handle); -+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); -+ av_assert0(p->vc_handle); -+ p->arm = vcsm_lock(p->vcsm_handle); -+ av_assert0(p->arm); -+ p->vc = mem_lock(gpu->mb, p->vc_handle); -+ av_assert0(p->vc); -+ return 0; -+} -+ +// This allocates data that will be +// Cached in ARM L2 +// Uncached in VPU L2 +int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) +{ + int r; -+ gpu_lock(); -+ r = gpu_malloc_cached_internal(numbytes, p); -+ gpu->open_count++; ++ gpu_env_t * const ge = gpu_lock_ref(); ++ if (ge == NULL) ++ return -1; ++ r = gpu_malloc_cached_internal(ge->mb, numbytes, p); + gpu_unlock(); + return r; +} + -+static void gpu_term(void) -+{ -+ int mb; -+ -+ if (gpu==NULL) -+ return; -+ mb = gpu->mb; -+ -+ // ??? 
Tear down anything needed for gpuexecute -+ -+ qpu_enable(mb, 0); -+ gpu_free_internal(&gpu_mem_ptr); -+ -+ vc_gpuserv_deinit(); -+ vcsm_exit(); -+ -+ mbox_close(mb); -+ gpu = NULL; -+} -+ -+void gpu_free_internal(GPU_MEM_PTR_T *p) { -+ int mb = gpu->mb; -+ mem_unlock(mb,p->vc_handle); -+ vcsm_unlock_ptr(p->arm); -+ vcsm_free(p->vcsm_handle); -+} -+ -+void gpu_free(GPU_MEM_PTR_T *p) { -+ gpu_lock(); -+ -+ gpu_free_internal(p); -+ -+ gpu->open_count--; -+ if (gpu->open_count==0) { -+ printf("Closing GPU\n"); -+ gpu_term(); -+ gpu = NULL; -+ } -+ gpu_unlock(); ++void gpu_free(GPU_MEM_PTR_T * const p) { ++ gpu_env_t * const ge = gpu_lock(); ++ gpu_free_internal(ge->mb, p); ++ gpu_unlock_unref(ge); +} + +unsigned int vpu_get_fn(void) { + // Make sure that the gpu is initialized -+ if (gpu==NULL) { -+ printf("Preparing gpu\n"); -+ gpu_lock(); -+ gpu_unlock(); -+ } -+ return gpu->vc + offsetof(struct GPU,vpu_code); ++ av_assert0(gpu != NULL); ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code); +} + +unsigned int vpu_get_constants(void) { -+ if (gpu==NULL) { -+ gpu_lock(); ++ av_assert0(gpu != NULL); ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even); ++} ++ ++int gpu_get_mailbox(void) ++{ ++ av_assert0(gpu); ++ return gpu->mb; ++} ++ ++// ---------------------------------------------------------------------------- ++// ++// Cache flush functions ++ ++ ++rpi_cache_flush_env_t * rpi_cache_flush_init() ++{ ++ rpi_cache_flush_env_t * const rfe = calloc(1, sizeof(rpi_cache_flush_env_t)); ++ if (rfe == NULL) ++ return NULL; ++ ++ return rfe; ++} ++ ++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) ++{ ++ if (rfe != NULL) ++ free(rfe); ++} ++ ++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) ++{ ++ int rc = (rfe->n == 0) ? 
0 : vcsm_clean_invalid(&rfe->a); ++ ++ free(rfe); ++ ++ if (rc == 0) ++ return 0; ++ ++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); ++ return rc; ++} ++ ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++{ ++ av_assert0(rfe->n < sizeof(rfe->a.s) / sizeof(rfe->a.s[0])); ++ ++ // Deal with empty pointer trivially ++ if (gm == NULL || gm->numbytes == 0) ++ return; ++ ++ rfe->a.s[rfe->n].cmd = mode; ++ rfe->a.s[rfe->n].handle = gm->vcsm_handle; ++ rfe->a.s[rfe->n].addr = (unsigned int)gm->arm; ++ rfe->a.s[rfe->n].size = gm->numbytes; ++ ++rfe->n; ++} ++ ++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset, const unsigned int size) ++{ ++ // Deal with empty pointer trivially ++ if (gm == NULL || size == 0) ++ return; ++ ++ av_assert0(rfe->n < sizeof(rfe->a.s) / sizeof(rfe->a.s[0])); ++ av_assert0(offset <= gm->numbytes); ++ av_assert0(size <= gm->numbytes); ++ av_assert0(offset + size <= gm->numbytes); ++ ++ rfe->a.s[rfe->n].cmd = mode; ++ rfe->a.s[rfe->n].handle = gm->vcsm_handle; ++ rfe->a.s[rfe->n].addr = (unsigned int)gm->arm + offset; ++ rfe->a.s[rfe->n].size = size; ++ ++rfe->n; ++} ++ ++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) ++{ ++#if !RPI_ONE_BUF ++#error Fixme! 
(NIF) ++#endif ++ if (gpu_is_buf1(frame)) { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode); ++ } ++ else ++ { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode); ++ } ++} ++ ++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, ++ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma) ++{ ++ const unsigned int y_offset = frame->linesize[0] * start_line; ++ const unsigned int y_size = frame->linesize[0] * n; ++ // Round UV up/down to get everything ++ const unsigned int uv_rnd = (1U << uv_shift) >> 1; ++ const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift); ++ const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - uv_offset; ++ ++ // As all unsigned they will also reject -ve ++ // Test individually as well as added to reject overflow ++ av_assert0(start_line <= (unsigned int)frame->height); ++ av_assert0(n <= (unsigned int)frame->height); ++ av_assert0(start_line + n <= (unsigned int)frame->height); ++ ++ if (gpu_is_buf1(frame)) { ++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size); ++ } ++ } ++ else ++ { ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, 
gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); ++ } ++ } ++} ++ ++// Call this to clean and invalidate a region of memory ++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode) ++{ ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); ++ rpi_cache_flush_add_gm_ptr(rfe, p, mode); ++ rpi_cache_flush_finish(rfe); ++} ++ ++ ++// ---------------------------------------------------------------------------- ++ ++ ++// Wait abstractions - mostly so we can easily add profile code ++static void vq_wait_pool_init(vq_wait_pool_t * const wp) ++{ ++ unsigned int i; ++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { ++ sem_init(&wp->pool[i].sem, 0, 0); ++ wp->pool[i].next = wp->pool + i + 1; ++ } ++ wp->head = wp->pool + 0; ++ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL; ++} ++ ++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp) ++{ ++ unsigned int i; ++ wp->head = NULL; ++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { ++ sem_destroy(&wp->pool[i].sem); ++ wp->pool[i].next = NULL; ++ } ++} ++ ++ ++// If sem_init actually takes time then maybe we want a pool... 
++static vq_wait_t * vq_wait_new(const unsigned int cost) ++{ ++ gpu_env_t * const ge = gpu_lock_ref(); ++ vq_wait_t * const wait = ge->wait_pool.head; ++ ge->wait_pool.head = wait->next; ++ ge->current_load += cost; ++ wait->cost = cost; ++ wait->next = NULL; ++ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ tto_start(&ge->ttw.active, ns_time()); ++#endif ++ ++ gpu_unlock(); ++ return wait; ++} ++ ++static void vq_wait_delete(vq_wait_t * const wait) ++{ ++ gpu_env_t * const ge = gpu_lock(); ++ wait->next = ge->wait_pool.head; ++ ge->wait_pool.head = wait; ++ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ { ++ trace_time_wait_t * const ttw = &ge->ttw; ++ const int64_t now = ns_time(); ++ ++ttw->jcount; ++ tto_end(&ttw->wait, now); ++ ++ if (ttw->start0 == 0) ++ { ++ ttw->start0 = ttw->active.start[0]; ++ ttw->last_update = ttw->start0; ++ } ++ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD) ++ { ++ ttw->last_update += WAIT_TIME_PRINT_PERIOD; ++ ttw_print(ttw, now); ++ } ++ } ++#endif ++ gpu_unlock_unref(ge); ++} ++ ++static void vq_wait_wait(vq_wait_t * const wait) ++{ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ { ++ const int64_t now = ns_time(); ++ gpu_env_t * const ge = gpu_lock(); ++ tto_start(&ge->ttw.wait, now); ++ gpu_unlock(); ++ } ++#endif ++ ++ while (sem_wait(&wait->sem) == -1 && errno == EINTR) ++ /* loop */; ++} ++ ++static void vq_wait_post(vq_wait_t * const wait) ++{ ++#if !RPI_TRACE_TIME_VPU_QPU_WAIT ++ if (wait->cost != 0) ++#endif ++ { ++ gpu_env_t *const ge = gpu_lock(); ++ ge->current_load -= wait->cost; ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ tto_end(&ge->ttw.active, ns_time()); ++#endif + gpu_unlock(); + } -+ return gpu->vc + offsetof(struct GPU,transMatrix2even); ++ ++ sem_post(&wait->sem); +} + -+#ifdef GPUSERVICE -+static void callback(void *cookie) ++ ++ ++// Header comments were wrong for these two ++#define VPU_QPU_MASK_QPU 1 ++#define VPU_QPU_MASK_VPU 2 ++ ++#define VPU_QPU_JOB_MAX 4 ++struct vpu_qpu_job_env_s +{ -+ sem_post((sem_t *)cookie); ++ unsigned int n; 
++ unsigned int mask; ++ unsigned int cost; ++ struct gpu_job_s j[VPU_QPU_JOB_MAX]; ++}; ++ ++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; ++ ++vpu_qpu_job_env_t * vpu_qpu_job_new(void) ++{ ++ vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t)); ++ return vqj; +} -+#endif + -+ -+static volatile uint32_t post_done = 0; -+static volatile uint32_t post_qed = 0; -+ -+static void post_code2_cb(void * v) ++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj) +{ -+ uint32_t n = (uint32_t)v; -+ if ((int32_t)(n - post_done) > 0) { -+ post_done = n; ++ memset(vqj, 0, sizeof(*vqj)); ++ free(vqj); ++} ++ ++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj) ++{ ++ struct gpu_job_s * const j = vqj->j + vqj->n++; ++ av_assert0(vqj->n <= VPU_QPU_JOB_MAX); ++ return j; ++} ++ ++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code, ++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5) ++{ ++ if (vpu_code != 0) { ++ struct gpu_job_s *const j = new_job(vqj); ++ vqj->mask |= VPU_QPU_MASK_VPU; ++ ++ j->command = EXECUTE_VPU; ++ j->u.v.q[0] = vpu_code; ++ j->u.v.q[1] = r0; ++ j->u.v.q[2] = r1; ++ j->u.v.q[3] = r2; ++ j->u.v.q[4] = r3; ++ j->u.v.q[5] = r4; ++ j->u.v.q[6] = r5; + } +} + -+ -+// Post a command to the queue -+// Returns an id which we can use to wait for completion -+int vpu_post_code2(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf) ++// flags are QPU_FLAGS_xxx ++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail) +{ -+ struct gpu_job_s j[1] = { -+ { -+ .command = EXECUTE_VPU, -+ .u.v.q = {code, r0, r1, r2, r3, r4, r5}, -+ .callback.func = post_code2_cb -+ } -+ }; -+ uint32_t id; ++ if (n != 0) { ++ struct gpu_job_s *const j = new_job(vqj); ++ vqj->mask |= VPU_QPU_MASK_QPU; ++ vqj->cost += cost; + -+ 
j[0].callback.cookie = (void *)(id = ++post_qed); -+ -+ av_assert0(vc_gpuserv_execute_code(1, j) == 0); -+ -+ return id; ++ j->command = EXECUTE_QPU; ++ j->u.q.jobs = n; ++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU; ++ j->u.q.timeout = 5000; ++ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t)); ++ } +} + -+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, -+ int qpu0_n, const uint32_t * qpu0_mail, -+ int qpu1_n, const uint32_t * qpu1_mail) ++// Convert callback to sem post ++static void vpu_qpu_job_callback_wait(void * v) +{ -+#if 1 -+ sem_t sync0; -+ struct gpu_job_s j[4]; ++ vq_wait_post(v); ++} + -+ sem_init(&sync0, 0, 0); ++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h) ++{ ++ vq_wait_t * wait; + -+ j[0].command = EXECUTE_VPU; -+ j[0].u.v.q[0] = vpu_code; -+ j[0].u.v.q[1] = r0; -+ j[0].u.v.q[2] = r1; -+ j[0].u.v.q[3] = r2; -+ j[0].u.v.q[4] = r3; -+ j[0].u.v.q[5] = r4; -+ j[0].u.v.q[6] = r5; -+ j[0].callback.func = 0; -+ j[0].callback.cookie = NULL; ++ if (vqj->mask == 0) { ++ *wait_h = NULL; ++ return; ++ } + -+ j[1].command = EXECUTE_QPU; -+ j[1].u.q.jobs = qpu1_n; -+ memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t)); -+ j[1].u.q.noflush = FLAGS_FOR_PROFILING; -+ j[1].u.q.timeout = 5000; -+ j[1].callback.func = 0; -+ j[1].callback.cookie = NULL; ++ // We are going to want a sync object ++ wait = vq_wait_new(vqj->cost); + -+ j[2].command = EXECUTE_QPU; -+ j[2].u.q.jobs = qpu0_n; -+ memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t)); -+ j[2].u.q.noflush = 1; -+ j[2].u.q.timeout = 5000; -+ j[2].callback.func = 0; -+ j[2].callback.cookie = NULL; ++ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync ++ // If we only posted one thing or only QPU jobs ++ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) ++ { ++ struct gpu_job_s * const j = vqj->j + (vqj->n - 1); ++ 
av_assert0(j->callback.func == 0); + -+ j[3].command = EXECUTE_SYNC; -+ j[3].u.s.mask = 3; -+ j[3].callback.func = callback; -+ j[3].callback.cookie = (void *)&sync0; ++ j->callback.func = vpu_qpu_job_callback_wait; ++ j->callback.cookie = wait; ++ } ++ else ++ { ++ struct gpu_job_s *const j = new_job(vqj); + -+ av_assert0(vc_gpuserv_execute_code(4, j) == 0); ++ j->command = EXECUTE_SYNC; ++ j->u.s.mask = vqj->mask; ++ j->callback.func = vpu_qpu_job_callback_wait; ++ j->callback.cookie = wait; ++ } + -+ sem_wait(&sync0); -+#else ++ vqj->cost = 0; ++ vqj->mask = 0; ++ *wait_h = wait; ++} + -+ sem_t sync0, sync2; -+ struct gpu_job_s j[3]; ++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj) ++{ ++ return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j); ++} + -+ sem_init(&sync0, 0, 0); -+ sem_init(&sync2, 0, 0); ++// Simple wrapper of start + delete ++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj) ++{ ++ int rv; ++ rv = vpu_qpu_job_start(vqj); ++ vpu_qpu_job_delete(vqj); ++ return rv; ++} + -+ j[0].command = EXECUTE_VPU; -+ j[0].u.v.q[0] = vpu_code; -+ j[0].u.v.q[1] = r0; -+ j[0].u.v.q[2] = r1; -+ j[0].u.v.q[3] = r2; -+ j[0].u.v.q[4] = r3; -+ j[0].u.v.q[5] = r4; -+ j[0].u.v.q[6] = r5; -+ j[0].callback.func = callback; -+ j[0].callback.cookie = (void *)&sync0; ++unsigned int vpu_qpu_current_load(void) ++{ ++ return gpu_ptr()->current_load; ++} + -+ j[1].command = EXECUTE_QPU; -+ j[1].u.q.jobs = qpu1_n; -+ memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t)); -+ j[1].u.q.noflush = FLAGS_FOR_PROFILING; -+ j[1].u.q.timeout = 5000; -+ j[1].callback.func = 0; -+ j[1].callback.cookie = NULL; ++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) ++{ ++ if (wait_h != NULL) ++ { ++ vq_wait_t * const wait = *wait_h; ++ if (wait != NULL) { ++ *wait_h = NULL; ++ vq_wait_wait(wait); ++ vq_wait_delete(wait); ++ } ++ } ++} + -+ j[2].command = EXECUTE_QPU; -+ j[2].u.q.jobs = qpu0_n; -+ memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * 
QPU_MAIL_EL_VALS * sizeof(uint32_t)); -+ j[2].u.q.noflush = 1; -+ j[2].u.q.timeout = 5000; -+ j[2].callback.func = callback; -+ j[2].callback.cookie = (void *)&sync2; ++int vpu_qpu_init() ++{ ++ gpu_env_t * const ge = gpu_lock_ref(); ++ if (ge == NULL) ++ return -1; + -+ av_assert0(vc_gpuserv_execute_code(3, j) == 0); -+ -+ sem_wait(&sync0); -+ sem_wait(&sync2); -+#endif ++ if (ge->init_count++ == 0) ++ { ++ vc_gpuserv_init(); ++ } + ++ gpu_unlock(); + return 0; +} + -+ -+// Wait for completion of the given command -+void vpu_wait(int id) ++void vpu_qpu_term() +{ -+ if (id == 0) { -+#if 0 -+ sem_t sync0; -+ struct gpu_job_s j[1] = -+ { -+ { -+ .command = EXECUTE_SYNC, -+ .u.s.mask = 3, -+ .callback.func = callback, -+ .callback.cookie = (void *)&sync0 -+ } -+ }; ++ gpu_env_t * const ge = gpu_lock(); + -+ sem_init(&sync0, 0, 0); ++ if (--ge->init_count == 0) { ++ vc_gpuserv_deinit(); + -+ av_assert0(vc_gpuserv_execute_code(1, j) == 0); -+ -+ sem_wait(&sync0); ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ ttw_print(&ge->ttw, ns_time()); +#endif + } -+ else { -+ while ((int32_t)(post_done - (uint32_t)id) < 0) { -+ usleep(1000); -+ } -+ } ++ ++ gpu_unlock_unref(ge); +} + -+ -+unsigned int qpu_get_fn(int num) { -+ // Make sure that the gpu is initialized -+ unsigned int *fn; -+ if (gpu==NULL) { -+ printf("Preparing gpu\n"); -+ gpu_lock(); -+ gpu_unlock(); -+ } -+ switch(num) { -+ case QPU_MC_SETUP: -+ fn = mc_setup; -+ break; -+ case QPU_MC_FILTER: -+ fn = mc_filter; -+ break; -+ case QPU_MC_EXIT: -+ fn = mc_exit; -+ break; -+ case QPU_MC_INTERRUPT_EXIT12: -+ fn = mc_interrupt_exit12; -+ break; -+ case QPU_MC_FILTER_B: -+ fn = mc_filter_b; -+ break; -+ //case QPU_MC_FILTER_HONLY: -+ // fn = mc_filter_honly; -+ // break; -+ case QPU_MC_SETUP_UV: -+ fn = mc_setup_uv; -+ break; -+ case QPU_MC_FILTER_UV: -+ fn = mc_filter_uv; -+ break; -+ case QPU_MC_FILTER_UV_B0: -+ fn = mc_filter_uv_b0; -+ break; -+ case QPU_MC_FILTER_UV_B: -+ fn = mc_filter_uv_b; -+ break; -+ case 
QPU_MC_INTERRUPT_EXIT8: -+ fn = mc_interrupt_exit8; -+ break; -+ case QPU_MC_END: -+ fn = mc_end; -+ break; -+ default: -+ printf("Unknown function\n"); -+ exit(-1); -+ } -+ return gpu->vc + 4*(int)(fn-rpi_shader); -+ //return code[num] + gpu->vc; -+} -+ -+#if 0 -+typedef unsigned int uint32_t; -+ -+typedef struct mvs_s { -+ GPU_MEM_PTR_T unif_mvs_ptr; -+ uint32_t *unif_mvs; // Base of memory for motion vector commands -+ -+ // _base pointers are to the start of the row -+ uint32_t *mvs_base[8]; -+ // these pointers are to the next free space -+ uint32_t *u_mvs[8]; -+ -+} HEVCContext; -+ -+#define RPI_CHROMA_COMMAND_WORDS 12 -+ -+static void rpi_inter_clear(HEVCContext *s) ++uint32_t qpu_fn(const int * const mc_fn) +{ -+ int i; -+ for(i=0;i<8;i++) { -+ s->u_mvs[i] = s->mvs_base[i]; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 128; // w -+ *s->u_mvs[i]++ = 128; // h -+ *s->u_mvs[i]++ = 128; // stride u -+ *s->u_mvs[i]++ = 128; // stride v -+ s->u_mvs[i] += 3; // Padding words -+ } ++ return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code); +} + -+static void rpi_execute_inter_qpu(HEVCContext *s) -+{ -+ int k; -+ uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc; -+ -+ for(k=0;k<8;k++) { -+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command -+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined -+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // dummy location for V -+ } -+ -+ s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore -+ -+ qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV), -+ (uint32_t)(unif_vc+(s->mvs_base[0 ] - 
(uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm)) -+ ); -+} -+ -+void rpi_test_qpu(void) -+{ -+ HEVCContext mvs; -+ HEVCContext *s = &mvs; -+ int i; -+ int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS; -+ uint32_t *p; -+ printf("Allocate memory\n"); -+ gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr ); -+ s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; -+ -+ // Set up initial locations for uniform streams -+ p = s->unif_mvs; -+ for(i = 0; i < 8; i++) { -+ s->mvs_base[i] = p; -+ p += uv_commands_per_qpu; -+ } -+ // Now run a simple program that should just quit immediately after a single texture fetch -+ rpi_inter_clear(s); -+ for(i=0;i<4;i++) { -+ printf("Launch QPUs\n"); -+ rpi_execute_inter_qpu(s); -+ printf("Done\n"); -+ } -+ printf("Free memory\n"); -+ gpu_free(&s->unif_mvs_ptr); -+ return; -+} -+#endif -+ -+#if 0 -+ -+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4}; -+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1}; -+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4}; -+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1}; -+ -+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24); -+ -+static uint8_t av_clip_uint8(int32_t a) -+{ -+ if (a&(~255)) return (-a)>>31; -+ else return a; -+} -+ -+static int32_t filter8(const uint8_t *data, int pitch) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ -+ for (y = 0; y < 8; y++) { -+ int32_t 
hsum = 0; -+ -+ for (x = 0; x < 8; x++) -+ hsum += hcoeffs[x]*data[x + y * pitch]; -+ -+ vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning -+ } -+ -+ return av_clip_uint8( (vsum + 64) >> 7); -+} -+ -+// Note regression changes coefficients so is not thread safe -+//#define REGRESSION -+#ifdef REGRESSION -+#define CMAX 100 -+#else -+#define CMAX 2 -+#endif -+#define YMAX 16 -+ -+int rpi_test_shader(void) -+{ -+ int i, c; -+ -+ uint32_t *unifs; -+ -+ uint8_t *in_buffer; -+ uint8_t *out_buffer[2]; -+ -+ GPU_MEM_PTR_T unifs_ptr; -+ GPU_MEM_PTR_T in_buffer_ptr; -+ GPU_MEM_PTR_T out_buffer_ptr[2]; -+ -+ // Addresses in GPU memory of filter programs -+ uint32_t mc_setup = 0; -+ uint32_t mc_filter = 0; -+ uint32_t mc_exit = 0; -+ -+ int pitch = 0x500; -+ -+ if (gpu==NULL) { -+ gpu_lock(); -+ gpu_unlock(); -+ } -+ -+ printf("This needs to change to reflect new assembler\n"); -+ // Use table to compute locations of program start points -+ mc_setup = code[0] + gpu->vc; -+ mc_filter = code[1] + gpu->vc; -+ mc_exit = code[2] + gpu->vc; -+ -+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) { -+ return -2; -+ } -+ unifs = (uint32_t*)unifs_ptr.arm; -+ -+ if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) { -+ return -3; -+ } -+ in_buffer = (uint8_t*)in_buffer_ptr.arm; -+ -+ if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) { -+ return -4; -+ } -+ out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm; -+ out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm; -+ -+ for (c = 0; c < CMAX; c++) { -+ int xo[] = {rand()&31, rand()&31}; -+ -+#ifdef REGRESSION -+ for (i = 0; i < 8; i++) { -+ hcoeffs[i] = (int8_t)rand(); -+ vcoeffs[i] = (int8_t)rand(); -+ if (hcoeffs[i]==-128) -+ hcoeffs[i]++; -+ if (vcoeffs[i]==-128) -+ vcoeffs[i]++; -+ } -+#endif -+ -+ for (i = 0; i < 64*23; i++) { -+ //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]); -+ 
in_buffer[i] = rand(); -+ } -+ -+ // Clear output array -+ { -+ int b; -+ for(b=0;b<2;b++) { -+ for(i=0;i<16*16;i++) { -+ out_buffer[b][i] = 3; -+ } -+ } -+ } -+ -+ unifs[0] = mc_filter; -+ unifs[1] = in_buffer_ptr.vc+xo[0]+16; -+ unifs[2] = 64; // src pitch -+ unifs[3] = pitch; // dst pitch -+ unifs[4] = 0; // Padding -+ unifs[5] = 0; -+ unifs[6] = 0; -+ unifs[7 ] = mc_filter; -+ unifs[8 ] = in_buffer_ptr.vc+xo[1]+16; -+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]); -+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]); -+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]); -+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]); -+ unifs[13] = out_buffer_ptr[0].vc; -+ unifs[14] = mc_exit; -+ unifs[15] = in_buffer_ptr.vc+xo[1]+16; // dummy -+ unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]); -+ unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]); -+ unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]); -+ unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]); -+ unifs[20] = out_buffer_ptr[1].vc; -+ -+ printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc); -+ -+ // flush_dcache(); TODO is this needed on ARM side? 
- tried to use the direct alias to avoid this problem -+ -+ //qpu_run_shader(mc_setup, unifs_ptr.vc); -+ //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc); -+ rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]); -+ rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]); -+ -+ if (1) -+ { -+ int x, y, b; -+ int bad = 0; -+ -+ for (b=0; b<2; ++b) -+ for (y=0; yvc; -+ mc_filter = code[1] + gpu->vc; -+ mc_exit = code[2] + gpu->vc; -+ -+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) { -+ return; -+ } -+ //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr); -+ //out_buffer = (uint8_t*)out_buffer_ptr.arm; -+ -+ /*for (y=0; y<16; ++y) { -+ for (x=0; x<16; ++x) { -+ out_buffer[x+y*dst_pitch] = 7; -+ } -+ }*/ -+ -+ unifs = (uint32_t*)unifs_ptr.arm; -+ -+ unifs[0] = mc_filter; -+ unifs[1] = (int)in_buffer_vc; -+ unifs[2] = src_pitch; // src pitch -+ unifs[3] = dst_pitch; // dst pitch -+ unifs[4] = 0; // Padding -+ unifs[5] = 0; -+ unifs[6] = 0; -+ unifs[7 ] = mc_exit; -+ unifs[8 ] = (int)in_buffer_vc; -+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]); -+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]); -+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]); -+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]); -+ unifs[13] = (int)dst_vc; -+ //unifs[13] = (int)out_buffer_ptr.vc; -+ -+ //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc); -+ -+ qpu_run_shader(mc_setup, unifs_ptr.vc); -+ -+ /*for (y=0; y<16; ++y) { -+ for (x=0; x<16; ++x) { -+ dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch]; -+ } -+ }*/ -+ -+ gpu_free(&unifs_ptr); -+ //gpu_free(&out_buffer_ptr); -+} -+ -+ -+ -+#endif -+ +#endif // RPI -diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h +diff --git b/libavcodec/rpi_qpu.h a/libavcodec/rpi_qpu.h new file mode 100644 -index 0000000..c6cdb2b +index 0000000..bcde316 
--- /dev/null -+++ b/libavcodec/rpi_qpu.h -@@ -0,0 +1,176 @@ ++++ a/libavcodec/rpi_qpu.h +@@ -0,0 +1,204 @@ +#ifndef RPI_QPU_H +#define RPI_QPU_H + -+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code -+// *** N.B. Code has rotted & crashes if this is unset (before this set of changes) -+#define RPI_FAST_CACHEFLUSH ++#include + +#define RPI_ONE_BUF 1 + @@ -12582,9 +12622,7 @@ index 0000000..c6cdb2b +// General GPU functions +extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p); +extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p); -+extern void gpu_free(GPU_MEM_PTR_T *p); -+extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p); -+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2); ++extern void gpu_free(GPU_MEM_PTR_T * const p); + +#include "libavutil/frame.h" +#if !RPI_ONE_BUF @@ -12627,29 +12665,31 @@ index 0000000..c6cdb2b + return av_buffer_get_opaque(frame->buf[0]); +} + -+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n) ++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n) +{ + return av_buffer_pool_opaque(frame->buf[n]); +} + ++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n) ++{ ++ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n); ++ return gm->vc + (frame->data[n] - gm->arm); ++} ++ + +static inline uint32_t get_vc_address_y(const AVFrame * const frame) { -+ return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc; ++ return get_vc_address3(frame, 0); +} + +static inline uint32_t get_vc_address_u(const AVFrame * const frame) { -+ return gpu_is_buf1(frame) ? -+ gpu_buf1_gmem(frame)->vc + frame->data[1] - frame->data[0] : -+ gpu_buf3_gmem(frame, 1)->vc; ++ return get_vc_address3(frame, 1); +} + +static inline uint32_t get_vc_address_v(const AVFrame * const frame) { -+ return gpu_is_buf1(frame) ? 
-+ gpu_buf1_gmem(frame)->vc + frame->data[2] - frame->data[0] : -+ gpu_buf3_gmem(frame, 2)->vc; ++ return get_vc_address3(frame, 2); +} + -+ ++#if 0 +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { + if (gpu_is_buf1(frame)) + { @@ -12686,30 +12726,44 @@ index 0000000..c6cdb2b + else + return *gpu_buf3_gmem(frame, 2); +} -+ +#endif ++#endif ++ ++// Cache flush stuff ++ ++typedef struct rpi_flush_envss { ++ unsigned int n; ++ struct vcsm_user_clean_invalid_s a; ++} rpi_cache_flush_env_t; ++ ++rpi_cache_flush_env_t * rpi_cache_flush_init(void); ++// Free env without flushing ++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe); ++// Do the accumulated flush & free the env ++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe); ++ ++typedef enum ++{ ++ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1, ++ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2, ++ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3 ++} rpi_cache_flush_mode_t; ++ ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); ++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, ++ const unsigned int offset, const unsigned int size); ++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode); ++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, ++ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma); ++ ++// init, add, finish for one gm ptr ++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); + + +// QPU specific functions -+extern void rpi_test_qpu(void); ++uint32_t qpu_fn(const int * const mc_fn); + -+enum { -+ QPU_MC_SETUP, -+ QPU_MC_FILTER, -+ QPU_MC_EXIT, 
-+ QPU_MC_INTERRUPT_EXIT12, -+ QPU_MC_FILTER_B, -+ QPU_MC_FILTER_HONLY, -+ QPU_MC_SETUP_UV, -+ QPU_MC_FILTER_UV, -+ QPU_MC_FILTER_UV_B0, -+ QPU_MC_FILTER_UV_B, -+ QPU_MC_INTERRUPT_EXIT8, -+ QPU_MC_END -+ }; -+extern unsigned int qpu_get_fn(int num); -+ -+#define QPU_N_UV 8 ++#define QPU_N_UV 12 +#define QPU_N_Y 12 +#define QPU_N_MAX 16 + @@ -12718,16 +12772,32 @@ index 0000000..c6cdb2b +#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS) +#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t)) + ++struct vpu_qpu_wait_s; ++typedef struct vq_wait_s * vpu_qpu_wait_h; ++ +// VPU specific functions ++ ++struct vpu_qpu_job_env_s; ++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h; ++ ++vpu_qpu_job_h vpu_qpu_job_new(void); ++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); ++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, ++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); ++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail); ++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); ++int vpu_qpu_job_start(const vpu_qpu_job_h vqj); ++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); ++ ++ +extern unsigned int vpu_get_fn(void); +extern unsigned int vpu_get_constants(void); -+//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5); -+extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf); -+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, -+ int qpu0_n, const uint32_t * qpu0_mail, -+ int qpu1_n, const uint32_t * qpu1_mail); + -+extern void vpu_wait( int id); ++// Waits for previous post_codee to complete and Will null out *wait_h after use 
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); ++unsigned int vpu_qpu_current_load(void); ++int vpu_qpu_init(void); ++void vpu_qpu_term(void); + +// Simple test of shader code +extern int rpi_test_shader(void); @@ -12738,12 +12808,12 @@ index 0000000..c6cdb2b +extern int gpu_get_mailbox(void); + +#endif -diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c +diff --git b/libavcodec/rpi_shader.c a/libavcodec/rpi_shader.c new file mode 100644 -index 0000000..06fb166 +index 0000000..627cda9 --- /dev/null -+++ b/libavcodec/rpi_shader.c -@@ -0,0 +1,629 @@ ++++ a/libavcodec/rpi_shader.c +@@ -0,0 +1,624 @@ +#include "rpi_shader.h" + +#ifdef _MSC_VER @@ -12768,642 +12838,645 @@ index 0000000..06fb166 +#endif +unsigned int rpi_shader[] = { +// ::mc_setup_uv -+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif -+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num -+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif -+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif -+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base -+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1 -+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1 -+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif -+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif -+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0 -+/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000 -+/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64 -+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00 -+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000090] */ 0x00000018, 
0xe00215e7, // mov rb23, 24 -+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0 -+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0 -+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0 -+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0 -+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0 -+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0 -+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0 -+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0 -+/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x -+/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y -+/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base -+/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset -+/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1 -+/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3 -+/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0 -+/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch -+/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2 -+/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1 -+/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9 -+/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif -+/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif -+/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1 -+/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15 -+/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2 -+/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2 -+/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 
-+/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2 -+/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3 -+/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1 -+/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1 -+/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) -+/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1 -+/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1 -+/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0 -+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1 -+/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x -+/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base ++/* [0x00000000] */ 0x95801ff6, 0xd002591e, // mov tmurs, 1 ; mov ra_link, unif ++/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00000010] */ 0x159a7d80, 0x10020827, // mov r0, elem_num ++/* [0x00000018] */ 0x0c027c00, 0x14020427, // add ra_x, ra0.16b, r0 ++/* [0x00000020] */ 0x15027d80, 0x12020767, // mov ra_y, ra0.16a ++/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif ++/* [0x00000030] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000038] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base ++/* [0x00000040] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1 ++/* [0x00000048] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1 ++/* [0x00000050] */ 0x15827d80, 
0x10021427, // mov rb16, unif ++/* [0x00000058] */ 0x0c827380, 0x10021627, // add rb24, r1, unif ++/* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 ++/* [0x00000068] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 ++/* [0x00000070] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 ++/* [0x00000078] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x00000080] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00000088] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00000090] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++/* [0x00000098] */ 0x00000000, 0xe0020327, // mov ra12, 0 ++/* [0x000000a0] */ 0x00000000, 0xe0020367, // mov ra13, 0 ++/* [0x000000a8] */ 0x00000000, 0xe00203a7, // mov ra14, 0 ++/* [0x000000b0] */ 0x00000000, 0xe00203e7, // mov ra15, 0 ++/* [0x000000b8] */ 0x00000000, 0xe0020267, // mov ra9, 0 ++/* [0x000000c0] */ 0x15427d80, 0x10020827, // mov r0, ra_x ++/* [0x000000c8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0 ; mov r1, ra_y ++/* [0x000000d0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base ++/* [0x000000d8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset ++/* [0x000000e0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1 ++/* [0x000000e8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3 ++/* [0x000000f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 ++/* [0x000000f8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0 ++/* [0x00000100] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 ++/* [0x00000108] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch ++/* [0x00000110] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2 ++/* [0x00000118] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1 ++/* [0x00000120] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif ++/* [0x00000128] */ 0x15827d80, 0x100009e7, // mov -, unif ++/* [0x00000130] */ 0x15827d80, 0x100009e7, // mov -, unif 
++/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000140] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000150] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000158] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000160] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000168] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000170] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000178] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000180] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000188] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0 ++/* [0x00000190] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 ++/* [0x00000198] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1 ++/* [0x000001a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000001a8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch ++/* [0x000001b0] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x ++/* [0x000001b8] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base +// ::mc_filter_uv -+/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif -+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num -+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif -+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif -+/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif -+/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb28 -+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, 
rb_x_next, r2 -+/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 -+/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3 -+/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27 ; mov ra3, unif -+/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a -+/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b -+/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c -+/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13 -+/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 -+/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1 ++/* [0x000001c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000001c8] */ 0x15827d80, 0x100200a7, // mov ra2, unif ++/* [0x000001d0] */ 0x959a0dbf, 0x10024823, // mov r0, elem_num ; mov r3, unif ++/* [0x000001d8] */ 0x0c0a7c00, 0x14020827, // add r0, ra2.16b, r0 ++/* [0x000001e0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x000001e8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1 ++/* [0x000001f0] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next ++/* [0x000001f8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 ++/* [0x00000200] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif ++/* [0x00000208] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif ++/* [0x00000210] */ 0x9509cdbf, 0x12024731, // mov ra_y_next, ra2.16a ; mov vw_setup, rb28 ++/* 
[0x00000218] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 ++/* [0x00000220] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b ++/* [0x00000228] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 ++/* [0x00000230] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3 ++/* [0x00000238] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 ++/* [0x00000240] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait ++/* [0x00000248] */ 0x00000018, 0xf02809e7, // brr.anyz -, r:filter_uv_1 ++/* [0x00000250] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b ++/* [0x00000258] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 ++/* [0x00000260] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27 ; mov ra3, unif ++/* [0x00000268] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16 ++/* [0x00000270] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10 ++/* [0x00000278] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11 ++// :filter_uv_1 ++/* [0x00000280] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000288] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a ++/* [0x00000290] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b ++/* [0x00000298] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c ++/* [0x000002a0] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d ++/* [0x000002a8] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13 ++/* [0x000002b0] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 ++/* [0x000002b8] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1 +// :uvloop -+/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* 
[0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 -+/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000330] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, 
r1, r0 ; mov -, vw_wait -+/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16 -+/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0 -+/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif ++/* [0x000002c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 ++/* [0x000002c8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 ++/* [0x000002d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 ++/* [0x000002d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch ++/* [0x000002e0] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 ++/* [0x000002e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 ++/* [0x000002f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 ++/* [0x000002f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00000300] */ 0x8c416c8f, 
0x10024e21, // add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 ++/* [0x00000308] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 ++/* [0x00000310] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000318] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 ++/* [0x00000320] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00000328] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00000330] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00000338] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000340] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00000348] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00000350] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00000358] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x00000360] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 ++/* [0x00000368] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop ++/* [0x00000370] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 ++/* [0x00000378] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 ++/* [0x00000380] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 ++/* [0x00000388] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 ++/* [0x00000390] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 ++/* [0x00000398] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait ++/* [0x000003a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++/* [0x000003a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000003b0] */ 0x409ce00f, 0x100049e1, // nop ; mul24 
r1, r1, rb14 ++/* [0x000003b8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 ++/* [0x000003c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 ++/* [0x000003c8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop ++/* [0x000003d0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 ++/* [0x000003d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 ++/* [0x000003e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 ++/* [0x000003e8] */ 0x959dafff, 0x10025c49, // mov vw_setup, rb26 ; mov ra9, rb26 ++/* [0x000003f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000003f8] */ 0x959ddfff, 0x10025c4a, // mov vw_setup, rb29 ; mov ra10, rb29 ++/* [0x00000400] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif ++/* [0x00000408] */ 0x15827d80, 0x100202e7, // mov ra11, unif +// ::mc_filter_uv_b0 -+/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif -+/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num -+/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif -+/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif -+/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif -+/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb21 -+/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 -+/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 -+/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3 -+/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x000004c0] */ 0x918101f6, 0xd0025803, // 
shl r0, r0, i_shift16 ; mov ra3, unif -+/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 -+/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a -+/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b -+/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c -+/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d -+/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov rb14, unif -+/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif ; mov r3, 0 ++/* [0x00000410] */ 0x15827d80, 0x100009e7, // mov -, unif ++/* [0x00000418] */ 0x15827d80, 0x100200a7, // mov ra2, unif ++/* [0x00000420] */ 0x959a0dbf, 0x10024823, // mov r0, elem_num ; mov r3, unif ++/* [0x00000428] */ 0x0c0a7c00, 0x14020827, // add r0, ra2.16b, r0 ++/* [0x00000430] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000438] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1 ++/* [0x00000440] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next ++/* [0x00000448] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 ++/* [0x00000450] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif ++/* [0x00000458] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif ++/* [0x00000460] */ 0x150a7d80, 0x12020727, // mov ra_y_next, ra2.16a ++/* [0x00000468] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 ++/* [0x00000470] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b ++/* [0x00000478] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 ++/* [0x00000480] */ 0x0c043dc0, 0xd20207e7, // add ra31, ra1.16a, 3 ++/* [0x00000488] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 ++/* [0x00000490] */ 0x8c0601bf, 0x14025803, // add r0, r0, ra1.16b ; mov ra3, unif ++/* [0x00000498] */ 0x918101f6, 0xd002480e, // shl r0, r0, i_shift16 ; mov rb14, unif ++/* [0x000004a0] */ 
0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 ++/* [0x000004a8] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a ++/* [0x000004b0] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b ++/* [0x000004b8] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c ++/* [0x000004c0] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d ++/* [0x000004c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x000004d0] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif ; mov r3, 0 +// :uvloop_b0 -+/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 -+/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000560] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop ; 
mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18 -+/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait -+/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6 -+/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop ++/* [0x000004d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 ++/* [0x000004e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 ++/* [0x000004e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 ++/* [0x000004f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch ++/* [0x000004f8] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 ++/* [0x00000500] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 ++/* [0x00000508] */ 0x129de5c0, 0x100208a7, // min r2, r2, 
rb_frame_height_minus_1 ++/* [0x00000510] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00000518] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 ++/* [0x00000520] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 ++/* [0x00000528] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000530] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 ++/* [0x00000538] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00000540] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00000548] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00000550] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000558] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00000560] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00000568] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00000570] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x00000578] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 ++/* [0x00000580] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0 ++/* [0x00000588] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 ++/* [0x00000590] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 ++/* [0x00000598] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 ++/* [0x000005a0] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 ++/* [0x000005a8] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 ++/* [0x000005b0] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0 ; mov ra7, rb6 ++/* [0x000005b8] */ 0x0d7e7780, 0x100229e7, // 
sub.setf -, r3, ra31 ++/* [0x000005c0] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6 ; mov rb6, ra5 ++/* [0x000005c8] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:uvloop_b0 ++/* [0x000005d0] */ 0x95104ff6, 0x10024144, // mov ra5, rb4 ; mov rb4, ra4 ++/* [0x000005d8] */ 0x95185ff6, 0x10024105, // mov ra4, rb5 ; mov rb5, ra6 ++/* [0x000005e0] */ 0x95207ff6, 0x10024187, // mov ra6, rb7 ; mov rb7, ra8 ++/* [0x000005e8] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3 ++/* [0x000005f0] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin ++/* [0x000005f8] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3 ; mov -, unif ++/* [0x00000600] */ 0x95810ff6, 0xd0020827, // mov r0, i_shift16 ; mov -, unif ++/* [0x00000608] */ 0x00010000, 0xe0020867, // mov r1, 0x10000 ++/* [0x00000610] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12 ++/* [0x00000618] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 ++/* [0x00000620] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 ++/* [0x00000628] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 ++/* [0x00000630] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 ++/* [0x00000638] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30 ++/* [0x00000640] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 ++/* [0x00000648] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 ++/* [0x00000650] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin ++/* [0x00000658] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 ++/* [0x00000660] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 ++/* [0x00000668] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 ++// :uv_b0_post12 ++/* [0x00000670] */ 0x95105dbf, 0x100248a3, // mov r2, ra4 ; mov r3, rb5 ++/* [0x00000678] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 ++/* [0x00000680] */ 0x959e749b, 
0x100241c6, // mov ra7, r2 ; mov rb6, r3 ++/* [0x00000688] */ 0x95187dbf, 0x100248a3, // mov r2, ra6 ; mov r3, rb7 ++/* [0x00000690] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 ++/* [0x00000698] */ 0x959e749b, 0x10024144, // mov ra5, r2 ; mov rb4, r3 +// ::mc_filter_uv_b -+/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif -+/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28 -+/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num -+/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0 ; mov ra_y_next, unif -+/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif -+/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 -+/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif -+/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 -+/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3 -+/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 -+/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21 ; mov ra3, unif -+/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8 -+/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21 -+/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 -+/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a -+/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // 
mov.ifnz ra1, unif ; mov rb9, ra3.8b -+/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c -+/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13 -+/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 ++// :uv_b0_post_fin ++/* [0x000006a0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000006a8] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait ++/* [0x000006b0] */ 0x00000018, 0xf02809e7, // brr.anyz -, r:uv_filter_b_1 ++/* [0x000006b8] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num ++/* [0x000006c0] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28 ++/* [0x000006c8] */ 0x0c027c00, 0x14020827, // add r0, ra0.16b, r0 ++/* [0x000006d0] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16 ++/* [0x000006d8] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10 ++/* [0x000006e0] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11 ++// :uv_filter_b_1 ++/* [0x000006e8] */ 0x930001f6, 0xd202581c, // max r0, r0, 0 ; mov ra_y_next, ra0.16a ++/* [0x000006f0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif ++/* [0x000006f8] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 ++/* [0x00000700] */ 0x8c8270f6, 0x10020827, // add r0, r0, r3 ; mov -, unif ++/* [0x00000708] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif ++/* [0x00000710] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000720] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a ++/* [0x00000728] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b ++/* [0x00000730] */ 0x8c0d3eb6, 0x1c02468a, // add ra_frame_base_next, rb_x_next, r2 ; mov rb10, ra3.8c ++/* [0x00000738] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d 
++/* [0x00000740] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13 ++/* [0x00000748] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 +// :uvloop_b -+/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 -+/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000748] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf 
-, r3, 4 ; mov ra12, ra13 -+/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0 ; mul24 r0, vpm, ra4 -+/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a -+/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop ; mul24 r0, r0, rb14 -+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait -+/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16 -+/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0 -+/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif ++/* [0x00000750] */ 0xcd5117de, 
0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 ++/* [0x00000758] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 ++/* [0x00000760] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 ++/* [0x00000768] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch ++/* [0x00000770] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 ++/* [0x00000778] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 ++/* [0x00000780] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 ++/* [0x00000788] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00000790] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 ++/* [0x00000798] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 ++/* [0x000007a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x000007a8] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 ++/* [0x000007b0] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x000007b8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000007c0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000007c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000007d0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000007d8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000007e0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000007e8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x000007f0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 ++/* [0x000007f8] */ 0xffffff38, 
0xf06809e7, // brr.anyn -, r:uvloop_b ++/* [0x00000800] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 ++/* [0x00000808] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 ++/* [0x00000810] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 ++/* [0x00000818] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 ++/* [0x00000820] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 ++/* [0x00000828] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 ++/* [0x00000830] */ 0x55586fce, 0x100241e1, // mov ra7, rb6 ; mul24 r1, r1, ra_k256 ++/* [0x00000838] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14 ; mov rb6, ra5 ++/* [0x00000840] */ 0x55044fce, 0x12024161, // mov ra5, rb4 ; mul24 r1, r1, ra1.16a ++/* [0x00000848] */ 0x8c127236, 0x10024844, // add r1, r1, r0 ; mov rb4, ra4 ++/* [0x00000850] */ 0x55585fce, 0x10024121, // mov ra4, rb5 ; mul24 r1, r1, ra_k256 ++/* [0x00000858] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12 ; mov rb5, ra6 ++/* [0x00000860] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31 ; mov ra6, rb7 ++/* [0x00000868] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b ++/* [0x00000870] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13 ++/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait ; mov rb7, ra8 ++/* [0x00000880] */ 0x150e7d80, 0x18020c27, // mov vpm, ra3.8a ++/* [0x00000888] */ 0x959dafff, 0x10025c49, // mov vw_setup, rb26 ; mov ra9, rb26 ++/* [0x00000890] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000898] */ 0x959ddfff, 0x10025c4a, // mov vw_setup, rb29 ; mov ra10, rb29 ++/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif ++/* [0x000008a8] */ 0x15827d80, 0x100202e7, // mov ra11, unif ++// ::mc_exit_c ++/* [0x000008b0] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait ++/* [0x000008b8] */ 0x00000020, 0xf02809e7, // brr.anyz -, r:exit_c_1 ++/* [0x000008c0] */ 0x009e7000, 0x100009e7, // nop ++/* 
[0x000008c8] */ 0x009e7000, 0x100009e7, // nop ++/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop ++/* [0x000008d8] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16 ++/* [0x000008e0] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10 ++/* [0x000008e8] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11 ++/* [0x000008f0] */ 0x009e7000, 0x100009e7, // nop +// ::mc_exit -+/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0) -+/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop ; nop -+/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_interrupt_exit8 -+/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000908] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop ++// :exit_c_1 ++/* [0x000008f8] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00000900] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x00000908] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00000910] */ 0x159f2fc0, 
0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 ++/* [0x00000918] */ 0x00000000, 0xe80009e7, // mov -,srel(0) ++/* [0x00000920] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop ; nop ++/* [0x00000930] */ 0x009e7000, 0x100009e7, // nop ; nop +// ::mc_setup -+/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16 -+/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif ++/* [0x00000938] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1 ; mov ra8, unif +/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif +/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif +/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif -+/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif -+/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3 -+/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3 -+/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3 -+/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1 -+/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1 -+/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif -+/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0 -+/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8 -+/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3 -+/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3 -+/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3 -+/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num -+/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9 -+/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1 -+/* [0x000009f0] */ 0x149dc1c0, 
0xd0020827, // and r0, r0, ~3 -+/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0 -+/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 -+/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2 -+/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10 -+/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3 -+/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3 -+/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3 -+/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num -+/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11 -+/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3 -+/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1 -+/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0 -+/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 -+/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2 -+/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000aa8] */ 0x00000040, 0xe00207a7, // mov ra30, 64 -+/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00 -+/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24 -+/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0 -+/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0 -+/* [0x00000ad8] */ 0x00000000, 0xe00202a7, 
// mov ra10, 0 -+/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0 -+/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0 -+/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0 -+/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0 -+/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0 -+/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2 -+/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2 -+/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2 -+/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3 -+/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1 -+/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1 -+/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9 -+/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0 -+/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1 -+/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base -+/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0 -+/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1 -+/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2 ++/* [0x00000958] */ 
0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00000960] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000968] */ 0x0d0c1dc0, 0xd4021667, // sub rb_frame_width_minus_1, ra3.16b, 1 ++/* [0x00000970] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_frame_height_minus_1, ra3.16a, 1 ++/* [0x00000978] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000980] */ 0x15827380, 0x10021627, // or rb24, r1, unif ++/* [0x00000988] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num ++/* [0x00000990] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3 ++/* [0x00000998] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x000009a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1 ++/* [0x000009a8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 ++/* [0x000009b0] */ 0x0c201dc0, 0xd4020767, // add ra_y, ra8.16b, 1 ++/* [0x000009b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 ++/* [0x000009c0] */ 0x0c267c00, 0x100208a7, // add r2, ra9, r0 ++/* [0x000009c8] */ 0x13200dc0, 0xd4020867, // max r1, ra8.16b, 0 ++/* [0x000009d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 ++/* [0x000009d8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch ++/* [0x000009e0] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2 ++/* [0x000009e8] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3 ++/* [0x000009f0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x000009f8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1 ++/* [0x00000a00] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3 ++/* [0x00000a08] */ 0x0c281dc0, 0xd4120567, // add ra_y2, ra10.16b, 1 ++/* [0x00000a10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 ++/* [0x00000a18] */ 0x0c2e7c00, 0x100208a7, // add r2, ra11, r0 ++/* [0x00000a20] */ 0x13280dc0, 0xd4020867, // max r1, ra10.16b, 0 ++/* [0x00000a28] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 ++/* [0x00000a30] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, 
rb_pitch ++/* [0x00000a38] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2 ++/* [0x00000a40] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 ++/* [0x00000a48] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 ++/* [0x00000a50] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 ++/* [0x00000a58] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00000a60] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00000a68] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00000a70] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++/* [0x00000a78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000a80] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000a88] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000a90] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000a98] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000aa0] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000aa8] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000ab0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000ab8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000ac0] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000ac8] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9 ++/* [0x00000ad0] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0 ++/* [0x00000ad8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 ++/* [0x00000ae0] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1 ++/* [0x00000ae8] */ 0x55810d8f, 0x100049e1, // mov -, unif ; mul24 r1, r1, rb_pitch ++/* [0x00000af0] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base ++/* [0x00000af8] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0 ++/* [0x00000b00] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 ++/* [0x00000b08] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1 ++/* [0x00000b10] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch ++/* 
[0x00000b18] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2 +// :per_block_setup -+/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif -+/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif ; mov r1, elem_num -+/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next -+/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next -+/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1 -+/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif -+/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8 ; mov ra_y_next, ra1.16b -+/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif -+/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0 -+/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1 -+/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif -+/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3 -+/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3 ; mov ra_y2_next, ra1.16b -+/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif -+/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0 -+/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 -+/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5 -+/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7 -+/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0, 
r0, ra1.16b -+/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif -+/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16 -+/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3 -+/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d -+/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c -+/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400 -+/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d -+/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c -+/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 -+/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d -+/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c -+/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 -+/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d -+/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c -+/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d -+/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c -+/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 -+/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d -+/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c -+/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d -+/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c -+/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 -+/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d ; mov r0, unif -+/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c ; mov r1, rb13 -+/* 
[0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1 ; mov rb4, ra3.8a -+/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3 ; mov rb5, ra3.8b -+/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3 ; mov rb6, ra3.8c -+/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d -+/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9 ++/* [0x00000b20] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000b28] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b30] */ 0x959a0ff6, 0x10024061, // mov ra1, unif ; mov r1, elem_num ++/* [0x00000b38] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next ++/* [0x00000b40] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next ++/* [0x00000b48] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1 ++/* [0x00000b50] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000b58] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif ++/* [0x00000b60] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 ++/* [0x00000b68] */ 0x15067d80, 0x14020727, // mov ra_y_next, ra1.16b ++/* [0x00000b70] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif ++/* [0x00000b78] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0 ++/* [0x00000b80] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1 ++/* [0x00000b88] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif ++/* [0x00000b98] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3 ++/* [0x00000ba0] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b ++/* [0x00000ba8] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif ++/* [0x00000bb0] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0 ++/* [0x00000bb8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 ++/* 
[0x00000bc0] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b ++/* [0x00000bc8] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5 ++/* [0x00000bd0] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7 ++/* [0x00000bd8] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 ++/* [0x00000be0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b ++/* [0x00000be8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 ++/* [0x00000bf0] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif ++/* [0x00000bf8] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16 ; mov ra5, unif ++/* [0x00000c00] */ 0x01040400, 0xe00208a7, // mov r2, 0x01040400 ++/* [0x00000c08] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3 ; mov rb14, ra5.16a ++/* [0x00000c10] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00000c18] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d ++/* [0x00000c20] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c ++/* [0x00000c28] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d ++/* [0x00000c30] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c ++/* [0x00000c38] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 ++/* [0x00000c40] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d ++/* [0x00000c48] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c ++/* [0x00000c50] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 ++/* [0x00000c58] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d ++/* [0x00000c60] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c ++/* [0x00000c68] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00000c70] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d ++/* [0x00000c78] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c ++/* [0x00000c80] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 ++/* [0x00000c88] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d ++/* [0x00000c90] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c ++/* [0x00000c98] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x00000ca0] 
*/ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d ++/* [0x00000ca8] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c ++/* [0x00000cb0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 ++/* [0x00000cb8] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d ++/* [0x00000cc0] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c ++/* [0x00000cc8] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a ; mov ra18, unif ++/* [0x00000cd0] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b ++/* [0x00000cd8] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c ++/* [0x00000ce0] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18 ++/* [0x00000ce8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000cf0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13 ++/* [0x00000cf8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9 ++/* [0x00000d00] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d +// ::mc_filter -+/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15 ++/* [0x00000d08] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1 +// :yloop -+/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 -+/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00000df8] */ 0x129de5c0, 
0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20 -+/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 -+/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 -+/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 -+/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 -+/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 -+/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 -+/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 -+/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 -+/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00000ea8] */ 
0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00000f00] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait -+/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup -+/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif 
++/* [0x00000d10] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 ++/* [0x00000d18] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 ++/* [0x00000d20] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 ++/* [0x00000d28] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch ++/* [0x00000d30] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next ++/* [0x00000d38] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 ++/* [0x00000d40] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 ++/* [0x00000d48] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00000d50] */ 0x8c616c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000d58] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00000d60] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 ++/* [0x00000d68] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00000d70] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8min r1, r1, rb_k255 ++/* [0x00000d78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000d80] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 ++/* [0x00000d88] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00000d90] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00000d98] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00000da0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000da8] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00000db0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 
3 @ "mul_used", 0 ++/* [0x00000db8] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00000dc0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000dc8] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000dd0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00000dd8] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00000de0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00000de8] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000df0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00000df8] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00000e00] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x00000e08] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 ++/* [0x00000e10] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 ++/* [0x00000e18] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop ++/* [0x00000e20] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 ++/* [0x00000e28] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00000e30] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 ++/* [0x00000e38] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a ++/* [0x00000e40] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b ++/* [0x00000e48] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00000e50] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00000e58] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00000e60] */ 0x4c245237, 0x10024860, 
// add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00000e68] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00000e70] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00000e78] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait ++/* [0x00000e80] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++/* [0x00000e88] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00000e90] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 ++/* [0x00000e98] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 ++/* [0x00000ea0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 ++/* [0x00000ea8] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop ++/* [0x00000eb0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 ++/* [0x00000eb8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 ++/* [0x00000ec0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 ++/* [0x00000ec8] */ 0xfffffc38, 0xf0f809e7, // brr -, r:per_block_setup ++/* [0x00000ed0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 ++/* [0x00000ed8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 ++/* [0x00000ee0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif +// ::mc_filter_b -+/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16 +// :yloopb -+/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, 
// add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 -+/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20 -+/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 -+/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 -+/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 -+/* [0x00001050] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 -+/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 -+/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 -+/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 -+/* [0x00001070] */ 0x40071031, 
0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 -+/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12 -+/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14 -+/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8 -+/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait -+/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, 
rb_k255 -+/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup -+/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif ++/* [0x00000ee8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 ++/* [0x00000ef0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 ++/* [0x00000ef8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 ++/* [0x00000f00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch ++/* [0x00000f08] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next ++/* [0x00000f10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 ++/* [0x00000f18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 ++/* [0x00000f20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00000f28] */ 0x8c616c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000f30] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00000f38] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 ++/* [0x00000f40] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00000f48] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8min r1, r1, rb_k255 ++/* [0x00000f50] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000f58] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 ++/* [0x00000f60] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00000f68] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00000f70] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b 
<< 9, r1 << 9 @ "mul_used", 0 ++/* [0x00000f78] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000f80] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00000f88] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00000f90] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00000f98] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000fa0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000fa8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00000fb0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00000fb8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00000fc0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000fc8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00000fd0] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00000fd8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x00000fe0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 ++/* [0x00000fe8] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 ++/* [0x00000ff0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb ++/* [0x00000ff8] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 ++/* [0x00001000] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00001008] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 ++/* [0x00001010] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a ++/* [0x00001018] */ 0x4008903e, 
0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b ++/* [0x00001020] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001028] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001030] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001038] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001040] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001048] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001050] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12 ++/* [0x00001058] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++/* [0x00001060] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00001068] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14 ++/* [0x00001070] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 ++/* [0x00001078] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait ++/* [0x00001080] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 ++/* [0x00001088] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb ++/* [0x00001090] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 ++/* [0x00001098] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 ++/* [0x000010a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 ++/* [0x000010a8] */ 0xfffffa58, 0xf0f809e7, // brr -, r:per_block_setup ++/* [0x000010b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 ++/* [0x000010b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 ++/* [0x000010c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif ++// ::mc_interrupt_exit12c ++/* [0x000010c8] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait ++/* [0x000010d0] */ 0x00000020, 0xf02809e7, // brr.anyz -, r:exit12_c_1 ++/* [0x000010d8] */ 0x009e7000, 0x100009e7, // nop ++/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop ++/* [0x000010e8] */ 0x009e7000, 0x100009e7, // nop 
++/* [0x000010f0] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16 ++/* [0x000010f8] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10 ++/* [0x00001100] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11 ++/* [0x00001108] */ 0x00000000, 0xe0020267, // mov ra9, 0 +// ::mc_interrupt_exit12 -+/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001170] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop ; nop ++// :exit12_c_1 ++/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00001118] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x00001120] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00001128] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 ++/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00001150] */ 0x00000010, 
0xe80009e7, // mov -,sacq(0) ++/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00001178] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00001180] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00001188] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00001190] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop ++/* [0x00001198] */ 0x009e7000, 0x100009e7, // nop ; nop +// ::mc_exit1 -+/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop ; nop ++/* [0x000011a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x000011b0] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x000011b8] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x000011c0] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x000011c8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000011d0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop ++/* [0x000011d8] */ 0x009e7000, 0x100009e7, // nop ; nop +// ::mc_end +}; +#ifdef __HIGHC__ +#pragma Align_to(8, rpi_shader) +#endif -diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h +diff --git b/libavcodec/rpi_shader.h a/libavcodec/rpi_shader.h new file mode 100644 -index 0000000..9772796 +index 0000000..3b1229e --- /dev/null -+++ b/libavcodec/rpi_shader.h -@@ -0,0 +1,19 @@ ++++ a/libavcodec/rpi_shader.h +@@ -0,0 +1,20 @@ +#ifndef rpi_shader_H +#define rpi_shader_H + 
+extern unsigned int rpi_shader[]; + +#define mc_setup_uv (rpi_shader + 0) -+#define mc_filter_uv (rpi_shader + 132) -+#define mc_filter_uv_b0 (rpi_shader + 274) -+#define mc_filter_uv_b (rpi_shader + 392) -+#define mc_exit (rpi_shader + 540) -+#define mc_interrupt_exit8 (rpi_shader + 558) -+#define mc_setup (rpi_shader + 588) -+#define mc_filter (rpi_shader + 872) -+#define mc_filter_b (rpi_shader + 992) -+#define mc_interrupt_exit12 (rpi_shader + 1114) -+#define mc_exit1 (rpi_shader + 1152) -+#define mc_end (rpi_shader + 1168) ++#define mc_filter_uv (rpi_shader + 112) ++#define mc_filter_uv_b0 (rpi_shader + 260) ++#define mc_filter_uv_b (rpi_shader + 424) ++#define mc_exit_c (rpi_shader + 556) ++#define mc_exit (rpi_shader + 574) ++#define mc_setup (rpi_shader + 590) ++#define mc_filter (rpi_shader + 834) ++#define mc_filter_b (rpi_shader + 954) ++#define mc_interrupt_exit12c (rpi_shader + 1074) ++#define mc_interrupt_exit12 (rpi_shader + 1092) ++#define mc_exit1 (rpi_shader + 1128) ++#define mc_end (rpi_shader + 1144) + +#endif -diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm +diff --git b/libavcodec/rpi_shader.qasm a/libavcodec/rpi_shader.qasm new file mode 100644 -index 0000000..aa9e1e7 +index 0000000..6fd6af5 --- /dev/null -+++ b/libavcodec/rpi_shader.qasm -@@ -0,0 +1,1098 @@ ++++ a/libavcodec/rpi_shader.qasm +@@ -0,0 +1,1150 @@ ++ ++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress ++# the warning that we are using rotation & ra/rb registers. r0..3 can be ++# rotated through all 16 elems ra regs can only be routated through their ++# local 4. As it happens this is what is wanted here as we do not want the ++# constants from the other half of the calc. 
++ +# register allocation +# +# ra0...ra7 eight horizontal filter coefficients @@ -13420,7 +13493,7 @@ index 0000000..aa9e1e7 +# +# rb8...rb11 eight vertical filter coefficients + -+# ra4 y: Fiter, UV: 0x10000 ++# ra4 y: Fiter, UV: part -of b0 -> b stash + +# rb12 offset to add before shift (round + weighting offsets) +# rb13 shift: denom + 6 + 9 @@ -13442,10 +13515,10 @@ index 0000000..aa9e1e7 +# ra22 ra_k256 256 +# ra23 ra_y2_next ra_y2_next +# -+# rb20 0xffffff00 -+# rb21 vpm_setup for reading/writing 16bit results into VPM ++# rb20 -- free -- ++# rb21 -- free -- +# rb22 rb_k255 255 -+# rb23 24 ++# rb23 -- free -- +# +# rb24 vdw_setup_1(dst_pitch) +# rb25 frame width-1 @@ -13462,9 +13535,10 @@ index 0000000..aa9e1e7 +# ra27 next ra25 +# ra28 next y +# ra29 y for next texture access -+# ra30 64 +# -+# ra31 next kernel address ++# Use an even numbered register as a link register to avoid corrupting flags ++# ra30 next kernel address ++# ra31 chroma-B height+3; free otherwise + +.set rb_frame_width_minus_1, rb25 +.set rb_frame_height_minus_1, rb30 @@ -13496,22 +13570,46 @@ index 0000000..aa9e1e7 +.set rb_k255, rb22 +.set ra_k256, ra22 + ++.set ra_link, ra30 ++ +# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. 
+.set i_shift16, -16 +.set i_shift21, -11 ++.set i_shift30, -2 ++ ++# Much of the setup code is common between Y & C ++# Macros that express this - obviously these can't be overlapped ++# so are probably unsuitable for loop code ++ ++.macro m_calc_dma_regs, r_vpm, r_dma ++ mov r2, qpu_num ++ asr r1, r2, 2 ++ shl r1, r1, 6 ++ and r0, r2, 3 ++ or r0, r0, r1 ++ ++ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit ++ add r_vpm, r0, r1 # VPM 8bit storage ++ ++ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later ++ shl r0, r0, 5 ++ add r_dma, r0, r1 # DMA out ++.endm ++ + +################################################################################ +# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) +::mc_setup_uv -+ -+# Read starting kernel -+mov ra31, unif ++ mov tmurs, 1 ; mov ra_link, unif # No swap TMUs ; Next fn + +# Load first request location -+add ra_x, unif, elem_num # Store x -+mov ra_y, unif # Store y ++mov ra0, unif ++mov r0, elem_num ++ ++add ra_x, ra0.16b, r0 # Store x ++mov ra_y, ra0.16a # Store y +mov ra_frame_base, unif # Store frame u base -+nop ++mov r1, vdw_setup_1(0) # Merged with dst_stride shortly, delay slot for ra_frame_base +sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame + +# Read image dimensions @@ -13521,77 +13619,59 @@ index 0000000..aa9e1e7 +# get source pitch +mov rb16, unif + -+# get destination pitch -+mov r0, unif -+mov r1, vdw_setup_1(0) -+add rb24, r1, r0 ++# get destination vdw setup ++add rb24, r1, unif # dst_stride + +# load constants ++ mov ra_k1, 1 ++ mov ra_k256, 256 ++ mov rb_k255, 255 + -+mov ra4, 0x10000 -+mov ra_k1, 1 -+mov ra_k256, 256 -+mov ra30, 64 ++# touch registers to keep simulator happy + -+mov rb20, 0xffffff00 -+mov rb_k255, 255 -+mov rb23, 24 ++ # ra/b4..7: B0 -> B stash registers ++ mov ra4, 0 ; mov rb4, 0 ++ 
mov ra5, 0 ; mov rb5, 0 ++ mov ra6, 0 ; mov rb6, 0 ++ mov ra7, 0 ; mov rb7, 0 + -+# touch vertical context to keep simulator happy ++ # ra12..15: vertical scroll registers ++ mov ra12, 0 ++ mov ra13, 0 ++ mov ra14, 0 ++ mov ra15, 0 + -+mov ra8, 0 -+mov ra9, 0 -+mov ra10, 0 -+mov ra11, 0 -+mov ra12, 0 -+mov ra13, 0 -+mov ra14, 0 -+mov ra15, 0 ++ # ra9 - delayed setup - must be 0 initially ++ mov ra9, 0 + +# Compute base address for first and second access +mov r0, ra_x # Load x -+max r0, r0, 0; mov r1, ra_y # Load y ++max r0, r0, 0 ; mov r1, ra_y # Load y +min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base # Load the frame base -+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset ++shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset +add ra_y, r1, 1 +add r0, r0, r3 +and r0, r0, ~3 -+max r1, r1, 0 ; mov ra_x, r0 # y ++max r1, r1, 0 ; mov ra_x, r0 # y +min r1, r1, rb_frame_height_minus_1 +# submit texture requests for first line +add r2, r2, r0 ; mul24 r1, r1, rb_pitch +add t0s, r0, r1 ; mov ra_frame_base, r2 +add t1s, r2, r1 + -+mov r2, 9 -+add rb13, r2, unif # denominator ++add rb13, 9, unif # denominator +mov -, unif # Unused + -+# Compute part of VPM to use for DMA output -+mov r2, unif -+shl r2, r2, 1 # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results) -+and r2, r2, 15 -+mov r1, r2 -+asr r1, r1, 2 -+shl r1, r1, 6 -+mov r0, r2 -+and r0, r0, 3 -+add r0, r0, r1 ++mov -, unif # ??? 
same as (register) qpu_num + -+mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit -+add rb28, r0, r1 # VPM 8bit storage -+asr r2, r0, 1 # r0 = bc0000d -+mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit -+add rb21, r2, r1 # VPM for 16bit intermediates -+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later -+shl r0, r0, 5 -+add rb27, r0, r1 # DMA out ++# Compute part of VPM to use for DMA output ++m_calc_dma_regs rb28, rb27 + +# submit texture requests for second line +max r1, ra_y, 0 +min r1, r1, rb_frame_height_minus_1 +add ra_y, ra_y, 1 -+bra -, ra31 ++bra -, ra_link +nop ; mul24 r1, r1, rb_pitch +add t0s, r1, ra_x +add t1s, r1, ra_frame_base @@ -13605,20 +13685,24 @@ index 0000000..aa9e1e7 +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block +::mc_filter_uv -+mov ra31, unif ++mov ra_link, unif + +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+add r0, unif, elem_num # x -+max r0, r0, 0 ; mov r1, unif # y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base ++mov ra2, unif # x_y ++mov r0, elem_num ; mov r3, unif # frame_base ++ ++add r0, ra2.16b, r0 # x ++max r0, r0, 0 ++min r0, r0, rb_frame_width_minus_1 +# compute offset from frame base u to frame base v +sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next +shl ra_xshift_next, r0, 3 +add r0, r0, r3 ; mov ra1, unif # ; width_height +and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs -+mov ra_y_next, r1 ; mov vw_setup, rb28 ++mov ra_y_next, ra2.16a ; mov vw_setup, rb28 ++ +add ra_frame_base_next, rb_x_next, r2 + +# set up VPM write @@ -13628,9 +13712,19 @@ index 0000000..aa9e1e7 +add rb17, ra1.16a, 1 +add rb18, ra1.16a, 3 +shl r0, ra1.16a, 7 ++ ++ mov.setf -, ra9 ; mov -, vw_wait ++ 
brr.anyz -, r:filter_uv_1 ++ +add r0, r0, ra1.16b # Combine width and height of destination area +shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register +add rb26, r0, rb27 ; mov ra3, unif # ; V filter coeffs ++# >>> (skip V DMA if never requested) ++ ++ sub vw_setup, ra9, -16 ++ mov vw_setup, ra10 ++ mov vw_addr, ra11 ++:filter_uv_1 + +mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + @@ -13662,12 +13756,12 @@ index 0000000..aa9e1e7 +shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 +mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 +mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + +max r2, ra_y, 0 # y +min r2, r2, rb_frame_height_minus_1 +add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+add t0s, ra_x, r2 ; v8subs r1, r1, rb20 ++add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 +add t1s, ra_frame_base, r2 + +# generate seven shifted versions @@ -13677,13 +13771,13 @@ index 0000000..aa9e1e7 + +# apply horizontal filter +nop ; mul24 r3, ra0.8a, r0 -+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 ++nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +sub r0, r2, r3 ; mov r3, rb31 +sub.setf -, r3, 4 ; mov ra12, ra13 +brr.anyn -, r:uvloop 
@@ -13707,24 +13801,15 @@ index 0000000..aa9e1e7 +asr r1, r1, rb13 +min r1, r1, rb_k255 # Delay 2 +max vpm, r1, 0 # Delay 3 ++# >>> + -+# DMA out for U -+ -+mov vw_setup, rb26 # VDW setup 0 -+mov vw_setup, rb29 # Stride -+mov vw_addr, unif # start the VDW -+ -+# DMA out for V -+# We need to wait for the U to complete first, but have nothing useful to compute while we wait. -+# Could potentially push this write into the start of the next pipeline stage. -+mov r0, 16 -+mov -, vw_wait -+ -+bra -, ra31 -+add vw_setup, rb26, r0 # VDW setup 0 -+mov vw_setup, rb29 # Stride -+mov vw_addr, unif # start the VDW -+ ++# DMA out for U & stash for V ++ mov vw_setup, rb26 ; mov ra9, rb26 # VDW setup 0 ++ bra -, ra_link ++ mov vw_setup, rb29 ; mov ra10, rb29 # Stride ++ mov vw_addr, unif # u_dst_addr ++ mov ra11, unif # v_dst_addr ++# >>> + +################################################################################ + @@ -13733,19 +13818,23 @@ index 0000000..aa9e1e7 +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block +::mc_filter_uv_b0 -+mov ra31, unif ++mov -, unif # Ignore chain address - always "b" + +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+add r0, unif, elem_num # x -+max r0, r0, 0 ; mov r1, unif # y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base -+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ; ++mov ra2, unif # x_y ++mov r0, elem_num ; mov r3, unif # frame_base ++ ++add r0, ra2.16b, r0 # x ++max r0, r0, 0 ++min r0, r0, rb_frame_width_minus_1 ++# compute offset from frame base u to frame base v ++sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next +shl ra_xshift_next, r0, 3 -+add r0, r0, r3 ; mov ra1, unif # ; width_height -+and rb_x_next, r0, ~3 ; mov ra0, unif # ; H filter coeffs -+mov ra_y_next, r1 
; mov vw_setup, rb21 ++add r0, r0, r3 ; mov ra1, unif # ; width_height ++and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs ++mov ra_y_next, ra2.16a + +add ra_frame_base_next, rb_x_next, r2 + @@ -13753,14 +13842,12 @@ index 0000000..aa9e1e7 +# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the +# filter code. Unpack into b regs for V + -+# set up VPM write, we need to save 16bit precision -+ +sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width) +add rb17, ra1.16a, 1 -+add rb18, ra1.16a, 3 ++add ra31, ra1.16a, 3 +shl r0, ra1.16a, 7 -+add r0, r0, ra1.16b # Combine width and height of destination area -+shl r0, r0, i_shift16 ; mov ra3, unif # ; V filter coeffs ++add r0, r0, ra1.16b ; mov ra3, unif # Combine width and height of destination area ; V filter coeffs ++shl r0, r0, i_shift16 ; mov rb14, unif # U weight L0 +add rb26, r0, rb27 + +mov rb8, ra3.8a @@ -13773,8 +13860,8 @@ index 0000000..aa9e1e7 + +mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + -+mov rb14, unif # U weight L0 +mov.ifnz rb14, unif ; mov r3, 0 # V weight L0 ; Loop counter ++ +# rb14 unused in b0 but will hang around till the second pass + +# retrieve texture results and pick out bytes @@ -13785,62 +13872,127 @@ index 0000000..aa9e1e7 +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment -+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment ++ shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 ++ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 ++ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch ++ shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 # v8subs masks out 
all but bottom byte + -+max r2, ra_y, 0 # y -+min r2, r2, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+add t1s, ra_frame_base, r2 ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_frame_height_minus_1 ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 ++ add t1s, ra_frame_base, r2 + +# generate seven shifted versions +# interleave with scroll of vertical context + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + -+nop ; mul24 r3, ra0.8a, r0 -+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+sub r0, r2, r3 ; mov r3, rb31 -+sub.setf -, r3, 4 ; mov ra12, ra13 -+brr.anyn -, r:uvloop_b0 -+mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13 -+mov ra14, ra15 -+mov ra15, r0 ; mul24 r0, ra12, rb8 ++ nop ; mul24 r3, ra0.8a, r0 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ sub r0, r2, r3 ; mov r3, rb31 ++ sub.setf -, r3, 4 ; mov ra12, ra13 ++ brr.anyn -, r:uvloop_b0 ++ mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13 ++ mov ra14, ra15 ; mul24 r2, ra15, rb10 # ra15 is about to be ra14 ++ mov ra15, r0 ; mul24 r0, ra12, rb8 +# >>> .anyn uvloop_b0 + -+# apply vertical filter and write to VPM ++# apply vertical filter and write to B-FIFO 
+ -+sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+sub.setf -, r3, rb18 -+brr.anyn -, r:uvloop_b0 -+add r1, r1, r0 ; mul24 r0, ra15, rb11 -+sub r1, r1, r0 ; mov -, vw_wait -+asr vpm, r1, 6 -+# >>> .anyn uvloop_b0 ++ sub r1, r1, r0 ; mov ra8.16b, ra7 # start of B FIFO writes ++ add r1, r1, r2 ; mul24 r0, ra15, rb11 # N.B. ra15 write gap ++ sub r1, r1, r0 ; mov ra7, rb6 + -+# in pass0 we don't really need to save any results, but need to discard the uniforms -+# DMA out for U ++# FIFO goes: ++# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b ++# This arrangement optimizes the inner loop FIFOs at the expense of making the ++# bulk shift between loops quite a bit nastier ++# a8 used as temp + -+bra -, ra31 -+mov -, unif # Delay 1 -+mov -, unif # Delay 2 -+nop # Delay 3 ++ sub.setf -, r3, ra31 ++ asr ra8.16a, r1, 6 ; mov rb6, ra5 # This discards the high bits that might be bad ++ brr.anyn -, r:uvloop_b0 ++ mov ra5, rb4 ; mov rb4, ra4 ++ mov ra4, rb5 ; mov rb5, ra6 ++ mov ra6, rb7 ; mov rb7, ra8 ++# >>> + ++# 1st half done all results now in the a/b4..7 fifo ++ ++# Need to bulk rotate FIFO for heights other than 16 ++# plausible heights are 16, 12, 8, 6, 4, 3, 2 and that is all we deal with ++# we are allowed 3/4 cb_size w/h :-( ++ ++# Destination uniforms discarded ++# At the end drop through to _b - we will always do b after b0 ++ ++ sub.setf -, 15, r3 # 12 + 3 of preroll ++ brr.anyn -, r:uv_b0_post_fin # h > 12 (n) => 16 (do nothing) ++ sub r3, 11, r3 ; mov -, unif # r3 = shifts wanted ; Discard u_dst_addr ++ mov r0, i_shift16 ; mov -, unif # ; Discard v_dst_addr ++ mov r1, 0x10000 ++# >>> ++ brr.anyz -, r:uv_b0_post12 # h == 12 deal with specially ++# If h != 16 && h != 12 then h <= 8 so ++# shift 8 with discard (.16b = .16a on all regs) ++ shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 ++ shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 ++ shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 ++# >>> ++ shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 ++ ++ 
shl.setf -, r3, i_shift30 # b2 -> C, b1 -> N ++# Shift 4 ++ mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 ++ mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 ++ # If we shifted by 4 here then the max length remaining is 4 ++ # so that is it ++ ++ brr -, r:uv_b0_post_fin ++# Shift 2 ++ mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 ++ mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 ++ mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 ++ # 6 / 2 so need 6 outputs ++# >>> ++ ++:uv_b0_post12 ++# this one is annoying as we need to swap halves of things that don't ++# really want to be swapped ++ ++# b7a, a6a, b5a, a4a ++# b4a, a5a, b6a, a7a ++# b7b, a6b, b5b, a4b ++# b4b, a5b, b6b, a7b ++ ++ mov r2, ra4 ; mov r3, rb5 ++ shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 ++ mov ra7, r2 ; mov rb6, r3 ++ ++ mov r2, ra6 ; mov r3, rb7 ++ shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 ++ mov ra5, r2 ; mov rb4, r3 ++ ++:uv_b0_post_fin ++ # drop through + +################################################################################ + +::mc_filter_uv_b -+mov ra31, unif ++ ++ mov ra_link, unif ++ mov.setf -, ra9 ; mov -, vw_wait # Delayed V DMA ++ brr.anyz -, r:uv_filter_b_1 ++ ++ mov ra0, unif ; mov r0, elem_num + +# per-channel shifts were calculated on the *previous* invocation + @@ -13848,30 +14000,23 @@ index 0000000..aa9e1e7 +mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28 + +# get base addresses and per-channel shifts for *next* invocation -+add r0, unif, elem_num # x -+max r0, r0, 0 ; mov ra_y_next, unif # y ++add r0, ra0.16b, r0 # x ++# >>> ++ sub vw_setup, ra9, -16 ++ mov vw_setup, ra10 ++ mov vw_addr, ra11 ++:uv_filter_b_1 ++ ++max r0, r0, 0 ; mov ra_y_next, ra0.16a # y +min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # V frame_base +# compute offset from frame base u to frame base v +sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 # U frame_base -+add r0, r0, r3 ; mov ra1, unif # width_height ++add r0, r0, r3 ; mov -, unif # discard width_height +and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs + -+sub rb29, rb24, ra1.16b # Compute 
vdw_setup1(dst_pitch-width) -+add rb17, ra1.16a, 1 -+add rb18, ra1.16a, 3 -+shl r0, ra1.16a, 7 ++# rb17, rb26, rb29, ra31 inherited from B0 as w/h must be the same + -+add ra_frame_base_next, rb_x_next, r2 -+ -+# r0 is currently height<<7 -+# For vr_setup we want height<<20 (so 20-7=13 additional bits) -+shl r3, r0, i_shift21 ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs -+shr r3, r3, 8 -+add vr_setup, r3, rb21 -+ -+add r0, r0, ra1.16b # Combine width and height of destination area -+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register -+add rb26, r0, rb27 ++mov ra3, unif # V filter coeffs + +# get filter coefficients + @@ -13882,7 +14027,7 @@ index 0000000..aa9e1e7 +# The unif read occurs unconditionally, only the write is conditional +mov ra1, unif ; mov rb8, ra3.8a # U offset/weight ; +mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight ; -+nop ; mov rb10, ra3.8c ++add ra_frame_base_next, rb_x_next, r2 ; mov rb10, ra3.8c +mov r3, 0 ; mov rb11, ra3.8d # Loop counter ; + +shl r1, ra1.16b, rb13 @@ -13902,12 +14047,12 @@ index 0000000..aa9e1e7 +shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 +mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 +mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + +max r2, ra_y, 0 # y +min r2, r2, rb_frame_height_minus_1 +add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+add t0s, ra_x, r2 ; v8subs r1, r1, rb20 ++add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 +add t1s, ra_frame_base, r2 + +# generate seven shifted versions @@ -13916,100 +14061,105 @@ index 0000000..aa9e1e7 +mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + +nop ; mul24 r3, ra0.8a, r0 -+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, 
r0 << 2 -+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 ++nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +sub r0, r2, r3 ; mov r3, rb31 +sub.setf -, r3, 4 ; mov ra12, ra13 +brr.anyn -, r:uvloop_b +mov ra13, ra14 ; mul24 r1, ra14, rb9 -+mov ra14, ra15 ++mov ra14, ra15 ; mul24 r2, ra15, rb10 +mov ra15, r0 ; mul24 r0, ra12, rb8 +# >>> .anyn uvloop_b + +# apply vertical filter and write to VPM + -+sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+add r1, r1, r0 ; mul24 r0, ra15, rb11 -+# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it -+sub r1, r1, r0 ; mul24 r0, vpm, ra4 # ra4 = 0x10000 -+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+asr r1, r1, 14 # shift2=6 ++ sub r1, r1, r0 ; mov ra8.16b, ra7 # FIFO rotate (all ra/b4..7) ++ add r1, r1, r2 ; mul24 r0, ra15, rb11 ++ sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 ++ mov ra7, rb6 ; mul24 r1, r1, ra_k256 ++ asr r1, r1, 14 ; mov rb6, ra5 # shift2=6 + -+asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a -+nop ; mul24 r0, r0, rb14 ++ mov ra5, rb4 ; mul24 r1, r1, ra1.16a ++ add r1, r1, r0 ; mov rb4, ra4 + -+add r1, r1, r0 ; mov -, vw_wait -+shl r1, r1, 8 # Lose bad top 8 bits & sign extend ++ mov ra4, rb5 ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend ++ add r1, r1, rb12 ; mov rb5, ra6 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1) + -+add r1, r1, rb12 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1) ++ sub.setf -, r3, ra31 ; mov ra6, rb7 ++ brr.anyn -, r:uvloop_b ++ asr ra3.8as, r1, rb13 ++ mov -, vw_wait ; mov rb7, ra8 # vw_wait is B-reg 
(annoyingly) ; Final FIFO mov ++ mov vpm, ra3.8a ++# >>> + -+brr.anyn -, r:uvloop_b -+asr r1, r1, rb13 # Delay 1 -+min r1, r1, rb_k255 # Delay 2 -+max vpm, r1, 0 # Delay 3 ++# DMA out for U & stash for V ++ ++ mov vw_setup, rb26 ; mov ra9, rb26 # VDW setup 0 ++ bra -, ra_link ++ mov vw_setup, rb29 ; mov ra10, rb29 # Stride ++ mov vw_addr, unif # u_dst_addr ++ mov ra11, unif # v_dst_addr + + -+# DMA out for U -+ -+mov vw_setup, rb26 # VDW setup 0 -+mov vw_setup, rb29 # Stride -+mov vw_addr, unif # start the VDW -+ -+# DMA out for V -+# We need to wait for the U to complete first, but have nothing useful to compute while we wait. -+# Could potentially push this write into the start of the next pipeline stage. -+mov r0, 16 -+mov -, vw_wait -+ -+bra -, ra31 -+add vw_setup, rb26, r0 # VDW setup 0 -+mov vw_setup, rb29 # Stride -+mov vw_addr, unif # start the VDW + +################################################################################ + +# mc_exit() + ++::mc_exit_c ++ mov.setf -, ra9 ; mov -, vw_wait ++# Annoyingly it looks iike condition codes don't work on writes to special ++# registers so we have to branch around the writes ++ brr.anyz -, r:exit_c_1 ++ nop ++ nop ++ nop ++# >>> ++ ++ sub vw_setup, ra9, -16 ++ mov vw_setup, ra10 ++ mov vw_addr, ra11 ++ nop ++:exit_c_1 ++ +::mc_exit -+mov -, vw_wait # wait on the VDW ++ ldtmu0 ++ ldtmu1 ++ ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW + -+mov -,srel(0) ++ mov -,srel(0) + -+ldtmu0 -+ldtmu1 -+ldtmu0 -+ldtmu1 -+ -+nop ; nop ; thrend -+nop ; nop # delay slot 1 -+nop ; nop # delay slot 2 ++ nop ; nop ; thrend ++ nop ; nop # delay slot 1 ++ nop ; nop # delay slot 2 + +# mc_interrupt_exit8() -+::mc_interrupt_exit8 -+mov -, vw_wait # wait on the VDW -+ -+ldtmu0 -+ldtmu1 -+ldtmu0 -+ldtmu1 -+ -+mov -,sacq(0) # 1 -+mov -,sacq(0) # 2 -+mov -,sacq(0) # 3 -+mov -,sacq(0) # 4 -+mov -,sacq(0) # 5 -+mov -,sacq(0) # 6 -+mov -,sacq(0) # 7 -+ -+nop ; nop ; thrend -+mov interrupt, 1; nop # delay slot 1 -+nop ; nop # 
delay slot 2 -+ ++#::mc_interrupt_exit8 ++#mov -, vw_wait # wait on the VDW ++# ++#ldtmu0 ++#ldtmu1 ++#ldtmu0 ++#ldtmu1 ++# ++#mov -,sacq(0) # 1 ++#mov -,sacq(0) # 2 ++#mov -,sacq(0) # 3 ++#mov -,sacq(0) # 4 ++#mov -,sacq(0) # 5 ++#mov -,sacq(0) # 6 ++#mov -,sacq(0) # 7 ++# ++#nop ; nop ; thrend ++#mov interrupt, 1; nop # delay slot 1 ++#nop ; nop # delay slot 2 ++# + + + @@ -14022,115 +14172,79 @@ index 0000000..aa9e1e7 +################################################################################ +# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel) +::mc_setup -+ mov r3, 16 -+ + # Need to save these because we need to know the frame dimensions before computing texture coordinates -+ mov ra8, unif # y_x -+ mov ra9, unif # ref_y_base -+ mov ra10, unif # y2_x2 -+ mov ra11, unif # ref_y2_base ++ mov tmurs, 1 ; mov ra8, unif # No TMU swap ; y_x ++ mov ra9, unif # ref_y_base ++ mov ra10, unif # y2_x2 ++ mov ra11, unif # ref_y2_base + +# Read image dimensions -+ mov r1, unif # width_height -+ shl r0,r1,r3 -+ asr r1,r1,r3 # width -+ asr r0,r0,r3 # height -+ sub rb_frame_width_minus_1,r1,1 -+ sub rb_frame_height_minus_1,r0,1 -+ -+# get source pitch -+ mov rb_pitch, unif # src_pitch ++ mov ra3, unif # width_height ++ mov rb_pitch, unif # src_pitch [ra3 delay] ++ sub rb_frame_width_minus_1, ra3.16b, 1 ++ sub rb_frame_height_minus_1, ra3.16a, 1 + +# get destination pitch -+ mov r0, unif # dst_pitch + mov r1, vdw_setup_1(0) -+ add rb24, r1, r0 ++ or rb24, r1, unif # dst_pitch + +# Compute base address for first and second access -+ mov r1, ra8 # y_x -+ shl r0,r1,r3 # r0 is x<<16 -+ asr r1,r1,r3 # r1 is y -+ asr r0,r0,r3 # r0 is x -+ add r0, r0, elem_num # Load x ++ mov r3, elem_num ++ add r0, ra8.16a, r3 # Load x + elem_num + max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9 # Load the frame base ++ min r0, r0, rb_frame_width_minus_1 + shl ra_xshift_next, r0, 3 # Compute shifts -+ add ra_y, 
r1, 1 -+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate -+ add r2, r2, r0 # r2 is address for frame0 (not including y offset) -+ max r1, r1, 0 ++ add ra_y, ra8.16b, 1 ++ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate ++ add r2, ra9, r0 # ra9 is address for frame0 (not including y offset) ++ max r1, ra8.16b, 0 + min r1, r1, rb_frame_height_minus_1 -+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0 -+ add t0s, r2, r1 ; mov ra_frame_base, r2 ++ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0 ++ add t0s, r2, r1 ; mov ra_frame_base, r2 + -+ mov r1, ra10 # y_x -+ shl r0,r1,r3 # r0 is x<<16 -+ asr r1,r1,r3 # r1 is y -+ asr r0,r0,r3 # r0 is x -+ add r0, r0, elem_num # Load x ++ # r3 still contains elem_num ++ add r0, ra10.16a, r3 # Load x + max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11 # Load the frame base ++ min r0, r0, rb_frame_width_minus_1 + shl rx_xshift2_next, r0, 3 # Compute shifts -+ add ra_y2, r1, 1 -+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate -+ add r2, r2, r0 # r2 is address for frame1 (not including y offset) -+ max r1, r1, 0 ++ add ra_y2, ra10.16b, 1 ++ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate ++ add r2, ra11, r0 # r2 is address for frame1 (not including y offset) ++ max r1, ra10.16b, 0 + min r1, r1, rb_frame_height_minus_1 -+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0 -+ add t1s, r2, r1 ; mov ra_frame_base2, r2 -+ ++ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0 ++ add t1s, r2, r1 ; mov ra_frame_base2, r2 + +# load constants + + mov ra_k1, 1 + mov ra_k256, 256 -+ mov ra30, 64 -+ -+ mov rb20, 0xffffff00 + mov rb_k255, 255 -+ mov rb23, 24 + +# touch vertical context to keep simulator happy + -+ mov ra8, 0 -+ mov ra9, 0 -+ mov ra10, 0 -+ mov ra11, 0 -+ mov ra12, 0 -+ mov ra13, 0 -+ 
mov ra14, 0 -+ mov ra15, 0 ++ mov ra8, 0 ; mov rb8, 0 ++ mov ra9, 0 ; mov rb9, 0 ++ mov ra10, 0 ; mov rb10, 0 ++ mov ra11, 0 ; mov rb11, 0 + +# Compute part of VPM to use -+ mov r2, qpu_num -+ mov r1, r2 -+ asr r1, r1, 2 -+ shl r1, r1, 6 -+ mov r0, r2 -+ and r0, r0, 3 -+ add r0, r0, r1 -+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit -+ add rb28, r0, r1 # VPM for saving data -+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later -+ shl r0, r0, 5 -+ add rb27, r0, r1 # Command for dma output ++ m_calc_dma_regs rb28, rb27 + +# Weighted prediction denom -+ add rb13, unif, 9 # unif = weight denom + 6 -+ -+ mov -, unif # Unused ++ add rb13, unif, 9 # unif = weight denom + 6 + +# submit texture requests for second line + max r1, ra_y, 0 + min r1, r1, rb_frame_height_minus_1 + add ra_y, ra_y, 1 -+ nop ; mul24 r1, r1, rb_pitch ++ mov -, unif ; mul24 r1, r1, rb_pitch # unused ; + add t0s, r1, ra_frame_base + + max r1, ra_y2, 0 + min r1, r1, rb_frame_height_minus_1 + add ra_y2, ra_y2, 1 -+ nop ; mul24 r1, r1, rb_pitch ++ nop ; mul24 r1, r1, rb_pitch + add t1s, r1, ra_frame_base2 + +# FALL THROUGHT TO PER-BLOCK SETUP @@ -14139,7 +14253,7 @@ index 0000000..aa9e1e7 +# P and B blocks share the same setup code to save on Icache space +:per_block_setup + mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ mov ra31, unif ++ mov ra_link, unif + + mov ra1, unif ; mov r1, elem_num # y_x ; elem_num has implicit unpack?? 
+ @@ -14153,7 +14267,7 @@ index 0000000..aa9e1e7 + max r0, r0, 0 + min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base + shl ra_xshift_next, r0, 3 # Compute shifts -+ mov r3, 8 ; mov ra_y_next, ra1.16b ++ mov ra_y_next, ra1.16b + and r0, r0, ~3 ; mov ra1, unif # y2_x2 + add ra_frame_base_next, r2, r0 + @@ -14161,7 +14275,7 @@ index 0000000..aa9e1e7 + max r0, r0, 0 + min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base + shl rx_xshift2_next, r0, 3 # Compute shifts -+ add r3, r3, r3 ; mov ra_y2_next, ra1.16b # r3 = 16 ; ++ mov ra_y2_next, ra1.16b + and r0, r0, ~3 ; mov ra1, unif # width_height ; r0 gives the clipped and aligned x coordinate + add rx_frame_base2_next, r2, r0 # r2 is address for frame1 (not including y offset) + @@ -14178,8 +14292,9 @@ index 0000000..aa9e1e7 + add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets + +# get filter coefficients and discard unused B frame values -+ shl.ifz r0, r0, i_shift16 # Pick half to use -+ shl ra8, r0, 3 ++ shl.ifz r0, r0, i_shift16 ; mov ra5, unif # Pick half to use ; L0 offset/weight ++ mov r2, 0x01040400 # [ra5 delay] ++ shl ra8, r0, 3 ; mov rb14, ra5.16a + +# Pack the 1st 4 filter coefs for H & V tightly + @@ -14187,9 +14302,8 @@ index 0000000..aa9e1e7 + ror ra2.8a, r1, ra8.8d + ror ra0.8a, r1, ra8.8c + -+ mov r1,0x01040400 -+ ror ra2.8b, r1, ra8.8d -+ ror ra0.8b, r1, ra8.8c ++ ror ra2.8b, r2, ra8.8d ++ ror ra0.8b, r2, ra8.8c + + mov r1,0x050b0a00 # -ve + ror ra2.8c, r1, ra8.8d @@ -14215,27 +14329,31 @@ index 0000000..aa9e1e7 + ror ra3.8c, r1, ra8.8d + ror ra1.8c, r1, ra8.8c + -+# Extract weighted prediction information in parallel -+ + mov r1,0x01010000 # -ve -+ ror ra3.8d, r1, ra8.8d ; mov r0, unif # ; weight L1 weight L1 (hi16)/weight L0 (lo16) -+ ror ra1.8d, r1, ra8.8c ; mov r1, rb13 # ; rb13 = weight denom + 6 + 9 ++ ror ra3.8d, r1, ra8.8d ++ ror ra1.8d, r1, ra8.8c + -+# r3 = 16 from (long way) above -+ shl r1, unif, r1 ; mov rb4, ra3.8a # combined offet = ((is P) ? 
offset L0 * 2 : offset L1 + offset L0) + 1) ; -+ asr ra18, r0, r3 ; mov rb5, ra3.8b -+ bra -, ra31 -+ shl r0, r0, r3 ; mov rb6, ra3.8c -+ mov r3, 0 ; mov rb7, ra3.8d # loop count ; -+ asr rb12, r1, 9 ++# Extract weighted prediction information in parallel ++# We are annoyingly A src limited here + -+# >>> branch ra31 ++ mov rb4, ra3.8a ; mov ra18, unif ++ mov rb5, ra3.8b ++ mov rb6, ra3.8c ++ mov.ifnz ra5, ra18 ++ ++ bra -, ra_link ++ ++ shl r0, ra5.16b, rb13 # Offset calc ++ asr rb12, r0, 9 # For B l1 & L0 offsets should be identical so it doesn't matter which we use ++ mov r3, 0 ; mov rb7, ra3.8d ++# >>> branch ra_link +# +# r3 = 0 -+# ra18 = weight L1 -+# r0 = weight L0 << 16 (will be put into rb14 in filter preamble) -+# rb13 = weight denom + 6 + 9 -+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) ++# ra18.16a = weight L1 ++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) ++# rb12 = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) ++# rb13 = weight denom + 6 + 9 ++# rb14 = weight L0 + + +################################################################################ @@ -14244,8 +14362,9 @@ index 0000000..aa9e1e7 +# At this point we have already issued two pairs of texture requests for the current block + +::mc_filter -+# r0 = weight << 16; We want weight * 2 in rb14 -+ asr rb14, r0, 15 ++# ra5.16a = weight << 16; We want weight * 2 in rb14 ++ ++ shl rb14, ra5.16a, 1 + +# r3 = 0 + @@ -14269,12 +14388,12 @@ index 0000000..aa9e1e7 + max r2, ra_y, 0 # y + min r2, r2, rb_frame_height_minus_1 + add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++ add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + + max r2, ra_y2, 0 # y + min r2, r2, rb_frame_height_minus_1 + add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20 ++ add t1s, ra_frame_base2, 
r2 ; v8min r1, r1, rb_k255 + +# generate seven shifted versions +# interleave with scroll of vertical context @@ -14283,21 +14402,21 @@ index 0000000..aa9e1e7 + +# apply horizontal filter + nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + sub r0, r2, r3 ; mov r3, rb31 + + sub.setf -, r3, 8 ; mov r1, ra8 @@ -14359,7 +14478,7 @@ index 
0000000..aa9e1e7 + +::mc_filter_b + # r0 = weightL0 << 16, we want it in rb14 -+ asr rb14, r0, i_shift16 ++# asr rb14, r0, i_shift16 + +:yloopb +# retrieve texture results and pick out bytes @@ -14377,12 +14496,12 @@ index 0000000..aa9e1e7 + max r2, ra_y, 0 # y + min r2, r2, rb_frame_height_minus_1 + add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++ add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + + max r2, ra_y2, 0 # y + min r2, r2, rb_frame_height_minus_1 + add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20 ++ add t1s, ra_frame_base2, r2 ; v8min r1, r1, rb_k255 + +# generate seven shifted versions +# interleave with scroll of vertical context @@ -14391,21 +14510,21 @@ index 0000000..aa9e1e7 + +# apply horizontal filter + nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 
3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + sub r0, r2, r3 ; mov r3, rb31 + + sub.setf -, r3, 8 ; mov r1, ra8 @@ -14417,7 +14536,6 @@ index 0000000..aa9e1e7 + # >>> .anyn yloopb + + # apply vertical filter and write to VPM -+ + nop ; mul24 r0, rb8, ra2.8a + nop ; mul24 r1, rb9, ra2.8b + sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c @@ -14433,7 +14551,7 @@ index 0000000..aa9e1e7 + + asr r1, r1, 14 + nop ; mul24 r0, r1, rb14 -+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8 ++ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 + + add r1, r1, r0 ; mov -, vw_wait + shl r1, r1, 8 @@ -14450,26 +14568,26 @@ index 0000000..aa9e1e7 + mov vw_addr, unif # start the VDW Delay 3 + +################################################################################ ++::mc_interrupt_exit12c ++ mov.setf -, ra9 ; mov -, vw_wait ++ brr.anyz -, r:exit12_c_1 ++ nop ++ nop ++ nop ++# >>> ++ ++ sub vw_setup, ra9, -16 ++ mov vw_setup, ra10 ++ mov vw_addr, ra11 ++ mov ra9, 0 ++:exit12_c_1 + +# mc_interrupt_exit12() +::mc_interrupt_exit12 -+ mov -, vw_wait # wait on the VDW -+ -+ # Dummy wait to test instructions -+# mov r3,1000000 -+#:dummy_loop -+# sub.setf r3, r3, 1 -+# nop -+# nop -+# brr.anynn -, r:dummy_loop -+# nop -+# nop -+# nop -+ -+ ldtmu0 + ldtmu0 + ldtmu1 -+ ldtmu1 ++ ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW + + mov -,sacq(0) # 1 + mov -,sacq(0) # 2 @@ -14502,477 +14620,12 @@ index 0000000..aa9e1e7 + 
+::mc_end +# Do not add code here because mc_end must appear after all other code. -diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h +diff --git b/libavcodec/rpi_zc.c a/libavcodec/rpi_zc.c new file mode 100644 -index 0000000..db41a4d +index 0000000..9ac22aa --- /dev/null -+++ b/libavcodec/rpi_user_vcsm.h -@@ -0,0 +1,459 @@ -+/***************************************************************************** -+* Copyright 2001 - 2011 Broadcom Corporation. All rights reserved. -+* -+* This program is the proprietary software of Broadcom Corporation and/or -+* its licensors, and may only be used, duplicated, modified or distributed -+* pursuant to the terms and conditions of a separate, written license -+* agreement executed between you and Broadcom (an "Authorized License"). -+* Except as set forth in an Authorized License, Broadcom grants no license -+* (express or implied), right to use, or waiver of any kind with respect to -+* the Software, and Broadcom expressly reserves all rights in and to the -+* Software and all intellectual property rights therein. IF YOU HAVE NO -+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY -+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF -+* THE SOFTWARE. -+* -+* Except as expressly set forth in the Authorized License, -+* 1. This program, including its structure, sequence and organization, -+* constitutes the valuable trade secrets of Broadcom, and you shall use -+* all reasonable efforts to protect the confidentiality thereof, and to -+* use this information only in connection with your use of Broadcom -+* integrated circuit products. -+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS" -+* AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR -+* WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH -+* RESPECT TO THE SOFTWARE. 
BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL -+* IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS -+* FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS, -+* QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU -+* ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE. -+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS -+* LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT, -+* OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO -+* YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN -+* ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS -+* OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER -+* IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF -+* ESSENTIAL PURPOSE OF ANY LIMITED REMEDY. -+*****************************************************************************/ -+ -+#ifndef __USER_VCSM__H__INCLUDED__ -+#define __USER_VCSM__H__INCLUDED__ -+ -+/* VideoCore Shared Memory - user interface library. -+** -+** This library provides all the necessary abstraction for any application to -+** make use of the shared memory service which is distributed accross a kernel -+** driver and a videocore service. -+** -+** It is an application design decision to choose or not to use this service. -+** -+** The logical flow of operations that a user application needs to follow when -+** using this service is: -+** -+** 1) Initialize the service. -+** 2) Allocate shared memory blocks. -+** 3) Start using the allocated blocks. -+** - In order to gain ownership on a block, lock the allocated block, -+** locking a block returns a valid address that the user application -+** can access. -+** - When finished with using the block for the current execution cycle -+** or function, and so when giving up the ownership, unlock the block. 
-+** 4) A block can be locked/unlocked as many times required - within or outside -+** of - a specific execution context. -+** 5) To completely release an allocated block, free it. -+** 6) If the service is no longer required, terminate it. -+** -+** -+** Some generic considerations: -+ -+** Allocating memory blocks. -+** -+** Memory blocks can be allocated in different manners depending on the cache -+** behavior desired. A given block can either be: -+ -+** - Allocated in a non cached fashion all the way through host and videocore. -+** - Allocated in a cached fashion on host OR videocore. -+** - Allocated in a cached fashion on host AND videocore. -+** -+** It is an application decision to determine how to allocate a block. Evidently -+** if the application will be doing substantial read/write accesses to a given block, -+** it is recommended to allocate the block at least in a 'host cached' fashion for -+** better results. -+** -+** -+** Locking memory blocks. -+** -+** When the memory block has been allocated in a host cached fashion, locking the -+** memory block (and so taking ownership of it) will trigger a cache invalidation. -+** -+** For the above reason and when using host cached allocation, it is important that -+** an application properly implements the lock/unlock mechanism to ensure cache will -+** stay coherent, otherwise there is no guarantee it will at all be. -+** -+** It is possible to dynamically change the host cache behavior (ie cached or non -+** cached) of a given allocation without needing to free and re-allocate the block. -+** This feature can be useful for such application which requires access to the block -+** only at certain times and not otherwise. By changing the cache behavior dynamically -+** the application can optimize performances for a given duration of use. -+** Such dynamic cache behavior remapping only applies to host cache and not videocore -+** cache. 
If one requires to change the videocore cache behavior, then a new block -+** must be created to replace the old one. -+** -+** On successful locking, a valid pointer is returned that the application can use -+** to access to data inside the block. There is no guarantee that the pointer will -+** stay valid following the unlock action corresponding to this lock. -+** -+** -+** Unocking memory blocks. -+** -+** When the memory block has been allocated in a host cached fashion, unlocking the -+** memory block (and so forgiving its ownership) will trigger a cache flush unless -+** explicitely asked not to flush the cache for performances reasons. -+** -+** For the above reason and when using host cached allocation, it is important that -+** an application properly implements the lock/unlock mechanism to ensure cache will -+** stay coherent, otherwise there is no guarantee it will at all be. -+** -+** -+** A complete API is defined below. -+*/ -+ -+#ifdef __cplusplus -+extern "C" -+{ -+#endif -+ -+/* Different status that can be dumped. -+*/ -+typedef enum -+{ -+ VCSM_STATUS_VC_WALK_ALLOC = 0, // Walks *all* the allocation on videocore. -+ // Result of the walk is seen in the videocore -+ // log. -+ VCSM_STATUS_HOST_WALK_MAP, // Walks the *full* mapping allocation on host -+ // driver (ie for all processes). Result of -+ // the walk is seen in the kernel log. -+ VCSM_STATUS_HOST_WALK_PID_MAP, // Walks the per process mapping allocation on host -+ // driver (for current process). Result of -+ // the walk is seen in the kernel log. -+ VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host -+ // driver (for current process). Result of -+ // the walk is seen in the kernel log. -+ VCSM_STATUS_VC_MAP_ALL, // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and -+ // VCSM_STATUS_HOST_WALK_MAP. -+ // -+ VCSM_STATUS_NONE, // Must be last - invalid. -+ -+} VCSM_STATUS_T; -+ -+/* Different kind of cache behavior. 
-+*/ -+typedef enum -+{ -+ VCSM_CACHE_TYPE_NONE = 0, // No caching applies. -+ VCSM_CACHE_TYPE_HOST, // Allocation is cached on host (user space). -+ VCSM_CACHE_TYPE_VC, // Allocation is cached on videocore. -+ VCSM_CACHE_TYPE_HOST_AND_VC, // Allocation is cached on both host and videocore. -+ -+} VCSM_CACHE_TYPE_T; -+ -+/* Initialize the vcsm processing. -+** -+** Must be called once before attempting to do anything else. -+** -+** Returns 0 on success, -1 on error. -+*/ -+int vcsm_init( void ); -+ -+ -+/* Terminates the vcsm processing. -+** -+** Must be called vcsm services are no longer needed, it will -+** take care of removing any allocation under the current process -+** control if deemed necessary. -+*/ -+void vcsm_exit( void ); -+ -+ -+/* Queries the status of the the vcsm. -+** -+** Triggers dump of various kind of information, see the -+** different variants specified in VCSM_STATUS_T. -+** -+** Pid is optional. -+*/ -+void vcsm_status( VCSM_STATUS_T status, int pid ); -+ -+ -+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory -+** allocator. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+** -+** On success, the user must invoke vcsm_lock with the returned opaque -+** handle to gain access to the memory associated with the opaque handle. -+** When finished using the memory, the user calls vcsm_unlock_xx (see those -+** function definition for more details on the one that can be used). -+** -+** A well behaved application should make every attempt to lock/unlock -+** only for the duration it needs to access the memory data associated with -+** the opaque handle. -+*/ -+unsigned int vcsm_malloc( unsigned int size, char *name ); -+ -+ -+/* Allocates a cached block of memory of size 'size' via the vcsm memory -+** allocator, the type of caching requested is passed as argument of the -+** function call. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. 
-+** -+** On success, the user must invoke vcsm_lock with the returned opaque -+** handle to gain access to the memory associated with the opaque handle. -+** When finished using the memory, the user calls vcsm_unlock_xx (see those -+** function definition for more details on the one that can be used). -+** -+** A well behaved application should make every attempt to lock/unlock -+** only for the duration it needs to access the memory data associated with -+** the opaque handle. -+*/ -+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name ); -+ -+ -+/* Shares an allocated block of memory via the vcsm memory allocator. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+** -+** On success, the user must invoke vcsm_lock with the returned opaque -+** handle to gain access to the memory associated with the opaque handle. -+** When finished using the memory, the user calls vcsm_unlock_xx (see those -+** function definition for more details on the one that can be used). -+** -+** A well behaved application should make every attempt to lock/unlock -+** only for the duration it needs to access the memory data associated with -+** the opaque handle. -+*/ -+unsigned int vcsm_malloc_share( unsigned int handle ); -+ -+ -+/* Resizes a block of memory allocated previously by vcsm_alloc. -+** -+** Returns: 0 on success -+** -errno on error. -+** -+** The handle must be unlocked by user prior to attempting any -+** resize action. -+** -+** On error, the original size allocated against the handle -+** remains available the same way it would be following a -+** successful vcsm_malloc. -+*/ -+int vcsm_resize( unsigned int handle, unsigned int new_size ); -+ -+ -+/* Frees a block of memory that was successfully allocated by -+** a prior call the vcms_alloc. -+** -+** The handle should be considered invalid upon return from this -+** call. 
-+** -+** Whether any memory is actually freed up or not as the result of -+** this call will depends on many factors, if all goes well it will -+** be freed. If something goes wrong, the memory will likely end up -+** being freed up as part of the vcsm_exit process. In the end the -+** memory is guaranteed to be freed one way or another. -+*/ -+void vcsm_free( unsigned int handle ); -+ -+ -+/* Retrieves a videocore opaque handle from a mapped user address -+** pointer. The videocore handle will correspond to the actual -+** memory mapped in videocore. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+** -+** Note: the videocore opaque handle is distinct from the user -+** opaque handle (allocated via vcsm_malloc) and it is only -+** significant for such application which knows what to do -+** with it, for the others it is just a number with little -+** use since nothing can be done with it (in particular -+** for safety reason it cannot be used to map anything). -+*/ -+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr ); -+ -+ -+/* Retrieves a videocore opaque handle from a opaque handle -+** pointer. The videocore handle will correspond to the actual -+** memory mapped in videocore. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+** -+** Note: the videocore opaque handle is distinct from the user -+** opaque handle (allocated via vcsm_malloc) and it is only -+** significant for such application which knows what to do -+** with it, for the others it is just a number with little -+** use since nothing can be done with it (in particular -+** for safety reason it cannot be used to map anything). -+*/ -+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle ); -+ -+ -+/* Retrieves a user opaque handle from a mapped user address -+** pointer. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. 
-+*/ -+unsigned int vcsm_usr_handle( void *usr_ptr ); -+ -+ -+/* Retrieves a mapped user address from an opaque user -+** handle. -+** -+** Returns: 0 on error -+** a non-zero address on success. -+** -+** On success, the address corresponds to the pointer -+** which can access the data allocated via the vcsm_malloc -+** call. -+*/ -+void *vcsm_usr_address( unsigned int handle ); -+ -+ -+/* Locks the memory associated with this opaque handle. -+** -+** Returns: NULL on error -+** a valid pointer on success. -+** -+** A user MUST lock the handle received from vcsm_malloc -+** in order to be able to use the memory associated with it. -+** -+** On success, the pointer returned is only valid within -+** the lock content (ie until a corresponding vcsm_unlock_xx -+** is invoked). -+*/ -+void *vcsm_lock( unsigned int handle ); -+ -+ -+/* Locks the memory associated with this opaque handle. The lock -+** also gives a chance to update the *host* cache behavior of the -+** allocated buffer if so desired. The *videocore* cache behavior -+** of the allocated buffer cannot be changed by this call and such -+** attempt will be ignored. -+** -+** The system will attempt to honour the cache_update mode request, -+** the cache_result mode will provide the final answer on which cache -+** mode is really in use. Failing to change the cache mode will not -+** result in a failure to lock the buffer as it is an application -+** decision to choose what to do if (cache_result != cache_update) -+** -+** The value returned in cache_result can only be considered valid if -+** the returned pointer is non NULL. The cache_result pointer may be -+** NULL if the application does not care about the actual outcome of -+** its action with regards to the cache behavior change. -+** -+** Returns: NULL on error -+** a valid pointer on success. -+** -+** A user MUST lock the handle received from vcsm_malloc -+** in order to be able to use the memory associated with it. 
-+** -+** On success, the pointer returned is only valid within -+** the lock content (ie until a corresponding vcsm_unlock_xx -+** is invoked). -+*/ -+void *vcsm_lock_cache( unsigned int handle, -+ VCSM_CACHE_TYPE_T cache_update, -+ VCSM_CACHE_TYPE_T *cache_result ); -+ -+ -+/* Unlocks the memory associated with this user mapped address. -+** -+** Returns: 0 on success -+** -errno on error. -+** -+** After unlocking a mapped address, the user should no longer -+** attempt to reference it. -+*/ -+int vcsm_unlock_ptr( void *usr_ptr ); -+ -+ -+/* Unlocks the memory associated with this user mapped address. -+** Apply special processing that would override the otherwise -+** default behavior. -+** -+** If 'cache_no_flush' is specified: -+** Do not flush cache as the result of the unlock (if cache -+** flush was otherwise applicable in this case). -+** -+** Returns: 0 on success -+** -errno on error. -+** -+** After unlocking a mapped address, the user should no longer -+** attempt to reference it. -+*/ -+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush ); -+ -+ -+/* Unlocks the memory associated with this user opaque handle. -+** -+** Returns: 0 on success -+** -errno on error. -+** -+** After unlocking an opaque handle, the user should no longer -+** attempt to reference the mapped addressed once associated -+** with it. -+*/ -+int vcsm_unlock_hdl( unsigned int handle ); -+ -+ -+/* Unlocks the memory associated with this user opaque handle. -+** Apply special processing that would override the otherwise -+** default behavior. -+** -+** If 'cache_no_flush' is specified: -+** Do not flush cache as the result of the unlock (if cache -+** flush was otherwise applicable in this case). -+** -+** Returns: 0 on success -+** -errno on error. -+** -+** After unlocking an opaque handle, the user should no longer -+** attempt to reference the mapped addressed once associated -+** with it. 
-+*/ -+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush ); -+ -+/* Clean and/or invalidate the memory associated with this user opaque handle -+** -+** Returns: non-zero on error -+** -+** structure contains a list of flush/invalidate commands. Commands are: -+** 0: nop -+** 1: invalidate given virtual range in L1/L2 -+** 2: clean given virtual range in L1/L2 -+** 3: clean+invalidate given virtual range in L1/L2 -+** 4: flush all L1/L2 -+*/ -+struct vcsm_user_clean_invalid_s { -+ struct { -+ unsigned int cmd; -+ unsigned int handle; -+ unsigned int addr; -+ unsigned int size; -+ } s[8]; -+}; -+ -+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s ); -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif /* __USER_VCSM__H__INCLUDED__ */ -diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c -new file mode 100644 -index 0000000..9580165 ---- /dev/null -+++ b/libavcodec/rpi_zc.c -@@ -0,0 +1,406 @@ ++++ a/libavcodec/rpi_zc.c +@@ -0,0 +1,453 @@ +#include "config.h" +#ifdef RPI +#include "rpi_qpu.h" @@ -14985,6 +14638,7 @@ index 0000000..9580165 +typedef struct ZcPool +{ + int numbytes; ++ unsigned int n; + struct ZcPoolEnt * head; + pthread_mutex_t lock; +} ZcPool; @@ -14993,27 +14647,48 @@ index 0000000..9580165 +{ + // It is important that we start with gmem as other bits of code will expect to see that + GPU_MEM_PTR_T gmem; ++ unsigned int n; + struct ZcPoolEnt * next; + struct ZcPool * pool; +} ZcPoolEnt; + -+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const int size) ++#if 1 ++//#define ALLOC_PAD 0x1000 ++#define ALLOC_PAD 0 ++#define ALLOC_ROUND 0x1000 ++//#define ALLOC_N_OFFSET 0x100 ++#define ALLOC_N_OFFSET 0 ++#define STRIDE_ROUND 0x80 ++#define STRIDE_OR 0x80 ++#else ++#define ALLOC_PAD 0 ++#define ALLOC_ROUND 0x1000 ++#define ALLOC_N_OFFSET 0 ++#define STRIDE_ROUND 32 ++#define STRIDE_OR 0 ++#endif ++ ++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size) +{ + ZcPoolEnt * const zp = 
av_malloc(sizeof(ZcPoolEnt)); + ++ // Round up to 4k & add 4k ++ const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1); ++ + if (zp == NULL) { + av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n"); + goto fail0; + } + -+ if (gpu_malloc_cached(size, &zp->gmem) != 0) ++ if (gpu_malloc_cached(alloc_size, &zp->gmem) != 0) + { -+ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", size); ++ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size); + goto fail1; + } + + zp->next = NULL; + zp->pool = pool; ++ zp->n = pool->n++; + return zp; + +fail1: @@ -15062,6 +14737,10 @@ index 0000000..9580165 + } + + pthread_mutex_unlock(&pool->lock); ++ ++ // Start with our buffer empty of preconceptions ++// rpi_cache_flush_one_gm_ptr(&zp->gmem, RPI_CACHE_FLUSH_MODE_INVALIDATE); ++ + return zp; +} + @@ -15127,7 +14806,8 @@ index 0000000..9580165 + const unsigned int video_width, const unsigned int video_height) +{ + AVRpiZcFrameGeometry geo; -+ geo.stride_y = (video_width + 32 + 31) & ~31; ++ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++// geo.stride_y = ((video_width + 32 + 31) & ~31); + geo.stride_c = geo.stride_y / 2; +// geo.height_y = (video_height + 15) & ~15; + geo.height_y = (video_height + 32 + 31) & ~31; @@ -15139,13 +14819,21 @@ index 0000000..9580165 +{ + ZcPoolEnt *const zp = zc_pool_alloc(pool, size); + AVBufferRef * buf; ++ intptr_t idata = (intptr_t)zp->gmem.arm; ++#if ALLOC_N_OFFSET != 0 ++ intptr_t noff = (zp->n * ALLOC_N_OFFSET) & (ALLOC_PAD - 1); ++#endif + + if (zp == NULL) { + av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size); + goto fail0; + } + -+ if ((buf = av_buffer_create(zp->gmem.arm, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL) ++#if ALLOC_N_OFFSET != 0 ++ idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? 
ALLOC_PAD : 0); ++#endif ++ ++ if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL) + { + av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n"); + goto fail2; @@ -15317,6 +15005,18 @@ index 0000000..9580165 + return p == NULL ? -1 : p->vc_handle; +} + ++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref) ++{ ++ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); ++ return p == NULL ? 0 : fr_ref->data - p->arm; ++} ++ ++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref) ++{ ++ return fr_ref == NULL ? 0 : fr_ref->size; ++} ++ ++ +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref) +{ + const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); @@ -15379,12 +15079,12 @@ index 0000000..9580165 + +#endif // RPI + -diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h +diff --git b/libavcodec/rpi_zc.h a/libavcodec/rpi_zc.h new file mode 100644 -index 0000000..f0109f4 +index 0000000..4dd7a8b --- /dev/null -+++ b/libavcodec/rpi_zc.h -@@ -0,0 +1,83 @@ ++++ a/libavcodec/rpi_zc.h +@@ -0,0 +1,88 @@ +#ifndef LIBAVCODEC_RPI_ZC_H +#define LIBAVCODEC_RPI_ZC_H + @@ -15439,6 +15139,11 @@ index 0000000..f0109f4 +// Get the vc_handle from the frame ref +// Returns -1 if ref doesn't look valid +int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref); ++// Get offset from the start of the memory referenced ++// by the vc_handle to valid data ++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref); ++// Length of buffer data ++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref); +// Get the number of bytes allocated from the frame ref +// Returns 0 if ref doesn't look valid +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref); @@ -15468,10 +15173,10 @@ index 0000000..f0109f4 + +#endif + -diff --git a/libavcodec/utils.c b/libavcodec/utils.c -index f7adb52..708526e 100644 ---- a/libavcodec/utils.c -+++ b/libavcodec/utils.c +diff --git b/libavcodec/utils.c a/libavcodec/utils.c +index 3e8677d..f1efc0d 100644 +--- b/libavcodec/utils.c ++++ 
a/libavcodec/utils.c @@ -26,6 +26,12 @@ */ @@ -15496,7 +15201,7 @@ index f7adb52..708526e 100644 #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS static int default_lockmgr_cb(void **arg, enum AVLockOp op) { -@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, +@@ -508,6 +518,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, return ret; } @@ -15544,7 +15249,7 @@ index f7adb52..708526e 100644 static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) { FramePool *pool = avctx->internal->pool; -@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) +@@ -555,6 +606,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) av_buffer_pool_uninit(&pool->pools[i]); pool->linesize[i] = linesize[i]; if (size[i]) { @@ -15559,10 +15264,48 @@ index f7adb52..708526e 100644 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1, CONFIG_MEMORY_POISONING ? NULL : -diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c -index b31d233..2767306 100644 ---- a/libavformat/mpegts.c -+++ b/libavformat/mpegts.c +diff --git b/libavformat/matroskaenc.c a/libavformat/matroskaenc.c +index 9c7a213..af941ce 100644 +--- b/libavformat/matroskaenc.c ++++ a/libavformat/matroskaenc.c +@@ -2223,7 +2223,7 @@ static int mkv_check_new_extra_data(AVFormatContext *s, AVPacket *pkt) + + switch (par->codec_id) { + case AV_CODEC_ID_FLAC: +- if (side_data_size && (s->pb->seekable & AVIO_SEEKABLE_NORMAL) && !mkv->is_live) { ++ if (side_data_size && (s->pb->seekable & AVIO_SEEKABLE_NORMAL)) { + AVCodecParameters *codecpriv_par; + int64_t curpos; + if (side_data_size != par->extradata_size) { +diff --git b/libavformat/mov.c a/libavformat/mov.c +index f2296f8..4550cf0 100644 +--- b/libavformat/mov.c ++++ a/libavformat/mov.c +@@ -1186,12 +1186,6 @@ static void mov_metadata_creation_time(AVDictionary **metadata, int64_t time) + if (time) { + if(time >= 2082844800) + time -= 2082844800; 
/* seconds between 1904-01-01 and Epoch */ +- +- if ((int64_t)(time * 1000000ULL) / 1000000 != time) { +- av_log(NULL, AV_LOG_DEBUG, "creation_time is not representable\n"); +- return; +- } +- + avpriv_dict_set_timestamp(metadata, "creation_time", time * 1000000); + } + } +@@ -5794,7 +5788,6 @@ static int mov_read_close(AVFormatContext *s) + av_freep(&mov->fragment_index_data); + + av_freep(&mov->aes_decrypt); +- av_freep(&mov->chapter_tracks); + + return 0; + } +diff --git b/libavformat/mpegts.c a/libavformat/mpegts.c +index 3eff152..30dfb14 100644 +--- b/libavformat/mpegts.c ++++ a/libavformat/mpegts.c @@ -701,7 +701,7 @@ static const StreamType ISO_types[] = { #endif { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 }, @@ -15572,11 +15315,11 @@ index b31d233..2767306 100644 { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 }, { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, -diff --git a/libavformat/utils.c b/libavformat/utils.c -index 6f343f2..83f26d5 100644 ---- a/libavformat/utils.c -+++ b/libavformat/utils.c -@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in +diff --git b/libavformat/utils.c a/libavformat/utils.c +index a059046..ef70074 100644 +--- b/libavformat/utils.c ++++ a/libavformat/utils.c +@@ -748,7 +748,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in int default_stream_index = av_find_default_stream_index(s); if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) { for (i = 0; i < s->nb_streams; i++) { @@ -15585,11 +15328,11 @@ index 6f343f2..83f26d5 100644 continue; s->streams[i]->pts_wrap_reference = pts_wrap_reference; s->streams[i]->pts_wrap_behavior = pts_wrap_behavior; -diff --git a/libavutil/buffer.c b/libavutil/buffer.c -index 694e116..203ca7b 100644 ---- a/libavutil/buffer.c -+++ b/libavutil/buffer.c -@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool) +diff --git 
b/libavutil/buffer.c a/libavutil/buffer.c +index 8d1aa5f..649876d 100644 +--- b/libavutil/buffer.c ++++ a/libavutil/buffer.c +@@ -355,3 +355,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool) return ret; } @@ -15599,11 +15342,11 @@ index 694e116..203ca7b 100644 + BufferPoolEntry *buf = av_buffer_get_opaque(ref); + return buf->opaque; +} -diff --git a/libavutil/buffer.h b/libavutil/buffer.h -index 0c0ce12..82e0bc3 100644 ---- a/libavutil/buffer.h -+++ b/libavutil/buffer.h -@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool); +diff --git b/libavutil/buffer.h a/libavutil/buffer.h +index 73b6bd0..d907de3 100644 +--- b/libavutil/buffer.h ++++ a/libavutil/buffer.h +@@ -284,6 +284,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool); */ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool); @@ -15613,11 +15356,11 @@ index 0c0ce12..82e0bc3 100644 /** * @} */ -diff --git a/pi-util/conf.sh b/pi-util/conf.sh +diff --git b/pi-util/conf.sh a/pi-util/conf.sh new file mode 100755 index 0000000..8b596a2 --- /dev/null -+++ b/pi-util/conf.sh ++++ a/pi-util/conf.sh @@ -0,0 +1,33 @@ +echo "Configure for Pi2/3" + @@ -15652,11 +15395,11 @@ index 0000000..8b596a2 + +# gcc option for getting asm listing +# -Wa,-ahls -diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv +diff --git b/pi-util/conf_h265.csv a/pi-util/conf_h265.csv new file mode 100644 -index 0000000..61d1399 +index 0000000..d3db338 --- /dev/null -+++ b/pi-util/conf_h265.csv ++++ a/pi-util/conf_h265.csv @@ -0,0 +1,144 @@ +1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5 +2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 @@ -15783,7 +15526,7 @@ index 0000000..61d1399 +1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 +1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 +1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 
-+2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5 ++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched +1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 +2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 +1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 @@ -15802,12 +15545,12 @@ index 0000000..61d1399 +1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 +1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 -diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py +diff --git b/pi-util/ffconf.py a/pi-util/ffconf.py new file mode 100644 -index 0000000..38f942f +index 0000000..c896bc6 --- /dev/null -+++ b/pi-util/ffconf.py -@@ -0,0 +1,146 @@ ++++ a/pi-util/ffconf.py +@@ -0,0 +1,154 @@ +#!/usr/bin/env python + +import os @@ -15851,16 +15594,18 @@ index 0000000..38f942f + except: + pass + -+ rv = False + if m1 and m2 and m1.group() == m2.group(): + print >> flog, "Match: " + m1.group() -+ rv = True ++ rv = 0 + elif not m1: + print >> flog, "****** Cannot find m1" ++ rv = 3 + elif not m2: + print >> flog, "****** Cannot find m2" ++ rv = 2 + else: + print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group() ++ rv = 1 + flog.close() + return rv + @@ -15906,19 +15651,25 @@ index 0000000..38f942f + print "==== ", name, + sys.stdout.flush() + -+ if (not testone(os.path.join(conf_root, name), name, a[2], a[3])) : -+ if exp_test == 1: -+ failures.append(name) -+ print ": * FAIL *" -+ else: -+ print ": fail" -+ else: ++ rv = testone(os.path.join(conf_root, name), name, a[2], a[3]) ++ if (rv == 0): + if exp_test == 2: + print ": * OK *" + unx_success.append(name) + else: + print ": ok" -+ ++ elif exp_test > 1 and rv == 1: ++ print 
": fail" ++ else: ++ failures.append(name) ++ if rv == 1: ++ print ": * FAIL *" ++ elif (rv == 2) : ++ print ": * CRASH *" ++ elif (rv == 3) : ++ print ": * MD5 MISSING *" ++ else : ++ print ": * BANG *" + + if failures or unx_success: + print "Unexpected Failures:", failures @@ -15954,11 +15705,11 @@ index 0000000..38f942f + + doconf(csva, args.tests) + -diff --git a/pi-util/qasm.py b/pi-util/qasm.py +diff --git b/pi-util/qasm.py a/pi-util/qasm.py new file mode 100644 index 0000000..1eacc04 --- /dev/null -+++ b/pi-util/qasm.py ++++ a/pi-util/qasm.py @@ -0,0 +1,2502 @@ +#!/usr/bin/env python + @@ -18462,11 +18213,25 @@ index 0000000..1eacc04 + +if __name__ == '__main__': + main() -diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py +diff --git b/pi-util/qem.sh a/pi-util/qem.sh +new file mode 100644 +index 0000000..20ce7ee +--- /dev/null ++++ a/pi-util/qem.sh +@@ -0,0 +1,8 @@ ++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex ++QASM=python\ pi-util/qasm.py ++SRC_FILE=libavcodec/rpi_shader.qasm ++DST_BASE=shader ++ ++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c ++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h ++ +diff --git b/pi-util/rebase_liblinks.py a/pi-util/rebase_liblinks.py new file mode 100755 index 0000000..6a9a33f --- /dev/null -+++ b/pi-util/rebase_liblinks.py ++++ a/pi-util/rebase_liblinks.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + @@ -18505,11 +18270,11 @@ index 0000000..6a9a33f + + + -diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh +diff --git b/pi-util/syncroot.sh a/pi-util/syncroot.sh new file mode 100755 index 0000000..d8bdd91 --- /dev/null -+++ b/pi-util/syncroot.sh ++++ a/pi-util/syncroot.sh @@ -0,0 +1,43 @@ +set -e + @@ -18554,4 +18319,84 @@ index 0000000..d8bdd91 +pi-util/rebase_liblinks.py $DST + + - +diff --git b/pi-util/v3dusage.py a/pi-util/v3dusage.py +new file mode 100644 +index 0000000..7e336a9 +--- 
/dev/null ++++ a/pi-util/v3dusage.py +@@ -0,0 +1,75 @@ ++#!/usr/bin/env python ++ ++import sys ++import argparse ++import re ++ ++def main(): ++ argp = argparse.ArgumentParser(description="QPU/VPU perf summary") ++ argp.add_argument("logfile") ++ args = argp.parse_args() ++ ++ ++ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ') ++ ++ ttotal = {'idle':0.0} ++ tstart = {} ++ time0 = None ++ idle_start = None ++ qpu_op_no = 0 ++ op_count = 0 ++ ++ with open(args.logfile, "rt") as infile: ++ for line in infile: ++ match = rmatch.match(line) ++ if match: ++# print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":" ++ time = float(match.group(1)) ++ unit = match.group(3) ++ opstart = not match.group(2) ++ optype = match.group(7) ++ hascb = match.group(8) != "0" ++ ++ if unit == 'qpu1': ++ unit = unit + "." + str(qpu_op_no) ++ if not opstart: ++ if hascb or optype == 'EXECUTE_SYNC': ++ qpu_op_no = 0 ++ else: ++ qpu_op_no += 1 ++ ++ # Ignore sync type ++ if optype == 'EXECUTE_SYNC': ++ continue ++ ++ if not time0: ++ time0 = time ++ ++ if opstart: ++ tstart[unit] = time; ++ elif unit in tstart: ++ op_count += 1 ++ if not unit in ttotal: ++ ttotal[unit] = 0.0 ++ ttotal[unit] += time - tstart[unit] ++ del tstart[unit] ++ ++ if not idle_start and not tstart: ++ idle_start = time ++ elif idle_start and tstart: ++ ttotal['idle'] += time - idle_start ++ idle_start = None ++ ++ if not time0: ++ print "No v3d profile records found" ++ else: ++ tlogged = time - time0 ++ ++ print "Logged time:", tlogged, " Op count:", op_count ++ for unit in sorted(ttotal): ++ print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged) ++ ++ ++if __name__ == '__main__': ++ main() ++ diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch 
index 721a065449..5240cf58ce 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch @@ -22,4 +22,3 @@ index 2fd3f2b..7165652 100644 if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) { *poutbuf = NULL; *poutbuf_size = 0; - diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch index 15d449d284..37b53e8fb6 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch @@ -53,4 +53,3 @@ index aca8382..f473f6c 100644 -- 2.7.4 - diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-tls-1.2.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-tls-1.2.patch deleted file mode 100644 index 848158d727..0000000000 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-tls-1.2.patch +++ /dev/null @@ -1,17 +0,0 @@ ---- a/libavformat/tls_openssl.c -+++ b/libavformat/tls_openssl.c -@@ -233,12 +233,13 @@ static int tls_open(URLContext *h, const char *uri, int flags, AVDictionary **op - if ((ret = ff_tls_open_underlying(c, h, uri, options)) < 0) - goto fail; - -- p->ctx = SSL_CTX_new(c->listen ? TLSv1_server_method() : TLSv1_client_method()); -+ p->ctx = SSL_CTX_new(c->listen ? 
SSLv23_server_method() : SSLv23_client_method()); - if (!p->ctx) { - av_log(h, AV_LOG_ERROR, "%s\n", ERR_error_string(ERR_get_error(), NULL)); - ret = AVERROR(EIO); - goto fail; - } -+ SSL_CTX_set_options(p->ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3); - if (c->ca_file) { - if (!SSL_CTX_load_verify_locations(p->ctx, c->ca_file, NULL)) - av_log(h, AV_LOG_ERROR, "SSL_CTX_load_verify_locations %s\n", ERR_error_string(ERR_get_error(), NULL));