diff --git a/packages/mediacenter/LibreELEC-settings/package.mk b/packages/mediacenter/LibreELEC-settings/package.mk
index f8e24b3e12..3bcc9af1de 100644
--- a/packages/mediacenter/LibreELEC-settings/package.mk
+++ b/packages/mediacenter/LibreELEC-settings/package.mk
@@ -1,24 +1,25 @@
################################################################################
-# This file is part of OpenELEC - http://www.openelec.tv
+# This file is part of LibreELEC - https://libreelec.tv
+# Copyright (C) 2017-present Team LibreELEC
# Copyright (C) 2009-2016 Stephan Raue (stephan@openelec.tv)
#
-# OpenELEC is free software: you can redistribute it and/or modify
+# LibreELEC is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
-# OpenELEC is distributed in the hope that it will be useful,
+# LibreELEC is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with OpenELEC. If not, see <http://www.gnu.org/licenses/>.
+# along with LibreELEC. If not, see <http://www.gnu.org/licenses/>.
################################################################################
PKG_NAME="LibreELEC-settings"
-PKG_VERSION="0ec74f6"
-PKG_SHA256="f9e5a1ead9c1a3832122deb4831980dac87ec3b8f748e6449b6b090c40f09249"
+PKG_VERSION="a562ed0"
+PKG_SHA256="98f2d5aa3ef3d422a359fc0a10e2c50efc14d3eaf351312b3aceea449a0ff151"
PKG_ARCH="any"
PKG_LICENSE="GPL"
PKG_SITE="https://libreelec.tv"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk
index ae03236b23..91cef254d5 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="audiodecoder.2sf"
-PKG_VERSION="5f70a33"
-PKG_SHA256="378952a4745e93742ec1ff66de87c7f0532f00ba8ac0d80969edcbf832c4e4b0"
+PKG_VERSION="afe3580"
+PKG_SHA256="d3225745b1f52cc7af32615b967e0ed628a8e98d0f86f408603e3a3e9473b18a"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk
index 347a36bf37..7ca5e13bb6 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="audiodecoder.gsf"
-PKG_VERSION="0795b7e"
-PKG_SHA256="d6515f4d0a860251ef7cab5f7598438f9bf46231c32201d5f835bf44d0fdfd11"
+PKG_VERSION="081ee65"
+PKG_SHA256="063a5b0ac606e889e93256fd9ca45db3d7b52e0736ffaa1c22526bfe89f64afb"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk
index e0c476d61c..9f55417f7c 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="audiodecoder.ncsf"
-PKG_VERSION="236bcf9"
-PKG_SHA256="0e85db9bd16374e024243420dc12bb8bf17c9d71d769eacb6effb887032e595a"
+PKG_VERSION="149f324"
+PKG_SHA256="f5879d227ee63b63bba872f7cfda5a562b5f6e16c7e3e06c3522124eb11e528e"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk
index 17b576e268..0d64f567d4 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="audiodecoder.openmpt"
-PKG_VERSION="fb1041a"
-PKG_SHA256="f953c8c7f59c4bd2490c272a77fef128eaa3273d2548448c6e2a6e6cb68e2329"
+PKG_VERSION="47e3814"
+PKG_SHA256="8485250d47b290820aa7f60f6334bb89c9cbe56b524a8d81476f216e76541d0b"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk
index 74008c46e7..bd47a459c7 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="audiodecoder.qsf"
-PKG_VERSION="9182d5e"
-PKG_SHA256="38678039bb15e272abc7da6e94952ab1434e5f51e1bf2766fe6d96cb093ff053"
+PKG_VERSION="876201e"
+PKG_SHA256="06f74b44375c1b3bf565fb219dcf164490b7c894ebc76d8684503d1324b86665"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk
index a87cd122a4..b1549fb5cd 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="audiodecoder.ssf"
-PKG_VERSION="8801d12"
-PKG_SHA256="9a130e94542c82e8ddf1b6a8a38d49796488902d0862b809cf60b5dcb3a9f8cc"
+PKG_VERSION="8adf121"
+PKG_SHA256="18328f92bdfd426814bfd4e7549f674a171420c945f9284aa6183d70870b7f60"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.upse/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.upse/package.mk
index f76a31cbe6..d494f0dad0 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.upse/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.upse/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="audiodecoder.upse"
-PKG_VERSION="de09fb0"
-PKG_SHA256="c450453389ac75612c12b599bdb32f85c86a277f70eceac5f4b21c476ff9a932"
+PKG_VERSION="6fa70f8"
+PKG_SHA256="e0fcf4c85122c293aed7a4ba5f71802db9231d65a69ece9ea47732facb386d1c"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.usf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.usf/package.mk
index 6688bb01e0..bfaaa8fbbc 100644
--- a/packages/mediacenter/kodi-binary-addons/audiodecoder.usf/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.usf/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="audiodecoder.usf"
-PKG_VERSION="cec0fe2"
-PKG_SHA256="1bb0afd2debc806fe72c466de76385043b642a9c5b0e2dc6d15ee3bfa0533f7b"
+PKG_VERSION="ccb1edc"
+PKG_SHA256="d0dc7bc7ad61bc19ec1297da4b04e2291ad27b68e0dc384d378e5106bba87709"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
index 174b03ef21..2f7bb5fa0d 100644
--- a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="inputstream.adaptive"
-PKG_VERSION="0f0ced4"
-PKG_SHA256="5d3b640f59abcf591d1fb3c4d4ab788683a0e3326bfaa8e9b4f5c2a78f58b947"
+PKG_VERSION="babcca4"
+PKG_SHA256="1351012bbdfe18e683f217ea999d596e0a7f21ea48e9a5c1783ca06e864b144e"
PKG_LICENSE="GPL"
PKG_SITE="http://www.kodi.tv"
PKG_URL="https://github.com/peak3d/inputstream.adaptive/archive/$PKG_VERSION.tar.gz"
diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk
index 98e9986820..801767ec17 100644
--- a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="inputstream.rtmp"
-PKG_VERSION="3b5058d"
-PKG_SHA256="cf2b935bcd12dee759aeaca6c9ad65294a4323d430c7029e7f2923e10aa1a679"
+PKG_VERSION="b8e3f39"
+PKG_SHA256="eb6cc5f164c3bc76582362ea0683cfdc368ae31591786e11b4a75e0b30c8b0b8"
PKG_LICENSE="GPL"
PKG_SITE="http://www.kodi.tv"
PKG_URL="https://github.com/notspiff/inputstream.rtmp/archive/$PKG_VERSION.tar.gz"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk
index 231134987a..5fc2d2fe44 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.argustv"
-PKG_VERSION="377f796"
-PKG_SHA256="7ac85250793690c2e05692a5c3db7398fc84cffa9cf023c1d2a97d378fe53eb3"
+PKG_VERSION="2bce465"
+PKG_SHA256="2e80867293949e452ca623ac3ed88aa33e5de50fe7e0c6c51f476fca1fa5841a"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk
index 40c086ed3d..2d29359c4f 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.demo"
-PKG_VERSION="d5e5cd1"
-PKG_SHA256="cb63a50c85a02f7ca38144d2f1a536e85116b01dd849bcce9300ca778d0de7ea"
+PKG_VERSION="20d81d8"
+PKG_SHA256="67b37fc6d7401dfa7b508241ff2d230fbf0879286b43a70667fd3fb89002470a"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk
index eded9c6311..41ca080b14 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.dvblink"
-PKG_VERSION="c61ea73"
-PKG_SHA256="127fc5139603c59c1e3a27cf3694e558d46d0fb22219f0e78c45372fd356c75f"
+PKG_VERSION="a87258b"
+PKG_SHA256="a9ddc8b70d42e174aa9486b84d467296afa870f80fff32dd84223b12abf762e8"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk
index 71cc7fcbc7..3336865d47 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.dvbviewer"
-PKG_VERSION="098d23c"
-PKG_SHA256="e47ccbbb6c1ee7fa096d91e93ae9878ee33fe442bd02baafa745c2946fa02d40"
+PKG_VERSION="884b732"
+PKG_SHA256="13e2c95aabfc5ee8ded5bcf1259492bd4487574ad2e2ee531061989b2e8f4e41"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk
index ae7ca6840a..5fdc85ed58 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.filmon"
-PKG_VERSION="0ca1665"
-PKG_SHA256="3ea8ae440fd7adb65f3e8d619af592c0224da366636ba0ba7aadb89406b6ec5b"
+PKG_VERSION="470ca1c"
+PKG_SHA256="be27454a280664b0bb20c1f18d281ca293d0d74cfa464eaabd771c417c5ff174"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk
index 851d180aec..f1f749d291 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.hdhomerun"
-PKG_VERSION="484b91d"
-PKG_SHA256="a6d00a4e293dda7a2a48262d94548bda6c9e34971061e05e437edb1c2cf8515b"
+PKG_VERSION="4639740"
+PKG_SHA256="0682689ff55e0585ccd9b57e81af57defab1efde6c56b2e645c03ab4438e2e44"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
index 62ea381327..52474200e8 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.hts"
-PKG_VERSION="9533cce"
-PKG_SHA256="2fa8490abcaefdc1e0652d9fa5b873b246946f578842eba0e5aebd4bc0c69b20"
+PKG_VERSION="4f7196d"
+PKG_SHA256="12f5a51e9923b96f870be59a47336c33d160a8e8903e58027f0dd0cd82cf8347"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk
index dc922f8e59..67e58c5186 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.iptvsimple"
-PKG_VERSION="77156cb"
-PKG_SHA256="96da93cedab5ecafb4ca49fc8942ce0979b2b931b7115359ec97f55f260f9e5f"
+PKG_VERSION="e220777"
+PKG_SHA256="ed6159cea372129ec49776a778aa9284898abdc2996c1744401273ac1fc21ef5"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk
index 24bb53155a..514b8fa7a9 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.mediaportal.tvserver"
-PKG_VERSION="9702684"
-PKG_SHA256="53d295c69a53c775c477c117e7efc3a4a2f61bd896396087004a1e8c58f2e2b6"
+PKG_VERSION="c4e32b0"
+PKG_SHA256="16531a64827dd0f475c5184c7f89aa47d279736919a06e7cd55d8154f7bac798"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
index 4548dba7c8..d5070944bc 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.nextpvr"
-PKG_VERSION="de30ff2"
-PKG_SHA256="a468a22b7d9e709950cd24b9c9d6ce025d91e2e5509fc4a39f7ffd35e163ed3d"
+PKG_VERSION="78a80de"
+PKG_SHA256="25cd42764b2b8285f8f7d8855bef24a960d6ae8b18f2f9870c0c429af32116d8"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk
index bfbd3690b0..ff9147cd8d 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.njoy"
-PKG_VERSION="bd6581f"
-PKG_SHA256="f99f4b31577b3c388183fc1c4aef3f4fde077e7df84e84b643ff5cdeb61fb221"
+PKG_VERSION="5a2c2d3"
+PKG_SHA256="14a02f78df7651dd8cb668c1c587e398ec8788125289ed66058e91ba111328f6"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk
index 637223c496..16180a94a7 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.pctv"
-PKG_VERSION="5e95300"
-PKG_SHA256="878aee780117d878e9658a0916f47cfba66f884718af41d5d22d2b6aeee73c3e"
+PKG_VERSION="17c1897"
+PKG_SHA256="9a1277275833ac0288ac34083daf8521472f2f550d21f8953078d2d4c73559db"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk
index f4868145b3..6777f540fa 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.stalker"
-PKG_VERSION="da6f0f8"
-PKG_SHA256="d043b9b144496efb7a7bd2106716d139d701add883d6db25a0eb26847858baf4"
+PKG_VERSION="0700069"
+PKG_SHA256="a3322c8567400b7dbdc9a91bfa5e21375064a9483b4b676414e4164a577d307f"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk
index 510da61715..9a8482db40 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.teleboy"
-PKG_VERSION="a84e5ef"
-PKG_SHA256="84ef0fcc6dda0f67df0dfbd7d9e781f8834e4c7319bafc919430c28a705d2e55"
+PKG_VERSION="3e9e537"
+PKG_SHA256="5c40d59c4403688d15d9b8a5b96112bd21e2558667a85adc13afeca6aac43fb3"
PKG_REV="1"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk
index ebd3624a15..c5234f3d98 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.vbox"
-PKG_VERSION="3cf15ad"
-PKG_SHA256="f57a67a14a6b260ef35bc15bccbf5280a104b2a5a8fe96d2cf13003762daafa2"
+PKG_VERSION="48ffcba"
+PKG_SHA256="07e46dbc9df1253af0d277c924850ddaf12c02a3e1b8ff1559096b16e528d29a"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
index 5f5c6f6de6..f39c0d5511 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.vdr.vnsi"
-PKG_VERSION="d6fe796"
-PKG_SHA256="f56e9bfeab4596526ff1243f90ebd36c41c057cc78ed655072e5491aaa6c1a00"
+PKG_VERSION="a2880c7"
+PKG_SHA256="975ce55c888b46b9b47bf7a8bbe4db56a2169aeebfda11fa9ca51510d1db2148"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
index 75048da951..4e3fad543f 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.vuplus"
-PKG_VERSION="00a963f"
-PKG_SHA256="b286ed850ddce31b4fec1f55cf3639467c7ae39e548051b5485db035e20bf51e"
+PKG_VERSION="6c94eec"
+PKG_SHA256="fc645a611a78250299a83edca56dd03686d4ad67900be20fe00f46b2fb6d8e17"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
index cc3731b5e7..171c13c979 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.wmc"
-PKG_VERSION="2acca13"
-PKG_SHA256="6d19fbc313f089eff40af72f3f8b70358e357491bff8504a76aa029ef6f3fe21"
+PKG_VERSION="a7ec576"
+PKG_SHA256="ecc460e5e50c6e75a857dc7ec0e8de8142fb3bbb036e9253bca72ac20b5a2111"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
index a7095b565b..4bfca61773 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="pvr.zattoo"
-PKG_VERSION="1244e14"
-PKG_SHA256="20543c189b3d77bb8fc9f2306be9646235461db6c12e1f83623e82740279cba0"
+PKG_VERSION="f04367b"
+PKG_SHA256="5685ccafe979935123bce6cea2a7499f5bab8ff16f4b1d5b60c9ed3b943ac6b6"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk
index d1713d74a9..34edf6eae7 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="screensaver.shadertoy"
-PKG_VERSION="2638205"
-PKG_SHA256="0c04af6aa45f1838ad785a2914a47ad4ce5c6b7998f73d848aa92b4480096b58"
+PKG_VERSION="0290c8e"
+PKG_SHA256="970eed3e63db75043fafe5a172bcd218bba3b5ae5f3b418206da00865ccb4647"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk b/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk
index 246359bd96..15ed585ef8 100644
--- a/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="screensavers.rsxs"
-PKG_VERSION="36b9f97"
-PKG_SHA256="43fcaae28e00fd0a58fd12091560d25258cf5a228114e46799847031de65e063"
+PKG_VERSION="be03db6"
+PKG_SHA256="b0f35760a3f444769c2f0f948defc220b34459dde1bea06522708498eefe2e99"
PKG_REV="2"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/vfs.libarchive/package.mk b/packages/mediacenter/kodi-binary-addons/vfs.libarchive/package.mk
index dcf210d389..66e21cd078 100644
--- a/packages/mediacenter/kodi-binary-addons/vfs.libarchive/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/vfs.libarchive/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="vfs.libarchive"
-PKG_VERSION="e7d149e"
-PKG_SHA256="dd9604752dcb4fbe38b082455935e95dc7b572a1424a49c935989292038f1b74"
+PKG_VERSION="84a4876"
+PKG_SHA256="38591095f93a380aac4be58c5e92bf870da095679a152a3ca4a1552ac4415968"
PKG_REV="1"
PKG_ARCH="any"
PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk
index b8ebd019d5..f4420f76b8 100644
--- a/packages/mediacenter/kodi/package.mk
+++ b/packages/mediacenter/kodi/package.mk
@@ -1,24 +1,25 @@
################################################################################
-# This file is part of OpenELEC - http://www.openelec.tv
+# This file is part of LibreELEC - https://libreelec.tv
+# Copyright (C) 2017-present Team LibreELEC
# Copyright (C) 2009-2016 Stephan Raue (stephan@openelec.tv)
#
-# OpenELEC is free software: you can redistribute it and/or modify
+# LibreELEC is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
-# OpenELEC is distributed in the hope that it will be useful,
+# LibreELEC is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with OpenELEC. If not, see <http://www.gnu.org/licenses/>.
+# along with LibreELEC. If not, see <http://www.gnu.org/licenses/>.
################################################################################
PKG_NAME="kodi"
-PKG_VERSION="3a989ee"
-PKG_SHA256="deb3526aa28d1b64f8d295f18637c42cb031a476cabdbd9dc15af1e33c5d8965"
+PKG_VERSION="593949a"
+PKG_SHA256="7a4ccfacd24461d5dfbba9be362372912ebc26dd6743e52b706907b6cc081be5"
PKG_ARCH="any"
PKG_LICENSE="GPL"
PKG_SITE="http://www.kodi.tv"
diff --git a/packages/mediacenter/kodi/patches/kodi-100.14-use-alsa-and-pulse-together.patch b/packages/mediacenter/kodi/patches/kodi-100.14-use-alsa-and-pulse-together.patch
index ba07ff7999..7fbca0cd66 100644
--- a/packages/mediacenter/kodi/patches/kodi-100.14-use-alsa-and-pulse-together.patch
+++ b/packages/mediacenter/kodi/patches/kodi-100.14-use-alsa-and-pulse-together.patch
@@ -1,20 +1,20 @@
-From 5d3b9dae20b9c9b1c9236d98bf9ce64306d8b63f Mon Sep 17 00:00:00 2001
+From 09ef179755107108722027dcc04ae62795c5d522 Mon Sep 17 00:00:00 2001
From: MilhouseVH
-Date: Thu, 5 Apr 2018 11:43:28 +0100
+Date: Tue, 22 May 2018 00:28:13 +0100
Subject: [PATCH] allow using alsa and pulse together
---
- xbmc/windowing/X11/WinSystemX11GLContext.cpp | 28 ++--------------------------
+ xbmc/windowing/X11/WinSystemX11GLContext.cpp | 35 ++--------------------------
xbmc/windowing/amlogic/WinSystemAmlogic.cpp | 2 ++
- xbmc/windowing/gbm/WinSystemGbm.cpp | 27 ++-------------------------
+ xbmc/windowing/gbm/WinSystemGbm.cpp | 34 ++-------------------------
xbmc/windowing/rpi/WinSystemRpi.cpp | 4 ++++
- 4 files changed, 10 insertions(+), 51 deletions(-)
+ 4 files changed, 10 insertions(+), 65 deletions(-)
diff --git a/xbmc/windowing/X11/WinSystemX11GLContext.cpp b/xbmc/windowing/X11/WinSystemX11GLContext.cpp
-index 17b83a0..2e76053 100644
+index 6e31a80..2e76053 100644
--- a/xbmc/windowing/X11/WinSystemX11GLContext.cpp
+++ b/xbmc/windowing/X11/WinSystemX11GLContext.cpp
-@@ -52,32 +52,8 @@ std::unique_ptr<CWinSystemBase> CWinSystemBase::CreateWinSystem()
+@@ -52,39 +52,8 @@ std::unique_ptr CWinSystemBase::CreateWinSystem()
CWinSystemX11GLContext::CWinSystemX11GLContext()
{
@@ -29,6 +29,10 @@ index 17b83a0..2e76053 100644
- {
- OPTIONALS::PulseAudioRegister();
- }
+- else if (StringUtils::EqualsNoCase(envSink, "OSS"))
+- {
+- OPTIONALS::OSSRegister();
+- }
- else if (StringUtils::EqualsNoCase(envSink, "SNDIO"))
- {
- OPTIONALS::SndioRegister();
@@ -39,7 +43,10 @@ index 17b83a0..2e76053 100644
- {
- if (!OPTIONALS::ALSARegister())
- {
-- OPTIONALS::SndioRegister();
+- if (!OPTIONALS::SndioRegister())
+- {
+- OPTIONALS::OSSRegister();
+- }
- }
- }
- }
@@ -50,7 +57,7 @@ index 17b83a0..2e76053 100644
}
diff --git a/xbmc/windowing/amlogic/WinSystemAmlogic.cpp b/xbmc/windowing/amlogic/WinSystemAmlogic.cpp
-index 1db2ba7..517aeea 100644
+index 324d47f..1766308 100644
--- a/xbmc/windowing/amlogic/WinSystemAmlogic.cpp
+++ b/xbmc/windowing/amlogic/WinSystemAmlogic.cpp
@@ -32,6 +32,7 @@
@@ -61,21 +68,21 @@ index 1db2ba7..517aeea 100644
#include "windowing/GraphicContext.h"
#include "windowing/Resolution.h"
#include "platform/linux/powermanagement/LinuxPowerSyscall.h"
-@@ -79,6 +80,7 @@ CWinSystemAmlogic::CWinSystemAmlogic()
+@@ -78,6 +79,7 @@ CWinSystemAmlogic::CWinSystemAmlogic() :
// Register sink
AE::CAESinkFactory::ClearSinks();
CAESinkALSA::Register();
+ CAESinkPULSE::Register();
CLinuxPowerSyscall::Register();
- }
-
+ m_lirc.reset(OPTIONALS::LircRegister());
+ m_libinput->Start();
diff --git a/xbmc/windowing/gbm/WinSystemGbm.cpp b/xbmc/windowing/gbm/WinSystemGbm.cpp
-index 45783bd..7b5e2ba 100644
+index 72ddf6a..79e81d5 100644
--- a/xbmc/windowing/gbm/WinSystemGbm.cpp
+++ b/xbmc/windowing/gbm/WinSystemGbm.cpp
-@@ -43,31 +43,8 @@ CWinSystemGbm::CWinSystemGbm() :
- m_GBM(new CGBMUtils),
- m_delayDispReset(false)
+@@ -43,38 +43,8 @@ CWinSystemGbm::CWinSystemGbm() :
+ m_delayDispReset(false),
+ m_libinput(new CLibInputHandler)
{
- std::string envSink;
- if (getenv("AE_SINK"))
@@ -88,6 +95,10 @@ index 45783bd..7b5e2ba 100644
- {
- OPTIONALS::PulseAudioRegister();
- }
+- else if (StringUtils::EqualsNoCase(envSink, "OSS"))
+- {
+- OPTIONALS::OSSRegister();
+- }
- else if (StringUtils::EqualsNoCase(envSink, "SNDIO"))
- {
- OPTIONALS::SndioRegister();
@@ -98,22 +109,25 @@ index 45783bd..7b5e2ba 100644
- {
- if (!OPTIONALS::ALSARegister())
- {
-- OPTIONALS::SndioRegister();
+- if (!OPTIONALS::SndioRegister())
+- {
+- OPTIONALS::OSSRegister();
+- }
- }
- }
- }
+ OPTIONALS::ALSARegister();
+ OPTIONALS::PulseAudioRegister();
- m_winEvents.reset(new CWinEventsLinux());
CLinuxPowerSyscall::Register();
+ m_lirc.reset(OPTIONALS::LircRegister());
diff --git a/xbmc/windowing/rpi/WinSystemRpi.cpp b/xbmc/windowing/rpi/WinSystemRpi.cpp
-index 82534f2..d4e8ba9 100644
+index fac5cc4..f90e46d 100644
--- a/xbmc/windowing/rpi/WinSystemRpi.cpp
+++ b/xbmc/windowing/rpi/WinSystemRpi.cpp
-@@ -34,7 +34,9 @@
+@@ -33,7 +33,9 @@
+ #include "guilib/DispResource.h"
#include "utils/log.h"
- #include "../WinEventsLinux.h"
#include "cores/AudioEngine/AESinkFactory.h"
+#include "cores/AudioEngine/Sinks/AESinkALSA.h"
#include "cores/AudioEngine/Sinks/AESinkPi.h"
@@ -121,15 +135,15 @@ index 82534f2..d4e8ba9 100644
#include "platform/linux/powermanagement/LinuxPowerSyscall.h"
#include
-@@ -56,6 +58,8 @@ CWinSystemRpi::CWinSystemRpi()
- m_winEvents.reset(new CWinEventsLinux());
+@@ -55,6 +57,8 @@ CWinSystemRpi::CWinSystemRpi() :
+
AE::CAESinkFactory::ClearSinks();
CAESinkPi::Register();
+ CAESinkALSA::Register();
+ CAESinkPULSE::Register();
CLinuxPowerSyscall::Register();
m_lirc.reset(OPTIONALS::LircRegister());
- }
+ m_libinput->Start();
--
2.14.1
diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk
index 057d1692fe..73136814e1 100644
--- a/packages/multimedia/ffmpeg/package.mk
+++ b/packages/multimedia/ffmpeg/package.mk
@@ -1,19 +1,20 @@
################################################################################
-# This file is part of OpenELEC - http://www.openelec.tv
+# This file is part of LibreELEC - https://libreelec.tv
+# Copyright (C) 2017-present Team LibreELEC
# Copyright (C) 2009-2016 Stephan Raue (stephan@openelec.tv)
#
-# OpenELEC is free software: you can redistribute it and/or modify
+# LibreELEC is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
-# OpenELEC is distributed in the hope that it will be useful,
+# LibreELEC is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with OpenELEC. If not, see <http://www.gnu.org/licenses/>.
+# along with LibreELEC. If not, see <http://www.gnu.org/licenses/>.
################################################################################
PKG_NAME="ffmpeg"
@@ -36,29 +37,29 @@ get_graphicdrivers
if [ "$VAAPI_SUPPORT" = "yes" ]; then
PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET libva"
- FFMPEG_VAAPI="--enable-vaapi"
+ PKG_FFMPEG_VAAPI="--enable-vaapi"
else
- FFMPEG_VAAPI="--disable-vaapi"
+ PKG_FFMPEG_VAAPI="--disable-vaapi"
fi
if [ "$VDPAU_SUPPORT" = "yes" -a "$DISPLAYSERVER" = "x11" ]; then
PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET libvdpau"
- FFMPEG_VDPAU="--enable-vdpau"
+ PKG_FFMPEG_VDPAU="--enable-vdpau"
else
- FFMPEG_VDPAU="--disable-vdpau"
+ PKG_FFMPEG_VDPAU="--disable-vdpau"
fi
if [ "$PROJECT" = "Rockchip" ]; then
PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET rkmpp"
- FFMPEG_RKMPP="--enable-rkmpp --enable-libdrm --enable-version3"
+ PKG_FFMPEG_RKMPP="--enable-rkmpp --enable-libdrm --enable-version3"
else
- FFMPEG_RKMPP="--disable-rkmpp"
+ PKG_FFMPEG_RKMPP="--disable-rkmpp"
fi
if build_with_debug; then
- FFMPEG_DEBUG="--enable-debug --disable-stripping"
+ PKG_FFMPEG_DEBUG="--enable-debug --disable-stripping"
else
- FFMPEG_DEBUG="--disable-debug --enable-stripping"
+ PKG_FFMPEG_DEBUG="--disable-debug --enable-stripping"
fi
if [ "$KODIPLAYER_DRIVER" = "bcm2835-driver" ]; then
@@ -67,17 +68,23 @@ fi
case "$TARGET_ARCH" in
arm)
- FFMPEG_TABLES="--enable-hardcoded-tables"
+ PKG_FFMPEG_TABLES="--enable-hardcoded-tables"
;;
*)
- FFMPEG_TABLES="--disable-hardcoded-tables"
+ PKG_FFMPEG_TABLES="--disable-hardcoded-tables"
;;
esac
if target_has_feature neon; then
- FFMPEG_FPU="--enable-neon"
+ PKG_FFMPEG_FPU="--enable-neon"
else
- FFMPEG_FPU="--disable-neon"
+ PKG_FFMPEG_FPU="--disable-neon"
+fi
+
+if [ "$TARGET_ARCH" = "x86_64" ]; then
+ PKG_FFMPEG_X86ASM="--enable-x86asm --x86asmexe=yasm"
+else
+ PKG_FFMPEG_X86ASM="--disable-x86asm"
fi
pre_configure_target() {
@@ -86,10 +93,10 @@ pre_configure_target() {
if [ "$KODIPLAYER_DRIVER" = "bcm2835-driver" ]; then
CFLAGS="-I$SYSROOT_PREFIX/usr/include/interface/vcos/pthreads -I$SYSROOT_PREFIX/usr/include/interface/vmcs_host/linux $CFLAGS"
- FFMPEG_LIBS="-lbcm_host -lvcos -lvchiq_arm -lmmal -lmmal_core -lmmal_util -lvcsm"
- FFMPEG_RPI="--enable-rpi"
+ PKG_FFMPEG_LIBS="-lbcm_host -lvcos -lvchiq_arm -lmmal -lmmal_core -lmmal_util -lvcsm"
+ PKG_FFMPEG_RPI="--enable-rpi"
else
- FFMPEG_RPI="--disable-rpi"
+ PKG_FFMPEG_RPI="--disable-rpi"
fi
}
@@ -112,7 +119,7 @@ configure_target() {
--host-ldflags="$HOST_LDFLAGS" \
--extra-cflags="$CFLAGS" \
--extra-ldflags="$LDFLAGS" \
- --extra-libs="$FFMPEG_LIBS" \
+ --extra-libs="$PKG_FFMPEG_LIBS" \
--disable-static \
--enable-shared \
--enable-gpl \
@@ -120,7 +127,7 @@ configure_target() {
--enable-nonfree \
--enable-logging \
--disable-doc \
- $FFMPEG_DEBUG \
+ $PKG_FFMPEG_DEBUG \
--enable-pic \
--pkg-config="$TOOLCHAIN/bin/pkg-config" \
--enable-optimizations \
@@ -147,20 +154,19 @@ configure_target() {
--enable-mdct \
--enable-rdft \
--disable-crystalhd \
- $FFMPEG_VAAPI \
- $FFMPEG_VDPAU \
- $FFMPEG_RPI \
- $FFMPEG_RKMPP \
+ $PKG_FFMPEG_VAAPI \
+ $PKG_FFMPEG_VDPAU \
+ $PKG_FFMPEG_RPI \
+ $PKG_FFMPEG_RKMPP \
--disable-dxva2 \
--enable-runtime-cpudetect \
- $FFMPEG_TABLES \
+ $PKG_FFMPEG_TABLES \
--disable-encoders \
--enable-encoder=ac3 \
--enable-encoder=aac \
--enable-encoder=wmav2 \
--enable-encoder=mjpeg \
--enable-encoder=png \
- --disable-decoder=mpeg_xvmc \
--enable-hwaccels \
--disable-muxers \
--enable-muxer=spdif \
@@ -200,8 +206,8 @@ configure_target() {
--enable-zlib \
--enable-asm \
--disable-altivec \
- $FFMPEG_FPU \
- --enable-yasm \
+ $PKG_FFMPEG_FPU \
+ $PKG_FFMPEG_X86ASM \
--disable-symver
}
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
index ef2f4d7d62..5300c1252b 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
@@ -19,7 +19,7 @@ index 0e57cb0b4c..b2e3374fea 100644
/ffplay
/ffprobe
diff --git a/configure b/configure
-index dee507cb6a..9a93189107 100755
+index dee507cb6a..0ee9efe1e7 100755
--- a/configure
+++ b/configure
@@ -318,6 +318,7 @@ External library support:
@@ -30,6 +30,15 @@ index dee507cb6a..9a93189107 100755
--disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
--disable-nvenc disable Nvidia video encoding code [autodetect]
--enable-omx enable OpenMAX IL code [no]
+@@ -1036,7 +1037,7 @@ EOF
+
+ check_insn(){
+ log check_insn "$@"
+- check_inline_asm ${1}_inline "$2"
++ check_inline_asm ${1}_inline "\"$2\""
+ check_as ${1}_external "$2"
+ }
+
@@ -1776,6 +1777,7 @@ FEATURE_LIST="
gray
hardcoded_tables
@@ -582,18 +591,19 @@ index 4d4ef530e4..fba8776c9f 100644
{
const AVCodec *p, *experimental = NULL;
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
-index e656011c3c..69cd820f06 100644
+index e656011c3c..f8801dfab6 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
-@@ -40,6 +40,7 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
+@@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
arm/sbrdsp_init_arm.o
OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o
OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
-+OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o
++OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \
++ arm/rpi_hevcpred_init_arm.o
OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o
-@@ -136,10 +137,18 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
+@@ -136,10 +138,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
@@ -608,7 +618,13 @@ index e656011c3c..69cd820f06 100644
+ arm/rpi_hevcdsp_idct_neon.o \
+ arm/rpi_hevcdsp_res8_neon.o \
+ arm/rpi_hevcdsp_res16_neon.o \
-+ arm/rpi_hevcdsp_sao_neon.o
++ arm/rpi_hevcdsp_sao_neon.o \
++ arm/rpi_hevcpred_init_neon.o \
++ arm/rpi_hevcpred_intra_angular_neon.o \
++ arm/rpi_hevcpred_intra_dc_neon.o \
++ arm/rpi_hevcpred_intra_filter_neon.o \
++ arm/rpi_hevcpred_intra_hv_neon.o \
++ arm/rpi_hevcpred_intra_planar_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
arm/rv40dsp_neon.o
@@ -1674,10 +1690,10 @@ index 0000000000..0211e447a8
+
diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S
new file mode 100644
-index 0000000000..1bdf2ab09f
+index 0000000000..3bbfb443bf
--- /dev/null
+++ b/libavcodec/arm/rpi_hevc_misc_neon.S
-@@ -0,0 +1,26 @@
+@@ -0,0 +1,226 @@
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
@@ -1704,6 +1720,206 @@ index 0000000000..1bdf2ab09f
+
+2: vst1.8 {q0,q1}, [r0:256]
+ bx lr
++endfunc
++
++@ PIC jump tables are more expensive than absolute for A32 code
++.set jent_pic, CONFIG_PIC || CONFIG_THUMB
++
++@ Jump table entry - if in neon mode the bottom bit must be set
++@ ? There is probably a real asm instruction to do this but I haven't found it
++.macro jent lab
++.if jent_pic
++T .short ((0 + \lab) - (0 + 98b)) / 2
++A .short (0 + \lab) - (4 + 98b)
++.else
++T .word 1 + \lab
++A .word \lab
++.endif
++.endm
++
++.macro cpy_compound val, p1, p2
++.if \p1 + \p2 != \val
++.error "Bad addition! \p1 + \p2 != \val"
++.endif
++.if \val <= 64
++@ As max we deal with 128 vals above 64 will never be recursed to
++100\val\():
++ push {r11, lr}
++.endif
++\val\():
++ push {r0-r3}
++ bl 100\p1\()b
++ pop {r0-r3}
++ add r0, #\p1
++ add r2, #\p1
++ b \p2\()b
++.endm
++
++@ ff_hevc_cpy_blks8x4_neon(
++@ dst [r0]
++@ dst_stride [r1]
++@ src [r2]
++@ src_stride [r3]
++@ width [sp, #0] (bytes)
++@ height) [sp, #4]
++@
++@ Power of 2 widths are directly coded, all others are done in stripes
++@ We expect the vast majority of calls to be power of 2
++@
++@ Currently has min width of 8, but we could make that 4 without issue
++@ Min height is 4
++
++function ff_hevc_rpi_cpy_blks8x4_neon, export=1
++ ldr r12, [sp, #0]
++ push {r11, lr}
++ sub r12, #1
++A adr lr, 98f
++ ubfx r12, r12, #3, #4
++ ldr r11, [sp, #(8 + 4)]
++.if jent_pic
++A lsl r12, #1
++A ldrsh lr, [lr, r12]
++A add pc, lr
++T tbh [pc, r12, lsl #1]
++.else
++ @ A32 only, Thumb is always PIC
++ ldr pc, [lr, r12, lsl #2]
++.endif
++
++98:
++ jent 8f
++ jent 16f
++ jent 24f
++ jent 32f
++ jent 40f
++ jent 48f
++ jent 56f
++ jent 64f
++ jent 72f
++ jent 80f
++ jent 88f
++ jent 96f
++ jent 104f
++ jent 112f
++ jent 120f
++ jent 128f
++
++1008:
++ push {r11, lr}
++8:
++ add lr, r2, r3
++ lsl r3, #1
++ add r12, r0, r1
++ lsl r1, #1
++1:
++ vld1.32 {d0 }, [r2], r3
++ vld1.32 {d1 }, [lr], r3
++ vld1.32 {d2 }, [r2], r3
++ vld1.32 {d3 }, [lr], r3
++ subs r11, #4
++ vst1.32 {d0 }, [r0], r1
++ vst1.32 {d1 }, [r12], r1
++ vst1.32 {d2 }, [r0], r1
++ vst1.32 {d3 }, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++10016:
++ push {r11, lr}
++16:
++ add lr, r2, r3
++ lsl r3, #1
++ add r12, r0, r1
++ lsl r1, #1
++1:
++ vld1.32 {q0 }, [r2], r3
++ vld1.32 {q1 }, [lr], r3
++ vld1.32 {q2 }, [r2], r3
++ vld1.32 {q3 }, [lr], r3
++ subs r11, #4
++ vst1.32 {q0 }, [r0], r1
++ vst1.32 {q1 }, [r12], r1
++ vst1.32 {q2 }, [r0], r1
++ vst1.32 {q3 }, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++cpy_compound 24, 16, 8
++
++10032:
++ push {r11, lr}
++32:
++ add lr, r2, r3
++ lsl r3, #1
++ add r12, r0, r1
++ lsl r1, #1
++1:
++ vld1.32 {q8, q9 }, [r2], r3
++ vld1.32 {q10, q11}, [lr], r3
++ vld1.32 {q12, q13}, [r2], r3
++ vld1.32 {q14, q15}, [lr], r3
++ subs r11, #4
++ vst1.32 {q8, q9 }, [r0], r1
++ vst1.32 {q10, q11}, [r12], r1
++ vst1.32 {q12, q13}, [r0], r1
++ vst1.32 {q14, q15}, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++cpy_compound 40, 32, 8
++cpy_compound 48, 32, 16
++cpy_compound 56, 32, 24
++
++10064:
++ push {r11, lr}
++64:
++ add lr, r2, #32
++ add r12, r0, #32
++1:
++ vld1.32 {q8, q9 }, [r2], r3
++ vld1.32 {q10, q11}, [lr], r3
++ vld1.32 {q12, q13}, [r2], r3
++ vld1.32 {q14, q15}, [lr], r3
++ subs r11, #2
++ vst1.32 {q8, q9 }, [r0], r1
++ vst1.32 {q10, q11}, [r12], r1
++ vst1.32 {q12, q13}, [r0], r1
++ vst1.32 {q14, q15}, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++cpy_compound 72, 64, 8
++cpy_compound 80, 64, 16
++cpy_compound 88, 64, 24
++cpy_compound 96, 64, 32
++cpy_compound 104, 64, 40
++cpy_compound 112, 64, 48
++cpy_compound 120, 64, 56
++
++128:
++ push {r4, r5}
++ @ We could do this with fewer registers if we jump around but I
++ @ have a primative urge to load sequentially
++ mov r4, #64
++ add lr, r2, #32
++ add r12, r0, #32
++ sub r3, r4
++ sub r1, r4
++1:
++ vld1.32 {q8, q9 }, [r2], r4
++ vld1.32 {q10, q11}, [lr], r4
++ vld1.32 {q12, q13}, [r2], r3
++ vld1.32 {q14, q15}, [lr], r3
++ subs r11, #1
++ vst1.32 {q8, q9 }, [r0], r4
++ vst1.32 {q10, q11}, [r12], r4
++ vst1.32 {q12, q13}, [r0], r1
++ vst1.32 {q14, q15}, [r12], r1
++ bgt 1b
++ pop {r4, r5, r11, pc}
++
++endfunc
++
diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h
new file mode 100644
index 0000000000..62b9326532
@@ -1738,10 +1954,10 @@ index 0000000000..62b9326532
+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
new file mode 100644
-index 0000000000..e665bd848a
+index 0000000000..98512d21dc
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
-@@ -0,0 +1,1249 @@
+@@ -0,0 +1,1625 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi
+ *
@@ -1766,65 +1982,72 @@ index 0000000000..e665bd848a
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
-+.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a
++.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8
+ vsubl.u8 q0, \Q0a, \P0a
-+ vsubl.u8 q2, \P1a, \Q1a
-+ vshl.i16 q0, #2
-+ vadd.i16 q0, q2
++ vsubl.u8 q1, \P1a, \Q1a
+ vdup.16 d4, r2
-+
-+ vrshr.s16 q0, #3
++ \I1
++ vshl.i16 q0, #2
++ \I2
++ vadd.i16 q0, q1
++ \I3
+ vmovl.u8 q2, d4
-+
++ \I4
++ vneg.s16 q1, q2
++ \I5
++ vrshr.s16 q0, #3
++ \I6
++ \I7
++ \I8
+ vmin.s16 q0, q2
-+ vneg.s16 q2, q2
-+ vmax.s16 q0, q2
-+ vaddw.u8 q2, q0, \P0a
-+
-+ vqmovun.s16 \P0a, q2
+ vmovl.u8 q2, \Q0a
-+ vsub.i16 q2, q0
-+
-+ vqmovun.s16 \Q0a, q2
++ vmax.s16 q0, q1
++ vaddw.u8 q1, q0, \P0a
++ vsub.i16 q0, q2, q0
++ vqmovun.s16 \P0a, q1
++ vqmovun.s16 \Q0a, q0
+.endm
+
+
-+.macro hevc_loop_filter_uv_body2 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v
-+ vsubl.u8 q0, \Q0u, \P0u
-+ vsubl.u8 q1, \Q0v, \P0v
-+ vsubl.u8 q2, \P1u, \Q1u
-+ vsubl.u8 q3, \P1v, \Q1v
-+ vshl.i16 q0, #2
-+ vshl.i16 q1, #2
-+ vadd.i16 q0, q2
-+ vdup.16 d4, r2
-+ lsr r2, #16
-+ vadd.i16 q1, q3
-+
-+ vrshr.s16 q0, #3
-+ vdup.16 d6, r2
-+ vmovl.u8 q2, d4
-+ vmovl.u8 q3, d6
-+ vrshr.s16 q1, #3
-+
++.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7
++ vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a
++ lsr r12, r2, #16
++ vsubl.u8 q1, \Q0b, \P0b @ q0b - p0b
++ vsubl.u8 q2, \P1a, \Q1a @ p1a - q1a
++ vsubl.u8 q3, \P1b, \Q1b @ p1b - q1b
++ vshl.i16 q0, #2 @ (q0a - p0a) * 4
++ vshl.i16 q1, #2 @ (q0b - p0b) * 4
++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a
++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b
++ vdup.16 d4, r2 @ tc0a, tc0b
++ vdup.16 d6, r12 @ tc1a, tc1b
++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
++ \I1
++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
++ \I2
++ vmovl.u8 q2, d4 @ tc0a, tc0b
++ \I3
++ vmovl.u8 q3, d6 @ tc1a, tc1b
++ \I4
+ vmin.s16 q0, q2
-+ vneg.s16 q2, q2
++ \I5
++ vneg.s16 q2, q2 @ -tc0a, -tc0b
++ \I6
+ vmin.s16 q1, q3
-+ vneg.s16 q3, q3
-+ vmax.s16 q0, q2
-+ vaddw.u8 q2, q0, \P0u
-+ vmax.s16 q1, q3
-+ vaddw.u8 q3, q1, \P0v
-+
-+ vqmovun.s16 \P0u, q2
-+ vmovl.u8 q2, \Q0u
-+ vqmovun.s16 \P0v, q3
-+ vmovl.u8 q3, \Q0v
-+ vsub.i16 q2, q0
-+ vsub.i16 q3, q1
-+
-+ vqmovun.s16 \Q0u, q2
-+ vqmovun.s16 \Q0v, q3
++ \I7
++ vneg.s16 q3, q3 @ -tc1a, -tc1b
++ vmax.s16 q0, q2 @ delta0a
++ vmovl.u8 q2, \Q0a
++ vmax.s16 q1, q3 @ delta0b
++ vaddw.u8 q3, q0, \P0a @ p0a + delta0a
++ vsub.i16 q0, q2, q0 @ q0a - delta0a
++ vmovl.u8 q2, \Q0b
++ vsub.i16 q2, q1 @ q0b - delta0b
++ vaddw.u8 q1, \P0b @ p0b + delta0b
++ vqmovun.s16 \Q0a, q0
++ vqmovun.s16 \P0a, q3
++ vqmovun.s16 \Q0b, q2
++ vqmovun.s16 \P0b, q1
+.endm
+
+
@@ -1835,33 +2058,36 @@ index 0000000000..e665bd848a
+@ [0..7] tc U a
+@ [8..15] tc V a
+
-+.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth
++.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8
+ vsub.i16 q0, \Q0a, \P0a
-+ vsub.i16 q2, \P1a, \Q1a
-+ vshl.i16 q0, #2
-+ vadd.i16 q0, q2
-+ vrshr.s16 q0, #3
-+
++ vsub.i16 q1, \P1a, \Q1a
+ vdup.16 d4, r2
++ \I1
++ vshl.i16 q0, #2
++ \I2
++ vadd.i16 q0, q1
++ \I3
+ vshll.u8 q2, d4, #\bit_depth - 8
-+
-+ movw r2, #(1 << \bit_depth) - 1
++ \I4
++ vneg.s16 q1, q2
++ \I5
++ vrshr.s16 q0, #3
++ \I6
++ \I7
++ \I8
+ vmin.s16 q0, q2
-+ vneg.s16 q2, q2
-+ vmax.s16 q0, q2
-+ vmov.i64 q2, #0
-+ vdup.i16 q3, r2
++ vmov.i16 q2, #0
++ vmax.s16 q0, q1
+ vadd.i16 \P0a, q0
+ vsub.i16 \Q0a, q0
-+
++ vmov.i16 q1, #(1 << \bit_depth) - 1
+ vmax.s16 \P0a, q2
+ vmax.s16 \Q0a, q2
-+ vmin.s16 \P0a, q3
-+ vmin.s16 \Q0a, q3
++ vmin.s16 \P0a, q1
++ vmin.s16 \Q0a, q1
+.endm
+
-+@ Preserves r12
-+@ Clobbers r2
++@ Clobbers r2, r12
+@ P0a et al all contain UVUVUVUV
+@ r2 (tc4) contains
+@ [0..7] tc U a
@@ -1869,38 +2095,41 @@ index 0000000000..e665bd848a
+@ [16..23] tc U b
+@ [24..31] tc V b
+
-+.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth
-+ vsub.i16 q0, \Q0a, \P0a
-+ vsub.i16 q1, \Q0b, \P0b
-+ vsub.i16 q2, \P1a, \Q1a
-+ vsub.i16 q3, \P1b, \Q1b
-+ vshl.i16 q0, #2
-+ vshl.i16 q1, #2
-+ vadd.i16 q0, q2
-+ vrshr.s16 q0, #3
-+ vadd.i16 q1, q3
-+ vrshr.s16 q1, #3
-+
-+ vdup.16 d4, r2
-+ lsr r2, #16
-+ vdup.16 d6, r2
-+ vshll.u8 q2, d4, #\bit_depth - 8
-+ vshll.u8 q3, d6, #\bit_depth - 8
-+
-+ movw r2, #(1 << \bit_depth) - 1
++.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7
++ vsub.i16 q0, \Q0a, \P0a @ q0a - p0a
++ lsr r12, r2, #16
++ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b
++ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a
++ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b
++ vshl.i16 q0, #2 @ (q0a - p0a) * 4
++ vshl.i16 q1, #2 @ (q0b - p0b) * 4
++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a
++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b
++ vdup.16 d4, r2 @ tc0a, tc0b
++ vdup.16 d6, r12 @ tc1a, tc1b
++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
++ \I1
++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
++ \I2
++ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b
++ \I3
++ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b
++ \I4
+ vmin.s16 q0, q2
-+ vneg.s16 q2, q2
++ \I5
++ vneg.s16 q2, q2 @ -tc0a, -tc0b
++ \I6
+ vmin.s16 q1, q3
-+ vneg.s16 q3, q3
-+ vmax.s16 q0, q2
-+ vmov.i64 q2, #0
-+ vmax.s16 q1, q3
-+ vdup.i16 q3, r2
-+ vadd.i16 \P0a, q0
-+ vsub.i16 \Q0a, q0
-+ vadd.i16 \P0b, q1
-+ vsub.i16 \Q0b, q1
-+
++ \I7
++ vneg.s16 q3, q3 @ -tc1a, -tc1b
++ vmax.s16 q0, q2 @ delta0a
++ vadd.i16 \P0a, q0 @ p0a + delta0a
++ vsub.i16 \Q0a, q0 @ q0a - delta0a
++ vmax.s16 q1, q3 @ delta0b
++ vadd.i16 \P0b, q1 @ p0b + delta0b
++ vsub.i16 \Q0b, q1 @ q0b - delta0b
++ vmov.i16 q2, #0
++ vmov.i16 q3, #(1 << \bit_depth) - 1
+ vmax.s16 \P0a, q2
+ vmax.s16 \Q0a, q2
+ vmax.s16 \P0b, q2
@@ -1923,11 +2152,10 @@ index 0000000000..e665bd848a
+ it eq
+ bxeq lr
+ push {r4-r10,lr} @ 32 bytes
-+ ldr r5, [sp, #32] @ &_no_p
-+ ldrb r10, [r5]
-+ ldr r5, [sp, #36] @ &_no_q
++ ldrd r4, r5, [sp, #32] @ &_no_p
++ ldrb r4, [r4]
+ ldrb r5, [r5]
-+ cmp r10, #0
++ movs r10, r4
+ it ne
+ movne r10, #1
+ cmp r5, #0
@@ -1950,244 +2178,207 @@ index 0000000000..e665bd848a
+@ Junks:
+@ r5, r6, r7, r8, r9
+
-+.macro m_filter_luma bit_depth
++.macro m_filter_luma bit_depth, Q11, Q15
+.if \bit_depth == 8
-+ vmovl.u8 q15, d23
-+ vmovl.u8 q14, d22
-+ vmovl.u8 q13, d21
-+ vmovl.u8 q12, d20
-+ vmovl.u8 q11, d19
-+ vmovl.u8 q10, d18
-+ vmovl.u8 q9, d17
-+ vmovl.u8 q8, d16
++ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2
++ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1
++ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0
++ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0
++ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1
++ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2
+.endif
-+ vadd.i16 q7, q9, q11
++ vadd.i16 q0, q9, \Q11 @ P2 + P0
+.if \bit_depth > 8
-+ lsl r2, r2, #(\bit_depth - 8)
++ lsl r3, r3, #(\bit_depth - 8)
+.endif
-+ vadd.i16 q6, q14, q12
++ vadd.i16 q1, q14, q12 @ Q2 + Q0
+.if \bit_depth > 8
-+ lsl r3, r3, #(\bit_depth - 8)
++ lsl r2, r2, #(\bit_depth - 8)
+.endif
-+ vsub.i16 q7, q10
-+ vsub.i16 q6, q13
-+ vabd.s16 q7, q7, q10
-+ vabd.s16 q6, q6, q13
++ vsub.i16 q0, q10 @ P2 - P1 + P0
++ lsr r5, r3, #16
++ vsub.i16 q1, q13 @ Q2 - Q1 + Q0
++.if \bit_depth == 8
++ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3
++ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3
++.endif
++ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0)
++ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0)
++ vmov.i64 q2, #0xffffffff0000
++ vbic q0, q2 @ only dp0(') and dp3(')
++ vbic q1, q2 @ only dq0(') and dq3(')
++ vsra.u64 q0, #16
++ vsra.u64 q1, #16
++ vdup.16 q3, r2 @ beta
++ vdup.16 d14, r3 @ tC[0]
++ vdup.16 d15, r5 @ tC[1]
++ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0)
++ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0
++ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0
++ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0
++ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0)
++ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0)
++ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3
++ vshl.s16 q6, q7, #2 @ tC[] * 4
++ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1
++ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta)
++ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block)
++ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3
++ cmp r7, #0
++ beq .Lbypasswrite
+
-+ vdup.16 q0, r2
-+ vmov q4, q7
-+ vmov q5, q6
-+ vdup.16 d4, r3
-+ lsr r3, r3, #16
-+ vtrn.16 q7, q4
-+ vtrn.16 q6, q5
++ vcgt.s16 q5, q6, q5 @ if < tc25
++ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3)
++ vand q4, q5
++ vbic d8, d4
++ vbic d9, d4
++ vshr.s16 q3, #2 @ beta_2 = beta >> 2
++ vsra.u64 q4, #16
++ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1
++ vshl.i16 q7, #1 @ tc2 = tC[] << 1
++ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc
++ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half
++ vand d6, d8 @ && beta_2 tests, prime in ms half
++ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3
++ vneg.s16 q6, q7 @ -tc2
++ vmovn.i32 d8, q3
++ vshrn.i32 d6, q3, #16
++ vand d6, d8
++ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3
++ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block)
++ vadd.i16 q0, \Q11, q12 @ p0 + q0
++ ands r9, r7, r8
++ beq 1f
+
-+ vshl.u64 q7, #32
-+ vshr.u64 q4, #32
-+ vshl.u64 q6, #32
-+ vshr.u64 q5, #32
-+ vshr.u64 q7, #32
-+ vshr.u64 q6, #32
-+ vshl.u64 q5, #32
-+ vshl.u64 q4, #32
-+ vorr q6, q5
-+ vorr q7, q4
-+ vdup.16 d5, r3
-+ vadd.i16 q5, q7, q6
-+
-+ vmov q4, q5
-+ vmov q3, q5
-+ vtrn.32 q3, q4
-+
-+ vadd.i16 q4, q3
-+
-+ vshl.s16 q5, q5, #1
-+ vcgt.s16 q3, q0, q4
-+
-+ vmovn.i16 d6, q3
-+ vshr.s16 q1, q0, #2
-+ vmovn.i16 d6, q3
-+ vcgt.s16 q5, q1, q5
-+ vmov r7, s12
-+ cmp r7, #0
-+ beq .Lbypasswrite
-+
-+ vpadd.i32 d0, d14, d12
-+ vpadd.i32 d1, d15, d13
-+ vmov q4, q2
-+ vshl.s16 q2, #2
-+ vshr.s16 q1, q1, #1
-+ vrhadd.s16 q2, q4
-+
-+ vabd.s16 q7, q8, q11
-+ vaba.s16 q7, q15, q12
-+
-+ vmovn.i32 d0, q0
-+ vmov r5, r6, s0, s1
-+ vcgt.s16 q6, q1, q7
-+ vand q5, q5, q6
-+ vabd.s16 q7, q11, q12
-+ vcgt.s16 q6, q2, q7
-+ vand q5, q5, q6
-+
-+ vmov q2, q5
-+ vtrn.s16 q5, q2
-+ vshr.u64 q2, #32
-+ vshl.u64 q5, #32
-+ vshl.u64 q2, #32
-+ vshr.u64 q5, #32
-+ vorr q5, q2
-+
-+ vmov q2, q5
-+ vshl.i16 q7, q4, #1
-+ vtrn.32 q2, q5
-+ vand q5, q2
-+ vneg.s16 q6, q7
-+ vmovn.i16 d4, q5
-+ vmovn.i16 d4, q2
-+ vmov r8, s8
-+
-+ and r9, r8, r7
-+ cmp r9, #0
-+ beq 1f
-+
-+ vadd.i16 q2, q11, q12
-+ vadd.i16 q4, q9, q8
-+ vadd.i16 q1, q2, q10
-+ vdup.16 d10, r9
-+ vadd.i16 q0, q1, q9
-+ vshl.i16 q4, #1
-+ lsr r9, #16
-+ vadd.i16 q1, q0
-+ vrshr.s16 q3, q0, #2
-+ vadd.i16 q1, q13
-+ vadd.i16 q4, q0
-+ vsub.i16 q3, q10
-+ vrshr.s16 q1, #3
-+ vrshr.s16 q4, #3
-+ vmax.s16 q3, q6
-+ vsub.i16 q1, q11
-+ vsub.i16 q4, q9
-+ vmin.s16 q3, q7
-+ vmax.s16 q4, q6
-+ vmax.s16 q1, q6
-+ vadd.i16 q3, q10
-+ vmin.s16 q4, q7
-+ vmin.s16 q1, q7
-+ vdup.16 d11, r9
-+ vadd.i16 q4, q9
-+ vadd.i16 q1, q11
-+ vbit q9, q4, q5
-+ vadd.i16 q4, q2, q13
-+ vbit q11, q1, q5
-+ vadd.i16 q0, q4, q14
-+ vadd.i16 q2, q15, q14
-+ vadd.i16 q4, q0
-+
-+ vshl.i16 q2, #1
-+ vadd.i16 q4, q10
-+ vbit q10, q3, q5
-+ vrshr.s16 q4, #3
-+ vadd.i16 q2, q0
-+ vrshr.s16 q3, q0, #2
-+ vsub.i16 q4, q12
-+ vrshr.s16 q2, #3
-+ vsub.i16 q3, q13
-+ vmax.s16 q4, q6
-+ vsub.i16 q2, q14
-+ vmax.s16 q3, q6
-+ vmin.s16 q4, q7
-+ vmax.s16 q2, q6
-+ vmin.s16 q3, q7
-+ vadd.i16 q4, q12
-+ vmin.s16 q2, q7
-+ vadd.i16 q3, q13
-+ vbit q12, q4, q5
-+ vadd.i16 q2, q14
-+ vbit q13, q3, q5
-+ vbit q14, q2, q5
++ vadd.i16 q2, q0, q10 @ p1 + p0 + q0
++ vadd.i16 q3, q0, q13 @ p0 + q0 + q1
++ lsr r3, r9, #16
++ vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping)
++ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping)
++ vadd.i16 q0, q8, q9 @ p3 + p2
++ vadd.i16 q5, \Q15, q14 @ q2 + q3
++ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0
++ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2
++ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2
++ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3
++ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping)
++ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping)
++ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping)
++ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping)
++ vrshr.s16 q0, #3 @ scale, with rounding
++ vrshr.s16 q5, #3
++ vrshr.s16 q1, #2
++ vrshr.s16 q4, #2
++ vrshr.s16 q2, #3
++ vrshr.s16 q3, #3
++ vsub.i16 q0, q9 @ find difference
++ vsub.i16 q5, q14
++ vsub.i16 q1, q10
++ vsub.i16 q4, q13
++ vsub.i16 q2, \Q11
++ vsub.i16 q3, q12
++ vmax.s16 q0, q6 @ clip difference to -tc2 .. tc2
++ vmax.s16 q5, q6
++ vmax.s16 q1, q6
++ vmax.s16 q4, q6
++ vmax.s16 q2, q6
++ vmax.s16 q3, q6
++ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure
++ vdup.16 d13, r3
++ vmin.s16 q0, q7
++ vmin.s16 q5, q7
++ vmin.s16 q1, q7
++ vmin.s16 q4, q7
++ vmin.s16 q2, q7
++ vmin.s16 q3, q7
++ vadd.i16 q0, q9 @ apply difference
++ vadd.i16 q5, q14
++ vadd.i16 q1, q10
++ vadd.i16 q4, q13
++ vadd.i16 q2, \Q11
++ vadd.i16 q3, q12
++ vbit q9, q0, q6 @ apply filtered values according to mask
++ vbit q14, q5, q6
++ vbit q10, q1, q6
++ vbit q13, q4, q6
++ vbit \Q11, q2, q6
++ vbit q12, q3, q6
++ vneg.s16 q6, q7 @ restore -tc2
+
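As a reading aid, here is a hedged C sketch of the strong-filter arithmetic that the annotated block above vectorises. The helper name clip3 and the per-column layout are illustrative; tc2 is the -tc2 .. tc2 clip range named in the comments (i.e. twice tc in the weak path below).

    static inline int clip3(int x, int lo, int hi)
    {
        return x < lo ? lo : x > hi ? hi : x;
    }

    /* Strong HEVC luma deblocking for one column of samples p3..p0 | q0..q3.
     * Each new value is a weighted average of its neighbours, rounded, then
     * limited to +/- tc2 around the original sample, matching the sums listed
     * in the comments above (vrshr #3 / #2 is the rounded scaling step). */
    static void strong_filter_column(int p[4], int q[4], int tc2)
    {
        const int p3 = p[3], p2 = p[2], p1 = p[1], p0 = p[0];
        const int q0 = q[0], q1 = q[1], q2 = q[2], q3 = q[3];

        p[0] = p0 + clip3(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
        p[1] = p1 + clip3(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
        p[2] = p2 + clip3(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
        q[0] = q0 + clip3(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
        q[1] = q1 + clip3(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
        q[2] = q2 + clip3(((p0 + q0 + q1 + 3 * q2 + 2 * q3 + 4) >> 3) - q2, -tc2, tc2);
    }

The NEON code computes all six sums for eight columns at once and applies them under the per-block strong-filter mask expanded into q6.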
+1:
-+ mvn r8, r8
-+ and r9, r8, r7
-+ cmp r9, #0
-+ beq 2f
++ bics r9, r7, r8
++ beq 2f
+
-+ vdup.16 q4, r2
-+
-+ vdup.16 d10, r9
-+ lsr r9, #16
-+ vmov q1, q4
-+ vdup.16 d11, r9
-+ vshr.s16 q1, #1
-+ vsub.i16 q2, q12, q11
-+ vadd.i16 q4, q1
-+ vshl.s16 q0, q2, #3
-+ vshr.s16 q4, #3
-+ vadd.i16 q2, q0
-+ vsub.i16 q0, q13, q10
-+ vsub.i16 q2, q0
-+ vshl.i16 q0, q0, #1
-+ vsub.i16 q2, q0
-+ vshl.s16 q1, q7, 2
-+ vrshr.s16 q2, q2, #4
-+ vadd.i16 q1, q7
-+ vabs.s16 q3, q2
-+ vshr.s16 q6, q6, #1
-+ vcgt.s16 q1, q1, q3
-+ vand q5, q1
-+ vshr.s16 q7, q7, #1
-+ vmax.s16 q2, q2, q6
-+ vmin.s16 q2, q2, q7
-+
-+ vshr.s16 q7, q7, #1
-+ vrhadd.s16 q3, q9, q11
-+ vneg.s16 q6, q7
-+ vsub.s16 q3, q10
-+ vdup.16 d2, r5
-+ vhadd.s16 q3, q2
-+ vdup.16 d3, r6
-+ vmax.s16 q3, q3, q6
-+ vcgt.s16 q1, q4, q1
-+ vmin.s16 q3, q3, q7
-+ vand q1, q5
-+ vadd.i16 q3, q10
-+ lsr r5, #16
-+ lsr r6, #16
-+ vbit q10, q3, q1
-+
-+ vrhadd.s16 q3, q14, q12
-+ vdup.16 d2, r5
-+ vsub.s16 q3, q13
-+ vdup.16 d3, r6
-+ vhsub.s16 q3, q2
-+ vcgt.s16 q1, q4, q1
-+ vmax.s16 q3, q3, q6
-+ vand q1, q5
-+ vmin.s16 q3, q3, q7
-+ vadd.i16 q3, q13
-+ vbit q13, q3, q1
-+ vadd.i16 q0, q11, q2
-+ vsub.i16 q4, q12, q2
-+ vbit q11, q0, q5
-+ vbit q12, q4, q5
++ vsub.i16 q0, q12, \Q11 @ q0 - p0
++ vsub.i16 q1, q13, q10 @ q1 - p1
++ lsr r3, r9, #16
++ vshl.i16 q2, q0, #3
++ lsr r7, r5, #16
++ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0)
++ lsr r8, r6, #16
++ vshl.i16 q2, q1, #1
++ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1)
++ vshr.s16 q6, #1 @ -tc = -tc2 >> 1
++ vsub.i16 q5, q3, q4
++ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1
++ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1
++ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
++ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1
++ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1
++ vmax.s16 q6, q5 @
++ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1
++ vdup.16 q0, r2 @ beta
++ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc]
++ vshr.s16 q4, #1 @ tc_2 = tc >> 1
++ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
++ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
++ vshr.s16 q2, q0, #1 @ beta >> 1
++ vadd.i16 q2, q0 @ beta + (beta >> 1)
++ vneg.s16 q0, q4 @ -tc_2
++ vabs.s16 q5, q5 @ abs(original delta0)
++ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3
++ vmax.s16 q1, q0
++ vmax.s16 q3, q0
++ vshl.s16 q0, q7, #2 @ 8 * tc
++ vadd.i16 q7, q0 @ 10 * tc
++ vdup.16 d0, r9
++ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering
++ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2)
++ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2)
++ vdup.16 d8, r5 @ dp0 + dp3
++ vdup.16 d9, r7 @ dp0' + dp3'
++ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0))
++ vdup.16 d10, r6 @ dq0 + dq3
++ vdup.16 d11, r8 @ dq0' + dq3'
++ vand q7, q0 @ AND block and line masks
++ vcgt.s16 q4, q2, q4 @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1)
++ vadd.i16 q0, q1, q10 @ p1 + deltap1
++ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1)
++ vadd.i16 q3, q3, q13 @ q1 + deltaq1
++ vadd.i16 q1, \Q11, q6 @ p0 + delta0
++ vsub.i16 q2, q12, q6 @ q0 - delta0
++ vand q4, q7 @ AND nd_p test with block/line masks
++ vand q5, q7 @ AND nd_q test with block/line masks
++ vbit q10, q0, q4
++ vbit \Q11, q1, q7
++ vbit q12, q2, q7
++ vbit q13, q3, q5
+
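And a matching hedged C sketch of the weak-filter path above: delta0, the 10 * tc early-out, and the conditional p1/q1 updates. clip3 is the helper from the previous sketch, and nd_p/nd_q stand for the dp/dq threshold tests noted in the comments; the names are illustrative.

    #include <stdlib.h>   /* abs() */

    /* Weak HEVC luma deblocking for one column.  Following the register
     * comments above: tc = tc2 >> 1 and tc_2 = tc >> 1. */
    static void weak_filter_column(int p[3], int q[3], int tc2, int nd_p, int nd_q)
    {
        const int tc   = tc2 >> 1;
        const int tc_2 = tc >> 1;
        int delta0 = (9 * (q[0] - p[0]) - 3 * (q[1] - p[1]) + 8) >> 4;

        if (abs(delta0) >= 10 * tc)
            return;                        /* leave this column unfiltered */

        delta0 = clip3(delta0, -tc, tc);
        if (nd_p)                          /* ((beta + (beta >> 1)) >> 3) > dp0 + dp3 */
            p[1] += clip3((((p[2] + p[0] + 1) >> 1) - p[1] + delta0) >> 1, -tc_2, tc_2);
        if (nd_q)                          /* ((beta + (beta >> 1)) >> 3) > dq0 + dq3 */
            q[1] += clip3((((q[2] + q[0] + 1) >> 1) - q[1] - delta0) >> 1, -tc_2, tc_2);
        p[0] += delta0;
        q[0] -= delta0;
    }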
+2:
+.if \bit_depth == 8
++ vmovn.i16 d16, q8
++ vmovn.i16 d23, \Q15
+ neg r1, r1
-+ vqmovun.s16 d16, q8
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
-+ vqmovun.s16 d19, q11
++ vqmovun.s16 d19, \Q11
+ lsls r10, #31
+ vqmovun.s16 d20, q12
+ vqmovun.s16 d21, q13
+ vqmovun.s16 d22, q14
-+ vqmovun.s16 d23, q15
+.else
-+ movw r5, #(1 << \bit_depth - 1)
-+ vmov.i64 q0, #0
-+ vdup.i16 q1, r5
++ vmov.i16 q0, #0
++ vmov.i16 q1, #(1 << \bit_depth - 1)
+ @ q8 & q15 should be unaltered and so don't require clipping
+ neg r1, r1
+ vmax.s16 q9, q0
@@ -2204,14 +2395,14 @@ index 0000000000..e665bd848a
+ vmin.s16 q13, q1
+ vmin.s16 q14, q1
+.endif
-+ mov pc, lr
++ bx lr
+.endm
+
+function hevc_loop_filter_luma_body
-+ m_filter_luma 8
++ m_filter_luma 8, q15, q11
+endfunc
+
-+@ void ff_hevc_rpi_v_loop_filter_luma_neon(
++@ void ff_hevc_rpi_v_loop_filter_luma_neon_8(
+@ uint8_t *_pix, [r0]
+@ ptrdiff_t _stride, [r1]
+@ int _beta, [r2]
@@ -2219,7 +2410,7 @@ index 0000000000..e665bd848a
+@ uint8_t *_no_p, [sp+0]
+@ uint8_t *_no_q) [sp+4]
+
-+function ff_hevc_rpi_v_loop_filter_luma_neon, export=1
++function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1
+ hevc_loop_filter_luma_start
+
+ sub r4, r0, #4
@@ -2245,66 +2436,72 @@ index 0000000000..e665bd848a
+.Lv_loop_luma_common:
+ vpush {d8-d15}
+
-+ @ Uses slightly fewer instructions to do laned loads than unlaned
-+ @ and transpose. This also means that we can use the same code for
-+ @ both split & unsplit deblock
-+ vld4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32], r1
-+ vld4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32], r1
-+
-+ vld4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
-+ vld4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
-+
-+ vld4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1
-+ vld4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1
-+
-+ vld4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
-+ vld4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
-+
-+ vld4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1
-+ vld4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1
-+
-+ vld4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
-+ vld4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
-+
-+ vld4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
-+ vld4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
-+
-+ vld4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32]
-+ vld4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32]
++ @ It's slightly faster to do unlaned loads and transpose in the
++ @ 8-bit case, even though it needs more instructions, because
++ @ VLD4.8 is a really slow way to read from memory.
++ vld1.32 {d16[0]}, [r4:32], r1
++ vld1.32 {d20[0]}, [r0:32], r1
++ vld1.32 {d16[1]}, [r4:32], r1
++ vld1.32 {d20[1]}, [r0:32], r1
++ vld1.32 {d17[0]}, [r4:32], r1
++ vld1.32 {d21[0]}, [r0:32], r1
++ vld1.32 {d17[1]}, [r4:32], r1
++ vld1.32 {d21[1]}, [r0:32], r1
++ vld1.32 {d18[0]}, [r4:32], r1
++ vld1.32 {d22[0]}, [r0:32], r1
++ vld1.32 {d18[1]}, [r4:32], r1
++ vld1.32 {d22[1]}, [r0:32], r1
++ vld1.32 {d19[0]}, [r4:32], r1
++ vld1.32 {d23[0]}, [r0:32], r1
++ vld1.32 {d19[1]}, [r4:32]
++ vld1.32 {d23[1]}, [r0:32]
++ vuzp.16 q8, q9
++ vuzp.16 q10, q11
++ vuzp.8 q8, q9
++ vuzp.8 q10, q11
++ vswp d17, d18
++ vswp d21, d22
+
+ bl hevc_loop_filter_luma_body
+
++ add r6, r4, r1
++ add r2, r0, r1
++ lsl r1, #1
++
++ vpop {d8-d15}
++
+ @ no_p[1]
+ bmi 1f
+ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
-+ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
++ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1
+ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
-+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1
++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1
+
+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
-+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1
++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1
+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
-+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32]
++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32]
+1:
+ @ no_q[1]
-+@ tst r10, #2
+ bcs 1f
+ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
-+ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
++ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1
+ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
-+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1
++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1
+
+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
-+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1
++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1
+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
-+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32]
++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32]
+1:
++ pop {r4-r10,pc}
++
+.Lbypasswrite:
+ vpop {d8-d15}
+ pop {r4-r10,pc}
+endfunc
+
-+.macro m_filter_v_luma_common_16 bit_depth
++.macro m_filter_v_luma_16 bit_depth
+ vpush {d8-d15}
+
+ @ Uses slightly fewer instructions to do laned loads than unlaned
@@ -2336,29 +2533,34 @@ index 0000000000..e665bd848a
+
+ bl hevc_loop_filter_luma_body_\bit_depth
+
++ add r6, r4, r1
++ add r2, r0, r1
++ lsl r1, #1
++
++ vpop {d8-d15}
++
+ @ p[1]
+ bmi 1f
+ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1
-+ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1
+ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
-+ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1
++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1
+ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
-+ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1
++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1
+ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
-+ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4]
++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6]
+1:
+ @ q[1]
+ bcs 1f
+ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1
-+ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1
+ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
-+ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1
+ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
-+ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1
+ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
-+ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0]
++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2]
+1:
-+ vpop {d8-d15}
+ pop {r4-r10,pc}
+.endm
+
@@ -2374,7 +2576,7 @@ index 0000000000..e665bd848a
+@
+@ Src should always be on an 8 byte boundary & all in the same slice
+
-+function ff_hevc_rpi_h_loop_filter_luma_neon, export=1
++function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1
+ hevc_loop_filter_luma_start
+ b .Lh_loop_filter_luma_common_8
+endfunc
@@ -2387,71 +2589,75 @@ index 0000000000..e665bd848a
+ ldr r10, [sp, #32]
+
+.Lh_loop_filter_luma_common_8:
++ sub r4, r0, r1, lsl #2
++ add r0, r4, r1
++ lsl r1, #1
+ vpush {d8-d15}
-+ sub r0, r0, r1, lsl #2
+
-+ vld1.8 {d16}, [r0], r1
++ vld1.8 {d16}, [r4], r1
+ vld1.8 {d17}, [r0], r1
-+ vld1.8 {d18}, [r0], r1
++ vld1.8 {d18}, [r4], r1
+ vld1.8 {d19}, [r0], r1
-+ vld1.8 {d20}, [r0], r1
++ vld1.8 {d20}, [r4], r1
+ vld1.8 {d21}, [r0], r1
-+ vld1.8 {d22}, [r0], r1
++ vld1.8 {d22}, [r4]
+ vld1.8 {d23}, [r0]
+
+ bl hevc_loop_filter_luma_body
+
-+ add r2, r0, r1, lsl #2
-+ add r0, r0, r1
-+
++ add r0, r0, r1, lsl #1
++ add r2, r4, r1, lsl #1
++ add r6, r4, r1, asr #1
+ vpop {d8-d15}
+
+ @ P2-P0
+ bcs 1f
-+ vst1.8 {d22}, [r0], r1
-+ vst1.8 {d21}, [r0], r1
-+ vst1.8 {d20}, [r0]
++ vst1.8 {d22}, [r4], r1
++ vst1.8 {d21}, [r6]
++ vst1.8 {d20}, [r4]
+1:
+ @ Q0-Q2
+ bmi 1f
-+ vst1.8 {d19}, [r2], r1
-+ vst1.8 {d18}, [r2], r1
-+ vst1.8 {d17}, [r2]
++ vst1.8 {d19}, [r0], r1
++ vst1.8 {d18}, [r2]
++ vst1.8 {d17}, [r0]
+1:
+ pop {r4-r10,pc}
+endfunc
+
+
+.macro m_filter_h_luma_16 bit_depth
++ sub r4, r0, r1, lsl #2
++ add r0, r4, r1
++ lsl r1, #1
+ vpush {d8-d15}
-+ sub r0, r0, r1, lsl #2
+
-+ vld1.16 { q8}, [r0], r1
++ vld1.16 { q8}, [r4], r1
+ vld1.16 { q9}, [r0], r1
-+ vld1.16 {q10}, [r0], r1
++ vld1.16 {q10}, [r4], r1
+ vld1.16 {q11}, [r0], r1
-+ vld1.16 {q12}, [r0], r1
++ vld1.16 {q12}, [r4], r1
+ vld1.16 {q13}, [r0], r1
-+ vld1.16 {q14}, [r0], r1
++ vld1.16 {q14}, [r4]
+ vld1.16 {q15}, [r0]
+
+ bl hevc_loop_filter_luma_body_\bit_depth
+
-+ add r2, r0, r1, lsl #2
-+ add r0, r1
-+
++ add r0, r0, r1, lsl #1
++ add r2, r4, r1, lsl #1
++ add r6, r4, r1, asr #1
+ vpop {d8-d15}
+
+ @ P2-P0
+ bcs 1f
-+ vst1.16 {q14}, [r0], r1
-+ vst1.16 {q13}, [r0], r1
-+ vst1.16 {q12}, [r0]
++ vst1.16 {q14}, [r4], r1
++ vst1.16 {q13}, [r6]
++ vst1.16 {q12}, [r4]
+1:
+ bmi 1f
-+ vst1.16 {q11}, [r2], r1
-+ vst1.16 {q10}, [r2], r1
-+ vst1.16 { q9}, [r2]
++ vst1.16 {q11}, [r0], r1
++ vst1.16 {q10}, [r2]
++ vst1.16 { q9}, [r0]
+1:
+ pop {r4-r10,pc}
+.endm
@@ -2474,23 +2680,30 @@ index 0000000000..e665bd848a
+@ common in the H direction than V due to how we arrange deblock.
+
+function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1
++ sub r12, r0, r1
+ cmp r2, #0
++ it eq
+ bxeq lr
-+ sub r0, r0, r1, lsl #1
++ vld1.8 {d26,d27}, [r0]
++ lsl r1, #1
++ sub r0, r1
++ vld1.8 {d18,d19}, [r12], r1
+ vld1.8 {d16,d17}, [r0], r1
-+ vld1.8 {d18,d19}, [r0], r1
-+ vld1.8 {d26,d27}, [r0], r1
-+ vld1.8 {d28,d29}, [r0]
-+ sub r0, r0, r1, lsl #1
-+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29
++ vld1.8 {d28,d29}, [r12]
++
++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \
++ "sub r12, r0, r1, asr #1"
+
-+ lsls r2, r3, #31 @ b0 -> N, b1 -> C
-+ vstrpl d18, [r0, #0]
-+ vstrcc d19, [r0, #8]
-+ add r0, r1
+ lsls r3, #29 @ b2 -> N, b3 -> C
++ it pl
+ vstrpl d26, [r0, #0]
++ it cc
+ vstrcc d27, [r0, #8]
++ lsls r3, #2 @ b0 -> N, b1 -> C
++ it pl
++ vstrpl d18, [r12, #0]
++ it cc
++ vstrcc d19, [r12, #8]
+ bx lr
+
+endfunc
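The vstrpl/vstrcc pairs above use the shifted no_f word to skip stores per 8-byte half-row. A hedged C model of that masking follows; the exact bit-to-half mapping is inferred from the lsls comments, so treat it as illustrative rather than authoritative.

    #include <stdint.h>
    #include <string.h>

    /* no_f packs four "do not filter" flags: bits 0/1 gate the two halves of
     * the P0 row, bits 2/3 the two halves of the Q0 row.  A set bit means the
     * filtered result for that half is discarded (its store is skipped). */
    static void store_uv_row_halves(uint8_t *p0_row, uint8_t *q0_row,
                                    const uint8_t filtered_p0[16],
                                    const uint8_t filtered_q0[16],
                                    unsigned no_f)
    {
        if (!(no_f & 1)) memcpy(p0_row,     filtered_p0,     8);
        if (!(no_f & 2)) memcpy(p0_row + 8, filtered_p0 + 8, 8);
        if (!(no_f & 4)) memcpy(q0_row,     filtered_q0,     8);
        if (!(no_f & 8)) memcpy(q0_row + 8, filtered_q0 + 8, 8);
    }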
@@ -2506,37 +2719,43 @@ index 0000000000..e665bd848a
+@ Macro here, actual function near bottom
+
+.macro m_filter_h_uv_16 bit_depth
++ sub r12, r0, r1
+ cmp r2, #0
++ it eq
+ bxeq lr
-+ sub r0, r0, r1, lsl #1
++ vld1.16 {q12, q13}, [r0]
++ lsl r1, #1
++ sub r0, r1
++ vld1.16 {q10, q11}, [r12], r1
+ vld1.16 {q8, q9 }, [r0], r1
-+ vld1.16 {q10, q11}, [r0], r1
-+ vld1.16 {q12, q13}, [r0], r1
-+ vld1.16 {q14, q15}, [r0]
-+ sub r0, r0, r1, lsl #1
++ vld1.16 {q14, q15}, [r12]
+
-+ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth
++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \
++ "sub r12, r0, r1, asr #1", \
++ "cmp r3, #0"
+
-+ cmp r3, #0
+ bne 1f
-+ vst1.16 {q10, q11}, [r0], r1
++ vst1.16 {q10, q11}, [r12]
+ vst1.16 {q12, q13}, [r0]
+ bx lr
+
+ @ At least one no_f bit is set
+ @ Which means we need to break this apart in an ugly fashion
+1:
-+ lsls r2, r3, #31 @ b0 -> N, b1 -> C
-+ vstrpl d20, [r0, #0]
-+ vstrpl d21, [r0, #8]
-+ vstrcc d22, [r0, #16]
-+ vstrcc d23, [r0, #24]
-+ add r0, r1
+ lsls r3, #29 @ b2 -> N, b3 -> C
++ itt pl
+ vstrpl d24, [r0, #0]
+ vstrpl d25, [r0, #8]
++ itt cc
+ vstrcc d26, [r0, #16]
+ vstrcc d27, [r0, #24]
++ lsls r3, #2 @ b0 -> N, b1 -> C
++ itt pl
++ vstrpl d20, [r12, #0]
++ vstrpl d21, [r12, #8]
++ itt cc
++ vstrcc d22, [r12, #16]
++ vstrcc d23, [r12, #24]
+ bx lr
+.endm
+
@@ -2555,7 +2774,9 @@ index 0000000000..e665bd848a
+
+function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1
+ cmp r2, #0
++ it eq
+ bxeq lr
++ push {lr}
+ vld2.16 {d16[0], d18[0]}, [r3], r1
+ vld2.16 {d20[0], d22[0]}, [r0], r1
+
@@ -2570,106 +2791,114 @@ index 0000000000..e665bd848a
+ vld2.16 {d20[3], d22[3]}, [r0], r1
+ blo 10f
+
-+ sub r12, r0, r3
+ vld2.16 {d17[0], d19[0]}, [r3], r1
+ vld2.16 {d21[0], d23[0]}, [r0], r1
+
-+ cmp r12, #4
++ sub ip, r0, r3
+ vld2.16 {d17[1], d19[1]}, [r3], r1
+ vld2.16 {d21[1], d23[1]}, [r0], r1
+
++ cmp ip, #4
+ vld2.16 {d17[2], d19[2]}, [r3], r1
+ vld2.16 {d21[2], d23[2]}, [r0], r1
+
+ vld2.16 {d17[3], d19[3]}, [r3]
+ vld2.16 {d21[3], d23[3]}, [r0]
-+ it eq
-+ ldreq r12, [sp, #0]
+
-+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23
-+ cmp r12, #0
-+ add r3, #2
-+ neg r1, r1
++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \
++ "ldr lr, [sp, #4]", \
++ "neg r1, r1", \
++ "it eq; cmpeq lr, #0", \
++ "add r3, #2", \
++ "add ip, r3, r1", \
++ "add r2, r0, r1", \
++ "lsl r1, #1"
++
+ bne 1f
+
+@ Much/most of the time r0 == r3 + 4 and no_f == 0
+@ so it is worth having this special case
+ vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b
-+ vst2.16 {d19[2], d21[2]}, [r3], r1
++ vst2.16 {d19[2], d21[2]}, [ip], r1
+ vst2.16 {d19[1], d21[1]}, [r3], r1
-+ vst2.16 {d19[0], d21[0]}, [r3], r1
++ vst2.16 {d19[0], d21[0]}, [ip], r1
+ vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a
-+ vst2.16 {d18[2], d20[2]}, [r3], r1
-+ vst2.16 {d18[1], d20[1]}, [r3], r1
-+ vst2.16 {d18[0], d20[0]}, [r3]
-+ bx lr
++ vst2.16 {d18[2], d20[2]}, [ip], r1
++ vst2.16 {d18[1], d20[1]}, [r3]
++ vst2.16 {d18[0], d20[0]}, [ip]
++ pop {pc}
+
+@ Either split or partial
+1:
-+ ldr r12, [sp, #0]
-+ @ I have no idea if this is faster than any of the other ways of
-+ @ testing these bits but it does free up r12
-+ lsl r12, #28
-+ add r2, r0, r1, lsl #2
-+ msr APSR_nzcvq, r12 @ b0 (P0a) -> V, b1 (Q0a) -> C, b2 (P0b) -> Z, b3 (Q0b) -> N
-+ add r12, r3, r1, lsl #2
-+ bmi 1f
++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
++ ittt cs
++ addcs r0, r0, r1, lsl #1
++ addcs r2, r2, r1, lsl #1
++ bcs 1f
+ @ Q0b
+ vst1.16 {d21[3]}, [r0], r1
-+ vst1.16 {d21[2]}, [r0], r1
++ vst1.16 {d21[2]}, [r2], r1
+ vst1.16 {d21[1]}, [r0], r1
-+ vst1.16 {d21[0]}, [r0]
++ vst1.16 {d21[0]}, [r2], r1
+1:
-+ beq 2f
++ ittt mi
++ addmi r3, r3, r1, lsl #1
++ addmi ip, ip, r1, lsl #1
++ bmi 1f
+ @ P0b
+ vst1.16 {d19[3]}, [r3], r1
-+ vst1.16 {d19[2]}, [r3], r1
++ vst1.16 {d19[2]}, [ip], r1
+ vst1.16 {d19[1]}, [r3], r1
-+ vst1.16 {d19[0]}, [r3]
-+
-+2:
-+ bcs 3f
++ vst1.16 {d19[0]}, [ip], r1
++1:
++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
++ bcs 1f
+ @ Q0a
-+ vst1.16 {d20[3]}, [r2], r1
++ vst1.16 {d20[3]}, [r0], r1
+ vst1.16 {d20[2]}, [r2], r1
-+ vst1.16 {d20[1]}, [r2], r1
++ vst1.16 {d20[1]}, [r0]
+ vst1.16 {d20[0]}, [r2]
-+
-+3:
-+ it vs
-+ bxvs lr
-+ vst1.16 {d18[3]}, [r12], r1
-+ vst1.16 {d18[2]}, [r12], r1
-+ vst1.16 {d18[1]}, [r12], r1
-+ vst1.16 {d18[0]}, [r12]
-+ bx lr
++1:
++ it mi
++ popmi {pc}
++ @ P0a
++ vst1.16 {d18[3]}, [r3], r1
++ vst1.16 {d18[2]}, [ip], r1
++ vst1.16 {d18[1]}, [r3]
++ vst1.16 {d18[0]}, [ip]
++ pop {pc}
+
+@ Single lump (rather than double)
+10:
-+ hevc_loop_filter_uv_body1 d16, d18, d20, d22
-+
+ @ As we have post-incremented r0/r3 in the load, the easiest thing to do is
+ @ to subtract and write forwards, rather than backwards (as above)
-+ ldr r12, [sp, #0]
-+ add r3, #2
-+ sub r0, r0, r1, lsl #2
-+ sub r3, r3, r1, lsl #2
-+ lsls r12, #31 @ b0 (P0a) -> N, b1 (Q0a) -> C
++ @ b0 (P0a) -> N, b1 (Q0a) -> C
++
++ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \
++ "ldr lr, [sp, #4]", \
++ "add r3, #2", \
++ "sub r0, r0, r1, lsl #2", \
++ "sub r3, r3, r1, lsl #2", \
++ "lsls lr, #31", \
++ "add r2, r0, r1", \
++ "add ip, r3, r1", \
++ "lsl r1, #1"
+
+ bcs 3f
++ @ Q0a
+ vst1.16 {d20[0]}, [r0], r1
-+ vst1.16 {d20[1]}, [r0], r1
-+ vst1.16 {d20[2]}, [r0], r1
-+ vst1.16 {d20[3]}, [r0]
-+
++ vst1.16 {d20[1]}, [r2], r1
++ vst1.16 {d20[2]}, [r0]
++ vst1.16 {d20[3]}, [r2]
+3:
-+ it mi
-+ bxmi lr
++ it mi
++ popmi {pc}
++ @ P0a
+ vst1.16 {d18[0]}, [r3], r1
-+ vst1.16 {d18[1]}, [r3], r1
-+ vst1.16 {d18[2]}, [r3], r1
-+ vst1.16 {d18[3]}, [r3]
-+ bx lr
++ vst1.16 {d18[1]}, [ip], r1
++ vst1.16 {d18[2]}, [r3]
++ vst1.16 {d18[3]}, [ip]
++ pop {pc}
+
+endfunc
+
@@ -2694,15 +2923,16 @@ index 0000000000..e665bd848a
+
+.macro m_filter_v_uv2_16 bit_depth
+ cmp r2, #0
++ it eq
+ bxeq lr
-+
++ push {lr}
+ vld2.32 {d16[0], d18[0]}, [r3], r1
+ vld2.32 {d20[0], d22[0]}, [r0], r1
+
++ cmp r2, #0x10000
+ vld2.32 {d16[1], d18[1]}, [r3], r1
+ vld2.32 {d20[1], d22[1]}, [r0], r1
+
-+ cmp r2, #0x10000
+ vld2.32 {d17[0], d19[0]}, [r3], r1
+ vld2.32 {d21[0], d23[0]}, [r0], r1
+
@@ -2713,170 +2943,527 @@ index 0000000000..e665bd848a
+ vld2.32 {d24[0], d26[0]}, [r3], r1
+ vld2.32 {d28[0], d30[0]}, [r0], r1
+
++ sub ip, r0, r3
+ vld2.32 {d24[1], d26[1]}, [r3], r1
+ vld2.32 {d28[1], d30[1]}, [r0], r1
-+ sub r12, r0, r3
+
++ cmp ip, #8
+ vld2.32 {d25[0], d27[0]}, [r3], r1
+ vld2.32 {d29[0], d31[0]}, [r0], r1
-+ cmp r12, #8
+
+ vld2.32 {d25[1], d27[1]}, [r3]
+ vld2.32 {d29[1], d31[1]}, [r0]
-+ it eq
-+ ldreq r12, [sp, #0]
+
-+ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth
-+ cmp r12, #0
-+ add r3, #4
-+ neg r1, r1
++ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \
++ "ldr lr, [sp, #4]", \
++ "neg r1, r1", \
++ "it eq; cmpeq lr, #0", \
++ "add r3, #4", \
++ "add ip, r3, r1", \
++ "add r2, r0, r1", \
++ "lsl r1, #1"
++
+ bne 1f
+
-+@ Much/most of the time r0 == r3 + 4 and no_f == 0
++@ Much/most of the time r0 == r3 + 8 and no_f == 0
+@ so it is worth having this special case
-+ vst2.32 {d27[1], d29[1]}, [r3], r1
-+ vst2.32 {d27[0], d29[0]}, [r3], r1
-+ vst2.32 {d26[1], d28[1]}, [r3], r1
-+ vst2.32 {d26[0], d28[0]}, [r3], r1
-+ vst2.32 {d19[1], d21[1]}, [r3], r1
-+ vst2.32 {d19[0], d21[0]}, [r3], r1
-+ vst2.32 {d18[1], d20[1]}, [r3], r1
-+ vst2.32 {d18[0], d20[0]}, [r3]
-+ bx lr
++ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b
++ vst2.32 {d27[0], d29[0]}, [ip], r1
++ vst2.32 {d26[1], d28[1]}, [r3], r1
++ vst2.32 {d26[0], d28[0]}, [ip], r1
++ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a
++ vst2.32 {d19[0], d21[0]}, [ip], r1
++ vst2.32 {d18[1], d20[1]}, [r3]
++ vst2.32 {d18[0], d20[0]}, [ip]
++ pop {pc}
+
+@ Either split or partial
+1:
-+ ldr r12, [sp, #0]
-+ lsls r12, #29 @ b2 (P0b) -> N, b3 (Q0b) -> C
++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
++ ittt cs
++ addcs r0, r0, r1, lsl #1
++ addcs r2, r2, r1, lsl #1
+ bcs 1f
+ @ Q0b
-+ mov r2, r0
-+ vst1.32 {d29[1]}, [r2], r1
++ vst1.32 {d29[1]}, [r0], r1
+ vst1.32 {d29[0]}, [r2], r1
-+ vst1.32 {d28[1]}, [r2], r1
-+ vst1.32 {d28[0]}, [r2]
++ vst1.32 {d28[1]}, [r0], r1
++ vst1.32 {d28[0]}, [r2], r1
+1:
-+ bmi 2f
++ ittt mi
++ addmi r3, r3, r1, lsl #1
++ addmi ip, ip, r1, lsl #1
++ bmi 1f
+ @ P0b
-+ mov r2, r3
-+ vst1.32 {d27[1]}, [r2], r1
-+ vst1.32 {d27[0]}, [r2], r1
-+ vst1.32 {d26[1]}, [r2], r1
-+ vst1.32 {d26[0]}, [r2]
-+
-+2:
-+ lsls r12, #2 @ b0 (P0a) -> N, b1 (Q0a) -> C
-+ bcs 3f
++ vst1.32 {d27[1]}, [r3], r1
++ vst1.32 {d27[0]}, [ip], r1
++ vst1.32 {d26[1]}, [r3], r1
++ vst1.32 {d26[0]}, [ip], r1
++1:
++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
++ bcs 1f
+ @ Q0a
-+ add r0, r0, r1, lsl #2
+ vst1.32 {d21[1]}, [r0], r1
-+ vst1.32 {d21[0]}, [r0], r1
-+ vst1.32 {d20[1]}, [r0], r1
-+ vst1.32 {d20[0]}, [r0]
-+
-+3:
-+ it mi
-+ bxmi lr
++ vst1.32 {d21[0]}, [r2], r1
++ vst1.32 {d20[1]}, [r0]
++ vst1.32 {d20[0]}, [r2]
++1:
++ it mi
++ popmi {pc}
+ @ P0a
-+ add r3, r3, r1, lsl #2
+ vst1.32 {d19[1]}, [r3], r1
-+ vst1.32 {d19[0]}, [r3], r1
-+ vst1.32 {d18[1]}, [r3], r1
-+ vst1.32 {d18[0]}, [r3]
-+ bx lr
-+
++ vst1.32 {d19[0]}, [ip], r1
++ vst1.32 {d18[1]}, [r3]
++ vst1.32 {d18[0]}, [ip]
++ pop {pc}
+
++@ Single lump (rather than double)
+10:
-+ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth
-+
+ @ As we have post-incremented r0/r3 in the load, the easiest thing to do is
+ @ to subtract and write forwards, rather than backwards (as above)
-+ ldr r12, [sp, #0]
-+ add r3, #4
-+ sub r0, r0, r1, lsl #2
-+ sub r3, r3, r1, lsl #2
-+ lsls r12, #31 @ b0 (P0a) -> N, b1 (Q0a) -> C
++ @ b0 (P0a) -> N, b1 (Q0a) -> C
++
++ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \
++ "ldr lr, [sp, #4]", \
++ "add r3, #4", \
++ "sub r0, r0, r1, lsl #2", \
++ "sub r3, r3, r1, lsl #2", \
++ "lsls lr, #31", \
++ "add r2, r0, r1", \
++ "add ip, r3, r1", \
++ "lsl r1, #1"
+
+ bcs 3f
+ @ Q0a
+ vst1.32 {d20[0]}, [r0], r1
-+ vst1.32 {d20[1]}, [r0], r1
-+ vst1.32 {d21[0]}, [r0], r1
-+ vst1.32 {d21[1]}, [r0]
-+
++ vst1.32 {d20[1]}, [r2], r1
++ vst1.32 {d21[0]}, [r0]
++ vst1.32 {d21[1]}, [r2]
+3:
-+ it mi
-+ bxmi lr
++ it mi
++ popmi {pc}
+ @ P0a
+ vst1.32 {d18[0]}, [r3], r1
-+ vst1.32 {d18[1]}, [r3], r1
-+ vst1.32 {d19[0]}, [r3], r1
-+ vst1.32 {d19[1]}, [r3]
-+ bx lr
++ vst1.32 {d18[1]}, [ip], r1
++ vst1.32 {d19[0]}, [r3]
++ vst1.32 {d19[1]}, [ip]
++ pop {pc}
+.endm
+
+
++#if 1 // NEON version
+
+
-+/* ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, int in_i
-+ * int *curr_rpl0, int *curr_
-+ * MvField *curr, MvField *ne
++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh,
++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ * int in_inc)
++ */
++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
++ mov ip, sp
++ push {a2,v1-v8,lr}
++ ldm ip, {v1-v5}
++ cmp a1, #2
++ bls 2f
++ vpush {d8-d13}
++ sub v5, v5, #10
++ mov v6, #32
++1:
++ vld2.32 {d0[0], d2[0]}, [a3]!
++ vld2.32 {d4[0], d6[0]}, [a4]!
++ vmov.u8 q12, #0
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb v8, [a3], #1
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[0]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[0]}, [a4], v5
++ add v8, v2, v8, lsl #2
++ vld1.32 {d16[0]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d20[0]}, [ip]
++ vld1.32 {d18[0]}, [v8]
++ vld1.32 {d22[0]}, [lr]
++
++ vld2.32 {d0[1], d2[1]}, [a3]!
++ vld2.32 {d4[1], d6[1]}, [a4]!
++ ldrb a2, [a3], #1
++ vmov.u16 d12, #1
++ ldrb ip, [a4], #1
++ vmov.u16 d13, #2
++ ldrb v8, [a3], #1
++ vmov.u16 d27, #4
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[2]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[2]}, [a4], v5
++ add v8, v2, v8, lsl #2
++ vld1.32 {d16[1]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d20[1]}, [ip]
++ vld1.32 {d18[1]}, [v8]
++ vld1.32 {d22[1]}, [lr]
++
++ vld2.32 {d1[0], d3[0]}, [a3]!
++ vld2.32 {d5[0], d7[0]}, [a4]!
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb lr, [a4], #1
++ ldrb v8, [a3], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[4]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[4]}, [a4], v5
++ add v8, v2, v8, lsl #2
++ vld1.32 {d17[0]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d21[0]}, [ip]
++ vld1.32 {d19[0]}, [v8]
++ vld1.32 {d23[0]}, [lr]
++
++ vld2.32 {d1[1], d3[1]}, [a3]!
++ vld2.32 {d5[1], d7[1]}, [a4]!
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb v8, [a3], #1
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[6]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[6]}, [a4], v5
++ add v8, v2, v8, lsl #2
++ vld1.32 {d17[1]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d21[1]}, [ip]
++ vld1.32 {d19[1]}, [v8]
++ vld1.32 {d23[1]}, [lr]
++
++ @ So now we have:
++ @ q0.32[i] = curr[i].mv[0]
++ @ q1.32[i] = curr[i].mv[1]
++ @ q2.32[i] = neigh[i].mv[0]
++ @ q3.32[i] = neigh[i].mv[1]
++ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]]
++ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]]
++ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
++ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
++ @ d24.16[i] = curr[i].pred_flag
++ @ d25.16[i] = neigh[i].pred_flag
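Given that register layout, the value computed per 4-sample edge segment is the usual HEVC inter boundary strength. A hedged C restatement is below; it mirrors the scalar fallback further down, only spells out the uni-prediction case (the bi-prediction paths additionally try the cross-list pairing of MVs), and the packed MV layout is an assumption. Note that this routine only ever produces 0 or 1; the intra (bs == 2) case is handled outside it.

    #include <stdint.h>

    /* An MV here is assumed to be a packed 32-bit value: x in the low
     * halfword, y in the high halfword, both in quarter-pel units.  The
     * ssub16 + 0xFFFCFFFC mask in the scalar code is equivalent to this
     * ">= 4 in either component" test. */
    static int mvs_differ(uint32_t a, uint32_t b)
    {
        int dx = (int16_t)a - (int16_t)b;
        int dy = (int16_t)(a >> 16) - (int16_t)(b >> 16);
        return dx >= 4 || dx <= -4 || dy >= 4 || dy <= -4;
    }

    /* Uni-prediction on both sides: bs = 1 if the two PUs reference different
     * pictures (resolved through the rpl tables loaded above) or their MVs
     * differ by an integer sample or more, else 0. */
    static int bs_uni(uint32_t curr_mv, int curr_ref, uint32_t neigh_mv, int neigh_ref)
    {
        if (curr_ref != neigh_ref)
            return 1;
        return mvs_differ(curr_mv, neigh_mv) ? 1 : 0;
    }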
++
++ vtst.16 d28, d24, d12
++ vtst.16 d29, d24, d13
++ vadd.i16 d8, d24, d12
++ vadd.i16 d9, d25, d12
++ vtst.16 d30, d25, d12
++ vtst.16 d31, d25, d13
++ veor d26, d8, d9
++ ldr lr, [sp, 6*8]
++ vmovl.s16 q4, d28
++ vmovl.s16 q5, d29
++ teq lr, #1
++ vmovl.s16 q14, d30
++ it ne
++ lslne v1, lr, #1
++ vmovl.s16 q15, d31
++ it ne
++ rsbne v2, v1, #32
++ vbif q0, q1, q4
++ vbif q2, q3, q14
++ vbif q1, q0, q5
++ vbif q3, q2, q15
++ vabd.s16 q12, q0, q2
++ vabd.s16 q2, q1
++ vabd.s16 q0, q3
++ vabd.s16 q1, q3
++ vbif q8, q9, q4
++ vbif q10, q11, q14
++ vbif q9, q8, q5
++ vbif q11, q10, q15
++ vclt.u16 d6, d24, d27
++ vclt.u16 d8, d2, d27
++ vclt.u16 d7, d25, d27
++ vclt.u16 d9, d3, d27
++ vclt.u16 d2, d0, d27
++ vclt.u16 d0, d4, d27
++ vclt.u16 d3, d1, d27
++ vclt.u16 d1, d5, d27
++ vceq.i32 q12, q10, q8
++ vceq.i32 q10, q9
++ vceq.i32 q8, q11
++ vceq.i32 q9, q11
++ vshrn.i32 d6, q3, #8
++ vshrn.i32 d7, q4, #8
++ vshrn.i32 d8, q1, #8
++ vshrn.i32 d9, q0, #8
++ vmovn.i32 d4, q12
++ vmovn.i32 d2, q10
++ vmovn.i32 d3, q8
++ vmovn.i32 d5, q9
++ vand q2, q3
++ vrev16.8 q3, q3
++ vand q2, q3
++ vand q1, q4
++ vrev16.8 q4, q4
++ vand q1, q4
++ vand d4, d5
++ vand d2, d3
++ vbic d0, d12, d4
++ vshr.u16 d26, #2
++ vbic d0, d2
++ vmov.i16 d1, #0x5555
++ vorr d0, d26
++ bne 10f
++
++ @ Merge results into result word, no duplicates
++ vmov a2, s0
++ vmov v8, s1
++ vmov.u16 ip, d0[1]
++ vmov.u16 lr, d0[3]
++ sub v6, #8
++ lsl a2, #30
++ lsl v8, #30
++ lsl ip, #30
++ lsl lr, #30
++ orr a2, ip, a2, lsr #2
++ orr v8, lr, v8, lsr #2
++ orr a2, v8, a2, lsr #4
++ subs a1, #4
++ orr v7, a2, v7, lsr #8
++ bhi 1b
++
++ vpop {d8-d13}
++ mov a1, v7, lsr v6
++ pop {a2,v1-v8,pc}
++10:
++ @ Merge results into result word, with duplicates
++ vmul.i16 d0, d1
++ vmov a2, s0
++ vmov v8, s1
++ vmov.u16 ip, d0[1]
++ vmov.u16 lr, d0[3]
++ sub v6, v6, v1, lsl #2
++ lsl a2, v2
++ subs a1, #4
++ lsl v8, v2
++ lsl ip, v2
++ lsl lr, v2
++ ldr v2, [sp, #6*8 + 10*4 + 1*4]
++T lsr a2, v1
++T orr a2, ip, a2
++A orr a2, ip, a2, lsr v1
++ lsl ip, v1, #1
++T lsr v8, v1
++T orr v8, lr, v8
++A orr v8, lr, v8, lsr v1
++ lsl lr, v1, #2
++T lsr a2, ip
++T orr a2, v8, a2
++A orr a2, v8, a2, lsr ip
++ ldr v1, [sp, #6*8 + 10*4]
++T lsr v7, lr
++T orr v7, a2, v7
++A orr v7, a2, v7, lsr lr
++ bhi 1b
++
++ vpop {d8-d13}
++ mov a1, v7, lsr v6
++ pop {a2,v1-v8,pc}
++
++
++2:
++ sub v5, v5, #10
++ vmov.u8 d16, #0
++ blo 3f
++ vld2.32 {d0[0], d1[0]}, [a3]!
++ vld2.32 {d2[0], d3[0]}, [a4]!
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb lr, [a4], #1
++ ldrb v8, [a3], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d16[0]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d16[4]}, [a4], v5
++ add v8, v2, v8, lsl #2
++ vld1.32 {d4[0]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d5[0]}, [ip]
++ vld1.32 {d6[0]}, [v8]
++ vld1.32 {d7[0]}, [lr]
++
++3:
++ vld2.32 {d0[1], d1[1]}, [a3]!
++ vld2.32 {d2[1], d3[1]}, [a4]!
++ ldrb a2, [a3], #1
++ vmov.u16 d17, #1
++ ldrb ip, [a4], #1
++ vmov.u16 d18, #2
++ ldrb v8, [a3], #1
++ vmov.u16 d19, #4
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d16[2]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d16[6]}, [a4], v5
++ add v8, v2, v8, lsl #2
++ vld1.32 {d4[1]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d5[1]}, [ip]
++ vld1.32 {d6[1]}, [v8]
++ vld1.32 {d7[1]}, [lr]
++
++ @ So now we have:
++ @ d0.32[i] = curr[i].mv[0]
++ @ d1.32[i] = curr[i].mv[1]
++ @ d2.32[i] = neigh[i].mv[0]
++ @ d3.32[i] = neigh[i].mv[1]
++ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]]
++ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
++ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]]
++ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
++ @ d16.16[i] = curr[i].pred_flag
++ @ d16.16[2+i] = neigh[i].pred_flag
++
++ vtst.16 d20, d16, d17
++ vtst.16 d22, d16, d18
++ vadd.i16 d30, d16, d17
++ vswp d2, d3
++ ldr lr, [sp]
++ vmovl.s16 q10, d20
++ teq lr, #1
++ vmovl.s16 q11, d22
++ it ne
++ lslne v1, lr, #1
++ vbif d0, d1, d20
++ vbif d4, d6, d20
++ vbif d3, d2, d21
++ vbif d5, d7, d21
++ vbif d1, d0, d22
++ vbif d6, d4, d22
++ vbif d2, d3, d23
++ vbif d7, d5, d23
++ vshr.u16 d30, #2
++ vabd.s16 d24, d0, d3
++ vabd.s16 d25, d1, d2
++ vabd.s16 q0, q0, q1
++ vceq.i32 d2, d4, d5
++ vceq.i32 d20, d5, d6
++ vceq.i32 d21, d4, d7
++ vceq.i32 d3, d6, d7
++ vclt.u16 d6, d24, d19
++ vclt.u16 d7, d25, d19
++ vclt.u16 d22, d1, d19
++ vclt.u16 d23, d0, d19
++ vshrn.i32 d6, q3, #8
++ vmovn.i32 d2, q1
++ vshrn.i32 d7, q11, #8
++ vmovn.i32 d3, q10
++ vand q0, q3, q1
++ it ne
++ rsbne v2, v1, #32
++ vrev16.8 q3, q3
++ vand q0, q3
++ vsra.u64 d30, #32
++ vshr.u64 q1, q0, #32
++ vand q0, q1
++ vbic d0, d17, d0
++ vand d30, d30, d17
++ vbic d0, d1
++ vmov.i16 d1, #0x5555
++ vorr d0, d30
++ bne 10f
++
++ @ Construct result word, no duplicates
++ cmp a1, #2
++ vmov.u16 a1, d0[1]
++ vmov.u16 a2, d0[0]
++ it eq
++ orreq a1, a2, a1, lsl #2
++ pop {a2,v1-v8,pc}
++10:
++ @ Construct result word, with duplicates
++ cmp a1, #2
++ vmul.i16 d0, d1
++ vmov a2, s0
++ vmov.u16 a1, d0[1]
++ lsl a2, #16
++ pkhbt a1, a1, a1, lsl #16
++ lsr a2, v2
++ lsr a1, v2
++T itt eq
++T lsleq a1, v1
++T orreq a1, a2, a1
++A orreq a1, a2, a1, lsl v1
++ pop {a2,v1-v8,pc}
++endfunc
++
++
++
++#else // non-NEON version
++
++
++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh,
++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ * int in_inc)
+ */
+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
+ add ip, sp, #4*4
+ push {a2-a4,v1-v8,lr}
-+ ldmia ip, {v5-v7}
++ mov v6, #32
+1: ldmdb ip, {v1-v4}
-+ ldrsb a3, [v5, #8] @ curr->ref_idx
-+ ldrsb v8, [v5, #9]
-+ ldrsb ip, [v6, #8] @ neigh->ref_idx
-+ ldrsb lr, [v6, #9]
-+ ldr v1, [v1, a3, lsl #2]
-+ ldrb a3, [v5, #10] @ curr->pred_flag
++ ldrsb v5, [a3, #8] @ curr->ref_idx
++ ldrsb v8, [a3, #9]
++ ldrsb ip, [a4, #8] @ neigh->ref_idx
++ ldrsb lr, [a4, #9]
++ ldr v1, [v1, v5, lsl #2]
++ ldrb v5, [a3, #10] @ curr->pred_flag
+ ldr v2, [v2, v8, lsl #2]
-+ ldrb v8, [v6, #10] @ neigh->pred_flag
++ ldrb v8, [a4, #10] @ neigh->pred_flag
+ ldr v3, [v3, ip, lsl #2]
+ ldr v4, [v4, lr, lsl #2]
-+ teq a3, #3
++ teq v5, #3
+ beq 20f
+ teq v8, #3
+ beq 90f
+
-+ tst a3, #1
++ tst v5, #1
+ itee ne
-+ ldrne a3, [v5, #0] @ curr->mv[0]
-+ ldreq a3, [v5, #4] @ curr->mv[1]
++ ldrne v5, [a3, #0] @ curr->mv[0]
+ moveq v1, v2
++ ldreq v5, [a3, #4] @ curr->mv[1]
+ tst v8, #1
+ itee ne
-+ ldrne v8, [v6, #0] @ neigh->mv[0]
-+ ldreq v8, [v6, #4] @ neigh->mv[1]
++ ldrne v8, [a4, #0] @ neigh->mv[0]
+ moveq v3, v4
++ ldreq v8, [a4, #4] @ neigh->mv[1]
+ teq v1, v3
+ bne 10f
+ ldr lr, =0xFFFCFFFC
-+ ssub16 ip, v8, a3
-+ ssub16 a3, a3, v8
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 ip, v8, v5
++ ssub16 v5, v5, v8
++ sel v5, v5, ip
++ ands v5, v5, lr
+ @ drop through
+10: it ne
-+ movne a3, #1
-+11: subs a2, a2, #1
-+12:
-+A strbhs a3, [v7], a4
-+T itt hs
-+T strbhs a3, [v7]
-+T addhs v7, v7, a4
++ movne v5, #1<<30
++11:
++ sub v6, v6, #2
++T mov v7, v7, lsr #2
+ subs a2, a2, #1
-+ bhs 12b
++A orr v7, v5, v7, lsr #2
++T orr v7, v5, v7
++ bhi 11b
+
-+ ldm sp, {a2, a3}
++ ldr v5, [sp, #16*4]
+ add ip, sp, #16*4
++ ldr a2, [sp]
+ subs a1, a1, #1
-+ add v5, v5, a3
-+ add v6, v6, a3
++ add a3, a3, v5
++ add a4, a4, v5
+ bhi 1b
++ mov a1, v7, lsr v6
+ pop {a2-a4,v1-v8,pc}
+
+20: teq v8, #3
@@ -2889,43 +3476,43 @@ index 0000000000..e665bd848a
+ teq v1, v2
+ bne 30f
+
-+ ldrd v1, v2, [v5] @ curr->mv
-+ ldrd v3, v4, [v6] @ neigh->mv
++ ldrd v1, v2, [a3] @ curr->mv
++ ldrd v3, v4, [a4] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
-+ ssub16 a3, v1, v3
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 v5, v1, v3
++ sel v5, v5, ip
++ ands v5, v5, lr
+ bne 25f
+ ssub16 ip, v4, v2
-+ ssub16 a3, v2, v4
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 v5, v2, v4
++ sel v5, v5, ip
++ ands v5, v5, lr
+ beq 11b
+ @ drop through
+25: ssub16 ip, v4, v1
-+ ssub16 a3, v1, v4
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 v5, v1, v4
++ sel v5, v5, ip
++ ands v5, v5, lr
+ bne 10b
+ ssub16 ip, v3, v2
-+ ssub16 a3, v2, v3
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 v5, v2, v3
++ sel v5, v5, ip
++ ands v5, v5, lr
+ b 10b
+
-+30: ldrd v1, v2, [v5] @ curr->mv
-+ ldrd v3, v4, [v6] @ neigh->mv
++30: ldrd v1, v2, [a3] @ curr->mv
++ ldrd v3, v4, [a4] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
-+ ssub16 a3, v1, v3
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 v5, v1, v3
++ sel v5, v5, ip
++ ands v5, v5, lr
+ bne 10b
+ ssub16 ip, v4, v2
-+ ssub16 a3, v2, v4
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 v5, v2, v4
++ sel v5, v5, ip
++ ands v5, v5, lr
+ b 10b
+
+40: teq v1, v4
@@ -2933,21 +3520,26 @@ index 0000000000..e665bd848a
+ teqeq v2, v3
+ bne 10b
+
-+ ldrd v1, v2, [v5] @ curr->mv
-+ ldrd v3, v4, [v6] @ neigh->mv
++ ldrd v1, v2, [a3] @ curr->mv
++ ldrd v3, v4, [a4] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ b 25b
+
-+90: mov a3, #1
++90:
++ mov v5, #1<<30
+ b 11b
+endfunc
+
++
++#endif
++
++
+@ =============================================================================
+@
+@ 10 bit
+
+function hevc_loop_filter_luma_body_10
-+ m_filter_luma 10
++ m_filter_luma 10, q11, q15
+endfunc
+
+function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1
@@ -2980,7 +3572,7 @@ index 0000000000..e665bd848a
+ ldr r10, [sp, #32]
+
+.Lv_loop_luma_common_10:
-+ m_filter_v_luma_common_16 10
++ m_filter_v_luma_16 10
+endfunc
+
+function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1
@@ -3220,10 +3812,10 @@ index 0000000000..109fa98c29
+}
diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c
new file mode 100644
-index 0000000000..a721e392ab
+index 0000000000..8a94a644a4
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c
-@@ -0,0 +1,465 @@
+@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi
+ *
@@ -3255,8 +3847,8 @@ index 0000000000..a721e392ab
+// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but
+// have been removed from head as we never use them.
+
-+void ff_hevc_rpi_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+void ff_hevc_rpi_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+
+void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
@@ -3455,9 +4047,10 @@ index 0000000000..a721e392ab
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+
-+void ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
++uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh,
+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ const MvField *curr, const MvField *neigh, uint8_t *bs);
++ int in_inc);
++void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height);
+
+
+static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
@@ -3557,10 +4150,10 @@ index 0000000000..a721e392ab
+av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth)
+{
+ if (bit_depth == 8) {
-+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon;
-+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon;
-+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon;
-+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon;
++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8;
++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8;
++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8;
++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8;
+ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8;
+ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8;
+ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8;
@@ -3688,6 +4281,7 @@ index 0000000000..a721e392ab
+ assert(offsetof(MvField, ref_idx) == 8);
+ assert(offsetof(MvField, pred_flag) == 10);
+ c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon;
++ c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon;
+}
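The offsetof asserts above pin down the MvField fields that the assembly addresses directly ([#0]/[#4] for the two MVs, [#8] for ref_idx, [#10] for pred_flag). A hedged sketch of a layout consistent with them follows; the real struct is defined in the decoder headers and this is only an illustration.

    #include <assert.h>
    #include <stdint.h>
    #include <stddef.h>

    typedef struct MvFieldSketch {
        int16_t mv[2][2];    /* mv[list] = {x, y}: offsets 0 and 4             */
        int8_t  ref_idx[2];  /* offset 8, read with ldrsb in the asm           */
        uint8_t pred_flag;   /* offset 10: bit 0 = list 0 used, bit 1 = list 1 */
    } MvFieldSketch;

    static void check_layout(void)
    {
        /* Mirrors the asserts performed on the real MvField above. */
        assert(offsetof(MvFieldSketch, ref_idx) == 8);
        assert(offsetof(MvFieldSketch, pred_flag) == 10);
    }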
diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
new file mode 100644
@@ -7255,6 +7849,6091 @@ index 0000000000..b56e0f9644
+ edge_64b_bodies edge_64b_body_16, 4
+endfunc
+
+diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h
+new file mode 100644
+index 0000000000..36a23a5bf9
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_arm.h
+@@ -0,0 +1,28 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_HEVCPRED_ARM_H
++#define AVCODEC_ARM_HEVCPRED_ARM_H
++
++#include "libavcodec/rpi_hevcpred.h"
++
++void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth);
++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth);
++
++#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */
++
+diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c
+new file mode 100644
+index 0000000000..80724d4cf3
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_init_arm.c
+@@ -0,0 +1,35 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/cpu.h"
++#include "libavutil/arm/cpu.h"
++
++#include "libavcodec/rpi_hevcpred.h"
++#include "rpi_hevcpred_arm.h"
++
++av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth)
++{
++ int cpu_flags = av_get_cpu_flags();
++
++ if (have_neon(cpu_flags))
++ ff_hevc_rpi_pred_init_neon(c, bit_depth);
++}
++
+diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c
+new file mode 100644
+index 0000000000..21e7700174
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_init_neon.c
+@@ -0,0 +1,210 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "rpi_hevcpred_arm.h"
++
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32;
++
++void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++
++void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++
++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth)
++{
++ switch (bit_depth)
++ {
++ case 8:
++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8;
++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8;
++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8
++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16;
++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16;
++
++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8;
++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8;
++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8;
++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8;
++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8;
++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8;
++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8;
++
++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8;
++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8;
++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8;
++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8;
++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8;
++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8;
++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8;
++
++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8;
++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8;
++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8;
++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8;
++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8;
++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8;
++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8;
++
++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8;
++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8;
++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8;
++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8;
++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8;
++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8;
++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8;
++
++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8;
++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8;
++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8;
++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8;
++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8;
++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8;
++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8;
++ break;
++ case 10:
++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16;
++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16;
++ c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16;
++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32;
++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32;
++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32;
++
++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10;
++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10;
++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10;
++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10;
++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10;
++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10;
++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10;
++
++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10;
++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10;
++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10;
++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10;
++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10;
++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10;
++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10;
++
++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10;
++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10;
++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10;
++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10;
++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10;
++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10;
++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10;
++
++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10;
++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10;
++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10;
++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10;
++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10;
++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10;
++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10;
++
++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10;
++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10;
++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10;
++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10;
++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10;
++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10;
++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10;
++ break;
++ default:
++ break;
++ }
++}
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
+new file mode 100644
+index 0000000000..8063a1521e
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
+@@ -0,0 +1,2373 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/*
++ * General angular pred
++ *
++ * Horizontal (10) & Vertical (26) cases have their own file
++ * and are not dealt with properly here (luma filtering is missing)
++ *
++ * The inv_angle calculations are annoying - if it wasn't for the +128
++ * rounding step then the result would simply be the loop counter :-(
++ */
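++@ Editorial gloss (not from the original author): in outline each output
++@ sample below is a 2-tap blend of two adjacent reference samples,
++@     pred = ((32 - frac) * ref[i] + frac * ref[i + 1] + 16) >> 5
++@ with the fraction accumulated in r6 and the reference position advanced
++@ whenever the accumulator passes 32; a multiply-accumulate forms the
++@ weighted sum and a rounding right shift by 5 completes it.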
++
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++.text
++
++@ Horizontal Patch functions
++@ These need a transpose before store so exist as smaller patches
++@ Patches can be called repeatedly without any intermediate setup
++@ to generate a horizontal block
++@
++@ It is almost certainly the case that larger patch fns can be built
++@ and they would be a little faster, but we would still need the small
++@ fns, and code size (or at least instruction cache size) is an issue
++@ given how much code we already have here
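++@ (Editorial gloss: the transpose mentioned above happens in the store -
++@ store_tran_8x8_8 uses lane-indexed vst4.8 stores, so the eight 8-byte
++@ results accumulated in d0-d7 are written out as rows of the destination.)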
++
++@ Generate 8x8 luma 8 patch
++@
++@ r3 Out stride
++@ r4 Angle add
++@ r7 Inv angle (_up only)
++@
++@ In/Out (updated)
++@ r0 Out pointer - on exit points to the start of the next patch horizontally (i.e. r0 + patch width)
++@ r2 Left ptr - updated
++@ r6 Angle frac (init to r4 + 32)
++@ r8 Inv angle accumulator
++@ d24 Cur Line - load before 1st call for down - set by _up
++@ d16 Cur Line - load before 1st call for up - set by _down
++@
++@ Temps
++@ r5 Loop counter
++@ r12
++@ q0-q3, q14, q15
++
++patch_h_down_8x8_8:
++ mov r5, #8
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.8 d24, d24, #1
++ sub r6, #32
++ vld1.8 {d24[7]}, [r2]!
++
++1:
++ vext.8 q0, q1, #8
++ rsb r12, r6, #32
++ vext.8 q1, q2, #8
++ vdup.8 d30, r6
++ vext.8 q2, q3, #8
++ vdup.8 d31, r12
++ vext.8 q3, q3, #8
++
++ vmull.u8 q14, d24, d30
++ add r6, r4
++ vmlal.u8 q14, d16, d31
++ subs r5, #1
++ vrshrn.u16 d7, q14, #5
++ bne 2b
++
++store_tran_8x8_8:
++ add r12, r0, #4
++ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0 ]
++ add r5, r0, r3
++ vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r12], r3
++ add r0, #8
++ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r5 ], r3
++ vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r12], r3
++ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r5 ], r3
++ vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r12], r3
++ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r5 ], r3
++ vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r12], r3
++ vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r5 ], r3
++ vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r12], r3
++ vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r5 ], r3
++ vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r12], r3
++ vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r5 ], r3
++ vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r12], r3
++ vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r5 ]
++ vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r12]
++ bx lr
++
++
++patch_h_up_8x8_8:
++ mov r5, #8
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ @ r2=left (variable), r1=up (const)
++ adds r8, r7
++ vmov d24, d16
++T itee mi
++ ldrbmi r12, [r2, #-1]!
++T asrpl r12, r8, #8
++T ldrbpl r12, [r1, r12]
++A ldrbpl r12, [r1, r8, asr #8]
++ vext.8 d16, d16, d16, #7
++ sub r6, #32
++ vmov.8 d16[0], r12
++
++1:
++ vdup.8 d31, r6
++ vext.8 q0, q1, #8
++ rsb r12, r6, #32
++ vext.8 q1, q2, #8
++
++ vmull.u8 q14, d16, d31
++ vext.8 q2, q3, #8
++ vdup.8 d30, r12
++ vext.8 q3, q3, #8
++ add r6, r4
++ vmlal.u8 q14, d24, d30
++ subs r5, #1
++ vrshrn.u16 d7, q14, #5
++ bne 2b
++ b store_tran_8x8_8 @ This will return
++
++.macro ADRT reg, val
++@ adr in T32 has enough range but not in A32
++A adrl \reg, \val
++T adr \reg, \val
++.endm
++
++@ ff_hevc_rpi_pred_angular_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_4_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ mov r5, #4 @ Loop counter for all cases
++ add r6, r4, #32 @ Force initial load in main loop
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.8 {d24}, [r2]
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.8 d24, d24, #1
++ sub r6, #32
++1:
++ vext.8 q0, q1, #8
++ rsb r12, r6, #32
++ vext.8 q1, q1, #8
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q14, d24, d30
++ add r6, r4
++ vmlal.u8 q14, d16, d31
++ subs r5, #1
++ vrshrn.u16 d3, q14, #5
++ bne 2b
++
++98:
++ add r12, r0, r3
++ lsl r3, #1
++ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0 ], r3
++ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r12], r3
++ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r0 ]
++ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r12]
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ vld1.32 {d16[0]}, [r2]
++ sub r8, r7
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ @ r2=left (variable), r1=up (const)
++ adds r8, r7
++ vmov d24, d16
++T itee mi
++ ldrbmi r12, [r2, #-1]!
++T asrpl r12, r8, #8
++T ldrbpl r12, [r1, r12]
++A ldrbpl r12, [r1, r8, asr #8]
++ vext.8 d16, d16, d16, #7
++ sub r6, #32
++ vmov.8 d16[0], r12
++1:
++ vdup.8 d31, r6
++ vext.8 q0, q1, #8
++ rsb r12, r6, #32
++ vext.8 q1, q2, #8
++
++ vmull.u8 q14, d16, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmlal.u8 q14, d24, d30
++ subs r5, #1
++ vrshrn.u16 d3, q14, #5
++ bne 2b
++ b 98b
++
++18:
++ cmp r12, #26
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.32 {d16[0]}, [r1 :32] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++T asr r12, r8, #8
++T ldrb r12, [r2, r12]
++A ldrb r12, [r2, r8, asr #8]
++
++ vmov d24, d16
++ add r8, r7
++ sub r6, #32
++ vext.8 d16, d16, #7
++ vmov.8 d16[0], r12
++
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmlal.u8 q0, d24, d30
++ vrshrn.u16 d0, q0, #5
++
++ subs r5, #1
++ vst1.32 {d0[0]}, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {d24}, [r1] @ Up + up-right, may be on 32-bit align rather than 64
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.8 d24, d24, #1
++ sub r6, #32
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmlal.u8 q0, d16, d31
++ vrshrn.u16 d0, q0, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.32 {d0[0]}, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++
++@ ff_hevc_rpi_pred_angular_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_8_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ add r6, r4, #32 @ Force initial load in main loop
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.8 {d24}, [r2]!
++ bl patch_h_down_8x8_8
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ vld1.8 {d16}, [r2]
++ add r6, r4, #32
++ sub r8, r7
++ bl patch_h_up_8x8_8
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #8 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.8 {d16}, [r1 :64] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++T asr r12, r8, #8
++T ldrb r12, [r2, r12]
++A ldrb r12, [r2, r8, asr #8]
++
++ vmov d24, d16
++ add r8, r7
++ sub r6, #32
++ vext.8 d16, d16, #7
++ vmov.8 d16[0], r12
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmlal.u8 q0, d24, d30
++ vrshrn.u16 d0, q0, #5
++
++ subs r5, #1
++ vst1.8 {d0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {d24, d25}, [r1 :64]! @ Up + UR
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.8 q12, q12, #1
++ sub r6, #32
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmlal.u8 q0, d16, d31
++ vrshrn.u16 d0, q0, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.8 {d0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_16_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ add r6, r4, #32 @ Force initial load in main loop
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.8 {d24}, [r2]!
++ mov r1, r2 @ save r2 - r1 unused by patch_down
++
++ bl patch_h_down_8x8_8
++ bl patch_h_down_8x8_8
++
++ mov r2, r1 @ restore r2
++ sub r0, #16
++ add r6, r4, #32 @ Force initial load in main loop
++ vld1.8 {d24}, [r2]!
++ add r0, r0, r3, lsl #3
++
++ bl patch_h_down_8x8_8
++ bl patch_h_down_8x8_8
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ vld1.8 {d16}, [r2]
++ sub r8, r7
++
++ push {r2, r8}
++ bl patch_h_up_8x8_8
++ bl patch_h_up_8x8_8
++ pop {r2, r8}
++
++ sub r0, #16
++ add r6, r4, #32
++ add r2, r2, #8
++ sub r8, r8, r7, lsl #3
++ add r0, r0, r3, lsl #3
++ vld1.8 {d16}, [r2]
++
++ bl patch_h_up_8x8_8
++ bl patch_h_up_8x8_8
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #16 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.8 {q8 }, [r1 :128] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++T asr r12, r8, #8
++T ldrb r12, [r2, r12]
++A ldrb r12, [r2, r8, asr #8]
++
++ vmov q12, q8
++ add r8, r7
++ sub r6, #32
++ vext.8 q8, q8, q8, #15
++ vmov.8 d16[0], r12
++
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vmull.u8 q1, d17, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmlal.u8 q0, d24, d30
++ vmlal.u8 q1, d25, d30
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++
++ subs r5, #1
++ vst1.8 {q0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q12}, [r1 :128]! @ Up
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vext.8 q12, q12, #1
++ sub r6, #32
++ vld1.8 {d25[7]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmull.u8 q1, d25, d30
++ vmlal.u8 q0, d16, d31
++ vmlal.u8 q1, d17, d31
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.8 {q0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_32_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r10, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ bge 18f
++
++ cmp r12, #10
++ mov r10, #4 @ Outer loop counter for "hard" cases
++ bge 10f
++
++@ Down of Horizontal - works down left
++ mov r1, r2
++2:
++ vld1.8 {d24}, [r1]!
++ add r6, r4, #32 @ Force initial load in main loop
++ mov r2, r1
++
++ bl patch_h_down_8x8_8
++ bl patch_h_down_8x8_8
++ bl patch_h_down_8x8_8
++ bl patch_h_down_8x8_8
++
++ sub r0, #32
++ subs r10, #1
++ add r0, r0, r3, lsl #3
++ bne 2b
++ pop {r4-r10, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++2:
++ vld1.8 {d16}, [r2]
++ add r6, r4, #32
++
++ push {r2, r8}
++ bl patch_h_up_8x8_8
++ bl patch_h_up_8x8_8
++ bl patch_h_up_8x8_8
++ bl patch_h_up_8x8_8
++ pop {r2, r8}
++
++ sub r0, #32
++ subs r10, #1
++ add r2, r2, #8
++ sub r8, r8, r7, lsl #3
++ add r0, r0, r3, lsl #3
++ bne 2b
++ pop {r4-r10, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #32 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.8 {q8, q9 }, [r1 :128] @ Up
++ ldrh r7, [r7]
++ add r6, r4, #32
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++T asr r12, r8, #8
++T ldrb r12, [r2, r12]
++A ldrb r12, [r2, r8, asr #8]
++
++ vmov q12, q8
++ add r8, r7
++ vmov q13, q9
++ sub r6, #32
++ vext.8 q9, q8, q9, #15
++ vext.8 q8, q8, q8, #15
++ vmov.8 d16[0], r12
++
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vmull.u8 q1, d17, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmull.u8 q2, d18, d31
++ vmull.u8 q3, d19, d31
++ vmlal.u8 q0, d24, d30
++ vmlal.u8 q1, d25, d30
++ vmlal.u8 q2, d26, d30
++ vmlal.u8 q3, d27, d30
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vrshrn.u16 d2, q2, #5
++ vrshrn.u16 d3, q3, #5
++
++ subs r5, #1
++ vst1.8 {q0, q1 }, [r0], r3
++ bne 2b
++ pop {r4-r10, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q12, q13}, [r1 :128]! @ Up
++ add r6, r4, #32 @ Force initial load in main loop
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vmov q9, q13
++ vext.8 q12, q13, #1
++ vext.8 q13, q13, #1
++ sub r6, #32
++ vld1.8 {d27[7]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmull.u8 q1, d25, d30
++ vmull.u8 q2, d26, d30
++ vmull.u8 q3, d27, d30
++ vmlal.u8 q0, d16, d31
++ vmlal.u8 q1, d17, d31
++ vmlal.u8 q2, d18, d31
++ vmlal.u8 q3, d19, d31
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vrshrn.u16 d2, q2, #5
++ vrshrn.u16 d3, q3, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.8 {q0, q1 }, [r0], r3
++ bne 2b
++ pop {r4-r10, pc}
++
++endfunc
++
++@ Chroma 8 bit 4x4 patch fns
++ .text
++
++patch_h_down_c_4x4_8:
++ mov r5, #4
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.16 d24, d24, #1
++ sub r6, #32
++ vld1.16 {d24[3]}, [r2]!
++
++1:
++ vext.8 q0, q1, #8
++ rsb r12, r6, #32
++ vext.8 q1, q1, #8
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q14, d24, d30
++ add r6, r4
++ vmlal.u8 q14, d16, d31
++ subs r5, #1
++ vrshrn.u16 d3, q14, #5
++ bne 2b
++
++store_tran_c_4x4_8:
++ add r12, r0, r3
++ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0 ]!
++ add r5, r12, r3
++ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12]
++ add r12, r12, r3, lsl #1
++ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r5 ]
++ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12]
++ bx lr
++
++patch_h_up_c_4x4_8:
++ mov r5, #4
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ If r8 is -ve then we are still tracking left
++ adds r8, r7
++ vmov d24, d16
++ @ Initially r2=left (variable), r1=up (const)
++ @ Use r2 for both up and left; we only ever go from left->up, so
++ @ we assume that we are left and then overwrite with up if wanted
++ sub r2, #2
++ it pl
++ addpl r2, r1, r8, asr #7
++ vext.16 d16, d16, d16, #3
++ @ We get *2 by >> 7 rather than 8, but that means we need to lose bit 0
++ and r2, #~1
++ sub r6, #32
++ vld1.16 d16[0], [r2]
++1:
++ vdup.8 d31, r6
++ vext.8 q0, q1, #8
++ rsb r12, r6, #32
++ vext.8 q1, q1, #8
++
++ vmull.u8 q14, d16, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmlal.u8 q14, d24, d30
++ subs r5, #1
++ vrshrn.u16 d3, q14, #5
++ bne 2b
++ b store_tran_c_4x4_8 @ This will return
++
++
++@ ff_hevc_rpi_pred_angular_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++ lsl r3, #1
++
++ cmp r12, #18
++ add r6, r4, #32 @ Force initial load in main loop
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.8 {d24}, [r2]!
++ bl patch_h_down_c_4x4_8
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++ vld1.8 {d16}, [r2]
++ bl patch_h_up_c_4x4_8
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #4 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.8 {d16}, [r1 :64] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ asr r12, r8, #8
++ vmov d24, d16
++ add r8, r7
++ vext.16 d16, d16, #3
++ add r12, r2, r12, lsl #1
++ sub r6, #32
++ vld1.16 {d16[0]}, [r12]
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmlal.u8 q0, d24, d30
++ vrshrn.u16 d0, q0, #5
++
++ subs r5, #1
++ vst1.8 {d0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q12}, [r1] @ Up + UR (only 64-bit aligned)
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vext.16 q12, q12, #1
++ sub r6, #32
++
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmlal.u8 q0, d16, d31
++
++ vrshrn.u16 d0, q0, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.8 {d0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++ lsl r3, #1
++
++ cmp r12, #18
++ add r6, r4, #32
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.8 {d24}, [r2]!
++ mov r1, r2
++
++ bl patch_h_down_c_4x4_8
++ bl patch_h_down_c_4x4_8
++
++ sub r0, #16
++ add r0, r0, r3, lsl #2
++ vld1.8 {d24}, [r1]!
++ add r6, r4, #32 @ Force initial load in main loop
++ mov r2, r1
++
++ bl patch_h_down_c_4x4_8
++ bl patch_h_down_c_4x4_8
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++ vld1.8 {d16}, [r2]
++
++ push {r2, r8}
++ bl patch_h_up_c_4x4_8
++ bl patch_h_up_c_4x4_8
++ pop {r2, r8}
++
++ add r2, r2, #8
++ sub r0, #16
++ sub r8, r8, r7, lsl #2
++ vld1.8 {d16}, [r2]
++ add r0, r0, r3, lsl #2
++ add r6, r4, #32
++ bl patch_h_up_c_4x4_8
++ bl patch_h_up_c_4x4_8
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #8 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.8 {q8 }, [r1 :128] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ asr r12, r8, #8
++ vmov q12, q8
++ add r8, r7
++ vext.16 q8, q8, #7
++ add r12, r2, r12, lsl #1
++ sub r6, #32
++ vld1.16 {d16[0]}, [r12]
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vdup.8 d30, r12
++ vmull.u8 q1, d17, d31
++ add r6, r4
++ vmlal.u8 q0, d24, d30
++ vmlal.u8 q1, d25, d30
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++
++ subs r5, #1
++ vst1.8 {q0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q12}, [r1 :128]! @ Up
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vext.16 q12, q12, #1
++ sub r6, #32
++ vld1.16 {d25[3]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmull.u8 q1, d25, d30
++ vmlal.u8 q0, d16, d31
++ vmlal.u8 q1, d17, d31
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.8 {q0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r10, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++ lsl r3, #1
++
++ cmp r12, #18
++ bge 18f
++
++ cmp r12, #10
++ mov r10, #4 @ Outer loop counter for "hard" cases
++ bge 10f
++
++@ Down of Horizontal - works down left
++ mov r1, r2
++2:
++ vld1.8 {d24}, [r1]!
++ add r6, r4, #32 @ Force initial load in main loop
++ mov r2, r1
++
++ bl patch_h_down_c_4x4_8
++ bl patch_h_down_c_4x4_8
++ bl patch_h_down_c_4x4_8
++ bl patch_h_down_c_4x4_8
++
++ sub r0, #32
++ subs r10, #1
++ add r0, r0, r3, lsl #2
++ bne 2b
++ pop {r4-r10, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++2:
++ vld1.8 {d16}, [r2]
++ add r6, r4, #32
++
++ push {r2, r8}
++ bl patch_h_up_c_4x4_8
++ bl patch_h_up_c_4x4_8
++ bl patch_h_up_c_4x4_8
++ bl patch_h_up_c_4x4_8
++ pop {r2, r8}
++
++ sub r0, #32
++ subs r10, #1
++ add r2, r2, #8
++ sub r8, r8, r7, lsl #2
++ add r0, r0, r3, lsl #2
++ bne 2b
++ pop {r4-r10, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #16 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.8 {q8, q9 }, [r1 :128] @ Up
++ ldrh r7, [r7]
++ add r6, r4, #32
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ asr r9, r8, #8
++ vmov q12, q8
++ add r8, r7
++ vmov q13, q9
++ add r9, r2, r9, lsl #1
++ vext.16 q9, q8, q9, #7
++ sub r6, #32
++ vext.16 q8, q8, q8, #7
++ vld1.16 {d16[0]}, [r9]
++
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vmull.u8 q1, d17, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmull.u8 q2, d18, d31
++ vmull.u8 q3, d19, d31
++ vmlal.u8 q0, d24, d30
++ vmlal.u8 q1, d25, d30
++ vmlal.u8 q2, d26, d30
++ vmlal.u8 q3, d27, d30
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vrshrn.u16 d2, q2, #5
++ vrshrn.u16 d3, q3, #5
++
++ subs r5, #1
++ vst1.8 {q0, q1 }, [r0], r3
++ bne 2b
++ pop {r4-r10, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q12, q13}, [r1 :128]! @ Up
++ add r6, r4, #32 @ Force initial load in main loop
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vmov q9, q13
++ vext.16 q12, q13, #1
++ vext.16 q13, q13, #1
++ sub r6, #32
++ vld1.16 {d27[3]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmull.u8 q1, d25, d30
++ vmull.u8 q2, d26, d30
++ vmull.u8 q3, d27, d30
++ vmlal.u8 q0, d16, d31
++ vmlal.u8 q1, d17, d31
++ vmlal.u8 q2, d18, d31
++ vmlal.u8 q3, d19, d31
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vrshrn.u16 d2, q2, #5
++ vrshrn.u16 d3, q3, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.8 {q0, q1 }, [r0], r3
++ bne 2b
++ pop {r4-r10, pc}
++
++endfunc
++
++@------------------------------------------------------------------------------
++@ Data
++
++ .text
++ .balign 64
++angle_2:
++ .byte 32
++ .byte 26, 21, 17, 13, 9, 5, 2, 0
++ @ Sign inverted from the standard's table
++ .byte 2, 5, 9, 13, 17, 21, 26, 32
++ .byte 26, 21, 17, 13, 9, 5, 2, 0
++ @ Standard sign
++ .byte 2, 5, 9, 13, 17, 21, 26, 32
++
++ @ Sign inverted from the standard's table
++inv_angle:
++ .short 4096, 1638, 910, 630, 482, 390, 315
++ .short 256
++ .short 315, 390, 482, 630, 910, 1638, 4096
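++ @ (Editorial gloss: these values appear to be round(8192 / angle), i.e. the
++ @ HEVC invAngle entries 256*32/intraPredAngle - e.g. 8192/26 = 315.08 -> 315
++ @ and 8192/2 = 4096 - with the sign applied in code rather than in the table.)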
++
++@------------------------------------------------------------------------------
++@
++@ 10 bit fns
++@ Should work for 9 & 11 bit as there is no actual bit-depth specific code
++@ but runs out of register width for 12+ bit
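++@ (Editorial gloss: the two weights always sum to 32, so an interpolated
++@ intermediate is at most sample * 32; an 11-bit sample gives 2047 * 32 =
++@ 65504, which still fits u16, while a 12-bit sample would need 4095 * 32 =
++@ 131040 and overflow it.)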
++
++ .text
++ .balign 64
++
++patch_h_down_4x4_10:
++ mov r5, #4
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.16 d24, d24, #1
++ sub r6, #32
++ vld1.16 {d24[3]}, [r2]!
++
++1:
++ rsb r12, r6, #32
++ vext.16 q1, q2, #4
++ vmov s0, r6
++ vmov s1, r12
++ vext.16 q2, q2, #4
++
++ vmul.u16 d1, d24, d0[0]
++ add r6, r4
++ vmla.u16 d1, d16, d0[2]
++ subs r5, #1
++ vrshr.u16 d5, d1, #5
++ bne 2b
++
++store_tran_4x4_10:
++ add r12, r0, r3
++ vst4.16 {d2[0], d3[0], d4[0], d5[0]}, [r0 ]!
++ add r5, r12, r3
++ vst4.16 {d2[1], d3[1], d4[1], d5[1]}, [r12]
++ add r12, r12, r3, lsl #1
++ vst4.16 {d2[2], d3[2], d4[2], d5[2]}, [r5 ]
++ vst4.16 {d2[3], d3[3], d4[3], d5[3]}, [r12]
++ bx lr
++
++patch_h_up_4x4_10:
++ mov r5, #4
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ If r8 is -ve then we are still tracking left
++ adds r8, r7
++ vmov d24, d16
++ @ Initially r2=left (variable), r1=up (const)
++ @ Use r2 for both up and left; we only ever go from left->up, so
++ @ we assume that we are left and then overwrite with up if wanted
++ sub r2, #2
++ it pl
++ addpl r2, r1, r8, asr #7
++ vext.16 d16, d16, d16, #3
++ @ We get *2 by >> 7 rather than 8, but that means we need to lose bit 0
++ and r2, #~1
++ sub r6, #32
++ vld1.16 d16[0], [r2]
++
++1:
++ rsb r12, r6, #32
++ vext.16 q1, q2, #4
++ vmov s0, r6
++ vmov s1, r12
++ vext.16 q2, q2, #4
++
++ vmul.u16 d1, d24, d0[2]
++ add r6, r4
++ vmla.u16 d1, d16, d0[0]
++ subs r5, #1
++ vrshr.u16 d5, d1, #5
++ bne 2b
++ b store_tran_4x4_10 @ This will return
++
++
++@ ff_hevc_rpi_pred_angular_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_4_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ lsl r3, #1
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ add r6, r4, #32 @ Force initial load in main loop
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.16 {d24}, [r2]!
++ bl patch_h_down_4x4_10
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++ vld1.16 {d16}, [r2]
++ bl patch_h_up_4x4_10
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #4 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.16 {d16}, [r1] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ asr r12, r8, #8
++ vmov d24, d16
++ add r8, r7
++ add r12, r2, r12, lsl #1
++ sub r6, #32
++ vext.16 d16, d16, #3
++ vld1.16 {d16[0]}, [r12]
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 d2, d16, d0[2]
++ vmla.u16 d2, d24, d0[0]
++ vrshr.u16 d2, #5
++
++ subs r5, #1
++ vst1.16 {d2 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {d24, d25}, [r1 :64] @ Up + UR (64bit aligned)
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.16 q12, q13, #1
++ sub r6, #32
++
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 d2, d24, d0[0]
++ vmla.u16 d2, d16, d0[2]
++ vrshr.u16 d2, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.16 {d2 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_8_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ lsl r3, #1
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ add r6, r4, #32
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.16 {d24}, [r2]!
++ mov r1, r2
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10
++
++ vld1.16 {d24}, [r1]!
++ sub r0, #16
++ add r6, r4, #32 @ Force initial load in main loop
++ add r0, r0, r3, lsl #2
++ mov r2, r1
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++ vld1.16 {d16}, [r2]
++
++ push {r2, r8}
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10
++ pop {r2, r8}
++
++ sub r0, #16
++ add r2, #8
++ sub r8, r8, r7, lsl #2
++ add r0, r0, r3, lsl #2
++ vld1.16 {d16}, [r2]
++ add r6, r4, #32
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #8 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.16 {q8 }, [r1] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ asr r12, r8, #8
++ vmov q12, q8
++ add r8, r7
++ add r12, r2, r12, lsl #1
++ sub r6, #32
++ vext.16 q8, q8, q8, #7
++ vld1.16 {d16[0]}, [r12]
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 q1, q8, d0[2]
++ vmla.u16 q1, q12, d0[0]
++ vrshr.u16 q1, #5
++
++ subs r5, #1
++ vst1.16 {q1 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {q12, q13}, [r1 :128] @ Up + UR
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vext.16 q12, q13, #1
++ sub r6, #32
++ vext.16 q13, q13, #1
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 q1, q12, d0[0]
++ vmla.u16 q1, q8, d0[2]
++ vrshr.u16 q1, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.16 {q1 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_16_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r10, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ lsl r3, #1
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ bge 18f
++
++ cmp r12, #10
++ mov r10, #4 @ Outer loop counter for "hard" cases
++ bge 10f
++
++@ Down of Horizontal - works down left
++ mov r1, r2
++2:
++ vld1.16 {d24}, [r1]!
++ add r6, r4, #32 @ Force initial load in main loop
++ mov r2, r1
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10
++
++ sub r0, #32
++ subs r10, #1
++ add r0, r0, r3, lsl #2
++ bne 2b
++ pop {r4-r10, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++2:
++ vld1.16 {d16}, [r2]
++ add r6, r4, #32
++
++ push {r2, r8}
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10
++ pop {r2, r8}
++
++ sub r0, #32
++ subs r10, #1
++ add r2, #8
++ sub r8, r8, r7, lsl #2
++ add r0, r0, r3, lsl #2
++ bne 2b
++ pop {r4-r10, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #16 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.16 {q8, q9}, [r1] @ Up
++ ldrh r7, [r7]
++ add r6, r4, #32
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ asr r9, r8, #8
++ vmov q12, q8
++ add r8, r7
++ vmov q13, q9
++ add r9, r2, r9, lsl #1
++ sub r6, #32
++ vext.16 q9, q8, q9, #7
++ vext.16 q8, q8, q8, #7
++ vld1.16 {d16[0]}, [r9]
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 q1, q8, d0[2]
++ vmul.u16 q2, q9, d0[2]
++ vmla.u16 q1, q12, d0[0]
++ vmla.u16 q2, q13, d0[0]
++
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++
++ subs r5, #1
++ vst1.16 {q1, q2 }, [r0], r3
++ bne 2b
++ pop {r4-r10, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {q12, q13}, [r1 :128]! @ Up
++ add r6, r4, #32 @ Force initial load in main loop
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vmov q9, q13
++ vext.16 q12, q13, #1
++ vext.16 q13, q13, #1
++ sub r6, #32
++ vld1.16 {d27[3]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 q1, q12, d0[0]
++ vmul.u16 q2, q13, d0[0]
++ vmla.u16 q1, q8, d0[2]
++ vmla.u16 q2, q9, d0[2]
++
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.16 {q1, q2 }, [r0], r3
++ bne 2b
++ pop {r4-r10, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_32_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r10, lr}
++ vpush {q4 }
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ lsl r3, #1
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ bge 18f
++
++ cmp r12, #10
++ mov r10, #8 @ Outer loop counter for "hard" cases
++ bge 10f
++
++@ Down of Horizontal - works down left
++ mov r1, r2
++2:
++ vld1.16 {d24}, [r1]!
++ add r6, r4, #32 @ Force initial load in main loop
++ mov r2, r1
++ mov r9, #4
++1:
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10
++ subs r9, #1
++ bne 1b
++
++ sub r0, #64
++ subs r10, #1
++ add r0, r0, r3, lsl #2
++ bne 2b
++ b 99f
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++2:
++ vld1.16 {d16}, [r2]
++ add r6, r4, #32
++
++ push {r2, r8}
++ mov r9, #4
++1:
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10
++ subs r9, #1
++ bne 1b
++ pop {r2, r8}
++
++ sub r0, #64
++ subs r10, #1
++ add r2, #8
++ sub r8, r8, r7, lsl #2
++ add r0, r0, r3, lsl #2
++ bne 2b
++ b 99f
++
++18:
++ cmp r12, #26
++ mov r5, #32 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vldm r1, {q8-q11} @ Up
++ ldrh r7, [r7]
++ add r6, r4, #32
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ asr r9, r8, #8
++ vmov q12, q8
++ add r8, r7
++ vmov q13, q9
++ add r9, r2, r9, lsl #1
++ vmov q14, q10
++ vmov q15, q11
++ sub r6, #32
++ vext.16 q11, q10, q11, #7
++ vext.16 q10, q9, q10, #7
++ vext.16 q9, q8, q9, #7
++ vext.16 q8, q8, q8, #7
++ vld1.16 {d16[0]}, [r9]
++
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 q1, q8, d0[2]
++ vmul.u16 q2, q9, d0[2]
++ vmul.u16 q3, q10, d0[2]
++ vmul.u16 q4, q11, d0[2]
++ vmla.u16 q1, q12, d0[0]
++ vmla.u16 q2, q13, d0[0]
++ vmla.u16 q3, q14, d0[0]
++ vmla.u16 q4, q15, d0[0]
++
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++ vrshr.u16 q3, #5
++ vrshr.u16 q4, #5
++
++ subs r5, #1
++ vstm r0, {q1-q4}
++ add r0, r3
++ bne 2b
++ b 99f
++
++@ Right of vertical - works along top - left unused
++26:
++ vldm r1, {q12-q15} @ Up
++ add r6, r4, #32 @ Force initial load in main loop
++ add r1, #64
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vmov q9, q13
++ vmov q10, q14
++ vmov q11, q15
++ vext.16 q12, q13, #1
++ vext.16 q13, q14, #1
++ vext.16 q14, q15, #1
++ vext.16 q15, q15, #1
++ sub r6, #32
++ vld1.16 {d31[3]}, [r1]!
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 q1, q12, d0[0]
++ vmul.u16 q2, q13, d0[0]
++ vmul.u16 q3, q14, d0[0]
++ vmul.u16 q4, q15, d0[0]
++ vmla.u16 q1, q8, d0[2]
++ vmla.u16 q2, q9, d0[2]
++ vmla.u16 q3, q10, d0[2]
++ vmla.u16 q4, q11, d0[2]
++
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++ vrshr.u16 q3, #5
++ vrshr.u16 q4, #5
++
++ add r6, r4
++ subs r5, #1
++ vstm r0, {q1-q4}
++ add r0, r3
++ bne 2b
++99:
++ vpop {q4 }
++ pop {r4-r10, pc}
++
++endfunc
++
++
++
++@ Generate 4x4 chroma patch
++@
++@ In (const)
++@ r1 Up ptr (_up only)
++@ r3 Out stride
++@ r4 Angle add
++@ r7 Inv angle (_up only)
++@
++@ In/Out (updated)
++@ r0 Out pointer - on exit points to the start of the next patch horizontally (i.e. r0 + patch width)
++@ r2 Left ptr - updated
++@ r6 Angle frac (init to r4 + 32)
++@ r8 Inv angle accumulator
++@ q2 Cur Line - load before 1st call for down - set by _up
++@ q8 Cur Line - load before 1st call for up - set by _down
++@
++@ Temps
++@ r5 Loop counter
++@ r12
++@ d0, q1, q12-q15
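++@
++@ (Editorial gloss: at 10 bits a chroma pel is an interleaved U,V pair of two
++@ 16-bit values, i.e. 32 bits, which is why these patches shift and load the
++@ reference data with .32-element vext/vld1 operations.)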
++
++patch_h_down_c_4x4_10:
++ mov r5, #4
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q2
++ vext.32 q2, q2, #1
++ sub r6, #32
++ vld1.32 {d5[1]}, [r2]!
++1:
++ rsb r12, r6, #32
++ vmov q12, q13
++ vmov s0, r6
++ vmov s1, r12
++ vmov q13, q14
++
++ vmul.u16 q3, q2, d0[0]
++ add r6, r4
++ vmla.u16 q3, q8, d0[2]
++ vmov q14, q15
++ subs r5, #1
++ vrshr.u16 q15, q3, #5
++ bne 2b
++
++store_tran_c_4x4_10:
++ add r12, r0, r3
++ vst4.32 {d24[0], d26[0], d28[0], d30[0]}, [r0 ]!
++ add r5, r12, r3
++ vst4.32 {d24[1], d26[1], d28[1], d30[1]}, [r12]
++ add r12, r12, r3, lsl #1
++ vst4.32 {d25[0], d27[0], d29[0], d31[0]}, [r5 ]
++ vst4.32 {d25[1], d27[1], d29[1], d31[1]}, [r12]
++ bx lr
++
++patch_h_up_c_4x4_10:
++ mov r5, #4
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ If r8 is -ve then we are still tracking left
++ adds r8, r7
++ vmov q2, q8
++ @ Initially r2=left (variable), r1=up (const)
++ @ Use r2 for both up and left; we only ever go from left->up, so
++ @ we assume that we are left and then overwrite with up if wanted
++ sub r2, #4
++ it pl
++ addpl r2, r1, r8, asr #6
++ vext.32 q8, q8, #3
++ @ We get *4 by >> 6 rather than 8, but that means we need to lose bits 0 & 1
++ and r2, #~3
++ sub r6, #32
++ vld1.32 d16[0], [r2]
++1:
++ rsb r12, r6, #32
++ vmov q12, q13
++ vmov s0, r6
++ vmov s1, r12
++ vmov q13, q14
++
++ vmul.u16 q1, q2, d0[2]
++ add r6, r4
++ vmla.u16 q1, q8, d0[0]
++ vmov q14, q15
++ subs r5, #1
++ vrshr.u16 q15, q1, #5
++ bne 2b
++ b store_tran_c_4x4_10 @ This will return
++
++
++
++@ ff_hevc_rpi_pred_angular_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ lsl r3, #2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ add r6, r4, #32
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.32 {q2 }, [r2]!
++ bl patch_h_down_c_4x4_10
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++ vld1.32 {q8 }, [r2]
++ bl patch_h_up_c_4x4_10
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #4 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.16 {q8 }, [r1] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ asr r12, r8, #8
++ vmov q12, q8
++ add r8, r7
++ vext.32 q8, q8, q8, #3
++ add r12, r2, r12, lsl #2
++ sub r6, #32
++ vld1.32 {d16[0]}, [r12]
++
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 q1, q8, d0[2]
++ vmla.u16 q1, q12, d0[0]
++ vrshr.u16 q1, #5
++
++ subs r5, #1
++ vst1.16 {q1 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {q12, q13}, [r1] @ Up + UR
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vext.32 q12, q13, #1
++ vext.32 q13, q13, #1
++ sub r6, #32
++
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 q1, q12, d0[0]
++ vmla.u16 q1, q8, d0[2]
++ vrshr.u16 q1, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.16 {q1 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ lsl r3, #2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ add r6, r4, #32 @ Force initial load in main loop
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.32 {q2 }, [r2]!
++ mov r1, r2
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10
++
++ vld1.32 {q2 }, [r1]!
++ sub r0, #32
++ add r6, r4, #32 @ Force initial load in main loop
++ add r0, r0, r3, lsl #2
++ mov r2, r1
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++ vld1.32 {q8 }, [r2]
++
++ push {r2, r8}
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10
++ pop {r2, r8}
++
++ sub r0, #32
++ add r2, #16
++ sub r8, r8, r7, lsl #2
++ add r0, r0, r3, lsl #2
++ vld1.32 {q8 }, [r2]
++ add r6, r4, #32
++
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #8 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.16 {q8, q9 }, [r1] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q12, q8
++ asr r12, r8, #8
++ vmov q13, q9
++ add r8, r7
++ vext.32 q9, q8, q9, #3
++ add r12, r2, r12, lsl #2
++ vext.32 q8, q8, q8, #3
++ sub r6, #32
++ vld1.32 {d16[0]}, [r12]
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 q1, q8, d0[2]
++ vmul.u16 q2, q9, d0[2]
++ vmla.u16 q1, q12, d0[0]
++ vmla.u16 q2, q13, d0[0]
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++
++ subs r5, #1
++ vst1.16 {q1, q2 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {q12, q13}, [r1]! @ Up
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vmov q9, q13
++ vext.32 q12, q13, #1
++ vext.32 q13, q14, #1
++ sub r6, #32
++ vld1.32 {d27[1]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 q1, q12, d0[0]
++ vmul.u16 q2, q13, d0[0]
++ vmla.u16 q1, q8, d0[2]
++ vmla.u16 q2, q9, d0[2]
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.16 {q1, q2 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r10, lr}
++ vpush {q4 }
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ lsl r3, #2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ bge 18f
++
++ cmp r12, #10
++ mov r10, #4 @ Outer loop counter for "hard" cases
++ bge 10f
++
++@ Down of Horizontal - works down left
++ mov r1, r2
++2:
++ vld1.32 {q2 }, [r1]!
++ add r6, r4, #32 @ Force initial load in main loop
++ mov r2, r1
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10
++
++ sub r0, #64
++ subs r10, #1
++ add r0, r0, r3, lsl #2
++ bne 2b
++ b 99f
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++2:
++ vld1.32 {q8 }, [r2]
++ add r6, r4, #32
++
++ push {r2, r8}
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10
++ pop {r2, r8}
++
++ sub r0, #64
++ subs r10, #1
++ add r2, #16
++ sub r8, r8, r7, lsl #2
++ add r0, r0, r3, lsl #2
++ bne 2b
++ b 99f
++
++18:
++ cmp r12, #26
++ mov r5, #16 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vldm r1, {q8-q11} @ Up
++ ldrh r7, [r7]
++ add r6, r4, #32
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ asr r9, r8, #8
++ vmov q12, q8
++ add r8, r7
++ vmov q13, q9
++ add r9, r2, r9, lsl #2
++ vmov q14, q10
++ vmov q15, q11
++ vext.32 q11, q10, q11, #3
++ vext.32 q10, q9, q10, #3
++ vext.32 q9, q8, q9, #3
++ vext.32 q8, q8, q8, #3
++ sub r6, #32
++ vld1.32 {d16[0]}, [r9]
++
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 q1, q8, d0[2]
++ vmul.u16 q2, q9, d0[2]
++ vmul.u16 q3, q10, d0[2]
++ vmul.u16 q4, q11, d0[2]
++ vmla.u16 q1, q12, d0[0]
++ vmla.u16 q2, q13, d0[0]
++ vmla.u16 q3, q14, d0[0]
++ vmla.u16 q4, q15, d0[0]
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++ vrshr.u16 q3, #5
++ vrshr.u16 q4, #5
++
++ subs r5, #1
++ vstm r0, {q1-q4}
++ add r0, r3
++ bne 2b
++ b 99f
++
++@ Right of vertical - works along top - left unused
++26:
++ vldm r1, {q12-q15} @ Up
++ add r6, r4, #32 @ Force initial load in main loop
++ add r1, #64
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vmov q9, q13
++ vmov q10, q14
++ vmov q11, q15
++ vext.32 q12, q13, #1
++ vext.32 q13, q14, #1
++ vext.32 q14, q15, #1
++ vext.32 q15, q15, #1
++ sub r6, #32
++ vld1.32 {d31[1]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 q1, q12, d0[0]
++ vmul.u16 q2, q13, d0[0]
++ vmul.u16 q3, q14, d0[0]
++ vmul.u16 q4, q15, d0[0]
++ vmla.u16 q1, q8, d0[2]
++ vmla.u16 q2, q9, d0[2]
++ vmla.u16 q3, q10, d0[2]
++ vmla.u16 q4, q11, d0[2]
++
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++ vrshr.u16 q3, #5
++ vrshr.u16 q4, #5
++
++ add r6, r4
++ subs r5, #1
++ vstm r0, {q1-q4}
++ add r0, r3
++ bne 2b
++99:
++ vpop {q4 }
++ pop {r4-r10, pc}
++
++endfunc
++
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
+new file mode 100644
+index 0000000000..75a1789c25
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
+@@ -0,0 +1,695 @@
++/*
++ * Copyright (c) 2017 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++
++@ ff_hevc_rpi_pred_dc_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_4_neon_8, export=1
++
++ @ Average the els of top & left
++ ldr r2, [r2]
++ vld1.32 {d0[0]}, [r1]
++ mov r1, #2
++ vmov s1, r2
++ vmov s2, r2
++ vmov.i16 q2, #3
++ add r2, r0, r3
++ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0]
++ lsl r3, #1
++ vmovl.u8 q0, d0
++ vmov.i64 d7, #0xffff
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same)
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
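++ @ (Editorial example: with dc = 100 and top[i] = 120 the smoothed top value
++ @ is (120 + 3*100 + 2) >> 2 = 105, i.e. a rounded 1:3 blend of the edge
++ @ sample with the dc value.)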
++
++ vmov.i64 d7, #0xff
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #3
++ vmla.i16 q0, q2, d6[0]
++ vdup.8 d6, d6[0]
++ vrshrn.i16 d0, q0, #2
++
++ @ Store top line
++ vst1.32 {d0[0]}, [r0], r3
++
++ @ Store the rest
++ vshr.u64 d1, d0, #5*8
++ vshr.u64 d2, d0, #6*8
++ vshr.u64 d3, d0, #7*8
++ vbif d1, d6, d7
++ vbif d2, d6, d7
++ vst1.32 {d1[0]}, [r2], r3
++ vbif d3, d6, d7
++ vst1.32 {d2[0]}, [r0]
++ vst1.32 {d3[0]}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {d0}, [r1]
++ vld1.8 {d1}, [r2]
++A add r2, r0, r3, lsl #1
++A lsl r3, #2
++T lsl r3, #1
++T add r2, r0, r3
++T lsl r3, #1
++ vaddl.u8 q0, d0, d1
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d2, d0, d0 @ This adds U & V separately
++ vpadd.i32 d3, d0, d0
++ vrshrn.u16 d0, q1, #3
++
++ @ Store
++ vst1.8 {d0}, [r0], r3
++ vst1.8 {d0}, [r2], r3
++ vst1.8 {d0}, [r0]
++ vst1.8 {d0}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_8_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {d0}, [r1]
++ mov r1, #2
++ vld1.8 {d16}, [r2]
++ vmov.i16 q2, #3
++ vmov.i64 d7, #0xffff
++ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0]
++ vmovl.u8 q0, d0
++ vadd.i16 d6, d2, d3 @ d6 has 4 vals
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vmov.i64 d7, #0xff
++ vmovl.u8 q1, d16
++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #4
++ vmla.i16 q1, q2, d6[0]
++ vmla.i16 q0, q2, d6[0]
++ vdup.8 d6, d6[0]
++ vrshrn.i16 d2, q1, #2
++ vrshrn.i16 d0, q0, #2
++
++ @ Store top line
++ vst1.8 {d0}, [r0], r3
++
++ @ Store the rest
++ vshr.u64 d2, #8
++ vbit d6, d2, d7
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ mov r1, #6
++1:
++ vbit d6, d2, d7
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ subs r1, #2
++ vbit d6, d2, d7
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0}, [r1]
++ mov r1, #8
++ vld1.8 {q1}, [r2]
++T lsl r3, #1
++ vaddl.u8 q0, d0, d1
++A add r2, r0, r3, lsl #1
++A lsl r3, #2
++T add r2, r0, r3
++T lsl r3, #1
++ vaddl.u8 q1, d2, d3
++ vadd.i16 q1, q0
++ vadd.i16 d3, d2 @ d3 has 2 val pairs
++ vpadd.i32 d2, d3, d3 @ This adds U & V separately
++ vpadd.i32 d3, d3, d3
++ vrshrn.u16 d0, q1, #4
++ vrshrn.u16 d1, q1, #4
++
++ @ Store
++1:
++ vst1.8 {q0}, [r0], r3
++ subs r1, #4
++ vst1.8 {q0}, [r2], r3
++ vst1.8 {q0}, [r0], r3
++ vst1.8 {q0}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_16_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q8}, [r1]
++ mov r1, #2
++ vld1.8 {q9}, [r2]
++ vaddl.u8 q10, d16, d17
++ vaddl.u8 q11, d16, d18
++ vaddl.u8 q0, d18, d19
++ vmov.i16 q1, #3
++ vadd.i16 q10, q0
++ vmovl.u8 q0, d18
++ vadd.i16 d20, d21
++ vmov.i16 d2[0], r1 @ 2, 3, 3, 3...
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vmovl.u8 q2, d16
++ vmovl.u8 q9, d19
++ vpadd.i16 d20, d20 @ 2 (top & bottom of vector the same)
++ vmov.i64 d7, #0xffff
++ vmovl.u8 q8, d17
++ vbit d4, d22, d7 @ q2 = top[0]+left[0], top[1..7]
++ vmov.i64 d7, #0xff
++ vpadd.i16 d20, d20 @ 1 (all the same)
++ vrshr.u16 d21, d20, #5
++ vrshr.u16 d20, d20, #5
++ vmla.i16 q0, q10, d2[1]
++ vmla.i16 q9, q10, d2[1]
++ vmla.i16 q2, q10, q1
++ vmla.i16 q8, q10, d2[1]
++ vdup.8 q1, d20[0]
++ vrshrn.i16 d0, q0, #2
++ vrshrn.i16 d1, q9, #2
++ vrshrn.i16 d4, q2, #2
++ vrshrn.i16 d5, q8, #2
++ vext.8 q0, q0, q0, #1
++
++ @ Store top line
++ vst1.8 {q2}, [r0], r3
++
++ @ Store the rest
++ mov r1, #15
++1:
++ vbit d2, d0, d7
++ vext.8 q0, q0, q0, #1
++ subs r1, #1
++ vst1.8 {q1}, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0-q1}, [r1]
++ mov r1, #16
++ vld1.8 {q2-q3}, [r2]
++T lsl r3, #1
++ vaddl.u8 q0, d0, d1
++A add r2, r0, r3, lsl #1
++T add r2, r0, r3
++ vaddl.u8 q1, d2, d3
++A lsl r3, #2
++T lsl r3, #1
++ vaddl.u8 q2, d4, d5
++ vaddl.u8 q3, d6, d7
++ vadd.i16 q0, q1
++ vadd.i16 q2, q3
++ vadd.i16 q0, q2
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d4, d0, d0 @ This adds U & V separately
++ vpadd.i32 d5, d0, d0
++ vrshrn.u16 d0, q2, #5
++ vrshrn.u16 d1, q2, #5
++ vrshrn.u16 d2, q2, #5
++ vrshrn.u16 d3, q2, #5
++
++ @ Store
++1:
++ vst1.8 {q0-q1}, [r0], r3
++ subs r1, #2
++ vst1.8 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_32_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0-q1}, [r1]
++ mov r1, #32
++ vld1.8 {q2-q3}, [r2]
++ add r2, r0, r3
++ vaddl.u8 q0, d0, d1
++ lsl r3, #1
++ vaddl.u8 q1, d2, d3
++ vaddl.u8 q2, d4, d5
++ vaddl.u8 q3, d6, d7
++ vadd.i16 q0, q1
++ vadd.i16 q2, q3
++ vadd.i16 q0, q2
++ vadd.i16 d0, d1 @ d0 has 4 vals
++ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
++ vpadd.i16 d4, d0, d0 @ 1 (all the same)
++ vpadd.i16 d5, d0, d0
++ vrshrn.u16 d0, q2, #6
++ vrshrn.u16 d1, q2, #6
++ vrshrn.u16 d2, q2, #6
++ vrshrn.u16 d3, q2, #6
++
++ @ Store
++1:
++ vst1.8 {q0-q1}, [r0], r3
++ subs r1, #2
++ vst1.8 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ -----------------------------------------------------------------------------
++@
++@ 10 Bit versions
++@
++@ There is no actual bit depth dependency in this code except that at
++@ higher bit depths our intermediate results would overflow the 16 bits
++@ they are stored in. All these functions are good to 10 bits - with the
++@ worst case being in dc_32 where we use all 16 bits.
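++@
++@ For reference, the unfiltered DC value the functions below produce is
++@   dc = ( sum(top[0..nTbS-1]) + sum(left[0..nTbS-1]) + nTbS ) >> ( Log2(nTbS) + 1 )
++@ with the 4, 8 and 16 luma cases then smoothing the top row and left
++@ column as described in the per-function comments.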
++
++
++@ ff_hevc_rpi_pred_dc_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_4_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {d0}, [r1]
++ mov r1, #2
++ vld1.16 {d1}, [r2]
++T lsl r3, #1
++ vmov.i16 q2, #3
++A add r2, r0, r3, lsl #1
++T add r2, r0, r3
++ vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0]
++A lsl r3, #2
++T lsl r3, #1
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vmov.i64 d7, #0xffff
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #3
++ vmla.i16 q0, q2, d6[0]
++ vrshr.u16 q0, #2
++
++ @ Store top line
++ vst1.16 {d0}, [r0], r3
++
++ @ Store the rest
++ vshr.u64 d3, d1, #1*16
++ vshr.u64 d4, d1, #2*16
++ vshr.u64 d5, d1, #3*16
++ vbif d3, d6, d7
++ vbif d4, d6, d7
++ vst1.16 {d3}, [r2], r3
++ vbif d5, d6, d7
++ vst1.16 {d4}, [r0]
++ vst1.16 {d5}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0}, [r1]
++ vld1.8 {q1}, [r2]
++A add r2, r0, r3, lsl #2
++A lsl r3, #3
++T lsl r3, #2
++T add r2, r0, r3
++T lsl r3, #1
++ vadd.i16 q0, q1
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d2, d0, d0 @ This adds U & V separately
++ vpadd.i32 d3, d0, d0
++ vrshr.u16 q0, q1, #3
++
++ vst1.16 {q0}, [r0], r3
++ vst1.16 {q0}, [r2], r3
++ vst1.16 {q0}, [r0]
++ vst1.16 {q0}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_8_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {q0}, [r1]
++ mov r1, #2
++ vld1.16 {q8}, [r2]
++T lsl r3, #1
++ vmov.i16 q2, #3
++A add r2, r0, r3, lsl #1
++T add r2, r0, r3
++ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0]
++A lsl r3, #2
++T lsl r3, #1
++ vmov.i64 d7, #0xffff
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vadd.i16 d6, d2, d3 @ d6 has 4 vals
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #4
++ vmla.i16 q8, q2, d6[0]
++ vmla.i16 q0, q2, d6[0]
++ vdup.16 q2, d6[0]
++ vdup.16 q9, d6[0]
++ vrshr.u16 q8, q8, #2
++ vrshr.u16 q0, q0, #2
++ vext.16 q1, q8, q8, #1
++
++ @ Store top line
++ vst1.16 {q0}, [r0], r3
++
++ @ Store the rest
++ vbit d18, d2, d7
++ vst1.16 {q9}, [r2], r3
++ mov r1, #6
++1:
++ vext.16 q8, q8, q8, #2
++ subs r1, #2
++ vext.16 q1, q1, q1, #2
++ vbit d4, d16, d7
++ vst1.16 {q2}, [r0], r3
++ vbit d18, d2, d7
++ vst1.16 {q9}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {q0-q1}, [r1]
++ mov r1, #8
++ vld1.16 {q2-q3}, [r2]
++T lsl r3, #2
++ vadd.i16 q1, q0
++A add r2, r0, r3, lsl #2
++A lsl r3, #3
++T add r2, r0, r3
++T lsl r3, #1
++ vadd.i16 q2, q3
++ vadd.i16 q1, q2
++ vadd.i16 d3, d2 @ d3 has 2 val pairs
++        vpadd.i32  d2, d3, d3         @ This adds U & V separately
++ vpadd.i32 d3, d3, d3
++ vrshr.u16 q0, q1, #4
++ vrshr.u16 q1, q1, #4
++
++ @ Store
++1:
++ vst1.8 {q0-q1}, [r0], r3
++ subs r1, #2
++ vst1.8 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_16_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {q8-q9}, [r1]
++ mov r1, #2
++ vld1.16 {q10-q11}, [r2]
++ lsl r3, #1 @ stride given in pels
++ vadd.i16 q0, q8, q9
++ vadd.i16 q1, q10, q11
++ vmov.i16 q3, #3
++ vadd.i16 q1, q0
++ vadd.i16 d0, d16, d20
++ vmov.i64 d31, #0xffff
++ vadd.i16 d3, d2
++ vmov.16 d6[0], r1 @ 2, 3, 3, 3...
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++        @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7]
++ vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d3, d3 @ 1 (all the same)
++ vrshr.u16 d2, d3, #5
++ vrshr.u16 d3, d3, #5
++ vmov q0, q1
++ vmla.i16 q10, q1, d6[1]
++ vmla.i16 q11, q1, d6[1]
++ vmla.i16 q8, q1, q3
++ vmla.i16 q9, q1, d6[1]
++ vrshr.u16 q2, q10, #2
++ vrshr.u16 q3, q11, #2
++ vrshr.u16 q8, #2
++ vrshr.u16 q9, #2
++ vext.16 q2, q2, q2, #1
++ mov r1, #7<<29
++
++ @ Store top line
++ vst1.16 {q8-q9}, [r0], r3
++
++ @ Store the rest
++1:
++ vbit d0, d4, d31
++ vext.16 q2, q2, q2, #1
++ subs r1, #1<<29
++ vst1.16 {q0-q1}, [r0], r3
++ bne 1b
++1:
++ vbit d0, d6, d31
++ vext.16 q3, q3, q3, #1
++ subs r1, #1<<29
++ vst1.16 {q0-q1}, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1
++
++ @ Average the els of top & left
++ vldm r1, {q0-q3}
++ vldm r2, {q8-q11}
++ vadd.i16 q0, q1
++ mov r1, #16
++ vadd.i16 q2, q3
++ add r2, r0, #32
++ vadd.i16 q8, q9
++ lsl r3, #2
++ vadd.i16 q10, q11
++ vadd.u16 q0, q2
++ vadd.u16 q8, q10
++ vadd.i16 q0, q8
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d4, d0, d0 @ This adds U & V separately
++ vpadd.i32 d5, d0, d0
++ vrshr.u16 q0, q2, #5
++ vrshr.u16 q1, q2, #5
++
++ @ Store
++1:
++ vst1.16 {q0-q1}, [r0], r3
++ subs r1, #1
++ vst1.16 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels)
++
++function ff_hevc_rpi_pred_dc_32_neon_10, export=1
++
++ @ Average the els of top & left
++ @ With 10 bits we are (just) safe from overflow in i16
++ vldm r1, {q0-q3}
++ vldm r2, {q8-q11}
++ vadd.i16 q0, q1
++ mov r1, #32
++ vadd.i16 q2, q3
++ add r2, r0, #32
++ vadd.i16 q8, q9
++ lsl r3, #1
++ vadd.i16 q10, q11
++ vadd.u16 q0, q2
++ vadd.u16 q8, q10
++ vadd.i16 q0, q8
++ vadd.i16 d0, d1 @ d0 has 4 vals
++ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
++ vpadd.i16 d4, d0, d0 @ 1 (all the same)
++ vpadd.i16 d5, d0, d0
++ vrshr.u16 q0, q2, #6
++ vrshr.u16 q1, q2, #6
++
++ @ Store
++1:
++ vst1.16 {q0-q1}, [r0], r3
++ subs r1, #1
++ vst1.16 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
+new file mode 100644
+index 0000000000..11773f918e
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
+@@ -0,0 +1,878 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ All functions have the call
++@
++@ int ff_hevc_rpi_intra_filter_N_neon_PW(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (bytes)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++@
++@ Assumptions:
++@ (these wouldn't apply to all frame layouts but do apply to sand, so beware
++@ if reusing this code)
++@
++@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for
++@ N==4, but do for chroma N>=8. As we share Y/C fns that means we can ignore
++@ N==8,PW=8 (chroma always PW>8) but have to cope with larger
++@
++@ We always have at least 64 pixel H frame width rounding - this lets us
++@ load UR without having to worry about exactly how many pixels are actually
++@ within the frame. As partial loads will only occur very occasionally this
++@ should be a win in nearly all cases.
++@
++@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters
++@ so we do no maths on the contents
++@
++@ No filtering in 32bit fns as they are chroma only
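++@
++@ Roughly, in C-like terms (illustrative only, names as in the comments
++@ above rather than any real header), each function:
++@  - picks a fill value for any of DL/L/UL/U/UR that is required but not
++@    available, propagating from the nearest neighbour that is available
++@  - gathers left[0..2N-1], top[0..2N-1] and the up-left pel into the
++@    output buffers
++@  - if FILTER_LIGHT is requested applies the [1 2 1]/4 smoothing
++@      left'[y] = (left[y-1] + 2*left[y] + left[y+1] + 2) >> 2
++@      top'[x]  = (top[x-1]  + 2*top[x]  + top[x+1]  + 2) >> 2
++@      ul'      = (left[0]   + 2*ul      + top[0]    + 2) >> 2
++@    leaving the final left and top pels unfiltered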
++
++
++.equ AVAIL_UR, 1
++.equ AVAIL_U, 2
++.equ AVAIL_UL, 4
++.equ AVAIL_L, 8
++.equ AVAIL_DL, 16
++
++.equ FILTER_LIGHT, 0x40
++.equ FILTER_STRONG, 0x80
++
++.equ AVAIL_S_UR_N_U_C, 32 - 1
++.equ AVAIL_S_U_N_UL_C, 32 - 2
++.equ AVAIL_S_UL_N_L_C, 32 - 3
++.equ AVAIL_S_L_N_DL_C, 32 - 4
++
++.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr
++
++@ On entry
++@ r2 req
++@ r3 avail
++@ [sp, #sp_offset...] args
++@
++@ On Exit:
++@
++@ Extend values:
++@ d_l scalar contains value for L & DL
++@ d_ul scalar containing value for UL
++@ d_u scalar containing value for U
++@ d_ur scalar containing value for UR
++@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else...
++@ This means that L-filter works even if nreq DL (we never filter
++@ req-DL without req-L, but we do filter req-L without req-DL)
++@ If UR avail then d_ur == a_ur so U-filter good too
++@
++@ Data load pointers (only load if req & avail):
++@ r4 DL
++@ r10 L
++@ r6 U
++@ r5 UR
++@
++@ Others:
++@ r2 req
++@ r7 req & avail
++@ r3 L + stride
++@ r8 DL + stride
++@ r9 stride * 2
++@ cs Load U
++@ mi Load UR
++@
++@ Clobbered:
++@ r12
++
++.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur
++
++.equ src_l\@, \sp_offset + 0
++.equ src_u\@, \sp_offset + 4
++.equ src_ur\@, \sp_offset + 8
++.equ stride\@, \sp_offset + 12
++.equ pw\@, (1 << \pw_s) @ pel width in bytes
++.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes
++
++@ r9 stride
++@ r7 = ab_ul, r6 = a_u, r5 = a_ur
++@ r4 = b_dl, r10 = b_l, r8 = b_u
++
++ ldr r5, [sp, #src_ur\@]
++ lsl r12, r3, #AVAIL_S_U_DL_CPSR
++ ldr r10, [sp, #src_l\@]
++ ldr r9, [sp, #stride\@]
++ ldr r6, [sp, #src_u\@]
++
++ @ This is quite a slow instruction but it replaces
++ @ a decent number of tests that yield a max of 2 flags/op
++ @ It is annoying we can't branch on Q!
++ @ If L navail (ne) then DL must be navail (pl)
++ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur
++
++ mov r4, r5
++ sub r7, r10, r9
++ it vs
++ movvs r4, r6
++ add r8, r6, #b_size\@ - pw\@
++ it cs
++ movcs r4, r7
++ ite ne
++ movne r10, r4
++ addeq r4, r7, r9, lsl #\log2_s
++ it cc
++ movcc r7, r10
++ it mi
++ addmi r4, r10, r9, lsl #\log2_s
++ vld1.\d_type {\d_ul}, [r7]
++ itt vc
++ movvc r8, r7
++ movvc r6, r7
++ vld1.\d_type {\d_l }, [r4]
++ tst r3, #AVAIL_UR
++ vld1.\d_type {\d_u }, [r6]
++ it eq
++ moveq r5, r8
++ and r7, r2, r3
++ add r8, r4, r9
++ vld1.\d_type {\d_ur}, [r5]
++ lsls r12, r7, #AVAIL_S_UR_N_U_C
++ add r3, r10, r9
++ lsl r9, #1
++.endm
++
++
++
++@ int ff_hevc_rpi_intra_filter_4_neon_8(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (bytes)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4
++.set pw_s, 0
++.set pw, (1 << pw_s)
++.set log2_s, 2
++
++function ff_hevc_rpi_intra_filter_4_neon_8, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[]
++
++ it cs
++ vldrcs s2, [r6]
++ ite pl
++ vmovpl s3, s4
++ vldrmi s3, [r5]
++
++ lsls r7, #AVAIL_S_L_N_DL_C
++ add r12, r0, #-pw
++ bpl 1f
++
++ vld1.8 {d0[0]}, [r10], r9
++ vld1.8 {d0[1]}, [r3], r9
++ vld1.8 {d0[2]}, [r10]
++ vld1.8 {d0[3]}, [r3]
++1:
++ bcc 1f
++ vld1.8 {d0[4]}, [r4], r9
++ vld1.8 {d0[5]}, [r8], r9
++ vld1.8 {d0[6]}, [r4]
++ vld1.8 {d0[7]}, [r8]
++1:
++ vstr d1, [r1] @ Up
++ vst1.8 {d31[7]}, [r12]
++ vstr d0, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_4_neon_16(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (bytes)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4
++.set pw_s, 1
++.set pw, (1 << pw_s)
++.set log2_s, 2
++
++function ff_hevc_rpi_intra_filter_4_neon_16, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[]
++
++ it cs
++ vldrcs d2, [r6]
++ it mi
++ vldrmi d3, [r5]
++ lsls r7, #AVAIL_S_L_N_DL_C
++ add r12, r0, #-pw
++ bpl 1f
++ vld1.16 {d0[0]}, [r10], r9
++ vld1.16 {d0[1]}, [r3], r9
++ vld1.16 {d0[2]}, [r10]
++ vld1.16 {d0[3]}, [r3]
++1:
++ bcc 1f
++ vld1.16 {d1[0]}, [r4], r9
++ vld1.16 {d1[1]}, [r8], r9
++ vld1.16 {d1[2]}, [r4]
++ vld1.16 {d1[3]}, [r8]
++1:
++ vst1.16 {q1}, [r1] @ Up
++ vst1.16 {d31[3]}, [r12]
++ vst1.16 {q0}, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_8_neon_8(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (bytes)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4
++.set pw_s, 0
++.set pw, (1 << pw_s)
++.set log2_s, 3
++
++function ff_hevc_rpi_intra_filter_8_neon_8, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[]
++
++ it cs
++ vldrcs d4, [r6]
++ it mi
++ vldrmi d5, [r5]
++
++ lsls r7, #AVAIL_S_L_N_DL_C
++ bpl 1f
++ vld1.8 {d0[0]}, [r10], r9
++ vld1.8 {d0[1]}, [r3], r9
++ vld1.8 {d0[2]}, [r10], r9
++ vld1.8 {d0[3]}, [r3], r9
++ vld1.8 {d0[4]}, [r10], r9
++ vld1.8 {d0[5]}, [r3], r9
++ vld1.8 {d0[6]}, [r10]
++ vld1.8 {d0[7]}, [r3]
++1:
++ bcc 1f
++ vld1.8 {d1[0]}, [r4], r9
++ vld1.8 {d1[1]}, [r8], r9
++ vld1.8 {d1[2]}, [r4], r9
++ vld1.8 {d1[3]}, [r8], r9
++ vld1.8 {d1[4]}, [r4], r9
++ vld1.8 {d1[5]}, [r8], r9
++ vld1.8 {d1[6]}, [r4]
++ vld1.8 {d1[7]}, [r8]
++1:
++ tst r2, #FILTER_LIGHT
++ add r12, r0, #-pw
++ beq 10f
++
++ @ Luma light filter
++ vext.8 q8, q15, q2, #15
++ vext.8 q12, q15, q0, #15
++ vaddl.u8 q9, d17, d5
++ vaddl.u8 q8, d16, d4
++ vaddl.u8 q13, d25, d1
++ vaddl.u8 q12, d24, d0
++ vmov.u8 r3, d5[7] @ Save final pel
++ vmov.u8 r2, d1[7] @ Save final pel
++
++ vext.16 q2, q8, q9, #1
++ vext.16 q3, q9, q9, #1
++ vext.16 q0, q12, q13, #1
++ vext.16 q1, q13, q13, #1
++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0]
++ vadd.u16 q2, q8
++ vadd.u16 q3, q9
++ vadd.u16 q0, q12
++ vadd.u16 q1, q13
++
++ vrshrn.u16 d4, q2, #2
++ vrshrn.u16 d5, q3, #2
++ vrshrn.u16 d0, q0, #2
++ vrshrn.u16 d1, q1, #2
++ vrshr.u16 d30, #2
++ vmov.u8 d5[7], r3 @ Restore final pel
++ vmov.u8 d1[7], r2 @ Restore final pel
++        vdup.u8   d31, d30[0]           @ d31[7] = d30[0]
++
++10:
++ vst1.8 {q2 }, [r1] @ Up
++ vst1.8 {d31[7]}, [r12] @ Up-left
++ vst1.8 {q0 }, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_8_neon_16(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (bytes)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4
++.set ur_size, sp_base + 16
++.set dl_size, sp_base + 20
++.set pw_s, 1
++.set pw, (1 << pw_s)
++.set log2_s, 3
++.set p_size, (1 << log2_s) @ size in pels
++
++function ff_hevc_rpi_intra_filter_8_neon_16, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]"
++
++ it cs
++ vldmcs r6, {d4, d5}
++ ldr r12, [sp, #ur_size]
++ bpl 1f
++ cmp r12, #4
++ vldm r5, {d6, d7}
++ bgt 1f
++ vdup.16 d7, d6[3]
++1:
++ lsls r12, r7, #AVAIL_S_L_N_DL_C
++ vdup.16 q1, d0[0]
++ bpl 1f
++ vld1.16 {d0[0]}, [r10], r9
++ vld1.16 {d0[1]}, [r3], r9
++ vld1.16 {d0[2]}, [r10], r9
++ vld1.16 {d0[3]}, [r3], r9
++ vld1.16 {d1[0]}, [r10], r9
++ vld1.16 {d1[1]}, [r3], r9
++ vld1.16 {d1[2]}, [r10]
++ vld1.16 {d1[3]}, [r3]
++1:
++ bcc 1f
++ ldr r12, [sp, #dl_size]
++ vld1.16 {d2[0]}, [r4], r9
++ vld1.16 {d2[1]}, [r8], r9
++ cmp r12, #p_size
++ vld1.16 {d2[2]}, [r4], r9
++ vld1.16 {d2[3]}, [r8], r9
++ blt 2f
++ vld1.16 {d3[0]}, [r4], r9
++ vld1.16 {d3[1]}, [r8], r9
++ vld1.16 {d3[2]}, [r4]
++ vld1.16 {d3[3]}, [r8]
++ b 1f
++2:
++ vdup.16 d3, d2[3]
++1:
++ tst r2, #FILTER_LIGHT
++ add r12, r0, #-pw
++ beq 10f
++
++ @ Luma light filter
++ vext.16 q9, q2, q3, #7
++ vext.16 q8, q15, q2, #7
++ vext.16 q13, q0, q1, #7
++ vext.16 q12, q15, q0, #7
++ vadd.u16 q9, q3
++ vadd.u16 q8, q2
++ vadd.u16 q13, q1
++ vadd.u16 q12, q0
++ vmov.u16 r3, d7[3] @ Save final pel
++ vmov.u16 r2, d3[3] @ Save final pel
++
++ vext.16 q2, q8, q9, #1
++ vext.16 q3, q9, q9, #1
++ vext.16 q0, q12, q13, #1
++ vext.16 q1, q13, q13, #1
++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0]
++ vadd.u16 q2, q8
++ vadd.u16 q3, q9
++ vadd.u16 q0, q12
++ vadd.u16 q1, q13
++
++ vrshr.u16 q2, #2
++ vrshr.u16 q3, #2
++ vrshr.u16 q0, #2
++ vrshr.u16 q1, #2
++ vrshr.u16 d30, #2
++ vmov.u16 d7[3], r3 @ Restore final pel
++ vmov.u16 d3[3], r2 @ Restore final pel
++ vdup.u16 d31, d30[0] @ d31[3] = d30[0]
++
++10:
++ vst1.16 {q2, q3}, [r1] @ Up
++ vst1.16 {d31[3]}, [r12] @ Up-left
++ vst1.16 {q0, q1}, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++@ int ff_hevc_rpi_intra_filter_16_neon_16(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (bytes)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4
++.set ur_size, sp_base + 16
++.set dl_size, sp_base + 20
++.set pw_s, 1
++.set pw, (1 << pw_s)
++.set log2_s, 4
++.set p_size, (1 << log2_s) @ size in pels
++
++function ff_hevc_rpi_intra_filter_16_neon_16, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]"
++
++ vdup.16 q9, d16[0]
++ vdup.16 q11, d20[0]
++
++ it cs
++ vldmcs r6, {d16-d19}
++ ldr r12, [sp, #ur_size]
++ bpl 1f
++ cmp r12, #12
++ @ Given chroma frame layout, if UR exists then it is always legit to
++ @ load all of it even if most of it is outside the frame.
++ vldm r5, {d20-d23}
++ bgt 1f
++ bge 4f
++        cmp        r12, #8
++ bge 3f
++ vdup.16 d21, d20[3]
++3: vdup.16 d22, d21[3]
++4: vdup.16 d23, d22[3]
++
++1:
++ lsls r7, #AVAIL_S_L_N_DL_C
++ ldr r12, [sp, #dl_size]
++ vdup.16 q1, d0[0]
++ vdup.16 q2, d0[0]
++ vdup.16 q3, d0[0]
++ bpl 1f
++ vld1.16 {d0[0]}, [r10], r9
++ vld1.16 {d0[1]}, [r3], r9
++ vld1.16 {d0[2]}, [r10], r9
++ vld1.16 {d0[3]}, [r3], r9
++ vld1.16 {d1[0]}, [r10], r9
++ vld1.16 {d1[1]}, [r3], r9
++ vld1.16 {d1[2]}, [r10], r9
++ vld1.16 {d1[3]}, [r3], r9
++ vld1.16 {d2[0]}, [r10], r9
++ vld1.16 {d2[1]}, [r3], r9
++ vld1.16 {d2[2]}, [r10], r9
++ vld1.16 {d2[3]}, [r3], r9
++ vld1.16 {d3[0]}, [r10], r9
++ vld1.16 {d3[1]}, [r3], r9
++ vld1.16 {d3[2]}, [r10]
++ vld1.16 {d3[3]}, [r3]
++1:
++ bcc 1f
++ vld1.16 {d4[0]}, [r4], r9
++ vld1.16 {d4[1]}, [r8], r9
++ cmp r12, #4
++ vld1.16 {d4[2]}, [r4], r9
++ vld1.16 {d4[3]}, [r8], r9
++ ble 2f
++ vld1.16 {d5[0]}, [r4], r9
++ vld1.16 {d5[1]}, [r8], r9
++ cmp r12, #12
++ vld1.16 {d5[2]}, [r4], r9
++ vld1.16 {d5[3]}, [r8], r9
++ blt 3f
++ vld1.16 {d6[0]}, [r4], r9
++ vld1.16 {d6[1]}, [r8], r9
++ vld1.16 {d6[2]}, [r4], r9
++ vld1.16 {d6[3]}, [r8], r9
++ ble 4f
++ vld1.16 {d7[0]}, [r4], r9
++ vld1.16 {d7[1]}, [r8], r9
++ vld1.16 {d7[2]}, [r4]
++ vld1.16 {d7[3]}, [r8]
++ b 1f
++2: vdup.16 d5, d4[3]
++3: vdup.16 d6, d5[3]
++4: vdup.16 d7, d6[3]
++1:
++ tst r2, #FILTER_LIGHT
++ add r12, r0, #-pw
++ beq 10f
++
++ vpush {q5}
++ @ Luma light filter
++ @ Left
++ vext.16 q5, q2, q3, #7
++ vext.16 q14, q1, q2, #7
++ vext.16 q13, q0, q1, #7
++ vext.16 q12, q15, q0, #7
++
++ vadd.u16 q5, q3
++ vadd.u16 q14, q2
++ vadd.u16 q13, q1
++ vadd.u16 q12, q0
++ vmov.u16 r2, d7[3] @ Save final pel
++
++ vext.16 q0, q12, q13, #1
++ vext.16 q1, q13, q14, #1
++ vext.16 q2, q14, q5, #1
++ vext.16 q3, q5, q5, #1
++
++ vmov d30, d24 @ d30[0] = l[0] + ul
++ vadd.u16 q0, q12
++ vadd.u16 q1, q13
++ vadd.u16 q2, q14
++ vadd.u16 q3, q5
++
++ vrshr.u16 q0, #2
++ vrshr.u16 q1, #2
++ vrshr.u16 q2, #2
++ vrshr.u16 q3, #2
++
++ @ Up
++ vext.16 q5, q10, q11, #7
++ vext.16 q14, q9, q10, #7
++ vext.16 q13, q8, q9, #7
++ vext.16 q12, q15, q8, #7
++
++ vadd.u16 q5, q11
++ vadd.u16 q14, q10
++ vadd.u16 q13, q9
++ vadd.u16 q12, q8
++ vmov.u16 r3, d23[3] @ Save final pel
++
++ vext.16 q8, q12, q13, #1
++ vext.16 q9, q13, q14, #1
++ vext.16 q10, q14, q5, #1
++ vext.16 q11, q5, q5, #1
++
++ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0]
++ vadd.u16 q8, q12
++ vadd.u16 q9, q13
++ vadd.u16 q10, q14
++ vadd.u16 q11, q5
++
++ vrshr.u16 q8, #2
++ vrshr.u16 q9, #2
++ vrshr.u16 q10, #2
++ vrshr.u16 q11, #2
++
++ @ Misc
++ vrshr.u16 d30, #2
++ vmov.u16 d7[3], r2 @ Restore final pel
++ vmov.u16 d23[3], r3 @ Restore final pel
++ vdup.u16 d31, d30[0] @ d31[3] = d30[0]
++ vpop {q5}
++
++10:
++ vstm r1, {d16-d23} @ Up
++ vst1.16 {d31[3]}, [r12] @ Up-left
++ vstm r0, { d0-d7 } @ Left
++ pop {r4-r10, pc}
++endfunc
++
++@ int ff_hevc_rpi_intra_filter_4_neon_32(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (bytes)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4
++.set pw_s, 2
++.set pw, (1 << pw_s)
++.set log2_s, 2
++
++function ff_hevc_rpi_intra_filter_4_neon_32, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]"
++
++ it cs
++ vldmcs r6, {d4, d5}
++ it mi
++ vldmmi r5, {d6, d7}
++ lsls r7, #AVAIL_S_L_N_DL_C
++ vdup.32 q1, d0[0]
++ add r12, r0, #-pw
++ bpl 1f
++ vld1.32 {d0[0]}, [r10], r9
++ vld1.32 {d0[1]}, [r3], r9
++ vld1.32 {d1[0]}, [r10]
++ vld1.32 {d1[1]}, [r3]
++1:
++ bcc 1f
++ vld1.32 {d2[0]}, [r4], r9
++ vld1.32 {d2[1]}, [r8], r9
++ vld1.32 {d3[0]}, [r4]
++ vld1.32 {d3[1]}, [r8]
++1:
++ vst1.32 {q2, q3 }, [r1] @ Up
++ vst1.32 {d31[1]}, [r12]
++ vst1.32 {q0, q1 }, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_8_neon_32(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (bytes)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4
++.set ur_size, sp_base + 16
++.set dl_size, sp_base + 20
++.set pw_s, 2
++.set pw, (1 << pw_s)
++.set log2_s, 3
++.set p_size, (1 << log2_s) @ size in pels
++
++function ff_hevc_rpi_intra_filter_8_neon_32, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]"
++
++ vdup.32 q9, d16[0]
++ vdup.32 q11, d20[0]
++
++ it cs
++ vldmcs r6, {q8, q9 }
++ ldr r12, [sp, #ur_size]
++ bpl 1f
++ cmp r12, #p_size
++ vldm r5, {q10, q11}
++ bge 1f
++ vdup.32 q11, d21[1]
++1:
++ lsls r7, #AVAIL_S_L_N_DL_C
++ vdup.32 q1, d0[0]
++ vdup.32 q2, d0[0]
++ vdup.32 q3, d0[0]
++ bpl 1f
++ vld1.32 {d0[0]}, [r10], r9
++ vld1.32 {d0[1]}, [r3], r9
++ vld1.32 {d1[0]}, [r10], r9
++ vld1.32 {d1[1]}, [r3], r9
++ vld1.32 {d2[0]}, [r10], r9
++ vld1.32 {d2[1]}, [r3], r9
++ vld1.32 {d3[0]}, [r10]
++ vld1.32 {d3[1]}, [r3]
++1:
++ bcc 1f
++ ldr r12, [sp, #dl_size]
++ vld1.32 {d4[0]}, [r4], r9
++ vld1.32 {d4[1]}, [r8], r9
++ cmp r12, #p_size
++ vld1.32 {d5[0]}, [r4], r9
++ vld1.32 {d5[1]}, [r8], r9
++ blt 2f
++ vld1.32 {d6[0]}, [r4], r9
++ vld1.32 {d6[1]}, [r8], r9
++ vld1.32 {d7[0]}, [r4]
++ vld1.32 {d7[1]}, [r8]
++ b 1f
++2:
++ vdup.32 q3, d5[1]
++1:
++ add r12, r0, #-pw
++ vstm r1, { q8-q11} @ Up
++ vst1.32 {d31[1]}, [r12]
++ vstm r0, { q0-q3 } @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_16_neon_32(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (bytes)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4
++.set ur_size, sp_base + 16
++.set dl_size, sp_base + 20
++.set pw_s, 2
++.set pw, (1 << pw_s)
++.set log2_s, 4
++.set p_size, (1 << log2_s) @ size in pels
++
++function ff_hevc_rpi_intra_filter_16_neon_32, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1]
++
++ @ Once we get this big we have run out of neon regs to store
++ @ everything at once so do in pieces
++
++ @ Up (have)
++ it cs
++ vldmcs r6, { q0-q3 }
++ ldr r12, [sp, #ur_size]
++ it mi
++ vldmmi r5, { q8-q11}
++ it cs
++ vstmcs r1, { q0-q3 }
++ bpl 1f
++ cmp r12, #12
++ add lr, r1, #(pw << log2_s)
++ bgt 2f
++ cmp r12, #8
++ bge 3f
++ vdup.16 q9, d17[1]
++4: vdup.16 d10, d19[1]
++3: vdup.16 q11, d21[1]
++2: vstm lr, { q8-q11}
++1:
++
++ @ Left (have)
++ add lr, r0, #-pw
++ lsls r12, r7, #AVAIL_S_L_N_DL_C
++ vst1.32 {d30[1]}, [lr] @ UL
++ bpl 1f
++ vld1.32 { d0[0]}, [r10], r9
++ vld1.32 { d0[1]}, [r3], r9
++ vld1.32 { d1[0]}, [r10], r9
++ vld1.32 { d1[1]}, [r3], r9
++ vld1.32 { d2[0]}, [r10], r9
++ vld1.32 { d2[1]}, [r3], r9
++ vld1.32 { d3[0]}, [r10], r9
++ vld1.32 { d3[1]}, [r3], r9
++ vld1.32 { d4[0]}, [r10], r9
++ vld1.32 { d4[1]}, [r3], r9
++ vld1.32 { d5[0]}, [r10], r9
++ vld1.32 { d5[1]}, [r3], r9
++ vld1.32 { d6[0]}, [r10], r9
++ vld1.32 { d6[1]}, [r3], r9
++ vld1.32 { d7[0]}, [r10]
++ vld1.32 { d7[1]}, [r3]
++ vstm r0, { q0-q3 }
++1:
++ bcc 1f
++ ldr r12, [sp, #dl_size]
++ add lr, r0, #(pw << log2_s)
++ vld1.32 {d16[0]}, [r4], r9
++ vld1.32 {d16[1]}, [r8], r9
++ cmp r12, #4
++ vld1.32 {d17[0]}, [r4], r9
++ vld1.32 {d17[1]}, [r8], r9
++ ble 2f
++ vld1.32 {d18[0]}, [r4], r9
++ vld1.32 {d18[1]}, [r8], r9
++ cmp r12, #12
++ vld1.32 {d19[0]}, [r4], r9
++ vld1.32 {d19[1]}, [r8], r9
++ blt 3f
++ vld1.32 {d20[0]}, [r4], r9
++ vld1.32 {d20[1]}, [r8], r9
++ vld1.32 {d21[0]}, [r4], r9
++ vld1.32 {d21[1]}, [r8], r9
++ ble 4f
++ vld1.32 {d22[0]}, [r4], r9
++ vld1.32 {d22[1]}, [r8], r9
++ vld1.32 {d23[0]}, [r4]
++ vld1.32 {d23[1]}, [r8]
++ b 5f
++2: vdup.32 q9, d17[1]
++3: vdup.32 q10, d19[1]
++4: vdup.32 q11, d21[1]
++5: vstm lr, { q8-q11}
++1:
++ eors r7, r2
++ beq 99f
++
++ lsls r12, r7, #AVAIL_S_UR_N_U_C
++ vdup.32 q0, d31[0]
++ vdup.32 q1, d31[0]
++ vdup.32 q2, d31[0]
++ vdup.32 q3, d31[0]
++ add lr, r1, #(pw << log2_s)
++ vdup.32 q8, d31[1]
++ vdup.32 q9, d31[1]
++ vdup.32 q10, d31[1]
++ vdup.32 q11, d31[1]
++ it cs
++ vstmcs r1, { q0-q3 }
++ it mi
++ vstmmi lr, { q8-q11}
++
++ lsls r7, #AVAIL_S_L_N_DL_C
++ vdup.32 q0, d30[0]
++ vdup.32 q1, d30[0]
++ vdup.32 q2, d30[0]
++ vdup.32 q3, d30[0]
++ add lr, r0, #(pw << log2_s)
++ it mi
++ vstmmi r0, { q0-q3 }
++ it cs
++ vstmcs lr, { q0-q3 }
++
++99:
++ pop {r4-r10, pc}
++endfunc
++
++
++
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
+new file mode 100644
+index 0000000000..ccf13a081f
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
+@@ -0,0 +1,888 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/*
++ * Horizontal & Vertical special cases of angular intra pred
++ *
++ * Split out because:
++ * Vertical, at least, is relatively common
++ * Much simpler code than the general angular case
++ * Luma with size < 32 has extra filtering that doesn't happen anywhere else
++ *
++ * *** Currently luma filtering is mandatory where it occurs, but there are
++ * cases where it should be turned off (rdpcm & an extension sps flag).
++ * These don't occur in the standard conformance suite for Main Profile
++ */
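++
++@ For reference (illustrative, spec-style naming): with the extra luma
++@ filtering the vertical cases compute
++@   src[x][y] = top[x]                                    for x > 0
++@   src[0][y] = Clip( top[0] + ((left[y] - up_left) >> 1) )
++@ where up_left is loaded as left[-1] below, and the horizontal cases are
++@ the transpose of this. The >>1 delta is what the vhsub + saturating-add
++@ (8 bit) and vhsub + add + clip (10 bit) sequences implement.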
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ ff_hevc_rpi_pred_vertical_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_4_neon_8, export=1
++ vld1.32 {d0[0] }, [r1 :32] @ Up
++ ldrb r12, [r2, #-1] @ Up-left
++ vld1.32 {d24[0]}, [r2 :32] @ left
++
++ vdup.8 d4, r12
++ vmov.u8 d6, #128
++ vhsub.u8 d24, d4
++
++ veor.8 d2, d0, d6 @ Make -128,127 so we can qadd
++ mov r1, #4
++ vdup.8 d2, d2[0]
++ vqadd.s8 d24, d2
++ vmov.i64 d4, #0xff
++ veor.8 d24, d6
++
++1:
++ vbit.8 d0, d24, d4
++ vext.8 d24, d24, #1
++ subs r1, #1
++ vst1.32 {d0[0] }, [r0 :32], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_8_neon_8, export=1
++ vld1.8 {d0 }, [r1 :64] @ Up
++ ldrb r12, [r2, #-1] @ Up-left
++ vld1.8 {d24}, [r2 :64] @ left
++
++ vdup.8 d4, r12
++ vmov.u8 d6, #128
++ vhsub.u8 d24, d4
++
++ veor.8 d2, d0, d6 @ Make -128,127 so we can qadd
++ mov r1, #8
++ vdup.8 d2, d2[0]
++ vqadd.s8 d24, d2
++ vmov.i64 d4, #0xff
++ veor.8 d24, d6
++
++1:
++ vbit.8 d0, d24, d4
++ vext.8 d24, d24, #1
++ subs r1, #1
++ vst1.8 {d0 }, [r0 :64], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_16_neon_8, export=1
++ vld1.8 {q0 }, [r1 :128] @ Up
++ ldrb r12, [r2, #-1] @ Up-left
++ vld1.8 {q12}, [r2 :128] @ left
++
++ vdup.8 q2, r12
++ vmov.u8 q3, #128
++ vhsub.u8 q12, q2
++
++ veor.8 d2, d0, d6 @ Make -128,127 so we can qadd
++ vdup.8 q1, d2[0]
++ vqadd.s8 q12, q1
++ veor.8 q12, q3
++
++ vmov.i64 d4, #0xff
++ mov r1, #16
++1:
++ vbit.8 d0, d24, d4
++ vext.8 q12, q12, #1
++ subs r1, #1
++ vst1.8 {q0 }, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_32_neon_8, export=1
++ vld1.8 {q0, q1 }, [r1 :128] @ Up
++ add r2, r0, r3
++ lsl r3, #1
++ mov r1, #16
++1:
++ vst1.8 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.8 {q0, q1 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1
++ vld1.16 {d0 }, [r1 :64] @ Up
++ add r2, r0, r3, lsl #1
++ lsl r3, #2
++
++ vst1.16 {d0 }, [r0 :64], r3
++ vst1.16 {d0 }, [r2 :64], r3
++ vst1.16 {d0 }, [r0 :64]
++ vst1.16 {d0 }, [r2 :64]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1
++ vld1.16 {q0 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #1
++ lsl r3, #2
++ mov r1, #4
++1:
++ vst1.16 {q0 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q0 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1
++ vld1.16 {q0, q1 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #1
++ lsl r3, #2
++ mov r1, #8
++1:
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++@ ? Might be faster as simple arm
++
++function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1
++ vld1.32 {d0[0] }, [r1 :32] @ Up
++ ldrb r12, [r2, #-1] @ Up-left
++ vld1.32 {d16[0]}, [r2 :32] @ left
++
++ vdup.8 d4, r12
++ vmov.u8 d6, #128
++ vhsub.u8 d0, d4
++
++ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd
++ add r2, r0, r3
++ vdup.8 d2, d2[0]
++ lsl r3, #1
++ vqadd.s8 d0, d2
++ veor.8 d0, d6
++
++ vdup.8 d1, d16[1]
++ vdup.8 d2, d16[2]
++ vdup.8 d3, d16[3]
++ vst1.32 {d0[0] }, [r0 :32], r3
++ vst1.32 {d1[0] }, [r2 :32], r3
++ vst1.32 {d2[0] }, [r0 :32]
++ vst1.32 {d3[0] }, [r2 :32]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1
++ vld1.8 {d0 }, [r1 :64] @ Up
++ ldrb r12, [r2, #-1] @ Up-left
++ vld1.8 {d16}, [r2 :64] @ left
++
++ vdup.8 d4, r12
++ vmov.u8 d6, #128
++ vhsub.u8 d0, d4
++
++ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd
++ add r2, r0, r3
++ vdup.8 d2, d2[0]
++ lsl r3, #1
++ vqadd.s8 d0, d2
++ mov r1, #3
++ veor.8 d0, d6
++
++ vdup.8 d4, d16[1]
++ vst1.8 {d0 }, [r0 :64], r3
++ vst1.8 {d4 }, [r2 :64], r3
++
++1:
++ vext.8 d16, d16, #2
++ subs r1, #1
++ vdup.8 d0, d16[0]
++ vdup.8 d4, d16[1]
++ vst1.8 {d0 }, [r0 :64], r3
++ vst1.8 {d4 }, [r2 :64], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1
++ vld1.8 {q0 }, [r1 :128] @ Up
++ ldrb r12, [r2, #-1] @ Up-left
++ vld1.8 {q8 }, [r2 :128] @ left
++
++ vdup.8 q2, r12
++ vmov.u8 q3, #128
++ vhsub.u8 q0, q2
++
++ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd
++ add r2, r0, r3
++ vdup.8 q1, d2[0]
++ lsl r3, #1
++ vqadd.s8 q0, q1
++ mov r1, #7
++ veor.8 q0, q3
++
++ vdup.8 q2, d16[1]
++ vst1.8 {q0 }, [r0 :128], r3
++ vst1.8 {q2 }, [r2 :128], r3
++
++1:
++ vext.8 q8, q8, #2
++ subs r1, #1
++ vdup.8 q0, d16[0]
++ vdup.8 q2, d16[1]
++ vst1.8 {q0 }, [r0 :128], r3
++ vst1.8 {q2 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1
++ vld1.8 {q8, q9 }, [r2 :128] @ Left
++ add r2, r0, r3
++ lsl r3, #1
++ mov r1, #16
++1:
++ vdup.8 q0, d16[0]
++ vdup.8 q1, d16[0]
++ vdup.8 q2, d16[1]
++ vdup.8 q3, d16[1]
++ vext.8 q8, q9, #2
++ vext.8 q9, q9, #2
++ vst1.8 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.8 {q2, q3 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1
++ vld1.16 {d16}, [r2 :64] @ Left
++ add r2, r0, r3, lsl #1
++ lsl r3, #2
++
++ vdup.16 d0, d16[0]
++ vdup.16 d1, d16[1]
++ vdup.16 d2, d16[2]
++ vdup.16 d3, d16[3]
++
++ vst1.16 {d0 }, [r0 :64], r3
++ vst1.16 {d1 }, [r2 :64], r3
++ vst1.16 {d2 }, [r0 :64]
++ vst1.16 {d3 }, [r2 :64]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1
++ vld1.16 {q8 }, [r2 :128] @ Left
++ add r2, r0, r3, lsl #1
++ lsl r3, #2
++ mov r1, #4
++1:
++ vdup.16 q0, d16[0]
++ vdup.16 q2, d16[1]
++ vext.16 q8, q8, #2
++ vst1.16 {q0 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q2 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1
++ vld1.16 {q8, q9 }, [r2 :128] @ Left
++ add r2, r0, r3, lsl #1
++ lsl r3, #2
++ mov r1, #8
++1:
++ vdup.16 q0, d16[0]
++ vdup.16 q1, d16[0]
++ vdup.16 q2, d16[1]
++ vdup.16 q3, d16[1]
++ vext.16 q8, q9, #2
++ vext.16 q9, q9, #2
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q2, q3 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@------------------------------------------------------------------------------
++@
++@ 10 Bit
++@ Has clipping constants so 10-bit only but could easily be macroed up to
++@ 14-bit before we run out of bits
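++@
++@ e.g. at 10 bits the filtered edge pel is
++@   src[0][y] = Clip3( 0, 0x3ff, top[0] + ((left[y] - up_left) >> 1) )
++@ (the vmov.s16 #0 / #0x3ff constants below); the intermediate still fits
++@ easily in s16, which is why only these constants would need changing for
++@ higher bit depths.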
++
++
++@ ff_hevc_rpi_pred_vertical_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_4_neon_10, export=1
++ vld1.16 {d0 }, [r1 :64] @ Up
++ ldrh r12, [r2, #-2] @ Up-left
++ vld1.16 {d24}, [r2 :64] @ left
++
++ vdup.16 d4, r12
++ lsl r3, #1
++ vhsub.u16 d24, d4
++
++ vdup.16 d6, d0[0]
++ vmov.s16 d4, #0
++ vadd.i16 d24, d6
++
++ vmov.s16 d6, #0x3ff
++ vmax.s16 d24, d4
++ vmov.i64 d4, #0xffff
++ vmin.s16 d24, d6
++
++ mov r1, #4
++1:
++ vbit.8 d0, d24, d4
++ vext.16 d24, d24, #1
++ subs r1, #1
++ vst1.16 {d0 }, [r0 :64], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_8_neon_10, export=1
++ vld1.16 {q0 }, [r1 :128] @ Up
++ ldrh r12, [r2, #-2] @ Up-left
++ vld1.16 {q12}, [r2 :128] @ left
++
++ vdup.16 q2, r12
++ lsl r3, #1
++ vhsub.u16 q12, q2
++
++ vdup.16 q3, d0[0]
++ vmov.s16 q2, #0
++ vadd.i16 q12, q3
++
++ vmov.s16 q3, #0x3ff
++ vmax.s16 q12, q2
++ vmin.s16 q12, q3
++
++ vmov.i64 d4, #0xffff
++ mov r1, #8
++1:
++ vbit.8 d0, d24, d4
++ vext.16 q12, q12, #1
++ subs r1, #1
++ vst1.16 {q0 }, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_16_neon_10, export=1
++ vld1.16 {q0, q1 }, [r1 :128] @ Up
++ ldrh r12, [r2, #-2] @ Up-left
++ vld1.16 {q12, q13}, [r2 :128] @ left
++
++ vdup.16 q2, r12
++ lsl r3, #1
++ vhsub.u16 q12, q2
++ vhsub.u16 q13, q2
++
++ vdup.16 q3, d0[0]
++ vmov.s16 q2, #0
++ vadd.i16 q12, q3
++ vadd.i16 q13, q3
++
++ vmov.s16 q3, #0x3ff
++ vmax.s16 q12, q2
++ vmax.s16 q13, q2
++ vmin.s16 q12, q3
++ vmin.s16 q13, q3
++
++ vmov.i64 d4, #0xffff
++ mov r1, #16
++1:
++ vbit.8 d0, d24, d4
++ vext.16 q12, q13, #1
++ vext.16 q13, q13, #1
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_32_neon_10, export=1
++ vldm r1, { q0-q3 } @ Up
++ mov r1, #32
++1:
++ subs r1, #1
++ vstm r0, { q0-q3 }
++ add r0, r0, r3, lsl #1
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1
++ vld1.16 {q0 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #2
++ lsl r3, #3
++
++ vst1.16 {q0 }, [r0 :128], r3
++ vst1.16 {q0 }, [r2 :128], r3
++ vst1.16 {q0 }, [r0 :128]
++ vst1.16 {q0 }, [r2 :128]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1
++ vld1.16 {q0, q1 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #2
++ lsl r3, #3
++ mov r1, #4
++1:
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1
++ vldm r1, { q0-q3 } @ Up
++ mov r1, #16
++1:
++ subs r1, #1
++ vstm r0, { q0-q3 }
++ add r0, r0, r3, lsl #2
++ bne 1b
++
++ bx lr
++endfunc
++
++@ ff_hevc_rpi_pred_horizontal_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1
++ vld1.16 {d0 }, [r1 :64] @ Up
++ ldrh r12, [r2, #-2] @ Up-left
++ vld1.16 {d16}, [r2 :64] @ left
++
++ vdup.16 d4, r12
++ add r2, r0, r3, lsl #1
++ vhsub.u16 d0, d4
++
++ vdup.16 d6, d16[0]
++ vmov.s16 d4, #0
++ vadd.i16 d0, d6
++
++ vmov.s16 d6, #0x3ff
++ vmax.s16 d0, d4
++ lsl r3, #2
++ vmin.s16 d0, d6
++
++ vdup.16 d1, d16[1]
++ vdup.16 d2, d16[2]
++ vdup.16 d3, d16[3]
++
++ vst1.16 {d0 }, [r0 :64], r3
++ vst1.16 {d1 }, [r2 :64], r3
++ vst1.16 {d2 }, [r0 :64]
++ vst1.16 {d3 }, [r2 :64]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1
++ vld1.16 {q0 }, [r1 :128] @ Up
++ ldrh r12, [r2, #-2] @ Up-left
++ vld1.16 {q8 }, [r2 :128] @ left
++
++ vdup.16 q2, r12
++ add r2, r0, r3, lsl #1
++ vhsub.u16 q0, q2
++
++ vdup.16 q3, d16[0]
++ lsl r3, #2
++ vmov.s16 q2, #0
++ vadd.i16 q0, q3
++
++ mov r1, #3
++ vmov.s16 q3, #0x3ff
++ vmax.s16 q0, q2
++ vmin.s16 q0, q3
++
++ vdup.16 q2, d16[1]
++
++ vst1.16 {q0 }, [r0 :128], r3
++ vst1.16 {q2 }, [r2 :128], r3
++1:
++ vext.16 q8, q8, #2
++ vdup.16 q0, d16[0]
++ vdup.16 q2, d16[1]
++ subs r1, #1
++ vst1.16 {q0 }, [r0 :128], r3
++ vst1.16 {q2 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1
++ vld1.16 {q0, q1 }, [r1 :128] @ Up
++ ldrh r12, [r2, #-2] @ Up-left
++ vld1.16 {q8, q9 }, [r2 :128] @ left
++
++
++ vdup.16 q2, r12
++ add r2, r0, r3, lsl #1
++ vhsub.u16 q0, q2
++ vhsub.u16 q1, q2
++
++ vdup.16 q3, d16[0]
++ lsl r3, #2
++ vmov.s16 q2, #0
++ vadd.i16 q0, q3
++ vadd.i16 q1, q3
++
++ mov r1, #7
++ vmov.s16 q3, #0x3ff
++ vmax.s16 q0, q2
++ vmax.s16 q1, q2
++ vmin.s16 q0, q3
++ vmin.s16 q1, q3
++
++ vdup.16 q2, d16[1]
++ vdup.16 q3, d16[1]
++
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ vst1.16 {q2, q3 }, [r2 :128], r3
++1:
++ vext.16 q8, q9, #2
++ vext.16 q9, q9, #2
++ vdup.16 q0, d16[0]
++ vdup.16 q1, d16[0]
++ vdup.16 q2, d16[1]
++ vdup.16 q3, d16[1]
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ vst1.16 {q2, q3 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1
++ vldm r2, { q8-q11}
++ mov r1, #16
++1:
++ vdup.16 q0, d16[0]
++ vdup.16 q1, d16[0]
++ vdup.16 q2, d16[0]
++ vdup.16 q3, d16[0]
++ add r2, r0, r3, lsl #1
++ vdup.16 q12, d16[1]
++ vdup.16 q13, d16[1]
++ vdup.16 q14, d16[1]
++ vdup.16 q15, d16[1]
++ vstm r0, { q0-q3 }
++ vstm r2, {q12-q15}
++
++ vext.16 q8, q9, #2
++ vext.16 q9, q10, #2
++ add r0, r0, r3, lsl #2
++ vext.16 q10, q11, #2
++ subs r1, #1
++ vext.16 q11, q11, #2
++
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1
++ vld1.16 {q8 }, [r2 :128] @ Left
++ add r2, r0, r3, lsl #2
++ lsl r3, #3
++
++ vdup.32 q0, d16[0]
++ vdup.32 q1, d16[1]
++ vdup.32 q2, d17[0]
++ vdup.32 q3, d17[1]
++
++ vst1.32 {q0 }, [r0 :128], r3
++ vst1.16 {q1 }, [r2 :128], r3
++ vst1.32 {q2 }, [r0 :128]
++ vst1.16 {q3 }, [r2 :128]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1
++ vld1.16 {q8, q9 }, [r2 :128] @ Left
++ add r2, r0, r3, lsl #2
++ lsl r3, #3
++ mov r1, #4
++1:
++ vdup.32 q0, d16[0]
++ vdup.32 q1, d16[0]
++ vdup.32 q2, d16[1]
++ vdup.32 q3, d16[1]
++ vext.32 q8, q9, #2
++ vext.32 q9, q9, #2
++ vst1.32 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.32 {q2, q3 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1
++ vldm r2, { q8-q11}
++ mov r1, #8
++1:
++ vdup.32 q0, d16[0]
++ vdup.32 q1, d16[0]
++ vdup.32 q2, d16[0]
++ vdup.32 q3, d16[0]
++ add r2, r0, r3, lsl #2
++ vdup.32 q12, d16[1]
++ vdup.32 q13, d16[1]
++ vdup.32 q14, d16[1]
++ vdup.32 q15, d16[1]
++ vstm r0, { q0-q3 }
++ vstm r2, {q12-q15}
++
++ vext.32 q8, q9, #2
++ vext.32 q9, q10, #2
++ add r0, r0, r3, lsl #3
++ vext.32 q10, q11, #2
++ subs r1, #1
++ vext.32 q11, q11, #2
++
++ bne 1b
++
++ bx lr
++endfunc
++
++
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
+new file mode 100644
+index 0000000000..9fb3633862
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
+@@ -0,0 +1,930 @@
++/*
++ * Copyright (c) 2017 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ Planar intra pred (8.4.4.2.4)
++@
++@ predSamples[ x ][ y ] =
++@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] +
++@ ( x + 1 ) * p[ nTbS ][ -1 ] +
++@ ( nTbS - 1 - y ) * p[ x ][ -1 ] +
++@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 )
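++@
++@ Roughly, the code below evaluates this incrementally: per-column terms
++@   acc[x] = nTbS * p[x][-1] + (x+1) * p[nTbS][-1]
++@   add[x] = p[-1][nTbS] - p[x][-1]
++@ are set up once, then each row adds add[x] into acc[x] and emits
++@   predSamples[x][y] = (acc[x] + (nTbS-1-x) * p[-1][y]) >> (Log2(nTbS) + 1)
++@ with rounding, since after y+1 additions
++@   acc[x] = (nTbS-1-y)*p[x][-1] + (x+1)*p[nTbS][-1] + (y+1)*p[-1][nTbS]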
++
++@ ff_hevc_rpi_pred_planar_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_4_neon_8, export=1
++ adr r12, nb_3_0_1_4
++ vld1.8 {d24}, [r2] @ Left
++ vld1.8 {d0 }, [r1] @ Up
++ vld1.8 {q8 }, [r12 :128] @ 3..
++
++ vdup.8 d30, d24[4]
++ vdup.8 d31, d0[4]
++
++ vdup.32 d0, d0[0] @ copy lo -> hi
++ vsubl.u8 q2, d30, d0 @ Add set up
++
++ vshll.u8 q0, d0, #2
++ add r1, r0, r3
++ vmlal.u8 q0, d17, d31 @ Acc set up - q8-q9 free
++
++ vshl.i16 q3, q2, #1
++ vadd.i16 d0, d4
++ vadd.i16 d1, d6
++ lsl r3, #1
++ vadd.i16 q1, q0, q3
++
++ vdup.u8 d20, d24[0]
++ vdup.u8 d21, d24[1]
++ vdup.u8 d22, d24[2]
++ vdup.u8 d23, d24[3]
++
++ vtrn.32 d20, d21
++ vtrn.32 d22, d23
++
++ vmull.u8 q10, d16, d20
++ vmull.u8 q11, d16, d22
++ vadd.i16 q10, q0
++ vadd.i16 q11, q1
++
++ vrshrn.u16 d28, q10, #3
++ vrshrn.u16 d29, q11, #3
++
++ vst1.32 {d28[0]}, [r0 :32], r3
++ vst1.32 {d28[1]}, [r1 :32], r3
++ vst1.32 {d29[0]}, [r0 :32]
++ vst1.32 {d29[1]}, [r1 :32]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_8_neon_8, export=1
++ adr r12, nb_7_0_1_8
++ vld1.8 {q12}, [r2] @ Left
++ vld1.8 {q0 }, [r1] @ Up
++ vld1.8 {q8 }, [r12 :128] @ 7..
++
++ vdup.8 d30, d25[0]
++ vdup.8 d31, d1[0]
++
++ mov r1, #8
++ vsubl.u8 q2, d30, d0 @ Add set up
++
++ vshll.u8 q0, d0, #3
++ vmlal.u8 q0, d17, d31 @ Acc set up - q8-q9 free
++
++@ u8 7..0 [1] d16
++@ u8 left[y] [1] d24
++@ u16 acc      [2] q0 .. q1  = (x+1)*p[nTbS][-1] + nTbS*p[x][-1] initially
++@ u16 add [2] q2 .. q3 = p[-1][nTbs] - p[x][-1]
++1:
++ vadd.i16 q0, q2
++
++ vdup.u8 d20, d24[0]
++ vext.8 d24, d24, #1
++
++ vmull.u8 q10, d16, d20
++ vadd.i16 q10, q0
++
++ vrshrn.u16 d28, q10, #4
++
++ subs r1, #1
++ vst1.8 {d28}, [r0 :64], r3
++
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_16_neon_8, export=1
++ vld1.8 {q12}, [r2 :128] @ Left
++        ldrb       r2, [r2, #16]         @ Down left - could have this in q13, but avoid that much overread
++ adr r12, nb_15_0_1_16
++ vld1.8 {q0 }, [r1 :128] @ Up
++ ldrb r1, [r1, #16] @ Up-right
++ vld1.8 {q8, q9 }, [r12 :128] @ 15...
++
++ vdup.8 d30, r2
++ vdup.8 d31, r1
++
++ mov r1, #16
++ vsubl.u8 q3, d30, d1
++ vsubl.u8 q2, d30, d0 @ Add set up
++
++ vshll.u8 q1, d1, #4
++ vshll.u8 q0, d0, #4
++ vmlal.u8 q1, d19, d31
++ vmlal.u8 q0, d18, d31 @ Acc set up - q8-q9 free
++
++@ u8 15..0 [1] q8
++@ u8 left[y] [1] q12
++@ u16 acc      [2] q0 .. q1  = (x+1)*p[nTbS][-1] + nTbS*p[x][-1] initially
++@ u16 add [2] q2 .. q3 = p[-1][nTbs] - p[x][-1]
++1:
++ vadd.i16 q1, q3
++ vadd.i16 q0, q2
++
++ vdup.u8 d20, d24[0]
++ vext.8 q12, q12, #1
++
++ vmull.u8 q11, d17, d20
++ vmull.u8 q10, d16, d20
++
++ vadd.i16 q11, q1
++ vadd.i16 q10, q0
++
++ vrshrn.u16 d29, q11, #5
++ vrshrn.u16 d28, q10, #5
++
++ subs r1, #1
++ vst1.8 {q14}, [r0 :128], r3
++
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_32_neon_8, export=1
++ vpush {q4-q7}
++ vld1.8 {q12, q13}, [r2 :128]! @ Left
++ adr r12, nb_31_0_1_32
++ vld1.8 {q0, q1 }, [r1 :128]! @ Up
++ vld1.8 {d30[0]}, [r2] @ Down left
++ vld1.8 {d31[0]}, [r1] @ Up-right
++        vldm       r12, { q8-q11} @ 31..0, 1..32
++
++ vdup.8 d30, d30[0]
++ vdup.8 d31, d31[0]
++
++ vsubl.u8 q7, d30, d3
++ vsubl.u8 q6, d30, d2
++ vsubl.u8 q5, d30, d1
++ vsubl.u8 q4, d30, d0 @ Add set up
++
++ vshll.u8 q3, d3, #5
++ vshll.u8 q2, d2, #5
++ vshll.u8 q1, d1, #5
++ vshll.u8 q0, d0, #5
++ vmlal.u8 q3, d23, d31
++ vmlal.u8 q2, d22, d31
++ vmlal.u8 q1, d21, d31
++ vmlal.u8 q0, d20, d31 @ Acc set up - q8-q9 free
++
++ mov r1, #32
++
++@ u8  31..0    [2] q8,  q9
++@ u8 left[y] [2] q12, q13
++@ u16 acc [4] q0 .. q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [4] q4 .. q7 = p[-1][nTbs] - p[x][-1]
++1:
++ vadd.i16 q3, q7
++ vadd.i16 q2, q6
++ vadd.i16 q1, q5
++ vadd.i16 q0, q4
++
++ vdup.u8 d20, d24[0]
++ vext.8 q12, q13, #1
++ vext.8 q13, q13, #1
++
++ vmull.u8 q15, d19, d20
++ vmull.u8 q14, d18, d20
++ vmull.u8 q11, d17, d20
++ vmull.u8 q10, d16, d20
++
++ vadd.i16 q15, q3
++ vadd.i16 q14, q2
++ vadd.i16 q11, q1
++ vadd.i16 q10, q0
++
++ vrshrn.u16 d31, q15, #6
++ vrshrn.u16 d30, q14, #6
++ vrshrn.u16 d29, q11, #6
++ vrshrn.u16 d28, q10, #6
++
++ subs r1, #1
++ vst1.8 {q14, q15}, [r0 :128], r3
++
++ bne 1b
++
++ vpop {q4-q7}
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1
++ vld1.8 {q12}, [r2 :64] @ Left + down-left - <1d of overread is OK
++ adr r12, nbx2_3_0_1_4
++ vld1.8 {q0 }, [r1 :64] @ Up + up right
++ vld1.8 {q8 }, [r12 :128] @ 3,3..
++
++ vdup.16 d30, d25[0]
++ vdup.16 d31, d1[0]
++
++ mov r1, #4
++ vsubl.u8 q2, d30, d0 @ Add set up
++
++ lsl r3, #1
++ vshll.u8 q0, d0, #2
++ vmlal.u8 q0, d17, d31 @ Acc set up - q8-q9 free
++
++@ u8 3,3..0,0 [1] d16
++@ u8 left[y] [1] d24
++@ u16 acc      [1] q0        = (x+1)*p[nTbS][-1] + nTbS*p[x][-1] initially
++@ u16 add [1] q2 = p[-1][nTbs] - p[x][-1]
++1:
++ vadd.i16 q0, q2
++
++ vdup.u16 d20, d24[0]
++ vext.16 d24, d24, #1
++
++ vmull.u8 q10, d16, d20
++
++ vadd.i16 q10, q0
++
++ vrshrn.u16 d28, q10, #3
++
++ subs r1, #1
++ vst1.8 {d28}, [r0 :64], r3
++
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1
++ vld1.8 {q12}, [r2 :128] @ Left
++        ldrh       r2, [r2, #16]  @ Down left - could have this in q13, but avoid that much overread
++ adr r12, nbx2_7_0_1_8
++ vld1.8 {q0 }, [r1 :128] @ Up
++ ldrh r1, [r1, #16] @ Up-right
++ vld1.8 {q8, q9 }, [r12 :128] @ 7,7...
++
++ vdup.16 d30, r2
++ vdup.16 d31, r1
++
++ mov r1, #8
++ vsubl.u8 q3, d30, d1
++ vsubl.u8 q2, d30, d0 @ Add set up
++
++ lsl r3, #1
++ vshll.u8 q1, d1, #3
++ vshll.u8 q0, d0, #3
++ vmlal.u8 q1, d19, d31
++ vmlal.u8 q0, d18, d31 @ Acc set up - q8-q9 free
++
++@ u8 7,7..0,0 [1] q8
++@ u8 left[y] [1] q12
++@ u16 acc      [2] q0 .. q1  = (x+1)*p[nTbS][-1] + nTbS*p[x][-1] initially
++@ u16 add [2] q2 .. q3 = p[-1][nTbs] - p[x][-1]
++1:
++ vadd.i16 q1, q3
++ vadd.i16 q0, q2
++
++ vdup.u16 d20, d24[0]
++ vext.16 q12, q12, #1
++
++ vmull.u8 q11, d17, d20
++ vmull.u8 q10, d16, d20
++
++ vadd.i16 q11, q1
++ vadd.i16 q10, q0
++
++ vrshrn.u16 d29, q11, #4
++ vrshrn.u16 d28, q10, #4
++
++ subs r1, #1
++ vst1.8 {q14}, [r0 :128], r3
++
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++
++@ ff_hevc_rpi_pred_planar_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1
++ vpush {q4-q7}
++ vld1.8 {q12, q13}, [r2 :128]! @ Left
++ adr r12, nbx2_15_0_1_16
++ vld1.8 {q0, q1 }, [r1 :128]! @ Up
++ vld1.16 {d30[0]}, [r2] @ Down left
++ vld1.16 {d31[0]}, [r1] @ Up-right
++        vldm       r12, { q8-q11}              @ 15,15..0,0, 1,1..16,16
++
++ vdup.16 d30, d30[0]
++ vdup.16 d31, d31[0]
++
++ mov r1, #16
++ vsubl.u8 q7, d30, d3
++ vsubl.u8 q6, d30, d2
++ vsubl.u8 q5, d30, d1
++ vsubl.u8 q4, d30, d0 @ Add set up
++
++ lsl r3, #1
++ vshll.u8 q3, d3, #4
++ vshll.u8 q2, d2, #4
++ vshll.u8 q1, d1, #4
++ vshll.u8 q0, d0, #4
++ vmlal.u8 q3, d23, d31
++ vmlal.u8 q2, d22, d31
++ vmlal.u8 q1, d21, d31
++ vmlal.u8 q0, d20, d31 @ Acc set up - q8-q9 free
++
++@ u8 15,15..0,0     [2] q8,  q9
++@ u8 left[y] [2] q12, q13
++@ u16 acc           [4] q0 .. q3 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add [4] q4 .. q7 = p[-1][nTbs] - p[x][-1]
++1:
++ vadd.i16 q3, q7
++ vadd.i16 q2, q6
++ vadd.i16 q1, q5
++ vadd.i16 q0, q4
++
++ vdup.u16 d20, d24[0]
++ vext.16 q12, q13, #1
++ vext.16 q13, q13, #1
++
++ vmull.u8 q15, d19, d20
++ vmull.u8 q14, d18, d20
++ vmull.u8 q11, d17, d20
++ vmull.u8 q10, d16, d20
++
++ vadd.i16 q15, q3
++ vadd.i16 q14, q2
++ vadd.i16 q11, q1
++ vadd.i16 q10, q0
++
++ vrshrn.u16 d31, q15, #5
++ vrshrn.u16 d30, q14, #5
++ vrshrn.u16 d29, q11, #5
++ vrshrn.u16 d28, q10, #5
++
++ subs r1, #1
++ vst1.8 {q14, q15}, [r0 :256], r3
++
++ bne 1b
++
++ vpop {q4-q7}
++ bx lr
++
++endfunc
++
++@------------------------------------------------------------------------------
++@
++@ Data - put between the 2 code lumps so we can reach it with an adr from both
++@ Beware - it gets quite close which is why nb_3_0_1_4 is 1st...
++
++ .text
++ .balign 64
++
++        @ These could be extracted from the above array, but separated out
++        @ for better (16 byte) alignment
++nb_3_0_1_4:
++ .byte 3, 2, 1, 0, 3, 2, 1, 0
++ .byte 1, 2, 3, 4, 1, 2, 3, 4
++nb_7_0_1_8:
++ .byte 7, 6, 5, 4, 3, 2, 1, 0
++ .byte 1, 2, 3, 4, 5, 6, 7, 8
++nbh_3_0_1_4:
++ .short 3, 2, 1, 0, 1, 2, 3, 4
++nbx2_3_0_1_4:
++ .byte 3, 3, 2, 2, 1, 1, 0, 0
++ .byte 1, 1, 2, 2, 3, 3, 4, 4
++
++ @ should be back on a 64-byte boundary here
++nb_31_0_1_32:
++ .byte 31, 30, 29, 28, 27, 26, 25, 24
++ .byte 23, 22, 21, 20, 19, 18, 17, 16
++nb_15_0_1_16:
++ .byte 15, 14, 13, 12, 11, 10, 9, 8
++ .byte 7, 6, 5, 4, 3, 2, 1, 0
++ .byte 1, 2, 3, 4, 5, 6, 7, 8
++ .byte 9, 10, 11, 12, 13, 14, 15, 16
++ .byte 17, 18, 19, 20, 21, 22, 23, 24
++ .byte 25, 26, 27, 28, 29, 30, 31, 32
++
++ @ should be back on a 64-byte boundary here
++nbx2_15_0_1_16:
++ .byte 15, 15, 14, 14, 13, 13, 12, 12
++ .byte 11, 11, 10, 10, 9, 9, 8, 8
++nbx2_7_0_1_8:
++ .byte 7, 7, 6, 6, 5, 5, 4, 4
++ .byte 3, 3, 2, 2, 1, 1, 0, 0
++ .byte 1, 1, 2, 2, 3, 3, 4, 4
++ .byte 5, 5, 6, 6, 7, 7, 8, 8
++ .byte 9, 9, 10, 10, 11, 11, 12, 12
++ .byte 13, 13, 14, 14, 15, 15, 16, 16
++
++@------------------------------------------------------------------------------
++@
++@ 10 bits
++@ (all would work with 9)
++
++@ ff_hevc_rpi_pred_planar_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_4_neon_10, export=1
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr r12, nbh_3_0_1_4
++ vld1.16 {q14}, [r2 :64]
++ vld1.16 {q8 }, [r12 :128] @ 3..0,1,..4
++ vld1.16 {q12}, [r1 :64] @ Up
++ vdup.16 d2, d29[0]
++
++ lsl r3, #1
++ vsub.i16 d4, d2, d24 @ Add set up
++
++ vdup.16 d0, d25[0]
++ vshl.i16 d24, #2
++ vmla.i16 d24, d17, d0 @ Acc set up
++ add r1, r0, r3
++ vmov d17, d16
++
++ vadd.i16 d24, d4
++ vadd.i16 d25, d24, d4
++ vshl.i16 d4, d4, #1 @ x2
++ lsl r3, #1
++ vadd.i16 d26, d24, d4
++ vadd.i16 d27, d25, d4
++
++ vdup.16 d0, d28[0]
++ vdup.16 d1, d28[1]
++ vdup.16 d2, d28[2]
++ vdup.16 d3, d28[3]
++
++ vmul.i16 q0, q8, q0
++ vmul.i16 q1, q8, q1
++ vadd.i16 q0, q12
++ vadd.i16 q1, q13
++
++ vrshr.u16 q0, #3
++ vrshr.u16 q1, #3
++
++ vst1.16 {d0}, [r0], r3
++ vst1.16 {d1}, [r1], r3
++ vst1.16 {d2}, [r0]
++ vst1.16 {d3}, [r1]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_8_neon_10, export=1
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr r12, nb_7_0_1_8
++ vld1.16 {q14}, [r2 :128]
++ ldrh r2, [r2, #16] @ Down left
++ vld1.8 {q0 }, [r12 :128] @ 7..0,1,..8
++ vld1.16 {q12}, [r1 :128] @ Up
++ ldrh r1, [r1, #16] @ Up-right
++ vmovl.u8 q8, d1
++ vdup.16 q1, r2
++ vmovl.u8 q10, d0
++
++ lsl r3, #1
++ vsub.i16 q2, q1, q12 @ Add set up
++
++ vdup.16 q0, r1
++ mov r1, #8
++ vshl.i16 q12, #3
++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free
++
++@ u16 7..0           [1] q10
++@ u16 left[y]        [1] q14
++@ u16 acc            [1] q12 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add [1] q2 = p[-1][nTbs] - p[x][-1]
++1:
++ vdup.16 q0, d28[0]
++ vext.16 q14, q14, #1
++
++ vadd.i16 q12, q2
++
++ vmul.i16 q0, q10, q0
++ vadd.i16 q0, q12
++ vrshr.u16 q0, #4
++
++ subs r1, #1
++ vst1.16 {q0 }, [r0 :128], r3
++
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_16_neon_10, export=1
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr r12, nb_15_0_1_16
++ vld1.16 {q14, q15}, [r2 :128]
++ ldrh r2, [r2, #32] @ Down left
++ vld1.8 {q0, q1 }, [r12 :128] @ 15..0,1,..16
++ vld1.16 {q12, q13}, [r1 :128] @ Up
++ ldrh r1, [r1, #32] @ Up-right
++ vmovl.u8 q9, d3
++ vmovl.u8 q8, d2
++ vdup.16 q1, r2
++ vmovl.u8 q11, d1
++ vmovl.u8 q10, d0
++
++ lsl r3, #1
++ vsub.i16 q3, q1, q13
++ vsub.i16 q2, q1, q12 @ Add set up
++
++ vdup.16 q0, r1
++ mov r1, #16
++ vshl.i16 q13, #4
++ vshl.i16 q12, #4
++ vmla.i16 q13, q9, q0
++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free
++
++@ u16 15..0 [2] q10..q11
++@ u16 left[y]        [2] q14..q15
++@ u16 acc            [2] q12..q13 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add [2] q2..q3 = p[-1][nTbs] - p[x][-1]
++1:
++ vdup.16 q0, d28[0]
++ vext.16 q14, q15, #1
++ vext.16 q15, q15, #1
++
++ vadd.i16 q13, q3
++ vadd.i16 q12, q2
++
++ vmul.i16 q1, q11, q0
++ vmul.i16 q0, q10, q0
++
++ vadd.i16 q1, q13
++ vadd.i16 q0, q12
++
++ vrshr.u16 q1, #5
++ vrshr.u16 q0, #5
++
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r0 :128], r3
++
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_32_neon_10, export=1
++ push {r4, lr}
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr r12, nb_31_0_1_32
++ vpush { q4-q7 }
++ vldm r12, { q0-q3 } @ 1..32, r12 points at 31..0
++ vldm r1!, {q12-q15} @ Up
++ ldrh r12, [r2, #64] @ Down left
++ vmovl.u8 q8, d4
++ vmovl.u8 q9, d5
++ vmovl.u8 q10, d6
++ vmovl.u8 q11, d7
++ vdup.16 q3, r12
++ vld1.16 {d4[0]}, [r1] @ Up-right
++
++ vsub.i16 q7, q3, q15
++ vsub.i16 q6, q3, q14
++ vsub.i16 q5, q3, q13
++ vsub.i16 q4, q3, q12 @ Add set up
++
++ vshl.i16 q15, #5
++ vshl.i16 q14, #5
++ vshl.i16 q13, #5
++ vshl.i16 q12, #5
++ vmla.i16 q15, q11, d4[0]
++ vmla.i16 q14, q10, d4[0]
++ vmla.i16 q13, q9, d4[0]
++ vmla.i16 q12, q8, d4[0] @ Acc set up - q8-q11 free
++
++ mov r1, #32
++ vmovl.u8 q8, d0
++ vmovl.u8 q9, d1
++ vmovl.u8 q10, d2
++ vmovl.u8 q11, d3
++
++@ u16 31..0          [4] q8..q11
++@ u16 left[y]        [4] [r2]
++@ u16 acc [4] q12..q15 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [4] q4..q7 = p[-1][nTbs] - p[x][-1]
++1:
++ vld1.16 {d0[0]}, [r2]!
++
++ vadd.i16 q15, q7
++ vadd.i16 q14, q6
++ vadd.i16 q13, q5
++ vadd.i16 q12, q4
++
++ vmul.i16 q3, q11, d0[0]
++ vmul.i16 q2, q10, d0[0]
++ vmul.i16 q1, q9, d0[0]
++ vmul.i16 q0, q8, d0[0]
++
++ vadd.i16 q3, q15
++ vadd.i16 q2, q14
++ vadd.i16 q1, q13
++ vadd.i16 q0, q12
++
++ vrshr.u16 q3, #6
++ vrshr.u16 q2, #6
++ vrshr.u16 q1, #6
++ vrshr.u16 q0, #6
++
++ subs r1, #1
++ vstm r0, { q0-q3 }
++ add r0, r0, r3, lsl #1
++
++ bne 1b
++
++ vpop {q4-q7}
++ pop {r4, pc}
++
++endfunc
++
++@ ff_hevc_rpi_pred_planar_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr r12, nbx2_3_0_1_4
++ vld1.8 {q0 }, [r12 :128] @ 3,3..0,0,1,1..4,4
++ vld1.16 {q14}, [r2 :128] @ left
++ ldr r12, [r2, #16] @ Down left
++ vld1.16 {q12}, [r1 :128] @ Up
++ vmovl.u8 q8, d1
++ vdup.32 q1, r12
++ ldr r12, [r1, #16] @ Up-right
++ vmovl.u8 q10, d0
++
++ lsl r3, #2
++ vsub.i16 q2, q1, q12 @ Add set up
++
++ mov r1, #4
++ vdup.32 q0, r12
++ vshl.i16 q12, #2
++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free
++
++@ u16 3,3..0,0 [1] q10
++@ u32 left[y] [1] q14
++@ u16 acc            [1] q12 = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
++@ u16 add [1] q2 = p[-1][nTbs] - p[x][-1]
++1:
++ vdup.32 q0, d28[0]
++ vext.32 q14, q14, #1
++
++ vadd.i16 q12, q2
++
++ vmul.i16 q0, q10, q0
++
++ vadd.i16 q0, q12
++
++ vrshr.u16 q0, #3
++
++ subs r1, #1
++ vst1.16 {q0 }, [r0 :128], r3
++
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr r12, nbx2_7_0_1_8
++ vld1.8 {q0, q1 }, [r12 :128] @ 7,7..0,0,1,1..8,8
++ vld1.16 {q14, q15}, [r2 :128]
++ ldr r12, [r2, #32] @ Down left
++ vld1.16 {q12, q13}, [r1 :128] @ Up
++ vmovl.u8 q9, d3
++ vmovl.u8 q8, d2
++ vdup.32 q1, r12
++ ldr r12, [r1, #32] @ Up-right
++ vmovl.u8 q11, d1
++ vmovl.u8 q10, d0
++
++ lsl r3, #2
++ vsub.i16 q3, q1, q13
++ vsub.i16 q2, q1, q12 @ Add set up
++
++ mov r1, #8
++ vdup.32 q0, r12
++ vshl.i16 q13, #3
++ vshl.i16 q12, #3
++ vmla.i16 q13, q9, q0
++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free
++
++@ u16 7,7..0,0 [2] q10..q11
++@ u32 left[y] [2] q14..q15
++@ u16 acc            [2] q12..q13 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add [2] q2..q3 = p[-1][nTbs] - p[x][-1]
++1:
++ vdup.32 q0, d28[0]
++ vext.32 q14, q15, #1
++ vext.32 q15, q15, #1
++
++ vadd.i16 q13, q3
++ vadd.i16 q12, q2
++
++ vmul.i16 q1, q11, q0
++ vmul.i16 q0, q10, q0
++
++ vadd.i16 q1, q13
++ vadd.i16 q0, q12
++
++ vrshr.u16 q1, #4
++ vrshr.u16 q0, #4
++
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r0 :256], r3
++
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr r12, nbx2_15_0_1_16
++ vpush { q4-q7 }
++        vldm       r12, { q0-q3 }              @ 15,15..0,0, 1,1..16,16
++ vldm r1!, {q12-q15} @ Up
++ ldr r12, [r2, #64] @ Down left
++ vmovl.u8 q11, d7
++ vmovl.u8 q10, d6
++ vmovl.u8 q9, d5
++ vmovl.u8 q8, d4
++ vdup.32 q3, r12
++ ldr r12, [r1] @ Up-right
++
++ vsub.i16 q7, q3, q15
++ vsub.i16 q6, q3, q14
++ vsub.i16 q5, q3, q13
++ vsub.i16 q4, q3, q12 @ Add set up
++
++ vdup.32 q2, r12
++ vshl.i16 q15, #4
++ vshl.i16 q14, #4
++ vshl.i16 q13, #4
++ vshl.i16 q12, #4
++ vmla.i16 q15, q11, q2
++ vmla.i16 q14, q10, q2
++ vmla.i16 q13, q9, q2
++ vmla.i16 q12, q8, q2 @ Acc set up - q8-q11 free
++
++ mov r1, #16
++ vmovl.u8 q11, d3
++ vmovl.u8 q10, d2
++ vmovl.u8 q9, d1
++ vmovl.u8 q8, d0
++
++@ u16 15,15..0,0 [4] q8..q11
++@ u32 left[y] [4] [r2]
++@ u16 acc            [4] q12..q15 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add [4] q4..q7 = p[-1][nTbs] - p[x][-1]
++1:
++ ldr r12, [r2], #4
++
++ vadd.i16 q15, q7
++ vadd.i16 q14, q6
++ vdup.32 q0, r12
++ vadd.i16 q13, q5
++ vadd.i16 q12, q4
++
++ vmul.i16 q3, q11, q0
++ vmul.i16 q2, q10, q0
++ vmul.i16 q1, q9, q0
++ vmul.i16 q0, q8, q0
++
++ vadd.i16 q3, q15
++ vadd.i16 q2, q14
++ vadd.i16 q1, q13
++ vadd.i16 q0, q12
++
++ vrshr.u16 q3, #5
++ vrshr.u16 q2, #5
++ vrshr.u16 q1, #5
++ vrshr.u16 q0, #5
++
++ subs r1, #1
++ vstm r0, { q0-q3 }
++ add r0, r0, r3, lsl #2
++
++ bne 1b
++
++ vpop {q4-q7}
++ bx lr
++endfunc
++
++
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index fb0c6fae70..9f2ebb16f3 100644
--- a/libavcodec/avcodec.h
@@ -7445,10 +14124,10 @@ index d181b74570..c52c450956 100644
if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c
new file mode 100644
-index 0000000000..4891a79eb5
+index 0000000000..f053ebcc59
--- /dev/null
+++ b/libavcodec/rpi_hevc_cabac.c
-@@ -0,0 +1,2269 @@
+@@ -0,0 +1,2266 @@
+/*
+ * HEVC CABAC decoding
+ *
@@ -8336,9 +15015,9 @@ index 0000000000..4891a79eb5
+ int x_cb = x0 >> s->ps.sps->log2_min_cb_size;
+ int y_cb = y0 >> s->ps.sps->log2_min_cb_size;
+
-+ if (lc->ctb_left_flag || x0b)
++ if ((lc->ctb_avail & AVAIL_L) != 0 || x0b)
+ depth_left = s->tab_ct_depth[(y_cb) * s->ps.sps->min_cb_width + x_cb - 1];
-+ if (lc->ctb_up_flag || y0b)
++ if ((lc->ctb_avail & AVAIL_U) != 0 || y0b)
+ depth_top = s->tab_ct_depth[(y_cb - 1) * s->ps.sps->min_cb_width + x_cb];
+
+ inc += (depth_left > ct_depth);
@@ -8876,7 +15555,6 @@ index 0000000000..4891a79eb5
+
+ // Rewrite as add residual - must rewrite all fields as different union member
+ pc->type = RPI_PRED_ADD_RESIDUAL_V;
-+ pc->c_idx = c_idx;
+ pc->ta.buf = coeffs;
+ pc->ta.dst = dst;
+ pc->ta.stride = stride;
@@ -8889,7 +15567,6 @@ index 0000000000..4891a79eb5
+
+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);
+ cmd->size = log2_trafo_size;
-+ cmd->c_idx = c_idx;
+ cmd->ta.buf = coeffs;
+ cmd->ta.dst = dst;
+ cmd->ta.stride = stride;
@@ -8945,7 +15622,6 @@ index 0000000000..4891a79eb5
+
+ cmd->type = RPI_PRED_ADD_DC + c_idx;
+ cmd->size = log2_trafo_size;
-+ cmd->c_idx = c_idx;
+ cmd->dc.dst = dst;
+ cmd->dc.stride = stride;
+ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff;
@@ -9720,7 +16396,7 @@ index 0000000000..4891a79eb5
+#endif
diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h
new file mode 100644
-index 0000000000..a360815a36
+index 0000000000..f6daf936ca
--- /dev/null
+++ b/libavcodec/rpi_hevc_cabac_fns.h
@@ -0,0 +1,190 @@
@@ -9854,9 +16530,9 @@ index 0000000000..a360815a36
+ const uint8_t * const skip_bits = s->skip_flag + y_cb * stride;
+
+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG +
-+ ((!lc->ctb_left_flag && (x0 & ctb_mask) == 0) ? 0 :
++ (((lc->ctb_avail & AVAIL_L) == 0 && (x0 & ctb_mask) == 0) ? 0 :
+ (skip_bits[((x_cb - 1) >> 3)] >> ((x_cb - 1) & 7)) & 1) +
-+ ((!lc->ctb_up_flag && (y0 & ctb_mask) == 0) ? 0 :
++ (((lc->ctb_avail & AVAIL_U) == 0 && (y0 & ctb_mask) == 0) ? 0 :
+ (skip_bits[(x_cb >> 3) - stride] >> (x_cb & 7)) & 1));
+}
+
@@ -10034,10 +16710,10 @@ index 0000000000..0aee673d8b
+#endif /* AVCODEC_RPI_HEVC_DATA_H */
diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c
new file mode 100644
-index 0000000000..a8601da4e7
+index 0000000000..05d447eaa5
--- /dev/null
+++ b/libavcodec/rpi_hevc_filter.c
-@@ -0,0 +1,1165 @@
+@@ -0,0 +1,1210 @@
+/*
+ * HEVC video decoder
+ *
@@ -10158,28 +16834,6 @@ index 0000000000..a8601da4e7
+ return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift;
+}
+
-+static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height,
-+ ptrdiff_t stride_dst, ptrdiff_t stride_src)
-+{
-+int i, j;
-+
-+ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) {
-+ for (i = 0; i < height; i++) {
-+ for (j = 0; j < width; j+=8)
-+ AV_COPY64U(dst+j, src+j);
-+ dst += stride_dst;
-+ src += stride_src;
-+ }
-+ } else {
-+ for (i = 0; i < height; i++) {
-+ for (j = 0; j < width; j+=16)
-+ AV_COPY128(dst+j, src+j);
-+ dst += stride_dst;
-+ src += stride_src;
-+ }
-+ }
-+}
-+
+// "DSP" these?
+static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
+{
@@ -10419,7 +17073,7 @@ index 0000000000..a8601da4e7
+ [2*MAX_PB_SIZE*MAX_PB_SIZE];
+ dst = dstbuf;
+ stride_dst = 2*MAX_PB_SIZE;
-+ copy_CTB(dst, src, width << sh, height, stride_dst, stride_src);
++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
+ if (sliced && c_idx != 0)
+ {
+ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst,
@@ -10533,10 +17187,7 @@ index 0000000000..a8601da4e7
+ }
+ }
+
-+ copy_CTB(dst,
-+ src,
-+ width << sh,
-+ height, stride_dst, stride_src);
++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
+
+ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+ x_ctb, y_ctb);
@@ -10568,7 +17219,6 @@ index 0000000000..a8601da4e7
+ horiz_edge,
+ diag_edge);
+ }
-+ // ??? Does this actually work for chroma ???
+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+ x, y, width, height, c_idx);
+ sao->type_idx[c_idx] = SAO_APPLIED;
@@ -10599,6 +17249,15 @@ index 0000000000..a8601da4e7
+#endif
+}
+
++// When bits are delivered to deblock we want them as:
++//#define TL 1
++//#define TR 2
++//#define BL 4
++//#define BR 8
++
++// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br
++// so we need to rearrange before passing on
++
+static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
+{
+ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;
@@ -10614,23 +17273,60 @@ index 0000000000..a8601da4e7
+ return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7);
+}
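The comment above pcm4() notes that the four PCM flags come back packed as b0 = tl, b1 = tr, b16 = bl, b17 = br, while the loop-filter calls below want them in TL=1, TR=2, BL=4, BR=8 order. The rearrangement done at the call sites is a shift-and-mask; a sketch of just that repack (the helper name is hypothetical, the expression is the one used inline below):

/* Repack a pcm4() result into TL/TR/BL/BR order:
 * bits 0..1 (tl, tr) stay put, bits 16..17 (bl, br) drop down to bits 2..3. */
static inline unsigned int pcm4_to_tlbr(unsigned int pcmfa)
{
    return (pcmfa & 3) | ((pcmfa >> 14) & 0xc);
}

This is the expression that appears wherever the chroma deblock passes a pcmfa value on to hevc_v_loop_filter_uv2 / hevc_h_loop_filter_uv.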
+
-+// We sometimes need 17 2-bit entries (annoying!)
-+// * This could be avoided if we separate out the H filter left-stub deblock
-+// but 64 bit constant shr shouldn't be too bad - though the variable mask here is probably quite nasty
-+static inline uint64_t hbs_get(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++// We cast away const here as we want this to work for both get and set
++static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
+{
-+ unsigned int n = (xr - xl + 7) & ~7;
-+
-+ return n == 0 ? (uint64_t)0 :
-+ (*(uint64_t *)(s->horizontal_bs + (xl >> 4) + (y >> 3) * s->hbs_stride) >> ((xl >> 1) & 7)) & (((uint64_t)1 << (n >> 1)) - 1);
++ return (uint32_t *)(bs +
++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
++#warning Unexpected masks
++ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes
++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
++ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) +
++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
++#error Stride1 < return size
++#endif
++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
+}
+
-+static inline uint64_t vbs_get(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
+{
-+ unsigned int n = (xr - xl + 7) & ~7;
++ return (uint8_t *)(bs +
++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
++ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) +
++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
++}
+
-+ return n == 0 ? (uint64_t)0 :
-+ (*(uint64_t *)(s->vertical_bs2 + (xl >> 4) + (y >> 3) * s->hbs_stride) >> ((xl >> 1) & 7)) & (((uint64_t)1 << (n >> 1)) - 1);
++
++// Get block strength
++// Given how we call we will always get within the 32bit boundries
++static inline uint32_t bs_get32(const uint8_t * bs, const unsigned int stride2,
++ const unsigned int xl, const unsigned int xr, const unsigned int y)
++{
++ if (xr <= xl) {
++ return 0;
++ }
++ else
++ {
++ const uint32_t a = *bs_ptr32(bs, stride2, xl, y);
++ const unsigned int n = ((xr - xl + 7) & ~7) >> 1;
++
++ return n == 32 ? a :
++ (a >> ((xl >> 1) & 31)) & ~(~0U << n);
++ }
++}
++
++static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++{
++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0);
++ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y);
++}
++
++static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++{
++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0);
++ return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y);
+}
+
+
@@ -10658,68 +17354,78 @@ index 0000000000..a8601da4e7
+ // Main body
+ for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8)
+ {
++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y);
++
+ const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp;
+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
+
++ if (vbs != 0)
+ {
+ const uint8_t * const tcv = tctable + dbp->tc_offset;
+ const uint8_t * const betav = betatable + dbp->beta_offset;
+ unsigned int pcmfa = pcm2(s, bv_l - 1, y);
-+// const uint8_t * vbs = s->vertical_bs + (bv_l >> 3) * s->bs_height + (y >> 2);
-+ uint64_t vbs2 = vbs_get(s, bv_l, bv_r, y);
+ unsigned int x;
+
-+ for (x = bv_l; x < bv_r; x += 8)
++ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1)
+ {
-+ const unsigned int pcmf_v = pcmfa & 3;
-+ const unsigned int bs0 = vbs2 & 3;
-+ const unsigned int bs1 = (vbs2 & 0xc) >> 2;
-+
-+ if ((bs0 | bs1) != 0 && pcmf_v != 3)
++ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3)
+ {
+ const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
+ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
+ frame_stride1(s->frame, LUMA),
+ betav[qp],
-+ (bs0 == 0 ? 0 : tcv[qp + (int)(bs0 & 2)]) |
-+ ((bs1 == 0 ? 0 : tcv[qp + (int)(bs1 & 2)]) << 16),
-+ pcmf_v,
++ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) |
++ (((vbs & 0xc) == 0 ? 0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16),
++ pcmfa & 3,
+ av_rpi_sand_frame_pos_y(s->frame, x - 4, y));
+ }
-+
-+ pcmfa >>= 1;
-+// vbs += s->bs_height;
-+ vbs2 >>= 4;
+ }
+ }
+
+ if (y != 0)
+ {
-+ unsigned int x;
-+ unsigned int pcmfa = pcm4(s, bh_l, y - 1);
-+ uint64_t hbs = hbs_get(s, bh_l, bh_r + 1, y); // Will give (x <= bh_r) in for loop
++ uint32_t hbs;
+
-+ for (x = bh_l; hbs != 0; x += 8, hbs >>= 4)
++ // H left - mostly separated out so we only need a uint32_t hbs
++ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0)
+ {
-+ const unsigned int pcmf_h = (pcmfa & 1) | ((pcmfa & 0x10000) >> 15);
-+ const unsigned int bs0 = hbs & 3;
-+ const unsigned int bs1 = (hbs >> 2) & 3;
++ const unsigned int x = bh_l;
++ const unsigned int pcmfa = pcm4(s, bh_l, y - 1);
++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ const DBParams * const dbph = dbp - 1;
++ const uint8_t * const tc = tctable + dbph->tc_offset + qp;
+
-+ if ((bs0 | bs1) != 0 && pcmf_h != 3)
++ av_assert2(cb_x - bh_l == 8);
++
++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
++ frame_stride1(s->frame, LUMA),
++ betatable[qp + dbph->beta_offset],
++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
++ }
++
++ // H
++ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop
++ {
++ unsigned int x;
++ unsigned int pcmfa = pcm4(s, cb_x, y - 1);
++
++ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1)
+ {
-+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+ const DBParams * const dbph = (x < cb_x ? dbp - 1 : dbp);
-+ const uint8_t * const tc = tctable + dbph->tc_offset + qp;
-+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
-+ frame_stride1(s->frame, LUMA),
-+ betatable[qp + dbph->beta_offset],
-+ (bs0 == 0 ? 0 : tc[bs0 & 2]) |
-+ ((bs1 == 0 ? 0 : tc[bs1 & 2]) << 16),
-+ pcmf_h);
++ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0)
++ {
++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ const uint8_t * const tc = tctable + dbp->tc_offset + qp;
++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
++ frame_stride1(s->frame, LUMA),
++ betatable[qp + dbp->beta_offset],
++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
++ }
+ }
-+
-+ pcmfa >>= 1;
+ }
+ }
+
@@ -10727,11 +17433,6 @@ index 0000000000..a8601da4e7
+ }
+}
+
-+#define TL 1
-+#define TR 2
-+#define BL 4
-+#define BR 8
-+
+static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
+{
+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
@@ -10768,98 +17469,119 @@ index 0000000000..a8601da4e7
+ // Deblock V up 8
+ // CTB above current
+ // Top-half only (tc4 & ~0xffff == 0) is special cased in asm
-+ unsigned int x;
+ const unsigned int y = bounds.y - 8;
++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U;
+
-+ unsigned int pcmfa = pcm2(s, bv_l - 1, y);
-+ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset;
-+ uint64_t vbs2 = (vbs_get(s, bv_l, bv_r, y) & 0x0202020202020202U);
-+
-+ for (x = bv_l; x < bv_r; x += 16, vbs2 >>= 8)
++ if (vbs != 0)
+ {
-+ const unsigned int pcmf_v = (pcmfa & 3);
-+ if ((vbs2 & 2) != 0 && pcmf_v != 3)
++ unsigned int pcmfa = pcm2(s, bv_l - 1, y);
++ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset;
++ unsigned int x;
++
++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
+ {
-+ const int qp0 = q2h(s, x, y);
-+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
-+ frame_stride1(s->frame, 1),
-+ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8),
-+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
-+ pcmf_v);
++ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0)
++ {
++ const int qp0 = q2h(s, x, y);
++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++ frame_stride1(s->frame, 1),
++ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8),
++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
++ pcmfa & 3);
++ }
+ }
-+ pcmfa >>= 2;
+ }
+ }
+
+ for (y = bounds.y; y < b_b; y += 16)
+ {
++ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) |
++ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4);
++
+ // V
++ if (vbs != 0)
+ {
+ unsigned int x;
-+ unsigned int pcmfa = pcm4(s, bv_l - 1, y);
-+ const unsigned int pcmf_or = (y + 16 <= b_b) ? 0 : BL | BR;
++ unsigned int pcmfa =
++ (y + 16 > b_b ?
++ pcm2(s, bv_l - 1, y) | 0xffff0000 :
++ pcm4(s, bv_l - 1, y));
+ const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
-+ uint64_t vbs2 = (vbs_get(s, bv_l, bv_r, y) & 0x0202020202020202U) |
-+ ((vbs_get(s, bv_l, bv_r, y + 8) & 0x0202020202020202U) << 4);
+
-+ for (x = bv_l; x < bv_r; x += 16, vbs2 >>= 8)
++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
+ {
-+ const unsigned int pcmf_v = pcmf_or | (pcmfa & 3) | ((pcmfa >> 14) & 0xc);
-+ const unsigned int bs0 = (~pcmf_v & (TL | TR)) == 0 ? 0 : vbs2 & 2;
-+ const unsigned int bs1 = (~pcmf_v & (BL | BR)) == 0 ? 0 : (vbs2 & 0x20) >> 4;
-+
-+ if ((bs0 | bs1) != 0)
++ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
+ {
+ const int qp0 = q2h(s, x, y);
+ const int qp1 = q2h(s, x, y + 8);
+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+ frame_stride1(s->frame, 1),
-+ ((bs0 == 0) ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
-+ ((bs1 == 0) ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
-+ pcmf_v);
++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
+ }
-+
-+ pcmfa >>= 2;
+ }
+ }
+
+ // H
+ if (y != 0)
+ {
-+ unsigned int x;
-+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16;
++ uint32_t hbs;
+ const unsigned int bh_l = bv_l - 16;
-+ unsigned int pcmfa = pcm4(s, bh_l, y - 1);
-+ uint64_t hbs = hbs_get(s, bh_l, bh_r, y) & 0x2222222222222222U;
++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16;
+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
+
-+ // Chop off bits we don't want...
-+ if (bh_l < bounds.x) {
-+ pcmfa |= 0x10001; // TL|BL pre rearrangement
-+ hbs &= ~(uint64_t)3; // Make BS 0
-+ }
-+
-+ for (x = bh_l; hbs != 0; x += 16, hbs >>= 8)
++ // H left - mostly separated out so we only need a uint32_t hbs
++ // Stub is width 8 to the left of bounds, but width 16 internally
++ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0)
+ {
-+ const unsigned int pcmf_h = (x + 16 > bh_r ? TR | BR : 0) |
-+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc);
-+ const int bs0 = hbs & 2;
-+ const int bs1 = (~pcmf_h & (TR | BR)) == 0 ? 0 : (hbs >> 4) & 2;
-+ if ((bs0 | bs1) != 0)
++ unsigned int pcmfa = pcm4(s, bh_l, y - 1);
++
++ // Chop off bits we don't want...
++ if (bh_l < bounds.x) {
++ pcmfa |= 0x10001; // TL|BL pre rearrangement
++ hbs &= ~3; // Make BS 0
++ }
++
++ // Double check we still want this
++ if (hbs != 0 && (~pcmfa & 0x30003) != 0)
+ {
++ const unsigned int x = bh_l;
+ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
+ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
-+ const uint8_t * const tc = tctable + 2 + (x < cb_x ? dbp - 1 : dbp)->tc_offset;
++ const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset;
+
+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+ frame_stride1(s->frame, 1),
-+ ((bs0 == 0) ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
-+ ((bs1 == 0) ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
-+ pcmf_h);
++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++ }
++ }
++
++ // H main
++ if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0)
++ {
++ unsigned int x;
++ unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it
++
++ for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2)
++ {
++ if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
++ {
++ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
++ const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
++
++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++ frame_stride1(s->frame, 1),
++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++ }
+ }
-+ pcmfa >>= 2;
+ }
+ }
+ }
@@ -10871,18 +17593,18 @@ index 0000000000..a8601da4e7
+ return x & ~(~0U << log2_n);
+}
+
-+static inline void set_bs_h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
++static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
+{
+ av_assert2((y & 7) == 0);
+
 +    // This doesn't have the same simultaneous update issues that bsf_stash
+ // does (other threads will have a different y) so we can do it the easy way
+ if ((bsf &= mask) != 0)
-+ *(uint32_t *)(s->horizontal_bs + ((x >> 4) & ~3) + (y >> 3) * s->hbs_stride) |= bsf << ((x >> 1) & 31);
++ *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31);
+}
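hbs_set() above and bs_get32() earlier share one packing: every 4-pel edge segment owns a 2-bit strength, so a pel offset x maps to bit (x >> 1) & 31 of its 32-bit word. A minimal round-trip of just that bit layout; the surrounding stride arithmetic and the HEVC_RPI_BS_* constants are deliberately left out of this sketch:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t word = 0;
    unsigned int x = 24;                      /* pel offset of a 4-pel edge segment */

    /* Set: the same shift hbs_set() uses: bsf << ((x >> 1) & 31) */
    word |= 2u << ((x >> 1) & 31);

    /* Get an 8-pel span starting at xl, as bs_get32() does:
     * n = ((xr - xl + 7) & ~7) >> 1 bits, then shift and mask. */
    unsigned int xl = 24, xr = 32;
    unsigned int n = ((xr - xl + 7) & ~7) >> 1;              /* 4 bits here */
    uint32_t got = (word >> ((xl >> 1) & 31)) & ~(~0U << n);

    assert((got & 3) == 2);                   /* first 2-bit entry is the strength we set */
    return 0;
}

bs_get32() special-cases n == 32 because shifting a 32-bit value by 32 is undefined; the sketch never reaches that case.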
+
+
-+static void set_bs_v(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
++static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
+{
+ // We arrange this in a slightly odd fashion but it lines up with
+ // how we are going to use it in the actual deblock code & it is easier
@@ -10894,8 +17616,7 @@ index 0000000000..a8601da4e7
+
+ if ((bsf &= mask) != 0)
+ {
-+ const unsigned int stride1 = s->hbs_stride;
-+ uint8_t *p = s->vertical_bs2 + (x >> 4) + (y >> 3) * stride1;
++ uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y);
+ const unsigned int sh = ((x & 8) | (y & 4)) >> 1;
+
+ if (mask <= 0xf)
@@ -10906,7 +17627,7 @@ index 0000000000..a8601da4e7
+ {
+ do {
+ *p |= (bsf & 0xf) << sh;
-+ p += stride1;
++ p += HEVC_RPI_BS_STRIDE1_BYTES;
+ } while ((bsf >>= 4) != 0);
+ }
+ }
@@ -10918,19 +17639,10 @@ index 0000000000..a8601da4e7
+ const RefPicList * const rpl_p, const RefPicList * const rpl_q,
+ const MvField * const mvf_p, const MvField * const mvf_q)
+{
-+ uint8_t res[16];
-+ unsigned int i;
-+ unsigned int a = 0;
-+
-+ s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup,
-+ sizeof(MvField) * mvf_stride, 1,
++ return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup,
++ mvf_p, mvf_q,
+ rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list,
-+ mvf_p, mvf_q, res);
-+
-+ for (i = 0; i != rep * dup; ++i) {
-+ a |= res[i] << (i * 2);
-+ }
-+ return a;
++ sizeof(MvField) * mvf_stride);
+}
+
+
@@ -11050,7 +17762,7 @@ index 0000000000..a8601da4e7
+ }
+
+ // Finally put the results into bs
-+ set_bs_h(s, x0, y0, bsf_mask, bsf_h);
++ hbs_set(s, x0, y0, bsf_mask, bsf_h);
+ }
+
+ // Max of 1 pu internal split - ignore if not on 8pel boundary
@@ -11061,7 +17773,7 @@ index 0000000000..a8601da4e7
+ // If we have the x split as well then it must be in the middle
+ const unsigned int log2_rep = has_x_split ? 1 : 0;
+
-+ set_bs_h(s, x0, lc->cu.y_split, bsf_mask,
++ hbs_set(s, x0, lc->cu.y_split, bsf_mask,
+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
+ trafo_size >> (log2_min_pu_size + log2_rep),
+ rpl, rpl,
@@ -11074,7 +17786,7 @@ index 0000000000..a8601da4e7
+ {
+ // Boundary left
+ if (x0 != 0 &&
-+ ((x0 & ((1 << s->ps.sps->log2_ctb_size) - 1)) != 0 ||
++ (off_boundary(x0, s->ps.sps->log2_ctb_size) ||
+ (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0))
+ {
+ if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split))
@@ -11090,7 +17802,7 @@ index 0000000000..a8601da4e7
+ mvf_curr, mvf_curr - 1);
+ }
+
-+ set_bs_v(s, x0, y0, bsf_mask, bsf_v);
++ vbs_set(s, x0, y0, bsf_mask, bsf_v);
+ }
+
+ if (has_x_split && !off_boundary(lc->cu.x_split, 3))
@@ -11099,7 +17811,7 @@ index 0000000000..a8601da4e7
+ (y0 >> log2_min_pu_size) * mvf_stride + (lc->cu.x_split >> log2_min_pu_size);
+ const unsigned int log2_rep = has_y_split ? 1 : 0;
+
-+ set_bs_v(s, lc->cu.x_split, y0, bsf_mask,
++ vbs_set(s, lc->cu.x_split, y0, bsf_mask,
+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
+ (mvf_stride << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
+ rpl, rpl,
@@ -11135,6 +17847,12 @@ index 0000000000..a8601da4e7
+
+ // Deblock may not touch the edges of the bound as they are still needed
+ // for Intra pred
++ //
++ // Deblock is disabled with a per-slice flag
++    // Given that bounds may cover multiple slices & we deblock outside bounds
++ // anyway we can't avoid deblock using that flag - about the only thing we
++ // could do is have a "no deblock seen yet" flag but it doesn't really
++ // seem worth the effort
+
+ deblock_y_blk(s, bounds, x_end, y_end);
+ deblock_uv_blk(s, bounds, x_end, y_end);
@@ -11150,9 +17868,12 @@ index 0000000000..a8601da4e7
+ const unsigned int xl = ussub(bounds.x, xo);
+ const unsigned int xr = x_end ? br : ussub(br, xo);
+
-+ for (y = yt; y < yb; y += ctb_size) {
-+ for (x = xl; x < xr; x += ctb_size) {
-+ sao_filter_CTB(s, x, y);
++ if (s->ps.sps->sao_enabled)
++ {
++ for (y = yt; y < yb; y += ctb_size) {
++ for (x = xl; x < xr; x += ctb_size) {
++ sao_filter_CTB(s, x, y);
++ }
+ }
+ }
+
@@ -11205,10 +17926,10 @@ index 0000000000..a8601da4e7
+
diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c
new file mode 100644
-index 0000000000..93f3530ff5
+index 0000000000..f283f01489
--- /dev/null
+++ b/libavcodec/rpi_hevc_mvs.c
-@@ -0,0 +1,761 @@
+@@ -0,0 +1,704 @@
+/*
+ * HEVC video decoder
+ *
@@ -11250,43 +17971,6 @@ index 0000000000..93f3530ff5
+ { 3, 2, },
+};
+
-+void ff_hevc_rpi_set_neighbour_available(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
-+ const int nPbW, const int nPbH)
-+{
-+ int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
-+ int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
-+
-+ lc->na.cand_up = (lc->ctb_up_flag || y0b);
-+ lc->na.cand_left = (lc->ctb_left_flag || x0b);
-+ lc->na.cand_up_left = (!x0b && !y0b) ? lc->ctb_up_left_flag : lc->na.cand_left && lc->na.cand_up;
-+ lc->na.cand_up_right = (x0 + nPbW) >= lc->end_of_ctb_x ?
-+ (lc->ctb_up_right_flag && !y0b) : lc->na.cand_up;
-+ lc->na.cand_bottom_left = ((y0 + nPbH) >= lc->end_of_ctb_y) ? 0 : lc->na.cand_left;
-+}
-+
-+/*
-+ * 6.4.1 Derivation process for z-scan order block availability
-+ */
-+static av_always_inline int z_scan_block_avail(const HEVCRpiContext * const s, const int xCurr, const int yCurr,
-+ const int xN, const int yN)
-+{
-+#define MIN_TB_ADDR_ZS(x, y) \
-+ s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)]
-+
-+ int xCurr_ctb = xCurr >> s->ps.sps->log2_ctb_size;
-+ int yCurr_ctb = yCurr >> s->ps.sps->log2_ctb_size;
-+ int xN_ctb = xN >> s->ps.sps->log2_ctb_size;
-+ int yN_ctb = yN >> s->ps.sps->log2_ctb_size;
-+ if( yN_ctb < yCurr_ctb || xN_ctb < xCurr_ctb )
-+ return 1;
-+ else {
-+ int Curr = MIN_TB_ADDR_ZS((xCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask,
-+ (yCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask);
-+ int N = MIN_TB_ADDR_ZS((xN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask,
-+ (yN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask);
-+ return N <= Curr;
-+ }
-+}
+
+//check if the two luma locations belong to the same motion estimation region
+static av_always_inline int is_diff_mer(const HEVCRpiContext * const s, int xN, int yN, int xP, int yP)
@@ -11451,8 +18135,7 @@ index 0000000000..93f3530ff5
+ x < s->ps.sps->width) {
+ x &= ~15;
+ y &= ~15;
-+ if (s->threads_type == FF_THREAD_FRAME)
-+ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y);
++ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y);
+ x_pu = x >> s->ps.sps->log2_min_pu_size;
+ y_pu = y >> s->ps.sps->log2_min_pu_size;
+ temp_col = TAB_MVF(x_pu, y_pu);
@@ -11465,8 +18148,7 @@ index 0000000000..93f3530ff5
+ y = y0 + (nPbH >> 1);
+ x &= ~15;
+ y &= ~15;
-+ if (s->threads_type == FF_THREAD_FRAME)
-+ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y);
++ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y);
+ x_pu = x >> s->ps.sps->log2_min_pu_size;
+ y_pu = y >> s->ps.sps->log2_min_pu_size;
+ temp_col = TAB_MVF(x_pu, y_pu);
@@ -11478,9 +18160,6 @@ index 0000000000..93f3530ff5
+#define AVAILABLE(cand, v) \
+ (cand && !(TAB_MVF_PU(v).pred_flag == PF_INTRA))
+
-+#define PRED_BLOCK_AVAILABLE(v) \
-+ z_scan_block_avail(s, x0, y0, x ## v, y ## v)
-+
+#define COMPARE_MV_REFIDX(a, b) \
+ compare_mv_ref_idx(TAB_MVF_PU(a), TAB_MVF_PU(b))
+
@@ -11489,7 +18168,7 @@ index 0000000000..93f3530ff5
+ */
+static void derive_spatial_merge_candidates(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0,
+ int nPbW, int nPbH,
-+ int log2_cb_size,
++ int log2_cb_size, const unsigned int avail,
+ int singleMCLFlag, int part_idx,
+ int merge_idx,
+ struct MvField mergecandlist[])
@@ -11498,13 +18177,6 @@ index 0000000000..93f3530ff5
+ const MvField * const tab_mvf = s->ref->tab_mvf;
+
+ const int min_pu_width = s->ps.sps->min_pu_width;
-+
-+ const int cand_bottom_left = lc->na.cand_bottom_left;
-+ const int cand_left = lc->na.cand_left;
-+ const int cand_up_left = lc->na.cand_up_left;
-+ const int cand_up = lc->na.cand_up;
-+ const int cand_up_right = lc->na.cand_up_right;
-+
+ const int xA1 = x0 - 1;
+ const int yA1 = y0 + nPbH - 1;
+
@@ -11542,7 +18214,7 @@ index 0000000000..93f3530ff5
+ is_diff_mer(s, xA1, yA1, x0, y0)) {
+ is_available_a1 = 0;
+ } else {
-+ is_available_a1 = AVAILABLE(cand_left, A1);
++ is_available_a1 = AVAILABLE((avail & AVAIL_L) != 0, A1);
+ if (is_available_a1) {
+ mergecandlist[nb_merge_cand] = TAB_MVF_PU(A1);
+ if (merge_idx == 0)
@@ -11558,7 +18230,7 @@ index 0000000000..93f3530ff5
+ is_diff_mer(s, xB1, yB1, x0, y0)) {
+ is_available_b1 = 0;
+ } else {
-+ is_available_b1 = AVAILABLE(cand_up, B1);
++ is_available_b1 = AVAILABLE((avail & AVAIL_U) != 0, B1);
+ if (is_available_b1 &&
+ !(is_available_a1 && COMPARE_MV_REFIDX(B1, A1))) {
+ mergecandlist[nb_merge_cand] = TAB_MVF_PU(B1);
@@ -11569,8 +18241,7 @@ index 0000000000..93f3530ff5
+ }
+
+ // above right spatial merge candidate
-+ is_available_b0 = AVAILABLE(cand_up_right, B0) &&
-+ PRED_BLOCK_AVAILABLE(B0) &&
++ is_available_b0 = AVAILABLE((avail & AVAIL_UR) != 0, B0) &&
+ !is_diff_mer(s, xB0, yB0, x0, y0);
+
+ if (is_available_b0 &&
@@ -11582,8 +18253,7 @@ index 0000000000..93f3530ff5
+ }
+
+ // left bottom spatial merge candidate
-+ is_available_a0 = AVAILABLE(cand_bottom_left, A0) &&
-+ PRED_BLOCK_AVAILABLE(A0) &&
++ is_available_a0 = AVAILABLE((avail & AVAIL_DL) != 0, A0) &&
+ !is_diff_mer(s, xA0, yA0, x0, y0);
+
+ if (is_available_a0 &&
@@ -11595,7 +18265,7 @@ index 0000000000..93f3530ff5
+ }
+
+ // above left spatial merge candidate
-+ is_available_b2 = AVAILABLE(cand_up_left, B2) &&
++ is_available_b2 = AVAILABLE((avail & AVAIL_UL) != 0, B2) &&
+ !is_diff_mer(s, xB2, yB2, x0, y0);
+
+ if (is_available_b2 &&
@@ -11697,8 +18367,8 @@ index 0000000000..93f3530ff5
+ part_idx = 0;
+ }
+
-+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, nPbW, nPbH);
+ derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH),
+ singleMCLFlag, part_idx,
+ merge_idx, mergecand_list);
+
@@ -11780,8 +18450,9 @@ index 0000000000..93f3530ff5
+ (y ## v) >> s->ps.sps->log2_min_pu_size, \
+ pred, &mx, ref_idx_curr, ref_idx)
+
-+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, int x0, int y0, int nPbW,
-+ int nPbH, int log2_cb_size, int part_idx,
++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc,
++ int x0, int y0, int nPbW, int nPbH,
++ int log2_cb_size, const unsigned int avail, int part_idx,
+ int merge_idx, MvField * const mv,
+ int mvp_lx_flag, int LX)
+{
@@ -11811,11 +18482,6 @@ index 0000000000..93f3530ff5
+ int pred_flag_index_l0;
+ int pred_flag_index_l1;
+
-+ const int cand_bottom_left = lc->na.cand_bottom_left;
-+ const int cand_left = lc->na.cand_left;
-+ const int cand_up_left = lc->na.cand_up_left;
-+ const int cand_up = lc->na.cand_up;
-+ const int cand_up_right = lc->na.cand_up_right;
+ ref_idx_curr = LX;
+ ref_idx = mv->ref_idx[LX];
+ pred_flag_index_l0 = LX;
@@ -11825,14 +18491,13 @@ index 0000000000..93f3530ff5
+ xA0 = x0 - 1;
+ yA0 = y0 + nPbH;
+
-+ is_available_a0 = AVAILABLE(cand_bottom_left, A0) &&
-+ PRED_BLOCK_AVAILABLE(A0);
++ is_available_a0 = AVAILABLE((avail & AVAIL_DL) != 0, A0);
+
+ //left spatial merge candidate
+ xA1 = x0 - 1;
+ yA1 = y0 + nPbH - 1;
+
-+ is_available_a1 = AVAILABLE(cand_left, A1);
++ is_available_a1 = AVAILABLE((avail & AVAIL_L), A1);
+ if (is_available_a0 || is_available_a1)
+ isScaledFlag_L0 = 1;
+
@@ -11879,18 +18544,17 @@ index 0000000000..93f3530ff5
+ xB0 = x0 + nPbW;
+ yB0 = y0 - 1;
+
-+ is_available_b0 = AVAILABLE(cand_up_right, B0) &&
-+ PRED_BLOCK_AVAILABLE(B0);
++ is_available_b0 = AVAILABLE((avail & AVAIL_UR) != 0, B0);
+
+ // above spatial merge candidate
+ xB1 = x0 + nPbW - 1;
+ yB1 = y0 - 1;
-+ is_available_b1 = AVAILABLE(cand_up, B1);
++ is_available_b1 = AVAILABLE((avail & AVAIL_U) != 0, B1);
+
+ // above left spatial merge candidate
+ xB2 = x0 - 1;
+ yB2 = y0 - 1;
-+ is_available_b2 = AVAILABLE(cand_up_left, B2);
++ is_available_b2 = AVAILABLE((avail & AVAIL_UL) != 0, B2);
+
+ // above right spatial merge candidate
+ if (is_available_b0) {
@@ -12162,10 +18826,10 @@ index 0000000000..4b4d032a16
+#endif /* AVCODEC_RPI_HEVC_PARSE_H */
diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c
new file mode 100644
-index 0000000000..e8df452021
+index 0000000000..4967b3f44c
--- /dev/null
+++ b/libavcodec/rpi_hevc_ps.c
-@@ -0,0 +1,1957 @@
+@@ -0,0 +1,1934 @@
+/*
+ * HEVC Parameter Set decoding
+ *
@@ -13347,7 +20011,7 @@ index 0000000000..e8df452021
+ sps->long_term_ref_pics_present_flag = get_bits1(gb);
+ if (sps->long_term_ref_pics_present_flag) {
+ sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb);
-+ if (sps->num_long_term_ref_pics_sps > 31U) {
++ if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) {
+ av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n",
+ sps->num_long_term_ref_pics_sps);
+ return AVERROR_INVALIDDATA;
@@ -13543,7 +20207,6 @@ index 0000000000..e8df452021
+ av_freep(&pps->tile_size);
+ av_freep(&pps->tile_id);
+ av_freep(&pps->ctb_ts_flags);
-+ av_freep(&pps->min_tb_addr_zs_tab);
+
+ av_freep(&pps);
+}
@@ -13608,7 +20271,6 @@ index 0000000000..e8df452021
+static inline int setup_pps(AVCodecContext * const avctx,
+ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
+{
-+ int log2_diff;
+ int pic_area_in_ctbs;
+ int i, j, x, y, ctb_addr_rs, tile_id;
+
@@ -13712,9 +20374,8 @@ index 0000000000..e8df452021
+ pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size));
+ pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts));
+ pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags));
-+ pps->min_tb_addr_zs_tab = av_malloc_array((sps->tb_mask+2) * (sps->tb_mask+2), sizeof(*pps->min_tb_addr_zs_tab));
+ if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs ||
-+ !pps->tile_id || !pps->min_tb_addr_zs_tab || pps->tile_pos_ts == NULL || pps->tile_size == NULL) {
++ !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) {
+ return AVERROR(ENOMEM);
+ }
+
@@ -13810,26 +20471,6 @@ index 0000000000..e8df452021
+ }
+ }
+
-+ log2_diff = sps->log2_ctb_size - sps->log2_min_tb_size;
-+ pps->min_tb_addr_zs = &pps->min_tb_addr_zs_tab[1*(sps->tb_mask+2)+1];
-+ for (y = 0; y < sps->tb_mask+2; y++) {
-+ pps->min_tb_addr_zs_tab[y*(sps->tb_mask+2)] = -1;
-+ pps->min_tb_addr_zs_tab[y] = -1;
-+ }
-+ for (y = 0; y < sps->tb_mask+1; y++) {
-+ for (x = 0; x < sps->tb_mask+1; x++) {
-+ int tb_x = x >> log2_diff;
-+ int tb_y = y >> log2_diff;
-+ int rs = sps->ctb_width * tb_y + tb_x;
-+ int val = pps->ctb_addr_rs_to_ts[rs] << (log2_diff * 2);
-+ for (i = 0; i < log2_diff; i++) {
-+ int m = 1 << i;
-+ val += (m & x ? m * m : 0) + (m & y ? 2 * m * m : 0);
-+ }
-+ pps->min_tb_addr_zs[y * (sps->tb_mask+2) + x] = val;
-+ }
-+ }
-+
+ return 0;
+}
+
@@ -14125,10 +20766,10 @@ index 0000000000..e8df452021
+}
diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h
new file mode 100644
-index 0000000000..c9ecf9a268
+index 0000000000..77af463e31
--- /dev/null
+++ b/libavcodec/rpi_hevc_ps.h
-@@ -0,0 +1,441 @@
+@@ -0,0 +1,442 @@
+/*
+ * HEVC parameter set parsing
+ *
@@ -14237,6 +20878,9 @@ index 0000000000..c9ecf9a268
+ int num_entry_point_offsets;
+ int offsets_allocated;
+
++ uint8_t offload_wpp;
++ uint8_t offload_tiles;
++
+ int8_t slice_qp;
+
+ uint8_t luma_log2_weight_denom;
@@ -14388,8 +21032,8 @@ index 0000000000..c9ecf9a268
+ uint8_t sao_enabled;
+
+ uint8_t long_term_ref_pics_present_flag;
-+ uint16_t lt_ref_pic_poc_lsb_sps[32];
-+ uint8_t used_by_curr_pic_lt_sps_flag[32];
++ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS];
++ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS];
+ uint8_t num_long_term_ref_pics_sps;
+
+ struct {
@@ -14532,8 +21176,6 @@ index 0000000000..c9ecf9a268
+ uint16_t *tile_id; ///< TileId
+ uint16_t *tile_pos_ts; ///< TilePosRS
+ uint16_t *tile_size; ///< TileSize
-+ int *min_tb_addr_zs; ///< MinTbAddrZS
-+ int *min_tb_addr_zs_tab;///< MinTbAddrZS
+ uint8_t * ctb_ts_flags;
+
+ uint8_t data[4096];
@@ -14541,14 +21183,14 @@ index 0000000000..c9ecf9a268
+} HEVCRpiPPS;
+
+typedef struct HEVCRpiParamSets {
-+ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT];
-+ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT];
-+ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT];
-+
+ /* currently active parameter sets */
+ const HEVCRpiVPS *vps;
+ const HEVCRpiSPS *sps;
+ const HEVCRpiPPS *pps;
++
++ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT];
++ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT];
++ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT];
+} HEVCRpiParamSets;
+
+int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
@@ -15093,7 +21735,7 @@ index 0000000000..d7745711ab
+}
diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c
new file mode 100644
-index 0000000000..c5133a8a88
+index 0000000000..cd8149d58e
--- /dev/null
+++ b/libavcodec/rpi_hevc_sei.c
@@ -0,0 +1,368 @@
@@ -15194,10 +21836,11 @@ index 0000000000..c5133a8a88
+ s->quincunx_subsampling = get_bits1(gb);
+ s->content_interpretation_type = get_bits(gb, 6);
+
-+ // the following skips spatial_flipping_flag frame0_flipped_flag
-+ // field_views_flag current_frame_is_frame0_flag
-+ // frame0_self_contained_flag frame1_self_contained_flag
-+ skip_bits(gb, 6);
++ // spatial_flipping_flag, frame0_flipped_flag, field_views_flag
++ skip_bits(gb, 3);
++ s->current_frame_is_frame0_flag = get_bits1(gb);
++ // frame0_self_contained_flag, frame1_self_contained_flag
++ skip_bits(gb, 2);
+
+ if (!s->quincunx_subsampling && s->arrangement_type != 5)
+ skip_bits(gb, 16); // frame[01]_grid_position_[xy]
@@ -15371,8 +22014,8 @@ index 0000000000..c5133a8a88
+ return 0;
+}
+
-+static int decode_nal_sei_prefix(GetBitContext *gb, HEVCSEIContext *s, const HEVCRpiParamSets *ps,
-+ int type, int size, void *logctx)
++static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps,
++ int type, int size)
+{
+ switch (type) {
+ case 256: // Mismatched value from HM 8.1
@@ -15400,8 +22043,8 @@ index 0000000000..c5133a8a88
+ }
+}
+
-+static int decode_nal_sei_suffix(GetBitContext *gb, HEVCSEIContext *s,
-+ int type, int size, void *logctx)
++static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
++ int type, int size)
+{
+ switch (type) {
+ case HEVC_SEI_TYPE_DECODED_PICTURE_HASH:
@@ -15413,9 +22056,8 @@ index 0000000000..c5133a8a88
+ }
+}
+
-+static int decode_nal_sei_message(GetBitContext *gb, HEVCSEIContext *s,
-+ const HEVCRpiParamSets *ps, int nal_unit_type,
-+ void *logctx)
++static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s,
++ const HEVCRpiParamSets * const ps, const int nal_unit_type)
+{
+ int payload_type = 0;
+ int payload_size = 0;
@@ -15436,9 +22078,9 @@ index 0000000000..c5133a8a88
+ payload_size += byte;
+ }
+ if (nal_unit_type == HEVC_NAL_SEI_PREFIX) {
-+ return decode_nal_sei_prefix(gb, s, ps, payload_type, payload_size, logctx);
++ return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size);
+ } else { /* nal_unit_type == NAL_SEI_SUFFIX */
-+ return decode_nal_sei_suffix(gb, s, payload_type, payload_size, logctx);
++ return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size);
+ }
+}
+
@@ -15453,7 +22095,7 @@ index 0000000000..c5133a8a88
+ int ret;
+
+ do {
-+ ret = decode_nal_sei_message(gb, s, ps, type, logctx);
++ ret = decode_nal_sei_message(gb, logctx, s, ps, type);
+ if (ret < 0)
+ return ret;
+ } while (more_rbsp_data(gb));
@@ -15467,7 +22109,7 @@ index 0000000000..c5133a8a88
+}
diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h
new file mode 100644
-index 0000000000..41e4a20127
+index 0000000000..d4ac348df9
--- /dev/null
+++ b/libavcodec/rpi_hevc_sei.h
@@ -0,0 +1,135 @@
@@ -15533,7 +22175,6 @@ index 0000000000..41e4a20127
+} HEVC_SEI_Type;
+
+typedef struct HEVCSEIPictureHash {
-+ struct AVMD5 *md5_ctx;
+ uint8_t md5[3][16];
+ uint8_t is_md5;
+} HEVCSEIPictureHash;
@@ -15543,6 +22184,7 @@ index 0000000000..41e4a20127
+ int arrangement_type;
+ int content_interpretation_type;
+ int quincunx_subsampling;
++ int current_frame_is_frame0_flag;
+} HEVCSEIFramePacking;
+
+typedef struct HEVCSEIDisplayOrientation {
@@ -20163,210 +26805,234 @@ index 0000000000..3caef20137
+
diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h
new file mode 100644
-index 0000000000..1c364492d0
+index 0000000000..18128f4311
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform10.h
-@@ -0,0 +1,94 @@
+@@ -0,0 +1,106 @@
+static const unsigned char rpi_hevc_transform10 [] = {
-+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000
-+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008
-+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010
-+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018
-+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020
-+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028
-+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030
-+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038
-+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040
-+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048
-+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050
-+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058
-+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060
-+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068
-+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070
-+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078
-+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080
-+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088
-+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090
-+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098
-+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0
-+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8
-+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0
-+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8
-+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0
-+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8
-+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0
-+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8
-+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0
-+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8
-+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0
-+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8
-+0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100
-+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108
-+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110
-+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118
-+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120
-+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128
-+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130
-+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138
-+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140
-+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148
-+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150
-+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158
-+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160
-+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168
-+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170
-+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178
-+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180
-+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188
-+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190
-+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198
-+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0
-+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8
-+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0
-+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8
-+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0
-+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8
-+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0
-+0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8
-+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0
-+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8
-+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0
-+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8
-+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200
-+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208
-+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210
-+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218
-+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220
-+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228
-+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230
-+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238
-+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240
-+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248
-+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250
-+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258
-+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260
-+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268
-+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270
-+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278
-+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280
-+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288
-+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290
-+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298
-+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0
-+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8
-+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0
-+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8
-+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0
-+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8
-+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0
-+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8
++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xe8, // 0000
++0x20, 0x00, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0x88, // 0008
++0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x40, 0xe8, // 0010
++0x00, 0x02, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0xa8, // 0018
++0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x00, 0x60, // 0020
++0x03, 0xe8, 0x20, 0x00, 0x00, 0x00, 0x07, 0xe8, // 0028
++0x00, 0x02, 0x00, 0x00, 0x08, 0xe8, 0x00, 0x04, // 0030
++0x00, 0x00, 0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, // 0038
++0x05, 0xe8, 0x00, 0x02, 0x00, 0x00, 0x39, 0xef, // 0040
++0xc0, 0xfd, 0xff, 0xff, 0x2b, 0xef, 0x40, 0x00, // 0048
++0x00, 0x00, 0x5b, 0x7a, 0x5b, 0x7c, 0x4a, 0xc3, // 0050
++0x50, 0x17, 0x02, 0x6f, 0x02, 0x6a, 0x32, 0x18, // 0058
++0x0a, 0x6a, 0x16, 0x40, 0x04, 0x18, 0x1a, 0x66, // 0060
++0x80, 0x90, 0x33, 0x00, 0x0c, 0xf8, 0x00, 0x80, // 0068
++0x00, 0x00, 0xc0, 0x08, 0x18, 0x00, 0x80, 0x90, // 0070
++0x5e, 0x00, 0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, // 0078
++0x20, 0x08, 0x10, 0x00, 0x4c, 0xfe, 0x30, 0xc0, // 0080
++0x09, 0x04, 0x20, 0x08, 0x00, 0x00, 0x04, 0xfe, // 0088
++0x00, 0x90, 0x80, 0x02, 0x00, 0x08, 0x02, 0x00, // 0090
++0x80, 0x90, 0x4d, 0x00, 0x04, 0xff, 0x30, 0xc0, // 0098
++0x80, 0x03, 0x20, 0x08, 0x14, 0x00, 0x4c, 0xfe, // 00a0
++0x30, 0xc0, 0x06, 0x04, 0x20, 0x08, 0x00, 0x00, // 00a8
++0x8c, 0xf8, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x30, // 00b0
++0x04, 0x00, 0x80, 0x45, 0x71, 0x42, 0xf2, 0x8c, // 00b8
++0xd1, 0xc0, 0x39, 0xef, 0x40, 0x02, 0x00, 0x00, // 00c0
++0x00, 0x9e, 0x7f, 0x00, 0x29, 0x03, 0x00, 0xfe, // 00c8
++0x00, 0x80, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, // 00d0
++0xb6, 0x40, 0x8c, 0xf8, 0x20, 0x00, 0x00, 0x00, // 00d8
++0x00, 0x30, 0x18, 0x00, 0x15, 0x40, 0x08, 0xf8, // 00e0
++0x00, 0x80, 0x00, 0x00, 0xc0, 0x03, 0x14, 0x00, // 00e8
++0x66, 0xed, 0xe0, 0xff, 0xff, 0xff, 0x88, 0xf8, // 00f0
++0x20, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x18, 0x00, // 00f8
++0x0c, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 0100
++0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0x84, 0x6e, // 0108
++0x09, 0x18, 0x69, 0xa0, 0x04, 0x5f, 0x1c, 0x8b, // 0110
++0xf6, 0xc8, 0x45, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0118
++0x63, 0x1f, 0xb6, 0x40, 0x04, 0xe8, 0x40, 0x00, // 0120
++0x00, 0x00, 0x05, 0xe8, 0x00, 0x02, 0x00, 0x00, // 0128
++0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0130
++0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0138
++0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0140
++0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0148
++0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0150
++0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0158
++0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0160
++0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0168
++0x5a, 0x00, 0x00, 0xf6, 0x00, 0x80, 0x00, 0x04, // 0170
++0x20, 0xed, 0x00, 0x08, 0x00, 0x00, 0x04, 0xe8, // 0178
++0x20, 0x00, 0x00, 0x00, 0x8e, 0xf8, 0x20, 0x00, // 0180
++0x00, 0x00, 0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, // 0188
++0x00, 0x80, 0x81, 0x03, 0x26, 0xed, 0xe0, 0xff, // 0190
++0xff, 0xff, 0x88, 0xf0, 0x20, 0x00, 0x86, 0x03, // 0198
++0x08, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 01a0
++0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0xa4, 0x6e, // 01a8
++0x7f, 0x90, 0xb9, 0xff, 0x65, 0xa0, 0x04, 0x07, // 01b0
++0x18, 0x8b, 0xf5, 0xc8, 0x41, 0xe8, 0x20, 0x00, // 01b8
++0x00, 0x00, 0x66, 0x1f, 0x5a, 0x00, 0xe1, 0x40, // 01c0
++0xf2, 0x40, 0x4f, 0xc3, 0x50, 0x7f, 0x02, 0x6f, // 01c8
++0x03, 0xe8, 0x80, 0x00, 0x00, 0x00, 0x07, 0xe8, // 01d0
++0x00, 0x02, 0x00, 0x00, 0xe8, 0x00, 0x08, 0x6d, // 01d8
++0xe8, 0xbf, 0x80, 0x01, 0x04, 0x18, 0x08, 0xed, // 01e0
++0x20, 0x10, 0x00, 0x00, 0x89, 0x40, 0x1a, 0x40, // 01e8
++0x02, 0x6a, 0x2e, 0x18, 0xa1, 0x40, 0x98, 0x40, // 01f0
++0xf2, 0x4a, 0x07, 0x1e, 0xff, 0x9f, 0xbb, 0xff, // 01f8
++0x21, 0xed, 0x00, 0x08, 0x00, 0x00, 0x98, 0x40, // 0200
++0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, 0x95, 0x60, // 0208
++0x80, 0x90, 0x20, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0210
++0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0218
++0x80, 0x90, 0x18, 0x00, 0x04, 0xe8, 0x00, 0x02, // 0220
++0x00, 0x00, 0x65, 0x60, 0x91, 0x40, 0xa8, 0x40, // 0228
++0x80, 0x90, 0x10, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0230
++0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0238
++0x80, 0x90, 0x08, 0x00, 0x4a, 0xe8, 0x00, 0x08, // 0240
++0x00, 0x00, 0xf2, 0x8c, 0xd5, 0xc0, 0x29, 0x03, // 0248
++0xef, 0x03, 0x0c, 0xf8, 0x00, 0x80, 0x00, 0x00, // 0250
++0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, 0x00, 0x84, // 0258
++0x40, 0x00, 0xc0, 0xf8, 0x04, 0x00, 0x00, 0x60, // 0260
++0xff, 0x9f, 0x65, 0xff, 0x00, 0xe8, 0x00, 0x04, // 0268
++0x00, 0x00, 0xff, 0x9f, 0x70, 0xff, 0x04, 0xff, // 0270
++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0278
++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0280
++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0288
++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xe8, // 0290
++0x40, 0x00, 0x00, 0x00, 0x8c, 0xf8, 0x2f, 0x00, // 0298
++0x00, 0x00, 0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, // 02a0
++0xf0, 0xcf, 0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, // 02a8
++0x11, 0x13, 0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, // 02b0
++0x20, 0xf7, 0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, // 02b8
++0xf0, 0xce, 0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, // 02c0
++0x15, 0x53, 0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, // 02c8
++0x20, 0xf7, 0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, // 02d0
++0xf0, 0xcd, 0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, // 02d8
++0x19, 0x93, 0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, // 02e0
++0x20, 0xf7, 0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, // 02e8
++0xf0, 0xcc, 0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, // 02f0
++0x1d, 0xd3, 0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, // 02f8
++0x20, 0xf7, 0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, // 0300
++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, // 0308
++0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0310
++0x14, 0x00, 0x00, 0xed, 0x20, 0x00, 0x00, 0x00, // 0318
++0x8c, 0xf8, 0x2f, 0x00, 0x00, 0x00, 0xe0, 0x63, // 0320
++0x00, 0x00, 0x6f, 0x03, 0x00, 0x00, 0x00, 0x00, // 0328
++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0330
++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0338
+};
diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h
new file mode 100644
-index 0000000000..1128a2c054
+index 0000000000..3557348e30
--- /dev/null
+++ b/libavcodec/rpi_hevc_transform8.h
-@@ -0,0 +1,94 @@
+@@ -0,0 +1,106 @@
+static const unsigned char rpi_hevc_transform8 [] = {
-+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000
-+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008
-+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010
-+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018
-+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020
-+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028
-+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030
-+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038
-+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040
-+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048
-+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050
-+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058
-+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060
-+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068
-+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070
-+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078
-+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080
-+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088
-+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090
-+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098
-+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0
-+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8
-+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0
-+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8
-+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0
-+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8
-+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0
-+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8
-+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0
-+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8
-+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0
-+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8
-+0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100
-+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108
-+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110
-+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118
-+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120
-+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128
-+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130
-+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138
-+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140
-+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148
-+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150
-+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158
-+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160
-+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168
-+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170
-+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178
-+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180
-+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188
-+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190
-+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198
-+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0
-+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8
-+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0
-+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8
-+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0
-+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8
-+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0
-+0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8
-+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0
-+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8
-+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0
-+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8
-+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200
-+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208
-+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210
-+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218
-+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220
-+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228
-+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230
-+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238
-+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240
-+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248
-+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250
-+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258
-+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260
-+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268
-+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270
-+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278
-+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280
-+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288
-+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290
-+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298
-+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0
-+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8
-+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0
-+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8
-+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0
-+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8
-+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0
-+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8
++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xe8, // 0000
++0x20, 0x00, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0x88, // 0008
++0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x40, 0xe8, // 0010
++0x00, 0x02, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0xa8, // 0018
++0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x00, 0x60, // 0020
++0x03, 0xe8, 0x20, 0x00, 0x00, 0x00, 0x07, 0xe8, // 0028
++0x00, 0x02, 0x00, 0x00, 0x08, 0xe8, 0x00, 0x04, // 0030
++0x00, 0x00, 0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, // 0038
++0x05, 0xe8, 0x00, 0x08, 0x00, 0x00, 0x39, 0xef, // 0040
++0xc0, 0xfd, 0xff, 0xff, 0x2b, 0xef, 0x40, 0x00, // 0048
++0x00, 0x00, 0x5b, 0x7a, 0x5b, 0x7c, 0x4a, 0xc3, // 0050
++0x50, 0x17, 0x02, 0x6f, 0x02, 0x6a, 0x32, 0x18, // 0058
++0x0a, 0x6a, 0x16, 0x40, 0x04, 0x18, 0x1a, 0x66, // 0060
++0x80, 0x90, 0x33, 0x00, 0x0c, 0xf8, 0x00, 0x80, // 0068
++0x00, 0x00, 0xc0, 0x08, 0x18, 0x00, 0x80, 0x90, // 0070
++0x5e, 0x00, 0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, // 0078
++0x20, 0x08, 0x10, 0x00, 0x4c, 0xfe, 0x30, 0xc0, // 0080
++0x09, 0x04, 0x20, 0x08, 0x00, 0x00, 0x04, 0xfe, // 0088
++0x00, 0x90, 0x80, 0x02, 0x00, 0x08, 0x02, 0x00, // 0090
++0x80, 0x90, 0x4d, 0x00, 0x04, 0xff, 0x30, 0xc0, // 0098
++0x80, 0x03, 0x20, 0x08, 0x14, 0x00, 0x4c, 0xfe, // 00a0
++0x30, 0xc0, 0x04, 0x04, 0x20, 0x08, 0x00, 0x00, // 00a8
++0x8c, 0xf8, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x30, // 00b0
++0x04, 0x00, 0x80, 0x45, 0x71, 0x42, 0xf2, 0x8c, // 00b8
++0xd1, 0xc0, 0x39, 0xef, 0x40, 0x02, 0x00, 0x00, // 00c0
++0x00, 0x9e, 0x7f, 0x00, 0x29, 0x03, 0x00, 0xfe, // 00c8
++0x00, 0x80, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, // 00d0
++0xb6, 0x40, 0x8c, 0xf8, 0x20, 0x00, 0x00, 0x00, // 00d8
++0x00, 0x30, 0x18, 0x00, 0x15, 0x40, 0x08, 0xf8, // 00e0
++0x00, 0x80, 0x00, 0x00, 0xc0, 0x03, 0x14, 0x00, // 00e8
++0x66, 0xed, 0xe0, 0xff, 0xff, 0xff, 0x88, 0xf8, // 00f0
++0x20, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x18, 0x00, // 00f8
++0x0c, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 0100
++0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0x84, 0x6e, // 0108
++0x09, 0x18, 0x69, 0xa0, 0x04, 0x5f, 0x1c, 0x8b, // 0110
++0xf6, 0xc8, 0x45, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0118
++0x63, 0x1f, 0xb6, 0x40, 0x04, 0xe8, 0x40, 0x00, // 0120
++0x00, 0x00, 0x05, 0xe8, 0x00, 0x08, 0x00, 0x00, // 0128
++0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0130
++0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0138
++0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0140
++0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0148
++0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0150
++0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0158
++0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0160
++0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0168
++0x5a, 0x00, 0x00, 0xf6, 0x00, 0x80, 0x00, 0x04, // 0170
++0x20, 0xed, 0x00, 0x08, 0x00, 0x00, 0x04, 0xe8, // 0178
++0x20, 0x00, 0x00, 0x00, 0x8e, 0xf8, 0x20, 0x00, // 0180
++0x00, 0x00, 0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, // 0188
++0x00, 0x80, 0x81, 0x03, 0x26, 0xed, 0xe0, 0xff, // 0190
++0xff, 0xff, 0x88, 0xf0, 0x20, 0x00, 0x86, 0x03, // 0198
++0x08, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 01a0
++0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0xa4, 0x6e, // 01a8
++0x7f, 0x90, 0xb9, 0xff, 0x65, 0xa0, 0x04, 0x07, // 01b0
++0x18, 0x8b, 0xf5, 0xc8, 0x41, 0xe8, 0x20, 0x00, // 01b8
++0x00, 0x00, 0x66, 0x1f, 0x5a, 0x00, 0xe1, 0x40, // 01c0
++0xf2, 0x40, 0x4f, 0xc3, 0x50, 0x7f, 0x02, 0x6f, // 01c8
++0x03, 0xe8, 0x80, 0x00, 0x00, 0x00, 0x07, 0xe8, // 01d0
++0x00, 0x02, 0x00, 0x00, 0xe8, 0x00, 0x08, 0x6d, // 01d8
++0xe8, 0xbf, 0x80, 0x01, 0x04, 0x18, 0x08, 0xed, // 01e0
++0x20, 0x10, 0x00, 0x00, 0x89, 0x40, 0x1a, 0x40, // 01e8
++0x02, 0x6a, 0x2e, 0x18, 0xa1, 0x40, 0x98, 0x40, // 01f0
++0xf2, 0x4a, 0x07, 0x1e, 0xff, 0x9f, 0xbb, 0xff, // 01f8
++0x21, 0xed, 0x00, 0x08, 0x00, 0x00, 0x98, 0x40, // 0200
++0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, 0x95, 0x60, // 0208
++0x80, 0x90, 0x20, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0210
++0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0218
++0x80, 0x90, 0x18, 0x00, 0x04, 0xe8, 0x00, 0x08, // 0220
++0x00, 0x00, 0x45, 0x60, 0x91, 0x40, 0xa8, 0x40, // 0228
++0x80, 0x90, 0x10, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0230
++0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0238
++0x80, 0x90, 0x08, 0x00, 0x4a, 0xe8, 0x00, 0x08, // 0240
++0x00, 0x00, 0xf2, 0x8c, 0xd5, 0xc0, 0x29, 0x03, // 0248
++0xef, 0x03, 0x0c, 0xf8, 0x00, 0x80, 0x00, 0x00, // 0250
++0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, 0x00, 0x84, // 0258
++0x40, 0x00, 0xc0, 0xf8, 0x04, 0x00, 0x00, 0x60, // 0260
++0xff, 0x9f, 0x65, 0xff, 0x00, 0xe8, 0x00, 0x04, // 0268
++0x00, 0x00, 0xff, 0x9f, 0x70, 0xff, 0x04, 0xff, // 0270
++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0278
++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0280
++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0288
++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xe8, // 0290
++0x40, 0x00, 0x00, 0x00, 0x8c, 0xf8, 0x2f, 0x00, // 0298
++0x00, 0x00, 0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, // 02a0
++0xf0, 0xcf, 0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, // 02a8
++0x11, 0x13, 0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, // 02b0
++0x20, 0xf7, 0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, // 02b8
++0xf0, 0xce, 0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, // 02c0
++0x15, 0x53, 0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, // 02c8
++0x20, 0xf7, 0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, // 02d0
++0xf0, 0xcd, 0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, // 02d8
++0x19, 0x93, 0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, // 02e0
++0x20, 0xf7, 0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, // 02e8
++0xf0, 0xcc, 0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, // 02f0
++0x1d, 0xd3, 0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, // 02f8
++0x20, 0xf7, 0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, // 0300
++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, // 0308
++0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0310
++0x14, 0x00, 0x00, 0xed, 0x20, 0x00, 0x00, 0x00, // 0318
++0x8c, 0xf8, 0x2f, 0x00, 0x00, 0x00, 0xe0, 0x63, // 0320
++0x00, 0x00, 0x6f, 0x03, 0x00, 0x00, 0x00, 0x00, // 0328
++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0330
++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0338
+};
diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c
new file mode 100644
-index 0000000000..4034c77979
+index 0000000000..7c98f707d3
--- /dev/null
+++ b/libavcodec/rpi_hevcdec.c
-@@ -0,0 +1,5753 @@
+@@ -0,0 +1,5850 @@
+/*
+ * HEVC video Decoder
+ *
@@ -21302,9 +27968,9 @@ index 0000000000..4034c77979
+ av_freep(&s->tab_slice_address);
+ av_freep(&s->filter_slice_edges);
+
-+ av_freep(&s->horizontal_bs);
++ av_freep(&s->bs_horizontal);
+// av_freep(&s->vertical_bs);
-+ av_freep(&s->vertical_bs2);
++ av_freep(&s->bs_vertical);
+ av_freep(&s->bsf_stash_left);
+ av_freep(&s->bsf_stash_up);
+
@@ -21325,8 +27991,13 @@ index 0000000000..4034c77979
+ int ctb_count = sps->ctb_width * sps->ctb_height;
+ int min_pu_size = sps->min_pu_width * sps->min_pu_height;
+
-+ s->hbs_stride = ((width + 63) & ~63) >> 4;
-+ s->bs_size = (((height + 15) & ~15) >> 3) * s->hbs_stride;
++ {
++ unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK);
++ unsigned int h = ((height + 15) & ~15);
++
++ s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size
++ s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols
++ }
+
+ s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly
+ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock));
@@ -21340,7 +28011,10 @@ index 0000000000..4034c77979
+ goto fail;
+
+ s->tab_ipm = av_mallocz(min_pu_size);
-+ s->is_pcm = av_malloc_array(sps->pcm_width, sps->pcm_height);
++ // We can overread by 1 line & one byte in deblock so alloc & zero
++ // We don't need to zero the extra @ start of frame as it will never be
++ // written
++ s->is_pcm = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
+ if (!s->tab_ipm || !s->is_pcm)
+ goto fail;
+
@@ -21352,9 +28026,9 @@ index 0000000000..4034c77979
+ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
+ goto fail;
+
-+ s->horizontal_bs = av_mallocz(s->bs_size);
-+ s->vertical_bs2 = av_mallocz(s->bs_size);
-+ if (s->horizontal_bs == NULL || s->vertical_bs2 == NULL)
++ s->bs_horizontal = av_mallocz(s->bs_size);
++ s->bs_vertical = av_mallocz(s->bs_size);
++ if (s->bs_horizontal == NULL || s->bs_vertical == NULL)
+ goto fail;
+
+ if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL ||
@@ -21406,15 +28080,22 @@ index 0000000000..4034c77979
+ uint8_t chroma_weight_l0_flag[16];
+ uint8_t luma_weight_l1_flag[16];
+ uint8_t chroma_weight_l1_flag[16];
-+ int luma_log2_weight_denom;
++ unsigned int luma_log2_weight_denom;
+
+ luma_log2_weight_denom = get_ue_golomb_long(gb);
-+ if (luma_log2_weight_denom < 0 || luma_log2_weight_denom > 7)
++ if (luma_log2_weight_denom > 7) {
+ av_log(s->avctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is invalid\n", luma_log2_weight_denom);
-+ s->sh.luma_log2_weight_denom = av_clip_uintp2(luma_log2_weight_denom, 3);
++ return AVERROR_INVALIDDATA;
++ }
++ s->sh.luma_log2_weight_denom = luma_log2_weight_denom;
+ if (ctx_cfmt(s) != 0) {
-+ int delta = get_se_golomb(gb);
-+ s->sh.chroma_log2_weight_denom = av_clip_uintp2(s->sh.luma_log2_weight_denom + delta, 3);
++ const unsigned int chroma_log2_weight_denom = luma_log2_weight_denom + get_se_golomb(gb);
++ if (chroma_log2_weight_denom > 7)
++ {
++ av_log(s->avctx, AV_LOG_ERROR, "chroma_log2_weight_denom %d is invalid\n", chroma_log2_weight_denom);
++ return AVERROR_INVALIDDATA;
++ }
++ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom;
+ }
+
+ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
@@ -21741,6 +28422,7 @@ index 0000000000..4034c77979
+ if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
+ const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
+ const HEVCRpiSPS *last_sps = s->ps.sps;
++ enum AVPixelFormat pix_fmt;
+
+ if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) {
+ if (sps->width != last_sps->width || sps->height != last_sps->height ||
@@ -21750,10 +28432,20 @@ index 0000000000..4034c77979
+ }
+ ff_hevc_rpi_clear_refs(s);
+
-+ ret = set_sps(s, sps, get_format(s, sps));
++ ret = set_sps(s, sps, sps->pix_fmt);
+ if (ret < 0)
+ return ret;
+
++ pix_fmt = get_format(s, sps);
++ if (pix_fmt < 0)
++ return pix_fmt;
++
++// ret = set_sps(s, sps, pix_fmt);
++// if (ret < 0)
++// return ret;
++
++ s->avctx->pix_fmt = pix_fmt;
++
+ s->seq_decode = (s->seq_decode + 1) & 0xff;
+ s->max_ra = INT_MAX;
+ }
@@ -22056,6 +28748,9 @@ index 0000000000..4034c77979
+ }
+
+ sh->num_entry_point_offsets = 0;
++ sh->offload_tiles = 0;
++ sh->offload_wpp = 0;
++
+ if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) {
+ unsigned num_entry_point_offsets = get_ue_golomb_long(gb);
+ // It would be possible to bound this tighter but this here is simpler
@@ -22092,6 +28787,18 @@ index 0000000000..4034c77979
+ }
+ sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size
+ }
++
++ // Do we want to offload this
++ if (s->threads_type != 0)
++ {
++ sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) &&
++ s->ps.pps->num_tile_columns > 1;
++ // * We only cope with WPP in a single column
++ // Probably want to deal with that case as tiles rather than WPP anyway
++ // ?? Not actually sure that the main code deals with WPP + multi-col correctly
++ sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag &&
++ s->ps.pps->num_tile_columns == 1;
++ }
+ }
+ }
+
@@ -22134,7 +28841,7 @@ index 0000000000..4034c77979
+
+ if (s->sh.slice_sample_adaptive_offset_flag[0] ||
+ s->sh.slice_sample_adaptive_offset_flag[1]) {
-+ if (lc->ctb_left_flag)
++ if ((lc->ctb_avail & AVAIL_L) != 0)
+ {
+ const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
+ if (sao_merge_left_flag) {
@@ -22142,7 +28849,7 @@ index 0000000000..4034c77979
+ return;
+ }
+ }
-+ if (lc->ctb_up_flag)
++ if ((lc->ctb_avail & AVAIL_U) != 0)
+ {
+ const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
+ if (sao_merge_up_flag) {
@@ -22224,19 +28931,97 @@ index 0000000000..4034c77979
+ return jb->intra.cmds + jb->intra.n++;
+}
+
-+static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx)
++#define A0(x, y, U, L, UL, UR, DL) \
++ [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? AVAIL_DL : 0))
++
++#define A1(x, y, U, L, UL, UR, DL) \
++ A0((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A0((x) + 1, (y) + 0, (U), 1, (U), (UR), 0 ),\
++ A0((x) + 0, (y) + 1, 1, (L), (L), 1, (DL)), A0((x) + 1, (y) + 1, 1, 1, 1, 0, 0 )
++
++#define A2(x, y, U, L, UL, UR, DL) \
++ A1((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A1((x) + 2, (y) + 0, (U), 1, (U), (UR), 0 ),\
++ A1((x) + 0, (y) + 2, 1, (L), (L), 1, (DL)), A1((x) + 2, (y) + 2, 1, 1, 1, 0, 0 )
++
++#define A3(x, y, U, L, UL, UR, DL) \
++ A2((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A2((x) + 4, (y) + 0, (U), 1, (U), (UR), 0 ),\
++ A2((x) + 0, (y) + 4, 1, (L), (L), 1, (DL)), A2((x) + 4, (y) + 4, 1, 1, 1, 0, 0 )
++
++#define A4(x, y, U, L, UL, UR, DL) \
++ A3((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A3((x) + 8, (y) + 0, (U), 1, (U), (UR), 0 ),\
++ A3((x) + 0, (y) + 8, 1, (L), (L), 1, (DL)), A3((x) + 8, (y) + 8, 1, 1, 1, 0, 0 )
++
++static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)};
++
++unsigned int ff_hevc_rpi_tb_avail_flags(
++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h)
++{
++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
++ const unsigned int ctb_mask = ctb_size - 1;
++ const unsigned int tb_x = x & ctb_mask;
++ const unsigned int tb_y = y & ctb_mask;
++
++ const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16;
++
++ unsigned int f = (lc->ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL);
++
++ if ((tb_x != 0 || tb_y != 0) && (~f & (AVAIL_L | AVAIL_U)) == 0)
++ f |= AVAIL_UL;
++
++
++ if (x + w >= lc->end_of_ctb_x)
++ {
++ if (tb_y == 0)
++ f |= (lc->ctb_avail & AVAIL_UR);
++ }
++ else
++ {
++ f |= (tb_y != 0) ? (tb_f[(w - 1) >> 2] & AVAIL_UR) : (lc->ctb_avail >> (AVAIL_S_U - AVAIL_S_UR)) & AVAIL_UR;
++ }
++#if AVAIL_S_U - AVAIL_S_UR < 0
++#error Shift problem
++#endif
++
++ // Never any D if Y beyond eoctb
++ if (y + h < lc->end_of_ctb_y)
++ {
++ if (tb_x == 0)
++ f |= (lc->ctb_avail << (AVAIL_S_DL - AVAIL_S_L)) & AVAIL_DL;
++ else
++ f |= tb_f[((h - 1) >> 2) * 16] & AVAIL_DL;
++ }
++#if AVAIL_S_DL - AVAIL_S_L < 0
++#error Shift problem
++#endif
++
++// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h,
++// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16],
++// lc->end_of_ctb_x, lc->end_of_ctb_y);
++
++ return f;
++}
++
++#undef A0
++#undef A1
++#undef A2
++#undef A3
++#undef A4
++
++static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx,
++ unsigned int avail)
+{
+ // If rpi_enabled then sand - U & V done on U call
+ if (c_idx <= 1)
+ {
+ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
-+ cmd->type = RPI_PRED_INTRA;
++ cmd->type = RPI_PRED_INTRA + c_idx;
+ cmd->size = log2_trafo_size;
-+ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
-+ cmd->c_idx = c_idx;
++ cmd->avail = avail;
+ cmd->i_pred.x = x0;
+ cmd->i_pred.y = y0;
+ cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
++
++// printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail);
+ }
+}
+
@@ -22264,8 +29049,8 @@ index 0000000000..4034c77979
+
+ if (lc->cu.pred_mode == MODE_INTRA) {
+ const unsigned int trafo_size = 1 << log2_trafo_size;
-+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, trafo_size, trafo_size);
-+ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0);
++ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0,
++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size));
+ }
+
+ if (cbf_luma || cbf_chroma != 0)
@@ -22332,6 +29117,8 @@ index 0000000000..4034c77979
+
+ if (cbf_luma)
+ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0);
++
++
+ if (ctx_cfmt(s) != 0 && (log2_trafo_size > 2 || ctx_cfmt(s) == 3)) {
+ const int trafo_size_h = 1 << (log2_trafo_size_c + ctx_hshift(s, 1));
+ const int trafo_size_v = 1 << (log2_trafo_size_c + ctx_vshift(s, 1));
@@ -22344,8 +29131,8 @@ index 0000000000..4034c77979
+ }
+ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) {
+ if (lc->cu.pred_mode == MODE_INTRA) {
-+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
-+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1,
++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v));
+ }
+ if (((cbf_chroma >> i) & CBF_CB0) != 0)
+ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c),
@@ -22371,10 +29158,10 @@ index 0000000000..4034c77979
+ hls_cross_component_pred(lc, 1);
+ }
+ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) {
-+ if (lc->cu.pred_mode == MODE_INTRA) {
-+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
-+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
-+ }
++// if (lc->cu.pred_mode == MODE_INTRA) {
++// do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2,
++// ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v));
++// }
+ if (((cbf_chroma >> i) & CBF_CR0) != 0)
+ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c),
+ log2_trafo_size_c, scan_idx_c, 2);
@@ -22385,11 +29172,12 @@ index 0000000000..4034c77979
+ int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
+ int16_t *coeffs = (int16_t*)lc->edge_emu_buffer2;
+ const int size = 1 << log2_trafo_size_c;
++ int j;
+
+ uint8_t *dst = &s->frame->data[2][(y0 >> vshift) * stride +
+ ((x0 >> hshift) << s->ps.sps->pixel_shift)];
-+ for (i = 0; i < (size * size); i++) {
-+ coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
++ for (j = 0; j < (size * size); j++) {
++ coeffs[j] = ((lc->tu.res_scale_val * coeffs_y[j]) >> 3);
+ }
+ s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
+ }
@@ -22399,20 +29187,18 @@ index 0000000000..4034c77979
+ int trafo_size_v = 1 << (log2_trafo_size + ctx_vshift(s, 1));
+ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) {
+ if (lc->cu.pred_mode == MODE_INTRA) {
-+ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size),
-+ trafo_size_h, trafo_size_v);
-+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1,
++ ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v));
+ }
+ if (((cbf_chroma >> i) & CBF_CB0) != 0)
+ ff_hevc_rpi_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size),
+ log2_trafo_size, scan_idx_c, 1);
+ }
+ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) {
-+ if (lc->cu.pred_mode == MODE_INTRA) {
-+ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size),
-+ trafo_size_h, trafo_size_v);
-+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
-+ }
++// if (lc->cu.pred_mode == MODE_INTRA) {
++// do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2,
++// ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v));
++// }
+ if (((cbf_chroma >> i) & CBF_CR0) != 0)
+ ff_hevc_rpi_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size),
+ log2_trafo_size, scan_idx_c, 2);
@@ -22422,28 +29208,29 @@ index 0000000000..4034c77979
+ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3) {
+ int trafo_size_h = 1 << (log2_trafo_size_c + ctx_hshift(s, 1));
+ int trafo_size_v = 1 << (log2_trafo_size_c + ctx_vshift(s, 1));
-+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, trafo_size_h, trafo_size_v);
-+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 1);
-+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 2);
-+ if (ctx_cfmt(s) == 2) {
-+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (1 << log2_trafo_size_c),
-+ trafo_size_h, trafo_size_v);
-+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
-+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
-+ }
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 1,
++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size_h, trafo_size_v));
++// do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 2,
++// ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size_h, trafo_size_v));
++// if (ctx_cfmt(s) == 2) {
++// do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1,
++// ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0 + (1 << log2_trafo_size_c), trafo_size_h, trafo_size_v));
++// do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2,
++// ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0 + (1 << log2_trafo_size_c), trafo_size_h, trafo_size_v));
++// }
+ } else if (blk_idx == 3) {
+ int trafo_size_h = 1 << (log2_trafo_size + 1);
+ int trafo_size_v = 1 << (log2_trafo_size + ctx_vshift(s, 1));
-+ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase,
-+ trafo_size_h, trafo_size_v);
-+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 1);
-+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 2);
-+ if (ctx_cfmt(s) == 2) {
-+ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (1 << (log2_trafo_size)),
-+ trafo_size_h, trafo_size_v);
-+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
-+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
-+ }
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 1,
++ ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase, trafo_size_h, trafo_size_v));
++// do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 2,
++// ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase, trafo_size_h, trafo_size_v));
++// if (ctx_cfmt(s) == 2) {
++// do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1,
++// ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase + (1 << (log2_trafo_size)), trafo_size_h, trafo_size_v));
++// do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2,
++// ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase + (1 << (log2_trafo_size)), trafo_size_h, trafo_size_v));
++// }
+ }
+ }
+
@@ -22642,7 +29429,7 @@ index 0000000000..4034c77979
+static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCFrame * const ref,
+ const Mv * const mv, const int y0, const int height)
+{
-+ if (s->threads_type == FF_THREAD_FRAME) {
++ if (s->threads_type != 0) {
+ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9);
+
+ // Progress has to be attached to current job as the actual wait
@@ -22661,8 +29448,8 @@ index 0000000000..4034c77979
+{
+ enum InterPredIdc inter_pred_idc = PRED_L0;
+ int mvp_flag;
++ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH);
+
-+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, nPbW, nPbH);
+ mv->pred_flag = 0;
+ if (s->sh.slice_type == HEVC_SLICE_B)
+ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH);
@@ -22674,7 +29461,7 @@ index 0000000000..4034c77979
+ mv->pred_flag = PF_L0;
+ ff_hevc_rpi_hls_mvd_coding(lc);
+ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
-+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, avail,
+ part_idx, merge_idx, mv, mvp_flag, 0);
+ mv->mv[0].x += lc->pu.mvd.x;
+ mv->mv[0].y += lc->pu.mvd.y;
@@ -22692,7 +29479,7 @@ index 0000000000..4034c77979
+
+ mv->pred_flag += PF_L1;
+ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
-+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, avail,
+ part_idx, merge_idx, mv, mvp_flag, 1);
+ mv->mv[1].x += lc->pu.mvd.x;
+ mv->mv[1].y += lc->pu.mvd.y;
@@ -23388,12 +30175,10 @@ index 0000000000..4034c77979
+ int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
+ int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
+
-+ int y_ctb = (y0 >> (s->ps.sps->log2_ctb_size)) << (s->ps.sps->log2_ctb_size);
-+
+ // intra_pred_mode prediction does not cross vertical CTB boundaries
-+ const unsigned int cand_up = (lc->ctb_up_flag || y0b) && (y0 > y_ctb) ?
++ const unsigned int cand_up = y0b != 0 ?
+ s->tab_ipm[(y_pu - 1) * min_pu_width + x_pu] : INTRA_DC;
-+ const unsigned int cand_left = (lc->ctb_left_flag || x0b) ?
++ const unsigned int cand_left = ((lc->ctb_avail & AVAIL_L) != 0 || x0b) ?
+ s->tab_ipm[y_pu * min_pu_width + x_pu - 1] : INTRA_DC;
+
+ int intra_pred_mode;
@@ -23800,16 +30585,17 @@ index 0000000000..4034c77979
+ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w])
+ lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
+
-+ lc->ctb_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0;
-+ lc->ctb_up_flag = (lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0;
-+
+ // Use line width rather than tile width for addr_in_slice test as
+ // addr_in_slice is in raster units
-+ lc->ctb_up_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
-+ (ctb_addr_rs_in_slice >= line_w + 1);
+
-+ lc->ctb_up_right_flag = (ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 &&
-+ (ctb_addr_rs_in_slice + 1 >= line_w);
++ lc->ctb_avail =
++ ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) |
++ ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) |
++ ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
++ (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) |
++ ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 &&
++ (ctb_addr_rs_in_slice + 1 >= line_w) ? AVAIL_UR : 0);
++ // Down-left never avail at CTB level
+}
+
+
@@ -23819,7 +30605,7 @@ index 0000000000..4034c77979
+ (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0);
+
+ // Signal
-+ if (s->threads_type == FF_THREAD_FRAME && y > 0) {
++ if (y > 0) {
+ // Cast away const as progress is held in s, but this really shouldn't confuse anything
+ ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1);
+ }
@@ -23847,22 +30633,11 @@ index 0000000000..4034c77979
+ switch (cmd->type)
+ {
+ case RPI_PRED_INTRA:
-+ {
-+ HEVCRpiLocalContextIntra lci; // Abbreviated local context
-+ HEVCRpiLocalContext * const lc = (HEVCRpiLocalContext *)&lci;
-+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode;
-+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1;
-+ lc->na.cand_left = (cmd->na >> 3) & 1;
-+ lc->na.cand_up_left = (cmd->na >> 2) & 1;
-+ lc->na.cand_up = (cmd->na >> 1) & 1;
-+ lc->na.cand_up_right = (cmd->na >> 0) & 1;
-+ if (cmd->c_idx == 0)
-+ s->hpc.intra_pred[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
-+ else
-+ s->hpc.intra_pred_c[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
++ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail);
++ break;
++ case RPI_PRED_INTRA_C:
++ s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail);
+ break;
-+ }
-+
+ case RPI_PRED_ADD_RESIDUAL:
+ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
+ break;
@@ -24590,7 +31365,7 @@ index 0000000000..4034c77979
+ ff_hevc_rpi_save_states(s, lc);
+
+ // Report progress so we can use our MVs in other frames
-+ if (s->threads_type == FF_THREAD_FRAME && (ctb_flags & CTB_TS_FLAGS_EOL) != 0)
++ if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0)
+ ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1);
+
+ // End of line || End of tile line || End of tile
@@ -25004,9 +31779,7 @@ index 0000000000..4034c77979
+
+#if RPI_EXTRA_BIT_THREADS > 0
+
-+ if (s->sh.num_entry_point_offsets != 0 &&
-+ (!s->ps.pps->tile_wpp_inter_disable || s->sh.slice_type == HEVC_SLICE_I) &&
-+ s->ps.pps->num_tile_columns > 1)
++ if (s->sh.offload_tiles)
+ {
+ unsigned int slice_row = 0;
+
@@ -25051,14 +31824,7 @@ index 0000000000..4034c77979
+ printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
+#endif
+ }
-+ else
-+
-+ // * We only cope with WPP in a single column
-+ // Probably want to deal with that case as tiles rather than WPP anyway
-+ // ?? Not actually sure that the main code deals with WPP + multi-col correctly
-+ if (s->ps.pps->entropy_coding_sync_enabled_flag &&
-+ s->ps.pps->num_tile_columns == 1 &&
-+ s->sh.num_entry_point_offsets != 0)
++ else if (s->sh.offload_wpp)
+ {
+#if TRACE_WPP
+ printf("%s: Do WPP\n", __func__);
@@ -25184,6 +31950,13 @@ index 0000000000..4034c77979
+
+ if (s->sei.frame_packing.content_interpretation_type == 2)
+ stereo->flags = AV_STEREO3D_FLAG_INVERT;
++
++ if (s->sei.frame_packing.arrangement_type == 5) {
++ if (s->sei.frame_packing.current_frame_is_frame0_flag)
++ stereo->view = AV_STEREO3D_VIEW_LEFT;
++ else
++ stereo->view = AV_STEREO3D_VIEW_RIGHT;
++ }
+ }
+
+ if (s->sei.display_orientation.present &&
@@ -25297,8 +32070,8 @@ index 0000000000..4034c77979
+ ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
+ int ret;
+
-+ memset(s->horizontal_bs, 0, s->bs_size);
-+ memset(s->vertical_bs2, 0, s->bs_size);
++ memset(s->bs_horizontal, 0, s->bs_size);
++ memset(s->bs_vertical, 0, s->bs_size);
+ memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
+ memset(s->skip_flag, 0, s->ps.sps->min_cb_height * s->skip_flag_stride);
+ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
@@ -25406,8 +32179,7 @@ index 0000000000..4034c77979
+ s->nal_unit_type == HEVC_NAL_STSA_N ||
+ s->nal_unit_type == HEVC_NAL_RADL_N ||
+ s->nal_unit_type == HEVC_NAL_RASL_N);
-+ s->offload_recon = s->used_for_ref;
-+// s->offload_recon = 0;
++ s->offload_recon = s->threads_type != 0 && s->used_for_ref;
+
+#if DEBUG_DECODE_N
+ {
@@ -25421,7 +32193,12 @@ index 0000000000..4034c77979
+ }
+ }
+#endif
-+ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
++ if (
++ (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) ||
++ (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) ||
++ (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) ||
++ (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IDR(s)))
++ {
+ s->is_decoded = 0;
+ break;
+ }
@@ -25544,7 +32321,7 @@ index 0000000000..4034c77979
+
+fail: // Also success path
+ if (s->ref != NULL) {
-+ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) {
++ if (s->used_for_ref && s->threads_type != 0) {
+ ff_hevc_rpi_progress_signal_all_done(s);
+ }
+ else {
@@ -25596,7 +32373,7 @@ index 0000000000..4034c77979
+ int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height;
+ uint8_t md5[16];
+
-+ av_md5_init(s->sei.picture_hash.md5_ctx);
++ av_md5_init(s->md5_ctx);
+ for (j = 0; j < h; j++) {
+ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1);
+#if HAVE_BIGENDIAN
@@ -25606,9 +32383,9 @@ index 0000000000..4034c77979
+ src = s->checksum_buf;
+ }
+#endif
-+ av_md5_update(s->sei.picture_hash.md5_ctx, src, w << pixel_shift);
++ av_md5_update(s->md5_ctx, src, w << pixel_shift);
+ }
-+ av_md5_final(s->sei.picture_hash.md5_ctx, md5);
++ av_md5_final(s->md5_ctx, md5);
+
+ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) {
+ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i);
@@ -25759,7 +32536,7 @@ index 0000000000..4034c77979
+
+ pic_arrays_free(s);
+
-+ av_freep(&s->sei.picture_hash.md5_ctx);
++ av_freep(&s->md5_ctx);
+
+ av_freep(&s->cabac_save);
+
@@ -25793,12 +32570,6 @@ index 0000000000..4034c77979
+ s->ps.pps = NULL;
+ s->ps.vps = NULL;
+
-+ for (i = 1; i < s->threads_number; i++) {
-+ if (s->sList[i] != NULL) {
-+ av_freep(&s->sList[i]);
-+ }
-+ }
-+
+ // Free separately from sLists as used that way by RPI WPP
+ for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) {
+ av_freep(s->HEVClcList + i);
@@ -25827,7 +32598,6 @@ index 0000000000..4034c77979
+ if (!s->HEVClc)
+ goto fail;
+ s->HEVClcList[0] = s->HEVClc;
-+ s->sList[0] = s;
+
+ // Whilst FFmpegs init fn is only called once the close fn is called as
+ // many times as we have threads (init_thread_copy is called for the
@@ -25871,8 +32641,7 @@ index 0000000000..4034c77979
+
+ s->max_ra = INT_MAX;
+
-+ s->sei.picture_hash.md5_ctx = av_md5_alloc();
-+ if (!s->sei.picture_hash.md5_ctx)
++ if ((s->md5_ctx = av_md5_alloc()) == NULL)
+ goto fail;
+
+ s->context_initialized = 1;
@@ -25953,7 +32722,6 @@ index 0000000000..4034c77979
+ s->is_nalff = s0->is_nalff;
+ s->nal_length_size = s0->nal_length_size;
+
-+ s->threads_number = s0->threads_number;
+ s->threads_type = s0->threads_type;
+
+ if (s0->eos) {
@@ -26011,11 +32779,6 @@ index 0000000000..4034c77979
+
+ atomic_init(&s->wpp_err, 0);
+
-+ if(avctx->active_thread_type & FF_THREAD_SLICE)
-+ s->threads_number = avctx->thread_count;
-+ else
-+ s->threads_number = 1;
-+
+ if (avctx->extradata_size > 0 && avctx->extradata) {
+ ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1);
+
@@ -26032,7 +32795,7 @@ index 0000000000..4034c77979
+ if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
+ s->threads_type = FF_THREAD_FRAME;
+ else
-+ s->threads_type = FF_THREAD_SLICE;
++ s->threads_type = 0;
+
+ return 0;
+}
@@ -26122,10 +32885,10 @@ index 0000000000..4034c77979
+
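As a rough standalone sketch (not part of the patch; the CTB size and coordinates are chosen purely for illustration), this is how the tb_flags lookup in ff_hevc_rpi_tb_avail_flags above is indexed: the block position is reduced to its offset inside the CTB, converted to 4-pel units, and used to address the 16x16 table before being merged with the CTB-level ctb_avail bits.

    /* Illustration only: mirrors the index arithmetic of
     * ff_hevc_rpi_tb_avail_flags in the patch above. */
    #include <stdio.h>

    int main(void)
    {
        const unsigned int log2_ctb_size = 6;              /* 64x64 CTB, example value  */
        const unsigned int ctb_mask = (1u << log2_ctb_size) - 1;

        const unsigned int x = 72, y = 36;                  /* TB position in the picture */
        const unsigned int tb_x = x & ctb_mask;             /* offset within the CTB: 8   */
        const unsigned int tb_y = y & ctb_mask;             /* offset within the CTB: 36  */

        /* tb_flags is a 16x16 table of AVAIL_* masks indexed in 4-pel units. */
        const unsigned int idx = (tb_x >> 2) + (tb_y >> 2) * 16;

        printf("tb_flags index = %u\n", idx);               /* 2 + 9 * 16 = 146 */
        return 0;
    }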
diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h
new file mode 100644
-index 0000000000..117432de0a
+index 0000000000..d2ac038c9b
--- /dev/null
+++ b/libavcodec/rpi_hevcdec.h
-@@ -0,0 +1,985 @@
+@@ -0,0 +1,958 @@
+/*
+ * HEVC video decoder
+ *
@@ -26360,44 +33123,6 @@ index 0000000000..117432de0a
+ PF_BI,
+};
+
-+enum IntraPredMode {
-+ INTRA_PLANAR = 0,
-+ INTRA_DC,
-+ INTRA_ANGULAR_2,
-+ INTRA_ANGULAR_3,
-+ INTRA_ANGULAR_4,
-+ INTRA_ANGULAR_5,
-+ INTRA_ANGULAR_6,
-+ INTRA_ANGULAR_7,
-+ INTRA_ANGULAR_8,
-+ INTRA_ANGULAR_9,
-+ INTRA_ANGULAR_10,
-+ INTRA_ANGULAR_11,
-+ INTRA_ANGULAR_12,
-+ INTRA_ANGULAR_13,
-+ INTRA_ANGULAR_14,
-+ INTRA_ANGULAR_15,
-+ INTRA_ANGULAR_16,
-+ INTRA_ANGULAR_17,
-+ INTRA_ANGULAR_18,
-+ INTRA_ANGULAR_19,
-+ INTRA_ANGULAR_20,
-+ INTRA_ANGULAR_21,
-+ INTRA_ANGULAR_22,
-+ INTRA_ANGULAR_23,
-+ INTRA_ANGULAR_24,
-+ INTRA_ANGULAR_25,
-+ INTRA_ANGULAR_26,
-+ INTRA_ANGULAR_27,
-+ INTRA_ANGULAR_28,
-+ INTRA_ANGULAR_29,
-+ INTRA_ANGULAR_30,
-+ INTRA_ANGULAR_31,
-+ INTRA_ANGULAR_32,
-+ INTRA_ANGULAR_33,
-+ INTRA_ANGULAR_34,
-+};
-+
+enum SAOType {
+ SAO_NOT_APPLIED = 0,
+ SAO_BAND,
@@ -26444,14 +33169,6 @@ index 0000000000..117432de0a
+ uint8_t cu_transquant_bypass_flag;
+} RpiCodingUnit;
+
-+typedef struct RpiNeighbourAvailable {
-+ char cand_bottom_left;
-+ char cand_left;
-+ char cand_up;
-+ char cand_up_left;
-+ char cand_up_right;
-+} RpiNeighbourAvailable;
-+
+typedef struct RpiPredictionUnit {
+ uint8_t intra_pred_mode[4];
+ uint8_t intra_pred_mode_c[4];
@@ -26517,14 +33234,8 @@ index 0000000000..117432de0a
+ uint8_t dpb_no;
+} HEVCFrame;
+
-+typedef struct HEVCRpiLocalContextIntra {
-+ TransformUnit tu;
-+ RpiNeighbourAvailable na;
-+} HEVCRpiLocalContextIntra;
-+
+typedef struct HEVCRpiLocalContext {
-+ TransformUnit tu; // Moved to start to match HEVCRpiLocalContextIntra (yuk!)
-+ RpiNeighbourAvailable na;
++ TransformUnit tu;
+
+ CABACContext cc;
+
@@ -26565,10 +33276,20 @@ index 0000000000..117432de0a
+ int8_t curr_qp_y;
+ int8_t qPy_pred;
+
-+ uint8_t ctb_left_flag;
-+ uint8_t ctb_up_flag;
-+ uint8_t ctb_up_right_flag;
-+ uint8_t ctb_up_left_flag;
++// N.B. Used by asm (neon) - do not change
++#define AVAIL_S_UR 0
++#define AVAIL_S_U 1
++#define AVAIL_S_UL 2
++#define AVAIL_S_L 3
++#define AVAIL_S_DL 4
++
++#define AVAIL_U (1 << AVAIL_S_U)
++#define AVAIL_L (1 << AVAIL_S_L)
++#define AVAIL_UL (1 << AVAIL_S_UL)
++#define AVAIL_UR (1 << AVAIL_S_UR)
++#define AVAIL_DL (1 << AVAIL_S_DL)
++
++ uint8_t ctb_avail;
+ int end_of_ctb_x;
+ int end_of_ctb_y;
+
@@ -26608,6 +33329,7 @@ index 0000000000..117432de0a
+ RPI_PRED_ADD_DC_U, // Both U & V are effectively C
+ RPI_PRED_ADD_DC_V,
+ RPI_PRED_INTRA,
++ RPI_PRED_INTRA_C,
+ RPI_PRED_I_PCM,
+ RPI_PRED_CMD_MAX
+};
@@ -26615,8 +33337,8 @@ index 0000000000..117432de0a
+typedef struct HEVCPredCmd {
+ uint8_t type;
+ uint8_t size; // log2 "size" used by all variants
-+ uint8_t na; // i_pred - but left here as they pack well
-+ uint8_t c_idx; // i_pred
++ uint8_t avail; // i_pred - but left here as they pack well
++ uint8_t dummy;
+ union {
+ struct { // TRANSFORM_ADD
+ uint8_t * dst;
@@ -26813,17 +33535,25 @@ index 0000000000..117432de0a
+ uint8_t state[HEVC_CONTEXTS];
+} HEVCRpiCabacState;
+
++#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels
++#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT)
++#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1)
++#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte
++#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el
++#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT)
++#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)
++#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
++#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row
++#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
++
+typedef struct HEVCRpiContext {
+ const AVClass *c; // needed by private avoptions
+ AVCodecContext *avctx;
+
-+ struct HEVCRpiContext *sList[MAX_NB_THREADS];
-+
+ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS];
+ HEVCRpiLocalContext *HEVClc;
+
+ uint8_t threads_type;
-+ uint8_t threads_number;
+
+ /** 1 if the independent slice segment header was successfully parsed */
+ uint8_t slice_initialized;
@@ -26882,17 +33612,19 @@ index 0000000000..117432de0a
+ int eos; ///< current packet contains an EOS/EOB NAL
+ int last_eos; ///< last packet contains an EOS/EOB NAL
+ int max_ra;
-+ unsigned int hbs_stride;
-+ unsigned int bs_size;
+
+ int is_decoded;
+ int no_rasl_output_flag;
+
-+ HEVCPredContext hpc;
++ HEVCRpiPredContext hpc;
+ HEVCDSPContext hevcdsp;
+ int8_t *qp_y_tab;
-+ uint8_t *horizontal_bs;
-+ uint8_t *vertical_bs2;
++
++ // Deblocking block strength bitmaps
++ unsigned int bs_stride2;
++ unsigned int bs_size;
++ uint8_t *bs_horizontal;
++ uint8_t *bs_vertical;
+ uint8_t *bsf_stash_up;
+ uint8_t *bsf_stash_left;
+
@@ -26930,11 +33662,12 @@ index 0000000000..117432de0a
+ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
+ int nuh_layer_id;
+
++ struct AVMD5 *md5_ctx;
++
+ HEVCSEIContext sei;
+
+ // Put structures that allocate non-trivial storage at the end
+ // These are mostly used indirectly so position in the structure doesn't matter
-+ HEVCRpiLocalContextIntra HEVClcIntra;
+ HEVCRpiPassQueue passq[RPI_PASSES];
+#if RPI_EXTRA_BIT_THREADS > 0
+ int bt_started;
@@ -26987,13 +33720,15 @@ index 0000000000..117432de0a
+
+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCFrame *frame, int flags);
+
-+void ff_hevc_rpi_set_neighbour_available(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
-+ const int nPbW, const int nPbH);
++unsigned int ff_hevc_rpi_tb_avail_flags(
++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h);
++
+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
+ int nPbH, int log2_cb_size, int part_idx,
+ int merge_idx, MvField * const mv);
+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, int x0, int y0, int nPbW,
-+ int nPbH, int log2_cb_size, int part_idx,
++ int nPbH, int log2_cb_size, const unsigned int avail, int part_idx,
+ int merge_idx, MvField * const mv,
+ int mvp_lx_flag, int LX);
+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase);
@@ -27024,12 +33759,13 @@ index 0000000000..117432de0a
+static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
+ const HEVCFrame * const ref, const int y)
+{
-+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
++ if (s->threads_type != 0)
++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
+}
+
+static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y)
+{
-+ if (s->used_for_ref)
++ if (s->used_for_ref && s->threads_type != 0)
+ ff_hevc_rpi_progress_signal_field(s, y, 1);
+}
+
@@ -27041,7 +33777,7 @@ index 0000000000..117432de0a
+
+static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y)
+{
-+ if (s->used_for_ref)
++ if (s->used_for_ref && s->threads_type != 0)
+ {
+ ff_hevc_rpi_progress_signal_field(s, y, 0);
+ }
@@ -27113,10 +33849,10 @@ index 0000000000..117432de0a
+#endif /* AVCODEC_RPI_HEVCDEC_H */
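For a sense of scale, here is a rough worked example (not part of the patch; the frame size is arbitrary, and the macro values are copied from the definitions above) of the deblocking boundary-strength buffer sizing performed in pic_arrays_init: each 2-bit strength covers 4 pels, so a 64-pel-wide column strip takes 4 bytes per 8-pel row.

    /* Illustration only: reproduces the bs_stride2 / bs_size arithmetic
     * from pic_arrays_init for a 1920x1080 stream. */
    #include <stdio.h>

    #define HEVC_RPI_BS_STRIDE1_PEL_SHIFT  6   /* 64-pel strips */
    #define HEVC_RPI_BS_STRIDE1_PEL_MASK   ((1u << HEVC_RPI_BS_STRIDE1_PEL_SHIFT) - 1)
    #define HEVC_RPI_BS_COL_BYTES_SHR      1   /* 8-pel rows, 4 bytes per strip row */

    int main(void)
    {
        const unsigned int width = 1920, height = 1080;

        const unsigned int w = (width  + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK; /* 1920 */
        const unsigned int h = (height + 15) & ~15;                                                      /* 1088 */

        const unsigned int bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR;                    /* 544 bytes/column  */
        const unsigned int bs_size    = bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); /* 544 * 30 = 16320  */

        printf("bs_stride2 = %u bytes, bs_size = %u bytes\n", bs_stride2, bs_size);
        return 0;
    }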
diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c
new file mode 100644
-index 0000000000..a6af5ecd85
+index 0000000000..b041e0fd3f
--- /dev/null
+++ b/libavcodec/rpi_hevcdsp.c
-@@ -0,0 +1,416 @@
+@@ -0,0 +1,444 @@
+/*
+ * HEVC video decoder
+ *
@@ -27242,10 +33978,12 @@ index 0000000000..a6af5ecd85
+#include "rpi_hevcdsp_template.c"
+#undef BIT_DEPTH
+
-+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
++static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const MvField *curr, const MvField *neigh,
+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ const MvField *curr, const MvField *neigh, uint8_t *bs)
++ int in_inc)
+{
++ int shift = 32;
++ uint32_t bs = 0;
+ for (; pus > 0; pus--) {
+ int strength, out;
+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
@@ -27350,12 +34088,37 @@ index 0000000000..a6af5ecd85
+
+ for (out = dup; out > 0; out--)
+ {
-+ *bs = strength;
-+ bs += out_inc;
++ bs = (bs >> 2) | (strength << 30);
++ shift -= 2;
++ }
++ }
++ return bs >> shift;
++}
++
++
++static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height)
++{
++ unsigned int i, j;
++
++ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) {
++ for (i = 0; i < height; i++) {
++ for (j = 0; j < width; j+=8)
++ AV_COPY64U(dst+j, src+j);
++ dst += stride_dst;
++ src += stride_src;
++ }
++ } else {
++ for (i = 0; i < height; i++) {
++ for (j = 0; j < width; j+=16)
++ AV_COPY128(dst+j, src+j);
++ dst += stride_dst;
++ src += stride_src;
+ }
+ }
+}
+
++
++
+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
+{
+#undef FUNC
@@ -27523,6 +34286,7 @@ index 0000000000..a6af5ecd85
+ }
+
+ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
++ hevcdsp->cpy_blk = cpy_blk;
+
+ if (ARCH_PPC)
+ ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth);
@@ -27535,10 +34299,10 @@ index 0000000000..a6af5ecd85
+}
diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h
new file mode 100644
-index 0000000000..59d06bbe28
+index 0000000000..0b532f874b
--- /dev/null
+++ b/libavcodec/rpi_hevcdsp.h
-@@ -0,0 +1,183 @@
+@@ -0,0 +1,185 @@
+/*
+ * HEVC video decoder
+ *
@@ -27707,9 +34471,11 @@ index 0000000000..59d06bbe28
+ uint8_t * src_l,
+ unsigned int no_f);
+
-+ void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
++ uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const MvField *curr, const MvField *neigh,
+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ const MvField *curr, const MvField *neigh, uint8_t *bs);
++ int in_inc);
++
++ void (* cpy_blk)(uint8_t * dst, unsigned int dst_stride, const uint8_t * src, unsigned int src_stride, unsigned int width, unsigned int height);
+} HEVCDSPContext;
+
+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hpc, int bit_depth);
@@ -27724,7 +34490,7 @@ index 0000000000..59d06bbe28
+#endif /* AVCODEC_RPI_HEVCDSP_H */
diff --git a/libavcodec/rpi_hevcdsp_template.c b/libavcodec/rpi_hevcdsp_template.c
new file mode 100644
-index 0000000000..cfe9264fc3
+index 0000000000..d1196a4440
--- /dev/null
+++ b/libavcodec/rpi_hevcdsp_template.c
@@ -0,0 +1,2278 @@
@@ -28309,7 +35075,7 @@ index 0000000000..cfe9264fc3
+ pixel *src = (pixel *)_src;
+ int a_stride, b_stride;
+ int x, y;
-+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
++ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
+ stride_dst /= sizeof(pixel);
+
+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
@@ -28537,7 +35303,7 @@ index 0000000000..cfe9264fc3
+ pixel *src = (pixel *)_src;
+ int a_stride, b_stride;
+ int x, y;
-+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
++ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
+
+ stride_dst /= sizeof(pixel);
+ width *= 2;
@@ -30008,10 +36774,10 @@ index 0000000000..cfe9264fc3
+
diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c
new file mode 100644
-index 0000000000..f6db76482d
+index 0000000000..62135b83c2
--- /dev/null
+++ b/libavcodec/rpi_hevcpred.c
-@@ -0,0 +1,122 @@
+@@ -0,0 +1,166 @@
+/*
+ * HEVC video Decoder
+ *
@@ -30037,6 +36803,9 @@ index 0000000000..f6db76482d
+#include "rpi_hevcdec.h"
+
+#include "rpi_hevcpred.h"
++#if (ARCH_ARM)
++#include "arm/rpi_hevcpred_arm.h"
++#endif
+
+#define PRED_C 0
+#define BIT_DEPTH 8
@@ -30074,7 +36843,7 @@ index 0000000000..f6db76482d
+#undef BIT_DEPTH
+#undef PRED_C
+
-+void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth)
++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth)
+{
+#undef FUNC
+#define FUNC(a, depth) a ## _ ## depth
@@ -30087,30 +36856,68 @@ index 0000000000..f6db76482d
+ hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \
+ hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \
+ hpc->intra_pred[3] = FUNC(intra_pred_5, depth); \
++ hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \
++ hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \
++ hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \
++ hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \
+ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \
+ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \
+ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \
+ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \
-+ hpc->pred_dc = FUNC(pred_dc, depth); \
++ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \
++ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \
++ hpc->pred_dc[2] = FUNC(pred_dc_2, depth); \
++ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \
++ hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \
++ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \
++ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \
++ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \
++ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \
++ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \
++ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \
++ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \
+ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
+ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
+ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
-+ hpc->pred_angular[3] = FUNC(pred_angular_3, depth);
++ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \
++ hpc->pred_dc0[0] = FUNC(pred_dc0_0, depth); \
++ hpc->pred_dc0[1] = FUNC(pred_dc0_1, depth); \
++ hpc->pred_dc0[2] = FUNC(pred_dc0_2, depth); \
++ hpc->pred_dc0[3] = FUNC(pred_dc0_3, depth);
+
+#define HEVC_PRED_C(depth) \
+ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \
+ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \
+ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \
+ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \
++ hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \
++ hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \
++ hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \
++ hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \
+ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \
+ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \
+ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \
+ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \
-+ hpc->pred_dc_c = FUNCC(pred_dc, depth); \
++ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \
++ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \
++ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \
++ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \
++ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \
++ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \
++ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \
++ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \
++ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \
++ hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \
++ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \
++ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \
+ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
+ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
+ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
-+ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth);
++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \
++ hpc->pred_dc0_c[0] = FUNCC(pred_dc0_0, depth); \
++ hpc->pred_dc0_c[1] = FUNCC(pred_dc0_1, depth); \
++ hpc->pred_dc0_c[2] = FUNCC(pred_dc0_2, depth); \
++ hpc->pred_dc0_c[3] = FUNCC(pred_dc0_3, depth);
+
+#define HEVC_PRED(depth) \
+ HEVC_PRED_Y(depth); \
@@ -30131,15 +36938,18 @@ index 0000000000..f6db76482d
+ break;
+ }
+
-+ if (ARCH_MIPS)
-+ ff_hevc_rpi_pred_init_mips(hpc, bit_depth);
++#if (ARCH_ARM)
++ ff_hevc_rpi_pred_init_arm(hpc, bit_depth);
++#elif (ARCH_MIPS)
++ ff_hevc_rpi_pred_init_mips(hpc, bit_depth);
++#endif
+}
diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h
new file mode 100644
-index 0000000000..03c6eb3295
+index 0000000000..6e594277c0
--- /dev/null
+++ b/libavcodec/rpi_hevcpred.h
-@@ -0,0 +1,57 @@
+@@ -0,0 +1,121 @@
+/*
+ * HEVC video Decoder
+ *
@@ -30172,37 +36982,101 @@ index 0000000000..03c6eb3295
+struct HEVCRpiContext;
+struct HEVCRpiLocalContext;
+
-+typedef struct HEVCPredContext {
-+ void (*intra_pred[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx);
++enum IntraPredMode {
++ INTRA_PLANAR = 0,
++ INTRA_DC,
++ INTRA_ANGULAR_2,
++ INTRA_ANGULAR_3,
++ INTRA_ANGULAR_4,
++ INTRA_ANGULAR_5,
++ INTRA_ANGULAR_6,
++ INTRA_ANGULAR_7,
++ INTRA_ANGULAR_8,
++ INTRA_ANGULAR_9,
++ INTRA_ANGULAR_10,
++ INTRA_ANGULAR_11,
++ INTRA_ANGULAR_12,
++ INTRA_ANGULAR_13,
++ INTRA_ANGULAR_14,
++ INTRA_ANGULAR_15,
++ INTRA_ANGULAR_16,
++ INTRA_ANGULAR_17,
++ INTRA_ANGULAR_18,
++ INTRA_ANGULAR_19,
++ INTRA_ANGULAR_20,
++ INTRA_ANGULAR_21,
++ INTRA_ANGULAR_22,
++ INTRA_ANGULAR_23,
++ INTRA_ANGULAR_24,
++ INTRA_ANGULAR_25,
++ INTRA_ANGULAR_26,
++ INTRA_ANGULAR_27,
++ INTRA_ANGULAR_28,
++ INTRA_ANGULAR_29,
++ INTRA_ANGULAR_30,
++ INTRA_ANGULAR_31,
++ INTRA_ANGULAR_32,
++ INTRA_ANGULAR_33,
++ INTRA_ANGULAR_34,
++};
++#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10
++#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26
+
++typedef void intra_filter_fn_t(
++ uint8_t * const left, uint8_t * const top,
++ const unsigned int req, const unsigned int avail,
++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur,
++ const unsigned int stride,
++ const unsigned int top_right_size, const unsigned int down_left_size);
++
++typedef struct HEVCRpiPredContext {
++ void (*intra_pred[4])(const struct HEVCRpiContext * const s,
++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail);
++
++ intra_filter_fn_t *intra_filter[4];
+ void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride);
-+ void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left,
-+ ptrdiff_t stride, int log2_size, int c_idx);
++ void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
++ ptrdiff_t stride);
+ void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride,
-+ int c_idx, int mode);
-+ void (*intra_pred_c[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx);
++ int mode);
++ void (*pred_vertical[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride);
+
++ void (*intra_pred_c[4])(const struct HEVCRpiContext * const s,
++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail);
++ intra_filter_fn_t *intra_filter_c[4];
+ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride);
-+ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left,
-+ ptrdiff_t stride, int log2_size, int c_idx);
++ void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
++ ptrdiff_t stride);
+ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride,
-+ int c_idx, int mode);
-+} HEVCPredContext;
++ int mode);
++ void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride);
++} HEVCRpiPredContext;
+
-+void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth);
-+void ff_hevc_rpi_pred_init_mips(HEVCPredContext *hpc, int bit_depth);
++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth);
+
+#endif /* AVCODEC_RPI_HEVCPRED_H */
diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c
new file mode 100644
-index 0000000000..4ee776f955
+index 0000000000..23835a320e
--- /dev/null
+++ b/libavcodec/rpi_hevcpred_template.c
-@@ -0,0 +1,850 @@
+@@ -0,0 +1,1487 @@
+/*
+ * HEVC video decoder
+ *
@@ -30314,7 +37188,7 @@ index 0000000000..4ee776f955
+#endif
+
+
-+#if DUMP_PRED && !defined(INCLUDE_ONCE)
++#if DUMP_PRED && !defined(INCLUDED_ONCE)
+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
+{
+ for (unsigned int y = 0; y != size; y++, data += stride * 2) {
@@ -30327,104 +37201,705 @@ index 0000000000..4ee776f955
+}
+#endif
+
-+static av_always_inline void FUNC(intra_pred)(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0,
-+ int log2_size, int c_idx_arg)
++#ifndef INCLUDED_ONCE
++static inline void extend_8(void * ptr, const unsigned int v, unsigned int n)
+{
++ if ((n >>= 2) != 0) {
++ uint32_t v4 = v | (v << 8);
++ uint32_t * p = (uint32_t *)ptr;
++ v4 = v4 | (v4 << 16);
++ do {
++ *p++ = v4;
++ } while (--n != 0);
++ }
++}
++
++static inline void extend_16(void * ptr, const unsigned int v, unsigned int n)
++{
++ if ((n >>= 2) != 0) {
++ uint32_t v2 = v | (v << 16);
++ uint32_t * p = (uint32_t *)ptr;
++ do {
++ *p++ = v2;
++ *p++ = v2;
++ } while (--n != 0);
++ }
++}
++
++static inline void extend_32(void * ptr, const unsigned int v, unsigned int n)
++{
++ if ((n >>= 2) != 0) {
++ uint32_t * p = (uint32_t *)ptr;
++ do {
++ *p++ = v;
++ *p++ = v;
++ *p++ = v;
++ *p++ = v;
++ } while (--n != 0);
++ }
++}
++
++// Beware that this inverts the avail ordering
++// For CIP it seems easier this way round
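++// Returns a bitmap with one bit per 4-pel group along the edge: a bit stays
++// set only if that group was flagged in 'avail' and the PU covering it is
++// intra coded - i.e. usable under constrained intra prediction.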
++static unsigned int cip_avail(const MvField * mvf, const int mvf_stride, const unsigned int log2_pu_size, const unsigned int avail, unsigned int size,
++ unsigned int s0, unsigned int s1)
++{
++ const unsigned int n = 1 << (log2_pu_size - 2);
++ unsigned int fa = 0;
++ unsigned int i = 0;
++
++ size >>= 2; // Now in 4-pel units
++ s0 >>= 2;
++ s1 >>= 2;
++
++ if ((avail & 4) != 0)
++ fa |= ((1 << s0) - 1) << (size - s0);
++ if ((avail & 2) != 0)
++ fa |= ((1 << s1) - 1) << size;
++ if ((avail & 1) != 0)
++ fa |= 1 << (size << 1);
++
++ for (i = 0; (fa >> i) != 0; i += n, mvf += mvf_stride) {
++ if ((fa & (((1 << n) - 1) << i)) != 0 && mvf->pred_flag != PF_INTRA)
++ fa &= ~(((1 << n) - 1) << i);
++ }
++
++ return fa;
++}
++
++static inline unsigned int rmbd(unsigned int x)
++{
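++    // Index of the right-most (least significant) set bit of x; x must be
++    // non-zero. The fallback below is an open-coded __builtin_ctz().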
++#if 1
++ return __builtin_ctz(x);
++#else
++ unsigned int n = 0;
++ if ((x & 0xffff) == 0) {
++ x >>= 16;
++ n += 16;
++ }
++ if ((x & 0xff) == 0) {
++ x >>= 8;
++ n += 8;
++ }
++ if ((x & 0xf) == 0) {
++ x >>= 4;
++ n += 4;
++ }
++ if ((x & 0x3) == 0) {
++ x >>= 2;
++ n += 2;
++ }
++
++ return (x & 1) == 0 ? n + 1 : n;
++#endif
++}
++#endif
++
++
++static void FUNC(cip_fill)(pixel * const left, pixel * const top,
++ const unsigned int avail_l, const unsigned int avail_u,
++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
++ const unsigned int stride,
++ const unsigned int size)
++{
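++    // Build the left & top reference arrays for constrained intra prediction:
++    // pels from 4-pel groups flagged in avail_l/avail_u are copied from the
++    // source, everything else is filled with the most recent usable pel 'a'.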
++ pixel a;
++ unsigned int i;
++
++ // 1st find DL value
++ if ((avail_l & 1) == 0) {
++ if (avail_l != 0)
++ a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride];
++ else
++ {
++ // (avail_l | avail_u) != 0 so this must be good
++ const unsigned int n = rmbd(avail_u)*4;
++ a = (n >= size) ? src_ur[n - size] : src_u[n];
++ }
++ }
++
++ // L
++ {
++ pixel * d = left + size * 2 - 1;
++ const pixel * s = src_l + (size * 2 - 1) * stride;
++ unsigned int x = avail_l;
++ for (i = 0; i < size * 2; i += 4, x >>= 1)
++ {
++ if ((x & 1) != 0) {
++ // Avail
++ *d-- = *s;
++ s -= stride;
++ *d-- = *s;
++ s -= stride;
++ *d-- = *s;
++ s -= stride;
++ *d-- = a = *s;
++ s -= stride;
++ }
++ else
++ {
++ *d-- = a;
++ *d-- = a;
++ *d-- = a;
++ *d-- = a;
++ s -= stride * 4;
++ }
++ }
++ // UL
++ *d = a = (x & 1) != 0 ? *s : a;
++ }
++
++ // U
++ {
++ pixel * d = top;
++ const pixel * s = src_u;
++ unsigned int x = avail_u;
++
++ for (i = 0; i < size; i += 4, x >>= 1)
++ {
++ if ((x & 1) != 0) {
++ // Avail
++ *d++ = *s++;
++ *d++ = *s++;
++ *d++ = *s++;
++ *d++ = a = *s++;
++ }
++ else
++ {
++ *d++ = a;
++ *d++ = a;
++ *d++ = a;
++ *d++ = a;
++ s += 4;
++ }
++ }
++
++ // UR
++ s = src_ur;
++ for (i = 0; i < size; i += 4, x >>= 1)
++ {
++ if ((x & 1) != 0) {
++ // Avail
++ *d++ = *s++;
++ *d++ = *s++;
++ *d++ = *s++;
++ *d++ = a = *s++;
++ }
++ else
++ {
++ *d++ = a;
++ *d++ = a;
++ *d++ = a;
++ *d++ = a;
++ s += 4;
++ }
++ }
++ }
++}
++
++
++#if !PRED_C && PW == 1
++#define EXTEND(ptr, val, len) extend_8(ptr, val, len)
++#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1)
++#define EXTEND(ptr, val, len) extend_16(ptr, val, len)
++#else
++#define EXTEND(ptr, val, len) extend_32(ptr, val, len)
++#endif
++
++
+#define PU(x) \
+ ((x) >> s->ps.sps->log2_min_pu_size)
+#define MVF(x, y) \
-+ (s->ref->tab_mvf[(x) + (y) * min_pu_width])
++ (s->ref->tab_mvf[(x) + (y) * s->ps.sps->min_pu_width])
+#define MVF_PU(x, y) \
+ MVF(PU(x0 + ((x) * (1 << hshift))), PU(y0 + ((y) * (1 << vshift))))
-+#define IS_INTRA(x, y) \
-+ (MVF_PU(x, y).pred_flag == PF_INTRA)
-+#define MIN_TB_ADDR_ZS(x, y) \
-+ s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)]
-+#define EXTEND(ptr, val, len) \
-+do { \
-+ pixel4 pix = PIXEL_SPLAT_X4(val); \
-+ for (i = 0; i < (len); i += 4) \
-+ AV_WN4P(ptr + i, pix); \
-+} while (0)
+
-+#define EXTEND_RIGHT_CIP(ptr, start, length) \
-+ for (i = start; i < (start) + (length); i += 4) \
-+ if (!IS_INTRA(i, -1)) \
-+ AV_WN4P(&ptr[i], a); \
-+ else \
-+ a = PIXEL_SPLAT_X4(ptr[i+3])
-+#define EXTEND_LEFT_CIP(ptr, start, length) \
-+ for (i = start; i > (start) - (length); i--) \
-+ if (!IS_INTRA(i - 1, -1)) \
-+ ptr[i - 1] = ptr[i]
-+#define EXTEND_UP_CIP(ptr, start, length) \
-+ for (i = (start); i > (start) - (length); i -= 4) \
-+ if (!IS_INTRA(-1, i - 3)) \
-+ AV_WN4P(&ptr[i - 3], a); \
-+ else \
-+ a = PIXEL_SPLAT_X4(ptr[i - 3])
-+#define EXTEND_DOWN_CIP(ptr, start, length) \
-+ for (i = start; i < (start) + (length); i += 4) \
-+ if (!IS_INTRA(-1, i)) \
-+ AV_WN4P(&ptr[i], a); \
-+ else \
-+ a = PIXEL_SPLAT_X4(ptr[i + 3])
++// Reqs:
++//
++// Planar: DL[0], L, ul, U, UR[0]
++// DC: dl, L, ul, U, ur
++// A2-9: DL, L, ul, u, ur
++// A10: dl, L, ul, u, ur
++// A11-17 dl, L, UL, U, ur
++// A18-25 dl, L, Ul, U, ur
++// A26 dl, l, ul, U, ur
++// A27-34 dl, l, ul, U, UR
++
++#ifndef INCLUDED_ONCE
++
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
++
++#define FILTER_LIGHT 0x40
++#define FILTER_STRONG 0x80
++#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG)
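++// FILTER_LIGHT / FILTER_STRONG are OR'd into the req_avail tables below in
++// addition to the AVAIL_* bits: they ask for [1 2 1]/4 smoothing or, where
++// permitted, bilinear (strong) smoothing of the reference samples.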
++
++static const uint8_t req_avail_c[35] =
++{
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L, // 2
++ AVAIL_DL | AVAIL_L, // 3
++ AVAIL_DL | AVAIL_L, // 4
++ AVAIL_DL | AVAIL_L, // 5
++ AVAIL_DL | AVAIL_L, // 6
++ AVAIL_DL | AVAIL_L, // 7
++ AVAIL_DL | AVAIL_L, // 8
++ AVAIL_DL | AVAIL_L, // 9
++ AVAIL_L, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25
++ AVAIL_U, // 26 (V)
++ AVAIL_U | AVAIL_UR, // 27
++ AVAIL_U | AVAIL_UR, // 28
++ AVAIL_U | AVAIL_UR, // 29
++ AVAIL_U | AVAIL_UR, // 30
++ AVAIL_U | AVAIL_UR, // 31
++ AVAIL_U | AVAIL_UR, // 32
++ AVAIL_U | AVAIL_UR, // 33
++ AVAIL_U | AVAIL_UR // 34
++};
++
++static const uint8_t req_avail[4][35] = {
++{
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L, // 2
++ AVAIL_DL | AVAIL_L, // 3
++ AVAIL_DL | AVAIL_L, // 4
++ AVAIL_DL | AVAIL_L, // 5
++ AVAIL_DL | AVAIL_L, // 6
++ AVAIL_DL | AVAIL_L, // 7
++ AVAIL_DL | AVAIL_L, // 8
++ AVAIL_DL | AVAIL_L, // 9
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V)
++ AVAIL_U | AVAIL_UR, // 27
++ AVAIL_U | AVAIL_UR, // 28
++ AVAIL_U | AVAIL_UR, // 29
++ AVAIL_U | AVAIL_UR, // 30
++ AVAIL_U | AVAIL_UR, // 31
++ AVAIL_U | AVAIL_UR, // 32
++ AVAIL_U | AVAIL_UR, // 33
++ AVAIL_U | AVAIL_UR // 34
++},
++{ // 3
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2
++ AVAIL_DL | AVAIL_L | 0, // 3
++ AVAIL_DL | AVAIL_L | 0, // 4
++ AVAIL_DL | AVAIL_L | 0, // 5
++ AVAIL_DL | AVAIL_L | 0, // 6
++ AVAIL_DL | AVAIL_L | 0, // 7
++ AVAIL_DL | AVAIL_L | 0, // 8
++ AVAIL_DL | AVAIL_L | 0, // 9
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V)
++ AVAIL_U | AVAIL_UR | 0, // 27
++ AVAIL_U | AVAIL_UR | 0, // 28
++ AVAIL_U | AVAIL_UR | 0, // 29
++ AVAIL_U | AVAIL_UR | 0, // 30
++ AVAIL_U | AVAIL_UR | 0, // 31
++ AVAIL_U | AVAIL_UR | 0, // 32
++ AVAIL_U | AVAIL_UR | 0, // 33
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34
++},
++{ // 4
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8
++ AVAIL_DL | AVAIL_L | 0, // 9
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V)
++ AVAIL_U | AVAIL_UR | 0, // 27
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34
++},
++{ // 5
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9
++ AVAIL_L | 0, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25
++ AVAIL_U | 0, // 26 (V)
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33
++ AVAIL_U | AVAIL_UR | FILTER_EITHER // 34
++}
++};
++
++
++#endif
++
++#define filter_light1 FUNC(filter_light1)
++static inline pixel filter_light1(pixel a, pixel b, pixel c)
++{
++ return (a + b*2 + c + 2) >> 2;
++}
++
++#define filter_light FUNC(filter_light)
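++// Apply the [1 2 1]/4 reference sample filter along a run of n pels read at
++// sstride spacing; p1 is the pel before src[0] and pn the pel after the last.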
++static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n)
++{
++ pixel p0;
++ pixel p2 = *src;
++    // Allow for the final pel - it is just clearer to have the call take the actual number of output pels
++ unsigned int n_minus_1 = n - 1;
++
++ do
++ {
++ src += sstride;
++ p0 = p1;
++ p1 = p2;
++ p2 = *src;
++ *dst++ = filter_light1(p0, p1, p2);
++ } while (--n_minus_1 != 0);
++ *dst = filter_light1(p1, p2, pn);
++}
++
++#define filter_strong FUNC(filter_strong)
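++// Strong (bilinear) smoothing: dst[i] = ((64 - (i + 1)) * p0 + (i + 1) * p1 + 32) >> 6,
++// i.e. a linear ramp from p0 towards p1 over n output pels.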
++static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n)
++{
++ unsigned int a = 64 * p0 + 32;
++ const int v = p1 - p0;
++
++ do
++ {
++ *dst++ = (a += v) >> 6;
++ } while (--n != 0);
++}
++
++#define intra_filter FUNC(intra_filter)
++static av_always_inline void intra_filter(
++ pixel * const left, pixel * const top,
++ const unsigned int req, const unsigned int avail,
++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
++ const unsigned int stride,
++ const unsigned int top_right_size, const unsigned int down_left_size,
++ const unsigned int log2_size)
++{
++ const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5);
++ const unsigned int size = 1 << log2_size;
++
++ // a_ is the first pel in a section working round dl -> ur
++ // b_ is the last
++ // Beware that top & left work out from UL so usage of a_ & b_ may
++ // swap between them. It is a bad naming scheme but I have found no
++ // better
++ const pixel * a_dl = src_l + (down_left_size + size - 1) * stride;
++ const pixel * b_dl = src_l + size * stride;
++ const pixel * a_l = src_l + (size - 1) * stride;
++ const pixel * b_l = src_l;
++ const pixel * ab_ul = src_l - stride;
++ const pixel * a_u = src_u;
++ const pixel * b_u = src_u + size - 1;
++ const pixel * a_ur = src_ur;
++ const pixel * b_ur = src_ur + top_right_size - 1;
++
++ const unsigned int want = req & ~avail;
++ const unsigned int have = req & avail;
++ unsigned int i;
++
++ if ((avail & AVAIL_DL) == 0)
++ {
++ a_dl = a_ur;
++ if ((avail & AVAIL_U) != 0)
++ a_dl = a_u;
++ if ((avail & AVAIL_UL) != 0)
++ a_dl = ab_ul;
++ if ((avail & AVAIL_L) != 0)
++ a_dl = a_l;
++ b_dl = a_dl;
++ }
++
++ if ((avail & AVAIL_L) == 0)
++ {
++ a_l = b_dl;
++ b_l = b_dl;
++ }
++ if ((avail & AVAIL_UL) == 0)
++ {
++ ab_ul = b_l;
++ }
++ if ((avail & AVAIL_U) == 0)
++ {
++ a_u = ab_ul;
++ b_u = ab_ul;
++ }
++ if ((avail & AVAIL_UR) == 0)
++ {
++ a_ur = b_u;
++ b_ur = b_u;
++ }
++
++ if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints
++ {
++ if ((req & AVAIL_UL) != 0)
++ left[-1] = *ab_ul;
++
++ if ((want & AVAIL_L) != 0)
++ EXTEND(left, *a_l, size);
++ if ((want & AVAIL_DL) != 0)
++ EXTEND(left + size, *a_dl, size);
++ if ((want & AVAIL_U) != 0)
++ EXTEND(top, *a_u, size);
++ if ((want & AVAIL_UR) != 0)
++ EXTEND(top + size, *a_ur, size);
++
++ if ((have & AVAIL_U) != 0)
++ // Always good - even with sand
++ memcpy(top, a_u, size * sizeof(pixel));
++ if ((have & AVAIL_UR) != 0)
++ {
++ memcpy(top + size, a_ur, top_right_size * sizeof(pixel));
++ EXTEND(top + size + top_right_size, *b_ur,
++ size - top_right_size);
++ }
++ if ((have & AVAIL_L) != 0)
++ {
++ for (i = 0; i < size; i++)
++ left[i] = b_l[stride * i];
++ }
++ if ((have & AVAIL_DL) != 0)
++ {
++ for (i = 0; i < down_left_size; i++)
++ left[i + size] = b_dl[stride * i];
++ EXTEND(left + size + down_left_size, *a_dl,
++ size - down_left_size);
++ }
++ }
++ else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint
++ FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold &&
++ FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold)
++ {
++ if ((req & (AVAIL_U | AVAIL_UR)) != 0)
++ filter_strong(top, *ab_ul, *b_ur, size * 2);
++ left[-1] = *ab_ul;
++ if ((req & (AVAIL_L | AVAIL_DL)) != 0)
++ filter_strong(left, *ab_ul, *a_dl, size*2);
++ }
++ else
++ {
++ // Same code for both have & want for UL
++ if ((req & AVAIL_UL) != 0)
++ {
++ left[-1] = filter_light1(*b_l, *ab_ul, *a_u);
++ }
++
++ if ((want & AVAIL_L) != 0)
++ {
++ EXTEND(left, *a_l, size);
++ left[0] = (*a_l * 3 + *ab_ul + 2) >> 2;
++ }
++ if ((want & AVAIL_DL) != 0)
++ {
++            // If we want DL then it cannot be avail, so a_dl == a_l and no edge rounding is needed
++ EXTEND(left + size, *a_l, size);
++ }
++ if ((want & AVAIL_U) != 0)
++ {
++ EXTEND(top, *a_u, size);
++ top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2;
++ }
++ if ((want & AVAIL_UR) != 0)
++ {
++            // If we want UR then it cannot be avail, so a_ur == b_u and no edge rounding is needed
++ EXTEND(top + size, *a_ur, size);
++ }
++
++ if ((have & AVAIL_U) != 0)
++ {
++ filter_light(top, *ab_ul, a_u, *a_ur, 1, size);
++ }
++ if ((have & AVAIL_UR) != 0) {
++ filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size);
++ top[size*2 - 1] = *b_ur;
++ EXTEND(top + size + top_right_size, *b_ur, size - top_right_size);
++ }
++ if ((have & AVAIL_L) != 0)
++ {
++ filter_light(left, *ab_ul, b_l, *b_dl, stride, size);
++ }
++ if ((have & AVAIL_DL) != 0)
++ {
++ filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size);
++ left[size*2 - 1] = *a_dl;
++ EXTEND(left + size + down_left_size, *a_dl, size - down_left_size);
++ }
++ }
++}
++
++#define INTRA_FILTER(log2_size) \
++static void FUNC(intra_filter_ ## log2_size)( \
++ uint8_t * const left, uint8_t * const top, \
++ const unsigned int req, const unsigned int avail, \
++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \
++ const unsigned int stride, \
++ const unsigned int top_right_size, const unsigned int down_left_size) \
++{ \
++ intra_filter((pixel *)left, (pixel *)top, req, avail, \
++ (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \
++}
++
++INTRA_FILTER(2)
++INTRA_FILTER(3)
++INTRA_FILTER(4)
++INTRA_FILTER(5)
++
++#undef intra_filter
++#undef INTRA_FILTER
++
++static av_always_inline void FUNC(intra_pred)(const HEVCRpiContext * const s,
++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail,
++ const unsigned int log2_size)
++{
+    // c_idx will always be 1 for _c versions and 0 for y
+ const unsigned int c_idx = PRED_C;
-+ int i;
+ const unsigned int hshift = ctx_hshift(s, c_idx);
+ const unsigned int vshift = ctx_vshift(s, c_idx);
-+ int size = (1 << log2_size);
-+ int size_in_luma_h = size << hshift;
-+ int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
-+ int size_in_luma_v = size << vshift;
-+ int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
-+ const int x = x0 >> hshift;
-+ const int y = y0 >> vshift;
-+ int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
-+ int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
-+
-+ int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
++ const unsigned int size = (1 << log2_size);
++ const unsigned int x = x0 >> hshift;
++ const unsigned int y = y0 >> vshift;
+
+ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel);
+ pixel *const src = c_idx == 0 ?
+ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) :
+ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y);
+
-+ int min_pu_width = s->ps.sps->min_pu_width;
++ // Align so we can do multiple loads in the asm
++ // Padded to 16 byte boundary so as not to confuse anything
++ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
++ DECLARE_ALIGNED(16, pixel, top_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
+
-+ const enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
-+ lc->tu.intra_pred_mode;
-+ pixel4 a;
-+ pixel left_array[2 * MAX_TB_SIZE + 1];
++ pixel * const left = left_array + 16 / sizeof(pixel);
++ pixel * const top = top_array + 16 / sizeof(pixel);
++ const pixel * top_pred = top;
++
++ const pixel * src_l = src - 1;
++ const pixel * src_u = src - stride;
++ const pixel * src_ur = src_u + size;
+#if !PRED_C
-+ pixel filtered_left_array[2 * MAX_TB_SIZE + 1];
-+#endif
-+ pixel top_array[2 * MAX_TB_SIZE + 1];
-+#if !PRED_C
-+ pixel filtered_top_array[2 * MAX_TB_SIZE + 1];
++ unsigned int req = req_avail[log2_size - 2][mode];
++#else
++ unsigned int req = req_avail_c[mode];
+#endif
+
-+ pixel *left = left_array + 1;
-+ pixel *top = top_array + 1;
++ // If we have nothing to pred from then fill with grey
++ // This isn't a common case but dealing with it here means we don't have to
++ // test for it later
++ if (avail == 0)
++ {
++dc_only:
+#if !PRED_C
-+ pixel *filtered_left = filtered_left_array + 1;
-+ pixel *filtered_top = filtered_top_array + 1;
++ s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride);
++#else
++ s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride);
+#endif
-+ int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask);
-+ int cand_left = lc->na.cand_left;
-+ int cand_up_left = lc->na.cand_up_left;
-+ int cand_up = lc->na.cand_up;
-+ int cand_up_right = lc->na.cand_up_right && cur_tb_addr > MIN_TB_ADDR_ZS((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask, y_tb - 1);
++ return;
++ }
+
-+ int bottom_left_size = (FFMIN(y0 + 2 * size_in_luma_v, s->ps.sps->height) -
-+ (y0 + size_in_luma_v)) >> vshift;
-+ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
-+ (x0 + size_in_luma_h)) >> hshift;
-+
-+ pixel * src_l = src - 1;
-+ pixel * src_u = src - stride;
-+ pixel * src_ur = src_u + size;
++ // There will be no filtering on C so no point worrying about disabling it
++#if !PRED_C
++ if (s->ps.sps->intra_smoothing_disabled_flag)
++ req &= ~FILTER_EITHER;
++ if (!s->ps.sps->sps_strong_intra_smoothing_enable_flag)
++ req &= ~FILTER_STRONG;
++#endif
+
+ {
+ // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs
@@ -30437,255 +37912,123 @@ index 0000000000..4ee776f955
+ src_ur += stripe_adj;
+ }
+
-+ if (s->ps.pps->constrained_intra_pred_flag == 1) {
-+ int size_in_luma_pu_v = PU(size_in_luma_v);
-+ int size_in_luma_pu_h = PU(size_in_luma_h);
-+ int on_pu_edge_x = !av_mod_uintp2(x0, s->ps.sps->log2_min_pu_size);
-+ int on_pu_edge_y = !av_mod_uintp2(y0, s->ps.sps->log2_min_pu_size);
-+ if (!size_in_luma_pu_h)
-+ size_in_luma_pu_h++;
-+ if (cand_bottom_left == 1 && on_pu_edge_x) {
-+ int x_left_pu = PU(x0 - 1);
-+ int y_bottom_pu = PU(y0 + size_in_luma_v);
-+ int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_bottom_pu);
-+ cand_bottom_left = 0;
-+ for (i = 0; i < max; i += 2)
-+ cand_bottom_left |= (MVF(x_left_pu, y_bottom_pu + i).pred_flag == PF_INTRA);
-+ }
-+ if (cand_left == 1 && on_pu_edge_x) {
-+ int x_left_pu = PU(x0 - 1);
-+ int y_left_pu = PU(y0);
-+ int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_left_pu);
-+ cand_left = 0;
-+ for (i = 0; i < max; i += 2)
-+ cand_left |= (MVF(x_left_pu, y_left_pu + i).pred_flag == PF_INTRA);
-+ }
-+ if (cand_up_left == 1) {
-+ int x_left_pu = PU(x0 - 1);
-+ int y_top_pu = PU(y0 - 1);
-+ cand_up_left = MVF(x_left_pu, y_top_pu).pred_flag == PF_INTRA;
-+ }
-+ if (cand_up == 1 && on_pu_edge_y) {
-+ int x_top_pu = PU(x0);
-+ int y_top_pu = PU(y0 - 1);
-+ int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_top_pu);
-+ cand_up = 0;
-+ for (i = 0; i < max; i += 2)
-+ cand_up |= (MVF(x_top_pu + i, y_top_pu).pred_flag == PF_INTRA);
-+ }
-+ if (cand_up_right == 1 && on_pu_edge_y) {
-+ int y_top_pu = PU(y0 - 1);
-+ int x_right_pu = PU(x0 + size_in_luma_h);
-+ int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_right_pu);
-+ cand_up_right = 0;
-+ for (i = 0; i < max; i += 2)
-+ cand_up_right |= (MVF(x_right_pu + i, y_top_pu).pred_flag == PF_INTRA);
-+ }
-+ memset(left, 128, 2 * MAX_TB_SIZE*sizeof(pixel));
-+ memset(top , 128, 2 * MAX_TB_SIZE*sizeof(pixel));
-+ top[-1] = 128;
-+ }
-+ if (cand_up_left) {
-+ left[-1] = src_l[-stride];
-+ top[-1] = left[-1];
-+ }
-+ if (cand_up)
-+ // Always good - even with sand
-+ memcpy(top, src_u, size * sizeof(pixel));
-+ if (cand_up_right) {
-+ memcpy(top + size, src_ur, top_right_size * sizeof(pixel));
-+ EXTEND(top + size + top_right_size, top[size + top_right_size - 1],
-+ size - top_right_size);
-+ }
-+ if (cand_left)
-+ for (i = 0; i < size; i++)
-+ left[i] = src_l[stride * i];
-+ if (cand_bottom_left) {
-+ for (i = size; i < size + bottom_left_size; i++)
-+ left[i] = src_l[stride * i];
-+ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1],
-+ size - bottom_left_size);
-+ }
++ if (s->ps.pps->constrained_intra_pred_flag == 1 &&
++ s->sh.slice_type != HEVC_SLICE_I) // Can deal with I-slices in 'normal' code
++ {
++ const unsigned int l2_pu_s = FFMAX(s->ps.sps->log2_min_pu_size - hshift, 2);
++ const unsigned int l2_pu_stride_s = l2_pu_s - (s->ps.sps->log2_min_pu_size - hshift);
+
-+ if (s->ps.pps->constrained_intra_pred_flag == 1) {
-+ if (cand_bottom_left || cand_left || cand_up_left || cand_up || cand_up_right) {
-+ int size_max_x = x0 + ((2 * size) << hshift) < s->ps.sps->width ?
-+ 2 * size : (s->ps.sps->width - x0) >> hshift;
-+ int size_max_y = y0 + ((2 * size) << vshift) < s->ps.sps->height ?
-+ 2 * size : (s->ps.sps->height - y0) >> vshift;
-+ int j = size + (cand_bottom_left? bottom_left_size: 0) -1;
-+ if (!cand_up_right) {
-+ size_max_x = x0 + ((size) << hshift) < s->ps.sps->width ?
-+ size : (s->ps.sps->width - x0) >> hshift;
-+ }
-+ if (!cand_bottom_left) {
-+ size_max_y = y0 + (( size) << vshift) < s->ps.sps->height ?
-+ size : (s->ps.sps->height - y0) >> vshift;
-+ }
-+ if (cand_bottom_left || cand_left || cand_up_left) {
-+ while (j > -1 && !IS_INTRA(-1, j))
-+ j--;
-+ if (!IS_INTRA(-1, j)) {
-+ j = 0;
-+ while (j < size_max_x && !IS_INTRA(j, -1))
-+ j++;
-+ EXTEND_LEFT_CIP(top, j, j + 1);
-+ left[-1] = top[-1];
-+ }
-+ } else {
-+ j = 0;
-+ while (j < size_max_x && !IS_INTRA(j, -1))
-+ j++;
-+ if (j > 0)
-+ if (x0 > 0) {
-+ EXTEND_LEFT_CIP(top, j, j + 1);
-+ } else {
-+ EXTEND_LEFT_CIP(top, j, j);
-+ top[-1] = top[0];
-+ }
-+ left[-1] = top[-1];
-+ }
-+ left[-1] = top[-1];
-+ if (cand_bottom_left || cand_left) {
-+ a = PIXEL_SPLAT_X4(left[-1]);
-+ EXTEND_DOWN_CIP(left, 0, size_max_y);
-+ }
-+ if (!cand_left)
-+ EXTEND(left, left[-1], size);
-+ if (!cand_bottom_left)
-+ EXTEND(left + size, left[size - 1], size);
-+ if (x0 != 0 && y0 != 0) {
-+ a = PIXEL_SPLAT_X4(left[size_max_y - 1]);
-+ EXTEND_UP_CIP(left, size_max_y - 1, size_max_y);
-+ if (!IS_INTRA(-1, - 1))
-+ left[-1] = left[0];
-+ } else if (x0 == 0) {
-+ EXTEND(left, 0, size_max_y);
-+ } else {
-+ a = PIXEL_SPLAT_X4(left[size_max_y - 1]);
-+ EXTEND_UP_CIP(left, size_max_y - 1, size_max_y);
-+ }
-+ top[-1] = left[-1];
-+ if (y0 != 0) {
-+ a = PIXEL_SPLAT_X4(left[-1]);
-+ EXTEND_RIGHT_CIP(top, 0, size_max_x);
-+ }
-+ }
-+ }
-+ // Infer the unavailable samples
-+ if (!cand_bottom_left) {
-+ if (cand_left) {
-+ EXTEND(left + size, left[size - 1], size);
-+ } else if (cand_up_left) {
-+ EXTEND(left, left[-1], 2 * size);
-+ cand_left = 1;
-+ } else if (cand_up) {
-+ left[-1] = top[0];
-+ EXTEND(left, left[-1], 2 * size);
-+ cand_up_left = 1;
-+ cand_left = 1;
-+ } else if (cand_up_right) {
-+ EXTEND(top, top[size], size);
-+ left[-1] = top[size];
-+ EXTEND(left, left[-1], 2 * size);
-+ cand_up = 1;
-+ cand_up_left = 1;
-+ cand_left = 1;
-+ } else { // No samples available
-+#if PRED_C
-+ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8));
-+#else
-+ left[-1] = (1 << (BIT_DEPTH - 1));
-+#endif
-+ EXTEND(top, left[-1], 2 * size);
-+ EXTEND(left, left[-1], 2 * size);
-+ }
-+ }
++ unsigned int avail_l = cip_avail(&MVF_PU(-1, size * 2 - 1),
++ -(int)(s->ps.sps->min_pu_width << l2_pu_stride_s),
++ l2_pu_s,
++ avail >> AVAIL_S_UL,
++ size,
++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size), size);
++ unsigned int avail_u = cip_avail(&MVF_PU(0, -1),
++ 1 << l2_pu_stride_s,
++ l2_pu_s,
++ avail << 1,
++ size,
++ size, FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size));
+
-+ if (!cand_left)
-+ EXTEND(left, left[size], size);
-+ if (!cand_up_left) {
-+ left[-1] = left[0];
-+ }
-+ if (!cand_up)
-+ EXTEND(top, left[-1], size);
-+ if (!cand_up_right)
-+ EXTEND(top + size, top[size - 1], size);
++ // Anything left?
++ if ((avail_l | avail_u) == 0)
++ goto dc_only;
+
-+ top[-1] = left[-1];
++ FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size);
+
-+ // Filtering process
-+ // Sand can only apply to chroma_format_idc == 1 so we don't need to
-+ // worry about chroma smoothing for that case
+#if !PRED_C
-+ if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || ctx_cfmt(s) == 3)) {
-+ if (mode != INTRA_DC && size != 4){
-+ int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
-+ int min_dist_vert_hor = FFMIN(FFABS((int)(mode - 26U)),
-+ FFABS((int)(mode - 10U)));
-+ if (min_dist_vert_hor > intra_hor_ver_dist_thresh[log2_size - 3]) {
-+ int threshold = 1 << (BIT_DEPTH - 5);
-+ if (s->ps.sps->sps_strong_intra_smoothing_enable_flag && c_idx == 0 &&
-+ log2_size == 5 &&
-+ FFABS(top[-1] + top[63] - 2 * top[31]) < threshold &&
-+ FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) {
-+ // We can't just overwrite values in top because it could be
-+ // a pointer into src
-+ filtered_top[-1] = top[-1];
-+ filtered_top[63] = top[63];
-+ for (i = 0; i < 63; i++)
-+ filtered_top[i] = ((64 - (i + 1)) * top[-1] +
-+ (i + 1) * top[63] + 32) >> 6;
-+ for (i = 0; i < 63; i++)
-+ left[i] = ((64 - (i + 1)) * left[-1] +
-+ (i + 1) * left[63] + 32) >> 6;
-+ top = filtered_top;
-+ } else {
-+ filtered_left[2 * size - 1] = left[2 * size - 1];
-+ filtered_top[2 * size - 1] = top[2 * size - 1];
-+ for (i = 2 * size - 2; i >= 0; i--)
-+ filtered_left[i] = (left[i + 1] + 2 * left[i] +
-+ left[i - 1] + 2) >> 2;
-+ filtered_top[-1] =
-+ filtered_left[-1] = (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
-+ for (i = 2 * size - 2; i >= 0; i--)
-+ filtered_top[i] = (top[i + 1] + 2 * top[i] +
-+ top[i - 1] + 2) >> 2;
-+ left = filtered_left;
-+ top = filtered_top;
-+ }
++ if ((req & FILTER_LIGHT) != 0)
++ {
++ const unsigned threshold = 1 << (BIT_DEPTH - 5);
++ if ((req & FILTER_STRONG) != 0 &&
++ (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold &&
++ (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold)
++ {
++ filter_strong(top, left[-1], top[63], 64);
++ filter_strong(left, left[-1], left[63], 64);
++ } else
++ {
++ // LHS writes UL too so copy for top
++ const pixel p_ul = left[-1];
++ filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size);
++ filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1);
+ }
+ }
++#endif
++ }
++ else
++ {
++ const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size);
++ if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 &&
++ ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size))
++ {
++ top_pred = src_u;
++ }
++ else
++ {
++#if !PRED_C
++ s->hpc.intra_filter[log2_size - 2]
++#else
++ s->hpc.intra_filter_c[log2_size - 2]
++#endif
++ ((uint8_t *)left, (uint8_t *)top, req, avail,
++ (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel),
++ ur_size,
++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size));
++ }
+ }
+
++
++#if !PRED_C
+ switch (mode) {
+ case INTRA_PLANAR:
-+ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
+ (uint8_t *)left, stride);
+ break;
+ case INTRA_DC:
-+ s->hpc.pred_dc((uint8_t *)src, (uint8_t *)top,
-+ (uint8_t *)left, stride, log2_size, c_idx);
++ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_ANGULAR_HORIZONTAL:
++ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ case INTRA_ANGULAR_VERTICAL:
++ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
+ break;
+ default:
-+ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
-+ (uint8_t *)left, stride, c_idx,
++ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
+ mode);
+ break;
+ }
+#else
+ switch (mode) {
+ case INTRA_PLANAR:
-+ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
+ (uint8_t *)left, stride);
+ break;
+ case INTRA_DC:
-+ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top,
-+ (uint8_t *)left, stride, log2_size, c_idx);
++ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_ANGULAR_HORIZONTAL:
++ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ case INTRA_ANGULAR_VERTICAL:
++ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
+ break;
+ default:
-+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
-+ (uint8_t *)left, stride, c_idx,
++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
+ mode);
+ break;
+ }
@@ -30699,10 +38042,11 @@ index 0000000000..4ee776f955
+#endif
+}
+
-+#define INTRA_PRED(size) \
-+static void FUNC(intra_pred_ ## size)(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx) \
-+{ \
-+ FUNC(intra_pred)(s, lc, x0, y0, size, c_idx); \
++#define INTRA_PRED(log2_size) \
++static void FUNC(intra_pred_ ## log2_size)(const struct HEVCRpiContext * const s, \
++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail) \
++{ \
++ FUNC(intra_pred)(s, mode, x0, y0, avail, log2_size); \
+}
+
+INTRA_PRED(2)
@@ -30768,7 +38112,7 @@ index 0000000000..4ee776f955
+#if !PRED_C
+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
+ const uint8_t *_left,
-+ ptrdiff_t stride, int log2_size, int c_idx)
++ ptrdiff_t stride, int log2_size)
+{
+ int i, j, x, y;
+ int size = (1 << log2_size);
@@ -30788,7 +38132,10 @@ index 0000000000..4ee776f955
+ for (j = 0; j < size; j+=4)
+ AV_WN4P(&POS(j, i), a);
+
-+ if (c_idx == 0 && size < 32) {
++// if (c_idx == 0 && size < 32)
++// As we now have separate fns for y & c - no need to test that
++ if (size < 32)
++ {
+ POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2;
+ for (x = 1; x < size; x++)
+ POS(x, 0) = (top[x] + 3 * dc + 2) >> 2;
@@ -30799,7 +38146,7 @@ index 0000000000..4ee776f955
+#else
+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
+ const uint8_t *_left,
-+ ptrdiff_t stride, int log2_size, int c_idx)
++ ptrdiff_t stride, int log2_size)
+{
+ unsigned int i, j;
+ const unsigned int size = (1 << log2_size);
@@ -30830,6 +38177,70 @@ index 0000000000..4ee776f955
+}
+#endif
+
++#define PRED_DC(size)\
++static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \
++ const uint8_t *left, ptrdiff_t stride) \
++{ \
++ FUNC(pred_dc)(src, top, left, stride, size + 2); \
++}
++
++PRED_DC(0)
++PRED_DC(1)
++PRED_DC(2)
++PRED_DC(3)
++
++#undef PRED_DC
++
++
++
++
++#if !PRED_C
++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
++{
++ int i, j;
++ int size = (1 << log2_size);
++ pixel *src = (pixel *)_src;
++ pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1));
++
++ for (i = 0; i < size; i++)
++ for (j = 0; j < size; j+=4)
++ AV_WN4P(&POS(j, i), a);
++}
++#else
++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
++{
++ unsigned int i, j;
++ const unsigned int size = (1 << log2_size);
++ c_dst_ptr_t src = (c_dst_ptr_t)_src;
++ const pixel a = (1 << (BIT_DEPTH - 1));
++
++ for (i = 0; i < size; i++, src += stride)
++ {
++ for (j = 0; j < size; ++j)
++ {
++ src[j][0] = a;
++ src[j][1] = a;
++ }
++ }
++}
++#endif
++
++#define PRED_DC0(size)\
++static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride) \
++{ \
++ FUNC(pred_dc0)(src, stride, size + 2); \
++}
++
++PRED_DC0(0)
++PRED_DC0(1)
++PRED_DC0(2)
++PRED_DC0(3)
++
++#undef PRED_DC0
++
++
++
++
+#ifndef ANGLE_CONSTS
+#define ANGLE_CONSTS
+static const int intra_pred_angle[] = {
@@ -30846,7 +38257,7 @@ index 0000000000..4ee776f955
+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
+ const uint8_t *_top,
+ const uint8_t *_left,
-+ ptrdiff_t stride, int c_idx,
++ ptrdiff_t stride,
+ int mode, int size)
+{
+ int x, y;
@@ -30889,10 +38300,12 @@ index 0000000000..4ee776f955
+ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
+ }
+ }
-+ if (mode == 26 && c_idx == 0 && size < 32) {
++// if (mode == 26 && c_idx == 0 && size < 32) {
++ if (mode == 26 && size < 32) {
+ for (y = 0; y < size; y++)
+ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1));
+ }
++
+ } else {
+ ref = left - 1;
+ if (angle < 0 && last < -1) {
@@ -30916,7 +38329,8 @@ index 0000000000..4ee776f955
+ POS(x, y) = ref[y + idx + 1];
+ }
+ }
-+ if (mode == 10 && c_idx == 0 && size < 32) {
++// if (mode == 10 && c_idx == 0 && size < 32) {
++ if (mode == 10 && size < 32) {
+ for (x = 0; x < size; x += 4) {
+ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - top[-1]) >> 1));
+ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - top[-1]) >> 1));
@@ -30925,12 +38339,61 @@ index 0000000000..4ee776f955
+ }
+ }
+ }
++
++
++
++#if BIT_DEPTH == 8 && 0
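++    // Disabled self-test: run the NEON angular predictor over the same
++    // inputs, dump both results and assert that they match this C output.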
++ if ((size == 16 || size == 32) && mode != 10 && mode != 26) {
++ DECLARE_ALIGNED(16, uint8_t, a[64*32]);
++ void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++// void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++#if 1
++ src = (pixel *)_src;
++ printf("C: Mode=%d\n", mode);
++ for (y = 0; y < size; y++, src += stride)
++ {
++ printf("%2d: ", y);
++ for (x = 0; x < size; x++)
++ {
++ printf("%3x ", src[x]);
++ }
++ printf("\n");
++ }
++#endif
++// ff_hevc_rpi_pred_vertical_16_neon_8(a, _top, _left, size);
++ memset(a, 0, sizeof(a));
++// ff_hevc_rpi_pred_angular_32_neon_10(a, _top, _left, size, mode);
++ ff_hevc_rpi_pred_angular_16_neon_8(a, _top, _left, size, mode);
++#if 1
++ src = (pixel *)a;
++ printf("A:\n");
++ for (y = 0; y < size; y++, src += size)
++ {
++ printf("%2d: ", y);
++ for (x = 0; x < size; x++)
++ {
++ printf("%3x ", src[x]);
++ }
++ printf("\n");
++ }
++#endif
++ src = (pixel *)_src;
++ for (y = 0; y < size; y++, src += stride)
++ {
++ if (memcmp(src, a + size * sizeof(pixel) * y, size * sizeof(pixel)) != 0) {
++ printf("Fail at line %d\n", y);
++ av_assert0(0);
++ }
++ }
++ }
++#endif
++
+}
+#else
+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
+ const uint8_t *_top,
+ const uint8_t *_left,
-+ ptrdiff_t stride, int c_idx,
++ ptrdiff_t stride,
+ int mode, int size)
+{
+ int x, y;
@@ -31001,35 +38464,78 @@ index 0000000000..4ee776f955
+ }
+ }
+ }
++
++#if BIT_DEPTH == 10 && 0
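++    // Disabled self-test for the 10-bit chroma angular predictor: compare
++    // the NEON output against this C implementation and assert equality.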
++ if (size == 16 && mode != 10 && mode != 26) {
++ DECLARE_ALIGNED(16, uint8_t, a[64*32]);
++// void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++ void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++ src = (c_dst_ptr_t)_src;
++ printf("C: mode=%d\n", mode);
++ for (y = 0; y < size; y++, src += stride)
++ {
++ for (x = 0; x < size; x++)
++ {
++ printf("%3x:%3x ", src[x][0], src[x][1]);
++ }
++ printf("\n");
++ }
++
++ memset(a, 0, sizeof(a));
++ ff_hevc_rpi_pred_angular_c_16_neon_10(a, _top, _left, size, mode);
++
++ src = (c_dst_ptr_t)a;
++ printf("A:\n");
++ for (y = 0; y < size; y++, src += size)
++ {
++ for (x = 0; x < size; x++)
++ {
++ printf("%3x:%3x ", src[x][0], src[x][1]);
++ }
++ printf("\n");
++ }
++
++ src = (c_dst_ptr_t)_src;
++ for (y = 0; y < size; y++, src += stride)
++ {
++ if (memcmp(src, a + size * sizeof(pixel) * y, size * sizeof(pixel)) != 0) {
++ printf("Fail at line %d\n", y);
++ av_assert0(0);
++ }
++ }
++
++ }
++#endif
+}
+#endif
+
+static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
+ const uint8_t *left,
-+ ptrdiff_t stride, int c_idx, int mode)
++ ptrdiff_t stride, int mode)
+{
-+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 2);
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2);
+}
+
+static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top,
+ const uint8_t *left,
-+ ptrdiff_t stride, int c_idx, int mode)
++ ptrdiff_t stride, int mode)
+{
-+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 3);
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3);
+}
+
+static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top,
+ const uint8_t *left,
-+ ptrdiff_t stride, int c_idx, int mode)
++ ptrdiff_t stride, int mode)
+{
-+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 4);
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4);
+}
+
+static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
+ const uint8_t *left,
-+ ptrdiff_t stride, int c_idx, int mode)
++ ptrdiff_t stride, int mode)
+{
-+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5);
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5);
+}
+
+#undef cpel
@@ -31049,6 +38555,11 @@ index 0000000000..4ee776f955
+#undef POS
+#undef PW
+
++#undef filter_light1
++#undef filter_light
++#undef filter_strong
++#undef ref_gen
++
+#ifndef INCLUDED_ONCE
+#define INCLUDED_ONCE
+#endif
@@ -35131,7 +42642,7 @@ index 0000000000..59c0d3959e
+# -Wa,-ahls
diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh
new file mode 100755
-index 0000000000..28b7a4f483
+index 0000000000..c8da66514b
--- /dev/null
+++ b/pi-util/conf_pi2.sh
@@ -0,0 +1,32 @@
@@ -35142,7 +42653,7 @@ index 0000000000..28b7a4f483
+
+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
-+RPI_DEFINES="-D__VCCOREVER__=0x4000000"
++RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon"
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
+
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch
index 981a88e102..551a27104a 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch
@@ -1,7 +1,7 @@
From 20af7af23a9f366476e67669f14957dfaf58f141 Mon Sep 17 00:00:00 2001
From: Hendrik Leppkes
Date: Sat, 9 Jan 2016 16:34:09 +0100
-Subject: [PATCH 1/3] avcodec: add h264_mvc codec id and profiles
+Subject: [PATCH 1/4] avcodec: add h264_mvc codec id and profiles
---
libavcodec/avcodec.h | 3 +++
@@ -75,13 +75,13 @@ index 37a6aa8bff..52c5b659c4 100644
{ 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC },
{ 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS },
--
-2.14.1
+2.17.0
From 0f3fda4e348e6b12570f5d279713f6da46511846 Mon Sep 17 00:00:00 2001
From: Hendrik Leppkes
Date: Sat, 9 Jan 2016 16:34:40 +0100
-Subject: [PATCH 2/3] h264_parser: add support for parsing h264 mvc NALUs
+Subject: [PATCH 2/4] h264_parser: add support for parsing h264 mvc NALUs
---
libavcodec/h264.h | 2 ++
@@ -192,13 +192,13 @@ index f43b197d5e..f96e005ef3 100644
extern AVCodecParser ff_mjpeg_parser;
extern AVCodecParser ff_mlp_parser;
--
-2.14.1
+2.17.0
From cdd668dc436b9c78dcb31df477e329492356e7ec Mon Sep 17 00:00:00 2001
From: Hendrik Leppkes
Date: Tue, 28 Nov 2017 16:12:12 +0000
-Subject: [PATCH 3/3] h264_parser: force grabing a new timestamp until a frame
+Subject: [PATCH 3/4] h264_parser: force grabbing a new timestamp until a frame
start was found
---
@@ -220,5 +220,65 @@ index be8b9db9b0..81c9a1bbae 100644
*poutbuf = NULL;
*poutbuf_size = 0;
--
-2.14.1
+2.17.0
+
+
+From fb0ec9a132d6eb8fd74348ef87b1176c7ca34a00 Mon Sep 17 00:00:00 2001
+From: popcornmix
+Date: Mon, 28 May 2018 13:35:36 +0100
+Subject: [PATCH 4/4] fixup
+
+---
+ libavcodec/extract_extradata_bsf.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+diff --git a/libavcodec/extract_extradata_bsf.c b/libavcodec/extract_extradata_bsf.c
+index 082b3e749b..7612749efc 100644
+--- a/libavcodec/extract_extradata_bsf.c
++++ b/libavcodec/extract_extradata_bsf.c
+@@ -59,7 +59,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
+ HEVC_NAL_VPS, HEVC_NAL_SPS, HEVC_NAL_PPS,
+ };
+ static const int extradata_nal_types_h264[] = {
+- H264_NAL_SPS, H264_NAL_PPS,
++ H264_NAL_SPS, H264_NAL_SPS_SUBSET, H264_NAL_PPS,
+ };
+
+ ExtractExtradataContext *s = ctx->priv_data;
+@@ -90,7 +90,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
+ if (nal->type == HEVC_NAL_SPS) has_sps = 1;
+ if (nal->type == HEVC_NAL_VPS) has_vps = 1;
+ } else {
+- if (nal->type == H264_NAL_SPS) has_sps = 1;
++ if (nal->type == H264_NAL_SPS || nal->type == H264_NAL_SPS_SUBSET) has_sps = 1;
+ }
+ } else if (s->remove) {
+ filtered_size += nal->raw_size + 3;
+@@ -99,7 +99,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
+
+ if (extradata_size &&
+ ((ctx->par_in->codec_id == AV_CODEC_ID_HEVC && has_sps && has_vps) ||
+- (ctx->par_in->codec_id == AV_CODEC_ID_H264 && has_sps))) {
++ ((ctx->par_in->codec_id == AV_CODEC_ID_H264 || ctx->par_in->codec_id == AV_CODEC_ID_H264_MVC) && has_sps))) {
+ AVBufferRef *filtered_buf;
+ uint8_t *extradata, *filtered_data;
+
+@@ -253,6 +253,7 @@ static const struct {
+ } extract_tab[] = {
+ { AV_CODEC_ID_CAVS, extract_extradata_mpeg4 },
+ { AV_CODEC_ID_H264, extract_extradata_h2645 },
++ { AV_CODEC_ID_H264_MVC, extract_extradata_h2645 },
+ { AV_CODEC_ID_HEVC, extract_extradata_h2645 },
+ { AV_CODEC_ID_MPEG1VIDEO, extract_extradata_mpeg12 },
+ { AV_CODEC_ID_MPEG2VIDEO, extract_extradata_mpeg12 },
+@@ -317,6 +318,7 @@ static void extract_extradata_close(AVBSFContext *ctx)
+ static const enum AVCodecID codec_ids[] = {
+ AV_CODEC_ID_CAVS,
+ AV_CODEC_ID_H264,
++ AV_CODEC_ID_H264_MVC,
+ AV_CODEC_ID_HEVC,
+ AV_CODEC_ID_MPEG1VIDEO,
+ AV_CODEC_ID_MPEG2VIDEO,
+--
+2.17.0
diff --git a/packages/multimedia/libdvdcss/package.mk b/packages/multimedia/libdvdcss/package.mk
index 7ef8ece181..a6426cb348 100644
--- a/packages/multimedia/libdvdcss/package.mk
+++ b/packages/multimedia/libdvdcss/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="libdvdcss"
-PKG_VERSION="8f5dba3"
-PKG_SHA256="669868984d1cac32e85ea7e2e669b22b960354e3078073b9176d857844fedc3c"
+PKG_VERSION="80fdc8d"
+PKG_SHA256="1bcdf96e4fe644ac3d70014ef144f3021b69c6f88bba06fc26ec32cd4b8dc82b"
PKG_ARCH="any"
PKG_LICENSE="GPL"
PKG_SITE="https://github.com/xbmc/libdvdcss"
diff --git a/packages/multimedia/libdvdnav/package.mk b/packages/multimedia/libdvdnav/package.mk
index 67150281c9..a108ebb33e 100644
--- a/packages/multimedia/libdvdnav/package.mk
+++ b/packages/multimedia/libdvdnav/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="libdvdnav"
-PKG_VERSION="6501b70"
-PKG_SHA256="4004342489f21ce1d67ef98e95669dbd5c6d3dde1dcdf6ce97358155dc16f6f9"
+PKG_VERSION="9277007"
+PKG_SHA256="e50db40a823ddc795d1fe5f18db2517fb3e05fe0c4a88abf1578d95d7a1cce63"
PKG_ARCH="any"
PKG_LICENSE="GPL"
PKG_SITE="https://github.com/xbmc/libdvdnav"
diff --git a/packages/multimedia/libdvdread/package.mk b/packages/multimedia/libdvdread/package.mk
index da44ec3c25..ce16e86315 100644
--- a/packages/multimedia/libdvdread/package.mk
+++ b/packages/multimedia/libdvdread/package.mk
@@ -17,8 +17,8 @@
################################################################################
PKG_NAME="libdvdread"
-PKG_VERSION="86f9500"
-PKG_SHA256="a73888c81d14443d9f09fa02f5e5ecc08d9ab09639789fab810557069d335f34"
+PKG_VERSION="bd6b329"
+PKG_SHA256="2d9d6d185dd25a983d6dfc2a00207cafdc396a969c227d5edd84b6215b2fba89"
PKG_ARCH="any"
PKG_LICENSE="GPL"
PKG_SITE="https://github.com/xbmc/libdvdread"