From 6c3ad63ba4b0cf2d8661d451032b35add29fb7b4 Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH 01/13] kodi: update to kodi-593949a --- packages/mediacenter/kodi/package.mk | 13 ++-- ...i-100.14-use-alsa-and-pulse-together.patch | 62 ++++++++++++------- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk index b8ebd019d5..f4420f76b8 100644 --- a/packages/mediacenter/kodi/package.mk +++ b/packages/mediacenter/kodi/package.mk @@ -1,24 +1,25 @@ ################################################################################ -# This file is part of OpenELEC - http://www.openelec.tv +# This file is part of LibreELEC - https://libreelec.tv +# Copyright (C) 2017-present Team LibreELEC # Copyright (C) 2009-2016 Stephan Raue (stephan@openelec.tv) # -# OpenELEC is free software: you can redistribute it and/or modify +# LibreELEC is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 2 of the License, or # (at your option) any later version. # -# OpenELEC is distributed in the hope that it will be useful, +# LibreELEC is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with OpenELEC. If not, see . +# along with LibreELEC. If not, see . ################################################################################ PKG_NAME="kodi" -PKG_VERSION="3a989ee" -PKG_SHA256="deb3526aa28d1b64f8d295f18637c42cb031a476cabdbd9dc15af1e33c5d8965" +PKG_VERSION="593949a" +PKG_SHA256="7a4ccfacd24461d5dfbba9be362372912ebc26dd6743e52b706907b6cc081be5" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="http://www.kodi.tv" diff --git a/packages/mediacenter/kodi/patches/kodi-100.14-use-alsa-and-pulse-together.patch b/packages/mediacenter/kodi/patches/kodi-100.14-use-alsa-and-pulse-together.patch index ba07ff7999..7fbca0cd66 100644 --- a/packages/mediacenter/kodi/patches/kodi-100.14-use-alsa-and-pulse-together.patch +++ b/packages/mediacenter/kodi/patches/kodi-100.14-use-alsa-and-pulse-together.patch @@ -1,20 +1,20 @@ -From 5d3b9dae20b9c9b1c9236d98bf9ce64306d8b63f Mon Sep 17 00:00:00 2001 +From 09ef179755107108722027dcc04ae62795c5d522 Mon Sep 17 00:00:00 2001 From: MilhouseVH -Date: Thu, 5 Apr 2018 11:43:28 +0100 +Date: Tue, 22 May 2018 00:28:13 +0100 Subject: [PATCH] allow using alsa and pulse together --- - xbmc/windowing/X11/WinSystemX11GLContext.cpp | 28 ++-------------------------- + xbmc/windowing/X11/WinSystemX11GLContext.cpp | 35 ++-------------------------- xbmc/windowing/amlogic/WinSystemAmlogic.cpp | 2 ++ - xbmc/windowing/gbm/WinSystemGbm.cpp | 27 ++------------------------- + xbmc/windowing/gbm/WinSystemGbm.cpp | 34 ++------------------------- xbmc/windowing/rpi/WinSystemRpi.cpp | 4 ++++ - 4 files changed, 10 insertions(+), 51 deletions(-) + 4 files changed, 10 insertions(+), 65 deletions(-) diff --git a/xbmc/windowing/X11/WinSystemX11GLContext.cpp b/xbmc/windowing/X11/WinSystemX11GLContext.cpp -index 17b83a0..2e76053 100644 +index 6e31a80..2e76053 100644 --- a/xbmc/windowing/X11/WinSystemX11GLContext.cpp +++ b/xbmc/windowing/X11/WinSystemX11GLContext.cpp -@@ -52,32 +52,8 @@ std::unique_ptr CWinSystemBase::CreateWinSystem() 
+@@ -52,39 +52,8 @@ std::unique_ptr CWinSystemBase::CreateWinSystem() CWinSystemX11GLContext::CWinSystemX11GLContext() { @@ -29,6 +29,10 @@ index 17b83a0..2e76053 100644 - { - OPTIONALS::PulseAudioRegister(); - } +- else if (StringUtils::EqualsNoCase(envSink, "OSS")) +- { +- OPTIONALS::OSSRegister(); +- } - else if (StringUtils::EqualsNoCase(envSink, "SNDIO")) - { - OPTIONALS::SndioRegister(); @@ -39,7 +43,10 @@ index 17b83a0..2e76053 100644 - { - if (!OPTIONALS::ALSARegister()) - { -- OPTIONALS::SndioRegister(); +- if (!OPTIONALS::SndioRegister()) +- { +- OPTIONALS::OSSRegister(); +- } - } - } - } @@ -50,7 +57,7 @@ index 17b83a0..2e76053 100644 } diff --git a/xbmc/windowing/amlogic/WinSystemAmlogic.cpp b/xbmc/windowing/amlogic/WinSystemAmlogic.cpp -index 1db2ba7..517aeea 100644 +index 324d47f..1766308 100644 --- a/xbmc/windowing/amlogic/WinSystemAmlogic.cpp +++ b/xbmc/windowing/amlogic/WinSystemAmlogic.cpp @@ -32,6 +32,7 @@ @@ -61,21 +68,21 @@ index 1db2ba7..517aeea 100644 #include "windowing/GraphicContext.h" #include "windowing/Resolution.h" #include "platform/linux/powermanagement/LinuxPowerSyscall.h" -@@ -79,6 +80,7 @@ CWinSystemAmlogic::CWinSystemAmlogic() +@@ -78,6 +79,7 @@ CWinSystemAmlogic::CWinSystemAmlogic() : // Register sink AE::CAESinkFactory::ClearSinks(); CAESinkALSA::Register(); + CAESinkPULSE::Register(); CLinuxPowerSyscall::Register(); - } - + m_lirc.reset(OPTIONALS::LircRegister()); + m_libinput->Start(); diff --git a/xbmc/windowing/gbm/WinSystemGbm.cpp b/xbmc/windowing/gbm/WinSystemGbm.cpp -index 45783bd..7b5e2ba 100644 +index 72ddf6a..79e81d5 100644 --- a/xbmc/windowing/gbm/WinSystemGbm.cpp +++ b/xbmc/windowing/gbm/WinSystemGbm.cpp -@@ -43,31 +43,8 @@ CWinSystemGbm::CWinSystemGbm() : - m_GBM(new CGBMUtils), - m_delayDispReset(false) +@@ -43,38 +43,8 @@ CWinSystemGbm::CWinSystemGbm() : + m_delayDispReset(false), + m_libinput(new CLibInputHandler) { - std::string envSink; - if (getenv("AE_SINK")) @@ -88,6 +95,10 @@ index 45783bd..7b5e2ba 100644 - { - OPTIONALS::PulseAudioRegister(); - } +- else if (StringUtils::EqualsNoCase(envSink, "OSS")) +- { +- OPTIONALS::OSSRegister(); +- } - else if (StringUtils::EqualsNoCase(envSink, "SNDIO")) - { - OPTIONALS::SndioRegister(); @@ -98,22 +109,25 @@ index 45783bd..7b5e2ba 100644 - { - if (!OPTIONALS::ALSARegister()) - { -- OPTIONALS::SndioRegister(); +- if (!OPTIONALS::SndioRegister()) +- { +- OPTIONALS::OSSRegister(); +- } - } - } - } + OPTIONALS::ALSARegister(); + OPTIONALS::PulseAudioRegister(); - m_winEvents.reset(new CWinEventsLinux()); CLinuxPowerSyscall::Register(); + m_lirc.reset(OPTIONALS::LircRegister()); diff --git a/xbmc/windowing/rpi/WinSystemRpi.cpp b/xbmc/windowing/rpi/WinSystemRpi.cpp -index 82534f2..d4e8ba9 100644 +index fac5cc4..f90e46d 100644 --- a/xbmc/windowing/rpi/WinSystemRpi.cpp +++ b/xbmc/windowing/rpi/WinSystemRpi.cpp -@@ -34,7 +34,9 @@ +@@ -33,7 +33,9 @@ + #include "guilib/DispResource.h" #include "utils/log.h" - #include "../WinEventsLinux.h" #include "cores/AudioEngine/AESinkFactory.h" +#include "cores/AudioEngine/Sinks/AESinkALSA.h" #include "cores/AudioEngine/Sinks/AESinkPi.h" @@ -121,15 +135,15 @@ index 82534f2..d4e8ba9 100644 #include "platform/linux/powermanagement/LinuxPowerSyscall.h" #include -@@ -56,6 +58,8 @@ CWinSystemRpi::CWinSystemRpi() - m_winEvents.reset(new CWinEventsLinux()); +@@ -55,6 +57,8 @@ CWinSystemRpi::CWinSystemRpi() : + AE::CAESinkFactory::ClearSinks(); CAESinkPi::Register(); + CAESinkALSA::Register(); + CAESinkPULSE::Register(); CLinuxPowerSyscall::Register(); 
m_lirc.reset(OPTIONALS::LircRegister()); - } + m_libinput->Start(); -- 2.14.1 From b7cdd3c844992ed53af0126f2a5f97006ad26196 Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH 02/13] kodi-binary-addons: update to latest versions --- .../kodi-binary-addons/audiodecoder.2sf/package.mk | 4 ++-- .../kodi-binary-addons/audiodecoder.gsf/package.mk | 4 ++-- .../kodi-binary-addons/audiodecoder.ncsf/package.mk | 4 ++-- .../kodi-binary-addons/audiodecoder.openmpt/package.mk | 4 ++-- .../kodi-binary-addons/audiodecoder.qsf/package.mk | 4 ++-- .../kodi-binary-addons/audiodecoder.ssf/package.mk | 4 ++-- .../kodi-binary-addons/audiodecoder.upse/package.mk | 4 ++-- .../kodi-binary-addons/audiodecoder.usf/package.mk | 4 ++-- .../kodi-binary-addons/inputstream.adaptive/package.mk | 4 ++-- .../kodi-binary-addons/inputstream.rtmp/package.mk | 4 ++-- .../mediacenter/kodi-binary-addons/pvr.argustv/package.mk | 4 ++-- packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk | 4 ++-- .../mediacenter/kodi-binary-addons/pvr.dvblink/package.mk | 4 ++-- .../mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk | 4 ++-- packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk | 4 ++-- .../mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk | 4 ++-- packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk | 4 ++-- .../mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk | 4 ++-- .../kodi-binary-addons/pvr.mediaportal.tvserver/package.mk | 4 ++-- .../mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk | 4 ++-- packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk | 4 ++-- packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk | 4 ++-- .../mediacenter/kodi-binary-addons/pvr.stalker/package.mk | 4 ++-- .../mediacenter/kodi-binary-addons/pvr.teleboy/package.mk | 4 ++-- packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk | 4 ++-- .../mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk | 4 ++-- packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk | 4 ++-- packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk | 4 ++-- packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk | 4 ++-- .../kodi-binary-addons/screensaver.shadertoy/package.mk | 4 ++-- .../kodi-binary-addons/screensavers.rsxs/package.mk | 4 ++-- .../mediacenter/kodi-binary-addons/vfs.libarchive/package.mk | 4 ++-- 32 files changed, 64 insertions(+), 64 deletions(-) diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk index ae03236b23..91cef254d5 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.2sf/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="audiodecoder.2sf" -PKG_VERSION="5f70a33" -PKG_SHA256="378952a4745e93742ec1ff66de87c7f0532f00ba8ac0d80969edcbf832c4e4b0" +PKG_VERSION="afe3580" +PKG_SHA256="d3225745b1f52cc7af32615b967e0ed628a8e98d0f86f408603e3a3e9473b18a" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk index 347a36bf37..7ca5e13bb6 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.gsf/package.mk @@ -17,8 +17,8 @@ 
################################################################################ PKG_NAME="audiodecoder.gsf" -PKG_VERSION="0795b7e" -PKG_SHA256="d6515f4d0a860251ef7cab5f7598438f9bf46231c32201d5f835bf44d0fdfd11" +PKG_VERSION="081ee65" +PKG_SHA256="063a5b0ac606e889e93256fd9ca45db3d7b52e0736ffaa1c22526bfe89f64afb" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk index e0c476d61c..9f55417f7c 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.ncsf/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="audiodecoder.ncsf" -PKG_VERSION="236bcf9" -PKG_SHA256="0e85db9bd16374e024243420dc12bb8bf17c9d71d769eacb6effb887032e595a" +PKG_VERSION="149f324" +PKG_SHA256="f5879d227ee63b63bba872f7cfda5a562b5f6e16c7e3e06c3522124eb11e528e" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk index 17b576e268..0d64f567d4 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.openmpt/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="audiodecoder.openmpt" -PKG_VERSION="fb1041a" -PKG_SHA256="f953c8c7f59c4bd2490c272a77fef128eaa3273d2548448c6e2a6e6cb68e2329" +PKG_VERSION="47e3814" +PKG_SHA256="8485250d47b290820aa7f60f6334bb89c9cbe56b524a8d81476f216e76541d0b" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk index 74008c46e7..bd47a459c7 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.qsf/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="audiodecoder.qsf" -PKG_VERSION="9182d5e" -PKG_SHA256="38678039bb15e272abc7da6e94952ab1434e5f51e1bf2766fe6d96cb093ff053" +PKG_VERSION="876201e" +PKG_SHA256="06f74b44375c1b3bf565fb219dcf164490b7c894ebc76d8684503d1324b86665" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk index a87cd122a4..b1549fb5cd 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.ssf/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="audiodecoder.ssf" -PKG_VERSION="8801d12" -PKG_SHA256="9a130e94542c82e8ddf1b6a8a38d49796488902d0862b809cf60b5dcb3a9f8cc" +PKG_VERSION="8adf121" +PKG_SHA256="18328f92bdfd426814bfd4e7549f674a171420c945f9284aa6183d70870b7f60" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.upse/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.upse/package.mk index f76a31cbe6..d494f0dad0 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.upse/package.mk +++ 
b/packages/mediacenter/kodi-binary-addons/audiodecoder.upse/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="audiodecoder.upse" -PKG_VERSION="de09fb0" -PKG_SHA256="c450453389ac75612c12b599bdb32f85c86a277f70eceac5f4b21c476ff9a932" +PKG_VERSION="6fa70f8" +PKG_SHA256="e0fcf4c85122c293aed7a4ba5f71802db9231d65a69ece9ea47732facb386d1c" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.usf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.usf/package.mk index 6688bb01e0..bfaaa8fbbc 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.usf/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.usf/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="audiodecoder.usf" -PKG_VERSION="cec0fe2" -PKG_SHA256="1bb0afd2debc806fe72c466de76385043b642a9c5b0e2dc6d15ee3bfa0533f7b" +PKG_VERSION="ccb1edc" +PKG_SHA256="d0dc7bc7ad61bc19ec1297da4b04e2291ad27b68e0dc384d378e5106bba87709" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk index 174b03ef21..2f7bb5fa0d 100644 --- a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk +++ b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="inputstream.adaptive" -PKG_VERSION="0f0ced4" -PKG_SHA256="5d3b640f59abcf591d1fb3c4d4ab788683a0e3326bfaa8e9b4f5c2a78f58b947" +PKG_VERSION="babcca4" +PKG_SHA256="1351012bbdfe18e683f217ea999d596e0a7f21ea48e9a5c1783ca06e864b144e" PKG_LICENSE="GPL" PKG_SITE="http://www.kodi.tv" PKG_URL="https://github.com/peak3d/inputstream.adaptive/archive/$PKG_VERSION.tar.gz" diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk index 98e9986820..801767ec17 100644 --- a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk +++ b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="inputstream.rtmp" -PKG_VERSION="3b5058d" -PKG_SHA256="cf2b935bcd12dee759aeaca6c9ad65294a4323d430c7029e7f2923e10aa1a679" +PKG_VERSION="b8e3f39" +PKG_SHA256="eb6cc5f164c3bc76582362ea0683cfdc368ae31591786e11b4a75e0b30c8b0b8" PKG_LICENSE="GPL" PKG_SITE="http://www.kodi.tv" PKG_URL="https://github.com/notspiff/inputstream.rtmp/archive/$PKG_VERSION.tar.gz" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk index 231134987a..5fc2d2fe44 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.argustv" -PKG_VERSION="377f796" -PKG_SHA256="7ac85250793690c2e05692a5c3db7398fc84cffa9cf023c1d2a97d378fe53eb3" +PKG_VERSION="2bce465" +PKG_SHA256="2e80867293949e452ca623ac3ed88aa33e5de50fe7e0c6c51f476fca1fa5841a" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk 
b/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk index 40c086ed3d..2d29359c4f 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.demo" -PKG_VERSION="d5e5cd1" -PKG_SHA256="cb63a50c85a02f7ca38144d2f1a536e85116b01dd849bcce9300ca778d0de7ea" +PKG_VERSION="20d81d8" +PKG_SHA256="67b37fc6d7401dfa7b508241ff2d230fbf0879286b43a70667fd3fb89002470a" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk index eded9c6311..41ca080b14 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.dvblink/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.dvblink" -PKG_VERSION="c61ea73" -PKG_SHA256="127fc5139603c59c1e3a27cf3694e558d46d0fb22219f0e78c45372fd356c75f" +PKG_VERSION="a87258b" +PKG_SHA256="a9ddc8b70d42e174aa9486b84d467296afa870f80fff32dd84223b12abf762e8" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk index 71cc7fcbc7..3336865d47 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.dvbviewer" -PKG_VERSION="098d23c" -PKG_SHA256="e47ccbbb6c1ee7fa096d91e93ae9878ee33fe442bd02baafa745c2946fa02d40" +PKG_VERSION="884b732" +PKG_SHA256="13e2c95aabfc5ee8ded5bcf1259492bd4487574ad2e2ee531061989b2e8f4e41" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk index ae7ca6840a..5fdc85ed58 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.filmon" -PKG_VERSION="0ca1665" -PKG_SHA256="3ea8ae440fd7adb65f3e8d619af592c0224da366636ba0ba7aadb89406b6ec5b" +PKG_VERSION="470ca1c" +PKG_SHA256="be27454a280664b0bb20c1f18d281ca293d0d74cfa464eaabd771c417c5ff174" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk index 851d180aec..f1f749d291 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.hdhomerun" -PKG_VERSION="484b91d" -PKG_SHA256="a6d00a4e293dda7a2a48262d94548bda6c9e34971061e05e437edb1c2cf8515b" +PKG_VERSION="4639740" +PKG_SHA256="0682689ff55e0585ccd9b57e81af57defab1efde6c56b2e645c03ab4438e2e44" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk index 62ea381327..52474200e8 100644 --- 
a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.hts" -PKG_VERSION="9533cce" -PKG_SHA256="2fa8490abcaefdc1e0652d9fa5b873b246946f578842eba0e5aebd4bc0c69b20" +PKG_VERSION="4f7196d" +PKG_SHA256="12f5a51e9923b96f870be59a47336c33d160a8e8903e58027f0dd0cd82cf8347" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk index dc922f8e59..67e58c5186 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.iptvsimple" -PKG_VERSION="77156cb" -PKG_SHA256="96da93cedab5ecafb4ca49fc8942ce0979b2b931b7115359ec97f55f260f9e5f" +PKG_VERSION="e220777" +PKG_SHA256="ed6159cea372129ec49776a778aa9284898abdc2996c1744401273ac1fc21ef5" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk index 24bb53155a..514b8fa7a9 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.mediaportal.tvserver" -PKG_VERSION="9702684" -PKG_SHA256="53d295c69a53c775c477c117e7efc3a4a2f61bd896396087004a1e8c58f2e2b6" +PKG_VERSION="c4e32b0" +PKG_SHA256="16531a64827dd0f475c5184c7f89aa47d279736919a06e7cd55d8154f7bac798" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk index 4548dba7c8..d5070944bc 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.nextpvr" -PKG_VERSION="de30ff2" -PKG_SHA256="a468a22b7d9e709950cd24b9c9d6ce025d91e2e5509fc4a39f7ffd35e163ed3d" +PKG_VERSION="78a80de" +PKG_SHA256="25cd42764b2b8285f8f7d8855bef24a960d6ae8b18f2f9870c0c429af32116d8" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk index bfbd3690b0..ff9147cd8d 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.njoy" -PKG_VERSION="bd6581f" -PKG_SHA256="f99f4b31577b3c388183fc1c4aef3f4fde077e7df84e84b643ff5cdeb61fb221" +PKG_VERSION="5a2c2d3" +PKG_SHA256="14a02f78df7651dd8cb668c1c587e398ec8788125289ed66058e91ba111328f6" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk index 637223c496..16180a94a7 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk +++ 
b/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.pctv" -PKG_VERSION="5e95300" -PKG_SHA256="878aee780117d878e9658a0916f47cfba66f884718af41d5d22d2b6aeee73c3e" +PKG_VERSION="17c1897" +PKG_SHA256="9a1277275833ac0288ac34083daf8521472f2f550d21f8953078d2d4c73559db" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk index f4868145b3..6777f540fa 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.stalker" -PKG_VERSION="da6f0f8" -PKG_SHA256="d043b9b144496efb7a7bd2106716d139d701add883d6db25a0eb26847858baf4" +PKG_VERSION="0700069" +PKG_SHA256="a3322c8567400b7dbdc9a91bfa5e21375064a9483b4b676414e4164a577d307f" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk index 510da61715..9a8482db40 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.teleboy" -PKG_VERSION="a84e5ef" -PKG_SHA256="84ef0fcc6dda0f67df0dfbd7d9e781f8834e4c7319bafc919430c28a705d2e55" +PKG_VERSION="3e9e537" +PKG_SHA256="5c40d59c4403688d15d9b8a5b96112bd21e2558667a85adc13afeca6aac43fb3" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk index ebd3624a15..c5234f3d98 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.vbox" -PKG_VERSION="3cf15ad" -PKG_SHA256="f57a67a14a6b260ef35bc15bccbf5280a104b2a5a8fe96d2cf13003762daafa2" +PKG_VERSION="48ffcba" +PKG_SHA256="07e46dbc9df1253af0d277c924850ddaf12c02a3e1b8ff1559096b16e528d29a" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk index 5f5c6f6de6..f39c0d5511 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.vdr.vnsi" -PKG_VERSION="d6fe796" -PKG_SHA256="f56e9bfeab4596526ff1243f90ebd36c41c057cc78ed655072e5491aaa6c1a00" +PKG_VERSION="a2880c7" +PKG_SHA256="975ce55c888b46b9b47bf7a8bbe4db56a2169aeebfda11fa9ca51510d1db2148" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk index 75048da951..4e3fad543f 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk @@ -17,8 +17,8 @@ 
################################################################################ PKG_NAME="pvr.vuplus" -PKG_VERSION="00a963f" -PKG_SHA256="b286ed850ddce31b4fec1f55cf3639467c7ae39e548051b5485db035e20bf51e" +PKG_VERSION="6c94eec" +PKG_SHA256="fc645a611a78250299a83edca56dd03686d4ad67900be20fe00f46b2fb6d8e17" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk index cc3731b5e7..171c13c979 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.wmc" -PKG_VERSION="2acca13" -PKG_SHA256="6d19fbc313f089eff40af72f3f8b70358e357491bff8504a76aa029ef6f3fe21" +PKG_VERSION="a7ec576" +PKG_SHA256="ecc460e5e50c6e75a857dc7ec0e8de8142fb3bbb036e9253bca72ac20b5a2111" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk index a7095b565b..4bfca61773 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.zattoo" -PKG_VERSION="1244e14" -PKG_SHA256="20543c189b3d77bb8fc9f2306be9646235461db6c12e1f83623e82740279cba0" +PKG_VERSION="f04367b" +PKG_SHA256="5685ccafe979935123bce6cea2a7499f5bab8ff16f4b1d5b60c9ed3b943ac6b6" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk index d1713d74a9..34edf6eae7 100644 --- a/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk +++ b/packages/mediacenter/kodi-binary-addons/screensaver.shadertoy/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="screensaver.shadertoy" -PKG_VERSION="2638205" -PKG_SHA256="0c04af6aa45f1838ad785a2914a47ad4ce5c6b7998f73d848aa92b4480096b58" +PKG_VERSION="0290c8e" +PKG_SHA256="970eed3e63db75043fafe5a172bcd218bba3b5ae5f3b418206da00865ccb4647" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk b/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk index 246359bd96..15ed585ef8 100644 --- a/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk +++ b/packages/mediacenter/kodi-binary-addons/screensavers.rsxs/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="screensavers.rsxs" -PKG_VERSION="36b9f97" -PKG_SHA256="43fcaae28e00fd0a58fd12091560d25258cf5a228114e46799847031de65e063" +PKG_VERSION="be03db6" +PKG_SHA256="b0f35760a3f444769c2f0f948defc220b34459dde1bea06522708498eefe2e99" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/vfs.libarchive/package.mk b/packages/mediacenter/kodi-binary-addons/vfs.libarchive/package.mk index dcf210d389..66e21cd078 100644 --- a/packages/mediacenter/kodi-binary-addons/vfs.libarchive/package.mk +++ b/packages/mediacenter/kodi-binary-addons/vfs.libarchive/package.mk @@ -17,8 +17,8 @@ 
################################################################################ PKG_NAME="vfs.libarchive" -PKG_VERSION="e7d149e" -PKG_SHA256="dd9604752dcb4fbe38b082455935e95dc7b572a1424a49c935989292038f1b74" +PKG_VERSION="84a4876" +PKG_SHA256="38591095f93a380aac4be58c5e92bf870da095679a152a3ca4a1552ac4415968" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" From 7e44994f430bacc38d722b09a47e863487416adf Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH 03/13] libdvdread: update to libdvdread-bd6b329 --- packages/multimedia/libdvdread/package.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/multimedia/libdvdread/package.mk b/packages/multimedia/libdvdread/package.mk index da44ec3c25..ce16e86315 100644 --- a/packages/multimedia/libdvdread/package.mk +++ b/packages/multimedia/libdvdread/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="libdvdread" -PKG_VERSION="86f9500" -PKG_SHA256="a73888c81d14443d9f09fa02f5e5ecc08d9ab09639789fab810557069d335f34" +PKG_VERSION="bd6b329" +PKG_SHA256="2d9d6d185dd25a983d6dfc2a00207cafdc396a969c227d5edd84b6215b2fba89" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="https://github.com/xbmc/libdvdread" From 315d944d2448ab3e333b3208073bb3de52353f81 Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH 04/13] libdvdnav: update to libdvdnav-9277007 --- packages/multimedia/libdvdnav/package.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/multimedia/libdvdnav/package.mk b/packages/multimedia/libdvdnav/package.mk index 67150281c9..a108ebb33e 100644 --- a/packages/multimedia/libdvdnav/package.mk +++ b/packages/multimedia/libdvdnav/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="libdvdnav" -PKG_VERSION="6501b70" -PKG_SHA256="4004342489f21ce1d67ef98e95669dbd5c6d3dde1dcdf6ce97358155dc16f6f9" +PKG_VERSION="9277007" +PKG_SHA256="e50db40a823ddc795d1fe5f18db2517fb3e05fe0c4a88abf1578d95d7a1cce63" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="https://github.com/xbmc/libdvdnav" From 4bd1870490c587efa19be3e6d9e78485da3a0dc7 Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH 05/13] libdvdcss: update to libdvdcss-80fdc8d --- packages/multimedia/libdvdcss/package.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/multimedia/libdvdcss/package.mk b/packages/multimedia/libdvdcss/package.mk index 7ef8ece181..a6426cb348 100644 --- a/packages/multimedia/libdvdcss/package.mk +++ b/packages/multimedia/libdvdcss/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="libdvdcss" -PKG_VERSION="8f5dba3" -PKG_SHA256="669868984d1cac32e85ea7e2e669b22b960354e3078073b9176d857844fedc3c" +PKG_VERSION="80fdc8d" +PKG_SHA256="1bcdf96e4fe644ac3d70014ef144f3021b69c6f88bba06fc26ec32cd4b8dc82b" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="https://github.com/xbmc/libdvdcss" From d5efe8b39fc180a050002720ffc7edf98abcbffd Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH 06/13] LibreELEC-settings: update to LibreELEC-settings-a562ed0 --- packages/mediacenter/LibreELEC-settings/package.mk | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/packages/mediacenter/LibreELEC-settings/package.mk 
b/packages/mediacenter/LibreELEC-settings/package.mk index f8e24b3e12..3bcc9af1de 100644 --- a/packages/mediacenter/LibreELEC-settings/package.mk +++ b/packages/mediacenter/LibreELEC-settings/package.mk @@ -1,24 +1,25 @@ ################################################################################ -# This file is part of OpenELEC - http://www.openelec.tv +# This file is part of LibreELEC - https://libreelec.tv +# Copyright (C) 2017-present Team LibreELEC # Copyright (C) 2009-2016 Stephan Raue (stephan@openelec.tv) # -# OpenELEC is free software: you can redistribute it and/or modify +# LibreELEC is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 2 of the License, or # (at your option) any later version. # -# OpenELEC is distributed in the hope that it will be useful, +# LibreELEC is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with OpenELEC. If not, see . +# along with LibreELEC. If not, see . ################################################################################ PKG_NAME="LibreELEC-settings" -PKG_VERSION="0ec74f6" -PKG_SHA256="f9e5a1ead9c1a3832122deb4831980dac87ec3b8f748e6449b6b090c40f09249" +PKG_VERSION="a562ed0" +PKG_SHA256="98f2d5aa3ef3d422a359fc0a10e2c50efc14d3eaf351312b3aceea449a0ff151" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="https://libreelec.tv" From 547b3a01e827b4f0d99dcca9e03e4a17b002dcce Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH 07/13] ffmpeg: hevc: Latest updates from Ben; update copyright --- packages/multimedia/ffmpeg/package.mk | 9 +- ...mpeg-99.1003-pfcd_hevc_optimisations.patch | 7516 +++++++++++++++-- 2 files changed, 6659 insertions(+), 866 deletions(-) diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index 057d1692fe..4d9a4a6157 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -1,19 +1,20 @@ ################################################################################ -# This file is part of OpenELEC - http://www.openelec.tv +# This file is part of LibreELEC - https://libreelec.tv +# Copyright (C) 2017-present Team LibreELEC # Copyright (C) 2009-2016 Stephan Raue (stephan@openelec.tv) # -# OpenELEC is free software: you can redistribute it and/or modify +# LibreELEC is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 2 of the License, or # (at your option) any later version. # -# OpenELEC is distributed in the hope that it will be useful, +# LibreELEC is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with OpenELEC. If not, see . +# along with LibreELEC. If not, see . 
################################################################################ PKG_NAME="ffmpeg" diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index ef2f4d7d62..91ea9da3dd 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -582,18 +582,19 @@ index 4d4ef530e4..fba8776c9f 100644 { const AVCodec *p, *experimental = NULL; diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index e656011c3c..69cd820f06 100644 +index e656011c3c..70c3f026b8 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile -@@ -40,6 +40,7 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ +@@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ arm/sbrdsp_init_arm.o OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o -+OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o ++OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \ ++ arm/rpi_hevcpred_init_arm.o OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o -@@ -136,10 +137,18 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ +@@ -136,10 +138,23 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ @@ -608,7 +609,12 @@ index e656011c3c..69cd820f06 100644 + arm/rpi_hevcdsp_idct_neon.o \ + arm/rpi_hevcdsp_res8_neon.o \ + arm/rpi_hevcdsp_res16_neon.o \ -+ arm/rpi_hevcdsp_sao_neon.o ++ arm/rpi_hevcdsp_sao_neon.o \ ++ arm/rpi_hevcpred_init_neon.o \ ++ arm/rpi_hevcpred_intra_angular_neon.o \ ++ arm/rpi_hevcpred_intra_dc_neon.o \ ++ arm/rpi_hevcpred_intra_hv_neon.o \ ++ arm/rpi_hevcpred_intra_planar_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o @@ -1738,10 +1744,10 @@ index 0000000000..62b9326532 +#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */ diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S new file mode 100644 -index 0000000000..e665bd848a +index 0000000000..f75c82671e --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S -@@ -0,0 +1,1249 @@ +@@ -0,0 +1,1593 @@ +/* + * Copyright (c) 2014 Seppo Tomperi + * @@ -1766,65 +1772,72 @@ index 0000000000..e665bd848a +#include "libavutil/arm/asm.S" +#include "neon.S" + -+.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a ++.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8 + vsubl.u8 q0, \Q0a, \P0a -+ vsubl.u8 q2, \P1a, \Q1a -+ vshl.i16 q0, #2 -+ vadd.i16 q0, q2 ++ vsubl.u8 q1, \P1a, \Q1a + vdup.16 d4, r2 -+ -+ vrshr.s16 q0, #3 ++ \I1 ++ vshl.i16 q0, #2 ++ \I2 ++ vadd.i16 q0, q1 ++ \I3 + vmovl.u8 q2, d4 -+ ++ \I4 ++ vneg.s16 q1, q2 ++ \I5 ++ vrshr.s16 q0, #3 ++ \I6 ++ \I7 ++ \I8 + vmin.s16 q0, q2 -+ vneg.s16 q2, q2 -+ vmax.s16 q0, q2 -+ vaddw.u8 q2, q0, \P0a -+ -+ vqmovun.s16 \P0a, q2 + vmovl.u8 q2, \Q0a -+ vsub.i16 q2, q0 -+ -+ vqmovun.s16 \Q0a, q2 ++ vmax.s16 q0, q1 ++ vaddw.u8 q1, q0, \P0a ++ vsub.i16 q0, q2, q0 ++ vqmovun.s16 \P0a, q1 ++ vqmovun.s16 \Q0a, q0 +.endm + + -+.macro 
hevc_loop_filter_uv_body2 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v -+ vsubl.u8 q0, \Q0u, \P0u -+ vsubl.u8 q1, \Q0v, \P0v -+ vsubl.u8 q2, \P1u, \Q1u -+ vsubl.u8 q3, \P1v, \Q1v -+ vshl.i16 q0, #2 -+ vshl.i16 q1, #2 -+ vadd.i16 q0, q2 -+ vdup.16 d4, r2 -+ lsr r2, #16 -+ vadd.i16 q1, q3 -+ -+ vrshr.s16 q0, #3 -+ vdup.16 d6, r2 -+ vmovl.u8 q2, d4 -+ vmovl.u8 q3, d6 -+ vrshr.s16 q1, #3 -+ ++.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7 ++ vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a ++ lsr r12, r2, #16 ++ vsubl.u8 q1, \Q0b, \P0b @ q0b - p0b ++ vsubl.u8 q2, \P1a, \Q1a @ p1a - q1a ++ vsubl.u8 q3, \P1b, \Q1b @ p1b - q1b ++ vshl.i16 q0, #2 @ (q0a - p0a) * 4 ++ vshl.i16 q1, #2 @ (q0b - p0b) * 4 ++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a ++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b ++ vdup.16 d4, r2 @ tc0a, tc0b ++ vdup.16 d6, r12 @ tc1a, tc1b ++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 ++ \I1 ++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 ++ \I2 ++ vmovl.u8 q2, d4 @ tc0a, tc0b ++ \I3 ++ vmovl.u8 q3, d6 @ tc1a, tc1b ++ \I4 + vmin.s16 q0, q2 -+ vneg.s16 q2, q2 ++ \I5 ++ vneg.s16 q2, q2 @ -tc0a, -tc0b ++ \I6 + vmin.s16 q1, q3 -+ vneg.s16 q3, q3 -+ vmax.s16 q0, q2 -+ vaddw.u8 q2, q0, \P0u -+ vmax.s16 q1, q3 -+ vaddw.u8 q3, q1, \P0v -+ -+ vqmovun.s16 \P0u, q2 -+ vmovl.u8 q2, \Q0u -+ vqmovun.s16 \P0v, q3 -+ vmovl.u8 q3, \Q0v -+ vsub.i16 q2, q0 -+ vsub.i16 q3, q1 -+ -+ vqmovun.s16 \Q0u, q2 -+ vqmovun.s16 \Q0v, q3 ++ \I7 ++ vneg.s16 q3, q3 @ -tc1a, -tc1b ++ vmax.s16 q0, q2 @ delta0a ++ vmovl.u8 q2, \Q0a ++ vmax.s16 q1, q3 @ delta0b ++ vaddw.u8 q3, q0, \P0a @ p0a + delta0a ++ vsub.i16 q0, q2, q0 @ q0a - delta0a ++ vmovl.u8 q2, \Q0b ++ vsub.i16 q2, q1 @ q0b - delta0b ++ vaddw.u8 q1, \P0b @ p0b + delta0b ++ vqmovun.s16 \Q0a, q0 ++ vqmovun.s16 \P0a, q3 ++ vqmovun.s16 \Q0b, q2 ++ vqmovun.s16 \P0b, q1 +.endm + + @@ -1835,33 +1848,36 @@ index 0000000000..e665bd848a +@ [0..7] tc U a +@ [8..15] tc V a + -+.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth ++.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8 + vsub.i16 q0, \Q0a, \P0a -+ vsub.i16 q2, \P1a, \Q1a -+ vshl.i16 q0, #2 -+ vadd.i16 q0, q2 -+ vrshr.s16 q0, #3 -+ ++ vsub.i16 q1, \P1a, \Q1a + vdup.16 d4, r2 ++ \I1 ++ vshl.i16 q0, #2 ++ \I2 ++ vadd.i16 q0, q1 ++ \I3 + vshll.u8 q2, d4, #\bit_depth - 8 -+ -+ movw r2, #(1 << \bit_depth) - 1 ++ \I4 ++ vneg.s16 q1, q2 ++ \I5 ++ vrshr.s16 q0, #3 ++ \I6 ++ \I7 ++ \I8 + vmin.s16 q0, q2 -+ vneg.s16 q2, q2 -+ vmax.s16 q0, q2 -+ vmov.i64 q2, #0 -+ vdup.i16 q3, r2 ++ vmov.i16 q2, #0 ++ vmax.s16 q0, q1 + vadd.i16 \P0a, q0 + vsub.i16 \Q0a, q0 -+ ++ vmov.i16 q1, #(1 << \bit_depth) - 1 + vmax.s16 \P0a, q2 + vmax.s16 \Q0a, q2 -+ vmin.s16 \P0a, q3 -+ vmin.s16 \Q0a, q3 ++ vmin.s16 \P0a, q1 ++ vmin.s16 \Q0a, q1 +.endm + -+@ Preserves r12 -+@ Clobbers r2 ++@ Clobbers r2, r12 +@ P0a et al all contain UVUVUVUV +@ r2 (tc4) contains +@ [0..7] tc U a @@ -1869,38 +1885,41 @@ index 0000000000..e665bd848a +@ [16..23] tc U b +@ [24..31] tc V b + -+.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth -+ vsub.i16 q0, \Q0a, \P0a -+ vsub.i16 q1, \Q0b, \P0b -+ vsub.i16 q2, \P1a, \Q1a -+ vsub.i16 q3, \P1b, \Q1b -+ vshl.i16 q0, #2 -+ vshl.i16 q1, #2 -+ vadd.i16 q0, q2 -+ vrshr.s16 q0, #3 -+ vadd.i16 q1, q3 -+ vrshr.s16 q1, #3 -+ -+ vdup.16 d4, r2 -+ lsr r2, #16 -+ vdup.16 d6, r2 -+ vshll.u8 q2, d4, #\bit_depth - 8 -+ vshll.u8 q3, d6, #\bit_depth - 8 -+ -+ movw r2, 
#(1 << \bit_depth) - 1 ++.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7 ++ vsub.i16 q0, \Q0a, \P0a @ q0a - p0a ++ lsr r12, r2, #16 ++ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b ++ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a ++ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b ++ vshl.i16 q0, #2 @ (q0a - p0a) * 4 ++ vshl.i16 q1, #2 @ (q0b - p0b) * 4 ++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a ++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b ++ vdup.16 d4, r2 @ tc0a, tc0b ++ vdup.16 d6, r12 @ tc1a, tc1b ++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 ++ \I1 ++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 ++ \I2 ++ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b ++ \I3 ++ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b ++ \I4 + vmin.s16 q0, q2 -+ vneg.s16 q2, q2 ++ \I5 ++ vneg.s16 q2, q2 @ -tc0a, -tc0b ++ \I6 + vmin.s16 q1, q3 -+ vneg.s16 q3, q3 -+ vmax.s16 q0, q2 -+ vmov.i64 q2, #0 -+ vmax.s16 q1, q3 -+ vdup.i16 q3, r2 -+ vadd.i16 \P0a, q0 -+ vsub.i16 \Q0a, q0 -+ vadd.i16 \P0b, q1 -+ vsub.i16 \Q0b, q1 -+ ++ \I7 ++ vneg.s16 q3, q3 @ -tc1a, -tc1b ++ vmax.s16 q0, q2 @ delta0a ++ vadd.i16 \P0a, q0 @ p0a + delta0a ++ vsub.i16 \Q0a, q0 @ q0a - delta0a ++ vmax.s16 q1, q3 @ delta0b ++ vadd.i16 \P0b, q1 @ p0b + delta0b ++ vsub.i16 \Q0b, q1 @ q0b - delta0b ++ vmov.i16 q2, #0 ++ vmov.i16 q3, #(1 << \bit_depth) - 1 + vmax.s16 \P0a, q2 + vmax.s16 \Q0a, q2 + vmax.s16 \P0b, q2 @@ -1923,11 +1942,10 @@ index 0000000000..e665bd848a + it eq + bxeq lr + push {r4-r10,lr} @ 32 bytes -+ ldr r5, [sp, #32] @ &_no_p -+ ldrb r10, [r5] -+ ldr r5, [sp, #36] @ &_no_q ++ ldrd r4, r5, [sp, #32] @ &_no_p ++ ldrb r4, [r4] + ldrb r5, [r5] -+ cmp r10, #0 ++ movs r10, r4 + it ne + movne r10, #1 + cmp r5, #0 @@ -1950,244 +1968,207 @@ index 0000000000..e665bd848a +@ Junks: +@ r5, r6, r7, r8, r9 + -+.macro m_filter_luma bit_depth ++.macro m_filter_luma bit_depth, Q11, Q15 +.if \bit_depth == 8 -+ vmovl.u8 q15, d23 -+ vmovl.u8 q14, d22 -+ vmovl.u8 q13, d21 -+ vmovl.u8 q12, d20 -+ vmovl.u8 q11, d19 -+ vmovl.u8 q10, d18 -+ vmovl.u8 q9, d17 -+ vmovl.u8 q8, d16 ++ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2 ++ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1 ++ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0 ++ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0 ++ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1 ++ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2 +.endif -+ vadd.i16 q7, q9, q11 ++ vadd.i16 q0, q9, \Q11 @ P2 + P0 +.if \bit_depth > 8 -+ lsl r2, r2, #(\bit_depth - 8) ++ lsl r3, r3, #(\bit_depth - 8) +.endif -+ vadd.i16 q6, q14, q12 ++ vadd.i16 q1, q14, q12 @ Q2 + Q0 +.if \bit_depth > 8 -+ lsl r3, r3, #(\bit_depth - 8) ++ lsl r2, r2, #(\bit_depth - 8) +.endif -+ vsub.i16 q7, q10 -+ vsub.i16 q6, q13 -+ vabd.s16 q7, q7, q10 -+ vabd.s16 q6, q6, q13 ++ vsub.i16 q0, q10 @ P2 - P1 + P0 ++ lsr r5, r3, #16 ++ vsub.i16 q1, q13 @ Q2 - Q1 + Q0 ++.if \bit_depth == 8 ++ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3 ++ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... 
Q3 ++.endif ++ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0) ++ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0) ++ vmov.i64 q2, #0xffffffff0000 ++ vbic q0, q2 @ only dp0(') and dp3(') ++ vbic q1, q2 @ only dq0(') and dq3(') ++ vsra.u64 q0, #16 ++ vsra.u64 q1, #16 ++ vdup.16 q3, r2 @ beta ++ vdup.16 d14, r3 @ tC[0] ++ vdup.16 d15, r5 @ tC[1] ++ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0) ++ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0 ++ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0 ++ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0 ++ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0) ++ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0) ++ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3 ++ vshl.s16 q6, q7, #2 @ tC[] * 4 ++ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1 ++ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta) ++ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block) ++ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3 ++ cmp r7, #0 ++ beq .Lbypasswrite + -+ vdup.16 q0, r2 -+ vmov q4, q7 -+ vmov q5, q6 -+ vdup.16 d4, r3 -+ lsr r3, r3, #16 -+ vtrn.16 q7, q4 -+ vtrn.16 q6, q5 ++ vcgt.s16 q5, q6, q5 @ if < tc25 ++ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3) ++ vand q4, q5 ++ vbic d8, d4 ++ vbic d9, d4 ++ vshr.s16 q3, #2 @ beta_2 = beta >> 2 ++ vsra.u64 q4, #16 ++ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1 ++ vshl.i16 q7, #1 @ tc2 = tC[] << 1 ++ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc ++ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half ++ vand d6, d8 @ && beta_2 tests, prime in ms half ++ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3 ++ vneg.s16 q6, q7 @ -tc2 ++ vmovn.i32 d8, q3 ++ vshrn.i32 d6, q3, #16 ++ vand d6, d8 ++ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3 ++ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block) ++ vadd.i16 q0, \Q11, q12 @ p0 + q0 ++ ands r9, r7, r8 ++ beq 1f + -+ vshl.u64 q7, #32 -+ vshr.u64 q4, #32 -+ vshl.u64 q6, #32 -+ vshr.u64 q5, #32 -+ vshr.u64 q7, #32 -+ vshr.u64 q6, #32 -+ vshl.u64 q5, #32 -+ vshl.u64 q4, #32 -+ vorr q6, q5 -+ vorr q7, q4 -+ vdup.16 d5, r3 -+ vadd.i16 q5, q7, q6 -+ -+ vmov q4, q5 -+ vmov q3, q5 -+ vtrn.32 q3, q4 -+ -+ vadd.i16 q4, q3 -+ -+ vshl.s16 q5, q5, #1 -+ vcgt.s16 q3, q0, q4 -+ -+ vmovn.i16 d6, q3 -+ vshr.s16 q1, q0, #2 -+ vmovn.i16 d6, q3 -+ vcgt.s16 q5, q1, q5 -+ vmov r7, s12 -+ cmp r7, #0 -+ beq .Lbypasswrite -+ -+ vpadd.i32 d0, d14, d12 -+ vpadd.i32 d1, d15, d13 -+ vmov q4, q2 -+ vshl.s16 q2, #2 -+ vshr.s16 q1, q1, #1 -+ vrhadd.s16 q2, q4 -+ -+ vabd.s16 q7, q8, q11 -+ vaba.s16 q7, q15, q12 -+ -+ vmovn.i32 d0, q0 -+ vmov r5, r6, s0, s1 -+ vcgt.s16 q6, q1, q7 -+ vand q5, q5, q6 -+ vabd.s16 q7, q11, q12 -+ vcgt.s16 q6, q2, q7 -+ vand q5, q5, q6 -+ -+ vmov q2, q5 -+ vtrn.s16 q5, q2 -+ vshr.u64 q2, #32 -+ vshl.u64 q5, #32 -+ vshl.u64 q2, #32 -+ vshr.u64 q5, #32 -+ vorr q5, q2 -+ -+ vmov q2, q5 -+ vshl.i16 q7, q4, #1 -+ vtrn.32 q2, q5 -+ vand q5, q2 -+ vneg.s16 q6, q7 -+ vmovn.i16 d4, q5 -+ vmovn.i16 d4, q2 -+ vmov r8, s8 -+ -+ and r9, r8, r7 -+ cmp r9, #0 -+ beq 1f -+ -+ vadd.i16 q2, q11, q12 -+ vadd.i16 q4, q9, q8 -+ vadd.i16 q1, q2, q10 -+ vdup.16 d10, r9 -+ vadd.i16 q0, q1, q9 -+ vshl.i16 q4, #1 -+ lsr r9, #16 -+ vadd.i16 q1, q0 -+ vrshr.s16 q3, q0, #2 -+ vadd.i16 q1, q13 -+ vadd.i16 q4, q0 -+ vsub.i16 q3, q10 -+ vrshr.s16 q1, #3 -+ vrshr.s16 q4, #3 -+ vmax.s16 q3, q6 -+ vsub.i16 q1, q11 -+ vsub.i16 q4, q9 -+ vmin.s16 q3, q7 -+ 
vmax.s16 q4, q6 -+ vmax.s16 q1, q6 -+ vadd.i16 q3, q10 -+ vmin.s16 q4, q7 -+ vmin.s16 q1, q7 -+ vdup.16 d11, r9 -+ vadd.i16 q4, q9 -+ vadd.i16 q1, q11 -+ vbit q9, q4, q5 -+ vadd.i16 q4, q2, q13 -+ vbit q11, q1, q5 -+ vadd.i16 q0, q4, q14 -+ vadd.i16 q2, q15, q14 -+ vadd.i16 q4, q0 -+ -+ vshl.i16 q2, #1 -+ vadd.i16 q4, q10 -+ vbit q10, q3, q5 -+ vrshr.s16 q4, #3 -+ vadd.i16 q2, q0 -+ vrshr.s16 q3, q0, #2 -+ vsub.i16 q4, q12 -+ vrshr.s16 q2, #3 -+ vsub.i16 q3, q13 -+ vmax.s16 q4, q6 -+ vsub.i16 q2, q14 -+ vmax.s16 q3, q6 -+ vmin.s16 q4, q7 -+ vmax.s16 q2, q6 -+ vmin.s16 q3, q7 -+ vadd.i16 q4, q12 -+ vmin.s16 q2, q7 -+ vadd.i16 q3, q13 -+ vbit q12, q4, q5 -+ vadd.i16 q2, q14 -+ vbit q13, q3, q5 -+ vbit q14, q2, q5 ++ vadd.i16 q2, q0, q10 @ p1 + p0 + q0 ++ vadd.i16 q3, q0, q13 @ p0 + q0 + q1 ++ lsr r3, r9, #16 ++ vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping) ++ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping) ++ vadd.i16 q0, q8, q9 @ p3 + p2 ++ vadd.i16 q5, \Q15, q14 @ q2 + q3 ++ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 ++ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2 ++ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2 ++ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3 ++ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping) ++ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping) ++ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping) ++ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping) ++ vrshr.s16 q0, #3 @ scale, with rounding ++ vrshr.s16 q5, #3 ++ vrshr.s16 q1, #2 ++ vrshr.s16 q4, #2 ++ vrshr.s16 q2, #3 ++ vrshr.s16 q3, #3 ++ vsub.i16 q0, q9 @ find difference ++ vsub.i16 q5, q14 ++ vsub.i16 q1, q10 ++ vsub.i16 q4, q13 ++ vsub.i16 q2, \Q11 ++ vsub.i16 q3, q12 ++ vmax.s16 q0, q6 @ clip difference to -tc2 .. 
tc2 ++ vmax.s16 q5, q6 ++ vmax.s16 q1, q6 ++ vmax.s16 q4, q6 ++ vmax.s16 q2, q6 ++ vmax.s16 q3, q6 ++ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure ++ vdup.16 d13, r3 ++ vmin.s16 q0, q7 ++ vmin.s16 q5, q7 ++ vmin.s16 q1, q7 ++ vmin.s16 q4, q7 ++ vmin.s16 q2, q7 ++ vmin.s16 q3, q7 ++ vadd.i16 q0, q9 @ apply difference ++ vadd.i16 q5, q14 ++ vadd.i16 q1, q10 ++ vadd.i16 q4, q13 ++ vadd.i16 q2, \Q11 ++ vadd.i16 q3, q12 ++ vbit q9, q0, q6 @ apply filtered values according to mask ++ vbit q14, q5, q6 ++ vbit q10, q1, q6 ++ vbit q13, q4, q6 ++ vbit \Q11, q2, q6 ++ vbit q12, q3, q6 ++ vneg.s16 q6, q7 @ restore -tc2 + +1: -+ mvn r8, r8 -+ and r9, r8, r7 -+ cmp r9, #0 -+ beq 2f ++ bics r9, r7, r8 ++ beq 2f + -+ vdup.16 q4, r2 -+ -+ vdup.16 d10, r9 -+ lsr r9, #16 -+ vmov q1, q4 -+ vdup.16 d11, r9 -+ vshr.s16 q1, #1 -+ vsub.i16 q2, q12, q11 -+ vadd.i16 q4, q1 -+ vshl.s16 q0, q2, #3 -+ vshr.s16 q4, #3 -+ vadd.i16 q2, q0 -+ vsub.i16 q0, q13, q10 -+ vsub.i16 q2, q0 -+ vshl.i16 q0, q0, #1 -+ vsub.i16 q2, q0 -+ vshl.s16 q1, q7, 2 -+ vrshr.s16 q2, q2, #4 -+ vadd.i16 q1, q7 -+ vabs.s16 q3, q2 -+ vshr.s16 q6, q6, #1 -+ vcgt.s16 q1, q1, q3 -+ vand q5, q1 -+ vshr.s16 q7, q7, #1 -+ vmax.s16 q2, q2, q6 -+ vmin.s16 q2, q2, q7 -+ -+ vshr.s16 q7, q7, #1 -+ vrhadd.s16 q3, q9, q11 -+ vneg.s16 q6, q7 -+ vsub.s16 q3, q10 -+ vdup.16 d2, r5 -+ vhadd.s16 q3, q2 -+ vdup.16 d3, r6 -+ vmax.s16 q3, q3, q6 -+ vcgt.s16 q1, q4, q1 -+ vmin.s16 q3, q3, q7 -+ vand q1, q5 -+ vadd.i16 q3, q10 -+ lsr r5, #16 -+ lsr r6, #16 -+ vbit q10, q3, q1 -+ -+ vrhadd.s16 q3, q14, q12 -+ vdup.16 d2, r5 -+ vsub.s16 q3, q13 -+ vdup.16 d3, r6 -+ vhsub.s16 q3, q2 -+ vcgt.s16 q1, q4, q1 -+ vmax.s16 q3, q3, q6 -+ vand q1, q5 -+ vmin.s16 q3, q3, q7 -+ vadd.i16 q3, q13 -+ vbit q13, q3, q1 -+ vadd.i16 q0, q11, q2 -+ vsub.i16 q4, q12, q2 -+ vbit q11, q0, q5 -+ vbit q12, q4, q5 ++ vsub.i16 q0, q12, \Q11 @ q0 - p0 ++ vsub.i16 q1, q13, q10 @ q1 - p1 ++ lsr r3, r9, #16 ++ vshl.i16 q2, q0, #3 ++ lsr r7, r5, #16 ++ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0) ++ lsr r8, r6, #16 ++ vshl.i16 q2, q1, #1 ++ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1) ++ vshr.s16 q6, #1 @ -tc = -tc2 >> 1 ++ vsub.i16 q5, q3, q4 ++ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1 ++ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1 ++ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4 ++ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1 ++ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1 ++ vmax.s16 q6, q5 @ ++ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1 ++ vdup.16 q0, r2 @ beta ++ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc] ++ vshr.s16 q4, #1 @ tc_2 = tc >> 1 ++ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1 ++ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1 ++ vshr.s16 q2, q0, #1 @ beta >> 1 ++ vadd.i16 q2, q0 @ beta + (beta >> 1) ++ vneg.s16 q0, q4 @ -tc_2 ++ vabs.s16 q5, q5 @ abs(original delta0) ++ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3 ++ vmax.s16 q1, q0 ++ vmax.s16 q3, q0 ++ vshl.s16 q0, q7, #2 @ 8 * tc ++ vadd.i16 q7, q0 @ 10 * tc ++ vdup.16 d0, r9 ++ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering ++ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2) ++ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 + delta0) >> 1, -tc_2, tc_2) ++ vdup.16 d8, r5 @ dp0 + dp3 ++ vdup.16 d9, r7 @ dp0' + dp3' ++ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0)) ++ vdup.16 d10, r6 @ dq0 + dq3 ++ vdup.16 d11, r8 @ dq0' + dq3' ++ vand q7, q0 @ AND block and line masks ++ vcgt.s16 q4, q2, q4 @ if (((beta + (beta 
>> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1) ++ vadd.i16 q0, q1, q10 @ p1 + deltap1 ++ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1) ++ vadd.i16 q3, q3, q13 @ q1 + deltaq1 ++ vadd.i16 q1, \Q11, q6 @ p0 + delta0 ++ vsub.i16 q2, q12, q6 @ q0 - delta0 ++ vand q4, q7 @ AND nd_p test with block/line masks ++ vand q5, q7 @ AND nd_q test with block/line masks ++ vbit q10, q0, q4 ++ vbit \Q11, q1, q7 ++ vbit q12, q2, q7 ++ vbit q13, q3, q5 + +2: +.if \bit_depth == 8 ++ vmovn.i16 d16, q8 ++ vmovn.i16 d23, \Q15 + neg r1, r1 -+ vqmovun.s16 d16, q8 + vqmovun.s16 d17, q9 + vqmovun.s16 d18, q10 -+ vqmovun.s16 d19, q11 ++ vqmovun.s16 d19, \Q11 + lsls r10, #31 + vqmovun.s16 d20, q12 + vqmovun.s16 d21, q13 + vqmovun.s16 d22, q14 -+ vqmovun.s16 d23, q15 +.else -+ movw r5, #(1 << \bit_depth - 1) -+ vmov.i64 q0, #0 -+ vdup.i16 q1, r5 ++ vmov.i16 q0, #0 ++ vmov.i16 q1, #(1 << \bit_depth - 1) + @ q8 & q15 should be unaltered and so don't require clipping + neg r1, r1 + vmax.s16 q9, q0 @@ -2204,14 +2185,14 @@ index 0000000000..e665bd848a + vmin.s16 q13, q1 + vmin.s16 q14, q1 +.endif -+ mov pc, lr ++ bx lr +.endm + +function hevc_loop_filter_luma_body -+ m_filter_luma 8 ++ m_filter_luma 8, q15, q11 +endfunc + -+@ void ff_hevc_rpi_v_loop_filter_luma_neon( ++@ void ff_hevc_rpi_v_loop_filter_luma_neon_8( +@ uint8_t *_pix, [r0] +@ ptrdiff_t _stride, [r1] +@ int _beta, [r2] @@ -2219,7 +2200,7 @@ index 0000000000..e665bd848a +@ uint8_t *_no_p, [sp+0] +@ uint8_t *_no_q) [sp+4] + -+function ff_hevc_rpi_v_loop_filter_luma_neon, export=1 ++function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1 + hevc_loop_filter_luma_start + + sub r4, r0, #4 @@ -2245,66 +2226,72 @@ index 0000000000..e665bd848a +.Lv_loop_luma_common: + vpush {d8-d15} + -+ @ Uses slightly fewer instructions to do laned loads than unlaned -+ @ and transpose. This also means that we can use the same code for -+ @ both split & unsplit deblock -+ vld4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32], r1 -+ vld4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32], r1 -+ -+ vld4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 -+ vld4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 -+ -+ vld4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 -+ vld4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 -+ -+ vld4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 -+ vld4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 -+ -+ vld4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 -+ vld4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 -+ -+ vld4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 -+ vld4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 -+ -+ vld4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 -+ vld4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 -+ -+ vld4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32] -+ vld4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32] ++ @ It's slightly faster to do unlaned loads and transpose in the ++ @ 8-bit case, even though it needs more instructions, because ++ @ VLD4.8 is a really slow way to read from memory. 
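++        @ Each VLD1.32 below fetches one 4-pixel half-row (p3..p0 via r4,
++        @ q0..q3 via r0); the VUZP.16/VUZP.8/VSWP sequence then transposes the
++        @ 8x8 block so that d16-d23 each hold one column (p3 p2 p1 p0 q0 q1 q2 q3)
++        @ across all 8 rows before calling hevc_loop_filter_luma_body.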
++ vld1.32 {d16[0]}, [r4:32], r1 ++ vld1.32 {d20[0]}, [r0:32], r1 ++ vld1.32 {d16[1]}, [r4:32], r1 ++ vld1.32 {d20[1]}, [r0:32], r1 ++ vld1.32 {d17[0]}, [r4:32], r1 ++ vld1.32 {d21[0]}, [r0:32], r1 ++ vld1.32 {d17[1]}, [r4:32], r1 ++ vld1.32 {d21[1]}, [r0:32], r1 ++ vld1.32 {d18[0]}, [r4:32], r1 ++ vld1.32 {d22[0]}, [r0:32], r1 ++ vld1.32 {d18[1]}, [r4:32], r1 ++ vld1.32 {d22[1]}, [r0:32], r1 ++ vld1.32 {d19[0]}, [r4:32], r1 ++ vld1.32 {d23[0]}, [r0:32], r1 ++ vld1.32 {d19[1]}, [r4:32] ++ vld1.32 {d23[1]}, [r0:32] ++ vuzp.16 q8, q9 ++ vuzp.16 q10, q11 ++ vuzp.8 q8, q9 ++ vuzp.8 q10, q11 ++ vswp d17, d18 ++ vswp d21, d22 + + bl hevc_loop_filter_luma_body + ++ add r6, r4, r1 ++ add r2, r0, r1 ++ lsl r1, #1 ++ ++ vpop {d8-d15} ++ + @ no_p[1] + bmi 1f + vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 -+ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 ++ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1 + vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 -+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1 + + vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 -+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1 + vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 -+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32] ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32] +1: + @ no_q[1] -+@ tst r10, #2 + bcs 1f + vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 -+ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 ++ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1 + vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 -+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1 + + vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 -+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 + vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 -+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32] ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] +1: ++ pop {r4-r10,pc} ++ +.Lbypasswrite: + vpop {d8-d15} + pop {r4-r10,pc} +endfunc + -+.macro m_filter_v_luma_common_16 bit_depth ++.macro m_filter_v_luma_16 bit_depth + vpush {d8-d15} + + @ Uses slightly fewer instructions to do laned loads than unlaned @@ -2336,29 +2323,34 @@ index 0000000000..e665bd848a + + bl hevc_loop_filter_luma_body_\bit_depth + ++ add r6, r4, r1 ++ add r2, r0, r1 ++ lsl r1, #1 ++ ++ vpop {d8-d15} ++ + @ p[1] + bmi 1f + vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 -+ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1 + vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 -+ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 ++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1 + vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 -+ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 ++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1 + vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 -+ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4] ++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6] +1: + @ q[1] + bcs 1f + vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 -+ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1 + vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 -+ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1 + vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 -+ vst4.16 {d24[2], 
d26[2], d28[2], d30[2]}, [r0], r1 ++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 + vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 -+ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0] ++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] +1: -+ vpop {d8-d15} + pop {r4-r10,pc} +.endm + @@ -2374,7 +2366,7 @@ index 0000000000..e665bd848a +@ +@ Src should always be on 8 byte boundry & all in the same slice + -+function ff_hevc_rpi_h_loop_filter_luma_neon, export=1 ++function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1 + hevc_loop_filter_luma_start + b .Lh_loop_filter_luma_common_8 +endfunc @@ -2387,71 +2379,75 @@ index 0000000000..e665bd848a + ldr r10, [sp, #32] + +.Lh_loop_filter_luma_common_8: ++ sub r4, r0, r1, lsl #2 ++ add r0, r4, r1 ++ lsl r1, #1 + vpush {d8-d15} -+ sub r0, r0, r1, lsl #2 + -+ vld1.8 {d16}, [r0], r1 ++ vld1.8 {d16}, [r4], r1 + vld1.8 {d17}, [r0], r1 -+ vld1.8 {d18}, [r0], r1 ++ vld1.8 {d18}, [r4], r1 + vld1.8 {d19}, [r0], r1 -+ vld1.8 {d20}, [r0], r1 ++ vld1.8 {d20}, [r4], r1 + vld1.8 {d21}, [r0], r1 -+ vld1.8 {d22}, [r0], r1 ++ vld1.8 {d22}, [r4] + vld1.8 {d23}, [r0] + + bl hevc_loop_filter_luma_body + -+ add r2, r0, r1, lsl #2 -+ add r0, r0, r1 -+ ++ add r0, r0, r1, lsl #1 ++ add r2, r4, r1, lsl #1 ++ add r6, r4, r1, asr #1 + vpop {d8-d15} + + @ P2-P0 + bcs 1f -+ vst1.8 {d22}, [r0], r1 -+ vst1.8 {d21}, [r0], r1 -+ vst1.8 {d20}, [r0] ++ vst1.8 {d22}, [r4], r1 ++ vst1.8 {d21}, [r6] ++ vst1.8 {d20}, [r4] +1: + @ Q0-Q2 + bmi 1f -+ vst1.8 {d19}, [r2], r1 -+ vst1.8 {d18}, [r2], r1 -+ vst1.8 {d17}, [r2] ++ vst1.8 {d19}, [r0], r1 ++ vst1.8 {d18}, [r2] ++ vst1.8 {d17}, [r0] +1: + pop {r4-r10,pc} +endfunc + + +.macro m_filter_h_luma_16 bit_depth ++ sub r4, r0, r1, lsl #2 ++ add r0, r4, r1 ++ lsl r1, #1 + vpush {d8-d15} -+ sub r0, r0, r1, lsl #2 + -+ vld1.16 { q8}, [r0], r1 ++ vld1.16 { q8}, [r4], r1 + vld1.16 { q9}, [r0], r1 -+ vld1.16 {q10}, [r0], r1 ++ vld1.16 {q10}, [r4], r1 + vld1.16 {q11}, [r0], r1 -+ vld1.16 {q12}, [r0], r1 ++ vld1.16 {q12}, [r4], r1 + vld1.16 {q13}, [r0], r1 -+ vld1.16 {q14}, [r0], r1 ++ vld1.16 {q14}, [r4] + vld1.16 {q15}, [r0] + + bl hevc_loop_filter_luma_body_\bit_depth + -+ add r2, r0, r1, lsl #2 -+ add r0, r1 -+ ++ add r0, r0, r1, lsl #1 ++ add r2, r4, r1, lsl #1 ++ add r6, r4, r1, asr #1 + vpop {d8-d15} + + @ P2-P0 + bcs 1f -+ vst1.16 {q14}, [r0], r1 -+ vst1.16 {q13}, [r0], r1 -+ vst1.16 {q12}, [r0] ++ vst1.16 {q14}, [r4], r1 ++ vst1.16 {q13}, [r6] ++ vst1.16 {q12}, [r4] +1: + bmi 1f -+ vst1.16 {q11}, [r2], r1 -+ vst1.16 {q10}, [r2], r1 -+ vst1.16 { q9}, [r2] ++ vst1.16 {q11}, [r0], r1 ++ vst1.16 {q10}, [r2] ++ vst1.16 { q9}, [r0] +1: + pop {r4-r10,pc} +.endm @@ -2474,23 +2470,25 @@ index 0000000000..e665bd848a +@ common in the H direction than V due to how we arrange deblock. 
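++@ In the stores below, pairs of no_f bits are shifted into the N and C flags
++@ with LSLS so that VSTRPL/VSTRCC can drop the 8-byte halves whose
++@ no_p/no_q bit is set, without needing a branch per half.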
+ +function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1 ++ sub r12, r0, r1 + cmp r2, #0 + bxeq lr -+ sub r0, r0, r1, lsl #1 ++ vld1.8 {d26,d27}, [r0] ++ lsl r1, #1 ++ sub r0, r1 ++ vld1.8 {d18,d19}, [r12], r1 + vld1.8 {d16,d17}, [r0], r1 -+ vld1.8 {d18,d19}, [r0], r1 -+ vld1.8 {d26,d27}, [r0], r1 -+ vld1.8 {d28,d29}, [r0] -+ sub r0, r0, r1, lsl #1 -+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29 ++ vld1.8 {d28,d29}, [r12] ++ ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \ ++ "sub r12, r0, r1, asr #1" + -+ lsls r2, r3, #31 @ b0 -> N, b1 -> C -+ vstrpl d18, [r0, #0] -+ vstrcc d19, [r0, #8] -+ add r0, r1 + lsls r3, #29 @ b2 -> N, b3 -> C + vstrpl d26, [r0, #0] + vstrcc d27, [r0, #8] ++ lsls r3, #2 @ b0 -> N, b1 -> C ++ vstrpl d18, [r12, #0] ++ vstrcc d19, [r12, #8] + bx lr + +endfunc @@ -2506,37 +2504,38 @@ index 0000000000..e665bd848a +@ Macro here actual function near bottom + +.macro m_filter_h_uv_16 bit_depth ++ sub r12, r0, r1 + cmp r2, #0 + bxeq lr -+ sub r0, r0, r1, lsl #1 ++ vld1.16 {q12, q13}, [r0] ++ lsl r1, #1 ++ sub r0, r1 ++ vld1.16 {q10, q11}, [r12], r1 + vld1.16 {q8, q9 }, [r0], r1 -+ vld1.16 {q10, q11}, [r0], r1 -+ vld1.16 {q12, q13}, [r0], r1 -+ vld1.16 {q14, q15}, [r0] -+ sub r0, r0, r1, lsl #1 ++ vld1.16 {q14, q15}, [r12] + -+ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth ++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \ ++ "sub r12, r0, r1, asr #1", \ ++ "cmp r3, #0" + -+ cmp r3, #0 + bne 1f -+ vst1.16 {q10, q11}, [r0], r1 ++ vst1.16 {q10, q11}, [r12] + vst1.16 {q12, q13}, [r0] + bx lr + + @ At least one no_f bit is set + @ Which means we need to break this apart in an ugly fashion +1: -+ lsls r2, r3, #31 @ b0 -> N, b1 -> C -+ vstrpl d20, [r0, #0] -+ vstrpl d21, [r0, #8] -+ vstrcc d22, [r0, #16] -+ vstrcc d23, [r0, #24] -+ add r0, r1 + lsls r3, #29 @ b2 -> N, b3 -> C + vstrpl d24, [r0, #0] + vstrpl d25, [r0, #8] + vstrcc d26, [r0, #16] + vstrcc d27, [r0, #24] ++ lsls r3, #2 @ b0 -> N, b1 -> C ++ vstrpl d20, [r12, #0] ++ vstrpl d21, [r12, #8] ++ vstrcc d22, [r12, #16] ++ vstrcc d23, [r12, #24] + bx lr +.endm + @@ -2556,6 +2555,7 @@ index 0000000000..e665bd848a +function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1 + cmp r2, #0 + bxeq lr ++ push {lr} + vld2.16 {d16[0], d18[0]}, [r3], r1 + vld2.16 {d20[0], d22[0]}, [r0], r1 + @@ -2570,106 +2570,112 @@ index 0000000000..e665bd848a + vld2.16 {d20[3], d22[3]}, [r0], r1 + blo 10f + -+ sub r12, r0, r3 + vld2.16 {d17[0], d19[0]}, [r3], r1 + vld2.16 {d21[0], d23[0]}, [r0], r1 + -+ cmp r12, #4 ++ sub ip, r0, r3 + vld2.16 {d17[1], d19[1]}, [r3], r1 + vld2.16 {d21[1], d23[1]}, [r0], r1 + ++ cmp ip, #4 + vld2.16 {d17[2], d19[2]}, [r3], r1 + vld2.16 {d21[2], d23[2]}, [r0], r1 + + vld2.16 {d17[3], d19[3]}, [r3] + vld2.16 {d21[3], d23[3]}, [r0] -+ it eq -+ ldreq r12, [sp, #0] + -+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 -+ cmp r12, #0 -+ add r3, #2 -+ neg r1, r1 ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \ ++ "ldr lr, [sp, #4]", \ ++ "neg r1, r1", \ ++ "it eq; cmpeq lr, #0", \ ++ "add r3, #2", \ ++ "add ip, r3, r1", \ ++ "add r2, r0, r1", \ ++ "lsl r1, #1" ++ + bne 1f + +@ Much/most of the time r0 == r3 + 4 and no_f == 0 +@ so it is worth having this special case + vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b -+ vst2.16 {d19[2], d21[2]}, [r3], r1 ++ vst2.16 {d19[2], d21[2]}, [ip], r1 + vst2.16 {d19[1], d21[1]}, [r3], r1 -+ vst2.16 {d19[0], d21[0]}, [r3], r1 ++ vst2.16 {d19[0], 
d21[0]}, [ip], r1 + vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a -+ vst2.16 {d18[2], d20[2]}, [r3], r1 -+ vst2.16 {d18[1], d20[1]}, [r3], r1 -+ vst2.16 {d18[0], d20[0]}, [r3] -+ bx lr ++ vst2.16 {d18[2], d20[2]}, [ip], r1 ++ vst2.16 {d18[1], d20[1]}, [r3] ++ vst2.16 {d18[0], d20[0]}, [ip] ++ pop {pc} + +@ Either split or partial +1: -+ ldr r12, [sp, #0] -+ @ I have no idea if this is faster than any of the other ways of -+ @ testing these bits but it does free up r12 -+ lsl r12, #28 -+ add r2, r0, r1, lsl #2 -+ msr APSR_nzcvq, r12 @ b0 (P0a) -> V, b1 (Q0a) -> C, b2 (P0b) -> Z, b3 (Q0b) -> N -+ add r12, r3, r1, lsl #2 -+ bmi 1f ++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ addcs r0, r0, r1, lsl #1 ++ addcs r2, r2, r1, lsl #1 ++ bcs 1f + @ Q0b + vst1.16 {d21[3]}, [r0], r1 -+ vst1.16 {d21[2]}, [r0], r1 ++ vst1.16 {d21[2]}, [r2], r1 + vst1.16 {d21[1]}, [r0], r1 -+ vst1.16 {d21[0]}, [r0] ++ vst1.16 {d21[0]}, [r2], r1 +1: -+ beq 2f ++ addmi r3, r3, r1, lsl #1 ++ addmi ip, ip, r1, lsl #1 ++ bmi 1f + @ P0b + vst1.16 {d19[3]}, [r3], r1 -+ vst1.16 {d19[2]}, [r3], r1 ++ vst1.16 {d19[2]}, [ip], r1 + vst1.16 {d19[1]}, [r3], r1 -+ vst1.16 {d19[0]}, [r3] -+ -+2: -+ bcs 3f ++ vst1.16 {d19[0]}, [ip], r1 ++1: ++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 ++ bcs 1f + @ Q0a -+ vst1.16 {d20[3]}, [r2], r1 ++ vst1.16 {d20[3]}, [r0], r1 + vst1.16 {d20[2]}, [r2], r1 -+ vst1.16 {d20[1]}, [r2], r1 ++ vst1.16 {d20[1]}, [r0] + vst1.16 {d20[0]}, [r2] -+ -+3: -+ it vs -+ bxvs lr -+ vst1.16 {d18[3]}, [r12], r1 -+ vst1.16 {d18[2]}, [r12], r1 -+ vst1.16 {d18[1]}, [r12], r1 -+ vst1.16 {d18[0]}, [r12] -+ bx lr ++1: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.16 {d18[3]}, [r3], r1 ++ vst1.16 {d18[2]}, [ip], r1 ++ vst1.16 {d18[1]}, [r3] ++ vst1.16 {d18[0]}, [ip] ++ pop {pc} + +@ Single lump (rather than double) +10: -+ hevc_loop_filter_uv_body1 d16, d18, d20, d22 -+ + @ As we have post inced r0/r3 in the load the easiest thing to do is + @ to subtract and write forwards, rather than backwards (as above) -+ ldr r12, [sp, #0] -+ add r3, #2 -+ sub r0, r0, r1, lsl #2 -+ sub r3, r3, r1, lsl #2 -+ lsls r12, #31 @ b0 (P0a) -> N, b1 (Q0a) -> C ++ @ b0 (P0a) -> N, b1 (Q0a) -> C ++ ++ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \ ++ "ldr lr, [sp, #4]", \ ++ "add r3, #2", \ ++ "sub r0, r0, r1, lsl #2", \ ++ "sub r3, r3, r1, lsl #2", \ ++ "lsls lr, #31", \ ++ "add r2, r0, r1", \ ++ "add ip, r3, r1", \ ++ "lsl r1, #1" + + bcs 3f ++ @ Q0a + vst1.16 {d20[0]}, [r0], r1 -+ vst1.16 {d20[1]}, [r0], r1 -+ vst1.16 {d20[2]}, [r0], r1 -+ vst1.16 {d20[3]}, [r0] -+ ++ vst1.16 {d20[1]}, [r2], r1 ++ vst1.16 {d20[2]}, [r0] ++ vst1.16 {d20[3]}, [r2] +3: -+ it mi -+ bxmi lr ++ it mi ++ popmi {pc} ++ @ P0a + vst1.16 {d18[0]}, [r3], r1 -+ vst1.16 {d18[1]}, [r3], r1 -+ vst1.16 {d18[2]}, [r3], r1 -+ vst1.16 {d18[3]}, [r3] -+ bx lr ++ vst1.16 {d18[1]}, [ip], r1 ++ vst1.16 {d18[2]}, [r3] ++ vst1.16 {d18[3]}, [ip] ++ pop {pc} + +endfunc + @@ -2695,14 +2701,14 @@ index 0000000000..e665bd848a +.macro m_filter_v_uv2_16 bit_depth + cmp r2, #0 + bxeq lr -+ ++ push {lr} + vld2.32 {d16[0], d18[0]}, [r3], r1 + vld2.32 {d20[0], d22[0]}, [r0], r1 + ++ cmp r2, #0x10000 + vld2.32 {d16[1], d18[1]}, [r3], r1 + vld2.32 {d20[1], d22[1]}, [r0], r1 + -+ cmp r2, #0x10000 + vld2.32 {d17[0], d19[0]}, [r3], r1 + vld2.32 {d21[0], d23[0]}, [r0], r1 + @@ -2713,170 +2719,509 @@ index 0000000000..e665bd848a + vld2.32 {d24[0], d26[0]}, [r3], r1 + vld2.32 {d28[0], d30[0]}, [r0], r1 + ++ sub ip, r0, r3 + vld2.32 {d24[1], d26[1]}, [r3], r1 + 
vld2.32 {d28[1], d30[1]}, [r0], r1 -+ sub r12, r0, r3 + ++ cmp ip, #8 + vld2.32 {d25[0], d27[0]}, [r3], r1 + vld2.32 {d29[0], d31[0]}, [r0], r1 -+ cmp r12, #8 + + vld2.32 {d25[1], d27[1]}, [r3] + vld2.32 {d29[1], d31[1]}, [r0] -+ it eq -+ ldreq r12, [sp, #0] + -+ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth -+ cmp r12, #0 -+ add r3, #4 -+ neg r1, r1 ++ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \ ++ "ldr lr, [sp, #4]", \ ++ "neg r1, r1", \ ++ "it eq; cmpeq lr, #0", \ ++ "add r3, #4", \ ++ "add ip, r3, r1", \ ++ "add r2, r0, r1", \ ++ "lsl r1, #1" ++ + bne 1f + -+@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ Much/most of the time r0 == r3 + 8 and no_f == 0 +@ so it is worth having this special case -+ vst2.32 {d27[1], d29[1]}, [r3], r1 -+ vst2.32 {d27[0], d29[0]}, [r3], r1 -+ vst2.32 {d26[1], d28[1]}, [r3], r1 -+ vst2.32 {d26[0], d28[0]}, [r3], r1 -+ vst2.32 {d19[1], d21[1]}, [r3], r1 -+ vst2.32 {d19[0], d21[0]}, [r3], r1 -+ vst2.32 {d18[1], d20[1]}, [r3], r1 -+ vst2.32 {d18[0], d20[0]}, [r3] -+ bx lr ++ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b ++ vst2.32 {d27[0], d29[0]}, [ip], r1 ++ vst2.32 {d26[1], d28[1]}, [r3], r1 ++ vst2.32 {d26[0], d28[0]}, [ip], r1 ++ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a ++ vst2.32 {d19[0], d21[0]}, [ip], r1 ++ vst2.32 {d18[1], d20[1]}, [r3] ++ vst2.32 {d18[0], d20[0]}, [ip] ++ pop {pc} + +@ Either split or partial +1: -+ ldr r12, [sp, #0] -+ lsls r12, #29 @ b2 (P0b) -> N, b3 (Q0b) -> C ++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ addcs r0, r0, r1, lsl #1 ++ addcs r2, r2, r1, lsl #1 + bcs 1f + @ Q0b -+ mov r2, r0 -+ vst1.32 {d29[1]}, [r2], r1 ++ vst1.32 {d29[1]}, [r0], r1 + vst1.32 {d29[0]}, [r2], r1 -+ vst1.32 {d28[1]}, [r2], r1 -+ vst1.32 {d28[0]}, [r2] ++ vst1.32 {d28[1]}, [r0], r1 ++ vst1.32 {d28[0]}, [r2], r1 +1: -+ bmi 2f ++ addmi r3, r3, r1, lsl #1 ++ addmi ip, ip, r1, lsl #1 ++ bmi 1f + @ P0b -+ mov r2, r3 -+ vst1.32 {d27[1]}, [r2], r1 -+ vst1.32 {d27[0]}, [r2], r1 -+ vst1.32 {d26[1]}, [r2], r1 -+ vst1.32 {d26[0]}, [r2] -+ -+2: -+ lsls r12, #2 @ b0 (P0a) -> N, b1 (Q0a) -> C -+ bcs 3f ++ vst1.32 {d27[1]}, [r3], r1 ++ vst1.32 {d27[0]}, [ip], r1 ++ vst1.32 {d26[1]}, [r3], r1 ++ vst1.32 {d26[0]}, [ip], r1 ++1: ++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 ++ bcs 1f + @ Q0a -+ add r0, r0, r1, lsl #2 + vst1.32 {d21[1]}, [r0], r1 -+ vst1.32 {d21[0]}, [r0], r1 -+ vst1.32 {d20[1]}, [r0], r1 -+ vst1.32 {d20[0]}, [r0] -+ -+3: -+ it mi -+ bxmi lr ++ vst1.32 {d21[0]}, [r2], r1 ++ vst1.32 {d20[1]}, [r0] ++ vst1.32 {d20[0]}, [r2] ++1: ++ it mi ++ popmi {pc} + @ P0a -+ add r3, r3, r1, lsl #2 + vst1.32 {d19[1]}, [r3], r1 -+ vst1.32 {d19[0]}, [r3], r1 -+ vst1.32 {d18[1]}, [r3], r1 -+ vst1.32 {d18[0]}, [r3] -+ bx lr -+ ++ vst1.32 {d19[0]}, [ip], r1 ++ vst1.32 {d18[1]}, [r3] ++ vst1.32 {d18[0]}, [ip] ++ pop {pc} + ++@ Single lump (rather than double) +10: -+ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth -+ + @ As we have post inced r0/r3 in the load the easiest thing to do is + @ to subtract and write forwards, rather than backwards (as above) -+ ldr r12, [sp, #0] -+ add r3, #4 -+ sub r0, r0, r1, lsl #2 -+ sub r3, r3, r1, lsl #2 -+ lsls r12, #31 @ b0 (P0a) -> N, b1 (Q0a) -> C ++ @ b0 (P0a) -> N, b1 (Q0a) -> C ++ ++ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \ ++ "ldr lr, [sp, #4]", \ ++ "add r3, #4", \ ++ "sub r0, r0, r1, lsl #2", \ ++ "sub r3, r3, r1, lsl #2", \ ++ "lsls lr, #31", \ ++ "add r2, r0, r1", \ 
++ "add ip, r3, r1", \ ++ "lsl r1, #1" + + bcs 3f + @ Q0a + vst1.32 {d20[0]}, [r0], r1 -+ vst1.32 {d20[1]}, [r0], r1 -+ vst1.32 {d21[0]}, [r0], r1 -+ vst1.32 {d21[1]}, [r0] -+ ++ vst1.32 {d20[1]}, [r2], r1 ++ vst1.32 {d21[0]}, [r0] ++ vst1.32 {d21[1]}, [r2] +3: -+ it mi -+ bxmi lr ++ it mi ++ popmi {pc} + @ P0a + vst1.32 {d18[0]}, [r3], r1 -+ vst1.32 {d18[1]}, [r3], r1 -+ vst1.32 {d19[0]}, [r3], r1 -+ vst1.32 {d19[1]}, [r3] -+ bx lr ++ vst1.32 {d18[1]}, [ip], r1 ++ vst1.32 {d19[0]}, [r3] ++ vst1.32 {d19[1]}, [ip] ++ pop {pc} +.endm + + ++#if 1 // NEON version + + -+/* ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, int in_i -+ * int *curr_rpl0, int *curr_ -+ * MvField *curr, MvField *ne ++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh, ++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ * int in_inc) ++ */ ++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 ++ mov ip, sp ++ push {a2,v1-v8,lr} ++ ldm ip, {v1-v5} ++ cmp a1, #2 ++ bls 2f ++ vpush {d8-d13} ++ sub v5, v5, #10 ++ mov v6, #32 ++1: ++ vld2.32 {d0[0], d2[0]}, [a3]! ++ vld2.32 {d4[0], d6[0]}, [a4]! ++ vmov.u8 q12, #0 ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb v8, [a3], #1 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[0]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[0]}, [a4], v5 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d16[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d20[0]}, [ip] ++ vld1.32 {d18[0]}, [v8] ++ vld1.32 {d22[0]}, [lr] ++ ++ vld2.32 {d0[1], d2[1]}, [a3]! ++ vld2.32 {d4[1], d6[1]}, [a4]! ++ ldrb a2, [a3], #1 ++ vmov.u16 d12, #1 ++ ldrb ip, [a4], #1 ++ vmov.u16 d13, #2 ++ ldrb v8, [a3], #1 ++ vmov.u16 d27, #4 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[2]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[2]}, [a4], v5 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d16[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d20[1]}, [ip] ++ vld1.32 {d18[1]}, [v8] ++ vld1.32 {d22[1]}, [lr] ++ ++ vld2.32 {d1[0], d3[0]}, [a3]! ++ vld2.32 {d5[0], d7[0]}, [a4]! ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb lr, [a4], #1 ++ ldrb v8, [a3], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[4]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[4]}, [a4], v5 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d17[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d21[0]}, [ip] ++ vld1.32 {d19[0]}, [v8] ++ vld1.32 {d23[0]}, [lr] ++ ++ vld2.32 {d1[1], d3[1]}, [a3]! ++ vld2.32 {d5[1], d7[1]}, [a4]! 
++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb v8, [a3], #1 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[6]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[6]}, [a4], v5 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d17[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d21[1]}, [ip] ++ vld1.32 {d19[1]}, [v8] ++ vld1.32 {d23[1]}, [lr] ++ ++ @ So now we have: ++ @ q0.32[i] = curr[i].mv[0] ++ @ q1.32[i] = curr[i].mv[1] ++ @ q2.32[i] = neigh[i].mv[0] ++ @ q3.32[i] = neigh[i].mv[1] ++ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]] ++ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]] ++ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] ++ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] ++ @ d24.16[i] = curr[i].pred_flag ++ @ d25.16[i] = neigh[i].pred_flag ++ ++ vtst.16 d28, d24, d12 ++ vtst.16 d29, d24, d13 ++ vadd.i16 d8, d24, d12 ++ vadd.i16 d9, d25, d12 ++ vtst.16 d30, d25, d12 ++ vtst.16 d31, d25, d13 ++ veor d26, d8, d9 ++ ldr lr, [sp, 6*8] ++ vmovl.s16 q4, d28 ++ vmovl.s16 q5, d29 ++ teq lr, #1 ++ vmovl.s16 q14, d30 ++ lslne v1, lr, #1 ++ vmovl.s16 q15, d31 ++ rsbne v2, v1, #32 ++ vbif q0, q1, q4 ++ vbif q2, q3, q14 ++ vbif q1, q0, q5 ++ vbif q3, q2, q15 ++ vabd.s16 q12, q0, q2 ++ vabd.s16 q2, q1 ++ vabd.s16 q0, q3 ++ vabd.s16 q1, q3 ++ vbif q8, q9, q4 ++ vbif q10, q11, q14 ++ vbif q9, q8, q5 ++ vbif q11, q10, q15 ++ vclt.u16 d6, d24, d27 ++ vclt.u16 d8, d2, d27 ++ vclt.u16 d7, d25, d27 ++ vclt.u16 d9, d3, d27 ++ vclt.u16 d2, d0, d27 ++ vclt.u16 d0, d4, d27 ++ vclt.u16 d3, d1, d27 ++ vclt.u16 d1, d5, d27 ++ vceq.i32 q12, q10, q8 ++ vceq.i32 q10, q9 ++ vceq.i32 q8, q11 ++ vceq.i32 q9, q11 ++ vshrn.i32 d6, q3, #8 ++ vshrn.i32 d7, q4, #8 ++ vshrn.i32 d8, q1, #8 ++ vshrn.i32 d9, q0, #8 ++ vmovn.i32 d4, q12 ++ vmovn.i32 d2, q10 ++ vmovn.i32 d3, q8 ++ vmovn.i32 d5, q9 ++ vand q2, q3 ++ vrev16.8 q3, q3 ++ vand q2, q3 ++ vand q1, q4 ++ vrev16.8 q4, q4 ++ vand q1, q4 ++ vand d4, d5 ++ vand d2, d3 ++ vbic d0, d12, d4 ++ vshr.u16 d26, #2 ++ vbic d0, d2 ++ vmov.i16 d1, #0x5555 ++ vorr d0, d26 ++ bne 10f ++ ++ @ Merge results into result word, no duplicates ++ vmov a2, s0 ++ vmov v8, s1 ++ vmov.u16 ip, d0[1] ++ vmov.u16 lr, d0[3] ++ sub v6, #8 ++ lsl a2, #30 ++ lsl v8, #30 ++ lsl ip, #30 ++ lsl lr, #30 ++ orr a2, ip, a2, lsr #2 ++ orr v8, lr, v8, lsr #2 ++ orr a2, v8, a2, lsr #4 ++ subs a1, #4 ++ orr v7, a2, v7, lsr #8 ++ bhi 1b ++ ++ vpop {d8-d13} ++ mov a1, v7, lsr v6 ++ pop {a2,v1-v8,pc} ++10: ++ @ Merge results into result word, with duplicates ++ vmul.i16 d0, d1 ++ vmov a2, s0 ++ vmov v8, s1 ++ vmov.u16 ip, d0[1] ++ vmov.u16 lr, d0[3] ++ sub v6, v6, v1, lsl #2 ++ lsl a2, v2 ++ subs a1, #4 ++ lsl v8, v2 ++ lsl ip, v2 ++ lsl lr, v2 ++ ldr v2, [sp, #6*8 + 10*4 + 1*4] ++ orr a2, ip, a2, lsr v1 ++ lsl ip, v1, #1 ++ orr v8, lr, v8, lsr v1 ++ lsl lr, v1, #2 ++ orr a2, v8, a2, lsr ip ++ ldr v1, [sp, #6*8 + 10*4] ++ orr v7, a2, v7, lsr lr ++ bhi 1b ++ ++ vpop {d8-d13} ++ mov a1, v7, lsr v6 ++ pop {a2,v1-v8,pc} ++ ++ ++2: ++ sub v5, v5, #10 ++ vmov.u8 d16, #0 ++ blo 3f ++ vld2.32 {d0[0], d1[0]}, [a3]! ++ vld2.32 {d2[0], d3[0]}, [a4]! ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb lr, [a4], #1 ++ ldrb v8, [a3], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d16[0]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d16[4]}, [a4], v5 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d4[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d5[0]}, [ip] ++ vld1.32 {d6[0]}, [v8] ++ vld1.32 {d7[0]}, [lr] ++ ++3: ++ vld2.32 {d0[1], d1[1]}, [a3]! ++ vld2.32 {d2[1], d3[1]}, [a4]! 
++ ldrb a2, [a3], #1 ++ vmov.u16 d17, #1 ++ ldrb ip, [a4], #1 ++ vmov.u16 d18, #2 ++ ldrb v8, [a3], #1 ++ vmov.u16 d19, #4 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d16[2]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d16[6]}, [a4], v5 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d4[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d5[1]}, [ip] ++ vld1.32 {d6[1]}, [v8] ++ vld1.32 {d7[1]}, [lr] ++ ++ @ So now we have: ++ @ d0.32[i] = curr[i].mv[0] ++ @ d1.32[i] = curr[i].mv[1] ++ @ d2.32[i] = neigh[i].mv[0] ++ @ d3.32[i] = neigh[i].mv[1] ++ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]] ++ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] ++ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]] ++ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] ++ @ d16.16[i] = curr[i].pred_flag ++ @ d16.16[2+i] = neigh[i].pred_flag ++ ++ vtst.16 d20, d16, d17 ++ vtst.16 d22, d16, d18 ++ vadd.i16 d30, d16, d17 ++ vswp d2, d3 ++ ldr lr, [sp] ++ vmovl.s16 q10, d20 ++ teq lr, #1 ++ vmovl.s16 q11, d22 ++ lslne v1, lr, #1 ++ vbif d0, d1, d20 ++ vbif d4, d6, d20 ++ vbif d3, d2, d21 ++ vbif d5, d7, d21 ++ vbif d1, d0, d22 ++ vbif d6, d4, d22 ++ vbif d2, d3, d23 ++ vbif d7, d5, d23 ++ vshr.u16 d30, #2 ++ vabd.s16 d24, d0, d3 ++ vabd.s16 d25, d1, d2 ++ vabd.s16 q0, q0, q1 ++ vceq.i32 d2, d4, d5 ++ vceq.i32 d20, d5, d6 ++ vceq.i32 d21, d4, d7 ++ vceq.i32 d3, d6, d7 ++ vclt.u16 d6, d24, d19 ++ vclt.u16 d7, d25, d19 ++ vclt.u16 d22, d1, d19 ++ vclt.u16 d23, d0, d19 ++ vshrn.i32 d6, q3, #8 ++ vmovn.i32 d2, q1 ++ vshrn.i32 d7, q11, #8 ++ vmovn.i32 d3, q10 ++ vand q0, q3, q1 ++ rsbne v2, v1, #32 ++ vrev16.8 q3, q3 ++ vand q0, q3 ++ vsra.u64 d30, #32 ++ vshr.u64 q1, q0, #32 ++ vand q0, q1 ++ vbic d0, d17, d0 ++ vand d30, d30, d17 ++ vbic d0, d1 ++ vmov.i16 d1, #0x5555 ++ vorr d0, d30 ++ bne 10f ++ ++ @ Construct result word, no duplicates ++ cmp a1, #2 ++ vmov.u16 a1, d0[1] ++ vmov.u16 a2, d0[0] ++ orreq a1, a2, a1, lsl #2 ++ pop {a2,v1-v8,pc} ++10: ++ @ Construct result word, with duplicates ++ cmp a1, #2 ++ vmul.i16 d0, d1 ++ vmov a2, s0 ++ vmov.u16 a1, d0[1] ++ lsl a2, #16 ++ pkhbt a1, a1, a1, lsl #16 ++ lsr a2, v2 ++ lsr a1, v2 ++ orreq a1, a2, a1, lsl v1 ++ pop {a2,v1-v8,pc} ++endfunc ++ ++ ++ ++#else // non-NEON version ++ ++ ++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh, ++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ * int in_inc) + */ +function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 + add ip, sp, #4*4 + push {a2-a4,v1-v8,lr} -+ ldmia ip, {v5-v7} ++ mov v6, #32 +1: ldmdb ip, {v1-v4} -+ ldrsb a3, [v5, #8] @ curr->ref_idx -+ ldrsb v8, [v5, #9] -+ ldrsb ip, [v6, #8] @ neigh->ref_idx -+ ldrsb lr, [v6, #9] -+ ldr v1, [v1, a3, lsl #2] -+ ldrb a3, [v5, #10] @ curr->pred_flag ++ ldrsb v5, [a3, #8] @ curr->ref_idx ++ ldrsb v8, [a3, #9] ++ ldrsb ip, [a4, #8] @ neigh->ref_idx ++ ldrsb lr, [a4, #9] ++ ldr v1, [v1, v5, lsl #2] ++ ldrb v5, [a3, #10] @ curr->pred_flag + ldr v2, [v2, v8, lsl #2] -+ ldrb v8, [v6, #10] @ neigh->pred_flag ++ ldrb v8, [a4, #10] @ neigh->pred_flag + ldr v3, [v3, ip, lsl #2] + ldr v4, [v4, lr, lsl #2] -+ teq a3, #3 ++ teq v5, #3 + beq 20f + teq v8, #3 + beq 90f + -+ tst a3, #1 ++ tst v5, #1 + itee ne -+ ldrne a3, [v5, #0] @ curr->mv[0] -+ ldreq a3, [v5, #4] @ curr->mv[1] ++ ldrne v5, [a3, #0] @ curr->mv[0] + moveq v1, v2 ++ ldreq v5, [a3, #4] @ curr->mv[1] + tst v8, #1 + itee ne -+ ldrne v8, [v6, #0] @ neigh->mv[0] -+ ldreq v8, [v6, #4] @ neigh->mv[1] ++ ldrne v8, [a4, #0] @ 
neigh->mv[0] + moveq v3, v4 ++ ldreq v8, [a4, #4] @ neigh->mv[1] + teq v1, v3 + bne 10f + ldr lr, =0xFFFCFFFC -+ ssub16 ip, v8, a3 -+ ssub16 a3, a3, v8 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 ip, v8, v5 ++ ssub16 v5, v5, v8 ++ sel v5, v5, ip ++ ands v5, v5, lr + @ drop through +10: it ne -+ movne a3, #1 -+11: subs a2, a2, #1 -+12: -+A strbhs a3, [v7], a4 -+T itt hs -+T strbhs a3, [v7] -+T addhs v7, v7, a4 ++ movne v5, #1<<30 ++11: ++ sub v6, v6, #2 ++T mov v7, v7, lsr #2 + subs a2, a2, #1 -+ bhs 12b ++A orr v7, v5, v7, lsr #2 ++T orr v7, v5, v7 ++ bhi 11b + -+ ldm sp, {a2, a3} ++ ldr v5, [sp, #16*4] + add ip, sp, #16*4 ++ ldr a2, [sp] + subs a1, a1, #1 -+ add v5, v5, a3 -+ add v6, v6, a3 ++ add a3, a3, v5 ++ add a4, a4, v5 + bhi 1b ++ mov a1, v7, lsr v6 + pop {a2-a4,v1-v8,pc} + +20: teq v8, #3 @@ -2889,43 +3234,43 @@ index 0000000000..e665bd848a + teq v1, v2 + bne 30f + -+ ldrd v1, v2, [v5] @ curr->mv -+ ldrd v3, v4, [v6] @ neigh->mv ++ ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv + ldr lr, =0xFFFCFFFC + ssub16 ip, v3, v1 -+ ssub16 a3, v1, v3 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 v5, v1, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr + bne 25f + ssub16 ip, v4, v2 -+ ssub16 a3, v2, v4 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 v5, v2, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr + beq 11b + @ drop through +25: ssub16 ip, v4, v1 -+ ssub16 a3, v1, v4 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 v5, v1, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr + bne 10b + ssub16 ip, v3, v2 -+ ssub16 a3, v2, v3 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 v5, v2, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr + b 10b + -+30: ldrd v1, v2, [v5] @ curr->mv -+ ldrd v3, v4, [v6] @ neigh->mv ++30: ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv + ldr lr, =0xFFFCFFFC + ssub16 ip, v3, v1 -+ ssub16 a3, v1, v3 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 v5, v1, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr + bne 10b + ssub16 ip, v4, v2 -+ ssub16 a3, v2, v4 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 v5, v2, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr + b 10b + +40: teq v1, v4 @@ -2933,21 +3278,26 @@ index 0000000000..e665bd848a + teqeq v2, v3 + bne 10b + -+ ldrd v1, v2, [v5] @ curr->mv -+ ldrd v3, v4, [v6] @ neigh->mv ++ ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv + ldr lr, =0xFFFCFFFC + b 25b + -+90: mov a3, #1 ++90: ++ mov v5, #1<<30 + b 11b +endfunc + ++ ++#endif ++ ++ +@ ============================================================================= +@ +@ 10 bit + +function hevc_loop_filter_luma_body_10 -+ m_filter_luma 10 ++ m_filter_luma 10, q11, q15 +endfunc + +function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1 @@ -2980,7 +3330,7 @@ index 0000000000..e665bd848a + ldr r10, [sp, #32] + +.Lv_loop_luma_common_10: -+ m_filter_v_luma_common_16 10 ++ m_filter_v_luma_16 10 +endfunc + +function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1 @@ -3220,7 +3570,7 @@ index 0000000000..109fa98c29 +} diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c new file mode 100644 -index 0000000000..a721e392ab +index 0000000000..ce7e6091f1 --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c @@ -0,0 +1,465 @@ @@ -3255,8 +3605,8 @@ index 0000000000..a721e392ab +// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but +// have been removed from head as we never use them. 
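++
++// Prototypes for the NEON implementations in the rpi_hevcdsp_*_neon.S files
++// (loop filters, deblocking boundary strengths, SAO, etc.); the _8/_10
++// suffixes denote the bit depth each variant handles.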
+ -+void ff_hevc_rpi_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+void ff_hevc_rpi_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + +void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); @@ -3455,9 +3805,9 @@ index 0000000000..a721e392ab + int16_t *sao_offset_val, int sao_left_class, int width, int height); + + -+void ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, ++uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ const MvField *curr, const MvField *neigh, uint8_t *bs); ++ int in_inc); + + +static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) @@ -3557,10 +3907,10 @@ index 0000000000..a721e392ab +av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth) +{ + if (bit_depth == 8) { -+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon; -+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon; -+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon; -+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon; ++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8; ++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8; + c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8; + c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8; + c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8; @@ -7255,6 +7605,5151 @@ index 0000000000..b56e0f9644 + edge_64b_bodies edge_64b_body_16, 4 +endfunc + +diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h +new file mode 100644 +index 0000000000..36a23a5bf9 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_arm.h +@@ -0,0 +1,28 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_HEVCPRED_ARM_H ++#define AVCODEC_ARM_HEVCPRED_ARM_H ++ ++#include "libavcodec/rpi_hevcpred.h" ++ ++void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth); ++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth); ++ ++#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c +new file mode 100644 +index 0000000000..80724d4cf3 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_init_arm.c +@@ -0,0 +1,35 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/attributes.h" ++#include "libavutil/cpu.h" ++#include "libavutil/arm/cpu.h" ++ ++#include "libavcodec/rpi_hevcpred.h" ++#include "rpi_hevcpred_arm.h" ++ ++av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth) ++{ ++ int cpu_flags = av_get_cpu_flags(); ++ ++ if (have_neon(cpu_flags)) ++ ff_hevc_rpi_pred_init_neon(c, bit_depth); ++} ++ +diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c +new file mode 100644 +index 0000000000..8c267a0368 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_init_neon.c +@@ -0,0 +1,188 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "rpi_hevcpred_arm.h" ++ ++void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, 
ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, 
const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++ ++void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++ ++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth) ++{ ++ switch (bit_depth) ++ { ++ case 8: ++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8; ++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8; ++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8; ++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8; ++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8; ++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8; ++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8; ++ ++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8; ++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8; ++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8; ++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8; ++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8; ++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8; ++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8; ++ ++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8; ++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8; ++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8; ++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8; ++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8; ++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8; ++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8; ++ ++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8; ++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8; ++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8; ++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8; ++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8; ++ c->pred_planar_c[1] = 
ff_hevc_rpi_pred_planar_c_8_neon_8; ++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8; ++ ++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8; ++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8; ++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8; ++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8; ++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8; ++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8; ++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8; ++ break; ++ case 10: ++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10; ++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10; ++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10; ++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10; ++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10; ++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10; ++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10; ++ ++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10; ++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10; ++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10; ++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10; ++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10; ++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10; ++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10; ++ ++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10; ++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10; ++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10; ++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10; ++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10; ++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10; ++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10; ++ ++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10; ++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10; ++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10; ++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10; ++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10; ++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10; ++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10; ++ ++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10; ++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10; ++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10; ++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10; ++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10; ++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10; ++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10; ++ break; ++ default: ++ break; ++ } ++} ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S +new file mode 100644 +index 0000000000..1a2d413ea2 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S +@@ -0,0 +1,2352 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * General angular pred ++ * ++ * Horizontal (10) & Vertical (26) cases have their own file ++ * and are not dealt with properly here (luma filtering is missing) ++ * ++ * The inv_angle calculations are annoying - if it wasn't for the +128 ++ * rounding step then the result would simply be the loop counter :-( ++ */ ++ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++.text ++ ++@ Horizontal Patch functions ++@ These need a transpose before store so exist as smaller patches ++@ Patches can be called repeatedly without any intermediate setup ++@ to generate a horizontal block ++@ ++@ It is almost certainly the case that larger patch fns can be built ++@ and they would be a little faster, but we would still need the small ++@ fns and code size (or at least instruction cache size) is an issue ++@ given how much code we already have here ++ ++@ Generate 8x8 luma 8 patch ++@ ++@ r3 Out stride ++@ r4 Angle add ++@ r7 Inv angle (_up only) ++@ ++@ In/Out (updated) ++@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) ++@ r2 Left ptr - updated ++@ r6 Angle frac (init to r4 + 32) ++@ r8 Inv angle accumulator ++@ d24 Cur Line - load before 1st call for down - set by _up ++@ d16 Cur Line - load before 1st call for up - set by _down ++@ ++@ Temps ++@ r5 Loop counter ++@ r12 ++@ q0-q3, q14, q15 ++ ++patch_h_down_8x8_8: ++ mov r5, #8 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.8 d24, d24, #1 ++ sub r6, #32 ++ vld1.8 {d24[7]}, [r2]! ++ ++1: ++ vext.8 q0, q1, #8 ++ rsb r12, r6, #32 ++ vext.8 q1, q2, #8 ++ vdup.8 d30, r6 ++ vext.8 q2, q3, #8 ++ vdup.8 d31, r12 ++ vext.8 q3, q3, #8 ++ ++ vmull.u8 q14, d24, d30 ++ add r6, r4 ++ vmlal.u8 q14, d16, d31 ++ subs r5, #1 ++ vrshrn.u16 d7, q14, #5 ++ bne 2b ++ ++store_tran_8x8_8: ++ add r12, r0, #4 ++ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0 ] ++ add r5, r0, r3 ++ vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r12], r3 ++ add r0, #8 ++ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r5 ], r3 ++ vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r12], r3 ++ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r5 ], r3 ++ vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r12], r3 ++ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r5 ], r3 ++ vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r12], r3 ++ vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r5 ], r3 ++ vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r12], r3 ++ vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r5 ], r3 ++ vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r12], r3 ++ vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r5 ], r3 ++ vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r12], r3 ++ vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r5 ] ++ vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r12] ++ bx lr ++ ++ ++patch_h_up_8x8_8: ++ mov r5, #8 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ @ r2=left (variable), r1=up (const) ++ adds r8, r7 ++ vmov d24, d16 ++ ldrbmi r12, [r2, #-1]! 
++ ldrbpl r12, [r1, r8, asr #8] ++ vext.8 d16, d16, d16, #7 ++ sub r6, #32 ++ vmov.8 d16[0], r12 ++ ++1: ++ vdup.8 d31, r6 ++ vext.8 q0, q1, #8 ++ rsb r12, r6, #32 ++ vext.8 q1, q2, #8 ++ ++ vmull.u8 q14, d16, d31 ++ vext.8 q2, q3, #8 ++ vdup.8 d30, r12 ++ vext.8 q3, q3, #8 ++ add r6, r4 ++ vmlal.u8 q14, d24, d30 ++ subs r5, #1 ++ vrshrn.u16 d7, q14, #5 ++ bne 2b ++ b store_tran_8x8_8 @ This will return ++ ++ ++ ++@ ff_hevc_rpi_pred_angular_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_4_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ mov r5, #4 @ Loop counter for all cases ++ add r6, r4, #32 @ Force initial load in main loop ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.8 {d24}, [r2] ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.8 d24, d24, #1 ++ sub r6, #32 ++1: ++ vext.8 q0, q1, #8 ++ rsb r12, r6, #32 ++ vext.8 q1, q1, #8 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q14, d24, d30 ++ add r6, r4 ++ vmlal.u8 q14, d16, d31 ++ subs r5, #1 ++ vrshrn.u16 d3, q14, #5 ++ bne 2b ++ ++98: ++ add r12, r0, r3 ++ lsl r3, #1 ++ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0 ], r3 ++ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r12], r3 ++ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r0 ] ++ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r12] ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ vld1.32 {d16[0]}, [r2] ++ sub r8, r7 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ @ r2=left (variable), r1=up (const) ++ adds r8, r7 ++ vmov d24, d16 ++ ldrbmi r12, [r2, #-1]! 
++ ldrbpl r12, [r1, r8, asr #8] ++ vext.8 d16, d16, d16, #7 ++ sub r6, #32 ++ vmov.8 d16[0], r12 ++1: ++ vdup.8 d31, r6 ++ vext.8 q0, q1, #8 ++ rsb r12, r6, #32 ++ vext.8 q1, q2, #8 ++ ++ vmull.u8 q14, d16, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmlal.u8 q14, d24, d30 ++ subs r5, #1 ++ vrshrn.u16 d3, q14, #5 ++ bne 2b ++ b 98b ++ ++18: ++ cmp r12, #26 ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.32 {d16[0]}, [r1 :32] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ ldrb r12, [r2, r8, asr #8] ++ ++ vmov d24, d16 ++ add r8, r7 ++ sub r6, #32 ++ vext.8 d16, d16, #7 ++ vmov.8 d16[0], r12 ++ ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmlal.u8 q0, d24, d30 ++ vrshrn.u16 d0, q0, #5 ++ ++ subs r5, #1 ++ vst1.32 {d0[0]}, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {d24}, [r1] @ Up + up-right, may be on 32-bit align rather than 64 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.8 d24, d24, #1 ++ sub r6, #32 ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmlal.u8 q0, d16, d31 ++ vrshrn.u16 d0, q0, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.32 {d0[0]}, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++ ++@ ff_hevc_rpi_pred_angular_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_8_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 @ Force initial load in main loop ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.8 {d24}, [r2]! ++ bl patch_h_down_8x8_8 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ vld1.8 {d16}, [r2] ++ add r6, r4, #32 ++ sub r8, r7 ++ bl patch_h_up_8x8_8 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #8 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.8 {d16}, [r1 :64] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ ldrb r12, [r2, r8, asr #8] ++ ++ vmov d24, d16 ++ add r8, r7 ++ sub r6, #32 ++ vext.8 d16, d16, #7 ++ vmov.8 d16[0], r12 ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmlal.u8 q0, d24, d30 ++ vrshrn.u16 d0, q0, #5 ++ ++ subs r5, #1 ++ vst1.8 {d0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {d24, d25}, [r1 :64]! 
@ Up + UR ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.8 q12, q12, #1 ++ sub r6, #32 ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmlal.u8 q0, d16, d31 ++ vrshrn.u16 d0, q0, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.8 {d0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_16_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 @ Force initial load in main loop ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.8 {d24}, [r2]! ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8 ++ ++ mov r2, r1 @ restore r2 ++ sub r0, #16 ++ add r6, r4, #32 @ Force initial load in main loop ++ vld1.8 {d24}, [r2]! ++ add r0, r0, r3, lsl #3 ++ ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ vld1.8 {d16}, [r2] ++ sub r8, r7 ++ ++ push {r2, r8} ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8 ++ pop {r2, r8} ++ ++ sub r0, #16 ++ add r6, r4, #32 ++ add r2, r2, #8 ++ sub r8, r8, r7, lsl #3 ++ add r0, r0, r3, lsl #3 ++ vld1.8 {d16}, [r2] ++ ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #16 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.8 {q8 }, [r1 :128] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ ldrb r12, [r2, r8, asr #8] ++ ++ vmov q12, q8 ++ add r8, r7 ++ sub r6, #32 ++ vext.8 q8, q8, q8, #15 ++ vmov.8 d16[0], r12 ++ ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vmull.u8 q1, d17, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmlal.u8 q0, d24, d30 ++ vmlal.u8 q1, d25, d30 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ ++ subs r5, #1 ++ vst1.8 {q0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q12}, [r1 :128]! @ Up ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vext.8 q12, q12, #1 ++ sub r6, #32 ++ vld1.8 {d25[7]}, [r1]! ++ ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmull.u8 q1, d25, d30 ++ vmlal.u8 q0, d16, d31 ++ vmlal.u8 q1, d17, d31 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.8 {q0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_32_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r10, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ bge 18f ++ ++ cmp r12, #10 ++ mov r10, #4 @ Outer loop counter for "hard" cases ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 ++2: ++ vld1.8 {d24}, [r1]! 
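++        @ Each outer pass draws a 32x8 strip as four 8x8 patches, then
++        @ rewinds 32 bytes and steps down 8 rows (r10 counts the 4 strips)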
++ add r6, r4, #32 @ Force initial load in main loop ++ mov r2, r1 ++ ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8 ++ ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++2: ++ vld1.8 {d16}, [r2] ++ add r6, r4, #32 ++ ++ push {r2, r8} ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8 ++ pop {r2, r8} ++ ++ sub r0, #32 ++ subs r10, #1 ++ add r2, r2, #8 ++ sub r8, r8, r7, lsl #3 ++ add r0, r0, r3, lsl #3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #32 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.8 {q8, q9 }, [r1 :128] @ Up ++ ldrh r7, [r7] ++ add r6, r4, #32 ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ ldrb r12, [r2, r8, asr #8] ++ ++ vmov q12, q8 ++ add r8, r7 ++ vmov q13, q9 ++ sub r6, #32 ++ vext.8 q9, q8, q9, #15 ++ vext.8 q8, q8, q8, #15 ++ vmov.8 d16[0], r12 ++ ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vmull.u8 q1, d17, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmull.u8 q2, d18, d31 ++ vmull.u8 q3, d19, d31 ++ vmlal.u8 q0, d24, d30 ++ vmlal.u8 q1, d25, d30 ++ vmlal.u8 q2, d26, d30 ++ vmlal.u8 q3, d27, d30 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vrshrn.u16 d2, q2, #5 ++ vrshrn.u16 d3, q3, #5 ++ ++ subs r5, #1 ++ vst1.8 {q0, q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q12, q13}, [r1 :128]! @ Up ++ add r6, r4, #32 @ Force initial load in main loop ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vmov q9, q13 ++ vext.8 q12, q13, #1 ++ vext.8 q13, q13, #1 ++ sub r6, #32 ++ vld1.8 {d27[7]}, [r1]! ++ ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmull.u8 q1, d25, d30 ++ vmull.u8 q2, d26, d30 ++ vmull.u8 q3, d27, d30 ++ vmlal.u8 q0, d16, d31 ++ vmlal.u8 q1, d17, d31 ++ vmlal.u8 q2, d18, d31 ++ vmlal.u8 q3, d19, d31 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vrshrn.u16 d2, q2, #5 ++ vrshrn.u16 d3, q3, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.8 {q0, q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++endfunc ++ ++@ Chroma 8 bit 4x4 patch fns ++ .text ++ ++patch_h_down_c_4x4_8: ++ mov r5, #4 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.16 d24, d24, #1 ++ sub r6, #32 ++ vld1.16 {d24[3]}, [r2]! ++ ++1: ++ vext.8 q0, q1, #8 ++ rsb r12, r6, #32 ++ vext.8 q1, q1, #8 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q14, d24, d30 ++ add r6, r4 ++ vmlal.u8 q14, d16, d31 ++ subs r5, #1 ++ vrshrn.u16 d3, q14, #5 ++ bne 2b ++ ++store_tran_c_4x4_8: ++ add r12, r0, r3 ++ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0 ]! 
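++        @ d0-d3 hold the patch as columns; each lane-interleaved vst4 here
++        @ writes one transposed output row of 4 chroma pairs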
++ add r5, r12, r3 ++ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12] ++ add r12, r12, r3, lsl #1 ++ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r5 ] ++ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12] ++ bx lr ++ ++patch_h_up_c_4x4_8: ++ mov r5, #4 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ If r8 is -ve then we are still tracking left ++ adds r8, r7 ++ vmov d24, d16 ++ @ Initially r2=left (variable), r1=up (const) ++ @ Use r2 for both up and left, we only ever go from left->up so ++ @ we assume that we are left and thenm overwrite with up if wanted ++ sub r2, #2 ++ addpl r2, r1, r8, asr #7 ++ vext.16 d16, d16, d16, #3 ++ @ We get *2 by >> 7 rather than 8, but that means we need to lose bit 0 ++ and r2, #~1 ++ sub r6, #32 ++ vld1.16 d16[0], [r2] ++1: ++ vdup.8 d31, r6 ++ vext.8 q0, q1, #8 ++ rsb r12, r6, #32 ++ vext.8 q1, q1, #8 ++ ++ vmull.u8 q14, d16, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmlal.u8 q14, d24, d30 ++ subs r5, #1 ++ vrshrn.u16 d3, q14, #5 ++ bne 2b ++ b store_tran_c_4x4_8 @ This will return ++ ++ ++@ ff_hevc_rpi_pred_angular_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 @ Force initial load in main loop ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.8 {d24}, [r2]! ++ bl patch_h_down_c_4x4_8 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++ vld1.8 {d16}, [r2] ++ bl patch_h_up_c_4x4_8 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #4 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.8 {d16}, [r1 :64] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ asr r12, r8, #8 ++ vmov d24, d16 ++ add r8, r7 ++ vext.16 d16, d16, #3 ++ add r12, r2, r12, lsl #1 ++ sub r6, #32 ++ vld1.16 {d16[0]}, [r12] ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmlal.u8 q0, d24, d30 ++ vrshrn.u16 d0, q0, #5 ++ ++ subs r5, #1 ++ vst1.8 {d0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q12}, [r1] @ Up + UR (only 64-bit aligned) ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vext.16 q12, q12, #1 ++ sub r6, #32 ++ ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmlal.u8 q0, d16, d31 ++ ++ vrshrn.u16 d0, q0, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.8 {d0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.8 {d24}, [r2]! 
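++        @ Keep the advanced left pointer in r1 (up is unused here) so the
++        @ lower pair of patches can restart the left traversal from it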
++ mov r1, r2 ++ ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8 ++ ++ sub r0, #16 ++ add r0, r0, r3, lsl #2 ++ vld1.8 {d24}, [r1]! ++ add r6, r4, #32 @ Force initial load in main loop ++ mov r2, r1 ++ ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++ vld1.8 {d16}, [r2] ++ ++ push {r2, r8} ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8 ++ pop {r2, r8} ++ ++ add r2, r2, #8 ++ sub r0, #16 ++ sub r8, r8, r7, lsl #2 ++ vld1.8 {d16}, [r2] ++ add r0, r0, r3, lsl #2 ++ add r6, r4, #32 ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #8 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.8 {q8 }, [r1 :128] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ asr r12, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ vext.16 q8, q8, #7 ++ add r12, r2, r12, lsl #1 ++ sub r6, #32 ++ vld1.16 {d16[0]}, [r12] ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vdup.8 d30, r12 ++ vmull.u8 q1, d17, d31 ++ add r6, r4 ++ vmlal.u8 q0, d24, d30 ++ vmlal.u8 q1, d25, d30 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ ++ subs r5, #1 ++ vst1.8 {q0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q12}, [r1 :128]! @ Up ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vext.16 q12, q12, #1 ++ sub r6, #32 ++ vld1.16 {d25[3]}, [r1]! ++ ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmull.u8 q1, d25, d30 ++ vmlal.u8 q0, d16, d31 ++ vmlal.u8 q1, d17, d31 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.8 {q0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r10, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ++ cmp r12, #18 ++ bge 18f ++ ++ cmp r12, #10 ++ mov r10, #4 @ Outer loop counter for "hard" cases ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 ++2: ++ vld1.8 {d24}, [r1]! 
++ add r6, r4, #32 @ Force initial load in main loop ++ mov r2, r1 ++ ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8 ++ ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ pop {r4-r10, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++2: ++ vld1.8 {d16}, [r2] ++ add r6, r4, #32 ++ ++ push {r2, r8} ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8 ++ pop {r2, r8} ++ ++ sub r0, #32 ++ subs r10, #1 ++ add r2, r2, #8 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ pop {r4-r10, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #16 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.8 {q8, q9 }, [r1 :128] @ Up ++ ldrh r7, [r7] ++ add r6, r4, #32 ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ asr r9, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ vmov q13, q9 ++ add r9, r2, r9, lsl #1 ++ vext.16 q9, q8, q9, #7 ++ sub r6, #32 ++ vext.16 q8, q8, q8, #7 ++ vld1.16 {d16[0]}, [r9] ++ ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vmull.u8 q1, d17, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmull.u8 q2, d18, d31 ++ vmull.u8 q3, d19, d31 ++ vmlal.u8 q0, d24, d30 ++ vmlal.u8 q1, d25, d30 ++ vmlal.u8 q2, d26, d30 ++ vmlal.u8 q3, d27, d30 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vrshrn.u16 d2, q2, #5 ++ vrshrn.u16 d3, q3, #5 ++ ++ subs r5, #1 ++ vst1.8 {q0, q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q12, q13}, [r1 :128]! @ Up ++ add r6, r4, #32 @ Force initial load in main loop ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vmov q9, q13 ++ vext.16 q12, q13, #1 ++ vext.16 q13, q13, #1 ++ sub r6, #32 ++ vld1.16 {d27[3]}, [r1]! ++ ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmull.u8 q1, d25, d30 ++ vmull.u8 q2, d26, d30 ++ vmull.u8 q3, d27, d30 ++ vmlal.u8 q0, d16, d31 ++ vmlal.u8 q1, d17, d31 ++ vmlal.u8 q2, d18, d31 ++ vmlal.u8 q3, d19, d31 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vrshrn.u16 d2, q2, #5 ++ vrshrn.u16 d3, q3, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.8 {q0, q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++endfunc ++ ++@------------------------------------------------------------------------------ ++@ Data ++ ++ .text ++ .balign 64 ++angle_2: ++ .byte 32 ++ .byte 26, 21, 17, 13, 9, 5, 2, 0 ++ @ Sign inverted from standards table ++ .byte 2, 5, 9, 13, 17, 21, 26, 32 ++ .byte 26, 21, 17, 13, 9, 5, 2, 0 ++ @ Standard sign ++ .byte 2, 5, 9, 13, 17, 21, 26, 32 ++ ++ @ Sign inverted from standards table ++inv_angle: ++ .short 4096, 1638, 910, 630, 482, 390, 315 ++ .short 256 ++ .short 315, 390, 482, 630, 910, 1638, 4096 ++ ++@------------------------------------------------------------------------------ ++@ ++@ 10 bit fns ++@ Should work for 9 & 11 bit as there is no actual bit-depth specific code ++@ but runs out of register width for 12+ bit ++ ++ .text ++ .balign 64 ++ ++patch_h_down_4x4_10: ++ mov r5, #4 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.16 d24, d24, #1 ++ sub r6, #32 ++ vld1.16 {d24[3]}, [r2]! 
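++        @ Blend: out = (cur*frac + prev*(32-frac) + 16) >> 5
++        @ 16-bit samples, so the weights go in d0 lanes (d0[0]=frac,
++        @ d0[2]=32-frac) for the multiply-by-scalar forms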
++ ++1: ++ rsb r12, r6, #32 ++ vext.16 q1, q2, #4 ++ vmov s0, r6 ++ vmov s1, r12 ++ vext.16 q2, q2, #4 ++ ++ vmul.u16 d1, d24, d0[0] ++ add r6, r4 ++ vmla.u16 d1, d16, d0[2] ++ subs r5, #1 ++ vrshr.u16 d5, d1, #5 ++ bne 2b ++ ++store_tran_4x4_10: ++ add r12, r0, r3 ++ vst4.16 {d2[0], d3[0], d4[0], d5[0]}, [r0 ]! ++ add r5, r12, r3 ++ vst4.16 {d2[1], d3[1], d4[1], d5[1]}, [r12] ++ add r12, r12, r3, lsl #1 ++ vst4.16 {d2[2], d3[2], d4[2], d5[2]}, [r5 ] ++ vst4.16 {d2[3], d3[3], d4[3], d5[3]}, [r12] ++ bx lr ++ ++patch_h_up_4x4_10: ++ mov r5, #4 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ If r8 is -ve then we are still tracking left ++ adds r8, r7 ++ vmov d24, d16 ++ @ Initially r2=left (variable), r1=up (const) ++ @ Use r2 for both up and left, we only ever go from left->up so ++ @ we assume that we are left and thenm overwrite with up if wanted ++ sub r2, #2 ++ addpl r2, r1, r8, asr #7 ++ vext.16 d16, d16, d16, #3 ++ @ We get *2 by >> 7 rather than 8, but that means we need to lose bit 0 ++ and r2, #~1 ++ sub r6, #32 ++ vld1.16 d16[0], [r2] ++ ++1: ++ rsb r12, r6, #32 ++ vext.16 q1, q2, #4 ++ vmov s0, r6 ++ vmov s1, r12 ++ vext.16 q2, q2, #4 ++ ++ vmul.u16 d1, d24, d0[2] ++ add r6, r4 ++ vmla.u16 d1, d16, d0[0] ++ subs r5, #1 ++ vrshr.u16 d5, d1, #5 ++ bne 2b ++ b store_tran_4x4_10 @ This will return ++ ++ ++@ ff_hevc_rpi_pred_angular_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_4_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #1 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 @ Force initial load in main loop ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.16 {d24}, [r2]! 
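++        @ A single 4x4 patch covers the whole block - no outer loop needed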
++ bl patch_h_down_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++ vld1.16 {d16}, [r2] ++ bl patch_h_up_4x4_10 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #4 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.16 {d16}, [r1] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ asr r12, r8, #8 ++ vmov d24, d16 ++ add r8, r7 ++ add r12, r2, r12, lsl #1 ++ sub r6, #32 ++ vext.16 d16, d16, #3 ++ vld1.16 {d16[0]}, [r12] ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 d2, d16, d0[2] ++ vmla.u16 d2, d24, d0[0] ++ vrshr.u16 d2, #5 ++ ++ subs r5, #1 ++ vst1.16 {d2 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {d24, d25}, [r1 :64] @ Up + UR (64bit aligned) ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.16 q12, q13, #1 ++ sub r6, #32 ++ ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 d2, d24, d0[0] ++ vmla.u16 d2, d16, d0[2] ++ vrshr.u16 d2, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.16 {d2 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_8_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #1 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.16 {d24}, [r2]! ++ mov r1, r2 ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10 ++ ++ vld1.16 {d24}, [r1]! 
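++        @ Second band: back up 16 bytes (8 pels) to the left edge and
++        @ step down 4 rows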
++ sub r0, #16 ++ add r6, r4, #32 @ Force initial load in main loop ++ add r0, r0, r3, lsl #2 ++ mov r2, r1 ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++ vld1.16 {d16}, [r2] ++ ++ push {r2, r8} ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10 ++ pop {r2, r8} ++ ++ sub r0, #16 ++ add r2, #8 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ vld1.16 {d16}, [r2] ++ add r6, r4, #32 ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #8 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.16 {q8 }, [r1] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ asr r12, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ add r12, r2, r12, lsl #1 ++ sub r6, #32 ++ vext.16 q8, q8, q8, #7 ++ vld1.16 {d16[0]}, [r12] ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 q1, q8, d0[2] ++ vmla.u16 q1, q12, d0[0] ++ vrshr.u16 q1, #5 ++ ++ subs r5, #1 ++ vst1.16 {q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q12, q13}, [r1 :128] @ Up + UR ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vext.16 q12, q13, #1 ++ sub r6, #32 ++ vext.16 q13, q13, #1 ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 q1, q12, d0[0] ++ vmla.u16 q1, q8, d0[2] ++ vrshr.u16 q1, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.16 {q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_16_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r10, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #1 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ bge 18f ++ ++ cmp r12, #10 ++ mov r10, #4 @ Outer loop counter for "hard" cases ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 ++2: ++ vld1.16 {d24}, [r1]! 
++ add r6, r4, #32 @ Force initial load in main loop ++ mov r2, r1 ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10 ++ ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ pop {r4-r10, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++2: ++ vld1.16 {d16}, [r2] ++ add r6, r4, #32 ++ ++ push {r2, r8} ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10 ++ pop {r2, r8} ++ ++ sub r0, #32 ++ subs r10, #1 ++ add r2, #8 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ pop {r4-r10, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #16 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.16 {q8, q9}, [r1] @ Up ++ ldrh r7, [r7] ++ add r6, r4, #32 ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ asr r9, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ vmov q13, q9 ++ add r9, r2, r9, lsl #1 ++ sub r6, #32 ++ vext.16 q9, q8, q9, #7 ++ vext.16 q8, q8, q8, #7 ++ vld1.16 {d16[0]}, [r9] ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 q1, q8, d0[2] ++ vmul.u16 q2, q9, d0[2] ++ vmla.u16 q1, q12, d0[0] ++ vmla.u16 q2, q13, d0[0] ++ ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ ++ subs r5, #1 ++ vst1.16 {q1, q2 }, [r0], r3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q12, q13}, [r1 :128]! @ Up ++ add r6, r4, #32 @ Force initial load in main loop ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vmov q9, q13 ++ vext.16 q12, q13, #1 ++ vext.16 q13, q13, #1 ++ sub r6, #32 ++ vld1.16 {d27[3]}, [r1]! ++ ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 q1, q12, d0[0] ++ vmul.u16 q2, q13, d0[0] ++ vmla.u16 q1, q8, d0[2] ++ vmla.u16 q2, q9, d0[2] ++ ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.16 {q1, q2 }, [r0], r3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_32_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r10, lr} ++ vpush {q4 } ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #1 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ bge 18f ++ ++ cmp r12, #10 ++ mov r10, #8 @ Outer loop counter for "hard" cases ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 ++2: ++ vld1.16 {d24}, [r1]! 
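++        @ 32 pels = eight 4x4 patches across (inner loop, r9) and eight
++        @ 4-row bands down (outer loop, r10)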
++ add r6, r4, #32 @ Force initial load in main loop ++ mov r2, r1 ++ mov r9, #4 ++1: ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10 ++ subs r9, #1 ++ bne 1b ++ ++ sub r0, #64 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ b 99f ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++2: ++ vld1.16 {d16}, [r2] ++ add r6, r4, #32 ++ ++ push {r2, r8} ++ mov r9, #4 ++1: ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10 ++ subs r9, #1 ++ bne 1b ++ pop {r2, r8} ++ ++ sub r0, #64 ++ subs r10, #1 ++ add r2, #8 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ b 99f ++ ++18: ++ cmp r12, #26 ++ mov r5, #32 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vldm r1, {q8-q11} @ Up ++ ldrh r7, [r7] ++ add r6, r4, #32 ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ asr r9, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ vmov q13, q9 ++ add r9, r2, r9, lsl #1 ++ vmov q14, q10 ++ vmov q15, q11 ++ sub r6, #32 ++ vext.16 q11, q10, q11, #7 ++ vext.16 q10, q9, q10, #7 ++ vext.16 q9, q8, q9, #7 ++ vext.16 q8, q8, q8, #7 ++ vld1.16 {d16[0]}, [r9] ++ ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 q1, q8, d0[2] ++ vmul.u16 q2, q9, d0[2] ++ vmul.u16 q3, q10, d0[2] ++ vmul.u16 q4, q11, d0[2] ++ vmla.u16 q1, q12, d0[0] ++ vmla.u16 q2, q13, d0[0] ++ vmla.u16 q3, q14, d0[0] ++ vmla.u16 q4, q15, d0[0] ++ ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ vrshr.u16 q3, #5 ++ vrshr.u16 q4, #5 ++ ++ subs r5, #1 ++ vstm r0, {q1-q4} ++ add r0, r3 ++ bne 2b ++ b 99f ++ ++@ Right of vertical - works along top - left unused ++26: ++ vldm r1, {q12-q15} @ Up ++ add r6, r4, #32 @ Force initial load in main loop ++ add r1, #64 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vmov q9, q13 ++ vmov q10, q14 ++ vmov q11, q15 ++ vext.16 q12, q13, #1 ++ vext.16 q13, q14, #1 ++ vext.16 q14, q15, #1 ++ vext.16 q15, q15, #1 ++ sub r6, #32 ++ vld1.16 {d31[3]}, [r1]! ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 q1, q12, d0[0] ++ vmul.u16 q2, q13, d0[0] ++ vmul.u16 q3, q14, d0[0] ++ vmul.u16 q4, q15, d0[0] ++ vmla.u16 q1, q8, d0[2] ++ vmla.u16 q2, q9, d0[2] ++ vmla.u16 q3, q10, d0[2] ++ vmla.u16 q4, q11, d0[2] ++ ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ vrshr.u16 q3, #5 ++ vrshr.u16 q4, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vstm r0, {q1-q4} ++ add r0, r3 ++ bne 2b ++99: ++ vpop {q4 } ++ pop {r4-r10, pc} ++ ++endfunc ++ ++ ++ ++@ Generate 4x4 chroma patch ++@ ++@ In (const) ++@ r1 Up ptr (_up only) ++@ r3 Out stride ++@ r4 Angle add ++@ r7 Inv angle (_up only) ++@ ++@ In/Out (updated) ++@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) ++@ r2 Left ptr - updated ++@ r6 Angle frac (init to r4 + 32) ++@ r8 Inv angle accumulator ++@ q2 Cur Line - load before 1st call for down - set by _up ++@ q8 Cur Line - load before 1st call for up - set by _down ++@ ++@ Temps ++@ r5 Loop counter ++@ r12 ++@ d0, q1, q12-q15 ++ ++patch_h_down_c_4x4_10: ++ mov r5, #4 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q2 ++ vext.32 q2, q2, #1 ++ sub r6, #32 ++ vld1.32 {d5[1]}, [r2]! 
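++        @ Result rows are pipelined q15->q14->q13->q12 so that after 4
++        @ iterations all four are ready for the transposing store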
++1: ++ rsb r12, r6, #32 ++ vmov q12, q13 ++ vmov s0, r6 ++ vmov s1, r12 ++ vmov q13, q14 ++ ++ vmul.u16 q3, q2, d0[0] ++ add r6, r4 ++ vmla.u16 q3, q8, d0[2] ++ vmov q14, q15 ++ subs r5, #1 ++ vrshr.u16 q15, q3, #5 ++ bne 2b ++ ++store_tran_c_4x4_10: ++ add r12, r0, r3 ++ vst4.32 {d24[0], d26[0], d28[0], d30[0]}, [r0 ]! ++ add r5, r12, r3 ++ vst4.32 {d24[1], d26[1], d28[1], d30[1]}, [r12] ++ add r12, r12, r3, lsl #1 ++ vst4.32 {d25[0], d27[0], d29[0], d31[0]}, [r5 ] ++ vst4.32 {d25[1], d27[1], d29[1], d31[1]}, [r12] ++ bx lr ++ ++patch_h_up_c_4x4_10: ++ mov r5, #4 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ If r8 is -ve then we are still tracking left ++ adds r8, r7 ++ vmov q2, q8 ++ @ Initially r2=left (variable), r1=up (const) ++ @ Use r2 for both up and left, we only ever go from left->up so ++ @ we assume that we are left and thenm overwrite with up if wanted ++ sub r2, #4 ++ addpl r2, r1, r8, asr #6 ++ vext.32 q8, q8, #3 ++ @ We get *4 by >> 6 rather than 8, but that means we need to lose bits 0 & 1 ++ and r2, #~3 ++ sub r6, #32 ++ vld1.32 d16[0], [r2] ++1: ++ rsb r12, r6, #32 ++ vmov q12, q13 ++ vmov s0, r6 ++ vmov s1, r12 ++ vmov q13, q14 ++ ++ vmul.u16 q1, q2, d0[2] ++ add r6, r4 ++ vmla.u16 q1, q8, d0[0] ++ vmov q14, q15 ++ subs r5, #1 ++ vrshr.u16 q15, q1, #5 ++ bne 2b ++ b store_tran_c_4x4_10 @ This will return ++ ++ ++ ++@ ff_hevc_rpi_pred_angular_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.32 {q2 }, [r2]! 
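++        @ q2 holds 4 left chroma (U,V) pairs; again one patch is the
++        @ whole 4x4 chroma block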
++ bl patch_h_down_c_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++ vld1.32 {q8 }, [r2] ++ bl patch_h_up_c_4x4_10 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #4 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.16 {q8 }, [r1] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ asr r12, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ vext.32 q8, q8, q8, #3 ++ add r12, r2, r12, lsl #2 ++ sub r6, #32 ++ vld1.32 {d16[0]}, [r12] ++ ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 q1, q8, d0[2] ++ vmla.u16 q1, q12, d0[0] ++ vrshr.u16 q1, #5 ++ ++ subs r5, #1 ++ vst1.16 {q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q12, q13}, [r1] @ Up + UR ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vext.32 q12, q13, #1 ++ vext.32 q13, q13, #1 ++ sub r6, #32 ++ ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 q1, q12, d0[0] ++ vmla.u16 q1, q8, d0[2] ++ vrshr.u16 q1, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.16 {q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 @ Force initial load in main loop ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.32 {q2 }, [r2]! ++ mov r1, r2 ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10 ++ ++ vld1.32 {q2 }, [r1]! ++ sub r0, #32 ++ add r6, r4, #32 @ Force initial load in main loop ++ add r0, r0, r3, lsl #2 ++ mov r2, r1 ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++ vld1.32 {q8 }, [r2] ++ ++ push {r2, r8} ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10 ++ pop {r2, r8} ++ ++ sub r0, #32 ++ add r2, #16 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ vld1.32 {q8 }, [r2] ++ add r6, r4, #32 ++ ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #8 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.16 {q8, q9 }, [r1] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q12, q8 ++ asr r12, r8, #8 ++ vmov q13, q9 ++ add r8, r7 ++ vext.32 q9, q8, q9, #3 ++ add r12, r2, r12, lsl #2 ++ vext.32 q8, q8, q8, #3 ++ sub r6, #32 ++ vld1.32 {d16[0]}, [r12] ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 q1, q8, d0[2] ++ vmul.u16 q2, q9, d0[2] ++ vmla.u16 q1, q12, d0[0] ++ vmla.u16 q2, q13, d0[0] ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ ++ subs r5, #1 ++ vst1.16 {q1, q2 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q12, q13}, [r1]! 
@ Up ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vmov q9, q13 ++ vext.32 q12, q13, #1 ++ vext.32 q13, q14, #1 ++ sub r6, #32 ++ vld1.32 {d27[1]}, [r1]! ++ ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 q1, q12, d0[0] ++ vmul.u16 q2, q13, d0[0] ++ vmla.u16 q1, q8, d0[2] ++ vmla.u16 q2, q9, d0[2] ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.16 {q1, q2 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r10, lr} ++ vpush {q4 } ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ bge 18f ++ ++ cmp r12, #10 ++ mov r10, #4 @ Outer loop counter for "hard" cases ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 ++2: ++ vld1.32 {q2 }, [r1]! ++ add r6, r4, #32 @ Force initial load in main loop ++ mov r2, r1 ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10 ++ ++ sub r0, #64 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ b 99f ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++2: ++ vld1.32 {q8 }, [r2] ++ add r6, r4, #32 ++ ++ push {r2, r8} ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10 ++ pop {r2, r8} ++ ++ sub r0, #64 ++ subs r10, #1 ++ add r2, #16 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ b 99f ++ ++18: ++ cmp r12, #26 ++ mov r5, #16 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vldm r1, {q8-q11} @ Up ++ ldrh r7, [r7] ++ add r6, r4, #32 ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ asr r9, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ vmov q13, q9 ++ add r9, r2, r9, lsl #2 ++ vmov q14, q10 ++ vmov q15, q11 ++ vext.32 q11, q10, q11, #3 ++ vext.32 q10, q9, q10, #3 ++ vext.32 q9, q8, q9, #3 ++ vext.32 q8, q8, q8, #3 ++ sub r6, #32 ++ vld1.32 {d16[0]}, [r9] ++ ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 q1, q8, d0[2] ++ vmul.u16 q2, q9, d0[2] ++ vmul.u16 q3, q10, d0[2] ++ vmul.u16 q4, q11, d0[2] ++ vmla.u16 q1, q12, d0[0] ++ vmla.u16 q2, q13, d0[0] ++ vmla.u16 q3, q14, d0[0] ++ vmla.u16 q4, q15, d0[0] ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ vrshr.u16 q3, #5 ++ vrshr.u16 q4, #5 ++ ++ subs r5, #1 ++ vstm r0, {q1-q4} ++ add r0, r3 ++ bne 2b ++ b 99f ++ ++@ Right of vertical - works along top - left unused ++26: ++ vldm r1, {q12-q15} @ Up ++ add r6, r4, #32 @ Force initial load in main loop ++ add r1, #64 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vmov q9, q13 ++ vmov q10, q14 ++ vmov q11, q15 ++ vext.32 q12, q13, #1 ++ vext.32 q13, q14, #1 ++ vext.32 q14, q15, #1 ++ vext.32 q15, q15, #1 ++ sub r6, #32 ++ vld1.32 {d31[1]}, [r1]! 
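++        @ q8-q11 keep the previous reference row, q12-q15 the row shifted
++        @ by one chroma pair and topped up from the up-right samples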
++ ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 q1, q12, d0[0] ++ vmul.u16 q2, q13, d0[0] ++ vmul.u16 q3, q14, d0[0] ++ vmul.u16 q4, q15, d0[0] ++ vmla.u16 q1, q8, d0[2] ++ vmla.u16 q2, q9, d0[2] ++ vmla.u16 q3, q10, d0[2] ++ vmla.u16 q4, q11, d0[2] ++ ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ vrshr.u16 q3, #5 ++ vrshr.u16 q4, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vstm r0, {q1-q4} ++ add r0, r3 ++ bne 2b ++99: ++ vpop {q4 } ++ pop {r4-r10, pc} ++ ++endfunc ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S +new file mode 100644 +index 0000000000..af7ba1f45e +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S +@@ -0,0 +1,682 @@ ++/* ++ * Copyright (c) 2017 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++ ++@ ff_hevc_rpi_pred_dc_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_4_neon_8, export=1 ++ ++ @ Average the els of top & left ++ ldr r2, [r2] ++ vld1.32 {d0[0]}, [r1] ++ mov r1, #2 ++ vmov s1, r2 ++ vmov s2, r2 ++ vmov.i16 q2, #3 ++ add r2, r0, r3 ++ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0] ++ lsl r3, #1 ++ vmovl.u8 q0, d0 ++ vmov.i64 d7, #0xffff ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
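++        @ q2 = smoothing weights {2, 3, 3, ...}: the corner pel gets 2*dc,
++        @ the other edge pels 3*dc before the final >> 2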
++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) ++ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vmov.i64 d7, #0xff ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #3 ++ vmla.i16 q0, q2, d6[0] ++ vdup.8 d6, d6[0] ++ vrshrn.i16 d0, q0, #2 ++ ++ @ Store top line ++ vst1.32 {d0[0]}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d1, d0, #5*8 ++ vshr.u64 d2, d0, #6*8 ++ vshr.u64 d3, d0, #7*8 ++ vbif d1, d6, d7 ++ vbif d2, d6, d7 ++ vst1.32 {d1[0]}, [r2], r3 ++ vbif d3, d6, d7 ++ vst1.32 {d2[0]}, [r0] ++ vst1.32 {d3[0]}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {d0}, [r1] ++ vld1.8 {d1}, [r2] ++A add r2, r0, r3, lsl #1 ++A lsl r3, #2 ++T lsl r3, #1 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vaddl.u8 q0, d0, d1 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d2, d0, d0 @ This adds U & V separately ++ vpadd.i32 d3, d0, d0 ++ vrshrn.u16 d0, q1, #3 ++ ++ @ Store ++ vst1.8 {d0}, [r0], r3 ++ vst1.8 {d0}, [r2], r3 ++ vst1.8 {d0}, [r0] ++ vst1.8 {d0}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_8_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {d0}, [r1] ++ mov r1, #2 ++ vld1.8 {d16}, [r2] ++ vmov.i16 q2, #3 ++ vmov.i64 d7, #0xffff ++ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0] ++ vmovl.u8 q0, d0 ++ vadd.i16 d6, d2, d3 @ d6 has 4 vals ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vmov.i64 d7, #0xff ++ vmovl.u8 q1, d16 ++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #4 ++ vmla.i16 q1, q2, d6[0] ++ vmla.i16 q0, q2, d6[0] ++ vdup.8 d6, d6[0] ++ vrshrn.i16 d2, q1, #2 ++ vrshrn.i16 d0, q0, #2 ++ ++ @ Store top line ++ vst1.8 {d0}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d2, #8 ++ vbit d6, d2, d7 ++ vshr.u64 d2, #8 ++ vst1.8 {d6}, [r0], r3 ++ mov r1, #6 ++1: ++ vbit d6, d2, d7 ++ vshr.u64 d2, #8 ++ vst1.8 {d6}, [r0], r3 ++ subs r1, #2 ++ vbit d6, d2, d7 ++ vshr.u64 d2, #8 ++ vst1.8 {d6}, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q8 }, [r1] ++ vld1.8 {q12}, [r2] ++ vaddl.u8 q0, d16, d17 ++ vaddl.u8 q2, d24, d25 ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ mov r1, #4 ++ vpadd.i32 d0, d0 @ This add U & V separately ++ lsl r3, #1 @ pels ++ vrshrn.u16 d0, q0, #4 ++ vdup.u16 q0, d0[0] @ Dup results ++ ++ @ Store ++1: ++ vst1.8 {q0 }, [r0], r3 ++ subs r1, #1 ++ vst1.8 {q0 }, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_16_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 { q8}, [r1] ++ vld1.8 {q12}, [r2] ++ vaddl.u8 q0, d16, d24 ++ vaddl.u8 q2, d17, d25 ++ vmov.u16 r1, d0[0] @ r1 = top[0] + left[0] ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ vpadd.i16 d0, d0 @ 1 (all the same) ++ vrshr.u16 d0, #5 ++ ++ vmov.i64 d31, #0xff ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + dc * 2) ++ ++ vmov.u16 r12, d0[0] @ dc ++ add r2, r12, r12, lsl #1 @ dc*3 ++ add r1, r1, r12, lsl #1 @ top[0] + left[0] + dc*2 ++ ++ vdup.u16 q3, r2 ++ vaddw.u8 q1, q3, d16 ++ vaddw.u8 q2, q3, d17 ++ vmov.u16 d2[0], r1 ++ vrshrn.u16 d2, q1, #2 ++ vrshrn.u16 d3, q2, #2 ++ ++ @ Construct lhs pels ++ vaddw.u8 q2, q3, d24 ++ vaddw.u8 q3, q3, d25 ++ vrshrn.u16 d4, q2, #2 ++ vrshrn.u16 d5, q3, #2 ++ ++ @ Store top line ++ vst1.8 { q1}, [r0], r3 ++ ++ mov r1, #15 ++ vdup.u8 q0, d0[0] ++ ++1: ++ vext.8 q2, q2, #1 ++ vbit d0, d4, d31 ++ subs r1, #1 ++ vst1.8 { q0}, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 { q8, q9}, [r1] ++ vld1.8 {q12,q13}, [r2] ++ vaddl.u8 q0, d16, d17 ++ vaddl.u8 q1, d18, d19 ++ vaddl.u8 q2, d24, d25 ++ vaddl.u8 q3, d26, d27 ++ vadd.i16 q0, q1 ++ vadd.i16 q2, q3 ++ vadd.i16 q0, q2 ++ lsl r3, #1 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ mov r1, #4 ++ vpadd.i32 d0, d0 @ This add U & V separately ++ add r2, r0, r3 ++ vmov d1, d0 ++ lsl r3, #1 ++ vrshrn.u16 d0, q0, #5 ++ vmov d1, d0 @ Dup results ++ vmov q1, q0 ++ ++ @ Store ++1: ++ vst1.8 { q0, q1}, [r0], r3 ++ 
vst1.8 { q0, q1}, [r2], r3 ++ subs r1, #1 ++ vst1.8 { q0, q1}, [r0], r3 ++ vst1.8 { q0, q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_32_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q8, q9 }, [r1] ++ vld1.8 {q12, q13}, [r2] ++ vaddl.u8 q0, d16, d17 ++ vaddl.u8 q1, d18, d19 ++ vaddl.u8 q2, d24, d25 ++ vaddl.u8 q3, d26, d27 ++ vadd.i16 q0, q1 ++ vadd.i16 q2, q3 ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ mov r1, #8 ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ add r2, r0, r3 ++ vpadd.i16 d0, d0 @ 1 (all the same) ++ lsl r3, #1 ++ vrshrn.u16 d0, q0, #6 ++ vdup.u8 q1, d0[0] @ Dup results ++ vdup.u8 q0, d0[0] ++ ++ @ Store ++1: ++ vst1.8 {q0, q1 }, [r0], r3 ++ vst1.8 {q0, q1 }, [r2], r3 ++ subs r1, #1 ++ vst1.8 {q0, q1 }, [r0], r3 ++ vst1.8 {q0, q1 }, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ----------------------------------------------------------------------------- ++@ ++@ 10 Bit versions ++@ ++@ There is no actual bit depth dependency in this code except that our ++@ intermediate results will overflow the 16 bits they are stored in ++@ All there functions are good to 10 bits - with the worst case being ++@ in dc_32 where we use all 16 bits. ++ ++ ++@ ff_hevc_rpi_pred_dc_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_4_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {d0}, [r1] ++ mov r1, #2 ++ vld1.16 {d1}, [r2] ++T lsl r3, #1 ++ vmov.i16 q2, #3 ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0] ++A lsl r3, #2 ++T lsl r3, #1 ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vmov.i64 d7, #0xffff ++ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #3 ++ vmla.i16 q0, q2, d6[0] ++ vrshr.u16 q0, #2 ++ ++ @ Store top line ++ vst1.16 {d0}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d3, d1, #1*16 ++ vshr.u64 d4, d1, #2*16 ++ vshr.u64 d5, d1, #3*16 ++ vbif d3, d6, d7 ++ vbif d4, d6, d7 ++ vst1.16 {d3}, [r2], r3 ++ vbif d5, d6, d7 ++ vst1.16 {d4}, [r0] ++ vst1.16 {d5}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q0}, [r1] ++ vld1.8 {q1}, [r2] ++A add r2, r0, r3, lsl #2 ++A lsl r3, #3 ++T lsl r3, #2 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vadd.i16 q0, q1 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d2, d0, d0 @ This adds U & V separately ++ vpadd.i32 d3, d0, d0 ++ vrshr.u16 q0, q1, #3 ++ ++ vst1.16 {q0}, [r0], r3 ++ vst1.16 {q0}, [r2], r3 ++ vst1.16 {q0}, [r0] ++ vst1.16 {q0}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_8_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {q0}, [r1] ++ mov r1, #2 ++ vld1.16 {q8}, [r2] ++T lsl r3, #1 ++ vmov.i16 q2, #3 ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0] ++A lsl r3, #2 ++T lsl r3, #1 ++ vmov.i64 d7, #0xffff ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vadd.i16 d6, d2, d3 @ d6 has 4 vals ++ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #4 ++ vmla.i16 q8, q2, d6[0] ++ vmla.i16 q0, q2, d6[0] ++ vdup.16 q2, d6[0] ++ vdup.16 q9, d6[0] ++ vrshr.u16 q8, q8, #2 ++ vrshr.u16 q0, q0, #2 ++ vext.16 q1, q8, q8, #1 ++ ++ @ Store top line ++ vst1.16 {q0}, [r0], r3 ++ ++ @ Store the rest ++ vbit d18, d2, d7 ++ vst1.16 {q9}, [r2], r3 ++ mov r1, #6 ++1: ++ vext.16 q8, q8, q8, #2 ++ subs r1, #2 ++ vext.16 q1, q1, q1, #2 ++ vbit d4, d16, d7 ++ vst1.16 {q2}, [r0], r3 ++ vbit d18, d2, d7 ++ vst1.16 {q9}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1 ++ vld1.8 { q8, q9 }, [r1] ++ vld1.8 {q12, q13}, [r2] ++ vadd.i16 q8, q9 ++ vadd.i16 q12, q13 ++ vadd.i16 q8, q12 ++ vadd.i16 d16, d17 @ d16 has 2 pairs ++ mov r1, #4 ++ vpadd.i32 d16, d16 ++ lsl r3, #2 @ stride in pels ++ vrshr.u16 d16, #4 ++ vdup.u32 q9, d16[0]; ++ vdup.u32 q8, d16[0]; ++ ++ @ Store ++1: ++ vst1.16 {q8, q9 }, [r0], r3 ++ subs r1, #1 ++ vst1.16 {q8, q9 }, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_16_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {q8, q9 }, [r1] ++ vld1.16 {q12, q13}, [r2] ++ lsl r3, #1 @ stride given in pels ++ vadd.u16 q0, q8, q12 ++ vadd.u16 q2, q9, q13 ++ vmov.u16 r1, d0[0] @ r1 = top[0] + left[0] ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ vpadd.i16 d0, d0 @ 1 (all the same) ++ vrshr.u16 d0, #5 ++ ++ vmov.i64 d31, #0xffff ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + dc * 2) ++ ++ vmov.u16 r12, d0[0] @ dc ++ add r2, r12, r12, lsl #1 @ dc*3 ++ add r1, r1, r12, lsl #1 @ top[0] + left[0] + dc*2 ++ ++ vdup.u16 q3, r2 ++ vadd.u16 q8, q3 ++ vadd.u16 q9, q3 ++ vmov.u16 d16[0], r1 ++ vrshr.u16 q8, #2 ++ vrshr.u16 q9, #2 ++ ++ @ Construct lhs pels ++ vadd.u16 q12, q3 ++ vadd.u16 q13, q3 ++ vrshr.u16 q12, #2 ++ vrshr.u16 q13, #2 ++ ++ @ Store top line ++ vst1.16 {q8, q9 }, [r0], r3 ++ ++ mov r1, #15 ++ vdup.u16 q1, d0[0] ++ vdup.u16 q0, d0[0] ++ ++1: ++ vext.16 q12, q13, #1 ++ vext.16 q13, q13, #1 ++ vbit d0, d24, d31 ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vldm r1, { q8-q11} ++ vldm r2, {q12-q15} ++ vadd.i16 q8, q9 ++ vadd.i16 q10, q11 ++ vadd.i16 q12, q13 ++ vadd.i16 q14, q15 ++ vadd.i16 q8, q10 ++ vadd.i16 q12, q14 ++ vadd.i16 q8, q12 ++ vadd.i16 d16, d17 @ d16 has 2 pairs ++ mov r1, #8 ++ vpadd.i32 d16, d16 ++ lsl r3, #2 @ stride in pels ++ vrshr.u16 d16, #5 ++ vmov d17, d16 @ Dup results ++ vmov q9, q8 ++ vmov q10, q8 ++ vmov 
q11, q8 ++ ++ @ Store ++1: ++ vstm r0, {q8-q11} ++ add r0, r3 ++ subs r1, #1 ++ vstm r0, {q8-q11} ++ add r0, r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels) ++ ++function ff_hevc_rpi_pred_dc_32_neon_10, export=1 ++ ++ @ Average the els of top & left ++ @ With 10 bits we are (just) safe from overflow in i16 ++ vldm r1, { q8-q11} ++ vldm r2, {q12-q15} ++ vadd.i16 q8, q9 ++ vadd.i16 q10, q11 ++ vadd.i16 q12, q13 ++ vadd.i16 q14, q15 ++ vadd.i16 q8, q10 ++ vadd.i16 q12, q14 ++ vadd.i16 q8, q12 ++ vadd.i16 d16, d17 @ d16 has 4 vals ++ mov r1, #16 ++ vpadd.i16 d16, d16 @ 2 (top & bottom the same) ++ lsl r3, #1 @ stride in pels ++ vpadd.i16 d16, d16 @ 1 (all the same) ++ vrshr.u16 d16, #6 ++ vmov d17, d16 @ Dup results ++ vmov q9, q8 ++ vmov q10, q8 ++ vmov q11, q8 ++ ++ @ Store ++1: ++ vstm r0, { q8-q11} ++ add r0, r3 ++ subs r1, #1 ++ vstm r0, { q8-q11} ++ add r0, r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S +new file mode 100644 +index 0000000000..ccf13a081f +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S +@@ -0,0 +1,888 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * Horizontal & Vertical special cases of angular intra pred ++ * ++ * Split out because: ++ * Vertical, at least, is relatively common ++ * Much simpler code than the general angular case ++ * Luma with size < 32 has extra filtering that doesn't happen anywhere else ++ * ++ * *** Currently luma filtering is mandatory where it occurs, but there are ++ * cases where it should be turned off (rdpcm & an extension sps flag). 
++ * These don't occur in the standard conformance suite for Main Profile ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ ff_hevc_rpi_pred_vertical_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_4_neon_8, export=1 ++ vld1.32 {d0[0] }, [r1 :32] @ Up ++ ldrb r12, [r2, #-1] @ Up-left ++ vld1.32 {d24[0]}, [r2 :32] @ left ++ ++ vdup.8 d4, r12 ++ vmov.u8 d6, #128 ++ vhsub.u8 d24, d4 ++ ++ veor.8 d2, d0, d6 @ Make -128,127 so we can qadd ++ mov r1, #4 ++ vdup.8 d2, d2[0] ++ vqadd.s8 d24, d2 ++ vmov.i64 d4, #0xff ++ veor.8 d24, d6 ++ ++1: ++ vbit.8 d0, d24, d4 ++ vext.8 d24, d24, #1 ++ subs r1, #1 ++ vst1.32 {d0[0] }, [r0 :32], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_8_neon_8, export=1 ++ vld1.8 {d0 }, [r1 :64] @ Up ++ ldrb r12, [r2, #-1] @ Up-left ++ vld1.8 {d24}, [r2 :64] @ left ++ ++ vdup.8 d4, r12 ++ vmov.u8 d6, #128 ++ vhsub.u8 d24, d4 ++ ++ veor.8 d2, d0, d6 @ Make -128,127 so we can qadd ++ mov r1, #8 ++ vdup.8 d2, d2[0] ++ vqadd.s8 d24, d2 ++ vmov.i64 d4, #0xff ++ veor.8 d24, d6 ++ ++1: ++ vbit.8 d0, d24, d4 ++ vext.8 d24, d24, #1 ++ subs r1, #1 ++ vst1.8 {d0 }, [r0 :64], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_16_neon_8, export=1 ++ vld1.8 {q0 }, [r1 :128] @ Up ++ ldrb r12, [r2, #-1] @ Up-left ++ vld1.8 {q12}, [r2 :128] @ left ++ ++ vdup.8 q2, r12 ++ vmov.u8 q3, #128 ++ vhsub.u8 q12, q2 ++ ++ veor.8 d2, d0, d6 @ Make -128,127 so we can qadd ++ vdup.8 q1, d2[0] ++ vqadd.s8 q12, q1 ++ veor.8 q12, q3 ++ ++ vmov.i64 d4, #0xff ++ mov r1, #16 ++1: ++ vbit.8 d0, d24, d4 ++ vext.8 q12, q12, #1 ++ subs r1, #1 ++ vst1.8 {q0 }, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vert_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_32_neon_8, export=1 ++ vld1.8 {q0, q1 }, [r1 :128] @ Up ++ add r2, r0, r3 ++ lsl r3, #1 ++ mov r1, #16 ++1: ++ vst1.8 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.8 {q0, q1 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1 ++ vld1.16 {d0 }, [r1 :64] @ Up ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ ++ vst1.16 {d0 }, [r0 :64], r3 ++ vst1.16 {d0 }, [r2 :64], r3 ++ vst1.16 {d0 }, [r0 :64] ++ vst1.16 {d0 }, [r2 :64] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1 ++ vld1.16 {q0 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ mov r1, #4 ++1: ++ vst1.16 {q0 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q0 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ 
++function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1 ++ vld1.16 {q0, q1 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ mov r1, #8 ++1: ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontalal_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++@ ? Might be faster as simple arm ++ ++function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1 ++ vld1.32 {d0[0] }, [r1 :32] @ Up ++ ldrb r12, [r2, #-1] @ Up-left ++ vld1.32 {d16[0]}, [r2 :32] @ left ++ ++ vdup.8 d4, r12 ++ vmov.u8 d6, #128 ++ vhsub.u8 d0, d4 ++ ++ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd ++ add r2, r0, r3 ++ vdup.8 d2, d2[0] ++ lsl r3, #1 ++ vqadd.s8 d0, d2 ++ veor.8 d0, d6 ++ ++ vdup.8 d1, d16[1] ++ vdup.8 d2, d16[2] ++ vdup.8 d3, d16[3] ++ vst1.32 {d0[0] }, [r0 :32], r3 ++ vst1.32 {d1[0] }, [r2 :32], r3 ++ vst1.32 {d2[0] }, [r0 :32] ++ vst1.32 {d3[0] }, [r2 :32] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1 ++ vld1.8 {d0 }, [r1 :64] @ Up ++ ldrb r12, [r2, #-1] @ Up-left ++ vld1.8 {d16}, [r2 :64] @ left ++ ++ vdup.8 d4, r12 ++ vmov.u8 d6, #128 ++ vhsub.u8 d0, d4 ++ ++ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd ++ add r2, r0, r3 ++ vdup.8 d2, d2[0] ++ lsl r3, #1 ++ vqadd.s8 d0, d2 ++ mov r1, #3 ++ veor.8 d0, d6 ++ ++ vdup.8 d4, d16[1] ++ vst1.8 {d0 }, [r0 :64], r3 ++ vst1.8 {d4 }, [r2 :64], r3 ++ ++1: ++ vext.8 d16, d16, #2 ++ subs r1, #1 ++ vdup.8 d0, d16[0] ++ vdup.8 d4, d16[1] ++ vst1.8 {d0 }, [r0 :64], r3 ++ vst1.8 {d4 }, [r2 :64], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1 ++ vld1.8 {q0 }, [r1 :128] @ Up ++ ldrb r12, [r2, #-1] @ Up-left ++ vld1.8 {q8 }, [r2 :128] @ left ++ ++ vdup.8 q2, r12 ++ vmov.u8 q3, #128 ++ vhsub.u8 q0, q2 ++ ++ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd ++ add r2, r0, r3 ++ vdup.8 q1, d2[0] ++ lsl r3, #1 ++ vqadd.s8 q0, q1 ++ mov r1, #7 ++ veor.8 q0, q3 ++ ++ vdup.8 q2, d16[1] ++ vst1.8 {q0 }, [r0 :128], r3 ++ vst1.8 {q2 }, [r2 :128], r3 ++ ++1: ++ vext.8 q8, q8, #2 ++ subs r1, #1 ++ vdup.8 q0, d16[0] ++ vdup.8 q2, d16[1] ++ vst1.8 {q0 }, [r0 :128], r3 ++ vst1.8 {q2 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1 ++ vld1.8 {q8, q9 }, [r2 :128] @ Left ++ add r2, r0, r3 ++ lsl r3, #1 ++ mov r1, #16 ++1: ++ vdup.8 q0, d16[0] ++ vdup.8 q1, d16[0] ++ vdup.8 q2, d16[1] ++ vdup.8 q3, d16[1] ++ vext.8 q8, q9, #2 ++ vext.8 q9, q9, #2 ++ vst1.8 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.8 {q2, q3 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1 ++ vld1.16 {d16}, [r2 :64] @ Left ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ ++ vdup.16 d0, d16[0] ++ vdup.16 
d1, d16[1] ++ vdup.16 d2, d16[2] ++ vdup.16 d3, d16[3] ++ ++ vst1.16 {d0 }, [r0 :64], r3 ++ vst1.16 {d1 }, [r2 :64], r3 ++ vst1.16 {d2 }, [r0 :64] ++ vst1.16 {d3 }, [r2 :64] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1 ++ vld1.16 {q8 }, [r2 :128] @ Left ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ mov r1, #4 ++1: ++ vdup.16 q0, d16[0] ++ vdup.16 q2, d16[1] ++ vext.16 q8, q8, #2 ++ vst1.16 {q0 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q2 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1 ++ vld1.16 {q8, q9 }, [r2 :128] @ Left ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ mov r1, #8 ++1: ++ vdup.16 q0, d16[0] ++ vdup.16 q1, d16[0] ++ vdup.16 q2, d16[1] ++ vdup.16 q3, d16[1] ++ vext.16 q8, q9, #2 ++ vext.16 q9, q9, #2 ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q2, q3 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@------------------------------------------------------------------------------ ++@ ++@ 10 Bit ++@ Has clipping constants so 10-bit only but could easily be macroed up to ++@ 14-bit before we run out of bits ++ ++ ++@ ff_hevc_rpi_pred_vertical_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_4_neon_10, export=1 ++ vld1.16 {d0 }, [r1 :64] @ Up ++ ldrh r12, [r2, #-2] @ Up-left ++ vld1.16 {d24}, [r2 :64] @ left ++ ++ vdup.16 d4, r12 ++ lsl r3, #1 ++ vhsub.u16 d24, d4 ++ ++ vdup.16 d6, d0[0] ++ vmov.s16 d4, #0 ++ vadd.i16 d24, d6 ++ ++ vmov.s16 d6, #0x3ff ++ vmax.s16 d24, d4 ++ vmov.i64 d4, #0xffff ++ vmin.s16 d24, d6 ++ ++ mov r1, #4 ++1: ++ vbit.8 d0, d24, d4 ++ vext.16 d24, d24, #1 ++ subs r1, #1 ++ vst1.16 {d0 }, [r0 :64], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_8_neon_10, export=1 ++ vld1.16 {q0 }, [r1 :128] @ Up ++ ldrh r12, [r2, #-2] @ Up-left ++ vld1.16 {q12}, [r2 :128] @ left ++ ++ vdup.16 q2, r12 ++ lsl r3, #1 ++ vhsub.u16 q12, q2 ++ ++ vdup.16 q3, d0[0] ++ vmov.s16 q2, #0 ++ vadd.i16 q12, q3 ++ ++ vmov.s16 q3, #0x3ff ++ vmax.s16 q12, q2 ++ vmin.s16 q12, q3 ++ ++ vmov.i64 d4, #0xffff ++ mov r1, #8 ++1: ++ vbit.8 d0, d24, d4 ++ vext.16 q12, q12, #1 ++ subs r1, #1 ++ vst1.16 {q0 }, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_16_neon_10, export=1 ++ vld1.16 {q0, q1 }, [r1 :128] @ Up ++ ldrh r12, [r2, #-2] @ Up-left ++ vld1.16 {q12, q13}, [r2 :128] @ left ++ ++ vdup.16 q2, r12 ++ lsl r3, #1 ++ vhsub.u16 q12, q2 ++ vhsub.u16 q13, q2 ++ ++ vdup.16 q3, d0[0] ++ vmov.s16 q2, #0 ++ vadd.i16 q12, q3 ++ vadd.i16 q13, q3 ++ ++ vmov.s16 q3, #0x3ff ++ vmax.s16 q12, q2 ++ vmax.s16 q13, q2 ++ vmin.s16 q12, q3 ++ vmin.s16 q13, q3 ++ ++ vmov.i64 d4, #0xffff ++ mov r1, #16 ++1: ++ vbit.8 d0, d24, d4 ++ vext.16 q12, q13, #1 ++ vext.16 q13, q13, #1 ++ subs 
r1, #1 ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_32_neon_10, export=1 ++ vldm r1, { q0-q3 } @ Up ++ mov r1, #32 ++1: ++ subs r1, #1 ++ vstm r0, { q0-q3 } ++ add r0, r0, r3, lsl #1 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1 ++ vld1.16 {q0 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #2 ++ lsl r3, #3 ++ ++ vst1.16 {q0 }, [r0 :128], r3 ++ vst1.16 {q0 }, [r2 :128], r3 ++ vst1.16 {q0 }, [r0 :128] ++ vst1.16 {q0 }, [r2 :128] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1 ++ vld1.16 {q0, q1 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #2 ++ lsl r3, #3 ++ mov r1, #4 ++1: ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1 ++ vldm r1, { q0-q3 } @ Up ++ mov r1, #16 ++1: ++ subs r1, #1 ++ vstm r0, { q0-q3 } ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++@ ff_hevc_rpi_pred_horizontal_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1 ++ vld1.16 {d0 }, [r1 :64] @ Up ++ ldrh r12, [r2, #-2] @ Up-left ++ vld1.16 {d16}, [r2 :64] @ left ++ ++ vdup.16 d4, r12 ++ add r2, r0, r3, lsl #1 ++ vhsub.u16 d0, d4 ++ ++ vdup.16 d6, d16[0] ++ vmov.s16 d4, #0 ++ vadd.i16 d0, d6 ++ ++ vmov.s16 d6, #0x3ff ++ vmax.s16 d0, d4 ++ lsl r3, #2 ++ vmin.s16 d0, d6 ++ ++ vdup.16 d1, d16[1] ++ vdup.16 d2, d16[2] ++ vdup.16 d3, d16[3] ++ ++ vst1.16 {d0 }, [r0 :64], r3 ++ vst1.16 {d1 }, [r2 :64], r3 ++ vst1.16 {d2 }, [r0 :64] ++ vst1.16 {d3 }, [r2 :64] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1 ++ vld1.16 {q0 }, [r1 :128] @ Up ++ ldrh r12, [r2, #-2] @ Up-left ++ vld1.16 {q8 }, [r2 :128] @ left ++ ++ vdup.16 q2, r12 ++ add r2, r0, r3, lsl #1 ++ vhsub.u16 q0, q2 ++ ++ vdup.16 q3, d16[0] ++ lsl r3, #2 ++ vmov.s16 q2, #0 ++ vadd.i16 q0, q3 ++ ++ mov r1, #3 ++ vmov.s16 q3, #0x3ff ++ vmax.s16 q0, q2 ++ vmin.s16 q0, q3 ++ ++ vdup.16 q2, d16[1] ++ ++ vst1.16 {q0 }, [r0 :128], r3 ++ vst1.16 {q2 }, [r2 :128], r3 ++1: ++ vext.16 q8, q8, #2 ++ vdup.16 q0, d16[0] ++ vdup.16 q2, d16[1] ++ subs r1, #1 ++ vst1.16 {q0 }, [r0 :128], r3 ++ vst1.16 {q2 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontalal_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1 ++ vld1.16 {q0, q1 }, [r1 :128] @ Up ++ ldrh r12, [r2, #-2] @ Up-left ++ vld1.16 {q8, q9 }, 
[r2 :128] @ left ++ ++ ++ vdup.16 q2, r12 ++ add r2, r0, r3, lsl #1 ++ vhsub.u16 q0, q2 ++ vhsub.u16 q1, q2 ++ ++ vdup.16 q3, d16[0] ++ lsl r3, #2 ++ vmov.s16 q2, #0 ++ vadd.i16 q0, q3 ++ vadd.i16 q1, q3 ++ ++ mov r1, #7 ++ vmov.s16 q3, #0x3ff ++ vmax.s16 q0, q2 ++ vmax.s16 q1, q2 ++ vmin.s16 q0, q3 ++ vmin.s16 q1, q3 ++ ++ vdup.16 q2, d16[1] ++ vdup.16 q3, d16[1] ++ ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ vst1.16 {q2, q3 }, [r2 :128], r3 ++1: ++ vext.16 q8, q9, #2 ++ vext.16 q9, q9, #2 ++ vdup.16 q0, d16[0] ++ vdup.16 q1, d16[0] ++ vdup.16 q2, d16[1] ++ vdup.16 q3, d16[1] ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ vst1.16 {q2, q3 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1 ++ vldm r2, { q8-q11} ++ mov r1, #16 ++1: ++ vdup.16 q0, d16[0] ++ vdup.16 q1, d16[0] ++ vdup.16 q2, d16[0] ++ vdup.16 q3, d16[0] ++ add r2, r0, r3, lsl #1 ++ vdup.16 q12, d16[1] ++ vdup.16 q13, d16[1] ++ vdup.16 q14, d16[1] ++ vdup.16 q15, d16[1] ++ vstm r0, { q0-q3 } ++ vstm r2, {q12-q15} ++ ++ vext.16 q8, q9, #2 ++ vext.16 q9, q10, #2 ++ add r0, r0, r3, lsl #2 ++ vext.16 q10, q11, #2 ++ subs r1, #1 ++ vext.16 q11, q11, #2 ++ ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1 ++ vld1.16 {q8 }, [r2 :128] @ Left ++ add r2, r0, r3, lsl #2 ++ lsl r3, #3 ++ ++ vdup.32 q0, d16[0] ++ vdup.32 q1, d16[1] ++ vdup.32 q2, d17[0] ++ vdup.32 q3, d17[1] ++ ++ vst1.32 {q0 }, [r0 :128], r3 ++ vst1.16 {q1 }, [r2 :128], r3 ++ vst1.32 {q2 }, [r0 :128] ++ vst1.16 {q3 }, [r2 :128] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1 ++ vld1.16 {q8, q9 }, [r2 :128] @ Left ++ add r2, r0, r3, lsl #2 ++ lsl r3, #3 ++ mov r1, #4 ++1: ++ vdup.32 q0, d16[0] ++ vdup.32 q1, d16[0] ++ vdup.32 q2, d16[1] ++ vdup.32 q3, d16[1] ++ vext.32 q8, q9, #2 ++ vext.32 q9, q9, #2 ++ vst1.32 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.32 {q2, q3 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1 ++ vldm r2, { q8-q11} ++ mov r1, #8 ++1: ++ vdup.32 q0, d16[0] ++ vdup.32 q1, d16[0] ++ vdup.32 q2, d16[0] ++ vdup.32 q3, d16[0] ++ add r2, r0, r3, lsl #2 ++ vdup.32 q12, d16[1] ++ vdup.32 q13, d16[1] ++ vdup.32 q14, d16[1] ++ vdup.32 q15, d16[1] ++ vstm r0, { q0-q3 } ++ vstm r2, {q12-q15} ++ ++ vext.32 q8, q9, #2 ++ vext.32 q9, q10, #2 ++ add r0, r0, r3, lsl #3 ++ vext.32 q10, q11, #2 ++ subs r1, #1 ++ vext.32 q11, q11, #2 ++ ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S +new file mode 100644 +index 0000000000..9fb3633862 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S +@@ -0,0 +1,930 @@ ++/* ++ * Copyright (c) 2017 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ Planar intra pred (8.4.4.2.4) ++@ ++@ predSamples[ x ][ y ] = ++@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] + ++@ ( x + 1 ) * p[ nTbS ][ -1 ] + ++@ ( nTbS - 1 - y ) * p[ x ][ -1 ] + ++@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 ) ++ ++@ ff_hevc_rpi_pred_planar_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_4_neon_8, export=1 ++ adr r12, nb_3_0_1_4 ++ vld1.8 {d24}, [r2] @ Left ++ vld1.8 {d0 }, [r1] @ Up ++ vld1.8 {q8 }, [r12 :128] @ 3.. ++ ++ vdup.8 d30, d24[4] ++ vdup.8 d31, d0[4] ++ ++ vdup.32 d0, d0[0] @ copy lo -> hi ++ vsubl.u8 q2, d30, d0 @ Add set up ++ ++ vshll.u8 q0, d0, #2 ++ add r1, r0, r3 ++ vmlal.u8 q0, d17, d31 @ Acc set up - q8-q9 free ++ ++ vshl.i16 q3, q2, #1 ++ vadd.i16 d0, d4 ++ vadd.i16 d1, d6 ++ lsl r3, #1 ++ vadd.i16 q1, q0, q3 ++ ++ vdup.u8 d20, d24[0] ++ vdup.u8 d21, d24[1] ++ vdup.u8 d22, d24[2] ++ vdup.u8 d23, d24[3] ++ ++ vtrn.32 d20, d21 ++ vtrn.32 d22, d23 ++ ++ vmull.u8 q10, d16, d20 ++ vmull.u8 q11, d16, d22 ++ vadd.i16 q10, q0 ++ vadd.i16 q11, q1 ++ ++ vrshrn.u16 d28, q10, #3 ++ vrshrn.u16 d29, q11, #3 ++ ++ vst1.32 {d28[0]}, [r0 :32], r3 ++ vst1.32 {d28[1]}, [r1 :32], r3 ++ vst1.32 {d29[0]}, [r0 :32] ++ vst1.32 {d29[1]}, [r1 :32] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_8_neon_8, export=1 ++ adr r12, nb_7_0_1_8 ++ vld1.8 {q12}, [r2] @ Left ++ vld1.8 {q0 }, [r1] @ Up ++ vld1.8 {q8 }, [r12 :128] @ 7.. ++ ++ vdup.8 d30, d25[0] ++ vdup.8 d31, d1[0] ++ ++ mov r1, #8 ++ vsubl.u8 q2, d30, d0 @ Add set up ++ ++ vshll.u8 q0, d0, #3 ++ vmlal.u8 q0, d17, d31 @ Acc set up - q8-q9 free ++ ++@ u8 7..0 [1] d16 ++@ u8 left[y] [1] d24 ++@ u16 acc [2] q0 .. q1 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q2 .. q3 = p[-1][nTbs] - p[x][-1] ++1: ++ vadd.i16 q0, q2 ++ ++ vdup.u8 d20, d24[0] ++ vext.8 d24, d24, #1 ++ ++ vmull.u8 q10, d16, d20 ++ vadd.i16 q10, q0 ++ ++ vrshrn.u16 d28, q10, #4 ++ ++ subs r1, #1 ++ vst1.8 {d28}, [r0 :64], r3 ++ ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_16_neon_8, export=1 ++ vld1.8 {q12}, [r2 :128] @ Left ++ ldrb r2, [r2, #16] @ Down left - could have this in q13, but avoid that much overrread ++ adr r12, nb_15_0_1_16 ++ vld1.8 {q0 }, [r1 :128] @ Up ++ ldrb r1, [r1, #16] @ Up-right ++ vld1.8 {q8, q9 }, [r12 :128] @ 15... 
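
The planar functions above all evaluate the equation quoted at the top of this file (8.4.4.2.4); the NEON loops factor it into a per-row accumulator (x+1)*p[nTbS][-1] + nTbS*p[x][-1] plus a per-row add of p[-1][nTbS] - p[x][-1], exactly as the register comments note. As a minimal scalar sketch of the same 8-bit computation for reference (the helper name and argument layout are illustrative assumptions, not taken from the patch):

#include <stddef.h>
#include <stdint.h>

/* Scalar form of the planar prediction equation, 8-bit samples.
 * top[]  holds p[x][-1] with top[size]  = p[size][-1] (up-right sample);
 * left[] holds p[-1][y] with left[size] = p[-1][size] (down-left sample). */
static void planar_pred_scalar_8(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *top, const uint8_t *left,
                                 unsigned int log2_size)
{
    const unsigned int size = 1u << log2_size;

    for (unsigned int y = 0; y != size; ++y) {
        for (unsigned int x = 0; x != size; ++x) {
            dst[y * stride + x] = (uint8_t)(
                ((size - 1 - x) * left[y] + (x + 1) * top[size] +
                 (size - 1 - y) * top[x]  + (y + 1) * left[size] +
                 size) >> (log2_size + 1));
        }
    }
}

The NEON versions below produce the same result per row, just with the rounding term folded into the final vrshrn and the x-dependent weights held in the nb_*_0_1_* tables.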
++ ++ vdup.8 d30, r2 ++ vdup.8 d31, r1 ++ ++ mov r1, #16 ++ vsubl.u8 q3, d30, d1 ++ vsubl.u8 q2, d30, d0 @ Add set up ++ ++ vshll.u8 q1, d1, #4 ++ vshll.u8 q0, d0, #4 ++ vmlal.u8 q1, d19, d31 ++ vmlal.u8 q0, d18, d31 @ Acc set up - q8-q9 free ++ ++@ u8 15..0 [1] q8 ++@ u8 left[y] [1] q12 ++@ u16 acc [2] q0 .. q1 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q2 .. q3 = p[-1][nTbs] - p[x][-1] ++1: ++ vadd.i16 q1, q3 ++ vadd.i16 q0, q2 ++ ++ vdup.u8 d20, d24[0] ++ vext.8 q12, q12, #1 ++ ++ vmull.u8 q11, d17, d20 ++ vmull.u8 q10, d16, d20 ++ ++ vadd.i16 q11, q1 ++ vadd.i16 q10, q0 ++ ++ vrshrn.u16 d29, q11, #5 ++ vrshrn.u16 d28, q10, #5 ++ ++ subs r1, #1 ++ vst1.8 {q14}, [r0 :128], r3 ++ ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_32_neon_8, export=1 ++ vpush {q4-q7} ++ vld1.8 {q12, q13}, [r2 :128]! @ Left ++ adr r12, nb_31_0_1_32 ++ vld1.8 {q0, q1 }, [r1 :128]! @ Up ++ vld1.8 {d30[0]}, [r2] @ Down left ++ vld1.8 {d31[0]}, [r1] @ Up-right ++ vldm r12, { q8-q11} @ 1..32, 31..0 ++ ++ vdup.8 d30, d30[0] ++ vdup.8 d31, d31[0] ++ ++ vsubl.u8 q7, d30, d3 ++ vsubl.u8 q6, d30, d2 ++ vsubl.u8 q5, d30, d1 ++ vsubl.u8 q4, d30, d0 @ Add set up ++ ++ vshll.u8 q3, d3, #5 ++ vshll.u8 q2, d2, #5 ++ vshll.u8 q1, d1, #5 ++ vshll.u8 q0, d0, #5 ++ vmlal.u8 q3, d23, d31 ++ vmlal.u8 q2, d22, d31 ++ vmlal.u8 q1, d21, d31 ++ vmlal.u8 q0, d20, d31 @ Acc set up - q8-q9 free ++ ++ mov r1, #32 ++ ++@ u8 31..0 [2] q10, q11 ++@ u8 left[y] [2] q12, q13 ++@ u16 acc [4] q0 .. q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q4 .. q7 = p[-1][nTbs] - p[x][-1] ++1: ++ vadd.i16 q3, q7 ++ vadd.i16 q2, q6 ++ vadd.i16 q1, q5 ++ vadd.i16 q0, q4 ++ ++ vdup.u8 d20, d24[0] ++ vext.8 q12, q13, #1 ++ vext.8 q13, q13, #1 ++ ++ vmull.u8 q15, d19, d20 ++ vmull.u8 q14, d18, d20 ++ vmull.u8 q11, d17, d20 ++ vmull.u8 q10, d16, d20 ++ ++ vadd.i16 q15, q3 ++ vadd.i16 q14, q2 ++ vadd.i16 q11, q1 ++ vadd.i16 q10, q0 ++ ++ vrshrn.u16 d31, q15, #6 ++ vrshrn.u16 d30, q14, #6 ++ vrshrn.u16 d29, q11, #6 ++ vrshrn.u16 d28, q10, #6 ++ ++ subs r1, #1 ++ vst1.8 {q14, q15}, [r0 :128], r3 ++ ++ bne 1b ++ ++ vpop {q4-q7} ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1 ++ vld1.8 {q12}, [r2 :64] @ Left + down-left - <1d of overread is OK ++ adr r12, nbx2_3_0_1_4 ++ vld1.8 {q0 }, [r1 :64] @ Up + up right ++ vld1.8 {q8 }, [r12 :128] @ 3,3.. 
++ ++ vdup.16 d30, d25[0] ++ vdup.16 d31, d1[0] ++ ++ mov r1, #4 ++ vsubl.u8 q2, d30, d0 @ Add set up ++ ++ lsl r3, #1 ++ vshll.u8 q0, d0, #2 ++ vmlal.u8 q0, d17, d31 @ Acc set up - q8-q9 free ++ ++@ u8 3,3..0,0 [1] d16 ++@ u8 left[y] [1] d24 ++@ u16 acc [1] q0 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [1] q2 = p[-1][nTbs] - p[x][-1] ++1: ++ vadd.i16 q0, q2 ++ ++ vdup.u16 d20, d24[0] ++ vext.16 d24, d24, #1 ++ ++ vmull.u8 q10, d16, d20 ++ ++ vadd.i16 q10, q0 ++ ++ vrshrn.u16 d28, q10, #3 ++ ++ subs r1, #1 ++ vst1.8 {d28}, [r0 :64], r3 ++ ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1 ++ vld1.8 {q12}, [r2 :128] @ Left ++ ldrh r2, [r2, #16] @ Down left - could have this in q13, but avoid that much overrread ++ adr r12, nbx2_7_0_1_8 ++ vld1.8 {q0 }, [r1 :128] @ Up ++ ldrh r1, [r1, #16] @ Up-right ++ vld1.8 {q8, q9 }, [r12 :128] @ 7,7... ++ ++ vdup.16 d30, r2 ++ vdup.16 d31, r1 ++ ++ mov r1, #8 ++ vsubl.u8 q3, d30, d1 ++ vsubl.u8 q2, d30, d0 @ Add set up ++ ++ lsl r3, #1 ++ vshll.u8 q1, d1, #3 ++ vshll.u8 q0, d0, #3 ++ vmlal.u8 q1, d19, d31 ++ vmlal.u8 q0, d18, d31 @ Acc set up - q8-q9 free ++ ++@ u8 7,7..0,0 [1] q8 ++@ u8 left[y] [1] q12 ++@ u16 acc [2] q0 .. q1 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q2 .. q3 = p[-1][nTbs] - p[x][-1] ++1: ++ vadd.i16 q1, q3 ++ vadd.i16 q0, q2 ++ ++ vdup.u16 d20, d24[0] ++ vext.16 q12, q12, #1 ++ ++ vmull.u8 q11, d17, d20 ++ vmull.u8 q10, d16, d20 ++ ++ vadd.i16 q11, q1 ++ vadd.i16 q10, q0 ++ ++ vrshrn.u16 d29, q11, #4 ++ vrshrn.u16 d28, q10, #4 ++ ++ subs r1, #1 ++ vst1.8 {q14}, [r0 :128], r3 ++ ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++ ++@ ff_hevc_rpi_pred_planar_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1 ++ vpush {q4-q7} ++ vld1.8 {q12, q13}, [r2 :128]! @ Left ++ adr r12, nbx2_15_0_1_16 ++ vld1.8 {q0, q1 }, [r1 :128]! @ Up ++ vld1.16 {d30[0]}, [r2] @ Down left ++ vld1.16 {d31[0]}, [r1] @ Up-right ++ vldm r12, { q8-q11} @ 1..32, 31..0 ++ ++ vdup.16 d30, d30[0] ++ vdup.16 d31, d31[0] ++ ++ mov r1, #16 ++ vsubl.u8 q7, d30, d3 ++ vsubl.u8 q6, d30, d2 ++ vsubl.u8 q5, d30, d1 ++ vsubl.u8 q4, d30, d0 @ Add set up ++ ++ lsl r3, #1 ++ vshll.u8 q3, d3, #4 ++ vshll.u8 q2, d2, #4 ++ vshll.u8 q1, d1, #4 ++ vshll.u8 q0, d0, #4 ++ vmlal.u8 q3, d23, d31 ++ vmlal.u8 q2, d22, d31 ++ vmlal.u8 q1, d21, d31 ++ vmlal.u8 q0, d20, d31 @ Acc set up - q8-q9 free ++ ++@ u8 31..0 [2] q10, q11 ++@ u8 left[y] [2] q12, q13 ++@ u16 acc [4] q0 .. q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q4 .. 
q7 = p[-1][nTbs] - p[x][-1] ++1: ++ vadd.i16 q3, q7 ++ vadd.i16 q2, q6 ++ vadd.i16 q1, q5 ++ vadd.i16 q0, q4 ++ ++ vdup.u16 d20, d24[0] ++ vext.16 q12, q13, #1 ++ vext.16 q13, q13, #1 ++ ++ vmull.u8 q15, d19, d20 ++ vmull.u8 q14, d18, d20 ++ vmull.u8 q11, d17, d20 ++ vmull.u8 q10, d16, d20 ++ ++ vadd.i16 q15, q3 ++ vadd.i16 q14, q2 ++ vadd.i16 q11, q1 ++ vadd.i16 q10, q0 ++ ++ vrshrn.u16 d31, q15, #5 ++ vrshrn.u16 d30, q14, #5 ++ vrshrn.u16 d29, q11, #5 ++ vrshrn.u16 d28, q10, #5 ++ ++ subs r1, #1 ++ vst1.8 {q14, q15}, [r0 :256], r3 ++ ++ bne 1b ++ ++ vpop {q4-q7} ++ bx lr ++ ++endfunc ++ ++@------------------------------------------------------------------------------ ++@ ++@ Data - put btween the 2 code lumps so we can reach it with an adr from both ++@ Beware - it gets quite close which is why nb_3_0_1_4 is 1st... ++ ++ .text ++ .balign 64 ++ ++ @ These could be extracted from the above array, but separate out ++ @ out for better (16 byte) alignment ++nb_3_0_1_4: ++ .byte 3, 2, 1, 0, 3, 2, 1, 0 ++ .byte 1, 2, 3, 4, 1, 2, 3, 4 ++nb_7_0_1_8: ++ .byte 7, 6, 5, 4, 3, 2, 1, 0 ++ .byte 1, 2, 3, 4, 5, 6, 7, 8 ++nbh_3_0_1_4: ++ .short 3, 2, 1, 0, 1, 2, 3, 4 ++nbx2_3_0_1_4: ++ .byte 3, 3, 2, 2, 1, 1, 0, 0 ++ .byte 1, 1, 2, 2, 3, 3, 4, 4 ++ ++ @ should be back on a 64-byte boundary here ++nb_31_0_1_32: ++ .byte 31, 30, 29, 28, 27, 26, 25, 24 ++ .byte 23, 22, 21, 20, 19, 18, 17, 16 ++nb_15_0_1_16: ++ .byte 15, 14, 13, 12, 11, 10, 9, 8 ++ .byte 7, 6, 5, 4, 3, 2, 1, 0 ++ .byte 1, 2, 3, 4, 5, 6, 7, 8 ++ .byte 9, 10, 11, 12, 13, 14, 15, 16 ++ .byte 17, 18, 19, 20, 21, 22, 23, 24 ++ .byte 25, 26, 27, 28, 29, 30, 31, 32 ++ ++ @ should be back on a 64-byte boundary here ++nbx2_15_0_1_16: ++ .byte 15, 15, 14, 14, 13, 13, 12, 12 ++ .byte 11, 11, 10, 10, 9, 9, 8, 8 ++nbx2_7_0_1_8: ++ .byte 7, 7, 6, 6, 5, 5, 4, 4 ++ .byte 3, 3, 2, 2, 1, 1, 0, 0 ++ .byte 1, 1, 2, 2, 3, 3, 4, 4 ++ .byte 5, 5, 6, 6, 7, 7, 8, 8 ++ .byte 9, 9, 10, 10, 11, 11, 12, 12 ++ .byte 13, 13, 14, 14, 15, 15, 16, 16 ++ ++@------------------------------------------------------------------------------ ++@ ++@ 10 bits ++@ (all would work with 9) ++ ++@ ff_hevc_rpi_pred_planar_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_4_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr r12, nbh_3_0_1_4 ++ vld1.16 {q14}, [r2 :64] ++ vld1.16 {q8 }, [r12 :128] @ 3..0,1,..4 ++ vld1.16 {q12}, [r1 :64] @ Up ++ vdup.16 d2, d29[0] ++ ++ lsl r3, #1 ++ vsub.i16 d4, d2, d24 @ Add set up ++ ++ vdup.16 d0, d25[0] ++ vshl.i16 d24, #2 ++ vmla.i16 d24, d17, d0 @ Acc set up ++ add r1, r0, r3 ++ vmov d17, d16 ++ ++ vadd.i16 d24, d4 ++ vadd.i16 d25, d24, d4 ++ vshl.i16 d4, d4, #1 @ x2 ++ lsl r3, #1 ++ vadd.i16 d26, d24, d4 ++ vadd.i16 d27, d25, d4 ++ ++ vdup.16 d0, d28[0] ++ vdup.16 d1, d28[1] ++ vdup.16 d2, d28[2] ++ vdup.16 d3, d28[3] ++ ++ vmul.i16 q0, q8, q0 ++ vmul.i16 q1, q8, q1 ++ vadd.i16 q0, q12 ++ vadd.i16 q1, q13 ++ ++ vrshr.u16 q0, #3 ++ vrshr.u16 q1, #3 ++ ++ vst1.16 {d0}, [r0], r3 ++ vst1.16 {d1}, [r1], r3 ++ vst1.16 {d2}, [r0] ++ vst1.16 {d3}, [r1] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_8_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than 
having a short table ++ adr r12, nb_7_0_1_8 ++ vld1.16 {q14}, [r2 :128] ++ ldrh r2, [r2, #16] @ Down left ++ vld1.8 {q0 }, [r12 :128] @ 7..0,1,..8 ++ vld1.16 {q12}, [r1 :128] @ Up ++ ldrh r1, [r1, #16] @ Up-right ++ vmovl.u8 q8, d1 ++ vdup.16 q1, r2 ++ vmovl.u8 q10, d0 ++ ++ lsl r3, #1 ++ vsub.i16 q2, q1, q12 @ Add set up ++ ++ vdup.16 q0, r1 ++ mov r1, #8 ++ vshl.i16 q12, #3 ++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free ++ ++@ u16 15..0 [1] q10 ++@ u32 left[y] [1] q14 ++@ u16 acc [1] q12 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [1] q2 = p[-1][nTbs] - p[x][-1] ++1: ++ vdup.16 q0, d28[0] ++ vext.16 q14, q14, #1 ++ ++ vadd.i16 q12, q2 ++ ++ vmul.i16 q0, q10, q0 ++ vadd.i16 q0, q12 ++ vrshr.u16 q0, #4 ++ ++ subs r1, #1 ++ vst1.16 {q0 }, [r0 :128], r3 ++ ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_16_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr r12, nb_15_0_1_16 ++ vld1.16 {q14, q15}, [r2 :128] ++ ldrh r2, [r2, #32] @ Down left ++ vld1.8 {q0, q1 }, [r12 :128] @ 15..0,1,..16 ++ vld1.16 {q12, q13}, [r1 :128] @ Up ++ ldrh r1, [r1, #32] @ Up-right ++ vmovl.u8 q9, d3 ++ vmovl.u8 q8, d2 ++ vdup.16 q1, r2 ++ vmovl.u8 q11, d1 ++ vmovl.u8 q10, d0 ++ ++ lsl r3, #1 ++ vsub.i16 q3, q1, q13 ++ vsub.i16 q2, q1, q12 @ Add set up ++ ++ vdup.16 q0, r1 ++ mov r1, #16 ++ vshl.i16 q13, #4 ++ vshl.i16 q12, #4 ++ vmla.i16 q13, q9, q0 ++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free ++ ++@ u16 15..0 [2] q10..q11 ++@ u32 left[y] [2] q14..q15 ++@ u16 acc [2] q12..q13 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q2..q3 = p[-1][nTbs] - p[x][-1] ++1: ++ vdup.16 q0, d28[0] ++ vext.16 q14, q15, #1 ++ vext.16 q15, q15, #1 ++ ++ vadd.i16 q13, q3 ++ vadd.i16 q12, q2 ++ ++ vmul.i16 q1, q11, q0 ++ vmul.i16 q0, q10, q0 ++ ++ vadd.i16 q1, q13 ++ vadd.i16 q0, q12 ++ ++ vrshr.u16 q1, #5 ++ vrshr.u16 q0, #5 ++ ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_32_neon_10, export=1 ++ push {r4, lr} ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr r12, nb_31_0_1_32 ++ vpush { q4-q7 } ++ vldm r12, { q0-q3 } @ 1..32, r12 points at 31..0 ++ vldm r1!, {q12-q15} @ Up ++ ldrh r12, [r2, #64] @ Down left ++ vmovl.u8 q8, d4 ++ vmovl.u8 q9, d5 ++ vmovl.u8 q10, d6 ++ vmovl.u8 q11, d7 ++ vdup.16 q3, r12 ++ vld1.16 {d4[0]}, [r1] @ Up-right ++ ++ vsub.i16 q7, q3, q15 ++ vsub.i16 q6, q3, q14 ++ vsub.i16 q5, q3, q13 ++ vsub.i16 q4, q3, q12 @ Add set up ++ ++ vshl.i16 q15, #5 ++ vshl.i16 q14, #5 ++ vshl.i16 q13, #5 ++ vshl.i16 q12, #5 ++ vmla.i16 q15, q11, d4[0] ++ vmla.i16 q14, q10, d4[0] ++ vmla.i16 q13, q9, d4[0] ++ vmla.i16 q12, q8, d4[0] @ Acc set up - q8-q11 free ++ ++ mov r1, #32 ++ vmovl.u8 q8, d0 ++ vmovl.u8 q9, d1 ++ vmovl.u8 q10, d2 ++ vmovl.u8 q11, d3 ++ ++@ u8 31..0 [4] q8..q11 ++@ u8 left[y] [4] [r2] ++@ u16 acc [4] q12..q15 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q4..q7 = p[-1][nTbs] - p[x][-1] ++1: ++ vld1.16 {d0[0]}, [r2]! 
++ ++ vadd.i16 q15, q7 ++ vadd.i16 q14, q6 ++ vadd.i16 q13, q5 ++ vadd.i16 q12, q4 ++ ++ vmul.i16 q3, q11, d0[0] ++ vmul.i16 q2, q10, d0[0] ++ vmul.i16 q1, q9, d0[0] ++ vmul.i16 q0, q8, d0[0] ++ ++ vadd.i16 q3, q15 ++ vadd.i16 q2, q14 ++ vadd.i16 q1, q13 ++ vadd.i16 q0, q12 ++ ++ vrshr.u16 q3, #6 ++ vrshr.u16 q2, #6 ++ vrshr.u16 q1, #6 ++ vrshr.u16 q0, #6 ++ ++ subs r1, #1 ++ vstm r0, { q0-q3 } ++ add r0, r0, r3, lsl #1 ++ ++ bne 1b ++ ++ vpop {q4-q7} ++ pop {r4, pc} ++ ++endfunc ++ ++@ ff_hevc_rpi_pred_planar_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr r12, nbx2_3_0_1_4 ++ vld1.8 {q0 }, [r12 :128] @ 3,3..0,0,1,1..4,4 ++ vld1.16 {q14}, [r2 :128] @ left ++ ldr r12, [r2, #16] @ Down left ++ vld1.16 {q12}, [r1 :128] @ Up ++ vmovl.u8 q8, d1 ++ vdup.32 q1, r12 ++ ldr r12, [r1, #16] @ Up-right ++ vmovl.u8 q10, d0 ++ ++ lsl r3, #2 ++ vsub.i16 q2, q1, q12 @ Add set up ++ ++ mov r1, #4 ++ vdup.32 q0, r12 ++ vshl.i16 q12, #2 ++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free ++ ++@ u16 3,3..0,0 [1] q10 ++@ u32 left[y] [1] q14 ++@ u16 acc [1] q12 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [1] q2 = p[-1][nTbs] - p[x][-1] ++1: ++ vdup.32 q0, d28[0] ++ vext.32 q14, q14, #1 ++ ++ vadd.i16 q12, q2 ++ ++ vmul.i16 q0, q10, q0 ++ ++ vadd.i16 q0, q12 ++ ++ vrshr.u16 q0, #3 ++ ++ subs r1, #1 ++ vst1.16 {q0 }, [r0 :128], r3 ++ ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr r12, nbx2_7_0_1_8 ++ vld1.8 {q0, q1 }, [r12 :128] @ 7,7..0,0,1,1..8,8 ++ vld1.16 {q14, q15}, [r2 :128] ++ ldr r12, [r2, #32] @ Down left ++ vld1.16 {q12, q13}, [r1 :128] @ Up ++ vmovl.u8 q9, d3 ++ vmovl.u8 q8, d2 ++ vdup.32 q1, r12 ++ ldr r12, [r1, #32] @ Up-right ++ vmovl.u8 q11, d1 ++ vmovl.u8 q10, d0 ++ ++ lsl r3, #2 ++ vsub.i16 q3, q1, q13 ++ vsub.i16 q2, q1, q12 @ Add set up ++ ++ mov r1, #8 ++ vdup.32 q0, r12 ++ vshl.i16 q13, #3 ++ vshl.i16 q12, #3 ++ vmla.i16 q13, q9, q0 ++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free ++ ++@ u16 7,7..0,0 [2] q10..q11 ++@ u32 left[y] [2] q14..q15 ++@ u16 acc [2] q12..q13 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q2..q3 = p[-1][nTbs] - p[x][-1] ++1: ++ vdup.32 q0, d28[0] ++ vext.32 q14, q15, #1 ++ vext.32 q15, q15, #1 ++ ++ vadd.i16 q13, q3 ++ vadd.i16 q12, q2 ++ ++ vmul.i16 q1, q11, q0 ++ vmul.i16 q0, q10, q0 ++ ++ vadd.i16 q1, q13 ++ vadd.i16 q0, q12 ++ ++ vrshr.u16 q1, #4 ++ vrshr.u16 q0, #4 ++ ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r0 :256], r3 ++ ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr r12, nbx2_15_0_1_16 ++ vpush { q4-q7 } ++ vldm r12, { q0-q3 } @ 1..32, r12 points at 31..0 ++ vldm r1!, {q12-q15} @ Up ++ ldr r12, [r2, #64] @ Down left ++ vmovl.u8 q11, d7 ++ vmovl.u8 q10, d6 ++ 
vmovl.u8 q9, d5 ++ vmovl.u8 q8, d4 ++ vdup.32 q3, r12 ++ ldr r12, [r1] @ Up-right ++ ++ vsub.i16 q7, q3, q15 ++ vsub.i16 q6, q3, q14 ++ vsub.i16 q5, q3, q13 ++ vsub.i16 q4, q3, q12 @ Add set up ++ ++ vdup.32 q2, r12 ++ vshl.i16 q15, #4 ++ vshl.i16 q14, #4 ++ vshl.i16 q13, #4 ++ vshl.i16 q12, #4 ++ vmla.i16 q15, q11, q2 ++ vmla.i16 q14, q10, q2 ++ vmla.i16 q13, q9, q2 ++ vmla.i16 q12, q8, q2 @ Acc set up - q8-q11 free ++ ++ mov r1, #16 ++ vmovl.u8 q11, d3 ++ vmovl.u8 q10, d2 ++ vmovl.u8 q9, d1 ++ vmovl.u8 q8, d0 ++ ++@ u16 15,15..0,0 [4] q8..q11 ++@ u32 left[y] [4] [r2] ++@ u16 acc [4] q12..q15 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q4..q7 = p[-1][nTbs] - p[x][-1] ++1: ++ ldr r12, [r2], #4 ++ ++ vadd.i16 q15, q7 ++ vadd.i16 q14, q6 ++ vdup.32 q0, r12 ++ vadd.i16 q13, q5 ++ vadd.i16 q12, q4 ++ ++ vmul.i16 q3, q11, q0 ++ vmul.i16 q2, q10, q0 ++ vmul.i16 q1, q9, q0 ++ vmul.i16 q0, q8, q0 ++ ++ vadd.i16 q3, q15 ++ vadd.i16 q2, q14 ++ vadd.i16 q1, q13 ++ vadd.i16 q0, q12 ++ ++ vrshr.u16 q3, #5 ++ vrshr.u16 q2, #5 ++ vrshr.u16 q1, #5 ++ vrshr.u16 q0, #5 ++ ++ subs r1, #1 ++ vstm r0, { q0-q3 } ++ add r0, r0, r3, lsl #2 ++ ++ bne 1b ++ ++ vpop {q4-q7} ++ bx lr ++endfunc ++ ++ diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index fb0c6fae70..9f2ebb16f3 100644 --- a/libavcodec/avcodec.h @@ -10034,10 +15529,10 @@ index 0000000000..0aee673d8b +#endif /* AVCODEC_RPI_HEVC_DATA_H */ diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c new file mode 100644 -index 0000000000..a8601da4e7 +index 0000000000..4bfa000da4 --- /dev/null +++ b/libavcodec/rpi_hevc_filter.c -@@ -0,0 +1,1165 @@ +@@ -0,0 +1,1236 @@ +/* + * HEVC video decoder + * @@ -10599,6 +16094,15 @@ index 0000000000..a8601da4e7 +#endif +} + ++// When bits are delivered to deblock we want them ++//#define TL 1 ++//#define TR 2 ++//#define BL 4 ++//#define BR 8 ++ ++// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br ++// so we need to rearrange before passing on ++ +static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) +{ + const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; @@ -10614,23 +16118,60 @@ index 0000000000..a8601da4e7 + return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7); +} + -+// We sometimes need 17 2-bit entries (annoying!) -+// * This could be avoided if we separate out the H filter left-stub deblock -+// but 64 bit constant shr shouldn't be too bad - though the variable mask here is probably quite nasty -+static inline uint64_t hbs_get(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++// We cast away const here as we want this to work for both get and set ++static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) +{ -+ unsigned int n = (xr - xl + 7) & ~7; -+ -+ return n == 0 ? 
(uint64_t)0 : -+ (*(uint64_t *)(s->horizontal_bs + (xl >> 4) + (y >> 3) * s->hbs_stride) >> ((xl >> 1) & 7)) & (((uint64_t)1 << (n >> 1)) - 1); ++ return (uint32_t *)(bs + ++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 ++#warning Unexpected masks ++ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes ++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & ++ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) + ++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 ++#error Stride1 < return size ++#endif ++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + ++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); +} + -+static inline uint64_t vbs_get(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) +{ -+ unsigned int n = (xr - xl + 7) & ~7; ++ return (uint8_t *)(bs + ++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & ++ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) + ++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + ++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); ++} + -+ return n == 0 ? (uint64_t)0 : -+ (*(uint64_t *)(s->vertical_bs2 + (xl >> 4) + (y >> 3) * s->hbs_stride) >> ((xl >> 1) & 7)) & (((uint64_t)1 << (n >> 1)) - 1); ++ ++// Get block strength ++// Given how we call we will always get within the 32bit boundries ++static inline uint32_t bs_get32(const uint8_t * bs, const unsigned int stride2, ++ const unsigned int xl, const unsigned int xr, const unsigned int y) ++{ ++ if (xr <= xl) { ++ return 0; ++ } ++ else ++ { ++ const uint32_t a = *bs_ptr32(bs, stride2, xl, y); ++ const unsigned int n = ((xr - xl + 7) & ~7) >> 1; ++ ++ return n == 32 ? a : ++ (a >> ((xl >> 1) & 31)) & ~(~0U << n); ++ } ++} ++ ++static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++{ ++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); ++ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y); ++} ++ ++static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++{ ++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); ++ return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y); +} + + @@ -10658,68 +16199,78 @@ index 0000000000..a8601da4e7 + // Main body + for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8) + { ++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y); ++ + const DBParams * const dbp = y < bounds.y ? 
cb_dbp - s->ps.sps->ctb_width : cb_dbp; + const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; + const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; + ++ if (vbs != 0) + { + const uint8_t * const tcv = tctable + dbp->tc_offset; + const uint8_t * const betav = betatable + dbp->beta_offset; + unsigned int pcmfa = pcm2(s, bv_l - 1, y); -+// const uint8_t * vbs = s->vertical_bs + (bv_l >> 3) * s->bs_height + (y >> 2); -+ uint64_t vbs2 = vbs_get(s, bv_l, bv_r, y); + unsigned int x; + -+ for (x = bv_l; x < bv_r; x += 8) ++ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1) + { -+ const unsigned int pcmf_v = pcmfa & 3; -+ const unsigned int bs0 = vbs2 & 3; -+ const unsigned int bs1 = (vbs2 & 0xc) >> 2; -+ -+ if ((bs0 | bs1) != 0 && pcmf_v != 3) ++ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3) + { + const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; + s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), + frame_stride1(s->frame, LUMA), + betav[qp], -+ (bs0 == 0 ? 0 : tcv[qp + (int)(bs0 & 2)]) | -+ ((bs1 == 0 ? 0 : tcv[qp + (int)(bs1 & 2)]) << 16), -+ pcmf_v, ++ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) | ++ (((vbs & 0xc) == 0 ? 0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16), ++ pcmfa & 3, + av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); + } -+ -+ pcmfa >>= 1; -+// vbs += s->bs_height; -+ vbs2 >>= 4; + } + } + + if (y != 0) + { -+ unsigned int x; -+ unsigned int pcmfa = pcm4(s, bh_l, y - 1); -+ uint64_t hbs = hbs_get(s, bh_l, bh_r + 1, y); // Will give (x <= bh_r) in for loop ++ uint32_t hbs; + -+ for (x = bh_l; hbs != 0; x += 8, hbs >>= 4) ++ // H left - mostly separated out so we only need a uint32_t hbs ++ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0) + { -+ const unsigned int pcmf_h = (pcmfa & 1) | ((pcmfa & 0x10000) >> 15); -+ const unsigned int bs0 = hbs & 3; -+ const unsigned int bs1 = (hbs >> 2) & 3; ++ const unsigned int x = bh_l; ++ const unsigned int pcmfa = pcm4(s, bh_l, y - 1); ++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const DBParams * const dbph = dbp - 1; ++ const uint8_t * const tc = tctable + dbph->tc_offset + qp; + -+ if ((bs0 | bs1) != 0 && pcmf_h != 3) ++ av_assert2(cb_x - bh_l == 8); ++ ++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ betatable[qp + dbph->beta_offset], ++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | ++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), ++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); ++ } ++ ++ // H ++ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop ++ { ++ unsigned int x; ++ unsigned int pcmfa = pcm4(s, cb_x, y - 1); ++ ++ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1) + { -+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ const DBParams * const dbph = (x < cb_x ? dbp - 1 : dbp); -+ const uint8_t * const tc = tctable + dbph->tc_offset + qp; -+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), -+ frame_stride1(s->frame, LUMA), -+ betatable[qp + dbph->beta_offset], -+ (bs0 == 0 ? 0 : tc[bs0 & 2]) | -+ ((bs1 == 0 ? 
0 : tc[bs1 & 2]) << 16), -+ pcmf_h); ++ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0) ++ { ++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const uint8_t * const tc = tctable + dbp->tc_offset + qp; ++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ betatable[qp + dbp->beta_offset], ++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | ++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), ++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); ++ } + } -+ -+ pcmfa >>= 1; + } + } + @@ -10727,11 +16278,6 @@ index 0000000000..a8601da4e7 + } +} + -+#define TL 1 -+#define TR 2 -+#define BL 4 -+#define BR 8 -+ +static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) +{ + const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; @@ -10768,98 +16314,119 @@ index 0000000000..a8601da4e7 + // Deblock V up 8 + // CTB above current + // Top-half only (tc4 & ~0xffff == 0) is special cased in asm -+ unsigned int x; + const unsigned int y = bounds.y - 8; ++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U; + -+ unsigned int pcmfa = pcm2(s, bv_l - 1, y); -+ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset; -+ uint64_t vbs2 = (vbs_get(s, bv_l, bv_r, y) & 0x0202020202020202U); -+ -+ for (x = bv_l; x < bv_r; x += 16, vbs2 >>= 8) ++ if (vbs != 0) + { -+ const unsigned int pcmf_v = (pcmfa & 3); -+ if ((vbs2 & 2) != 0 && pcmf_v != 3) ++ unsigned int pcmfa = pcm2(s, bv_l - 1, y); ++ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset; ++ unsigned int x; ++ ++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) + { -+ const int qp0 = q2h(s, x, y); -+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ frame_stride1(s->frame, 1), -+ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8), -+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), -+ pcmf_v); ++ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0) ++ { ++ const int qp0 = q2h(s, x, y); ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8), ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ pcmfa & 3); ++ } + } -+ pcmfa >>= 2; + } + } + + for (y = bounds.y; y < b_b; y += 16) + { ++ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) | ++ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4); ++ + // V ++ if (vbs != 0) + { + unsigned int x; -+ unsigned int pcmfa = pcm4(s, bv_l - 1, y); -+ const unsigned int pcmf_or = (y + 16 <= b_b) ? 0 : BL | BR; ++ unsigned int pcmfa = ++ (y + 16 > b_b ? ++ pcm2(s, bv_l - 1, y) | 0xffff0000 : ++ pcm4(s, bv_l - 1, y)); + const uint8_t * const tc = tctable + 2 + dbp->tc_offset; -+ uint64_t vbs2 = (vbs_get(s, bv_l, bv_r, y) & 0x0202020202020202U) | -+ ((vbs_get(s, bv_l, bv_r, y + 8) & 0x0202020202020202U) << 4); + -+ for (x = bv_l; x < bv_r; x += 16, vbs2 >>= 8) ++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) + { -+ const unsigned int pcmf_v = pcmf_or | (pcmfa & 3) | ((pcmfa >> 14) & 0xc); -+ const unsigned int bs0 = (~pcmf_v & (TL | TR)) == 0 ? 0 : vbs2 & 2; -+ const unsigned int bs1 = (~pcmf_v & (BL | BR)) == 0 ? 
0 : (vbs2 & 0x20) >> 4; -+ -+ if ((bs0 | bs1) != 0) ++ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) + { + const int qp0 = q2h(s, x, y); + const int qp1 = q2h(s, x, y + 8); + s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + frame_stride1(s->frame, 1), -+ ((bs0 == 0) ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | -+ ((bs1 == 0) ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), ++ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | ++ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), + av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), -+ pcmf_v); ++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); + } -+ -+ pcmfa >>= 2; + } + } + + // H + if (y != 0) + { -+ unsigned int x; -+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16; ++ uint32_t hbs; + const unsigned int bh_l = bv_l - 16; -+ unsigned int pcmfa = pcm4(s, bh_l, y - 1); -+ uint64_t hbs = hbs_get(s, bh_l, bh_r, y) & 0x2222222222222222U; ++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16; + const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; + const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; + -+ // Chop off bits we don't want... -+ if (bh_l < bounds.x) { -+ pcmfa |= 0x10001; // TL|BL pre rearrangement -+ hbs &= ~(uint64_t)3; // Make BS 0 -+ } -+ -+ for (x = bh_l; hbs != 0; x += 16, hbs >>= 8) ++ // H left - mostly separated out so we only need a uint32_t hbs ++ // Stub is width 8 to the left of bounds, but width 16 internally ++ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0) + { -+ const unsigned int pcmf_h = (x + 16 > bh_r ? TR | BR : 0) | -+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc); -+ const int bs0 = hbs & 2; -+ const int bs1 = (~pcmf_h & (TR | BR)) == 0 ? 0 : (hbs >> 4) & 2; -+ if ((bs0 | bs1) != 0) ++ unsigned int pcmfa = pcm4(s, bh_l, y - 1); ++ ++ // Chop off bits we don't want... ++ if (bh_l < bounds.x) { ++ pcmfa |= 0x10001; // TL|BL pre rearrangement ++ hbs &= ~3; // Make BS 0 ++ } ++ ++ // Double check we still want this ++ if (hbs != 0 && (~pcmfa & 0x30003) != 0) + { ++ const unsigned int x = bh_l; + const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; + const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; -+ const uint8_t * const tc = tctable + 2 + (x < cb_x ? dbp - 1 : dbp)->tc_offset; ++ const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset; + + s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + frame_stride1(s->frame, 1), -+ ((bs0 == 0) ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | -+ ((bs1 == 0) ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), -+ pcmf_h); ++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | ++ ((hbs & 0x20) == 0 ? 
0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), ++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); ++ } ++ } ++ ++ // H main ++ if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0) ++ { ++ unsigned int x; ++ unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it ++ ++ for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2) ++ { ++ if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) ++ { ++ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; ++ const uint8_t * const tc = tctable + 2 + dbp->tc_offset; ++ ++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | ++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), ++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); ++ } + } -+ pcmfa >>= 2; + } + } + } @@ -10871,18 +16438,18 @@ index 0000000000..a8601da4e7 + return x & ~(~0U << log2_n); +} + -+static inline void set_bs_h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) ++static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) +{ + av_assert2((y & 7) == 0); + + // This doesn't have the same simultainious update issues that bsf_stash + // does (other threads will have a different y) so we can do it the easy way + if ((bsf &= mask) != 0) -+ *(uint32_t *)(s->horizontal_bs + ((x >> 4) & ~3) + (y >> 3) * s->hbs_stride) |= bsf << ((x >> 1) & 31); ++ *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31); +} + + -+static void set_bs_v(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) ++static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) +{ + // We arrange this in a slightly odd fashion but it lines up with + // how we are going to use it in the actual deblock code & it is easier @@ -10894,8 +16461,7 @@ index 0000000000..a8601da4e7 + + if ((bsf &= mask) != 0) + { -+ const unsigned int stride1 = s->hbs_stride; -+ uint8_t *p = s->vertical_bs2 + (x >> 4) + (y >> 3) * stride1; ++ uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y); + const unsigned int sh = ((x & 8) | (y & 4)) >> 1; + + if (mask <= 0xf) @@ -10906,7 +16472,7 @@ index 0000000000..a8601da4e7 + { + do { + *p |= (bsf & 0xf) << sh; -+ p += stride1; ++ p += HEVC_RPI_BS_STRIDE1_BYTES; + } while ((bsf >>= 4) != 0); + } + } @@ -10918,19 +16484,10 @@ index 0000000000..a8601da4e7 + const RefPicList * const rpl_p, const RefPicList * const rpl_q, + const MvField * const mvf_p, const MvField * const mvf_q) +{ -+ uint8_t res[16]; -+ unsigned int i; -+ unsigned int a = 0; -+ -+ s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup, -+ sizeof(MvField) * mvf_stride, 1, ++ return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup, ++ mvf_p, mvf_q, + rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list, -+ mvf_p, mvf_q, res); -+ -+ for (i = 0; i != rep * dup; ++i) { -+ a |= res[i] << (i * 2); -+ } -+ return a; ++ sizeof(MvField) * mvf_stride); +} + + @@ -11050,7 +16607,7 @@ index 0000000000..a8601da4e7 + } + + // Finally put the results into bs -+ set_bs_h(s, x0, y0, bsf_mask, bsf_h); ++ hbs_set(s, x0, y0, 
bsf_mask, bsf_h); + } + + // Max of 1 pu internal split - ignore if not on 8pel boundary @@ -11061,7 +16618,7 @@ index 0000000000..a8601da4e7 + // If we have the x split as well then it must be in the middle + const unsigned int log2_rep = has_x_split ? 1 : 0; + -+ set_bs_h(s, x0, lc->cu.y_split, bsf_mask, ++ hbs_set(s, x0, lc->cu.y_split, bsf_mask, + bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), + trafo_size >> (log2_min_pu_size + log2_rep), + rpl, rpl, @@ -11074,7 +16631,7 @@ index 0000000000..a8601da4e7 + { + // Boundary left + if (x0 != 0 && -+ ((x0 & ((1 << s->ps.sps->log2_ctb_size) - 1)) != 0 || ++ (off_boundary(x0, s->ps.sps->log2_ctb_size) || + (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0)) + { + if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split)) @@ -11090,7 +16647,7 @@ index 0000000000..a8601da4e7 + mvf_curr, mvf_curr - 1); + } + -+ set_bs_v(s, x0, y0, bsf_mask, bsf_v); ++ vbs_set(s, x0, y0, bsf_mask, bsf_v); + } + + if (has_x_split && !off_boundary(lc->cu.x_split, 3)) @@ -11099,7 +16656,7 @@ index 0000000000..a8601da4e7 + (y0 >> log2_min_pu_size) * mvf_stride + (lc->cu.x_split >> log2_min_pu_size); + const unsigned int log2_rep = has_y_split ? 1 : 0; + -+ set_bs_v(s, lc->cu.x_split, y0, bsf_mask, ++ vbs_set(s, lc->cu.x_split, y0, bsf_mask, + bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), + (mvf_stride << log2_trafo_size) >> (log2_min_pu_size + log2_rep), + rpl, rpl, @@ -11135,6 +16692,12 @@ index 0000000000..a8601da4e7 + + // Deblock may not touch the edges of the bound as they are still needed + // for Intra pred ++ // ++ // Deblock is disabled with a per-slice flag ++ // Given that bounds may cover multiple slices & we dblock outside bounds ++ // anyway we can't avoid deblock using that flag - about the only thing we ++ // could do is have a "no deblock seen yet" flag but it doesn't really ++ // seem worth the effort + + deblock_y_blk(s, bounds, x_end, y_end); + deblock_uv_blk(s, bounds, x_end, y_end); @@ -11150,9 +16713,12 @@ index 0000000000..a8601da4e7 + const unsigned int xl = ussub(bounds.x, xo); + const unsigned int xr = x_end ? 
br : ussub(br, xo); + -+ for (y = yt; y < yb; y += ctb_size) { -+ for (x = xl; x < xr; x += ctb_size) { -+ sao_filter_CTB(s, x, y); ++ if (s->ps.sps->sao_enabled) ++ { ++ for (y = yt; y < yb; y += ctb_size) { ++ for (x = xl; x < xr; x += ctb_size) { ++ sao_filter_CTB(s, x, y); ++ } + } + } + @@ -12162,7 +17728,7 @@ index 0000000000..4b4d032a16 +#endif /* AVCODEC_RPI_HEVC_PARSE_H */ diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c new file mode 100644 -index 0000000000..e8df452021 +index 0000000000..744e7cf248 --- /dev/null +++ b/libavcodec/rpi_hevc_ps.c @@ -0,0 +1,1957 @@ @@ -13347,7 +18913,7 @@ index 0000000000..e8df452021 + sps->long_term_ref_pics_present_flag = get_bits1(gb); + if (sps->long_term_ref_pics_present_flag) { + sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb); -+ if (sps->num_long_term_ref_pics_sps > 31U) { ++ if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) { + av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n", + sps->num_long_term_ref_pics_sps); + return AVERROR_INVALIDDATA; @@ -14125,7 +19691,7 @@ index 0000000000..e8df452021 +} diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h new file mode 100644 -index 0000000000..c9ecf9a268 +index 0000000000..1e7120a43d --- /dev/null +++ b/libavcodec/rpi_hevc_ps.h @@ -0,0 +1,441 @@ @@ -14388,8 +19954,8 @@ index 0000000000..c9ecf9a268 + uint8_t sao_enabled; + + uint8_t long_term_ref_pics_present_flag; -+ uint16_t lt_ref_pic_poc_lsb_sps[32]; -+ uint8_t used_by_curr_pic_lt_sps_flag[32]; ++ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS]; ++ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS]; + uint8_t num_long_term_ref_pics_sps; + + struct { @@ -15093,7 +20659,7 @@ index 0000000000..d7745711ab +} diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c new file mode 100644 -index 0000000000..c5133a8a88 +index 0000000000..cd8149d58e --- /dev/null +++ b/libavcodec/rpi_hevc_sei.c @@ -0,0 +1,368 @@ @@ -15194,10 +20760,11 @@ index 0000000000..c5133a8a88 + s->quincunx_subsampling = get_bits1(gb); + s->content_interpretation_type = get_bits(gb, 6); + -+ // the following skips spatial_flipping_flag frame0_flipped_flag -+ // field_views_flag current_frame_is_frame0_flag -+ // frame0_self_contained_flag frame1_self_contained_flag -+ skip_bits(gb, 6); ++ // spatial_flipping_flag, frame0_flipped_flag, field_views_flag ++ skip_bits(gb, 3); ++ s->current_frame_is_frame0_flag = get_bits1(gb); ++ // frame0_self_contained_flag, frame1_self_contained_flag ++ skip_bits(gb, 2); + + if (!s->quincunx_subsampling && s->arrangement_type != 5) + skip_bits(gb, 16); // frame[01]_grid_position_[xy] @@ -15371,8 +20938,8 @@ index 0000000000..c5133a8a88 + return 0; +} + -+static int decode_nal_sei_prefix(GetBitContext *gb, HEVCSEIContext *s, const HEVCRpiParamSets *ps, -+ int type, int size, void *logctx) ++static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps, ++ int type, int size) +{ + switch (type) { + case 256: // Mismatched value from HM 8.1 @@ -15400,8 +20967,8 @@ index 0000000000..c5133a8a88 + } +} + -+static int decode_nal_sei_suffix(GetBitContext *gb, HEVCSEIContext *s, -+ int type, int size, void *logctx) ++static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, ++ int type, int size) +{ + switch (type) { + case HEVC_SEI_TYPE_DECODED_PICTURE_HASH: @@ -15413,9 +20980,8 @@ index 0000000000..c5133a8a88 + } +} + -+static int 
decode_nal_sei_message(GetBitContext *gb, HEVCSEIContext *s, -+ const HEVCRpiParamSets *ps, int nal_unit_type, -+ void *logctx) ++static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s, ++ const HEVCRpiParamSets * const ps, const int nal_unit_type) +{ + int payload_type = 0; + int payload_size = 0; @@ -15436,9 +21002,9 @@ index 0000000000..c5133a8a88 + payload_size += byte; + } + if (nal_unit_type == HEVC_NAL_SEI_PREFIX) { -+ return decode_nal_sei_prefix(gb, s, ps, payload_type, payload_size, logctx); ++ return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size); + } else { /* nal_unit_type == NAL_SEI_SUFFIX */ -+ return decode_nal_sei_suffix(gb, s, payload_type, payload_size, logctx); ++ return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size); + } +} + @@ -15453,7 +21019,7 @@ index 0000000000..c5133a8a88 + int ret; + + do { -+ ret = decode_nal_sei_message(gb, s, ps, type, logctx); ++ ret = decode_nal_sei_message(gb, logctx, s, ps, type); + if (ret < 0) + return ret; + } while (more_rbsp_data(gb)); @@ -15467,7 +21033,7 @@ index 0000000000..c5133a8a88 +} diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h new file mode 100644 -index 0000000000..41e4a20127 +index 0000000000..d4ac348df9 --- /dev/null +++ b/libavcodec/rpi_hevc_sei.h @@ -0,0 +1,135 @@ @@ -15533,7 +21099,6 @@ index 0000000000..41e4a20127 +} HEVC_SEI_Type; + +typedef struct HEVCSEIPictureHash { -+ struct AVMD5 *md5_ctx; + uint8_t md5[3][16]; + uint8_t is_md5; +} HEVCSEIPictureHash; @@ -15543,6 +21108,7 @@ index 0000000000..41e4a20127 + int arrangement_type; + int content_interpretation_type; + int quincunx_subsampling; ++ int current_frame_is_frame0_flag; +} HEVCSEIFramePacking; + +typedef struct HEVCSEIDisplayOrientation { @@ -20363,10 +25929,10 @@ index 0000000000..1128a2c054 +}; diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c new file mode 100644 -index 0000000000..4034c77979 +index 0000000000..08686ff260 --- /dev/null +++ b/libavcodec/rpi_hevcdec.c -@@ -0,0 +1,5753 @@ +@@ -0,0 +1,5787 @@ +/* + * HEVC video Decoder + * @@ -21302,9 +26868,9 @@ index 0000000000..4034c77979 + av_freep(&s->tab_slice_address); + av_freep(&s->filter_slice_edges); + -+ av_freep(&s->horizontal_bs); ++ av_freep(&s->bs_horizontal); +// av_freep(&s->vertical_bs); -+ av_freep(&s->vertical_bs2); ++ av_freep(&s->bs_vertical); + av_freep(&s->bsf_stash_left); + av_freep(&s->bsf_stash_up); + @@ -21325,8 +26891,13 @@ index 0000000000..4034c77979 + int ctb_count = sps->ctb_width * sps->ctb_height; + int min_pu_size = sps->min_pu_width * sps->min_pu_height; + -+ s->hbs_stride = ((width + 63) & ~63) >> 4; -+ s->bs_size = (((height + 15) & ~15) >> 3) * s->hbs_stride; ++ { ++ unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK); ++ unsigned int h = ((height + 15) & ~15); ++ ++ s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size ++ s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols ++ } + + s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly + s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock)); @@ -21352,9 +26923,9 @@ index 0000000000..4034c77979 + if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address) + goto fail; + -+ s->horizontal_bs = av_mallocz(s->bs_size); -+ s->vertical_bs2 = av_mallocz(s->bs_size); -+ if (s->horizontal_bs == NULL || s->vertical_bs2 == NULL) ++ s->bs_horizontal = 
av_mallocz(s->bs_size); ++ s->bs_vertical = av_mallocz(s->bs_size); ++ if (s->bs_horizontal == NULL || s->bs_vertical == NULL) + goto fail; + + if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL || @@ -21406,15 +26977,22 @@ index 0000000000..4034c77979 + uint8_t chroma_weight_l0_flag[16]; + uint8_t luma_weight_l1_flag[16]; + uint8_t chroma_weight_l1_flag[16]; -+ int luma_log2_weight_denom; ++ unsigned int luma_log2_weight_denom; + + luma_log2_weight_denom = get_ue_golomb_long(gb); -+ if (luma_log2_weight_denom < 0 || luma_log2_weight_denom > 7) ++ if (luma_log2_weight_denom > 7) { + av_log(s->avctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is invalid\n", luma_log2_weight_denom); -+ s->sh.luma_log2_weight_denom = av_clip_uintp2(luma_log2_weight_denom, 3); ++ return AVERROR_INVALIDDATA; ++ } ++ s->sh.luma_log2_weight_denom = luma_log2_weight_denom; + if (ctx_cfmt(s) != 0) { -+ int delta = get_se_golomb(gb); -+ s->sh.chroma_log2_weight_denom = av_clip_uintp2(s->sh.luma_log2_weight_denom + delta, 3); ++ const unsigned int chroma_log2_weight_denom = luma_log2_weight_denom + get_se_golomb(gb); ++ if (chroma_log2_weight_denom > 7) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "chroma_log2_weight_denom %d is invalid\n", chroma_log2_weight_denom); ++ return AVERROR_INVALIDDATA; ++ } ++ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom; + } + + for (i = 0; i < s->sh.nb_refs[L0]; i++) { @@ -21741,6 +27319,7 @@ index 0000000000..4034c77979 + if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) { + const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data; + const HEVCRpiSPS *last_sps = s->ps.sps; ++ enum AVPixelFormat pix_fmt; + + if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) { + if (sps->width != last_sps->width || sps->height != last_sps->height || @@ -21750,10 +27329,20 @@ index 0000000000..4034c77979 + } + ff_hevc_rpi_clear_refs(s); + -+ ret = set_sps(s, sps, get_format(s, sps)); ++ ret = set_sps(s, sps, sps->pix_fmt); + if (ret < 0) + return ret; + ++ pix_fmt = get_format(s, sps); ++ if (pix_fmt < 0) ++ return pix_fmt; ++ ++// ret = set_sps(s, sps, pix_fmt); ++// if (ret < 0) ++// return ret; ++ ++ s->avctx->pix_fmt = pix_fmt; ++ + s->seq_decode = (s->seq_decode + 1) & 0xff; + s->max_ra = INT_MAX; + } @@ -25184,6 +30773,13 @@ index 0000000000..4034c77979 + + if (s->sei.frame_packing.content_interpretation_type == 2) + stereo->flags = AV_STEREO3D_FLAG_INVERT; ++ ++ if (s->sei.frame_packing.arrangement_type == 5) { ++ if (s->sei.frame_packing.current_frame_is_frame0_flag) ++ stereo->view = AV_STEREO3D_VIEW_LEFT; ++ else ++ stereo->view = AV_STEREO3D_VIEW_RIGHT; ++ } + } + + if (s->sei.display_orientation.present && @@ -25297,8 +30893,8 @@ index 0000000000..4034c77979 + ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1); + int ret; + -+ memset(s->horizontal_bs, 0, s->bs_size); -+ memset(s->vertical_bs2, 0, s->bs_size); ++ memset(s->bs_horizontal, 0, s->bs_size); ++ memset(s->bs_vertical, 0, s->bs_size); + memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); + memset(s->skip_flag, 0, s->ps.sps->min_cb_height * s->skip_flag_stride); + memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); @@ -25421,7 +31017,12 @@ index 0000000000..4034c77979 + } + } +#endif -+ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { ++ if ( ++ (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) || ++ (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == 
HEVC_SLICE_B) || ++ (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) || ++ (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IDR(s))) ++ { + s->is_decoded = 0; + break; + } @@ -25596,7 +31197,7 @@ index 0000000000..4034c77979 + int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height; + uint8_t md5[16]; + -+ av_md5_init(s->sei.picture_hash.md5_ctx); ++ av_md5_init(s->md5_ctx); + for (j = 0; j < h; j++) { + const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1); +#if HAVE_BIGENDIAN @@ -25606,9 +31207,9 @@ index 0000000000..4034c77979 + src = s->checksum_buf; + } +#endif -+ av_md5_update(s->sei.picture_hash.md5_ctx, src, w << pixel_shift); ++ av_md5_update(s->md5_ctx, src, w << pixel_shift); + } -+ av_md5_final(s->sei.picture_hash.md5_ctx, md5); ++ av_md5_final(s->md5_ctx, md5); + + if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) { + av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i); @@ -25759,7 +31360,7 @@ index 0000000000..4034c77979 + + pic_arrays_free(s); + -+ av_freep(&s->sei.picture_hash.md5_ctx); ++ av_freep(&s->md5_ctx); + + av_freep(&s->cabac_save); + @@ -25871,8 +31472,7 @@ index 0000000000..4034c77979 + + s->max_ra = INT_MAX; + -+ s->sei.picture_hash.md5_ctx = av_md5_alloc(); -+ if (!s->sei.picture_hash.md5_ctx) ++ if ((s->md5_ctx = av_md5_alloc()) == NULL) + goto fail; + + s->context_initialized = 1; @@ -26122,10 +31722,10 @@ index 0000000000..4034c77979 + diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h new file mode 100644 -index 0000000000..117432de0a +index 0000000000..df2bac1df4 --- /dev/null +++ b/libavcodec/rpi_hevcdec.h -@@ -0,0 +1,985 @@ +@@ -0,0 +1,1002 @@ +/* + * HEVC video decoder + * @@ -26397,6 +31997,8 @@ index 0000000000..117432de0a + INTRA_ANGULAR_33, + INTRA_ANGULAR_34, +}; ++#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 ++#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26 + +enum SAOType { + SAO_NOT_APPLIED = 0, @@ -26813,6 +32415,17 @@ index 0000000000..117432de0a + uint8_t state[HEVC_CONTEXTS]; +} HEVCRpiCabacState; + ++#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels ++#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1) ++#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte ++#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el ++#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row ++#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++ +typedef struct HEVCRpiContext { + const AVClass *c; // needed by private avoptions + AVCodecContext *avctx; @@ -26882,17 +32495,19 @@ index 0000000000..117432de0a + int eos; ///< current packet contains an EOS/EOB NAL + int last_eos; ///< last packet contains an EOS/EOB NAL + int max_ra; -+ unsigned int hbs_stride; -+ unsigned int bs_size; + + int is_decoded; + int no_rasl_output_flag; + -+ HEVCPredContext hpc; ++ HEVCRpiPredContext hpc; + HEVCDSPContext hevcdsp; + int8_t *qp_y_tab; -+ uint8_t *horizontal_bs; -+ uint8_t *vertical_bs2; ++ ++ // Deblocking block strength bitmaps ++ unsigned int bs_stride2; ++ unsigned int bs_size; ++ uint8_t *bs_horizontal; ++ uint8_t *bs_vertical; + uint8_t *bsf_stash_up; + uint8_t 
*bsf_stash_left; + @@ -26930,6 +32545,8 @@ index 0000000000..117432de0a + int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4) + int nuh_layer_id; + ++ struct AVMD5 *md5_ctx; ++ + HEVCSEIContext sei; + + // Put structures that allocate non-trivial storage at the end @@ -27113,10 +32730,10 @@ index 0000000000..117432de0a +#endif /* AVCODEC_RPI_HEVCDEC_H */ diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c new file mode 100644 -index 0000000000..a6af5ecd85 +index 0000000000..c5d130c377 --- /dev/null +++ b/libavcodec/rpi_hevcdsp.c -@@ -0,0 +1,416 @@ +@@ -0,0 +1,419 @@ +/* + * HEVC video decoder + * @@ -27242,10 +32859,12 @@ index 0000000000..a6af5ecd85 +#include "rpi_hevcdsp_template.c" +#undef BIT_DEPTH + -+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc, ++static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const MvField *curr, const MvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ const MvField *curr, const MvField *neigh, uint8_t *bs) ++ int in_inc) +{ ++ int shift = 32; ++ uint32_t bs = 0; + for (; pus > 0; pus--) { + int strength, out; + int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; @@ -27350,10 +32969,11 @@ index 0000000000..a6af5ecd85 + + for (out = dup; out > 0; out--) + { -+ *bs = strength; -+ bs += out_inc; ++ bs = (bs >> 2) | (strength << 30); ++ shift -= 2; + } + } ++ return bs >> shift; +} + +void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) @@ -27535,7 +33155,7 @@ index 0000000000..a6af5ecd85 +} diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h new file mode 100644 -index 0000000000..59d06bbe28 +index 0000000000..8c9bf725bf --- /dev/null +++ b/libavcodec/rpi_hevcdsp.h @@ -0,0 +1,183 @@ @@ -27707,9 +33327,9 @@ index 0000000000..59d06bbe28 + uint8_t * src_l, + unsigned int no_f); + -+ void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc, ++ uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const MvField *curr, const MvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ const MvField *curr, const MvField *neigh, uint8_t *bs); ++ int in_inc); +} HEVCDSPContext; + +void ff_hevc_rpi_dsp_init(HEVCDSPContext *hpc, int bit_depth); @@ -30008,10 +35628,10 @@ index 0000000000..cfe9264fc3 + diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c new file mode 100644 -index 0000000000..f6db76482d +index 0000000000..113ed33d64 --- /dev/null +++ b/libavcodec/rpi_hevcpred.c -@@ -0,0 +1,122 @@ +@@ -0,0 +1,150 @@ +/* + * HEVC video Decoder + * @@ -30037,6 +35657,9 @@ index 0000000000..f6db76482d +#include "rpi_hevcdec.h" + +#include "rpi_hevcpred.h" ++#if (ARCH_ARM) ++#include "arm/rpi_hevcpred_arm.h" ++#endif + +#define PRED_C 0 +#define BIT_DEPTH 8 @@ -30074,7 +35697,7 @@ index 0000000000..f6db76482d +#undef BIT_DEPTH +#undef PRED_C + -+void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth) ++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth) +{ +#undef FUNC +#define FUNC(a, depth) a ## _ ## depth @@ -30091,7 +35714,18 @@ index 0000000000..f6db76482d + hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \ + hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \ + hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \ -+ hpc->pred_dc = FUNC(pred_dc, depth); \ ++ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \ ++ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \ ++ hpc->pred_dc[2] = 
FUNC(pred_dc_2, depth); \ ++ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \ ++ hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \ ++ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \ + hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \ + hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \ + hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ @@ -30106,7 +35740,18 @@ index 0000000000..f6db76482d + hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ + hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ + hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ -+ hpc->pred_dc_c = FUNCC(pred_dc, depth); \ ++ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \ ++ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \ ++ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \ ++ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \ ++ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \ ++ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \ + hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ + hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ + hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ @@ -30131,15 +35776,18 @@ index 0000000000..f6db76482d + break; + } + -+ if (ARCH_MIPS) -+ ff_hevc_rpi_pred_init_mips(hpc, bit_depth); ++#if (ARCH_ARM) ++ ff_hevc_rpi_pred_init_arm(hpc, bit_depth); ++#elif (ARCH_MIPS) ++ ff_hevc_rpi_pred_init_mips(hpc, bit_depth); ++#endif +} diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h new file mode 100644 -index 0000000000..03c6eb3295 +index 0000000000..31d7d57d95 --- /dev/null +++ b/libavcodec/rpi_hevcpred.h -@@ -0,0 +1,57 @@ +@@ -0,0 +1,68 @@ +/* + * HEVC video Decoder + * @@ -30172,37 +35820,48 @@ index 0000000000..03c6eb3295 +struct HEVCRpiContext; +struct HEVCRpiLocalContext; + -+typedef struct HEVCPredContext { ++typedef struct HEVCRpiPredContext { + void (*intra_pred[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx); + + void (*pred_planar[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); -+ void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left, -+ ptrdiff_t stride, int log2_size, int c_idx); ++ void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride); + void (*pred_angular[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, -+ int c_idx, int mode); ++ int mode); ++ void (*pred_vertical[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); + void (*intra_pred_c[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx); + + void 
(*pred_planar_c[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); -+ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left, -+ ptrdiff_t stride, int log2_size, int c_idx); ++ void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride); + void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, -+ int c_idx, int mode); -+} HEVCPredContext; ++ int mode); ++ void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++} HEVCRpiPredContext; + -+void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth); -+void ff_hevc_rpi_pred_init_mips(HEVCPredContext *hpc, int bit_depth); ++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth); + +#endif /* AVCODEC_RPI_HEVCPRED_H */ diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c new file mode 100644 -index 0000000000..4ee776f955 +index 0000000000..a76ba4c442 --- /dev/null +++ b/libavcodec/rpi_hevcpred_template.c -@@ -0,0 +1,850 @@ +@@ -0,0 +1,983 @@ +/* + * HEVC video decoder + * @@ -30396,20 +36055,21 @@ index 0000000000..4ee776f955 + const enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c : + lc->tu.intra_pred_mode; + pixel4 a; -+ pixel left_array[2 * MAX_TB_SIZE + 1]; ++ ++ // Align so we can do multiple loads in the asm ++ // Padded to 16 byte boundary so as not to confuse anything ++ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); ++ DECLARE_ALIGNED(16, pixel, top_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); +#if !PRED_C -+ pixel filtered_left_array[2 * MAX_TB_SIZE + 1]; -+#endif -+ pixel top_array[2 * MAX_TB_SIZE + 1]; -+#if !PRED_C -+ pixel filtered_top_array[2 * MAX_TB_SIZE + 1]; ++ DECLARE_ALIGNED(16, pixel, filtered_left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); ++ DECLARE_ALIGNED(16, pixel, filtered_top_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); +#endif + -+ pixel *left = left_array + 1; -+ pixel *top = top_array + 1; ++ pixel *left = left_array + 16 / sizeof(pixel); ++ pixel *top = top_array + 16 / sizeof(pixel); +#if !PRED_C -+ pixel *filtered_left = filtered_left_array + 1; -+ pixel *filtered_top = filtered_top_array + 1; ++ pixel *filtered_left = filtered_left_array + 16 / sizeof(pixel); ++ pixel *filtered_top = filtered_top_array + 16 / sizeof(pixel); +#endif + int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); + int cand_left = lc->na.cand_left; @@ -30664,12 +36324,22 @@ index 0000000000..4ee776f955 + (uint8_t *)left, stride); + break; + case INTRA_DC: -+ s->hpc.pred_dc((uint8_t *)src, (uint8_t *)top, -+ (uint8_t *)left, stride, log2_size, c_idx); ++ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_ANGULAR_HORIZONTAL: ++ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ case INTRA_ANGULAR_VERTICAL: ++ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, ++ mode); + break; + default: + s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top, -+ (uint8_t *)left, stride, c_idx, ++ (uint8_t *)left, stride, + mode); + break; + } @@ -30680,12 +36350,22 @@ index 
0000000000..4ee776f955 + (uint8_t *)left, stride); + break; + case INTRA_DC: -+ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top, -+ (uint8_t *)left, stride, log2_size, c_idx); ++ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_ANGULAR_HORIZONTAL: ++ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ case INTRA_ANGULAR_VERTICAL: ++ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, ++ mode); + break; + default: + s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, -+ (uint8_t *)left, stride, c_idx, ++ (uint8_t *)left, stride, + mode); + break; + } @@ -30768,7 +36448,7 @@ index 0000000000..4ee776f955 +#if !PRED_C +static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, -+ ptrdiff_t stride, int log2_size, int c_idx) ++ ptrdiff_t stride, int log2_size) +{ + int i, j, x, y; + int size = (1 << log2_size); @@ -30788,7 +36468,10 @@ index 0000000000..4ee776f955 + for (j = 0; j < size; j+=4) + AV_WN4P(&POS(j, i), a); + -+ if (c_idx == 0 && size < 32) { ++// if (c_idx == 0 && size < 32) ++// As we now have separate fns for y & c - no need to test that ++ if (size < 32) ++ { + POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2; + for (x = 1; x < size; x++) + POS(x, 0) = (top[x] + 3 * dc + 2) >> 2; @@ -30799,7 +36482,7 @@ index 0000000000..4ee776f955 +#else +static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, -+ ptrdiff_t stride, int log2_size, int c_idx) ++ ptrdiff_t stride, int log2_size) +{ + unsigned int i, j; + const unsigned int size = (1 << log2_size); @@ -30830,6 +36513,20 @@ index 0000000000..4ee776f955 +} +#endif + ++#define PRED_DC(size)\ ++static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \ ++ const uint8_t *left, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_dc)(src, top, left, stride, size + 2); \ ++} ++ ++PRED_DC(0) ++PRED_DC(1) ++PRED_DC(2) ++PRED_DC(3) ++ ++#undef PRED_DC ++ +#ifndef ANGLE_CONSTS +#define ANGLE_CONSTS +static const int intra_pred_angle[] = { @@ -30846,7 +36543,7 @@ index 0000000000..4ee776f955 +static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const uint8_t *_top, + const uint8_t *_left, -+ ptrdiff_t stride, int c_idx, ++ ptrdiff_t stride, + int mode, int size) +{ + int x, y; @@ -30889,10 +36586,12 @@ index 0000000000..4ee776f955 + AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1])); + } + } -+ if (mode == 26 && c_idx == 0 && size < 32) { ++// if (mode == 26 && c_idx == 0 && size < 32) { ++ if (mode == 26 && size < 32) { + for (y = 0; y < size; y++) + POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1)); + } ++ + } else { + ref = left - 1; + if (angle < 0 && last < -1) { @@ -30916,7 +36615,8 @@ index 0000000000..4ee776f955 + POS(x, y) = ref[y + idx + 1]; + } + } -+ if (mode == 10 && c_idx == 0 && size < 32) { ++// if (mode == 10 && c_idx == 0 && size < 32) { ++ if (mode == 10 && size < 32) { + for (x = 0; x < size; x += 4) { + POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - top[-1]) >> 1)); + POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - top[-1]) >> 1)); @@ -30925,12 +36625,61 @@ index 0000000000..4ee776f955 + } + } + } ++ ++ ++ ++#if BIT_DEPTH == 8 && 0 ++ if ((size == 16 || size == 32) && mode != 10 && mode != 26) { ++ DECLARE_ALIGNED(16, uint8_t, a[64*32]); ++ void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, 
ptrdiff_t stride, int mode); ++// void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++#if 1 ++ src = (pixel *)_src; ++ printf("C: Mode=%d\n", mode); ++ for (y = 0; y < size; y++, src += stride) ++ { ++ printf("%2d: ", y); ++ for (x = 0; x < size; x++) ++ { ++ printf("%3x ", src[x]); ++ } ++ printf("\n"); ++ } ++#endif ++// ff_hevc_rpi_pred_vertical_16_neon_8(a, _top, _left, size); ++ memset(a, 0, sizeof(a)); ++// ff_hevc_rpi_pred_angular_32_neon_10(a, _top, _left, size, mode); ++ ff_hevc_rpi_pred_angular_16_neon_8(a, _top, _left, size, mode); ++#if 1 ++ src = (pixel *)a; ++ printf("A:\n"); ++ for (y = 0; y < size; y++, src += size) ++ { ++ printf("%2d: ", y); ++ for (x = 0; x < size; x++) ++ { ++ printf("%3x ", src[x]); ++ } ++ printf("\n"); ++ } ++#endif ++ src = (pixel *)_src; ++ for (y = 0; y < size; y++, src += stride) ++ { ++ if (memcmp(src, a + size * sizeof(pixel) * y, size * sizeof(pixel)) != 0) { ++ printf("Fail at line %d\n", y); ++ av_assert0(0); ++ } ++ } ++ } ++#endif ++ +} +#else +static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const uint8_t *_top, + const uint8_t *_left, -+ ptrdiff_t stride, int c_idx, ++ ptrdiff_t stride, + int mode, int size) +{ + int x, y; @@ -31001,35 +36750,78 @@ index 0000000000..4ee776f955 + } + } + } ++ ++#if BIT_DEPTH == 10 && 0 ++ if (size == 16 && mode != 10 && mode != 26) { ++ DECLARE_ALIGNED(16, uint8_t, a[64*32]); ++// void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++ void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++ src = (c_dst_ptr_t)_src; ++ printf("C: mode=%d\n", mode); ++ for (y = 0; y < size; y++, src += stride) ++ { ++ for (x = 0; x < size; x++) ++ { ++ printf("%3x:%3x ", src[x][0], src[x][1]); ++ } ++ printf("\n"); ++ } ++ ++ memset(a, 0, sizeof(a)); ++ ff_hevc_rpi_pred_angular_c_16_neon_10(a, _top, _left, size, mode); ++ ++ src = (c_dst_ptr_t)a; ++ printf("A:\n"); ++ for (y = 0; y < size; y++, src += size) ++ { ++ for (x = 0; x < size; x++) ++ { ++ printf("%3x:%3x ", src[x][0], src[x][1]); ++ } ++ printf("\n"); ++ } ++ ++ src = (c_dst_ptr_t)_src; ++ for (y = 0; y < size; y++, src += stride) ++ { ++ if (memcmp(src, a + size * sizeof(pixel) * y, size * sizeof(pixel)) != 0) { ++ printf("Fail at line %d\n", y); ++ av_assert0(0); ++ } ++ } ++ ++ } ++#endif +} +#endif + +static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, + const uint8_t *left, -+ ptrdiff_t stride, int c_idx, int mode) ++ ptrdiff_t stride, int mode) +{ -+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 2); ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2); +} + +static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top, + const uint8_t *left, -+ ptrdiff_t stride, int c_idx, int mode) ++ ptrdiff_t stride, int mode) +{ -+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 3); ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3); +} + +static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top, + const uint8_t *left, -+ ptrdiff_t stride, int c_idx, int mode) ++ ptrdiff_t stride, int mode) +{ -+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 4); ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4); +} + +static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, + const uint8_t *left, -+ ptrdiff_t stride, int c_idx, int mode) ++ ptrdiff_t 
stride, int mode) +{ -+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5); ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5); +} + +#undef cpel From 587cbeff145a78d707afca292701804b1d47c4df Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH 08/13] ffmpeg: cleanup configure --- packages/multimedia/ffmpeg/package.mk | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index 4d9a4a6157..aadec506e0 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -81,6 +81,12 @@ else FFMPEG_FPU="--disable-neon" fi +if [ "$TARGET_ARCH" = "x86_64" ]; then + FFMPEG_X86ASM="--enable-x86asm" +else + FFMPEG_X86ASM="--disable-x86asm" +fi + pre_configure_target() { cd $PKG_BUILD rm -rf .$TARGET_NAME @@ -161,7 +167,6 @@ configure_target() { --enable-encoder=wmav2 \ --enable-encoder=mjpeg \ --enable-encoder=png \ - --disable-decoder=mpeg_xvmc \ --enable-hwaccels \ --disable-muxers \ --enable-muxer=spdif \ @@ -202,7 +207,7 @@ configure_target() { --enable-asm \ --disable-altivec \ $FFMPEG_FPU \ - --enable-yasm \ + $FFMPEG_X86ASM \ --disable-symver } From c55b87ed1ef120157ea801d36d58327f9c9c57e4 Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH 09/13] ffmpeg: use PKG_* variables --- packages/multimedia/ffmpeg/package.mk | 52 +++++++++++++-------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index aadec506e0..9207d5450c 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -37,29 +37,29 @@ get_graphicdrivers if [ "$VAAPI_SUPPORT" = "yes" ]; then PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET libva" - FFMPEG_VAAPI="--enable-vaapi" + PKG_FFMPEG_VAAPI="--enable-vaapi" else - FFMPEG_VAAPI="--disable-vaapi" + PKG_FFMPEG_VAAPI="--disable-vaapi" fi if [ "$VDPAU_SUPPORT" = "yes" -a "$DISPLAYSERVER" = "x11" ]; then PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET libvdpau" - FFMPEG_VDPAU="--enable-vdpau" + PKG_FFMPEG_VDPAU="--enable-vdpau" else - FFMPEG_VDPAU="--disable-vdpau" + PKG_FFMPEG_VDPAU="--disable-vdpau" fi if [ "$PROJECT" = "Rockchip" ]; then PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET rkmpp" - FFMPEG_RKMPP="--enable-rkmpp --enable-libdrm --enable-version3" + PKG_FFMPEG_RKMPP="--enable-rkmpp --enable-libdrm --enable-version3" else - FFMPEG_RKMPP="--disable-rkmpp" + PKG_FFMPEG_RKMPP="--disable-rkmpp" fi if build_with_debug; then - FFMPEG_DEBUG="--enable-debug --disable-stripping" + PKG_FFMPEG_DEBUG="--enable-debug --disable-stripping" else - FFMPEG_DEBUG="--disable-debug --enable-stripping" + PKG_FFMPEG_DEBUG="--disable-debug --enable-stripping" fi if [ "$KODIPLAYER_DRIVER" = "bcm2835-driver" ]; then @@ -68,23 +68,23 @@ fi case "$TARGET_ARCH" in arm) - FFMPEG_TABLES="--enable-hardcoded-tables" + PKG_FFMPEG_TABLES="--enable-hardcoded-tables" ;; *) - FFMPEG_TABLES="--disable-hardcoded-tables" + PKG_FFMPEG_TABLES="--disable-hardcoded-tables" ;; esac if target_has_feature neon; then - FFMPEG_FPU="--enable-neon" + PKG_FFMPEG_FPU="--enable-neon" else - FFMPEG_FPU="--disable-neon" + PKG_FFMPEG_FPU="--disable-neon" fi if [ "$TARGET_ARCH" = "x86_64" ]; then - FFMPEG_X86ASM="--enable-x86asm" + PKG_FFMPEG_X86ASM="--enable-x86asm" else - FFMPEG_X86ASM="--disable-x86asm" + PKG_FFMPEG_X86ASM="--disable-x86asm" fi pre_configure_target() { @@ -93,10 +93,10 
@@ pre_configure_target() { if [ "$KODIPLAYER_DRIVER" = "bcm2835-driver" ]; then CFLAGS="-I$SYSROOT_PREFIX/usr/include/interface/vcos/pthreads -I$SYSROOT_PREFIX/usr/include/interface/vmcs_host/linux $CFLAGS" - FFMPEG_LIBS="-lbcm_host -lvcos -lvchiq_arm -lmmal -lmmal_core -lmmal_util -lvcsm" - FFMPEG_RPI="--enable-rpi" + PKG_FFMPEG_LIBS="-lbcm_host -lvcos -lvchiq_arm -lmmal -lmmal_core -lmmal_util -lvcsm" + PKG_FFMPEG_RPI="--enable-rpi" else - FFMPEG_RPI="--disable-rpi" + PKG_FFMPEG_RPI="--disable-rpi" fi } @@ -119,7 +119,7 @@ configure_target() { --host-ldflags="$HOST_LDFLAGS" \ --extra-cflags="$CFLAGS" \ --extra-ldflags="$LDFLAGS" \ - --extra-libs="$FFMPEG_LIBS" \ + --extra-libs="$PKG_FFMPEG_LIBS" \ --disable-static \ --enable-shared \ --enable-gpl \ @@ -127,7 +127,7 @@ configure_target() { --enable-nonfree \ --enable-logging \ --disable-doc \ - $FFMPEG_DEBUG \ + $PKG_FFMPEG_DEBUG \ --enable-pic \ --pkg-config="$TOOLCHAIN/bin/pkg-config" \ --enable-optimizations \ @@ -154,13 +154,13 @@ configure_target() { --enable-mdct \ --enable-rdft \ --disable-crystalhd \ - $FFMPEG_VAAPI \ - $FFMPEG_VDPAU \ - $FFMPEG_RPI \ - $FFMPEG_RKMPP \ + $PKG_FFMPEG_VAAPI \ + $PKG_FFMPEG_VDPAU \ + $PKG_FFMPEG_RPI \ + $PKG_FFMPEG_RKMPP \ --disable-dxva2 \ --enable-runtime-cpudetect \ - $FFMPEG_TABLES \ + $PKG_FFMPEG_TABLES \ --disable-encoders \ --enable-encoder=ac3 \ --enable-encoder=aac \ @@ -206,8 +206,8 @@ configure_target() { --enable-zlib \ --enable-asm \ --disable-altivec \ - $FFMPEG_FPU \ - $FFMPEG_X86ASM \ + $PKG_FFMPEG_FPU \ + $PKG_FFMPEG_X86ASM \ --disable-symver } From d1b27be7c5b92271a79274bbe6e0646dab6be46b Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH 10/13] ffmpeg: hevc: Fixes for ffmpeg 4 --- ...mpeg-99.1003-pfcd_hevc_optimisations.patch | 100 ++++++++---------- 1 file changed, 47 insertions(+), 53 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index 91ea9da3dd..32c0f1f17b 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -16771,10 +16771,10 @@ index 0000000000..4bfa000da4 + diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c new file mode 100644 -index 0000000000..93f3530ff5 +index 0000000000..93a6294c76 --- /dev/null +++ b/libavcodec/rpi_hevc_mvs.c -@@ -0,0 +1,761 @@ +@@ -0,0 +1,759 @@ +/* + * HEVC video decoder + * @@ -17017,8 +17017,7 @@ index 0000000000..93f3530ff5 + x < s->ps.sps->width) { + x &= ~15; + y &= ~15; -+ if (s->threads_type == FF_THREAD_FRAME) -+ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y); ++ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y); + x_pu = x >> s->ps.sps->log2_min_pu_size; + y_pu = y >> s->ps.sps->log2_min_pu_size; + temp_col = TAB_MVF(x_pu, y_pu); @@ -17031,8 +17030,7 @@ index 0000000000..93f3530ff5 + y = y0 + (nPbH >> 1); + x &= ~15; + y &= ~15; -+ if (s->threads_type == FF_THREAD_FRAME) -+ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y); ++ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y); + x_pu = x >> s->ps.sps->log2_min_pu_size; + y_pu = y >> s->ps.sps->log2_min_pu_size; + temp_col = TAB_MVF(x_pu, y_pu); @@ -19691,10 +19689,10 @@ index 0000000000..744e7cf248 +} diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h new file mode 100644 -index 0000000000..1e7120a43d +index 0000000000..00c1f14614 --- /dev/null +++ 
b/libavcodec/rpi_hevc_ps.h -@@ -0,0 +1,441 @@ +@@ -0,0 +1,444 @@ +/* + * HEVC parameter set parsing + * @@ -19803,6 +19801,9 @@ index 0000000000..1e7120a43d + int num_entry_point_offsets; + int offsets_allocated; + ++ uint8_t offload_wpp; ++ uint8_t offload_tiles; ++ + int8_t slice_qp; + + uint8_t luma_log2_weight_denom; @@ -25929,10 +25930,10 @@ index 0000000000..1128a2c054 +}; diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c new file mode 100644 -index 0000000000..08686ff260 +index 0000000000..bddf0c3417 --- /dev/null +++ b/libavcodec/rpi_hevcdec.c -@@ -0,0 +1,5787 @@ +@@ -0,0 +1,5782 @@ +/* + * HEVC video Decoder + * @@ -26911,7 +26912,10 @@ index 0000000000..08686ff260 + goto fail; + + s->tab_ipm = av_mallocz(min_pu_size); -+ s->is_pcm = av_malloc_array(sps->pcm_width, sps->pcm_height); ++ // We can overread by 1 line & one byte in deblock so alloc & zero ++ // We don't need to zero the extra @ start of frame as it will never be ++ // written ++ s->is_pcm = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); + if (!s->tab_ipm || !s->is_pcm) + goto fail; + @@ -27645,6 +27649,9 @@ index 0000000000..08686ff260 + } + + sh->num_entry_point_offsets = 0; ++ sh->offload_wpp = 0; ++ sh->offload_wpp = 0; ++ + if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) { + unsigned num_entry_point_offsets = get_ue_golomb_long(gb); + // It would be possible to bound this tighter but this here is simpler @@ -27681,6 +27688,18 @@ index 0000000000..08686ff260 + } + sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size + } ++ ++ // Do we want to offload this ++ if (s->threads_type != 0) ++ { ++ sh->offload_wpp = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) && ++ s->ps.pps->num_tile_columns > 1; ++ // * We only cope with WPP in a single column ++ // Probably want to deal with that case as tiles rather than WPP anyway ++ // ?? 
Not actually sure that the main code deals with WPP + multi-col correctly ++ sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag && ++ s->ps.pps->num_tile_columns == 1; ++ } + } + } + @@ -28231,7 +28250,7 @@ index 0000000000..08686ff260 +static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCFrame * const ref, + const Mv * const mv, const int y0, const int height) +{ -+ if (s->threads_type == FF_THREAD_FRAME) { ++ if (s->threads_type != 0) { + const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); + + // Progress has to be attached to current job as the actual wait @@ -29408,7 +29427,7 @@ index 0000000000..08686ff260 + (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0); + + // Signal -+ if (s->threads_type == FF_THREAD_FRAME && y > 0) { ++ if (y > 0) { + // Cast away const as progress is held in s, but this really shouldn't confuse anything + ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1); + } @@ -30179,7 +30198,7 @@ index 0000000000..08686ff260 + ff_hevc_rpi_save_states(s, lc); + + // Report progress so we can use our MVs in other frames -+ if (s->threads_type == FF_THREAD_FRAME && (ctb_flags & CTB_TS_FLAGS_EOL) != 0) ++ if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0) + ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1); + + // End of line || End of tile line || End of tile @@ -30593,9 +30612,7 @@ index 0000000000..08686ff260 + +#if RPI_EXTRA_BIT_THREADS > 0 + -+ if (s->sh.num_entry_point_offsets != 0 && -+ (!s->ps.pps->tile_wpp_inter_disable || s->sh.slice_type == HEVC_SLICE_I) && -+ s->ps.pps->num_tile_columns > 1) ++ if (s->sh.offload_tiles) + { + unsigned int slice_row = 0; + @@ -30640,14 +30657,7 @@ index 0000000000..08686ff260 + printf("%s: Done wait: ts=%d\n", __func__, lc->ts); +#endif + } -+ else -+ -+ // * We only cope with WPP in a single column -+ // Probably want to deal with that case as tiles rather than WPP anyway -+ // ?? 
Not actually sure that the main code deals with WPP + multi-col correctly -+ if (s->ps.pps->entropy_coding_sync_enabled_flag && -+ s->ps.pps->num_tile_columns == 1 && -+ s->sh.num_entry_point_offsets != 0) ++ else if (s->sh.offload_wpp) + { +#if TRACE_WPP + printf("%s: Do WPP\n", __func__); @@ -31002,8 +31012,7 @@ index 0000000000..08686ff260 + s->nal_unit_type == HEVC_NAL_STSA_N || + s->nal_unit_type == HEVC_NAL_RADL_N || + s->nal_unit_type == HEVC_NAL_RASL_N); -+ s->offload_recon = s->used_for_ref; -+// s->offload_recon = 0; ++ s->offload_recon = s->threads_type != 0 && s->used_for_ref; + +#if DEBUG_DECODE_N + { @@ -31145,7 +31154,7 @@ index 0000000000..08686ff260 + +fail: // Also success path + if (s->ref != NULL) { -+ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) { ++ if (s->used_for_ref && s->threads_type != 0) { + ff_hevc_rpi_progress_signal_all_done(s); + } + else { @@ -31394,12 +31403,6 @@ index 0000000000..08686ff260 + s->ps.pps = NULL; + s->ps.vps = NULL; + -+ for (i = 1; i < s->threads_number; i++) { -+ if (s->sList[i] != NULL) { -+ av_freep(&s->sList[i]); -+ } -+ } -+ + // Free separately from sLists as used that way by RPI WPP + for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) { + av_freep(s->HEVClcList + i); @@ -31428,7 +31431,6 @@ index 0000000000..08686ff260 + if (!s->HEVClc) + goto fail; + s->HEVClcList[0] = s->HEVClc; -+ s->sList[0] = s; + + // Whilst FFmpegs init fn is only called once the close fn is called as + // many times as we have threads (init_thread_copy is called for the @@ -31553,7 +31555,6 @@ index 0000000000..08686ff260 + s->is_nalff = s0->is_nalff; + s->nal_length_size = s0->nal_length_size; + -+ s->threads_number = s0->threads_number; + s->threads_type = s0->threads_type; + + if (s0->eos) { @@ -31611,11 +31612,6 @@ index 0000000000..08686ff260 + + atomic_init(&s->wpp_err, 0); + -+ if(avctx->active_thread_type & FF_THREAD_SLICE) -+ s->threads_number = avctx->thread_count; -+ else -+ s->threads_number = 1; -+ + if (avctx->extradata_size > 0 && avctx->extradata) { + ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1); + @@ -31632,7 +31628,7 @@ index 0000000000..08686ff260 + if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) + s->threads_type = FF_THREAD_FRAME; + else -+ s->threads_type = FF_THREAD_SLICE; ++ s->threads_type = 0; + + return 0; +} @@ -31722,10 +31718,10 @@ index 0000000000..08686ff260 + diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h new file mode 100644 -index 0000000000..df2bac1df4 +index 0000000000..d242727b2a --- /dev/null +++ b/libavcodec/rpi_hevcdec.h -@@ -0,0 +1,1002 @@ +@@ -0,0 +1,1000 @@ +/* + * HEVC video decoder + * @@ -32430,13 +32426,10 @@ index 0000000000..df2bac1df4 + const AVClass *c; // needed by private avoptions + AVCodecContext *avctx; + -+ struct HEVCRpiContext *sList[MAX_NB_THREADS]; -+ + HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS]; + HEVCRpiLocalContext *HEVClc; + + uint8_t threads_type; -+ uint8_t threads_number; + + /** 1 if the independent slice segment header was successfully parsed */ + uint8_t slice_initialized; @@ -32641,12 +32634,13 @@ index 0000000000..df2bac1df4 +static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb, + const HEVCFrame * const ref, const int y) +{ -+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); ++ if (s->threads_type != 0) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); +} + +static inline void 
ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y) +{ -+ if (s->used_for_ref) ++ if (s->used_for_ref && s->threads_type != 0) + ff_hevc_rpi_progress_signal_field(s, y, 1); +} + @@ -32658,7 +32652,7 @@ index 0000000000..df2bac1df4 + +static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y) +{ -+ if (s->used_for_ref) ++ if (s->used_for_ref && s->threads_type != 0) + { + ff_hevc_rpi_progress_signal_field(s, y, 0); + } @@ -33344,7 +33338,7 @@ index 0000000000..8c9bf725bf +#endif /* AVCODEC_RPI_HEVCDSP_H */ diff --git a/libavcodec/rpi_hevcdsp_template.c b/libavcodec/rpi_hevcdsp_template.c new file mode 100644 -index 0000000000..cfe9264fc3 +index 0000000000..d1196a4440 --- /dev/null +++ b/libavcodec/rpi_hevcdsp_template.c @@ -0,0 +1,2278 @@ @@ -33929,7 +33923,7 @@ index 0000000000..cfe9264fc3 + pixel *src = (pixel *)_src; + int a_stride, b_stride; + int x, y; -+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); + stride_dst /= sizeof(pixel); + + a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; @@ -34157,7 +34151,7 @@ index 0000000000..cfe9264fc3 + pixel *src = (pixel *)_src; + int a_stride, b_stride; + int x, y; -+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); + + stride_dst /= sizeof(pixel); + width *= 2; From 27b1205d30483f16c1d06c638c7ae95b7b47c05e Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH 11/13] ffmpeg: update mvc patch --- ...g-99.1004-added_upstream_mvc_patches.patch | 72 +++++++++++++++++-- 1 file changed, 66 insertions(+), 6 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch index 981a88e102..551a27104a 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch @@ -1,7 +1,7 @@ From 20af7af23a9f366476e67669f14957dfaf58f141 Mon Sep 17 00:00:00 2001 From: Hendrik Leppkes Date: Sat, 9 Jan 2016 16:34:09 +0100 -Subject: [PATCH 1/3] avcodec: add h264_mvc codec id and profiles +Subject: [PATCH 1/4] avcodec: add h264_mvc codec id and profiles --- libavcodec/avcodec.h | 3 +++ @@ -75,13 +75,13 @@ index 37a6aa8bff..52c5b659c4 100644 { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, -- -2.14.1 +2.17.0 From 0f3fda4e348e6b12570f5d279713f6da46511846 Mon Sep 17 00:00:00 2001 From: Hendrik Leppkes Date: Sat, 9 Jan 2016 16:34:40 +0100 -Subject: [PATCH 2/3] h264_parser: add support for parsing h264 mvc NALUs +Subject: [PATCH 2/4] h264_parser: add support for parsing h264 mvc NALUs --- libavcodec/h264.h | 2 ++ @@ -192,13 +192,13 @@ index f43b197d5e..f96e005ef3 100644 extern AVCodecParser ff_mjpeg_parser; extern AVCodecParser ff_mlp_parser; -- -2.14.1 +2.17.0 From cdd668dc436b9c78dcb31df477e329492356e7ec Mon Sep 17 00:00:00 2001 From: Hendrik Leppkes Date: Tue, 28 Nov 2017 16:12:12 +0000 -Subject: [PATCH 3/3] h264_parser: force grabing a new timestamp until a frame +Subject: [PATCH 3/4] h264_parser: force grabing a new timestamp until a frame start was found --- @@ -220,5 +220,65 @@ index be8b9db9b0..81c9a1bbae 100644 *poutbuf = NULL; *poutbuf_size = 0; -- -2.14.1 
+2.17.0 + + +From fb0ec9a132d6eb8fd74348ef87b1176c7ca34a00 Mon Sep 17 00:00:00 2001 +From: popcornmix +Date: Mon, 28 May 2018 13:35:36 +0100 +Subject: [PATCH 4/4] fixup + +--- + libavcodec/extract_extradata_bsf.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/libavcodec/extract_extradata_bsf.c b/libavcodec/extract_extradata_bsf.c +index 082b3e749b..7612749efc 100644 +--- a/libavcodec/extract_extradata_bsf.c ++++ b/libavcodec/extract_extradata_bsf.c +@@ -59,7 +59,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt, + HEVC_NAL_VPS, HEVC_NAL_SPS, HEVC_NAL_PPS, + }; + static const int extradata_nal_types_h264[] = { +- H264_NAL_SPS, H264_NAL_PPS, ++ H264_NAL_SPS, H264_NAL_SPS_SUBSET, H264_NAL_PPS, + }; + + ExtractExtradataContext *s = ctx->priv_data; +@@ -90,7 +90,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt, + if (nal->type == HEVC_NAL_SPS) has_sps = 1; + if (nal->type == HEVC_NAL_VPS) has_vps = 1; + } else { +- if (nal->type == H264_NAL_SPS) has_sps = 1; ++ if (nal->type == H264_NAL_SPS || nal->type == H264_NAL_SPS_SUBSET) has_sps = 1; + } + } else if (s->remove) { + filtered_size += nal->raw_size + 3; +@@ -99,7 +99,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt, + + if (extradata_size && + ((ctx->par_in->codec_id == AV_CODEC_ID_HEVC && has_sps && has_vps) || +- (ctx->par_in->codec_id == AV_CODEC_ID_H264 && has_sps))) { ++ ((ctx->par_in->codec_id == AV_CODEC_ID_H264 || ctx->par_in->codec_id == AV_CODEC_ID_H264_MVC) && has_sps))) { + AVBufferRef *filtered_buf; + uint8_t *extradata, *filtered_data; + +@@ -253,6 +253,7 @@ static const struct { + } extract_tab[] = { + { AV_CODEC_ID_CAVS, extract_extradata_mpeg4 }, + { AV_CODEC_ID_H264, extract_extradata_h2645 }, ++ { AV_CODEC_ID_H264_MVC, extract_extradata_h2645 }, + { AV_CODEC_ID_HEVC, extract_extradata_h2645 }, + { AV_CODEC_ID_MPEG1VIDEO, extract_extradata_mpeg12 }, + { AV_CODEC_ID_MPEG2VIDEO, extract_extradata_mpeg12 }, +@@ -317,6 +318,7 @@ static void extract_extradata_close(AVBSFContext *ctx) + static const enum AVCodecID codec_ids[] = { + AV_CODEC_ID_CAVS, + AV_CODEC_ID_H264, ++ AV_CODEC_ID_H264_MVC, + AV_CODEC_ID_HEVC, + AV_CODEC_ID_MPEG1VIDEO, + AV_CODEC_ID_MPEG2VIDEO, +-- +2.17.0 From 30594feb0f33388262971edac40c550522749782 Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH 12/13] ffmpeg: switch default x86 assembler back to yasm --- packages/multimedia/ffmpeg/package.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index 9207d5450c..73136814e1 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -82,7 +82,7 @@ else fi if [ "$TARGET_ARCH" = "x86_64" ]; then - PKG_FFMPEG_X86ASM="--enable-x86asm" + PKG_FFMPEG_X86ASM="--enable-x86asm --x86asmexe=yasm" else PKG_FFMPEG_X86ASM="--disable-x86asm" fi From 1f1c5a778150b52eab5be8939f87a128a6fff6ac Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH 13/13] ffmpeg: hevc: Fix performance regression + latest ben optimisations --- ...mpeg-99.1003-pfcd_hevc_optimisations.patch | 3839 ++++++++++++----- 1 file changed, 2782 insertions(+), 1057 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index 32c0f1f17b..5300c1252b 100644 --- 
a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -19,7 +19,7 @@ index 0e57cb0b4c..b2e3374fea 100644 /ffplay /ffprobe diff --git a/configure b/configure -index dee507cb6a..9a93189107 100755 +index dee507cb6a..0ee9efe1e7 100755 --- a/configure +++ b/configure @@ -318,6 +318,7 @@ External library support: @@ -30,6 +30,15 @@ index dee507cb6a..9a93189107 100755 --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] --disable-nvenc disable Nvidia video encoding code [autodetect] --enable-omx enable OpenMAX IL code [no] +@@ -1036,7 +1037,7 @@ EOF + + check_insn(){ + log check_insn "$@" +- check_inline_asm ${1}_inline "$2" ++ check_inline_asm ${1}_inline "\"$2\"" + check_as ${1}_external "$2" + } + @@ -1776,6 +1777,7 @@ FEATURE_LIST=" gray hardcoded_tables @@ -582,7 +591,7 @@ index 4d4ef530e4..fba8776c9f 100644 { const AVCodec *p, *experimental = NULL; diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index e656011c3c..70c3f026b8 100644 +index e656011c3c..f8801dfab6 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ @@ -594,7 +603,7 @@ index e656011c3c..70c3f026b8 100644 OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o -@@ -136,10 +138,23 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ +@@ -136,10 +138,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ @@ -613,6 +622,7 @@ index e656011c3c..70c3f026b8 100644 + arm/rpi_hevcpred_init_neon.o \ + arm/rpi_hevcpred_intra_angular_neon.o \ + arm/rpi_hevcpred_intra_dc_neon.o \ ++ arm/rpi_hevcpred_intra_filter_neon.o \ + arm/rpi_hevcpred_intra_hv_neon.o \ + arm/rpi_hevcpred_intra_planar_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o @@ -1680,10 +1690,10 @@ index 0000000000..0211e447a8 + diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S new file mode 100644 -index 0000000000..1bdf2ab09f +index 0000000000..3bbfb443bf --- /dev/null +++ b/libavcodec/arm/rpi_hevc_misc_neon.S -@@ -0,0 +1,26 @@ +@@ -0,0 +1,226 @@ +#include "libavutil/arm/asm.S" +#include "neon.S" + @@ -1710,6 +1720,206 @@ index 0000000000..1bdf2ab09f + +2: vst1.8 {q0,q1}, [r0:256] + bx lr ++endfunc ++ ++@ PIC jump tables are more expensive than absolute for A32 code ++.set jent_pic, CONFIG_PIC || CONFIG_THUMB ++ ++@ Jump table entry - if in neon mode the bottom bit must be set ++@ ? There is probably a real asm instruction to do this but I haven't found it ++.macro jent lab ++.if jent_pic ++T .short ((0 + \lab) - (0 + 98b)) / 2 ++A .short (0 + \lab) - (4 + 98b) ++.else ++T .word 1 + \lab ++A .word \lab ++.endif ++.endm ++ ++.macro cpy_compound val, p1, p2 ++.if \p1 + \p2 != \val ++.error "Bad addition! 
\p1 + \p2 != \val" ++.endif ++.if \val <= 64 ++@ As max we deal with 128 vals above 64 will never be recursed to ++100\val\(): ++ push {r11, lr} ++.endif ++\val\(): ++ push {r0-r3} ++ bl 100\p1\()b ++ pop {r0-r3} ++ add r0, #\p1 ++ add r2, #\p1 ++ b \p2\()b ++.endm ++ ++@ ff_hevc_cpy_blks8x4_neon( ++@ dst [r0] ++@ dst_stride [r1] ++@ src [r2] ++@ src_stride [r3] ++@ width [sp, #0] (bytes) ++@ height) [sp, #4] ++@ ++@ Power of 2 widths are directly coded, all others are done in stripes ++@ We expect the vast majority of calls to be power of 2 ++@ ++@ Currently has min width of 8, but we could make that 4 without issue ++@ Min height is 4 ++ ++function ff_hevc_rpi_cpy_blks8x4_neon, export=1 ++ ldr r12, [sp, #0] ++ push {r11, lr} ++ sub r12, #1 ++A adr lr, 98f ++ ubfx r12, r12, #3, #4 ++ ldr r11, [sp, #(8 + 4)] ++.if jent_pic ++A lsl r12, #1 ++A ldrsh lr, [lr, r12] ++A add pc, lr ++T tbh [pc, r12, lsl #1] ++.else ++ @ A32 only, Thumb is always PIC ++ ldr pc, [lr, r12, lsl #2] ++.endif ++ ++98: ++ jent 8f ++ jent 16f ++ jent 24f ++ jent 32f ++ jent 40f ++ jent 48f ++ jent 56f ++ jent 64f ++ jent 72f ++ jent 80f ++ jent 88f ++ jent 96f ++ jent 104f ++ jent 112f ++ jent 120f ++ jent 128f ++ ++1008: ++ push {r11, lr} ++8: ++ add lr, r2, r3 ++ lsl r3, #1 ++ add r12, r0, r1 ++ lsl r1, #1 ++1: ++ vld1.32 {d0 }, [r2], r3 ++ vld1.32 {d1 }, [lr], r3 ++ vld1.32 {d2 }, [r2], r3 ++ vld1.32 {d3 }, [lr], r3 ++ subs r11, #4 ++ vst1.32 {d0 }, [r0], r1 ++ vst1.32 {d1 }, [r12], r1 ++ vst1.32 {d2 }, [r0], r1 ++ vst1.32 {d3 }, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++10016: ++ push {r11, lr} ++16: ++ add lr, r2, r3 ++ lsl r3, #1 ++ add r12, r0, r1 ++ lsl r1, #1 ++1: ++ vld1.32 {q0 }, [r2], r3 ++ vld1.32 {q1 }, [lr], r3 ++ vld1.32 {q2 }, [r2], r3 ++ vld1.32 {q3 }, [lr], r3 ++ subs r11, #4 ++ vst1.32 {q0 }, [r0], r1 ++ vst1.32 {q1 }, [r12], r1 ++ vst1.32 {q2 }, [r0], r1 ++ vst1.32 {q3 }, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++cpy_compound 24, 16, 8 ++ ++10032: ++ push {r11, lr} ++32: ++ add lr, r2, r3 ++ lsl r3, #1 ++ add r12, r0, r1 ++ lsl r1, #1 ++1: ++ vld1.32 {q8, q9 }, [r2], r3 ++ vld1.32 {q10, q11}, [lr], r3 ++ vld1.32 {q12, q13}, [r2], r3 ++ vld1.32 {q14, q15}, [lr], r3 ++ subs r11, #4 ++ vst1.32 {q8, q9 }, [r0], r1 ++ vst1.32 {q10, q11}, [r12], r1 ++ vst1.32 {q12, q13}, [r0], r1 ++ vst1.32 {q14, q15}, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++cpy_compound 40, 32, 8 ++cpy_compound 48, 32, 16 ++cpy_compound 56, 32, 24 ++ ++10064: ++ push {r11, lr} ++64: ++ add lr, r2, #32 ++ add r12, r0, #32 ++1: ++ vld1.32 {q8, q9 }, [r2], r3 ++ vld1.32 {q10, q11}, [lr], r3 ++ vld1.32 {q12, q13}, [r2], r3 ++ vld1.32 {q14, q15}, [lr], r3 ++ subs r11, #2 ++ vst1.32 {q8, q9 }, [r0], r1 ++ vst1.32 {q10, q11}, [r12], r1 ++ vst1.32 {q12, q13}, [r0], r1 ++ vst1.32 {q14, q15}, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++cpy_compound 72, 64, 8 ++cpy_compound 80, 64, 16 ++cpy_compound 88, 64, 24 ++cpy_compound 96, 64, 32 ++cpy_compound 104, 64, 40 ++cpy_compound 112, 64, 48 ++cpy_compound 120, 64, 56 ++ ++128: ++ push {r4, r5} ++ @ We could do this with fewer registers if we jump around but I ++ @ have a primative urge to load sequentially ++ mov r4, #64 ++ add lr, r2, #32 ++ add r12, r0, #32 ++ sub r3, r4 ++ sub r1, r4 ++1: ++ vld1.32 {q8, q9 }, [r2], r4 ++ vld1.32 {q10, q11}, [lr], r4 ++ vld1.32 {q12, q13}, [r2], r3 ++ vld1.32 {q14, q15}, [lr], r3 ++ subs r11, #1 ++ vst1.32 {q8, q9 }, [r0], r4 ++ vst1.32 {q10, q11}, [r12], r4 ++ vst1.32 {q12, q13}, [r0], r1 ++ vst1.32 {q14, q15}, [r12], r1 ++ bgt 1b ++ pop {r4, r5, r11, pc} ++ 
++endfunc ++ diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h new file mode 100644 index 0000000000..62b9326532 @@ -1744,10 +1954,10 @@ index 0000000000..62b9326532 +#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */ diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S new file mode 100644 -index 0000000000..f75c82671e +index 0000000000..98512d21dc --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S -@@ -0,0 +1,1593 @@ +@@ -0,0 +1,1625 @@ +/* + * Copyright (c) 2014 Seppo Tomperi + * @@ -2472,6 +2682,7 @@ index 0000000000..f75c82671e +function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1 + sub r12, r0, r1 + cmp r2, #0 ++ it eq + bxeq lr + vld1.8 {d26,d27}, [r0] + lsl r1, #1 @@ -2484,10 +2695,14 @@ index 0000000000..f75c82671e + "sub r12, r0, r1, asr #1" + + lsls r3, #29 @ b2 -> N, b3 -> C ++ it pl + vstrpl d26, [r0, #0] ++ it cc + vstrcc d27, [r0, #8] + lsls r3, #2 @ b0 -> N, b1 -> C ++ it pl + vstrpl d18, [r12, #0] ++ it cc + vstrcc d19, [r12, #8] + bx lr + @@ -2506,6 +2721,7 @@ index 0000000000..f75c82671e +.macro m_filter_h_uv_16 bit_depth + sub r12, r0, r1 + cmp r2, #0 ++ it eq + bxeq lr + vld1.16 {q12, q13}, [r0] + lsl r1, #1 @@ -2527,13 +2743,17 @@ index 0000000000..f75c82671e + @ Which means we need to break this apart in an ugly fashion +1: + lsls r3, #29 @ b2 -> N, b3 -> C ++ itt pl + vstrpl d24, [r0, #0] + vstrpl d25, [r0, #8] ++ itt cc + vstrcc d26, [r0, #16] + vstrcc d27, [r0, #24] + lsls r3, #2 @ b0 -> N, b1 -> C ++ itt pl + vstrpl d20, [r12, #0] + vstrpl d21, [r12, #8] ++ itt cc + vstrcc d22, [r12, #16] + vstrcc d23, [r12, #24] + bx lr @@ -2554,6 +2774,7 @@ index 0000000000..f75c82671e + +function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1 + cmp r2, #0 ++ it eq + bxeq lr + push {lr} + vld2.16 {d16[0], d18[0]}, [r3], r1 @@ -2610,6 +2831,7 @@ index 0000000000..f75c82671e +@ Either split or partial +1: + lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ ittt cs + addcs r0, r0, r1, lsl #1 + addcs r2, r2, r1, lsl #1 + bcs 1f @@ -2619,6 +2841,7 @@ index 0000000000..f75c82671e + vst1.16 {d21[1]}, [r0], r1 + vst1.16 {d21[0]}, [r2], r1 +1: ++ ittt mi + addmi r3, r3, r1, lsl #1 + addmi ip, ip, r1, lsl #1 + bmi 1f @@ -2700,6 +2923,7 @@ index 0000000000..f75c82671e + +.macro m_filter_v_uv2_16 bit_depth + cmp r2, #0 ++ it eq + bxeq lr + push {lr} + vld2.32 {d16[0], d18[0]}, [r3], r1 @@ -2756,6 +2980,7 @@ index 0000000000..f75c82671e +@ Either split or partial +1: + lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ ittt cs + addcs r0, r0, r1, lsl #1 + addcs r2, r2, r1, lsl #1 + bcs 1f @@ -2765,6 +2990,7 @@ index 0000000000..f75c82671e + vst1.32 {d28[1]}, [r0], r1 + vst1.32 {d28[0]}, [r2], r1 +1: ++ ittt mi + addmi r3, r3, r1, lsl #1 + addmi ip, ip, r1, lsl #1 + bmi 1f @@ -2938,9 +3164,11 @@ index 0000000000..f75c82671e + vmovl.s16 q5, d29 + teq lr, #1 + vmovl.s16 q14, d30 -+ lslne v1, lr, #1 ++ it ne ++ lslne v1, lr, #1 + vmovl.s16 q15, d31 -+ rsbne v2, v1, #32 ++ it ne ++ rsbne v2, v1, #32 + vbif q0, q1, q4 + vbif q2, q3, q14 + vbif q1, q0, q5 @@ -3022,13 +3250,21 @@ index 0000000000..f75c82671e + lsl ip, v2 + lsl lr, v2 + ldr v2, [sp, #6*8 + 10*4 + 1*4] -+ orr a2, ip, a2, lsr v1 ++T lsr a2, v1 ++T orr a2, ip, a2 ++A orr a2, ip, a2, lsr v1 + lsl ip, v1, #1 -+ orr v8, lr, v8, lsr v1 ++T lsr v8, v1 ++T orr v8, lr, v8 ++A orr v8, lr, v8, lsr v1 + lsl lr, v1, #2 -+ orr a2, v8, a2, lsr ip ++T lsr a2, ip ++T orr a2, v8, a2 ++A orr a2, v8, a2, 
lsr ip + ldr v1, [sp, #6*8 + 10*4] -+ orr v7, a2, v7, lsr lr ++T lsr v7, lr ++T orr v7, a2, v7 ++A orr v7, a2, v7, lsr lr + bhi 1b + + vpop {d8-d13} @@ -3094,11 +3330,12 @@ index 0000000000..f75c82671e + vtst.16 d22, d16, d18 + vadd.i16 d30, d16, d17 + vswp d2, d3 -+ ldr lr, [sp] ++ ldr lr, [sp] + vmovl.s16 q10, d20 -+ teq lr, #1 ++ teq lr, #1 + vmovl.s16 q11, d22 -+ lslne v1, lr, #1 ++ it ne ++ lslne v1, lr, #1 + vbif d0, d1, d20 + vbif d4, d6, d20 + vbif d3, d2, d21 @@ -3124,7 +3361,8 @@ index 0000000000..f75c82671e + vshrn.i32 d7, q11, #8 + vmovn.i32 d3, q10 + vand q0, q3, q1 -+ rsbne v2, v1, #32 ++ it ne ++ rsbne v2, v1, #32 + vrev16.8 q3, q3 + vand q0, q3 + vsra.u64 d30, #32 @@ -3141,6 +3379,7 @@ index 0000000000..f75c82671e + cmp a1, #2 + vmov.u16 a1, d0[1] + vmov.u16 a2, d0[0] ++ it eq + orreq a1, a2, a1, lsl #2 + pop {a2,v1-v8,pc} +10: @@ -3153,7 +3392,10 @@ index 0000000000..f75c82671e + pkhbt a1, a1, a1, lsl #16 + lsr a2, v2 + lsr a1, v2 -+ orreq a1, a2, a1, lsl v1 ++T itt eq ++T lsleq a1, v1 ++T orreq a1, a2, a1 ++A orreq a1, a2, a1, lsl v1 + pop {a2,v1-v8,pc} +endfunc + @@ -3570,10 +3812,10 @@ index 0000000000..109fa98c29 +} diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c new file mode 100644 -index 0000000000..ce7e6091f1 +index 0000000000..8a94a644a4 --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c -@@ -0,0 +1,465 @@ +@@ -0,0 +1,467 @@ +/* + * Copyright (c) 2014 Seppo Tomperi + * @@ -3808,6 +4050,7 @@ index 0000000000..ce7e6091f1 +uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, + int in_inc); ++void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height); + + +static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) @@ -4038,6 +4281,7 @@ index 0000000000..ce7e6091f1 + assert(offsetof(MvField, ref_idx) == 8); + assert(offsetof(MvField, pred_flag) == 10); + c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; ++ c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon; +} diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S new file mode 100644 @@ -7682,10 +7926,10 @@ index 0000000000..80724d4cf3 + diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c new file mode 100644 -index 0000000000..8c267a0368 +index 0000000000..21e7700174 --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_init_neon.c -@@ -0,0 +1,188 @@ +@@ -0,0 +1,210 @@ +/* + * Copyright (c) 2018 John Cox (for Raspberry Pi) + * @@ -7708,6 +7952,15 @@ index 0000000000..8c267a0368 + +#include "rpi_hevcpred_arm.h" + ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32; ++ +void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t 
*src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); @@ -7788,6 +8041,12 @@ index 0000000000..8c267a0368 + switch (bit_depth) + { + case 8: ++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8; ++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8; ++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8 ++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16; ++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16; ++ + c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8; + c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8; + c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8; @@ -7829,6 +8088,13 @@ index 0000000000..8c267a0368 + c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8; + break; + case 10: ++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16; ++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16; ++ c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16; ++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32; ++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32; ++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32; ++ + c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10; + c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10; + c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10; @@ -7876,10 +8142,10 @@ index 0000000000..8c267a0368 + diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S new file mode 100644 -index 0000000000..1a2d413ea2 +index 0000000000..8063a1521e --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S -@@ -0,0 +1,2352 @@ +@@ -0,0 +1,2373 @@ +/* + * Copyright (c) 2018 John Cox (for Raspberry Pi) + * @@ -8005,8 +8271,11 @@ index 0000000000..1a2d413ea2 + @ r2=left (variable), r1=up (const) + adds r8, r7 + vmov d24, d16 ++T itee mi + ldrbmi r12, [r2, #-1]! -+ ldrbpl r12, [r1, r8, asr #8] ++T asrpl r12, r8, #8 ++T ldrbpl r12, [r1, r12] ++A ldrbpl r12, [r1, r8, asr #8] + vext.8 d16, d16, d16, #7 + sub r6, #32 + vmov.8 d16[0], r12 @@ -8028,7 +8297,11 @@ index 0000000000..1a2d413ea2 + bne 2b + b store_tran_8x8_8 @ This will return + -+ ++.macro ADRT reg, val ++@ adr in T32 has enough range but not in A32 ++A adrl \reg, \val ++T adr \reg, \val ++.endm + +@ ff_hevc_rpi_pred_angular_4_neon_8 +@ uint8_t *_src, [r0] @@ -8040,8 +8313,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_4_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + @@ -8101,8 +8374,11 @@ index 0000000000..1a2d413ea2 + @ r2=left (variable), r1=up (const) + adds r8, r7 + vmov d24, d16 ++T itee mi + ldrbmi r12, [r2, #-1]! 
-+ ldrbpl r12, [r1, r8, asr #8] ++T asrpl r12, r8, #8 ++T ldrbpl r12, [r1, r12] ++A ldrbpl r12, [r1, r8, asr #8] + vext.8 d16, d16, d16, #7 + sub r6, #32 + vmov.8 d16[0], r12 @@ -8135,7 +8411,9 @@ index 0000000000..1a2d413ea2 + ble 1f + + @ For other widths we may want different logic -+ ldrb r12, [r2, r8, asr #8] ++T asr r12, r8, #8 ++T ldrb r12, [r2, r12] ++A ldrb r12, [r2, r8, asr #8] + + vmov d24, d16 + add r8, r7 @@ -8197,8 +8475,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_8_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + @@ -8241,7 +8519,9 @@ index 0000000000..1a2d413ea2 + ble 1f + + @ For other widths we may want different logic -+ ldrb r12, [r2, r8, asr #8] ++T asr r12, r8, #8 ++T ldrb r12, [r2, r12] ++A ldrb r12, [r2, r8, asr #8] + + vmov d24, d16 + add r8, r7 @@ -8301,8 +8581,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_16_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + @@ -8370,7 +8650,9 @@ index 0000000000..1a2d413ea2 + ble 1f + + @ For other widths we may want different logic -+ ldrb r12, [r2, r8, asr #8] ++T asr r12, r8, #8 ++T ldrb r12, [r2, r12] ++A ldrb r12, [r2, r8, asr #8] + + vmov q12, q8 + add r8, r7 @@ -8441,8 +8723,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_32_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r10, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + @@ -8513,7 +8795,9 @@ index 0000000000..1a2d413ea2 + ble 1f + + @ For other widths we may want different logic -+ ldrb r12, [r2, r8, asr #8] ++T asr r12, r8, #8 ++T ldrb r12, [r2, r12] ++A ldrb r12, [r2, r8, asr #8] + + vmov q12, q8 + add r8, r7 @@ -8641,6 +8925,7 @@ index 0000000000..1a2d413ea2 + @ Use r2 for both up and left, we only ever go from left->up so + @ we assume that we are left and thenm overwrite with up if wanted + sub r2, #2 ++ it pl + addpl r2, r1, r8, asr #7 + vext.16 d16, d16, d16, #3 + @ We get *2 by >> 7 rather than 8, but that means we need to lose bit 0 @@ -8673,8 +8958,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + lsl r3, #1 @@ -8779,8 +9064,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + lsl r3, #1 @@ -8918,8 +9203,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r10, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + lsl r3, #1 @@ -9145,6 +9430,7 @@ index 0000000000..1a2d413ea2 + @ Use r2 for both up and left, we only ever go from left->up so + @ we assume that we are left and thenm overwrite with up if wanted + sub r2, #2 ++ it pl + 
addpl r2, r1, r8, asr #7 + vext.16 d16, d16, d16, #3 + @ We get *2 by >> 7 rather than 8, but that means we need to lose bit 0 @@ -9178,8 +9464,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_4_neon_10, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #1 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -9282,8 +9568,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_8_neon_10, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #1 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -9409,8 +9695,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_16_neon_10, export=1 + ldr r12, [sp, #0] + push {r4-r10, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #1 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -9556,8 +9842,8 @@ index 0000000000..1a2d413ea2 + ldr r12, [sp, #0] + push {r4-r10, lr} + vpush {q4 } -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #1 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -9791,6 +10077,7 @@ index 0000000000..1a2d413ea2 + @ Use r2 for both up and left, we only ever go from left->up so + @ we assume that we are left and thenm overwrite with up if wanted + sub r2, #4 ++ it pl + addpl r2, r1, r8, asr #6 + vext.32 q8, q8, #3 + @ We get *4 by >> 6 rather than 8, but that means we need to lose bits 0 & 1 @@ -9825,8 +10112,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -9931,8 +10218,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -10071,8 +10358,8 @@ index 0000000000..1a2d413ea2 + ldr r12, [sp, #0] + push {r4-r10, lr} + vpush {q4 } -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -10234,10 +10521,10 @@ index 0000000000..1a2d413ea2 + diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S new file mode 100644 -index 0000000000..af7ba1f45e +index 0000000000..75a1789c25 --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S -@@ -0,0 +1,682 @@ +@@ -0,0 +1,695 @@ +/* + * Copyright (c) 2017 John Cox (for Raspberry Pi) + * @@ -10284,7 +10571,7 @@ index 0000000000..af7ba1f45e + vmov.i64 d7, #0xffff + vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
+ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) -+ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 @@ -10359,7 +10646,7 @@ index 0000000000..af7ba1f45e + + @ Average the els of top & left + vld1.8 {d0}, [r1] -+ mov r1, #2 ++ mov r1, #2 + vld1.8 {d16}, [r2] + vmov.i16 q2, #3 + vmov.i64 d7, #0xffff @@ -10367,7 +10654,7 @@ index 0000000000..af7ba1f45e + vmovl.u8 q0, d0 + vadd.i16 d6, d2, d3 @ d6 has 4 vals + vmov.16 d4[0], r1 @ 2, 3, 3, 3... -+ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 @@ -10418,23 +10705,30 @@ index 0000000000..af7ba1f45e +function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1 + + @ Average the els of top & left -+ vld1.8 {q8 }, [r1] -+ vld1.8 {q12}, [r2] -+ vaddl.u8 q0, d16, d17 -+ vaddl.u8 q2, d24, d25 -+ vadd.i16 q0, q2 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ mov r1, #4 -+ vpadd.i32 d0, d0 @ This add U & V separately -+ lsl r3, #1 @ pels -+ vrshrn.u16 d0, q0, #4 -+ vdup.u16 q0, d0[0] @ Dup results ++ vld1.8 {q0}, [r1] ++ mov r1, #8 ++ vld1.8 {q1}, [r2] ++T lsl r3, #1 ++ vaddl.u8 q0, d0, d1 ++A add r2, r0, r3, lsl #1 ++A lsl r3, #2 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vaddl.u8 q1, d2, d3 ++ vadd.i16 q1, q0 ++ vadd.i16 d3, d2 @ d3 has 2 val pairs ++ vpadd.i32 d2, d3, d3 @ This add U & V separately ++ vpadd.i32 d3, d3, d3 ++ vrshrn.u16 d0, q1, #4 ++ vrshrn.u16 d1, q1, #4 + + @ Store +1: -+ vst1.8 {q0 }, [r0], r3 -+ subs r1, #1 -+ vst1.8 {q0 }, [r0], r3 ++ vst1.8 {q0}, [r0], r3 ++ subs r1, #4 ++ vst1.8 {q0}, [r2], r3 ++ vst1.8 {q0}, [r0], r3 ++ vst1.8 {q0}, [r2], r3 + bne 1b + + bx lr @@ -10450,52 +10744,55 @@ index 0000000000..af7ba1f45e +function ff_hevc_rpi_pred_dc_16_neon_8, export=1 + + @ Average the els of top & left -+ vld1.8 { q8}, [r1] -+ vld1.8 {q12}, [r2] -+ vaddl.u8 q0, d16, d24 -+ vaddl.u8 q2, d17, d25 -+ vmov.u16 r1, d0[0] @ r1 = top[0] + left[0] -+ vadd.i16 q0, q2 -+ vadd.i16 d0, d1 @ d0 has 4 vals -+ vpadd.i16 d0, d0 @ 2 (top & bottom the same) -+ vpadd.i16 d0, d0 @ 1 (all the same) -+ vrshr.u16 d0, #5 -+ -+ vmov.i64 d31, #0xff ++ vld1.8 {q8}, [r1] ++ mov r1, #2 ++ vld1.8 {q9}, [r2] ++ vaddl.u8 q10, d16, d17 ++ vaddl.u8 q11, d16, d18 ++ vaddl.u8 q0, d18, d19 ++ vmov.i16 q1, #3 ++ vadd.i16 q10, q0 ++ vmovl.u8 q0, d18 ++ vadd.i16 d20, d21 ++ vmov.i16 d2[0], r1 @ 2, 3, 3, 3... 
+ + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left + @ top_line[0] is extra special -+ @ (top[0] + left[0] + dc * 2) ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 + -+ vmov.u16 r12, d0[0] @ dc -+ add r2, r12, r12, lsl #1 @ dc*3 -+ add r1, r1, r12, lsl #1 @ top[0] + left[0] + dc*2 -+ -+ vdup.u16 q3, r2 -+ vaddw.u8 q1, q3, d16 -+ vaddw.u8 q2, q3, d17 -+ vmov.u16 d2[0], r1 -+ vrshrn.u16 d2, q1, #2 -+ vrshrn.u16 d3, q2, #2 -+ -+ @ Construct lhs pels -+ vaddw.u8 q2, q3, d24 -+ vaddw.u8 q3, q3, d25 -+ vrshrn.u16 d4, q2, #2 -+ vrshrn.u16 d5, q3, #2 ++ vmovl.u8 q2, d16 ++ vmovl.u8 q9, d19 ++ vpadd.i16 d20, d20 @ 2 (top & bottom of vector the same) ++ vmov.i64 d7, #0xffff ++ vmovl.u8 q8, d17 ++ vbit d4, d22, d7 @ q2 = top[0]+left[0], top[1..7] ++ vmov.i64 d7, #0xff ++ vpadd.i16 d20, d20 @ 1 (all the same) ++ vrshr.u16 d21, d20, #5 ++ vrshr.u16 d20, d20, #5 ++ vmla.i16 q0, q10, d2[1] ++ vmla.i16 q9, q10, d2[1] ++ vmla.i16 q2, q10, q1 ++ vmla.i16 q8, q10, d2[1] ++ vdup.8 q1, d20[0] ++ vrshrn.i16 d0, q0, #2 ++ vrshrn.i16 d1, q9, #2 ++ vrshrn.i16 d4, q2, #2 ++ vrshrn.i16 d5, q8, #2 ++ vext.8 q0, q0, q0, #1 + + @ Store top line -+ vst1.8 { q1}, [r0], r3 -+ -+ mov r1, #15 -+ vdup.u8 q0, d0[0] ++ vst1.8 {q2}, [r0], r3 + ++ @ Store the rest ++ mov r1, #15 +1: -+ vext.8 q2, q2, #1 -+ vbit d0, d4, d31 -+ subs r1, #1 -+ vst1.8 { q0}, [r0], r3 ++ vbit d2, d0, d7 ++ vext.8 q0, q0, q0, #1 ++ subs r1, #1 ++ vst1.8 {q1}, [r0], r3 + bne 1b + + bx lr @@ -10511,33 +10808,34 @@ index 0000000000..af7ba1f45e +function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1 + + @ Average the els of top & left -+ vld1.8 { q8, q9}, [r1] -+ vld1.8 {q12,q13}, [r2] -+ vaddl.u8 q0, d16, d17 -+ vaddl.u8 q1, d18, d19 -+ vaddl.u8 q2, d24, d25 -+ vaddl.u8 q3, d26, d27 -+ vadd.i16 q0, q1 -+ vadd.i16 q2, q3 -+ vadd.i16 q0, q2 -+ lsl r3, #1 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ mov r1, #4 -+ vpadd.i32 d0, d0 @ This add U & V separately -+ add r2, r0, r3 -+ vmov d1, d0 -+ lsl r3, #1 -+ vrshrn.u16 d0, q0, #5 -+ vmov d1, d0 @ Dup results -+ vmov q1, q0 ++ vld1.8 {q0-q1}, [r1] ++ mov r1, #16 ++ vld1.8 {q2-q3}, [r2] ++T lsl r3, #1 ++ vaddl.u8 q0, d0, d1 ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vaddl.u8 q1, d2, d3 ++A lsl r3, #2 ++T lsl r3, #1 ++ vaddl.u8 q2, d4, d5 ++ vaddl.u8 q3, d6, d7 ++ vadd.i16 q0, q1 ++ vadd.i16 q2, q3 ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d4, d0, d0 @ This adds U & V separately ++ vpadd.i32 d5, d0, d0 ++ vrshrn.u16 d0, q2, #5 ++ vrshrn.u16 d1, q2, #5 ++ vrshrn.u16 d2, q2, #5 ++ vrshrn.u16 d3, q2, #5 + + @ Store +1: -+ vst1.8 { q0, q1}, [r0], r3 -+ vst1.8 { q0, q1}, [r2], r3 -+ subs r1, #1 -+ vst1.8 { q0, q1}, [r0], r3 -+ vst1.8 { q0, q1}, [r2], r3 ++ vst1.8 {q0-q1}, [r0], r3 ++ subs r1, #2 ++ vst1.8 {q0-q1}, [r2], r3 + bne 1b + + bx lr @@ -10553,32 +10851,32 @@ index 0000000000..af7ba1f45e +function ff_hevc_rpi_pred_dc_32_neon_8, export=1 + + @ Average the els of top & left -+ vld1.8 {q8, q9 }, [r1] -+ vld1.8 {q12, q13}, [r2] -+ vaddl.u8 q0, d16, d17 -+ vaddl.u8 q1, d18, d19 -+ vaddl.u8 q2, d24, d25 -+ vaddl.u8 q3, d26, d27 -+ vadd.i16 q0, q1 -+ vadd.i16 q2, q3 -+ vadd.i16 q0, q2 -+ vadd.i16 d0, d1 @ d0 has 4 vals -+ mov r1, #8 -+ vpadd.i16 d0, d0 @ 2 (top & bottom the same) -+ add r2, r0, r3 -+ vpadd.i16 d0, d0 @ 1 (all the same) -+ lsl r3, #1 -+ vrshrn.u16 d0, q0, #6 -+ vdup.u8 q1, d0[0] @ Dup results -+ vdup.u8 q0, d0[0] ++ vld1.8 {q0-q1}, [r1] ++ mov r1, #32 ++ vld1.8 {q2-q3}, [r2] ++ add r2, r0, r3 ++ vaddl.u8 q0, d0, d1 ++ lsl r3, #1 ++ vaddl.u8 q1, 
d2, d3 ++ vaddl.u8 q2, d4, d5 ++ vaddl.u8 q3, d6, d7 ++ vadd.i16 q0, q1 ++ vadd.i16 q2, q3 ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ vpadd.i16 d4, d0, d0 @ 1 (all the same) ++ vpadd.i16 d5, d0, d0 ++ vrshrn.u16 d0, q2, #6 ++ vrshrn.u16 d1, q2, #6 ++ vrshrn.u16 d2, q2, #6 ++ vrshrn.u16 d3, q2, #6 + + @ Store +1: -+ vst1.8 {q0, q1 }, [r0], r3 -+ vst1.8 {q0, q1 }, [r2], r3 -+ subs r1, #1 -+ vst1.8 {q0, q1 }, [r0], r3 -+ vst1.8 {q0, q1 }, [r2], r3 ++ vst1.8 {q0-q1}, [r0], r3 ++ subs r1, #2 ++ vst1.8 {q0-q1}, [r2], r3 + bne 1b + + bx lr @@ -10616,7 +10914,7 @@ index 0000000000..af7ba1f45e +T lsl r3, #1 + vmov.16 d4[0], r1 @ 2, 3, 3, 3... + vmov.i64 d7, #0xffff -+ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 @@ -10701,7 +10999,7 @@ index 0000000000..af7ba1f45e + vmov.i64 d7, #0xffff + vmov.16 d4[0], r1 @ 2, 3, 3, 3... + vadd.i16 d6, d2, d3 @ d6 has 4 vals -+ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 @@ -10748,24 +11046,30 @@ index 0000000000..af7ba1f45e +@ ptrdiff_t stride) [r3] (In pels - needs * 4) + +function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1 -+ vld1.8 { q8, q9 }, [r1] -+ vld1.8 {q12, q13}, [r2] -+ vadd.i16 q8, q9 -+ vadd.i16 q12, q13 -+ vadd.i16 q8, q12 -+ vadd.i16 d16, d17 @ d16 has 2 pairs -+ mov r1, #4 -+ vpadd.i32 d16, d16 -+ lsl r3, #2 @ stride in pels -+ vrshr.u16 d16, #4 -+ vdup.u32 q9, d16[0]; -+ vdup.u32 q8, d16[0]; ++ ++ @ Average the els of top & left ++ vld1.16 {q0-q1}, [r1] ++ mov r1, #8 ++ vld1.16 {q2-q3}, [r2] ++T lsl r3, #2 ++ vadd.i16 q1, q0 ++A add r2, r0, r3, lsl #2 ++A lsl r3, #3 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vadd.i16 q2, q3 ++ vadd.i16 q1, q2 ++ vadd.i16 d3, d2 @ d3 has 2 val pairs ++ vpadd.i32 d2, d3, d3 @ This add U & V separately ++ vpadd.i32 d3, d3, d3 ++ vrshr.u16 q0, q1, #4 ++ vrshr.u16 q1, q1, #4 + + @ Store +1: -+ vst1.16 {q8, q9 }, [r0], r3 -+ subs r1, #1 -+ vst1.16 {q8, q9 }, [r0], r3 ++ vst1.8 {q0-q1}, [r0], r3 ++ subs r1, #2 ++ vst1.8 {q0-q1}, [r2], r3 + bne 1b + + bx lr @@ -10781,55 +11085,57 @@ index 0000000000..af7ba1f45e +function ff_hevc_rpi_pred_dc_16_neon_10, export=1 + + @ Average the els of top & left -+ vld1.16 {q8, q9 }, [r1] -+ vld1.16 {q12, q13}, [r2] -+ lsl r3, #1 @ stride given in pels -+ vadd.u16 q0, q8, q12 -+ vadd.u16 q2, q9, q13 -+ vmov.u16 r1, d0[0] @ r1 = top[0] + left[0] -+ vadd.i16 q0, q2 -+ vadd.i16 d0, d1 @ d0 has 4 vals -+ vpadd.i16 d0, d0 @ 2 (top & bottom the same) -+ vpadd.i16 d0, d0 @ 1 (all the same) -+ vrshr.u16 d0, #5 -+ ++ vld1.16 {q8-q9}, [r1] ++ mov r1, #2 ++ vld1.16 {q10-q11}, [r2] ++ lsl r3, #1 @ stride given in pels ++ vadd.i16 q0, q8, q9 ++ vadd.i16 q1, q10, q11 ++ vmov.i16 q3, #3 ++ vadd.i16 q1, q0 ++ vadd.i16 d0, d16, d20 + vmov.i64 d31, #0xffff ++ vadd.i16 d3, d2 ++ vmov.16 d6[0], r1 @ 2, 3, 3, 3... 
+ + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + dc * 2) ++ @ as does left ++ @ topline[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 + -+ vmov.u16 r12, d0[0] @ dc -+ add r2, r12, r12, lsl #1 @ dc*3 -+ add r1, r1, r12, lsl #1 @ top[0] + left[0] + dc*2 -+ -+ vdup.u16 q3, r2 -+ vadd.u16 q8, q3 -+ vadd.u16 q9, q3 -+ vmov.u16 d16[0], r1 -+ vrshr.u16 q8, #2 -+ vrshr.u16 q9, #2 -+ -+ @ Construct lhs pels -+ vadd.u16 q12, q3 -+ vadd.u16 q13, q3 -+ vrshr.u16 q12, #2 -+ vrshr.u16 q13, #2 ++ vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7] ++ vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d3, d3 @ 1 (all the same) ++ vrshr.u16 d2, d3, #5 ++ vrshr.u16 d3, d3, #5 ++ vmov q0, q1 ++ vmla.i16 q10, q1, d6[1] ++ vmla.i16 q11, q1, d6[1] ++ vmla.i16 q8, q1, q3 ++ vmla.i16 q9, q1, d6[1] ++ vrshr.u16 q2, q10, #2 ++ vrshr.u16 q3, q11, #2 ++ vrshr.u16 q8, #2 ++ vrshr.u16 q9, #2 ++ vext.16 q2, q2, q2, #1 ++ mov r1, #7<<29 + + @ Store top line -+ vst1.16 {q8, q9 }, [r0], r3 -+ -+ mov r1, #15 -+ vdup.u16 q1, d0[0] -+ vdup.u16 q0, d0[0] ++ vst1.16 {q8-q9}, [r0], r3 + ++ @ Store the rest +1: -+ vext.16 q12, q13, #1 -+ vext.16 q13, q13, #1 -+ vbit d0, d24, d31 -+ subs r1, #1 -+ vst1.16 {q0, q1 }, [r0], r3 ++ vbit d0, d4, d31 ++ vext.16 q2, q2, q2, #1 ++ subs r1, #1<<29 ++ vst1.16 {q0-q1}, [r0], r3 ++ bne 1b ++1: ++ vbit d0, d6, d31 ++ vext.16 q3, q3, q3, #1 ++ subs r1, #1<<29 ++ vst1.16 {q0-q1}, [r0], r3 + bne 1b + + bx lr @@ -10845,33 +11151,30 @@ index 0000000000..af7ba1f45e +function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1 + + @ Average the els of top & left -+ vldm r1, { q8-q11} -+ vldm r2, {q12-q15} -+ vadd.i16 q8, q9 ++ vldm r1, {q0-q3} ++ vldm r2, {q8-q11} ++ vadd.i16 q0, q1 ++ mov r1, #16 ++ vadd.i16 q2, q3 ++ add r2, r0, #32 ++ vadd.i16 q8, q9 ++ lsl r3, #2 + vadd.i16 q10, q11 -+ vadd.i16 q12, q13 -+ vadd.i16 q14, q15 -+ vadd.i16 q8, q10 -+ vadd.i16 q12, q14 -+ vadd.i16 q8, q12 -+ vadd.i16 d16, d17 @ d16 has 2 pairs -+ mov r1, #8 -+ vpadd.i32 d16, d16 -+ lsl r3, #2 @ stride in pels -+ vrshr.u16 d16, #5 -+ vmov d17, d16 @ Dup results -+ vmov q9, q8 -+ vmov q10, q8 -+ vmov q11, q8 ++ vadd.u16 q0, q2 ++ vadd.u16 q8, q10 ++ vadd.i16 q0, q8 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d4, d0, d0 @ This adds U & V separately ++ vpadd.i32 d5, d0, d0 ++ vrshr.u16 q0, q2, #5 ++ vrshr.u16 q1, q2, #5 + + @ Store +1: -+ vstm r0, {q8-q11} -+ add r0, r3 -+ subs r1, #1 -+ vstm r0, {q8-q11} -+ add r0, r3 -+ bne 1b ++ vst1.16 {q0-q1}, [r0], r3 ++ subs r1, #1 ++ vst1.16 {q0-q1}, [r2], r3 ++ bne 1b + + bx lr +endfunc @@ -10887,39 +11190,920 @@ index 0000000000..af7ba1f45e + + @ Average the els of top & left + @ With 10 bits we are (just) safe from overflow in i16 -+ vldm r1, { q8-q11} -+ vldm r2, {q12-q15} -+ vadd.i16 q8, q9 ++ vldm r1, {q0-q3} ++ vldm r2, {q8-q11} ++ vadd.i16 q0, q1 ++ mov r1, #32 ++ vadd.i16 q2, q3 ++ add r2, r0, #32 ++ vadd.i16 q8, q9 ++ lsl r3, #1 + vadd.i16 q10, q11 -+ vadd.i16 q12, q13 -+ vadd.i16 q14, q15 -+ vadd.i16 q8, q10 -+ vadd.i16 q12, q14 -+ vadd.i16 q8, q12 -+ vadd.i16 d16, d17 @ d16 has 4 vals -+ mov r1, #16 -+ vpadd.i16 d16, d16 @ 2 (top & bottom the same) -+ lsl r3, #1 @ stride in pels -+ vpadd.i16 d16, d16 @ 1 (all the same) -+ vrshr.u16 d16, #6 -+ vmov d17, d16 @ Dup results -+ vmov q9, q8 -+ vmov q10, q8 -+ vmov q11, q8 ++ vadd.u16 q0, q2 ++ vadd.u16 q8, q10 ++ vadd.i16 q0, q8 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ vpadd.i16 
d4, d0, d0 @ 1 (all the same) ++ vpadd.i16 d5, d0, d0 ++ vrshr.u16 q0, q2, #6 ++ vrshr.u16 q1, q2, #6 + + @ Store +1: -+ vstm r0, { q8-q11} -+ add r0, r3 -+ subs r1, #1 -+ vstm r0, { q8-q11} -+ add r0, r3 -+ bne 1b ++ vst1.16 {q0-q1}, [r0], r3 ++ subs r1, #1 ++ vst1.16 {q0-q1}, [r2], r3 ++ bne 1b + + bx lr +endfunc + + +diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S +new file mode 100644 +index 0000000000..11773f918e +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S +@@ -0,0 +1,878 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ All functions have the call ++@ ++@ int ff_hevc_rpi_intra_filter_N_neon_PW( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++@ ++@ Assumptions: ++@ (that wouldn't apply to all frame layoouts but do apply to sand, so beware ++@ if reuseing this code) ++@ ++@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for ++@ N==4, but do for chroma N>=8. As we share Y/C fns that means we can ignore ++@ N==8,PW=8 (chroma always PW>8) but have to cope for larger ++@ ++@ We always have at least 64 pixel H frame width rounding - this lets us ++@ load UR widthout having to worry about exactly how many pixels are actually ++@ within the frame. As partial loads will only occur very occasionally this ++@ should be a win in nearly all cases. ++@ ++@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters ++@ so we do no maths on the contents ++@ ++@ No filtering in 32bit fns as they are chroma only ++ ++ ++.equ AVAIL_UR, 1 ++.equ AVAIL_U, 2 ++.equ AVAIL_UL, 4 ++.equ AVAIL_L, 8 ++.equ AVAIL_DL, 16 ++ ++.equ FILTER_LIGHT, 0x40 ++.equ FILTER_STRONG, 0x80 ++ ++.equ AVAIL_S_UR_N_U_C, 32 - 1 ++.equ AVAIL_S_U_N_UL_C, 32 - 2 ++.equ AVAIL_S_UL_N_L_C, 32 - 3 ++.equ AVAIL_S_L_N_DL_C, 32 - 4 ++ ++.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr ++ ++@ On entry ++@ r2 req ++@ r3 avail ++@ [sp, #sp_offset...] args ++@ ++@ On Exit: ++@ ++@ Extend values: ++@ d_l scalar contains value for L & DL ++@ d_ul scalar containing value for UL ++@ d_u scalar containing value for U ++@ d_ur scalar containing value for UR ++@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else... 
++@ This means that L-filter works even if nreq DL (we never filter ++@ req-DL without req-L, but we do filter req-L without req-DL) ++@ If UR avail then d_ur == a_ur so U-filter good too ++@ ++@ Data load pointers (only load if req & avail): ++@ r4 DL ++@ r10 L ++@ r6 U ++@ r5 UR ++@ ++@ Others: ++@ r2 req ++@ r7 req & avail ++@ r3 L + stride ++@ r8 DL + stride ++@ r9 stride * 2 ++@ cs Load U ++@ mi Load UR ++@ ++@ Clobbered: ++@ r12 ++ ++.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur ++ ++.equ src_l\@, \sp_offset + 0 ++.equ src_u\@, \sp_offset + 4 ++.equ src_ur\@, \sp_offset + 8 ++.equ stride\@, \sp_offset + 12 ++.equ pw\@, (1 << \pw_s) @ pel width in bytes ++.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes ++ ++@ r9 stride ++@ r7 = ab_ul, r6 = a_u, r5 = a_ur ++@ r4 = b_dl, r10 = b_l, r8 = b_u ++ ++ ldr r5, [sp, #src_ur\@] ++ lsl r12, r3, #AVAIL_S_U_DL_CPSR ++ ldr r10, [sp, #src_l\@] ++ ldr r9, [sp, #stride\@] ++ ldr r6, [sp, #src_u\@] ++ ++ @ This is quite a slow instruction but it replaces ++ @ a decent number of tests that yield a max of 2 flags/op ++ @ It is annoying we can't branch on Q! ++ @ If L navail (ne) then DL must be navail (pl) ++ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur ++ ++ mov r4, r5 ++ sub r7, r10, r9 ++ it vs ++ movvs r4, r6 ++ add r8, r6, #b_size\@ - pw\@ ++ it cs ++ movcs r4, r7 ++ ite ne ++ movne r10, r4 ++ addeq r4, r7, r9, lsl #\log2_s ++ it cc ++ movcc r7, r10 ++ it mi ++ addmi r4, r10, r9, lsl #\log2_s ++ vld1.\d_type {\d_ul}, [r7] ++ itt vc ++ movvc r8, r7 ++ movvc r6, r7 ++ vld1.\d_type {\d_l }, [r4] ++ tst r3, #AVAIL_UR ++ vld1.\d_type {\d_u }, [r6] ++ it eq ++ moveq r5, r8 ++ and r7, r2, r3 ++ add r8, r4, r9 ++ vld1.\d_type {\d_ur}, [r5] ++ lsls r12, r7, #AVAIL_S_UR_N_U_C ++ add r3, r10, r9 ++ lsl r9, #1 ++.endm ++ ++ ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_8( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 0 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_8, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[] ++ ++ it cs ++ vldrcs s2, [r6] ++ ite pl ++ vmovpl s3, s4 ++ vldrmi s3, [r5] ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ add r12, r0, #-pw ++ bpl 1f ++ ++ vld1.8 {d0[0]}, [r10], r9 ++ vld1.8 {d0[1]}, [r3], r9 ++ vld1.8 {d0[2]}, [r10] ++ vld1.8 {d0[3]}, [r3] ++1: ++ bcc 1f ++ vld1.8 {d0[4]}, [r4], r9 ++ vld1.8 {d0[5]}, [r8], r9 ++ vld1.8 {d0[6]}, [r4] ++ vld1.8 {d0[7]}, [r8] ++1: ++ vstr d1, [r1] @ Up ++ vst1.8 {d31[7]}, [r12] ++ vstr d0, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_16( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_16, export=1 ++ push 
{r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[] ++ ++ it cs ++ vldrcs d2, [r6] ++ it mi ++ vldrmi d3, [r5] ++ lsls r7, #AVAIL_S_L_N_DL_C ++ add r12, r0, #-pw ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10] ++ vld1.16 {d0[3]}, [r3] ++1: ++ bcc 1f ++ vld1.16 {d1[0]}, [r4], r9 ++ vld1.16 {d1[1]}, [r8], r9 ++ vld1.16 {d1[2]}, [r4] ++ vld1.16 {d1[3]}, [r8] ++1: ++ vst1.16 {q1}, [r1] @ Up ++ vst1.16 {d31[3]}, [r12] ++ vst1.16 {q0}, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_8( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 0 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++ ++function ff_hevc_rpi_intra_filter_8_neon_8, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[] ++ ++ it cs ++ vldrcs d4, [r6] ++ it mi ++ vldrmi d5, [r5] ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ bpl 1f ++ vld1.8 {d0[0]}, [r10], r9 ++ vld1.8 {d0[1]}, [r3], r9 ++ vld1.8 {d0[2]}, [r10], r9 ++ vld1.8 {d0[3]}, [r3], r9 ++ vld1.8 {d0[4]}, [r10], r9 ++ vld1.8 {d0[5]}, [r3], r9 ++ vld1.8 {d0[6]}, [r10] ++ vld1.8 {d0[7]}, [r3] ++1: ++ bcc 1f ++ vld1.8 {d1[0]}, [r4], r9 ++ vld1.8 {d1[1]}, [r8], r9 ++ vld1.8 {d1[2]}, [r4], r9 ++ vld1.8 {d1[3]}, [r8], r9 ++ vld1.8 {d1[4]}, [r4], r9 ++ vld1.8 {d1[5]}, [r8], r9 ++ vld1.8 {d1[6]}, [r4] ++ vld1.8 {d1[7]}, [r8] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ @ Luma light filter ++ vext.8 q8, q15, q2, #15 ++ vext.8 q12, q15, q0, #15 ++ vaddl.u8 q9, d17, d5 ++ vaddl.u8 q8, d16, d4 ++ vaddl.u8 q13, d25, d1 ++ vaddl.u8 q12, d24, d0 ++ vmov.u8 r3, d5[7] @ Save final pel ++ vmov.u8 r2, d1[7] @ Save final pel ++ ++ vext.16 q2, q8, q9, #1 ++ vext.16 q3, q9, q9, #1 ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q13, #1 ++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q2, q8 ++ vadd.u16 q3, q9 ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ ++ vrshrn.u16 d4, q2, #2 ++ vrshrn.u16 d5, q3, #2 ++ vrshrn.u16 d0, q0, #2 ++ vrshrn.u16 d1, q1, #2 ++ vrshr.u16 d30, #2 ++ vmov.u8 d5[7], r3 @ Restore final pel ++ vmov.u8 d1[7], r2 @ Restore final pel ++ vdup.u8 d31, d30[0] @ d31[3] = d30[0] ++ ++10: ++ vst1.8 {q2 }, [r1] @ Up ++ vst1.8 {d31[7]}, [r12] @ Up-left ++ vst1.8 {q0 }, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_16( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_8_neon_16, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]" ++ ++ it cs ++ vldmcs r6, {d4, d5} 
++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #4 ++ vldm r5, {d6, d7} ++ bgt 1f ++ vdup.16 d7, d6[3] ++1: ++ lsls r12, r7, #AVAIL_S_L_N_DL_C ++ vdup.16 q1, d0[0] ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10], r9 ++ vld1.16 {d0[3]}, [r3], r9 ++ vld1.16 {d1[0]}, [r10], r9 ++ vld1.16 {d1[1]}, [r3], r9 ++ vld1.16 {d1[2]}, [r10] ++ vld1.16 {d1[3]}, [r3] ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ vld1.16 {d2[0]}, [r4], r9 ++ vld1.16 {d2[1]}, [r8], r9 ++ cmp r12, #p_size ++ vld1.16 {d2[2]}, [r4], r9 ++ vld1.16 {d2[3]}, [r8], r9 ++ blt 2f ++ vld1.16 {d3[0]}, [r4], r9 ++ vld1.16 {d3[1]}, [r8], r9 ++ vld1.16 {d3[2]}, [r4] ++ vld1.16 {d3[3]}, [r8] ++ b 1f ++2: ++ vdup.16 d3, d2[3] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ @ Luma light filter ++ vext.16 q9, q2, q3, #7 ++ vext.16 q8, q15, q2, #7 ++ vext.16 q13, q0, q1, #7 ++ vext.16 q12, q15, q0, #7 ++ vadd.u16 q9, q3 ++ vadd.u16 q8, q2 ++ vadd.u16 q13, q1 ++ vadd.u16 q12, q0 ++ vmov.u16 r3, d7[3] @ Save final pel ++ vmov.u16 r2, d3[3] @ Save final pel ++ ++ vext.16 q2, q8, q9, #1 ++ vext.16 q3, q9, q9, #1 ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q13, #1 ++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q2, q8 ++ vadd.u16 q3, q9 ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ ++ vrshr.u16 q2, #2 ++ vrshr.u16 q3, #2 ++ vrshr.u16 q0, #2 ++ vrshr.u16 q1, #2 ++ vrshr.u16 d30, #2 ++ vmov.u16 d7[3], r3 @ Restore final pel ++ vmov.u16 d3[3], r2 @ Restore final pel ++ vdup.u16 d31, d30[0] @ d31[3] = d30[0] ++ ++10: ++ vst1.16 {q2, q3}, [r1] @ Up ++ vst1.16 {d31[3]}, [r12] @ Up-left ++ vst1.16 {q0, q1}, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++@ int ff_hevc_rpi_intra_filter_16_neon_16( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 4 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_16_neon_16, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]" ++ ++ vdup.16 q9, d16[0] ++ vdup.16 q11, d20[0] ++ ++ it cs ++ vldmcs r6, {d16-d19} ++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #12 ++ @ Given chroma frame layout, if UR exists then it is always legit to ++ @ load all of it even if most of it is outside the frame. 
++ vldm r5, {d20-d23} ++ bgt 1f ++ bge 4f ++ cmp r5, #8 ++ bge 3f ++ vdup.16 d21, d20[3] ++3: vdup.16 d22, d21[3] ++4: vdup.16 d23, d22[3] ++ ++1: ++ lsls r7, #AVAIL_S_L_N_DL_C ++ ldr r12, [sp, #dl_size] ++ vdup.16 q1, d0[0] ++ vdup.16 q2, d0[0] ++ vdup.16 q3, d0[0] ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10], r9 ++ vld1.16 {d0[3]}, [r3], r9 ++ vld1.16 {d1[0]}, [r10], r9 ++ vld1.16 {d1[1]}, [r3], r9 ++ vld1.16 {d1[2]}, [r10], r9 ++ vld1.16 {d1[3]}, [r3], r9 ++ vld1.16 {d2[0]}, [r10], r9 ++ vld1.16 {d2[1]}, [r3], r9 ++ vld1.16 {d2[2]}, [r10], r9 ++ vld1.16 {d2[3]}, [r3], r9 ++ vld1.16 {d3[0]}, [r10], r9 ++ vld1.16 {d3[1]}, [r3], r9 ++ vld1.16 {d3[2]}, [r10] ++ vld1.16 {d3[3]}, [r3] ++1: ++ bcc 1f ++ vld1.16 {d4[0]}, [r4], r9 ++ vld1.16 {d4[1]}, [r8], r9 ++ cmp r12, #4 ++ vld1.16 {d4[2]}, [r4], r9 ++ vld1.16 {d4[3]}, [r8], r9 ++ ble 2f ++ vld1.16 {d5[0]}, [r4], r9 ++ vld1.16 {d5[1]}, [r8], r9 ++ cmp r12, #12 ++ vld1.16 {d5[2]}, [r4], r9 ++ vld1.16 {d5[3]}, [r8], r9 ++ blt 3f ++ vld1.16 {d6[0]}, [r4], r9 ++ vld1.16 {d6[1]}, [r8], r9 ++ vld1.16 {d6[2]}, [r4], r9 ++ vld1.16 {d6[3]}, [r8], r9 ++ ble 4f ++ vld1.16 {d7[0]}, [r4], r9 ++ vld1.16 {d7[1]}, [r8], r9 ++ vld1.16 {d7[2]}, [r4] ++ vld1.16 {d7[3]}, [r8] ++ b 1f ++2: vdup.16 d5, d4[3] ++3: vdup.16 d6, d5[3] ++4: vdup.16 d7, d6[3] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ vpush {q5} ++ @ Luma light filter ++ @ Left ++ vext.16 q5, q2, q3, #7 ++ vext.16 q14, q1, q2, #7 ++ vext.16 q13, q0, q1, #7 ++ vext.16 q12, q15, q0, #7 ++ ++ vadd.u16 q5, q3 ++ vadd.u16 q14, q2 ++ vadd.u16 q13, q1 ++ vadd.u16 q12, q0 ++ vmov.u16 r2, d7[3] @ Save final pel ++ ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q14, #1 ++ vext.16 q2, q14, q5, #1 ++ vext.16 q3, q5, q5, #1 ++ ++ vmov d30, d24 @ d30[0] = l[0] + ul ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ vadd.u16 q2, q14 ++ vadd.u16 q3, q5 ++ ++ vrshr.u16 q0, #2 ++ vrshr.u16 q1, #2 ++ vrshr.u16 q2, #2 ++ vrshr.u16 q3, #2 ++ ++ @ Up ++ vext.16 q5, q10, q11, #7 ++ vext.16 q14, q9, q10, #7 ++ vext.16 q13, q8, q9, #7 ++ vext.16 q12, q15, q8, #7 ++ ++ vadd.u16 q5, q11 ++ vadd.u16 q14, q10 ++ vadd.u16 q13, q9 ++ vadd.u16 q12, q8 ++ vmov.u16 r3, d23[3] @ Save final pel ++ ++ vext.16 q8, q12, q13, #1 ++ vext.16 q9, q13, q14, #1 ++ vext.16 q10, q14, q5, #1 ++ vext.16 q11, q5, q5, #1 ++ ++ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q8, q12 ++ vadd.u16 q9, q13 ++ vadd.u16 q10, q14 ++ vadd.u16 q11, q5 ++ ++ vrshr.u16 q8, #2 ++ vrshr.u16 q9, #2 ++ vrshr.u16 q10, #2 ++ vrshr.u16 q11, #2 ++ ++ @ Misc ++ vrshr.u16 d30, #2 ++ vmov.u16 d7[3], r2 @ Restore final pel ++ vmov.u16 d23[3], r3 @ Restore final pel ++ vdup.u16 d31, d30[0] @ d31[3] = d30[0] ++ vpop {q5} ++ ++10: ++ vstm r1, {d16-d23} @ Up ++ vst1.16 {d31[3]}, [r12] @ Up-left ++ vstm r0, { d0-d7 } @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], 
"d4[],d5[]", "d6[],d7[]" ++ ++ it cs ++ vldmcs r6, {d4, d5} ++ it mi ++ vldmmi r5, {d6, d7} ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, d0[0] ++ add r12, r0, #-pw ++ bpl 1f ++ vld1.32 {d0[0]}, [r10], r9 ++ vld1.32 {d0[1]}, [r3], r9 ++ vld1.32 {d1[0]}, [r10] ++ vld1.32 {d1[1]}, [r3] ++1: ++ bcc 1f ++ vld1.32 {d2[0]}, [r4], r9 ++ vld1.32 {d2[1]}, [r8], r9 ++ vld1.32 {d3[0]}, [r4] ++ vld1.32 {d3[1]}, [r8] ++1: ++ vst1.32 {q2, q3 }, [r1] @ Up ++ vst1.32 {d31[1]}, [r12] ++ vst1.32 {q0, q1 }, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_8_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]" ++ ++ vdup.32 q9, d16[0] ++ vdup.32 q11, d20[0] ++ ++ it cs ++ vldmcs r6, {q8, q9 } ++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #p_size ++ vldm r5, {q10, q11} ++ bge 1f ++ vdup.32 q11, d21[1] ++1: ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, d0[0] ++ vdup.32 q2, d0[0] ++ vdup.32 q3, d0[0] ++ bpl 1f ++ vld1.32 {d0[0]}, [r10], r9 ++ vld1.32 {d0[1]}, [r3], r9 ++ vld1.32 {d1[0]}, [r10], r9 ++ vld1.32 {d1[1]}, [r3], r9 ++ vld1.32 {d2[0]}, [r10], r9 ++ vld1.32 {d2[1]}, [r3], r9 ++ vld1.32 {d3[0]}, [r10] ++ vld1.32 {d3[1]}, [r3] ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ vld1.32 {d4[0]}, [r4], r9 ++ vld1.32 {d4[1]}, [r8], r9 ++ cmp r12, #p_size ++ vld1.32 {d5[0]}, [r4], r9 ++ vld1.32 {d5[1]}, [r8], r9 ++ blt 2f ++ vld1.32 {d6[0]}, [r4], r9 ++ vld1.32 {d6[1]}, [r8], r9 ++ vld1.32 {d7[0]}, [r4] ++ vld1.32 {d7[1]}, [r8] ++ b 1f ++2: ++ vdup.32 q3, d5[1] ++1: ++ add r12, r0, #-pw ++ vstm r1, { q8-q11} @ Up ++ vst1.32 {d31[1]}, [r12] ++ vstm r0, { q0-q3 } @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_16_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 4 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_16_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1] ++ ++ @ Once we get this big we have run out of neon regs to store ++ @ everything at once so do in pieces ++ ++ @ Up (have) ++ it cs ++ vldmcs r6, { q0-q3 } ++ ldr r12, [sp, #ur_size] ++ it mi ++ vldmmi r5, { q8-q11} ++ it cs ++ vstmcs r1, { q0-q3 } ++ bpl 1f ++ cmp r12, #12 ++ add lr, r1, #(pw << log2_s) ++ bgt 2f ++ cmp r12, #8 ++ bge 3f ++ vdup.16 q9, d17[1] ++4: vdup.16 d10, d19[1] ++3: vdup.16 q11, d21[1] ++2: vstm lr, { q8-q11} 
++1: ++ ++ @ Left (have) ++ add lr, r0, #-pw ++ lsls r12, r7, #AVAIL_S_L_N_DL_C ++ vst1.32 {d30[1]}, [lr] @ UL ++ bpl 1f ++ vld1.32 { d0[0]}, [r10], r9 ++ vld1.32 { d0[1]}, [r3], r9 ++ vld1.32 { d1[0]}, [r10], r9 ++ vld1.32 { d1[1]}, [r3], r9 ++ vld1.32 { d2[0]}, [r10], r9 ++ vld1.32 { d2[1]}, [r3], r9 ++ vld1.32 { d3[0]}, [r10], r9 ++ vld1.32 { d3[1]}, [r3], r9 ++ vld1.32 { d4[0]}, [r10], r9 ++ vld1.32 { d4[1]}, [r3], r9 ++ vld1.32 { d5[0]}, [r10], r9 ++ vld1.32 { d5[1]}, [r3], r9 ++ vld1.32 { d6[0]}, [r10], r9 ++ vld1.32 { d6[1]}, [r3], r9 ++ vld1.32 { d7[0]}, [r10] ++ vld1.32 { d7[1]}, [r3] ++ vstm r0, { q0-q3 } ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ add lr, r0, #(pw << log2_s) ++ vld1.32 {d16[0]}, [r4], r9 ++ vld1.32 {d16[1]}, [r8], r9 ++ cmp r12, #4 ++ vld1.32 {d17[0]}, [r4], r9 ++ vld1.32 {d17[1]}, [r8], r9 ++ ble 2f ++ vld1.32 {d18[0]}, [r4], r9 ++ vld1.32 {d18[1]}, [r8], r9 ++ cmp r12, #12 ++ vld1.32 {d19[0]}, [r4], r9 ++ vld1.32 {d19[1]}, [r8], r9 ++ blt 3f ++ vld1.32 {d20[0]}, [r4], r9 ++ vld1.32 {d20[1]}, [r8], r9 ++ vld1.32 {d21[0]}, [r4], r9 ++ vld1.32 {d21[1]}, [r8], r9 ++ ble 4f ++ vld1.32 {d22[0]}, [r4], r9 ++ vld1.32 {d22[1]}, [r8], r9 ++ vld1.32 {d23[0]}, [r4] ++ vld1.32 {d23[1]}, [r8] ++ b 5f ++2: vdup.32 q9, d17[1] ++3: vdup.32 q10, d19[1] ++4: vdup.32 q11, d21[1] ++5: vstm lr, { q8-q11} ++1: ++ eors r7, r2 ++ beq 99f ++ ++ lsls r12, r7, #AVAIL_S_UR_N_U_C ++ vdup.32 q0, d31[0] ++ vdup.32 q1, d31[0] ++ vdup.32 q2, d31[0] ++ vdup.32 q3, d31[0] ++ add lr, r1, #(pw << log2_s) ++ vdup.32 q8, d31[1] ++ vdup.32 q9, d31[1] ++ vdup.32 q10, d31[1] ++ vdup.32 q11, d31[1] ++ it cs ++ vstmcs r1, { q0-q3 } ++ it mi ++ vstmmi lr, { q8-q11} ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q0, d30[0] ++ vdup.32 q1, d30[0] ++ vdup.32 q2, d30[0] ++ vdup.32 q3, d30[0] ++ add lr, r0, #(pw << log2_s) ++ it mi ++ vstmmi r0, { q0-q3 } ++ it cs ++ vstmcs lr, { q0-q3 } ++ ++99: ++ pop {r4-r10, pc} ++endfunc ++ ++ ++ ++ diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S new file mode 100644 index 0000000000..ccf13a081f @@ -12940,10 +14124,10 @@ index d181b74570..c52c450956 100644 if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c new file mode 100644 -index 0000000000..4891a79eb5 +index 0000000000..f053ebcc59 --- /dev/null +++ b/libavcodec/rpi_hevc_cabac.c -@@ -0,0 +1,2269 @@ +@@ -0,0 +1,2266 @@ +/* + * HEVC CABAC decoding + * @@ -13831,9 +15015,9 @@ index 0000000000..4891a79eb5 + int x_cb = x0 >> s->ps.sps->log2_min_cb_size; + int y_cb = y0 >> s->ps.sps->log2_min_cb_size; + -+ if (lc->ctb_left_flag || x0b) ++ if ((lc->ctb_avail & AVAIL_L) != 0 || x0b) + depth_left = s->tab_ct_depth[(y_cb) * s->ps.sps->min_cb_width + x_cb - 1]; -+ if (lc->ctb_up_flag || y0b) ++ if ((lc->ctb_avail & AVAIL_U) != 0 || y0b) + depth_top = s->tab_ct_depth[(y_cb - 1) * s->ps.sps->min_cb_width + x_cb]; + + inc += (depth_left > ct_depth); @@ -14371,7 +15555,6 @@ index 0000000000..4891a79eb5 + + // Rewrite as add residual - must rewrite all fields as different union member + pc->type = RPI_PRED_ADD_RESIDUAL_V; -+ pc->c_idx = c_idx; + pc->ta.buf = coeffs; + pc->ta.dst = dst; + pc->ta.stride = stride; @@ -14384,7 +15567,6 @@ index 0000000000..4891a79eb5 + + cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? 
c_idx : 0); + cmd->size = log2_trafo_size; -+ cmd->c_idx = c_idx; + cmd->ta.buf = coeffs; + cmd->ta.dst = dst; + cmd->ta.stride = stride; @@ -14440,7 +15622,6 @@ index 0000000000..4891a79eb5 + + cmd->type = RPI_PRED_ADD_DC + c_idx; + cmd->size = log2_trafo_size; -+ cmd->c_idx = c_idx; + cmd->dc.dst = dst; + cmd->dc.stride = stride; + cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; @@ -15215,7 +16396,7 @@ index 0000000000..4891a79eb5 +#endif diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h new file mode 100644 -index 0000000000..a360815a36 +index 0000000000..f6daf936ca --- /dev/null +++ b/libavcodec/rpi_hevc_cabac_fns.h @@ -0,0 +1,190 @@ @@ -15349,9 +16530,9 @@ index 0000000000..a360815a36 + const uint8_t * const skip_bits = s->skip_flag + y_cb * stride; + + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG + -+ ((!lc->ctb_left_flag && (x0 & ctb_mask) == 0) ? 0 : ++ (((lc->ctb_avail & AVAIL_L) == 0 && (x0 & ctb_mask) == 0) ? 0 : + (skip_bits[((x_cb - 1) >> 3)] >> ((x_cb - 1) & 7)) & 1) + -+ ((!lc->ctb_up_flag && (y0 & ctb_mask) == 0) ? 0 : ++ (((lc->ctb_avail & AVAIL_U) == 0 && (y0 & ctb_mask) == 0) ? 0 : + (skip_bits[(x_cb >> 3) - stride] >> (x_cb & 7)) & 1)); +} + @@ -15529,10 +16710,10 @@ index 0000000000..0aee673d8b +#endif /* AVCODEC_RPI_HEVC_DATA_H */ diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c new file mode 100644 -index 0000000000..4bfa000da4 +index 0000000000..05d447eaa5 --- /dev/null +++ b/libavcodec/rpi_hevc_filter.c -@@ -0,0 +1,1236 @@ +@@ -0,0 +1,1210 @@ +/* + * HEVC video decoder + * @@ -15653,28 +16834,6 @@ index 0000000000..4bfa000da4 + return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; +} + -+static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src) -+{ -+int i, j; -+ -+ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) { -+ for (i = 0; i < height; i++) { -+ for (j = 0; j < width; j+=8) -+ AV_COPY64U(dst+j, src+j); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } else { -+ for (i = 0; i < height; i++) { -+ for (j = 0; j < width; j+=16) -+ AV_COPY128(dst+j, src+j); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } -+} -+ +// "DSP" these? +static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) +{ @@ -15914,7 +17073,7 @@ index 0000000000..4bfa000da4 + [2*MAX_PB_SIZE*MAX_PB_SIZE]; + dst = dstbuf; + stride_dst = 2*MAX_PB_SIZE; -+ copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); ++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); + if (sliced && c_idx != 0) + { + s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, @@ -16028,10 +17187,7 @@ index 0000000000..4bfa000da4 + } + } + -+ copy_CTB(dst, -+ src, -+ width << sh, -+ height, stride_dst, stride_src); ++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); + + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); @@ -16063,7 +17219,6 @@ index 0000000000..4bfa000da4 + horiz_edge, + diag_edge); + } -+ // ??? Does this actually work for chroma ??? 
+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, + x, y, width, height, c_idx); + sao->type_idx[c_idx] = SAO_APPLIED; @@ -16771,10 +17926,10 @@ index 0000000000..4bfa000da4 + diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c new file mode 100644 -index 0000000000..93a6294c76 +index 0000000000..f283f01489 --- /dev/null +++ b/libavcodec/rpi_hevc_mvs.c -@@ -0,0 +1,759 @@ +@@ -0,0 +1,704 @@ +/* + * HEVC video decoder + * @@ -16816,43 +17971,6 @@ index 0000000000..93a6294c76 + { 3, 2, }, +}; + -+void ff_hevc_rpi_set_neighbour_available(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, -+ const int nPbW, const int nPbH) -+{ -+ int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size); -+ int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size); -+ -+ lc->na.cand_up = (lc->ctb_up_flag || y0b); -+ lc->na.cand_left = (lc->ctb_left_flag || x0b); -+ lc->na.cand_up_left = (!x0b && !y0b) ? lc->ctb_up_left_flag : lc->na.cand_left && lc->na.cand_up; -+ lc->na.cand_up_right = (x0 + nPbW) >= lc->end_of_ctb_x ? -+ (lc->ctb_up_right_flag && !y0b) : lc->na.cand_up; -+ lc->na.cand_bottom_left = ((y0 + nPbH) >= lc->end_of_ctb_y) ? 0 : lc->na.cand_left; -+} -+ -+/* -+ * 6.4.1 Derivation process for z-scan order block availability -+ */ -+static av_always_inline int z_scan_block_avail(const HEVCRpiContext * const s, const int xCurr, const int yCurr, -+ const int xN, const int yN) -+{ -+#define MIN_TB_ADDR_ZS(x, y) \ -+ s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)] -+ -+ int xCurr_ctb = xCurr >> s->ps.sps->log2_ctb_size; -+ int yCurr_ctb = yCurr >> s->ps.sps->log2_ctb_size; -+ int xN_ctb = xN >> s->ps.sps->log2_ctb_size; -+ int yN_ctb = yN >> s->ps.sps->log2_ctb_size; -+ if( yN_ctb < yCurr_ctb || xN_ctb < xCurr_ctb ) -+ return 1; -+ else { -+ int Curr = MIN_TB_ADDR_ZS((xCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask, -+ (yCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask); -+ int N = MIN_TB_ADDR_ZS((xN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask, -+ (yN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask); -+ return N <= Curr; -+ } -+} + +//check if the two luma locations belong to the same motion estimation region +static av_always_inline int is_diff_mer(const HEVCRpiContext * const s, int xN, int yN, int xP, int yP) @@ -17042,9 +18160,6 @@ index 0000000000..93a6294c76 +#define AVAILABLE(cand, v) \ + (cand && !(TAB_MVF_PU(v).pred_flag == PF_INTRA)) + -+#define PRED_BLOCK_AVAILABLE(v) \ -+ z_scan_block_avail(s, x0, y0, x ## v, y ## v) -+ +#define COMPARE_MV_REFIDX(a, b) \ + compare_mv_ref_idx(TAB_MVF_PU(a), TAB_MVF_PU(b)) + @@ -17053,7 +18168,7 @@ index 0000000000..93a6294c76 + */ +static void derive_spatial_merge_candidates(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, + int nPbW, int nPbH, -+ int log2_cb_size, ++ int log2_cb_size, const unsigned int avail, + int singleMCLFlag, int part_idx, + int merge_idx, + struct MvField mergecandlist[]) @@ -17062,13 +18177,6 @@ index 0000000000..93a6294c76 + const MvField * const tab_mvf = s->ref->tab_mvf; + + const int min_pu_width = s->ps.sps->min_pu_width; -+ -+ const int cand_bottom_left = lc->na.cand_bottom_left; -+ const int cand_left = lc->na.cand_left; -+ const int cand_up_left = lc->na.cand_up_left; -+ const int cand_up = lc->na.cand_up; -+ const int cand_up_right = lc->na.cand_up_right; -+ + const int xA1 = x0 - 1; + const int yA1 = y0 + nPbH - 1; + @@ -17106,7 +18214,7 @@ index 0000000000..93a6294c76 + is_diff_mer(s, xA1, yA1, 
x0, y0)) { + is_available_a1 = 0; + } else { -+ is_available_a1 = AVAILABLE(cand_left, A1); ++ is_available_a1 = AVAILABLE((avail & AVAIL_L) != 0, A1); + if (is_available_a1) { + mergecandlist[nb_merge_cand] = TAB_MVF_PU(A1); + if (merge_idx == 0) @@ -17122,7 +18230,7 @@ index 0000000000..93a6294c76 + is_diff_mer(s, xB1, yB1, x0, y0)) { + is_available_b1 = 0; + } else { -+ is_available_b1 = AVAILABLE(cand_up, B1); ++ is_available_b1 = AVAILABLE((avail & AVAIL_U) != 0, B1); + if (is_available_b1 && + !(is_available_a1 && COMPARE_MV_REFIDX(B1, A1))) { + mergecandlist[nb_merge_cand] = TAB_MVF_PU(B1); @@ -17133,8 +18241,7 @@ index 0000000000..93a6294c76 + } + + // above right spatial merge candidate -+ is_available_b0 = AVAILABLE(cand_up_right, B0) && -+ PRED_BLOCK_AVAILABLE(B0) && ++ is_available_b0 = AVAILABLE((avail & AVAIL_UR) != 0, B0) && + !is_diff_mer(s, xB0, yB0, x0, y0); + + if (is_available_b0 && @@ -17146,8 +18253,7 @@ index 0000000000..93a6294c76 + } + + // left bottom spatial merge candidate -+ is_available_a0 = AVAILABLE(cand_bottom_left, A0) && -+ PRED_BLOCK_AVAILABLE(A0) && ++ is_available_a0 = AVAILABLE((avail & AVAIL_DL) != 0, A0) && + !is_diff_mer(s, xA0, yA0, x0, y0); + + if (is_available_a0 && @@ -17159,7 +18265,7 @@ index 0000000000..93a6294c76 + } + + // above left spatial merge candidate -+ is_available_b2 = AVAILABLE(cand_up_left, B2) && ++ is_available_b2 = AVAILABLE((avail & AVAIL_UL) != 0, B2) && + !is_diff_mer(s, xB2, yB2, x0, y0); + + if (is_available_b2 && @@ -17261,8 +18367,8 @@ index 0000000000..93a6294c76 + part_idx = 0; + } + -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, nPbW, nPbH); + derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH), + singleMCLFlag, part_idx, + merge_idx, mergecand_list); + @@ -17344,8 +18450,9 @@ index 0000000000..93a6294c76 + (y ## v) >> s->ps.sps->log2_min_pu_size, \ + pred, &mx, ref_idx_curr, ref_idx) + -+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, int x0, int y0, int nPbW, -+ int nPbH, int log2_cb_size, int part_idx, ++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, ++ int x0, int y0, int nPbW, int nPbH, ++ int log2_cb_size, const unsigned int avail, int part_idx, + int merge_idx, MvField * const mv, + int mvp_lx_flag, int LX) +{ @@ -17375,11 +18482,6 @@ index 0000000000..93a6294c76 + int pred_flag_index_l0; + int pred_flag_index_l1; + -+ const int cand_bottom_left = lc->na.cand_bottom_left; -+ const int cand_left = lc->na.cand_left; -+ const int cand_up_left = lc->na.cand_up_left; -+ const int cand_up = lc->na.cand_up; -+ const int cand_up_right = lc->na.cand_up_right; + ref_idx_curr = LX; + ref_idx = mv->ref_idx[LX]; + pred_flag_index_l0 = LX; @@ -17389,14 +18491,13 @@ index 0000000000..93a6294c76 + xA0 = x0 - 1; + yA0 = y0 + nPbH; + -+ is_available_a0 = AVAILABLE(cand_bottom_left, A0) && -+ PRED_BLOCK_AVAILABLE(A0); ++ is_available_a0 = AVAILABLE((avail & AVAIL_DL) != 0, A0); + + //left spatial merge candidate + xA1 = x0 - 1; + yA1 = y0 + nPbH - 1; + -+ is_available_a1 = AVAILABLE(cand_left, A1); ++ is_available_a1 = AVAILABLE((avail & AVAIL_L), A1); + if (is_available_a0 || is_available_a1) + isScaledFlag_L0 = 1; + @@ -17443,18 +18544,17 @@ index 0000000000..93a6294c76 + xB0 = x0 + nPbW; + yB0 = y0 - 1; + -+ is_available_b0 = AVAILABLE(cand_up_right, B0) && -+ PRED_BLOCK_AVAILABLE(B0); ++ is_available_b0 = AVAILABLE((avail & AVAIL_UR) != 0, B0); + + // 
above spatial merge candidate + xB1 = x0 + nPbW - 1; + yB1 = y0 - 1; -+ is_available_b1 = AVAILABLE(cand_up, B1); ++ is_available_b1 = AVAILABLE((avail & AVAIL_U) != 0, B1); + + // above left spatial merge candidate + xB2 = x0 - 1; + yB2 = y0 - 1; -+ is_available_b2 = AVAILABLE(cand_up_left, B2); ++ is_available_b2 = AVAILABLE((avail & AVAIL_UL) != 0, B2); + + // above right spatial merge candidate + if (is_available_b0) { @@ -17726,10 +18826,10 @@ index 0000000000..4b4d032a16 +#endif /* AVCODEC_RPI_HEVC_PARSE_H */ diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c new file mode 100644 -index 0000000000..744e7cf248 +index 0000000000..4967b3f44c --- /dev/null +++ b/libavcodec/rpi_hevc_ps.c -@@ -0,0 +1,1957 @@ +@@ -0,0 +1,1934 @@ +/* + * HEVC Parameter Set decoding + * @@ -19107,7 +20207,6 @@ index 0000000000..744e7cf248 + av_freep(&pps->tile_size); + av_freep(&pps->tile_id); + av_freep(&pps->ctb_ts_flags); -+ av_freep(&pps->min_tb_addr_zs_tab); + + av_freep(&pps); +} @@ -19172,7 +20271,6 @@ index 0000000000..744e7cf248 +static inline int setup_pps(AVCodecContext * const avctx, + HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) +{ -+ int log2_diff; + int pic_area_in_ctbs; + int i, j, x, y, ctb_addr_rs, tile_id; + @@ -19276,9 +20374,8 @@ index 0000000000..744e7cf248 + pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size)); + pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts)); + pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags)); -+ pps->min_tb_addr_zs_tab = av_malloc_array((sps->tb_mask+2) * (sps->tb_mask+2), sizeof(*pps->min_tb_addr_zs_tab)); + if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs || -+ !pps->tile_id || !pps->min_tb_addr_zs_tab || pps->tile_pos_ts == NULL || pps->tile_size == NULL) { ++ !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) { + return AVERROR(ENOMEM); + } + @@ -19374,26 +20471,6 @@ index 0000000000..744e7cf248 + } + } + -+ log2_diff = sps->log2_ctb_size - sps->log2_min_tb_size; -+ pps->min_tb_addr_zs = &pps->min_tb_addr_zs_tab[1*(sps->tb_mask+2)+1]; -+ for (y = 0; y < sps->tb_mask+2; y++) { -+ pps->min_tb_addr_zs_tab[y*(sps->tb_mask+2)] = -1; -+ pps->min_tb_addr_zs_tab[y] = -1; -+ } -+ for (y = 0; y < sps->tb_mask+1; y++) { -+ for (x = 0; x < sps->tb_mask+1; x++) { -+ int tb_x = x >> log2_diff; -+ int tb_y = y >> log2_diff; -+ int rs = sps->ctb_width * tb_y + tb_x; -+ int val = pps->ctb_addr_rs_to_ts[rs] << (log2_diff * 2); -+ for (i = 0; i < log2_diff; i++) { -+ int m = 1 << i; -+ val += (m & x ? m * m : 0) + (m & y ? 
2 * m * m : 0); -+ } -+ pps->min_tb_addr_zs[y * (sps->tb_mask+2) + x] = val; -+ } -+ } -+ + return 0; +} + @@ -19689,10 +20766,10 @@ index 0000000000..744e7cf248 +} diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h new file mode 100644 -index 0000000000..00c1f14614 +index 0000000000..77af463e31 --- /dev/null +++ b/libavcodec/rpi_hevc_ps.h -@@ -0,0 +1,444 @@ +@@ -0,0 +1,442 @@ +/* + * HEVC parameter set parsing + * @@ -20099,8 +21176,6 @@ index 0000000000..00c1f14614 + uint16_t *tile_id; ///< TileId + uint16_t *tile_pos_ts; ///< TilePosRS + uint16_t *tile_size; ///< TileSize -+ int *min_tb_addr_zs; ///< MinTbAddrZS -+ int *min_tb_addr_zs_tab;///< MinTbAddrZS + uint8_t * ctb_ts_flags; + + uint8_t data[4096]; @@ -20108,14 +21183,14 @@ index 0000000000..00c1f14614 +} HEVCRpiPPS; + +typedef struct HEVCRpiParamSets { -+ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT]; -+ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT]; -+ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT]; -+ + /* currently active parameter sets */ + const HEVCRpiVPS *vps; + const HEVCRpiSPS *sps; + const HEVCRpiPPS *pps; ++ ++ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT]; ++ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT]; ++ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT]; +} HEVCRpiParamSets; + +int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx, @@ -25730,210 +26805,234 @@ index 0000000000..3caef20137 + diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h new file mode 100644 -index 0000000000..1c364492d0 +index 0000000000..18128f4311 --- /dev/null +++ b/libavcodec/rpi_hevc_transform10.h -@@ -0,0 +1,94 @@ +@@ -0,0 +1,106 @@ +static const unsigned char rpi_hevc_transform10 [] = { -+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 -+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 -+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 -+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 -+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 -+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 -+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030 -+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 -+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 -+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 -+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 -+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 -+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 -+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 -+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 -+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 -+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 -+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 -+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090 -+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 -+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 -+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 -+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 -+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 -+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 -+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 -+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 -+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 -+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 -+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 -+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 -+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 
00f8 -+0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 -+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 -+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 -+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 -+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 -+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 -+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 -+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 -+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 -+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 -+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 -+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 -+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 -+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 -+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 -+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 -+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 -+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 -+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 -+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 -+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 -+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 -+0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8 -+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 -+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 -+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 -+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 -+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 -+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 -+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 -+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 -+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 -+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 -+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 -+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 -+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 -+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 -+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 -+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 -+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 -+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 -+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 -+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 -+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 -+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 -+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 -+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 -+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 -+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 -+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 -+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 -+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 ++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xe8, // 0000 ++0x20, 
0x00, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0x88, // 0008 ++0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x40, 0xe8, // 0010 ++0x00, 0x02, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0xa8, // 0018 ++0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x00, 0x60, // 0020 ++0x03, 0xe8, 0x20, 0x00, 0x00, 0x00, 0x07, 0xe8, // 0028 ++0x00, 0x02, 0x00, 0x00, 0x08, 0xe8, 0x00, 0x04, // 0030 ++0x00, 0x00, 0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, // 0038 ++0x05, 0xe8, 0x00, 0x02, 0x00, 0x00, 0x39, 0xef, // 0040 ++0xc0, 0xfd, 0xff, 0xff, 0x2b, 0xef, 0x40, 0x00, // 0048 ++0x00, 0x00, 0x5b, 0x7a, 0x5b, 0x7c, 0x4a, 0xc3, // 0050 ++0x50, 0x17, 0x02, 0x6f, 0x02, 0x6a, 0x32, 0x18, // 0058 ++0x0a, 0x6a, 0x16, 0x40, 0x04, 0x18, 0x1a, 0x66, // 0060 ++0x80, 0x90, 0x33, 0x00, 0x0c, 0xf8, 0x00, 0x80, // 0068 ++0x00, 0x00, 0xc0, 0x08, 0x18, 0x00, 0x80, 0x90, // 0070 ++0x5e, 0x00, 0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, // 0078 ++0x20, 0x08, 0x10, 0x00, 0x4c, 0xfe, 0x30, 0xc0, // 0080 ++0x09, 0x04, 0x20, 0x08, 0x00, 0x00, 0x04, 0xfe, // 0088 ++0x00, 0x90, 0x80, 0x02, 0x00, 0x08, 0x02, 0x00, // 0090 ++0x80, 0x90, 0x4d, 0x00, 0x04, 0xff, 0x30, 0xc0, // 0098 ++0x80, 0x03, 0x20, 0x08, 0x14, 0x00, 0x4c, 0xfe, // 00a0 ++0x30, 0xc0, 0x06, 0x04, 0x20, 0x08, 0x00, 0x00, // 00a8 ++0x8c, 0xf8, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x30, // 00b0 ++0x04, 0x00, 0x80, 0x45, 0x71, 0x42, 0xf2, 0x8c, // 00b8 ++0xd1, 0xc0, 0x39, 0xef, 0x40, 0x02, 0x00, 0x00, // 00c0 ++0x00, 0x9e, 0x7f, 0x00, 0x29, 0x03, 0x00, 0xfe, // 00c8 ++0x00, 0x80, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, // 00d0 ++0xb6, 0x40, 0x8c, 0xf8, 0x20, 0x00, 0x00, 0x00, // 00d8 ++0x00, 0x30, 0x18, 0x00, 0x15, 0x40, 0x08, 0xf8, // 00e0 ++0x00, 0x80, 0x00, 0x00, 0xc0, 0x03, 0x14, 0x00, // 00e8 ++0x66, 0xed, 0xe0, 0xff, 0xff, 0xff, 0x88, 0xf8, // 00f0 ++0x20, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x18, 0x00, // 00f8 ++0x0c, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 0100 ++0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0x84, 0x6e, // 0108 ++0x09, 0x18, 0x69, 0xa0, 0x04, 0x5f, 0x1c, 0x8b, // 0110 ++0xf6, 0xc8, 0x45, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0118 ++0x63, 0x1f, 0xb6, 0x40, 0x04, 0xe8, 0x40, 0x00, // 0120 ++0x00, 0x00, 0x05, 0xe8, 0x00, 0x02, 0x00, 0x00, // 0128 ++0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0130 ++0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0138 ++0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0140 ++0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0148 ++0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0150 ++0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0158 ++0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0160 ++0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0168 ++0x5a, 0x00, 0x00, 0xf6, 0x00, 0x80, 0x00, 0x04, // 0170 ++0x20, 0xed, 0x00, 0x08, 0x00, 0x00, 0x04, 0xe8, // 0178 ++0x20, 0x00, 0x00, 0x00, 0x8e, 0xf8, 0x20, 0x00, // 0180 ++0x00, 0x00, 0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, // 0188 ++0x00, 0x80, 0x81, 0x03, 0x26, 0xed, 0xe0, 0xff, // 0190 ++0xff, 0xff, 0x88, 0xf0, 0x20, 0x00, 0x86, 0x03, // 0198 ++0x08, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 01a0 ++0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0xa4, 0x6e, // 01a8 ++0x7f, 0x90, 0xb9, 0xff, 0x65, 0xa0, 0x04, 0x07, // 01b0 ++0x18, 0x8b, 0xf5, 0xc8, 0x41, 0xe8, 0x20, 0x00, // 01b8 ++0x00, 0x00, 0x66, 0x1f, 0x5a, 0x00, 0xe1, 0x40, // 01c0 ++0xf2, 0x40, 0x4f, 0xc3, 0x50, 0x7f, 0x02, 0x6f, // 01c8 ++0x03, 0xe8, 0x80, 0x00, 0x00, 0x00, 0x07, 0xe8, // 01d0 ++0x00, 0x02, 0x00, 0x00, 0xe8, 0x00, 0x08, 0x6d, // 01d8 ++0xe8, 0xbf, 0x80, 0x01, 0x04, 0x18, 0x08, 0xed, // 01e0 ++0x20, 0x10, 0x00, 0x00, 0x89, 0x40, 0x1a, 0x40, // 01e8 ++0x02, 0x6a, 0x2e, 
0x18, 0xa1, 0x40, 0x98, 0x40, // 01f0 ++0xf2, 0x4a, 0x07, 0x1e, 0xff, 0x9f, 0xbb, 0xff, // 01f8 ++0x21, 0xed, 0x00, 0x08, 0x00, 0x00, 0x98, 0x40, // 0200 ++0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, 0x95, 0x60, // 0208 ++0x80, 0x90, 0x20, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0210 ++0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0218 ++0x80, 0x90, 0x18, 0x00, 0x04, 0xe8, 0x00, 0x02, // 0220 ++0x00, 0x00, 0x65, 0x60, 0x91, 0x40, 0xa8, 0x40, // 0228 ++0x80, 0x90, 0x10, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0230 ++0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0238 ++0x80, 0x90, 0x08, 0x00, 0x4a, 0xe8, 0x00, 0x08, // 0240 ++0x00, 0x00, 0xf2, 0x8c, 0xd5, 0xc0, 0x29, 0x03, // 0248 ++0xef, 0x03, 0x0c, 0xf8, 0x00, 0x80, 0x00, 0x00, // 0250 ++0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, 0x00, 0x84, // 0258 ++0x40, 0x00, 0xc0, 0xf8, 0x04, 0x00, 0x00, 0x60, // 0260 ++0xff, 0x9f, 0x65, 0xff, 0x00, 0xe8, 0x00, 0x04, // 0268 ++0x00, 0x00, 0xff, 0x9f, 0x70, 0xff, 0x04, 0xff, // 0270 ++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0278 ++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0280 ++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0288 ++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xe8, // 0290 ++0x40, 0x00, 0x00, 0x00, 0x8c, 0xf8, 0x2f, 0x00, // 0298 ++0x00, 0x00, 0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, // 02a0 ++0xf0, 0xcf, 0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, // 02a8 ++0x11, 0x13, 0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, // 02b0 ++0x20, 0xf7, 0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, // 02b8 ++0xf0, 0xce, 0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, // 02c0 ++0x15, 0x53, 0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, // 02c8 ++0x20, 0xf7, 0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, // 02d0 ++0xf0, 0xcd, 0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, // 02d8 ++0x19, 0x93, 0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, // 02e0 ++0x20, 0xf7, 0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, // 02e8 ++0xf0, 0xcc, 0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, // 02f0 ++0x1d, 0xd3, 0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, // 02f8 ++0x20, 0xf7, 0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, // 0300 ++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, // 0308 ++0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0310 ++0x14, 0x00, 0x00, 0xed, 0x20, 0x00, 0x00, 0x00, // 0318 ++0x8c, 0xf8, 0x2f, 0x00, 0x00, 0x00, 0xe0, 0x63, // 0320 ++0x00, 0x00, 0x6f, 0x03, 0x00, 0x00, 0x00, 0x00, // 0328 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0330 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0338 +}; diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h new file mode 100644 -index 0000000000..1128a2c054 +index 0000000000..3557348e30 --- /dev/null +++ b/libavcodec/rpi_hevc_transform8.h -@@ -0,0 +1,94 @@ +@@ -0,0 +1,106 @@ +static const unsigned char rpi_hevc_transform8 [] = { -+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 -+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 -+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 -+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 -+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 -+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 -+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030 -+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 -+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 -+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 -+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 -+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 -+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 -+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 -+0x4c, 0xfe, 
0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 -+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 -+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 -+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 -+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090 -+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 -+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 -+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 -+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 -+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 -+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 -+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 -+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 -+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 -+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 -+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 -+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 -+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 -+0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 -+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 -+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 -+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 -+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 -+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 -+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 -+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 -+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 -+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 -+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 -+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 -+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 -+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 -+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 -+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 -+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 -+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 -+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 -+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 -+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 -+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 -+0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8 -+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 -+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 -+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 -+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 -+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 -+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 -+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 -+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 -+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 -+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 -+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 -+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 -+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 -+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 -+0x20, 0xf7, 0x70, 0xcf, 
0x12, 0x23, 0x20, 0xf7, // 0258 -+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 -+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 -+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 -+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 -+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 -+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 -+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 -+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 -+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 -+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 -+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 -+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 -+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 -+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 ++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xe8, // 0000 ++0x20, 0x00, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0x88, // 0008 ++0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x40, 0xe8, // 0010 ++0x00, 0x02, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0xa8, // 0018 ++0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x00, 0x60, // 0020 ++0x03, 0xe8, 0x20, 0x00, 0x00, 0x00, 0x07, 0xe8, // 0028 ++0x00, 0x02, 0x00, 0x00, 0x08, 0xe8, 0x00, 0x04, // 0030 ++0x00, 0x00, 0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, // 0038 ++0x05, 0xe8, 0x00, 0x08, 0x00, 0x00, 0x39, 0xef, // 0040 ++0xc0, 0xfd, 0xff, 0xff, 0x2b, 0xef, 0x40, 0x00, // 0048 ++0x00, 0x00, 0x5b, 0x7a, 0x5b, 0x7c, 0x4a, 0xc3, // 0050 ++0x50, 0x17, 0x02, 0x6f, 0x02, 0x6a, 0x32, 0x18, // 0058 ++0x0a, 0x6a, 0x16, 0x40, 0x04, 0x18, 0x1a, 0x66, // 0060 ++0x80, 0x90, 0x33, 0x00, 0x0c, 0xf8, 0x00, 0x80, // 0068 ++0x00, 0x00, 0xc0, 0x08, 0x18, 0x00, 0x80, 0x90, // 0070 ++0x5e, 0x00, 0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, // 0078 ++0x20, 0x08, 0x10, 0x00, 0x4c, 0xfe, 0x30, 0xc0, // 0080 ++0x09, 0x04, 0x20, 0x08, 0x00, 0x00, 0x04, 0xfe, // 0088 ++0x00, 0x90, 0x80, 0x02, 0x00, 0x08, 0x02, 0x00, // 0090 ++0x80, 0x90, 0x4d, 0x00, 0x04, 0xff, 0x30, 0xc0, // 0098 ++0x80, 0x03, 0x20, 0x08, 0x14, 0x00, 0x4c, 0xfe, // 00a0 ++0x30, 0xc0, 0x04, 0x04, 0x20, 0x08, 0x00, 0x00, // 00a8 ++0x8c, 0xf8, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x30, // 00b0 ++0x04, 0x00, 0x80, 0x45, 0x71, 0x42, 0xf2, 0x8c, // 00b8 ++0xd1, 0xc0, 0x39, 0xef, 0x40, 0x02, 0x00, 0x00, // 00c0 ++0x00, 0x9e, 0x7f, 0x00, 0x29, 0x03, 0x00, 0xfe, // 00c8 ++0x00, 0x80, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, // 00d0 ++0xb6, 0x40, 0x8c, 0xf8, 0x20, 0x00, 0x00, 0x00, // 00d8 ++0x00, 0x30, 0x18, 0x00, 0x15, 0x40, 0x08, 0xf8, // 00e0 ++0x00, 0x80, 0x00, 0x00, 0xc0, 0x03, 0x14, 0x00, // 00e8 ++0x66, 0xed, 0xe0, 0xff, 0xff, 0xff, 0x88, 0xf8, // 00f0 ++0x20, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x18, 0x00, // 00f8 ++0x0c, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 0100 ++0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0x84, 0x6e, // 0108 ++0x09, 0x18, 0x69, 0xa0, 0x04, 0x5f, 0x1c, 0x8b, // 0110 ++0xf6, 0xc8, 0x45, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0118 ++0x63, 0x1f, 0xb6, 0x40, 0x04, 0xe8, 0x40, 0x00, // 0120 ++0x00, 0x00, 0x05, 0xe8, 0x00, 0x08, 0x00, 0x00, // 0128 ++0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0130 ++0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0138 ++0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0140 ++0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0148 ++0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0150 ++0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0158 ++0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 
0xe0, 0x03, // 0160 ++0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0168 ++0x5a, 0x00, 0x00, 0xf6, 0x00, 0x80, 0x00, 0x04, // 0170 ++0x20, 0xed, 0x00, 0x08, 0x00, 0x00, 0x04, 0xe8, // 0178 ++0x20, 0x00, 0x00, 0x00, 0x8e, 0xf8, 0x20, 0x00, // 0180 ++0x00, 0x00, 0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, // 0188 ++0x00, 0x80, 0x81, 0x03, 0x26, 0xed, 0xe0, 0xff, // 0190 ++0xff, 0xff, 0x88, 0xf0, 0x20, 0x00, 0x86, 0x03, // 0198 ++0x08, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 01a0 ++0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0xa4, 0x6e, // 01a8 ++0x7f, 0x90, 0xb9, 0xff, 0x65, 0xa0, 0x04, 0x07, // 01b0 ++0x18, 0x8b, 0xf5, 0xc8, 0x41, 0xe8, 0x20, 0x00, // 01b8 ++0x00, 0x00, 0x66, 0x1f, 0x5a, 0x00, 0xe1, 0x40, // 01c0 ++0xf2, 0x40, 0x4f, 0xc3, 0x50, 0x7f, 0x02, 0x6f, // 01c8 ++0x03, 0xe8, 0x80, 0x00, 0x00, 0x00, 0x07, 0xe8, // 01d0 ++0x00, 0x02, 0x00, 0x00, 0xe8, 0x00, 0x08, 0x6d, // 01d8 ++0xe8, 0xbf, 0x80, 0x01, 0x04, 0x18, 0x08, 0xed, // 01e0 ++0x20, 0x10, 0x00, 0x00, 0x89, 0x40, 0x1a, 0x40, // 01e8 ++0x02, 0x6a, 0x2e, 0x18, 0xa1, 0x40, 0x98, 0x40, // 01f0 ++0xf2, 0x4a, 0x07, 0x1e, 0xff, 0x9f, 0xbb, 0xff, // 01f8 ++0x21, 0xed, 0x00, 0x08, 0x00, 0x00, 0x98, 0x40, // 0200 ++0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, 0x95, 0x60, // 0208 ++0x80, 0x90, 0x20, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0210 ++0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0218 ++0x80, 0x90, 0x18, 0x00, 0x04, 0xe8, 0x00, 0x08, // 0220 ++0x00, 0x00, 0x45, 0x60, 0x91, 0x40, 0xa8, 0x40, // 0228 ++0x80, 0x90, 0x10, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0230 ++0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0238 ++0x80, 0x90, 0x08, 0x00, 0x4a, 0xe8, 0x00, 0x08, // 0240 ++0x00, 0x00, 0xf2, 0x8c, 0xd5, 0xc0, 0x29, 0x03, // 0248 ++0xef, 0x03, 0x0c, 0xf8, 0x00, 0x80, 0x00, 0x00, // 0250 ++0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, 0x00, 0x84, // 0258 ++0x40, 0x00, 0xc0, 0xf8, 0x04, 0x00, 0x00, 0x60, // 0260 ++0xff, 0x9f, 0x65, 0xff, 0x00, 0xe8, 0x00, 0x04, // 0268 ++0x00, 0x00, 0xff, 0x9f, 0x70, 0xff, 0x04, 0xff, // 0270 ++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0278 ++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0280 ++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0288 ++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xe8, // 0290 ++0x40, 0x00, 0x00, 0x00, 0x8c, 0xf8, 0x2f, 0x00, // 0298 ++0x00, 0x00, 0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, // 02a0 ++0xf0, 0xcf, 0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, // 02a8 ++0x11, 0x13, 0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, // 02b0 ++0x20, 0xf7, 0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, // 02b8 ++0xf0, 0xce, 0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, // 02c0 ++0x15, 0x53, 0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, // 02c8 ++0x20, 0xf7, 0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, // 02d0 ++0xf0, 0xcd, 0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, // 02d8 ++0x19, 0x93, 0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, // 02e0 ++0x20, 0xf7, 0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, // 02e8 ++0xf0, 0xcc, 0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, // 02f0 ++0x1d, 0xd3, 0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, // 02f8 ++0x20, 0xf7, 0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, // 0300 ++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, // 0308 ++0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0310 ++0x14, 0x00, 0x00, 0xed, 0x20, 0x00, 0x00, 0x00, // 0318 ++0x8c, 0xf8, 0x2f, 0x00, 0x00, 0x00, 0xe0, 0x63, // 0320 ++0x00, 0x00, 0x6f, 0x03, 0x00, 0x00, 0x00, 0x00, // 0328 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0330 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0338 +}; diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c new file mode 100644 -index 
0000000000..bddf0c3417 +index 0000000000..7c98f707d3 --- /dev/null +++ b/libavcodec/rpi_hevcdec.c -@@ -0,0 +1,5782 @@ +@@ -0,0 +1,5850 @@ +/* + * HEVC video Decoder + * @@ -27742,7 +28841,7 @@ index 0000000000..bddf0c3417 + + if (s->sh.slice_sample_adaptive_offset_flag[0] || + s->sh.slice_sample_adaptive_offset_flag[1]) { -+ if (lc->ctb_left_flag) ++ if ((lc->ctb_avail & AVAIL_L) != 0) + { + const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); + if (sao_merge_left_flag) { @@ -27750,7 +28849,7 @@ index 0000000000..bddf0c3417 + return; + } + } -+ if (lc->ctb_up_flag) ++ if ((lc->ctb_avail & AVAIL_U) != 0) + { + const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); + if (sao_merge_up_flag) { @@ -27832,19 +28931,97 @@ index 0000000000..bddf0c3417 + return jb->intra.cmds + jb->intra.n++; +} + -+static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx) ++#define A0(x, y, U, L, UL, UR, DL) \ ++ [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? AVAIL_DL : 0)) ++ ++#define A1(x, y, U, L, UL, UR, DL) \ ++ A0((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A0((x) + 1, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A0((x) + 0, (y) + 1, 1, (L), (L), 1, (DL)), A0((x) + 1, (y) + 1, 1, 1, 1, 0, 0 ) ++ ++#define A2(x, y, U, L, UL, UR, DL) \ ++ A1((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A1((x) + 2, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A1((x) + 0, (y) + 2, 1, (L), (L), 1, (DL)), A1((x) + 2, (y) + 2, 1, 1, 1, 0, 0 ) ++ ++#define A3(x, y, U, L, UL, UR, DL) \ ++ A2((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A2((x) + 4, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A2((x) + 0, (y) + 4, 1, (L), (L), 1, (DL)), A2((x) + 4, (y) + 4, 1, 1, 1, 0, 0 ) ++ ++#define A4(x, y, U, L, UL, UR, DL) \ ++ A3((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A3((x) + 8, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A3((x) + 0, (y) + 8, 1, (L), (L), 1, (DL)), A3((x) + 8, (y) + 8, 1, 1, 1, 0, 0 ) ++ ++static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)}; ++ ++unsigned int ff_hevc_rpi_tb_avail_flags( ++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h) ++{ ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ const unsigned int ctb_mask = ctb_size - 1; ++ const unsigned int tb_x = x & ctb_mask; ++ const unsigned int tb_y = y & ctb_mask; ++ ++ const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16; ++ ++ unsigned int f = (lc->ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL); ++ ++ if ((tb_x != 0 || tb_y != 0) && (~f & (AVAIL_L | AVAIL_U)) == 0) ++ f |= AVAIL_UL; ++ ++ ++ if (x + w >= lc->end_of_ctb_x) ++ { ++ if (tb_y == 0) ++ f |= (lc->ctb_avail & AVAIL_UR); ++ } ++ else ++ { ++ f |= (tb_y != 0) ? 
(tb_f[(w - 1) >> 2] & AVAIL_UR) : (lc->ctb_avail >> (AVAIL_S_U - AVAIL_S_UR)) & AVAIL_UR; ++ } ++#if AVAIL_S_U - AVAIL_S_UR < 0 ++#error Shift problem ++#endif ++ ++ // Never any D if Y beyond eoctb ++ if (y + h < lc->end_of_ctb_y) ++ { ++ if (tb_x == 0) ++ f |= (lc->ctb_avail << (AVAIL_S_DL - AVAIL_S_L)) & AVAIL_DL; ++ else ++ f |= tb_f[((h - 1) >> 2) * 16] & AVAIL_DL; ++ } ++#if AVAIL_S_DL - AVAIL_S_L < 0 ++#error Shift problem ++#endif ++ ++// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h, ++// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16], ++// lc->end_of_ctb_x, lc->end_of_ctb_y); ++ ++ return f; ++} ++ ++#undef A0 ++#undef A1 ++#undef A2 ++#undef A3 ++#undef A4 ++ ++static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx, ++ unsigned int avail) +{ + // If rpi_enabled then sand - U & V done on U call + if (c_idx <= 1) + { + HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); -+ cmd->type = RPI_PRED_INTRA; ++ cmd->type = RPI_PRED_INTRA + c_idx; + cmd->size = log2_trafo_size; -+ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; -+ cmd->c_idx = c_idx; ++ cmd->avail = avail; + cmd->i_pred.x = x0; + cmd->i_pred.y = y0; + cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ ++// printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail); + } +} + @@ -27872,8 +29049,8 @@ index 0000000000..bddf0c3417 + + if (lc->cu.pred_mode == MODE_INTRA) { + const unsigned int trafo_size = 1 << log2_trafo_size; -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, trafo_size, trafo_size); -+ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0); ++ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size)); + } + + if (cbf_luma || cbf_chroma != 0) @@ -27940,6 +29117,8 @@ index 0000000000..bddf0c3417 + + if (cbf_luma) + ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0); ++ ++ + if (ctx_cfmt(s) != 0 && (log2_trafo_size > 2 || ctx_cfmt(s) == 3)) { + const int trafo_size_h = 1 << (log2_trafo_size_c + ctx_hshift(s, 1)); + const int trafo_size_v = 1 << (log2_trafo_size_c + ctx_vshift(s, 1)); @@ -27952,8 +29131,8 @@ index 0000000000..bddf0c3417 + } + for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) { + if (lc->cu.pred_mode == MODE_INTRA) { -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1); ++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v)); + } + if (((cbf_chroma >> i) & CBF_CB0) != 0) + ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c), @@ -27979,10 +29158,10 @@ index 0000000000..bddf0c3417 + hls_cross_component_pred(lc, 1); + } + for (i = 0; i < (ctx_cfmt(s) == 2 ? 
2 : 1); i++) { -+ if (lc->cu.pred_mode == MODE_INTRA) { -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2); -+ } ++// if (lc->cu.pred_mode == MODE_INTRA) { ++// do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2, ++// ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v)); ++// } + if (((cbf_chroma >> i) & CBF_CR0) != 0) + ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c), + log2_trafo_size_c, scan_idx_c, 2); @@ -27993,11 +29172,12 @@ index 0000000000..bddf0c3417 + int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer; + int16_t *coeffs = (int16_t*)lc->edge_emu_buffer2; + const int size = 1 << log2_trafo_size_c; ++ int j; + + uint8_t *dst = &s->frame->data[2][(y0 >> vshift) * stride + + ((x0 >> hshift) << s->ps.sps->pixel_shift)]; -+ for (i = 0; i < (size * size); i++) { -+ coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); ++ for (j = 0; j < (size * size); j++) { ++ coeffs[j] = ((lc->tu.res_scale_val * coeffs_y[j]) >> 3); + } + s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride); + } @@ -28007,20 +29187,18 @@ index 0000000000..bddf0c3417 + int trafo_size_v = 1 << (log2_trafo_size + ctx_vshift(s, 1)); + for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) { + if (lc->cu.pred_mode == MODE_INTRA) { -+ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size), -+ trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1); ++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1, ++ ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v)); + } + if (((cbf_chroma >> i) & CBF_CB0) != 0) + ff_hevc_rpi_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size), + log2_trafo_size, scan_idx_c, 1); + } + for (i = 0; i < (ctx_cfmt(s) == 2 ? 
2 : 1); i++) { -+ if (lc->cu.pred_mode == MODE_INTRA) { -+ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size), -+ trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2); -+ } ++// if (lc->cu.pred_mode == MODE_INTRA) { ++// do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2, ++// ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v)); ++// } + if (((cbf_chroma >> i) & CBF_CR0) != 0) + ff_hevc_rpi_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size), + log2_trafo_size, scan_idx_c, 2); @@ -28030,28 +29208,29 @@ index 0000000000..bddf0c3417 + if (log2_trafo_size > 2 || ctx_cfmt(s) == 3) { + int trafo_size_h = 1 << (log2_trafo_size_c + ctx_hshift(s, 1)); + int trafo_size_v = 1 << (log2_trafo_size_c + ctx_vshift(s, 1)); -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 1); -+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 2); -+ if (ctx_cfmt(s) == 2) { -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (1 << log2_trafo_size_c), -+ trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1); -+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2); -+ } ++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 1, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size_h, trafo_size_v)); ++// do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 2, ++// ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size_h, trafo_size_v)); ++// if (ctx_cfmt(s) == 2) { ++// do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1, ++// ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0 + (1 << log2_trafo_size_c), trafo_size_h, trafo_size_v)); ++// do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2, ++// ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0 + (1 << log2_trafo_size_c), trafo_size_h, trafo_size_v)); ++// } + } else if (blk_idx == 3) { + int trafo_size_h = 1 << (log2_trafo_size + 1); + int trafo_size_v = 1 << (log2_trafo_size + ctx_vshift(s, 1)); -+ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase, -+ trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 1); -+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 2); -+ if (ctx_cfmt(s) == 2) { -+ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (1 << (log2_trafo_size)), -+ trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1); -+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2); -+ } ++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 1, ++ ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase, trafo_size_h, trafo_size_v)); ++// do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 2, ++// ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase, trafo_size_h, trafo_size_v)); ++// if (ctx_cfmt(s) == 2) { ++// do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1, ++// ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase + (1 << (log2_trafo_size)), trafo_size_h, trafo_size_v)); ++// do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2, ++// ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase + (1 << (log2_trafo_size)), trafo_size_h, trafo_size_v)); ++// } + } + } + @@ -28269,8 +29448,8 @@ index 0000000000..bddf0c3417 +{ + enum InterPredIdc 
inter_pred_idc = PRED_L0; + int mvp_flag; ++ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH); + -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, nPbW, nPbH); + mv->pred_flag = 0; + if (s->sh.slice_type == HEVC_SLICE_B) + inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH); @@ -28282,7 +29461,7 @@ index 0000000000..bddf0c3417 + mv->pred_flag = PF_L0; + ff_hevc_rpi_hls_mvd_coding(lc); + mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); -+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, avail, + part_idx, merge_idx, mv, mvp_flag, 0); + mv->mv[0].x += lc->pu.mvd.x; + mv->mv[0].y += lc->pu.mvd.y; @@ -28300,7 +29479,7 @@ index 0000000000..bddf0c3417 + + mv->pred_flag += PF_L1; + mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); -+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, avail, + part_idx, merge_idx, mv, mvp_flag, 1); + mv->mv[1].x += lc->pu.mvd.x; + mv->mv[1].y += lc->pu.mvd.y; @@ -28996,12 +30175,10 @@ index 0000000000..bddf0c3417 + int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size); + int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size); + -+ int y_ctb = (y0 >> (s->ps.sps->log2_ctb_size)) << (s->ps.sps->log2_ctb_size); -+ + // intra_pred_mode prediction does not cross vertical CTB boundaries -+ const unsigned int cand_up = (lc->ctb_up_flag || y0b) && (y0 > y_ctb) ? ++ const unsigned int cand_up = y0b != 0 ? + s->tab_ipm[(y_pu - 1) * min_pu_width + x_pu] : INTRA_DC; -+ const unsigned int cand_left = (lc->ctb_left_flag || x0b) ? ++ const unsigned int cand_left = ((lc->ctb_avail & AVAIL_L) != 0 || x0b) ? + s->tab_ipm[y_pu * min_pu_width + x_pu - 1] : INTRA_DC; + + int intra_pred_mode; @@ -29408,16 +30585,17 @@ index 0000000000..bddf0c3417 + if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w]) + lc->boundary_flags |= BOUNDARY_UPPER_SLICE; + -+ lc->ctb_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0; -+ lc->ctb_up_flag = (lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0; -+ + // Use line width rather than tile width for addr_in_slice test as + // addr_in_slice is in raster units -+ lc->ctb_up_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && -+ (ctb_addr_rs_in_slice >= line_w + 1); + -+ lc->ctb_up_right_flag = (ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 && -+ (ctb_addr_rs_in_slice + 1 >= line_w); ++ lc->ctb_avail = ++ ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) | ++ ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) | ++ ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && ++ (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) | ++ ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 && ++ (ctb_addr_rs_in_slice + 1 >= line_w) ? 
AVAIL_UR : 0); ++ // Down-left never avail at CTB level +} + + @@ -29455,22 +30633,11 @@ index 0000000000..bddf0c3417 + switch (cmd->type) + { + case RPI_PRED_INTRA: -+ { -+ HEVCRpiLocalContextIntra lci; // Abbreviated local context -+ HEVCRpiLocalContext * const lc = (HEVCRpiLocalContext *)&lci; -+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode; -+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; -+ lc->na.cand_left = (cmd->na >> 3) & 1; -+ lc->na.cand_up_left = (cmd->na >> 2) & 1; -+ lc->na.cand_up = (cmd->na >> 1) & 1; -+ lc->na.cand_up_right = (cmd->na >> 0) & 1; -+ if (cmd->c_idx == 0) -+ s->hpc.intra_pred[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); -+ else -+ s->hpc.intra_pred_c[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); ++ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail); ++ break; ++ case RPI_PRED_INTRA_C: ++ s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail); + break; -+ } -+ + case RPI_PRED_ADD_RESIDUAL: + s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); + break; @@ -31718,10 +32885,10 @@ index 0000000000..bddf0c3417 + diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h new file mode 100644 -index 0000000000..d242727b2a +index 0000000000..d2ac038c9b --- /dev/null +++ b/libavcodec/rpi_hevcdec.h -@@ -0,0 +1,1000 @@ +@@ -0,0 +1,958 @@ +/* + * HEVC video decoder + * @@ -31956,46 +33123,6 @@ index 0000000000..d242727b2a + PF_BI, +}; + -+enum IntraPredMode { -+ INTRA_PLANAR = 0, -+ INTRA_DC, -+ INTRA_ANGULAR_2, -+ INTRA_ANGULAR_3, -+ INTRA_ANGULAR_4, -+ INTRA_ANGULAR_5, -+ INTRA_ANGULAR_6, -+ INTRA_ANGULAR_7, -+ INTRA_ANGULAR_8, -+ INTRA_ANGULAR_9, -+ INTRA_ANGULAR_10, -+ INTRA_ANGULAR_11, -+ INTRA_ANGULAR_12, -+ INTRA_ANGULAR_13, -+ INTRA_ANGULAR_14, -+ INTRA_ANGULAR_15, -+ INTRA_ANGULAR_16, -+ INTRA_ANGULAR_17, -+ INTRA_ANGULAR_18, -+ INTRA_ANGULAR_19, -+ INTRA_ANGULAR_20, -+ INTRA_ANGULAR_21, -+ INTRA_ANGULAR_22, -+ INTRA_ANGULAR_23, -+ INTRA_ANGULAR_24, -+ INTRA_ANGULAR_25, -+ INTRA_ANGULAR_26, -+ INTRA_ANGULAR_27, -+ INTRA_ANGULAR_28, -+ INTRA_ANGULAR_29, -+ INTRA_ANGULAR_30, -+ INTRA_ANGULAR_31, -+ INTRA_ANGULAR_32, -+ INTRA_ANGULAR_33, -+ INTRA_ANGULAR_34, -+}; -+#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 -+#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26 -+ +enum SAOType { + SAO_NOT_APPLIED = 0, + SAO_BAND, @@ -32042,14 +33169,6 @@ index 0000000000..d242727b2a + uint8_t cu_transquant_bypass_flag; +} RpiCodingUnit; + -+typedef struct RpiNeighbourAvailable { -+ char cand_bottom_left; -+ char cand_left; -+ char cand_up; -+ char cand_up_left; -+ char cand_up_right; -+} RpiNeighbourAvailable; -+ +typedef struct RpiPredictionUnit { + uint8_t intra_pred_mode[4]; + uint8_t intra_pred_mode_c[4]; @@ -32115,14 +33234,8 @@ index 0000000000..d242727b2a + uint8_t dpb_no; +} HEVCFrame; + -+typedef struct HEVCRpiLocalContextIntra { -+ TransformUnit tu; -+ RpiNeighbourAvailable na; -+} HEVCRpiLocalContextIntra; -+ +typedef struct HEVCRpiLocalContext { -+ TransformUnit tu; // Moved to start to match HEVCRpiLocalContextIntra (yuk!) -+ RpiNeighbourAvailable na; ++ TransformUnit tu; + + CABACContext cc; + @@ -32163,10 +33276,20 @@ index 0000000000..d242727b2a + int8_t curr_qp_y; + int8_t qPy_pred; + -+ uint8_t ctb_left_flag; -+ uint8_t ctb_up_flag; -+ uint8_t ctb_up_right_flag; -+ uint8_t ctb_up_left_flag; ++// N.B. 
Used by asm (neon) - do not change ++#define AVAIL_S_UR 0 ++#define AVAIL_S_U 1 ++#define AVAIL_S_UL 2 ++#define AVAIL_S_L 3 ++#define AVAIL_S_DL 4 ++ ++#define AVAIL_U (1 << AVAIL_S_U) ++#define AVAIL_L (1 << AVAIL_S_L) ++#define AVAIL_UL (1 << AVAIL_S_UL) ++#define AVAIL_UR (1 << AVAIL_S_UR) ++#define AVAIL_DL (1 << AVAIL_S_DL) ++ ++ uint8_t ctb_avail; + int end_of_ctb_x; + int end_of_ctb_y; + @@ -32206,6 +33329,7 @@ index 0000000000..d242727b2a + RPI_PRED_ADD_DC_U, // Both U & V are effectively C + RPI_PRED_ADD_DC_V, + RPI_PRED_INTRA, ++ RPI_PRED_INTRA_C, + RPI_PRED_I_PCM, + RPI_PRED_CMD_MAX +}; @@ -32213,8 +33337,8 @@ index 0000000000..d242727b2a +typedef struct HEVCPredCmd { + uint8_t type; + uint8_t size; // log2 "size" used by all variants -+ uint8_t na; // i_pred - but left here as they pack well -+ uint8_t c_idx; // i_pred ++ uint8_t avail; // i_pred - but left here as they pack well ++ uint8_t dummy; + union { + struct { // TRANSFORM_ADD + uint8_t * dst; @@ -32544,7 +33668,6 @@ index 0000000000..d242727b2a + + // Put structures that allocate non-trivial storage at the end + // These are mostly used indirectly so position in the structure doesn't matter -+ HEVCRpiLocalContextIntra HEVClcIntra; + HEVCRpiPassQueue passq[RPI_PASSES]; +#if RPI_EXTRA_BIT_THREADS > 0 + int bt_started; @@ -32597,13 +33720,15 @@ index 0000000000..d242727b2a + +void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCFrame *frame, int flags); + -+void ff_hevc_rpi_set_neighbour_available(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, -+ const int nPbW, const int nPbH); ++unsigned int ff_hevc_rpi_tb_avail_flags( ++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h); ++ +void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, + int nPbH, int log2_cb_size, int part_idx, + int merge_idx, MvField * const mv); +void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, int x0, int y0, int nPbW, -+ int nPbH, int log2_cb_size, int part_idx, ++ int nPbH, int log2_cb_size, const unsigned int avail, int part_idx, + int merge_idx, MvField * const mv, + int mvp_lx_flag, int LX); +void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase); @@ -32724,10 +33849,10 @@ index 0000000000..d242727b2a +#endif /* AVCODEC_RPI_HEVCDEC_H */ diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c new file mode 100644 -index 0000000000..c5d130c377 +index 0000000000..b041e0fd3f --- /dev/null +++ b/libavcodec/rpi_hevcdsp.c -@@ -0,0 +1,419 @@ +@@ -0,0 +1,444 @@ +/* + * HEVC video decoder + * @@ -32970,6 +34095,30 @@ index 0000000000..c5d130c377 + return bs >> shift; +} + ++ ++static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height) ++{ ++ unsigned int i, j; ++ ++ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) { ++ for (i = 0; i < height; i++) { ++ for (j = 0; j < width; j+=8) ++ AV_COPY64U(dst+j, src+j); ++ dst += stride_dst; ++ src += stride_src; ++ } ++ } else { ++ for (i = 0; i < height; i++) { ++ for (j = 0; j < width; j+=16) ++ AV_COPY128(dst+j, src+j); ++ dst += stride_dst; ++ src += stride_src; ++ } ++ } ++} ++ ++ ++ +void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +{ +#undef FUNC @@ -33137,6 +34286,7 
@@ index 0000000000..c5d130c377 + } + + hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths; ++ hevcdsp->cpy_blk = cpy_blk; + + if (ARCH_PPC) + ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth); @@ -33149,10 +34299,10 @@ index 0000000000..c5d130c377 +} diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h new file mode 100644 -index 0000000000..8c9bf725bf +index 0000000000..0b532f874b --- /dev/null +++ b/libavcodec/rpi_hevcdsp.h -@@ -0,0 +1,183 @@ +@@ -0,0 +1,185 @@ +/* + * HEVC video decoder + * @@ -33324,6 +34474,8 @@ index 0000000000..8c9bf725bf + uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const MvField *curr, const MvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, + int in_inc); ++ ++ void (* cpy_blk)(uint8_t * dst, unsigned int dst_stride, const uint8_t * src, unsigned int src_stride, unsigned int width, unsigned int height); +} HEVCDSPContext; + +void ff_hevc_rpi_dsp_init(HEVCDSPContext *hpc, int bit_depth); @@ -35622,10 +36774,10 @@ index 0000000000..d1196a4440 + diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c new file mode 100644 -index 0000000000..113ed33d64 +index 0000000000..62135b83c2 --- /dev/null +++ b/libavcodec/rpi_hevcpred.c -@@ -0,0 +1,150 @@ +@@ -0,0 +1,166 @@ +/* + * HEVC video Decoder + * @@ -35704,6 +36856,10 @@ index 0000000000..113ed33d64 + hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \ + hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \ + hpc->intra_pred[3] = FUNC(intra_pred_5, depth); \ ++ hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \ ++ hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \ ++ hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \ ++ hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \ + hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \ + hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \ + hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \ @@ -35723,13 +36879,21 @@ index 0000000000..113ed33d64 + hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \ + hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \ + hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ -+ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); ++ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \ ++ hpc->pred_dc0[0] = FUNC(pred_dc0_0, depth); \ ++ hpc->pred_dc0[1] = FUNC(pred_dc0_1, depth); \ ++ hpc->pred_dc0[2] = FUNC(pred_dc0_2, depth); \ ++ hpc->pred_dc0[3] = FUNC(pred_dc0_3, depth); + +#define HEVC_PRED_C(depth) \ + hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \ + hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \ + hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \ + hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \ ++ hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \ ++ hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \ ++ hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \ ++ hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \ + hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ + hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ + hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ @@ -35749,7 +36913,11 @@ index 0000000000..113ed33d64 + hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ + hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ + hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ -+ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); ++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \ ++ hpc->pred_dc0_c[0] = FUNCC(pred_dc0_0, depth); 
\ ++ hpc->pred_dc0_c[1] = FUNCC(pred_dc0_1, depth); \ ++ hpc->pred_dc0_c[2] = FUNCC(pred_dc0_2, depth); \ ++ hpc->pred_dc0_c[3] = FUNCC(pred_dc0_3, depth); + +#define HEVC_PRED(depth) \ + HEVC_PRED_Y(depth); \ @@ -35778,10 +36946,10 @@ index 0000000000..113ed33d64 +} diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h new file mode 100644 -index 0000000000..31d7d57d95 +index 0000000000..6e594277c0 --- /dev/null +++ b/libavcodec/rpi_hevcpred.h -@@ -0,0 +1,68 @@ +@@ -0,0 +1,121 @@ +/* + * HEVC video Decoder + * @@ -35814,9 +36982,58 @@ index 0000000000..31d7d57d95 +struct HEVCRpiContext; +struct HEVCRpiLocalContext; + -+typedef struct HEVCRpiPredContext { -+ void (*intra_pred[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx); ++enum IntraPredMode { ++ INTRA_PLANAR = 0, ++ INTRA_DC, ++ INTRA_ANGULAR_2, ++ INTRA_ANGULAR_3, ++ INTRA_ANGULAR_4, ++ INTRA_ANGULAR_5, ++ INTRA_ANGULAR_6, ++ INTRA_ANGULAR_7, ++ INTRA_ANGULAR_8, ++ INTRA_ANGULAR_9, ++ INTRA_ANGULAR_10, ++ INTRA_ANGULAR_11, ++ INTRA_ANGULAR_12, ++ INTRA_ANGULAR_13, ++ INTRA_ANGULAR_14, ++ INTRA_ANGULAR_15, ++ INTRA_ANGULAR_16, ++ INTRA_ANGULAR_17, ++ INTRA_ANGULAR_18, ++ INTRA_ANGULAR_19, ++ INTRA_ANGULAR_20, ++ INTRA_ANGULAR_21, ++ INTRA_ANGULAR_22, ++ INTRA_ANGULAR_23, ++ INTRA_ANGULAR_24, ++ INTRA_ANGULAR_25, ++ INTRA_ANGULAR_26, ++ INTRA_ANGULAR_27, ++ INTRA_ANGULAR_28, ++ INTRA_ANGULAR_29, ++ INTRA_ANGULAR_30, ++ INTRA_ANGULAR_31, ++ INTRA_ANGULAR_32, ++ INTRA_ANGULAR_33, ++ INTRA_ANGULAR_34, ++}; ++#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 ++#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26 + ++typedef void intra_filter_fn_t( ++ uint8_t * const left, uint8_t * const top, ++ const unsigned int req, const unsigned int avail, ++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, ++ const unsigned int stride, ++ const unsigned int top_right_size, const unsigned int down_left_size); ++ ++typedef struct HEVCRpiPredContext { ++ void (*intra_pred[4])(const struct HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail); ++ ++ intra_filter_fn_t *intra_filter[4]; + void (*pred_planar[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); + void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, @@ -35830,8 +37047,11 @@ index 0000000000..31d7d57d95 + void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int mode); -+ void (*intra_pred_c[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx); ++ void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride); + ++ void (*intra_pred_c[4])(const struct HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail); ++ intra_filter_fn_t *intra_filter_c[4]; + void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); + void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, @@ -35845,6 +37065,7 @@ index 0000000000..31d7d57d95 + void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int mode); ++ void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride); +} HEVCRpiPredContext; + +void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth); @@ -35852,10 +37073,10 @@ index 0000000000..31d7d57d95 
+#endif /* AVCODEC_RPI_HEVCPRED_H */ diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c new file mode 100644 -index 0000000000..a76ba4c442 +index 0000000000..23835a320e --- /dev/null +++ b/libavcodec/rpi_hevcpred_template.c -@@ -0,0 +1,983 @@ +@@ -0,0 +1,1487 @@ +/* + * HEVC video decoder + * @@ -35967,7 +37188,7 @@ index 0000000000..a76ba4c442 +#endif + + -+#if DUMP_PRED && !defined(INCLUDE_ONCE) ++#if DUMP_PRED && !defined(INCLUDED_ONCE) +static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) +{ + for (unsigned int y = 0; y != size; y++, data += stride * 2) { @@ -35980,105 +37201,705 @@ index 0000000000..a76ba4c442 +} +#endif + -+static av_always_inline void FUNC(intra_pred)(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, -+ int log2_size, int c_idx_arg) ++#ifndef INCLUDED_ONCE ++static inline void extend_8(void * ptr, const unsigned int v, unsigned int n) +{ ++ if ((n >>= 2) != 0) { ++ uint32_t v4 = v | (v << 8); ++ uint32_t * p = (uint32_t *)ptr; ++ v4 = v4 | (v4 << 16); ++ do { ++ *p++ = v4; ++ } while (--n != 0); ++ } ++} ++ ++static inline void extend_16(void * ptr, const unsigned int v, unsigned int n) ++{ ++ if ((n >>= 2) != 0) { ++ uint32_t v2 = v | (v << 16); ++ uint32_t * p = (uint32_t *)ptr; ++ do { ++ *p++ = v2; ++ *p++ = v2; ++ } while (--n != 0); ++ } ++} ++ ++static inline void extend_32(void * ptr, const unsigned int v, unsigned int n) ++{ ++ if ((n >>= 2) != 0) { ++ uint32_t * p = (uint32_t *)ptr; ++ do { ++ *p++ = v; ++ *p++ = v; ++ *p++ = v; ++ *p++ = v; ++ } while (--n != 0); ++ } ++} ++ ++// Beware that this inverts the avail ordering ++// For CIP it seems easier this way round ++static unsigned int cip_avail(const MvField * mvf, const int mvf_stride, const unsigned int log2_pu_size, const unsigned int avail, unsigned int size, ++ unsigned int s0, unsigned int s1) ++{ ++ const unsigned int n = 1 << (log2_pu_size - 2); ++ unsigned int fa = 0; ++ unsigned int i = 0; ++ ++ size >>= 2; // Now in 4-pel units ++ s0 >>= 2; ++ s1 >>= 2; ++ ++ if ((avail & 4) != 0) ++ fa |= ((1 << s0) - 1) << (size - s0); ++ if ((avail & 2) != 0) ++ fa |= ((1 << s1) - 1) << size; ++ if ((avail & 1) != 0) ++ fa |= 1 << (size << 1); ++ ++ for (i = 0; (fa >> i) != 0; i += n, mvf += mvf_stride) { ++ if ((fa & (((1 << n) - 1) << i)) != 0 && mvf->pred_flag != PF_INTRA) ++ fa &= ~(((1 << n) - 1) << i); ++ } ++ ++ return fa; ++} ++ ++static inline unsigned int rmbd(unsigned int x) ++{ ++#if 1 ++ return __builtin_ctz(x); ++#else ++ unsigned int n = 0; ++ if ((x & 0xffff) == 0) { ++ x >>= 16; ++ n += 16; ++ } ++ if ((x & 0xff) == 0) { ++ x >>= 8; ++ n += 8; ++ } ++ if ((x & 0xf) == 0) { ++ x >>= 4; ++ n += 4; ++ } ++ if ((x & 0x3) == 0) { ++ x >>= 2; ++ n += 2; ++ } ++ ++ return (x & 1) == 0 ? n + 1 : n; ++#endif ++} ++#endif ++ ++ ++static void FUNC(cip_fill)(pixel * const left, pixel * const top, ++ const unsigned int avail_l, const unsigned int avail_u, ++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, ++ const unsigned int stride, ++ const unsigned int size) ++{ ++ pixel a; ++ unsigned int i; ++ ++ // 1st find DL value ++ if ((avail_l & 1) == 0) { ++ if (avail_l != 0) ++ a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride]; ++ else ++ { ++ // (avail_l | avail_u) != 0 so this must be good ++ const unsigned int n = rmbd(avail_u)*4; ++ a = (n >= size) ? 
src_ur[n - size] : src_u[n]; ++ } ++ } ++ ++ // L ++ { ++ pixel * d = left + size * 2 - 1; ++ const pixel * s = src_l + (size * 2 - 1) * stride; ++ unsigned int x = avail_l; ++ for (i = 0; i < size * 2; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d-- = *s; ++ s -= stride; ++ *d-- = *s; ++ s -= stride; ++ *d-- = *s; ++ s -= stride; ++ *d-- = a = *s; ++ s -= stride; ++ } ++ else ++ { ++ *d-- = a; ++ *d-- = a; ++ *d-- = a; ++ *d-- = a; ++ s -= stride * 4; ++ } ++ } ++ // UL ++ *d = a = (x & 1) != 0 ? *s : a; ++ } ++ ++ // U ++ { ++ pixel * d = top; ++ const pixel * s = src_u; ++ unsigned int x = avail_u; ++ ++ for (i = 0; i < size; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = a = *s++; ++ } ++ else ++ { ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ s += 4; ++ } ++ } ++ ++ // UR ++ s = src_ur; ++ for (i = 0; i < size; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = a = *s++; ++ } ++ else ++ { ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ s += 4; ++ } ++ } ++ } ++} ++ ++ ++#if !PRED_C && PW == 1 ++#define EXTEND(ptr, val, len) extend_8(ptr, val, len) ++#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1) ++#define EXTEND(ptr, val, len) extend_16(ptr, val, len) ++#else ++#define EXTEND(ptr, val, len) extend_32(ptr, val, len) ++#endif ++ ++ +#define PU(x) \ + ((x) >> s->ps.sps->log2_min_pu_size) +#define MVF(x, y) \ -+ (s->ref->tab_mvf[(x) + (y) * min_pu_width]) ++ (s->ref->tab_mvf[(x) + (y) * s->ps.sps->min_pu_width]) +#define MVF_PU(x, y) \ + MVF(PU(x0 + ((x) * (1 << hshift))), PU(y0 + ((y) * (1 << vshift)))) -+#define IS_INTRA(x, y) \ -+ (MVF_PU(x, y).pred_flag == PF_INTRA) -+#define MIN_TB_ADDR_ZS(x, y) \ -+ s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)] -+#define EXTEND(ptr, val, len) \ -+do { \ -+ pixel4 pix = PIXEL_SPLAT_X4(val); \ -+ for (i = 0; i < (len); i += 4) \ -+ AV_WN4P(ptr + i, pix); \ -+} while (0) + -+#define EXTEND_RIGHT_CIP(ptr, start, length) \ -+ for (i = start; i < (start) + (length); i += 4) \ -+ if (!IS_INTRA(i, -1)) \ -+ AV_WN4P(&ptr[i], a); \ -+ else \ -+ a = PIXEL_SPLAT_X4(ptr[i+3]) -+#define EXTEND_LEFT_CIP(ptr, start, length) \ -+ for (i = start; i > (start) - (length); i--) \ -+ if (!IS_INTRA(i - 1, -1)) \ -+ ptr[i - 1] = ptr[i] -+#define EXTEND_UP_CIP(ptr, start, length) \ -+ for (i = (start); i > (start) - (length); i -= 4) \ -+ if (!IS_INTRA(-1, i - 3)) \ -+ AV_WN4P(&ptr[i - 3], a); \ -+ else \ -+ a = PIXEL_SPLAT_X4(ptr[i - 3]) -+#define EXTEND_DOWN_CIP(ptr, start, length) \ -+ for (i = start; i < (start) + (length); i += 4) \ -+ if (!IS_INTRA(-1, i)) \ -+ AV_WN4P(&ptr[i], a); \ -+ else \ -+ a = PIXEL_SPLAT_X4(ptr[i + 3]) ++// Reqs: ++// ++// Planar: DL[0], L, ul, U, UR[0] ++// DC: dl, L, ul, U, ur ++// A2-9: DL, L, ul, u, ur ++// A10: dl, L, ul, u, ur ++// A11-17 dl, L, UL, U, ur ++// A18-25 dl, L, Ul, U, ur ++// A26 dl, l, ul, U, ur ++// A27-34 dl, l, ul, U, UR ++ ++#ifndef INCLUDED_ONCE ++ ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; ++ ++#define FILTER_LIGHT 0x40 ++#define FILTER_STRONG 0x80 ++#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG) ++ ++static const uint8_t req_avail_c[35] = ++{ ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L, // 2 ++ AVAIL_DL | AVAIL_L, 
// 3 ++ AVAIL_DL | AVAIL_L, // 4 ++ AVAIL_DL | AVAIL_L, // 5 ++ AVAIL_DL | AVAIL_L, // 6 ++ AVAIL_DL | AVAIL_L, // 7 ++ AVAIL_DL | AVAIL_L, // 8 ++ AVAIL_DL | AVAIL_L, // 9 ++ AVAIL_L, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 ++ AVAIL_U, // 26 (V) ++ AVAIL_U | AVAIL_UR, // 27 ++ AVAIL_U | AVAIL_UR, // 28 ++ AVAIL_U | AVAIL_UR, // 29 ++ AVAIL_U | AVAIL_UR, // 30 ++ AVAIL_U | AVAIL_UR, // 31 ++ AVAIL_U | AVAIL_UR, // 32 ++ AVAIL_U | AVAIL_UR, // 33 ++ AVAIL_U | AVAIL_UR // 34 ++}; ++ ++static const uint8_t req_avail[4][35] = { ++{ ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L, // 2 ++ AVAIL_DL | AVAIL_L, // 3 ++ AVAIL_DL | AVAIL_L, // 4 ++ AVAIL_DL | AVAIL_L, // 5 ++ AVAIL_DL | AVAIL_L, // 6 ++ AVAIL_DL | AVAIL_L, // 7 ++ AVAIL_DL | AVAIL_L, // 8 ++ AVAIL_DL | AVAIL_L, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V) ++ AVAIL_U | AVAIL_UR, // 27 ++ AVAIL_U | AVAIL_UR, // 28 ++ AVAIL_U | AVAIL_UR, // 29 ++ AVAIL_U | AVAIL_UR, // 30 ++ AVAIL_U | AVAIL_UR, // 31 ++ AVAIL_U | AVAIL_UR, // 32 ++ AVAIL_U | AVAIL_UR, // 33 ++ AVAIL_U | AVAIL_UR // 34 ++}, ++{ // 3 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 ++ AVAIL_DL | AVAIL_L | 0, // 3 ++ AVAIL_DL | AVAIL_L | 0, // 4 ++ AVAIL_DL | AVAIL_L | 0, // 5 ++ AVAIL_DL | AVAIL_L | 0, // 6 ++ AVAIL_DL | AVAIL_L | 0, // 7 ++ AVAIL_DL | AVAIL_L | 0, // 8 ++ AVAIL_DL | AVAIL_L | 0, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | 0, // 27 ++ AVAIL_U | AVAIL_UR | 
0, // 28 ++ AVAIL_U | AVAIL_UR | 0, // 29 ++ AVAIL_U | AVAIL_UR | 0, // 30 ++ AVAIL_U | AVAIL_UR | 0, // 31 ++ AVAIL_U | AVAIL_UR | 0, // 32 ++ AVAIL_U | AVAIL_UR | 0, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 ++}, ++{ // 4 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8 ++ AVAIL_DL | AVAIL_L | 0, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | 0, // 27 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 ++}, ++{ // 5 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9 ++ AVAIL_L | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25 ++ AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31 ++ AVAIL_U | AVAIL_UR | 
FILTER_EITHER, // 32 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER // 34 ++} ++}; ++ ++ ++#endif ++ ++#define filter_light1 FUNC(filter_light1) ++static inline pixel filter_light1(pixel a, pixel b, pixel c) ++{ ++ return (a + b*2 + c + 2) >> 2; ++} ++ ++#define filter_light FUNC(filter_light) ++static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n) ++{ ++ pixel p0; ++ pixel p2 = *src; ++ // Allow for final pel - it is just clearer to to have the call take the actual number of output pels ++ unsigned int n_minus_1 = n - 1; ++ ++ do ++ { ++ src += sstride; ++ p0 = p1; ++ p1 = p2; ++ p2 = *src; ++ *dst++ = filter_light1(p0, p1, p2); ++ } while (--n_minus_1 != 0); ++ *dst = filter_light1(p1, p2, pn); ++} ++ ++#define filter_strong FUNC(filter_strong) ++static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n) ++{ ++ unsigned int a = 64 * p0 + 32; ++ const int v = p1 - p0; ++ ++ do ++ { ++ *dst++ = (a += v) >> 6; ++ } while (--n != 0); ++} ++ ++#define intra_filter FUNC(intra_filter) ++static av_always_inline void intra_filter( ++ pixel * const left, pixel * const top, ++ const unsigned int req, const unsigned int avail, ++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, ++ const unsigned int stride, ++ const unsigned int top_right_size, const unsigned int down_left_size, ++ const unsigned int log2_size) ++{ ++ const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5); ++ const unsigned int size = 1 << log2_size; ++ ++ // a_ is the first pel in a section working round dl -> ur ++ // b_ is the last ++ // Beware that top & left work out from UL so usage of a_ & b_ may ++ // swap between them. 
It is a bad naming scheme but I have found no ++ // better ++ const pixel * a_dl = src_l + (down_left_size + size - 1) * stride; ++ const pixel * b_dl = src_l + size * stride; ++ const pixel * a_l = src_l + (size - 1) * stride; ++ const pixel * b_l = src_l; ++ const pixel * ab_ul = src_l - stride; ++ const pixel * a_u = src_u; ++ const pixel * b_u = src_u + size - 1; ++ const pixel * a_ur = src_ur; ++ const pixel * b_ur = src_ur + top_right_size - 1; ++ ++ const unsigned int want = req & ~avail; ++ const unsigned int have = req & avail; ++ unsigned int i; ++ ++ if ((avail & AVAIL_DL) == 0) ++ { ++ a_dl = a_ur; ++ if ((avail & AVAIL_U) != 0) ++ a_dl = a_u; ++ if ((avail & AVAIL_UL) != 0) ++ a_dl = ab_ul; ++ if ((avail & AVAIL_L) != 0) ++ a_dl = a_l; ++ b_dl = a_dl; ++ } ++ ++ if ((avail & AVAIL_L) == 0) ++ { ++ a_l = b_dl; ++ b_l = b_dl; ++ } ++ if ((avail & AVAIL_UL) == 0) ++ { ++ ab_ul = b_l; ++ } ++ if ((avail & AVAIL_U) == 0) ++ { ++ a_u = ab_ul; ++ b_u = ab_ul; ++ } ++ if ((avail & AVAIL_UR) == 0) ++ { ++ a_ur = b_u; ++ b_ur = b_u; ++ } ++ ++ if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints ++ { ++ if ((req & AVAIL_UL) != 0) ++ left[-1] = *ab_ul; ++ ++ if ((want & AVAIL_L) != 0) ++ EXTEND(left, *a_l, size); ++ if ((want & AVAIL_DL) != 0) ++ EXTEND(left + size, *a_dl, size); ++ if ((want & AVAIL_U) != 0) ++ EXTEND(top, *a_u, size); ++ if ((want & AVAIL_UR) != 0) ++ EXTEND(top + size, *a_ur, size); ++ ++ if ((have & AVAIL_U) != 0) ++ // Always good - even with sand ++ memcpy(top, a_u, size * sizeof(pixel)); ++ if ((have & AVAIL_UR) != 0) ++ { ++ memcpy(top + size, a_ur, top_right_size * sizeof(pixel)); ++ EXTEND(top + size + top_right_size, *b_ur, ++ size - top_right_size); ++ } ++ if ((have & AVAIL_L) != 0) ++ { ++ for (i = 0; i < size; i++) ++ left[i] = b_l[stride * i]; ++ } ++ if ((have & AVAIL_DL) != 0) ++ { ++ for (i = 0; i < down_left_size; i++) ++ left[i + size] = b_dl[stride * i]; ++ EXTEND(left + size + down_left_size, *a_dl, ++ size - down_left_size); ++ } ++ } ++ else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint ++ FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold && ++ FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold) ++ { ++ if ((req & (AVAIL_U | AVAIL_UR)) != 0) ++ filter_strong(top, *ab_ul, *b_ur, size * 2); ++ left[-1] = *ab_ul; ++ if ((req & (AVAIL_L | AVAIL_DL)) != 0) ++ filter_strong(left, *ab_ul, *a_dl, size*2); ++ } ++ else ++ { ++ // Same code for both have & want for UL ++ if ((req & AVAIL_UL) != 0) ++ { ++ left[-1] = filter_light1(*b_l, *ab_ul, *a_u); ++ } ++ ++ if ((want & AVAIL_L) != 0) ++ { ++ EXTEND(left, *a_l, size); ++ left[0] = (*a_l * 3 + *ab_ul + 2) >> 2; ++ } ++ if ((want & AVAIL_DL) != 0) ++ { ++ // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding ++ EXTEND(left + size, *a_l, size); ++ } ++ if ((want & AVAIL_U) != 0) ++ { ++ EXTEND(top, *a_u, size); ++ top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2; ++ } ++ if ((want & AVAIL_UR) != 0) ++ { ++ // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding ++ EXTEND(top + size, *a_ur, size); ++ } ++ ++ if ((have & AVAIL_U) != 0) ++ { ++ filter_light(top, *ab_ul, a_u, *a_ur, 1, size); ++ } ++ if ((have & AVAIL_UR) != 0) { ++ filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size); ++ top[size*2 - 1] = *b_ur; ++ EXTEND(top + size + top_right_size, *b_ur, size - top_right_size); ++ } ++ if ((have & AVAIL_L) != 0) ++ { ++ filter_light(left, *ab_ul, b_l, *b_dl, 
stride, size); ++ } ++ if ((have & AVAIL_DL) != 0) ++ { ++ filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size); ++ left[size*2 - 1] = *a_dl; ++ EXTEND(left + size + down_left_size, *a_dl, size - down_left_size); ++ } ++ } ++} ++ ++#define INTRA_FILTER(log2_size) \ ++static void FUNC(intra_filter_ ## log2_size)( \ ++ uint8_t * const left, uint8_t * const top, \ ++ const unsigned int req, const unsigned int avail, \ ++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \ ++ const unsigned int stride, \ ++ const unsigned int top_right_size, const unsigned int down_left_size) \ ++{ \ ++ intra_filter((pixel *)left, (pixel *)top, req, avail, \ ++ (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \ ++} ++ ++INTRA_FILTER(2) ++INTRA_FILTER(3) ++INTRA_FILTER(4) ++INTRA_FILTER(5) ++ ++#undef intra_filter ++#undef INTRA_FILTER ++ ++static av_always_inline void FUNC(intra_pred)(const HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail, ++ const unsigned int log2_size) ++{ + // c_idx will alaways be 1 for _c versions and 0 for y + const unsigned int c_idx = PRED_C; -+ int i; + const unsigned int hshift = ctx_hshift(s, c_idx); + const unsigned int vshift = ctx_vshift(s, c_idx); -+ int size = (1 << log2_size); -+ int size_in_luma_h = size << hshift; -+ int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; -+ int size_in_luma_v = size << vshift; -+ int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; -+ const int x = x0 >> hshift; -+ const int y = y0 >> vshift; -+ int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; -+ int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; -+ -+ int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb); ++ const unsigned int size = (1 << log2_size); ++ const unsigned int x = x0 >> hshift; ++ const unsigned int y = y0 >> vshift; + + const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel); + pixel *const src = c_idx == 0 ? + (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : + (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); + -+ int min_pu_width = s->ps.sps->min_pu_width; -+ -+ const enum IntraPredMode mode = c_idx ? 
lc->tu.intra_pred_mode_c : -+ lc->tu.intra_pred_mode; -+ pixel4 a; -+ + // Align so we can do multiple loads in the asm + // Padded to 16 byte boundary so as not to confuse anything + DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); + DECLARE_ALIGNED(16, pixel, top_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); ++ ++ pixel * const left = left_array + 16 / sizeof(pixel); ++ pixel * const top = top_array + 16 / sizeof(pixel); ++ const pixel * top_pred = top; ++ ++ const pixel * src_l = src - 1; ++ const pixel * src_u = src - stride; ++ const pixel * src_ur = src_u + size; +#if !PRED_C -+ DECLARE_ALIGNED(16, pixel, filtered_left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); -+ DECLARE_ALIGNED(16, pixel, filtered_top_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); ++ unsigned int req = req_avail[log2_size - 2][mode]; ++#else ++ unsigned int req = req_avail_c[mode]; +#endif + -+ pixel *left = left_array + 16 / sizeof(pixel); -+ pixel *top = top_array + 16 / sizeof(pixel); ++ // If we have nothing to pred from then fill with grey ++ // This isn't a common case but dealing with it here means we don't have to ++ // test for it later ++ if (avail == 0) ++ { ++dc_only: +#if !PRED_C -+ pixel *filtered_left = filtered_left_array + 16 / sizeof(pixel); -+ pixel *filtered_top = filtered_top_array + 16 / sizeof(pixel); ++ s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride); ++#else ++ s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride); +#endif -+ int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); -+ int cand_left = lc->na.cand_left; -+ int cand_up_left = lc->na.cand_up_left; -+ int cand_up = lc->na.cand_up; -+ int cand_up_right = lc->na.cand_up_right && cur_tb_addr > MIN_TB_ADDR_ZS((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask, y_tb - 1); ++ return; ++ } + -+ int bottom_left_size = (FFMIN(y0 + 2 * size_in_luma_v, s->ps.sps->height) - -+ (y0 + size_in_luma_v)) >> vshift; -+ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - -+ (x0 + size_in_luma_h)) >> hshift; -+ -+ pixel * src_l = src - 1; -+ pixel * src_u = src - stride; -+ pixel * src_ur = src_u + size; ++ // There will be no filtering on C so no point worrying about disabling it ++#if !PRED_C ++ if (s->ps.sps->intra_smoothing_disabled_flag) ++ req &= ~FILTER_EITHER; ++ if (!s->ps.sps->sps_strong_intra_smoothing_enable_flag) ++ req &= ~FILTER_STRONG; ++#endif + + { + // N.B. 
stride is in pixels (not bytes) or in the case of chroma pixel-pairs @@ -36091,248 +37912,96 @@ index 0000000000..a76ba4c442 + src_ur += stripe_adj; + } + -+ if (s->ps.pps->constrained_intra_pred_flag == 1) { -+ int size_in_luma_pu_v = PU(size_in_luma_v); -+ int size_in_luma_pu_h = PU(size_in_luma_h); -+ int on_pu_edge_x = !av_mod_uintp2(x0, s->ps.sps->log2_min_pu_size); -+ int on_pu_edge_y = !av_mod_uintp2(y0, s->ps.sps->log2_min_pu_size); -+ if (!size_in_luma_pu_h) -+ size_in_luma_pu_h++; -+ if (cand_bottom_left == 1 && on_pu_edge_x) { -+ int x_left_pu = PU(x0 - 1); -+ int y_bottom_pu = PU(y0 + size_in_luma_v); -+ int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_bottom_pu); -+ cand_bottom_left = 0; -+ for (i = 0; i < max; i += 2) -+ cand_bottom_left |= (MVF(x_left_pu, y_bottom_pu + i).pred_flag == PF_INTRA); -+ } -+ if (cand_left == 1 && on_pu_edge_x) { -+ int x_left_pu = PU(x0 - 1); -+ int y_left_pu = PU(y0); -+ int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_left_pu); -+ cand_left = 0; -+ for (i = 0; i < max; i += 2) -+ cand_left |= (MVF(x_left_pu, y_left_pu + i).pred_flag == PF_INTRA); -+ } -+ if (cand_up_left == 1) { -+ int x_left_pu = PU(x0 - 1); -+ int y_top_pu = PU(y0 - 1); -+ cand_up_left = MVF(x_left_pu, y_top_pu).pred_flag == PF_INTRA; -+ } -+ if (cand_up == 1 && on_pu_edge_y) { -+ int x_top_pu = PU(x0); -+ int y_top_pu = PU(y0 - 1); -+ int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_top_pu); -+ cand_up = 0; -+ for (i = 0; i < max; i += 2) -+ cand_up |= (MVF(x_top_pu + i, y_top_pu).pred_flag == PF_INTRA); -+ } -+ if (cand_up_right == 1 && on_pu_edge_y) { -+ int y_top_pu = PU(y0 - 1); -+ int x_right_pu = PU(x0 + size_in_luma_h); -+ int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_right_pu); -+ cand_up_right = 0; -+ for (i = 0; i < max; i += 2) -+ cand_up_right |= (MVF(x_right_pu + i, y_top_pu).pred_flag == PF_INTRA); -+ } -+ memset(left, 128, 2 * MAX_TB_SIZE*sizeof(pixel)); -+ memset(top , 128, 2 * MAX_TB_SIZE*sizeof(pixel)); -+ top[-1] = 128; -+ } -+ if (cand_up_left) { -+ left[-1] = src_l[-stride]; -+ top[-1] = left[-1]; -+ } -+ if (cand_up) -+ // Always good - even with sand -+ memcpy(top, src_u, size * sizeof(pixel)); -+ if (cand_up_right) { -+ memcpy(top + size, src_ur, top_right_size * sizeof(pixel)); -+ EXTEND(top + size + top_right_size, top[size + top_right_size - 1], -+ size - top_right_size); -+ } -+ if (cand_left) -+ for (i = 0; i < size; i++) -+ left[i] = src_l[stride * i]; -+ if (cand_bottom_left) { -+ for (i = size; i < size + bottom_left_size; i++) -+ left[i] = src_l[stride * i]; -+ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1], -+ size - bottom_left_size); -+ } ++ if (s->ps.pps->constrained_intra_pred_flag == 1 && ++ s->sh.slice_type != HEVC_SLICE_I) // Can deal with I-slices in 'normal' code ++ { ++ const unsigned int l2_pu_s = FFMAX(s->ps.sps->log2_min_pu_size - hshift, 2); ++ const unsigned int l2_pu_stride_s = l2_pu_s - (s->ps.sps->log2_min_pu_size - hshift); + -+ if (s->ps.pps->constrained_intra_pred_flag == 1) { -+ if (cand_bottom_left || cand_left || cand_up_left || cand_up || cand_up_right) { -+ int size_max_x = x0 + ((2 * size) << hshift) < s->ps.sps->width ? -+ 2 * size : (s->ps.sps->width - x0) >> hshift; -+ int size_max_y = y0 + ((2 * size) << vshift) < s->ps.sps->height ? -+ 2 * size : (s->ps.sps->height - y0) >> vshift; -+ int j = size + (cand_bottom_left? 
bottom_left_size: 0) -1; -+ if (!cand_up_right) { -+ size_max_x = x0 + ((size) << hshift) < s->ps.sps->width ? -+ size : (s->ps.sps->width - x0) >> hshift; -+ } -+ if (!cand_bottom_left) { -+ size_max_y = y0 + (( size) << vshift) < s->ps.sps->height ? -+ size : (s->ps.sps->height - y0) >> vshift; -+ } -+ if (cand_bottom_left || cand_left || cand_up_left) { -+ while (j > -1 && !IS_INTRA(-1, j)) -+ j--; -+ if (!IS_INTRA(-1, j)) { -+ j = 0; -+ while (j < size_max_x && !IS_INTRA(j, -1)) -+ j++; -+ EXTEND_LEFT_CIP(top, j, j + 1); -+ left[-1] = top[-1]; -+ } -+ } else { -+ j = 0; -+ while (j < size_max_x && !IS_INTRA(j, -1)) -+ j++; -+ if (j > 0) -+ if (x0 > 0) { -+ EXTEND_LEFT_CIP(top, j, j + 1); -+ } else { -+ EXTEND_LEFT_CIP(top, j, j); -+ top[-1] = top[0]; -+ } -+ left[-1] = top[-1]; -+ } -+ left[-1] = top[-1]; -+ if (cand_bottom_left || cand_left) { -+ a = PIXEL_SPLAT_X4(left[-1]); -+ EXTEND_DOWN_CIP(left, 0, size_max_y); -+ } -+ if (!cand_left) -+ EXTEND(left, left[-1], size); -+ if (!cand_bottom_left) -+ EXTEND(left + size, left[size - 1], size); -+ if (x0 != 0 && y0 != 0) { -+ a = PIXEL_SPLAT_X4(left[size_max_y - 1]); -+ EXTEND_UP_CIP(left, size_max_y - 1, size_max_y); -+ if (!IS_INTRA(-1, - 1)) -+ left[-1] = left[0]; -+ } else if (x0 == 0) { -+ EXTEND(left, 0, size_max_y); -+ } else { -+ a = PIXEL_SPLAT_X4(left[size_max_y - 1]); -+ EXTEND_UP_CIP(left, size_max_y - 1, size_max_y); -+ } -+ top[-1] = left[-1]; -+ if (y0 != 0) { -+ a = PIXEL_SPLAT_X4(left[-1]); -+ EXTEND_RIGHT_CIP(top, 0, size_max_x); -+ } -+ } -+ } -+ // Infer the unavailable samples -+ if (!cand_bottom_left) { -+ if (cand_left) { -+ EXTEND(left + size, left[size - 1], size); -+ } else if (cand_up_left) { -+ EXTEND(left, left[-1], 2 * size); -+ cand_left = 1; -+ } else if (cand_up) { -+ left[-1] = top[0]; -+ EXTEND(left, left[-1], 2 * size); -+ cand_up_left = 1; -+ cand_left = 1; -+ } else if (cand_up_right) { -+ EXTEND(top, top[size], size); -+ left[-1] = top[size]; -+ EXTEND(left, left[-1], 2 * size); -+ cand_up = 1; -+ cand_up_left = 1; -+ cand_left = 1; -+ } else { // No samples available -+#if PRED_C -+ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8)); -+#else -+ left[-1] = (1 << (BIT_DEPTH - 1)); -+#endif -+ EXTEND(top, left[-1], 2 * size); -+ EXTEND(left, left[-1], 2 * size); -+ } -+ } ++ unsigned int avail_l = cip_avail(&MVF_PU(-1, size * 2 - 1), ++ -(int)(s->ps.sps->min_pu_width << l2_pu_stride_s), ++ l2_pu_s, ++ avail >> AVAIL_S_UL, ++ size, ++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size), size); ++ unsigned int avail_u = cip_avail(&MVF_PU(0, -1), ++ 1 << l2_pu_stride_s, ++ l2_pu_s, ++ avail << 1, ++ size, ++ size, FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size)); + -+ if (!cand_left) -+ EXTEND(left, left[size], size); -+ if (!cand_up_left) { -+ left[-1] = left[0]; -+ } -+ if (!cand_up) -+ EXTEND(top, left[-1], size); -+ if (!cand_up_right) -+ EXTEND(top + size, top[size - 1], size); ++ // Anything left? 
++ if ((avail_l | avail_u) == 0) ++ goto dc_only; + -+ top[-1] = left[-1]; ++ FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size); + -+ // Filtering process -+ // Sand can only apply to chroma_format_idc == 1 so we don't need to -+ // worry about chroma smoothing for that case +#if !PRED_C -+ if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || ctx_cfmt(s) == 3)) { -+ if (mode != INTRA_DC && size != 4){ -+ int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; -+ int min_dist_vert_hor = FFMIN(FFABS((int)(mode - 26U)), -+ FFABS((int)(mode - 10U))); -+ if (min_dist_vert_hor > intra_hor_ver_dist_thresh[log2_size - 3]) { -+ int threshold = 1 << (BIT_DEPTH - 5); -+ if (s->ps.sps->sps_strong_intra_smoothing_enable_flag && c_idx == 0 && -+ log2_size == 5 && -+ FFABS(top[-1] + top[63] - 2 * top[31]) < threshold && -+ FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) { -+ // We can't just overwrite values in top because it could be -+ // a pointer into src -+ filtered_top[-1] = top[-1]; -+ filtered_top[63] = top[63]; -+ for (i = 0; i < 63; i++) -+ filtered_top[i] = ((64 - (i + 1)) * top[-1] + -+ (i + 1) * top[63] + 32) >> 6; -+ for (i = 0; i < 63; i++) -+ left[i] = ((64 - (i + 1)) * left[-1] + -+ (i + 1) * left[63] + 32) >> 6; -+ top = filtered_top; -+ } else { -+ filtered_left[2 * size - 1] = left[2 * size - 1]; -+ filtered_top[2 * size - 1] = top[2 * size - 1]; -+ for (i = 2 * size - 2; i >= 0; i--) -+ filtered_left[i] = (left[i + 1] + 2 * left[i] + -+ left[i - 1] + 2) >> 2; -+ filtered_top[-1] = -+ filtered_left[-1] = (left[0] + 2 * left[-1] + top[0] + 2) >> 2; -+ for (i = 2 * size - 2; i >= 0; i--) -+ filtered_top[i] = (top[i + 1] + 2 * top[i] + -+ top[i - 1] + 2) >> 2; -+ left = filtered_left; -+ top = filtered_top; -+ } ++ if ((req & FILTER_LIGHT) != 0) ++ { ++ const unsigned threshold = 1 << (BIT_DEPTH - 5); ++ if ((req & FILTER_STRONG) != 0 && ++ (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold && ++ (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold) ++ { ++ filter_strong(top, left[-1], top[63], 64); ++ filter_strong(left, left[-1], left[63], 64); ++ } else ++ { ++ // LHS writes UL too so copy for top ++ const pixel p_ul = left[-1]; ++ filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size); ++ filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1); + } + } ++#endif ++ } ++ else ++ { ++ const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size); ++ if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 && ++ ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size)) ++ { ++ top_pred = src_u; ++ } ++ else ++ { ++#if !PRED_C ++ s->hpc.intra_filter[log2_size - 2] ++#else ++ s->hpc.intra_filter_c[log2_size - 2] ++#endif ++ ((uint8_t *)left, (uint8_t *)top, req, avail, ++ (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel), ++ ur_size, ++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size)); ++ } + } + ++ ++#if !PRED_C + switch (mode) { + case INTRA_PLANAR: -+ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride); + break; + case INTRA_DC: -+ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride); + break; + case INTRA_ANGULAR_HORIZONTAL: -+ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ 
s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + case INTRA_ANGULAR_VERTICAL: -+ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + default: -+ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; @@ -36340,25 +38009,25 @@ index 0000000000..a76ba4c442 +#else + switch (mode) { + case INTRA_PLANAR: -+ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride); + break; + case INTRA_DC: -+ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride); + break; + case INTRA_ANGULAR_HORIZONTAL: -+ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + case INTRA_ANGULAR_VERTICAL: -+ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + default: -+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; @@ -36373,10 +38042,11 @@ index 0000000000..a76ba4c442 +#endif +} + -+#define INTRA_PRED(size) \ -+static void FUNC(intra_pred_ ## size)(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx) \ -+{ \ -+ FUNC(intra_pred)(s, lc, x0, y0, size, c_idx); \ ++#define INTRA_PRED(log2_size) \ ++static void FUNC(intra_pred_ ## log2_size)(const struct HEVCRpiContext * const s, \ ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail) \ ++{ \ ++ FUNC(intra_pred)(s, mode, x0, y0, avail, log2_size); \ +} + +INTRA_PRED(2) @@ -36521,6 +38191,56 @@ index 0000000000..a76ba4c442 + +#undef PRED_DC + ++ ++ ++ ++#if !PRED_C ++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) ++{ ++ int i, j; ++ int size = (1 << log2_size); ++ pixel *src = (pixel *)_src; ++ pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1)); ++ ++ for (i = 0; i < size; i++) ++ for (j = 0; j < size; j+=4) ++ AV_WN4P(&POS(j, i), a); ++} ++#else ++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) ++{ ++ unsigned int i, j; ++ const unsigned int size = (1 << log2_size); ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const pixel a = (1 << (BIT_DEPTH - 1)); ++ ++ for (i = 0; i < size; i++, src += stride) ++ { ++ for (j = 0; j < size; ++j) ++ { ++ src[j][0] = a; ++ src[j][1] = a; ++ } ++ } ++} ++#endif ++ ++#define PRED_DC0(size)\ ++static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_dc0)(src, stride, size + 2); \ ++} ++ ++PRED_DC0(0) ++PRED_DC0(1) ++PRED_DC0(2) ++PRED_DC0(3) ++ ++#undef PRED_DC0 ++ ++ ++ ++ +#ifndef ANGLE_CONSTS +#define ANGLE_CONSTS +static const int intra_pred_angle[] = { @@ -36835,6 +38555,11 @@ index 0000000000..a76ba4c442 +#undef POS +#undef PW + ++#undef filter_light1 ++#undef filter_light ++#undef filter_strong 
++#undef ref_gen ++ +#ifndef INCLUDED_ONCE +#define INCLUDED_ONCE +#endif @@ -40917,7 +42642,7 @@ index 0000000000..59c0d3959e +# -Wa,-ahls diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh new file mode 100755 -index 0000000000..28b7a4f483 +index 0000000000..c8da66514b --- /dev/null +++ b/pi-util/conf_pi2.sh @@ -0,0 +1,32 @@ @@ -40928,7 +42653,7 @@ index 0000000000..28b7a4f483 + +RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" +RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" -+RPI_DEFINES="-D__VCCOREVER__=0x4000000" ++RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon" +#RPI_KEEPS="-save-temps=obj" +RPI_KEEPS="" +
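The rpi_hevcpred_template.c hunks above implement the HEVC reference-sample smoothing in two flavours: filter_light1()/filter_light() run a 3-tap [1 2 1]/4 filter along the assembled left/top reference arrays, while filter_strong() replaces a whole edge with a linear ramp between its corner samples, and is only used for 32x32 luma blocks whose edges are already near-linear (both second differences below 1 << (BIT_DEPTH - 5)). The following is a minimal standalone sketch of that arithmetic for 8-bit samples; it is not code from the patch, and the helper names and demo values are invented purely for illustration.

    #include <stdio.h>
    #include <stdint.h>

    /* "Light" filter: 3-tap [1 2 1]/4 smoothing, the same arithmetic as
     * filter_light1() in the patch. */
    static uint8_t light1(uint8_t a, uint8_t b, uint8_t c)
    {
        return (uint8_t)((a + 2 * b + c + 2) >> 2);
    }

    /* Smooth n samples from src into dst, with prev/next as the outer
     * neighbours of the run (simplified edge handling for the demo). */
    static void light_run(uint8_t *dst, const uint8_t *src, unsigned n,
                          uint8_t prev, uint8_t next)
    {
        for (unsigned i = 0; i < n; i++) {
            uint8_t l = (i == 0)     ? prev : src[i - 1];
            uint8_t r = (i == n - 1) ? next : src[i + 1];
            dst[i] = light1(l, src[i], r);
        }
    }

    /* "Strong" filter: pure linear interpolation between the two corner
     * samples, the same recurrence as filter_strong() in the patch:
     * dst[i] = ((64 - (i+1))*p0 + (i+1)*p1 + 32) >> 6 for i = 0..n-1. */
    static void strong_run(uint8_t *dst, unsigned p0, unsigned p1, unsigned n)
    {
        int acc = 64 * (int)p0 + 32;
        int v = (int)p1 - (int)p0;
        for (unsigned i = 0; i < n; i++) {
            acc += v;                      /* acc stays non-negative here */
            dst[i] = (uint8_t)(acc >> 6);
        }
    }

    int main(void)
    {
        /* Made-up reference samples, for demonstration only. */
        const uint8_t ref[8] = { 10, 12, 60, 61, 62, 63, 120, 122 };
        uint8_t smoothed[8];
        uint8_t ramp[64];

        light_run(smoothed, ref, 8, ref[0], ref[7]);
        for (unsigned i = 0; i < 8; i++)
            printf("%d ", smoothed[i]);        /* [1 2 1]-smoothed edge */
        printf("\n");

        strong_run(ramp, 10, 122, 64);
        printf("%d ... %d\n", ramp[0], ramp[63]); /* linear ramp 10 -> 122 */
        return 0;
    }

One design point worth noting: rather than recomputing the mode-distance thresholds on every call as the old intra_pred code did, the patch folds the filtering decision into the per-size req_avail[][] tables via the FILTER_LIGHT/FILTER_STRONG flags, so intra_pred() only does a table lookup (masked by the SPS smoothing flags) before dispatching to the size-indexed function pointers in HEVCRpiPredContext.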