From 35f659918f75ee23eb6a52432d20726bdbad5533 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Fri, 21 Jul 2023 20:16:20 +0200 Subject: [PATCH 1/2] RPi: update kodi patch to support bwdif deinterlace Signed-off-by: Matthias Reichl --- ...et-max-bpc-for-high-bit-depth-videos.patch | 4 +- ...DRMPRIME-Also-support-YUV420-buffers.patch | 12 +- ...MPRIME-Adjust-av-formats-to-match-re.patch | 14 +- ...PRIME-Add-support-for-arbitrary-outp.patch | 30 +- ...PRIME-Remove-obsolete-thread_safe_ca.patch | 8 +- ...eoCodecDRMPRIME-Clear-m_pFilterGraph.patch | 24 ++ ...PRIME-Move-FilterTest-from-open-to-f.patch | 70 +++++ ...PRIME-Rework-filtering-code-to-handl.patch | 277 ++++++++++++++++++ ...eg-change-default-software-deinterla.patch | 74 +++++ ...MPRIME-Support-decoding-to-DRMPRIME-.patch | 55 ++++ ...PRIME-Request-v4l2-buffers-be-alloca.patch | 30 ++ ...PRIME-Add-setting-to-enable-hw-deint.patch | 123 ++++++++ 12 files changed, 687 insertions(+), 34 deletions(-) create mode 100644 projects/RPi/patches/kodi/0006-DVDVideoCodecDRMPRIME-Clear-m_pFilterGraph.patch create mode 100644 projects/RPi/patches/kodi/0007-DVDVideoCodecDRMPRIME-Move-FilterTest-from-open-to-f.patch create mode 100644 projects/RPi/patches/kodi/0008-DVDVideoCodecDRMPRIME-Rework-filtering-code-to-handl.patch create mode 100644 projects/RPi/patches/kodi/0009-VideoPlayer-ffmpeg-change-default-software-deinterla.patch create mode 100644 projects/RPi/patches/kodi/0010-CDVDVideoCodecDRMPRIME-Support-decoding-to-DRMPRIME-.patch create mode 100644 projects/RPi/patches/kodi/0011-DVDVideoCodecDRMPRIME-Request-v4l2-buffers-be-alloca.patch create mode 100644 projects/RPi/patches/kodi/0012-DVDVideoCodecDRMPRIME-Add-setting-to-enable-hw-deint.patch diff --git a/projects/RPi/patches/kodi/0001-gbm-Set-max-bpc-for-high-bit-depth-videos.patch b/projects/RPi/patches/kodi/0001-gbm-Set-max-bpc-for-high-bit-depth-videos.patch index 4e0946e41c..5861fab6c9 100644 --- 
a/projects/RPi/patches/kodi/0001-gbm-Set-max-bpc-for-high-bit-depth-videos.patch +++ b/projects/RPi/patches/kodi/0001-gbm-Set-max-bpc-for-high-bit-depth-videos.patch @@ -1,7 +1,7 @@ -From e181af5b2b97b3fbc69b9ad7318a3c02f6186ca5 Mon Sep 17 00:00:00 2001 +From 89a7f05ee85fca27f1140a035fec804d84959dbe Mon Sep 17 00:00:00 2001 From: Dom Cobley Date: Fri, 3 Dec 2021 16:00:50 +0000 -Subject: [PATCH 1/5] gbm: Set max bpc for high bit depth videos +Subject: [PATCH 01/12] gbm: Set max bpc for high bit depth videos --- .../HwDecRender/VideoLayerBridgeDRMPRIME.cpp | 15 +++++++++++++++ diff --git a/projects/RPi/patches/kodi/0002-CDVDVideoCodecDRMPRIME-Also-support-YUV420-buffers.patch b/projects/RPi/patches/kodi/0002-CDVDVideoCodecDRMPRIME-Also-support-YUV420-buffers.patch index 991028312f..0bf0799852 100644 --- a/projects/RPi/patches/kodi/0002-CDVDVideoCodecDRMPRIME-Also-support-YUV420-buffers.patch +++ b/projects/RPi/patches/kodi/0002-CDVDVideoCodecDRMPRIME-Also-support-YUV420-buffers.patch @@ -1,7 +1,7 @@ -From 05fbbc78734827304edd3eb10de0a0117d10a8b9 Mon Sep 17 00:00:00 2001 +From 7d18280622c8ac12dbf1f6d4d5ca9589e1a61b02 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 11 Sep 2021 14:03:05 +0100 -Subject: [PATCH 2/5] CDVDVideoCodecDRMPRIME: Also support YUV420 buffers +Subject: [PATCH 02/12] CDVDVideoCodecDRMPRIME: Also support YUV420 buffers CDVDVideoCodecDRMPRIME: Add support for deinterlace of sw decoded buffers @@ -11,10 +11,10 @@ Need to call SetDimensions earlier and store the drm descriptor in expected plac 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp -index b221cdaf75..c0efa91770 100644 +index f5e26b203c..90f1fb07a9 100644 --- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp -@@ -619,7 +619,7 @@ bool 
CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture) +@@ -622,7 +622,7 @@ bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture) pVideoPicture->videoBuffer = nullptr; } @@ -23,7 +23,7 @@ index b221cdaf75..c0efa91770 100644 { CVideoBufferDRMPRIMEFFmpeg* buffer = dynamic_cast(m_videoBufferPool->Get()); -@@ -697,7 +697,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) +@@ -700,7 +700,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) const AVFilter* srcFilter = avfilter_get_by_name("buffer"); const AVFilter* outFilter = avfilter_get_by_name("buffersink"); @@ -32,7 +32,7 @@ index b221cdaf75..c0efa91770 100644 std::string args = StringUtils::Format("video_size={}x{}:pix_fmt={}:time_base={}/{}:" "pixel_aspect={}/{}", -@@ -845,6 +845,16 @@ void CDVDVideoCodecDRMPRIME::FilterClose() +@@ -848,6 +848,16 @@ void CDVDVideoCodecDRMPRIME::FilterClose() CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn() { diff --git a/projects/RPi/patches/kodi/0003-CDVDVideoCodecDRMPRIME-Adjust-av-formats-to-match-re.patch b/projects/RPi/patches/kodi/0003-CDVDVideoCodecDRMPRIME-Adjust-av-formats-to-match-re.patch index ad0b7598a0..f5ed2b0231 100644 --- a/projects/RPi/patches/kodi/0003-CDVDVideoCodecDRMPRIME-Adjust-av-formats-to-match-re.patch +++ b/projects/RPi/patches/kodi/0003-CDVDVideoCodecDRMPRIME-Adjust-av-formats-to-match-re.patch @@ -1,15 +1,15 @@ -From 8fbcf5fada25720b5c6f66959d5ee1c28cff04f9 Mon Sep 17 00:00:00 2001 +From e36845fd7e48b364f68a43bd8c66e06a570a6f4c Mon Sep 17 00:00:00 2001 From: Dom Cobley Date: Wed, 18 Jan 2023 16:41:00 +0000 -Subject: [PATCH 3/5] CDVDVideoCodecDRMPRIME: Adjust av formats to match recent - ffmpeg changes +Subject: [PATCH 03/12] CDVDVideoCodecDRMPRIME: Adjust av formats to match + recent ffmpeg changes --- .../VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp -index c0efa91770..29d38a3ec0 100644 +index 90f1fb07a9..169e8544de 100644 --- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp @@ -355,6 +355,7 @@ bool CDVDVideoCodecDRMPRIME::Open(CDVDStreamInfo& hints, CDVDCodecOptions& optio @@ -19,8 +19,8 @@ index c0efa91770..29d38a3ec0 100644 + m_pCodecContext->thread_safe_callbacks = 1; m_pCodecContext->thread_count = CServiceBroker::GetCPUInfo()->GetCPUCount(); - if (hints.extradata && hints.extrasize > 0) -@@ -697,13 +698,13 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) + if (hints.extradata) +@@ -700,13 +701,13 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) const AVFilter* srcFilter = avfilter_get_by_name("buffer"); const AVFilter* outFilter = avfilter_get_by_name("buffersink"); @@ -36,7 +36,7 @@ index c0efa91770..29d38a3ec0 100644 m_pCodecContext->time_base.num ? m_pCodecContext->time_base.num : 1, m_pCodecContext->time_base.num ? 
-@@ -855,6 +856,7 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn() +@@ -858,6 +859,7 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn() m_pFrame->data[0] = reinterpret_cast(descriptor); } diff --git a/projects/RPi/patches/kodi/0004-DVDVideoCodecDRMPRIME-Add-support-for-arbitrary-outp.patch b/projects/RPi/patches/kodi/0004-DVDVideoCodecDRMPRIME-Add-support-for-arbitrary-outp.patch index 7e4ec85ce4..9a03fdee20 100644 --- a/projects/RPi/patches/kodi/0004-DVDVideoCodecDRMPRIME-Add-support-for-arbitrary-outp.patch +++ b/projects/RPi/patches/kodi/0004-DVDVideoCodecDRMPRIME-Add-support-for-arbitrary-outp.patch @@ -1,7 +1,7 @@ -From 56117d2874dcc36ac779609c63f1a8b0bace5366 Mon Sep 17 00:00:00 2001 +From 092ae2d56a5b8ed1558e82c2beae6e4223df57ff Mon Sep 17 00:00:00 2001 From: Dom Cobley Date: Mon, 6 Feb 2023 15:19:51 +0000 -Subject: [PATCH 4/5] DVDVideoCodecDRMPRIME: Add support for arbitrary output +Subject: [PATCH 04/12] DVDVideoCodecDRMPRIME: Add support for arbitrary output pixel formats This enables any ffmpeg pixel formats to be supported by DRMPRIME decoder @@ -20,7 +20,7 @@ And it happens automatically without requiring user video settings 2 files changed, 77 insertions(+), 50 deletions(-) diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp -index 29d38a3ec0..611876ba8d 100644 +index 169e8544de..28bd0a9bc7 100644 --- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp @@ -219,7 +219,7 @@ enum AVPixelFormat CDVDVideoCodecDRMPRIME::GetFormat(struct AVCodecContext* avct @@ -67,7 +67,7 @@ index 29d38a3ec0..611876ba8d 100644 buffer->Export(frame, width, height); buffer->SyncStart(); -@@ -628,9 +628,9 @@ bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture) +@@ -631,9 +631,9 @@ bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* 
pVideoPicture) buffer->SetRef(m_pFrame); pVideoPicture->videoBuffer = buffer; } @@ -79,7 +79,7 @@ index 29d38a3ec0..611876ba8d 100644 buffer->SetPictureParams(*pVideoPicture); buffer->Acquire(); buffer->SyncEnd(); -@@ -664,13 +664,13 @@ void CDVDVideoCodecDRMPRIME::FilterTest() +@@ -667,13 +667,13 @@ void CDVDVideoCodecDRMPRIME::FilterTest() if (name.find("deinterlace") != std::string::npos) { @@ -96,7 +96,7 @@ index 29d38a3ec0..611876ba8d 100644 return; } } -@@ -680,14 +680,31 @@ void CDVDVideoCodecDRMPRIME::FilterTest() +@@ -683,14 +683,31 @@ void CDVDVideoCodecDRMPRIME::FilterTest() __FUNCTION__); } @@ -130,7 +130,7 @@ index 29d38a3ec0..611876ba8d 100644 return true; if (!(m_pFilterGraph = avfilter_graph_alloc())) -@@ -698,13 +715,13 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) +@@ -701,13 +718,13 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) const AVFilter* srcFilter = avfilter_get_by_name("buffer"); const AVFilter* outFilter = avfilter_get_by_name("buffersink"); @@ -146,7 +146,7 @@ index 29d38a3ec0..611876ba8d 100644 m_pCodecContext->time_base.num ? m_pCodecContext->time_base.num : 1, m_pCodecContext->time_base.num ? 
-@@ -723,7 +740,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) +@@ -726,7 +743,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_create_filter: src: {} ({})", err, result); @@ -154,7 +154,7 @@ index 29d38a3ec0..611876ba8d 100644 return false; } -@@ -731,7 +747,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) +@@ -734,7 +750,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) if (!par) { CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - unable to alloc buffersrc"); @@ -162,7 +162,7 @@ index 29d38a3ec0..611876ba8d 100644 return false; } -@@ -747,7 +762,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) +@@ -750,7 +765,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - av_buffersrc_parameters_set: {} ({})", err, result); @@ -170,7 +170,7 @@ index 29d38a3ec0..611876ba8d 100644 return false; } av_freep(&par); -@@ -761,7 +775,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) +@@ -764,7 +778,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_create_filter: out: {} ({})", err, result); @@ -178,7 +178,7 @@ index 29d38a3ec0..611876ba8d 100644 return false; } -@@ -770,32 +783,46 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) +@@ -773,32 +786,46 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) if (result < 0) { CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - failed settings pix formats"); @@ -243,7 +243,7 @@ index 29d38a3ec0..611876ba8d 100644 } if ((result = avfilter_graph_config(m_pFilterGraph, nullptr)) < 0) -@@ -804,15 
+831,11 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) +@@ -807,15 +834,11 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test) av_strerror(result, err, AV_ERROR_MAX_STRING_SIZE); CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_config: {} ({})", err, result); @@ -259,7 +259,7 @@ index 29d38a3ec0..611876ba8d 100644 m_processInfo.SetVideoDeintMethod(filters); -@@ -847,16 +870,16 @@ void CDVDVideoCodecDRMPRIME::FilterClose() +@@ -850,16 +873,16 @@ void CDVDVideoCodecDRMPRIME::FilterClose() CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn() { // sw decoded buffers need cache flush and for descripter to be set @@ -279,7 +279,7 @@ index 29d38a3ec0..611876ba8d 100644 int ret = av_buffersrc_add_frame(m_pFilterIn, m_pFrame); if (ret < 0) { -@@ -949,25 +972,28 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo +@@ -960,25 +983,28 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo return VC_ERROR; } diff --git a/projects/RPi/patches/kodi/0005-DVDVideoCodecDRMPRIME-Remove-obsolete-thread_safe_ca.patch b/projects/RPi/patches/kodi/0005-DVDVideoCodecDRMPRIME-Remove-obsolete-thread_safe_ca.patch index 468e4c0bfb..64e5f3d123 100644 --- a/projects/RPi/patches/kodi/0005-DVDVideoCodecDRMPRIME-Remove-obsolete-thread_safe_ca.patch +++ b/projects/RPi/patches/kodi/0005-DVDVideoCodecDRMPRIME-Remove-obsolete-thread_safe_ca.patch @@ -1,7 +1,7 @@ -From 85c8218d79f042c6d16b1d1ff6479743f095994e Mon Sep 17 00:00:00 2001 +From 4a3cb2af8b0751807d212044ba424d07f2a7ba55 Mon Sep 17 00:00:00 2001 From: Dom Cobley Date: Fri, 14 Apr 2023 19:59:42 +0100 -Subject: [PATCH 5/5] DVDVideoCodecDRMPRIME: Remove obsolete +Subject: [PATCH 05/12] DVDVideoCodecDRMPRIME: Remove obsolete thread_safe_callbacks --- @@ -9,7 +9,7 @@ Subject: [PATCH 5/5] DVDVideoCodecDRMPRIME: Remove obsolete 1 file changed, 1 deletion(-) diff --git 
a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp -index 611876ba8d..f7022e1854 100644 +index 28bd0a9bc7..670b5f22ce 100644 --- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp @@ -355,7 +355,6 @@ bool CDVDVideoCodecDRMPRIME::Open(CDVDStreamInfo& hints, CDVDCodecOptions& optio @@ -19,7 +19,7 @@ index 611876ba8d..f7022e1854 100644 - m_pCodecContext->thread_safe_callbacks = 1; m_pCodecContext->thread_count = CServiceBroker::GetCPUInfo()->GetCPUCount(); - if (hints.extradata && hints.extrasize > 0) + if (hints.extradata) -- 2.39.2 diff --git a/projects/RPi/patches/kodi/0006-DVDVideoCodecDRMPRIME-Clear-m_pFilterGraph.patch b/projects/RPi/patches/kodi/0006-DVDVideoCodecDRMPRIME-Clear-m_pFilterGraph.patch new file mode 100644 index 0000000000..3751c0d06e --- /dev/null +++ b/projects/RPi/patches/kodi/0006-DVDVideoCodecDRMPRIME-Clear-m_pFilterGraph.patch @@ -0,0 +1,24 @@ +From 018e080fb3fea185df01d2659d59231aef787759 Mon Sep 17 00:00:00 2001 +From: Dom Cobley +Date: Wed, 31 May 2023 19:40:37 +0100 +Subject: [PATCH 06/12] DVDVideoCodecDRMPRIME: Clear m_pFilterGraph + +--- + xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +index 670b5f22ce..8568f162ae 100644 +--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp ++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +@@ -866,6 +866,7 @@ void CDVDVideoCodecDRMPRIME::FilterClose() + // Disposed by above code + m_pFilterIn = nullptr; + m_pFilterOut = nullptr; ++ m_pFilterGraph = nullptr; + } + } + +-- +2.39.2 + diff --git a/projects/RPi/patches/kodi/0007-DVDVideoCodecDRMPRIME-Move-FilterTest-from-open-to-f.patch 
b/projects/RPi/patches/kodi/0007-DVDVideoCodecDRMPRIME-Move-FilterTest-from-open-to-f.patch new file mode 100644 index 0000000000..32af8bc1bb --- /dev/null +++ b/projects/RPi/patches/kodi/0007-DVDVideoCodecDRMPRIME-Move-FilterTest-from-open-to-f.patch @@ -0,0 +1,70 @@ +From b62d5e56d76ce179e3a1169566aa2146da48b147 Mon Sep 17 00:00:00 2001 +From: Dom Cobley +Date: Fri, 2 Jun 2023 11:34:22 +0100 +Subject: [PATCH 07/12] DVDVideoCodecDRMPRIME: Move FilterTest from open to + first frame returned + +The pixel format is not accurate until the first frame is returned +and it may (later) influence the choice of deinterlacers available. +--- + .../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 24 ++++++++++++------- + .../DVDCodecs/Video/DVDVideoCodecDRMPRIME.h | 1 + + 2 files changed, 16 insertions(+), 9 deletions(-) + +diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +index 8568f162ae..f515c5d5f1 100644 +--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp ++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +@@ -387,15 +387,7 @@ bool CDVDVideoCodecDRMPRIME::Open(CDVDStreamInfo& hints, CDVDCodecOptions& optio + m_processInfo.SetVideoDAR(hints.aspect); + m_processInfo.SetVideoDeintMethod("none"); + +- FilterTest(); +- +- if (!m_deintFilterName.empty()) +- { +- std::list methods; +- methods.push_back(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE); +- m_processInfo.UpdateDeinterlacingMethods(methods); +- m_processInfo.SetDeinterlacingMethodDefault(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE); +- } ++ m_checkedDeinterlace = false; + + return true; + } +@@ -983,6 +975,20 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo + return VC_ERROR; + } + ++ if (!m_checkedDeinterlace) ++ { ++ FilterTest(); ++ ++ if (!m_deintFilterName.empty()) ++ { ++ std::list methods; ++ 
methods.push_back(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE); ++ m_processInfo.UpdateDeinterlacingMethods(methods); ++ m_processInfo.SetDeinterlacingMethodDefault(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE); ++ } ++ m_checkedDeinterlace = true; ++ } ++ + // we need to scale if the buffer isn't in DRM_PRIME format + bool need_scale = !IsSupportedSwFormat(static_cast(m_pFrame->format)) && !IsSupportedHwFormat(static_cast(m_pFrame->format)); + +diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h +index bb88fde1f9..df17f89b96 100644 +--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h ++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h +@@ -56,6 +56,7 @@ protected: + int m_codecControlFlags = 0; + CDVDStreamInfo m_hints; + double m_DAR = 1.0; ++ bool m_checkedDeinterlace = false; + AVCodecContext* m_pCodecContext = nullptr; + AVFrame* m_pFrame = nullptr; + AVFrame* m_pFilterFrame = nullptr; +-- +2.39.2 + diff --git a/projects/RPi/patches/kodi/0008-DVDVideoCodecDRMPRIME-Rework-filtering-code-to-handl.patch b/projects/RPi/patches/kodi/0008-DVDVideoCodecDRMPRIME-Rework-filtering-code-to-handl.patch new file mode 100644 index 0000000000..a62af7f15d --- /dev/null +++ b/projects/RPi/patches/kodi/0008-DVDVideoCodecDRMPRIME-Rework-filtering-code-to-handl.patch @@ -0,0 +1,277 @@ +From b359d89684418cc3a6f894434d212611c7c12cd5 Mon Sep 17 00:00:00 2001 +From: Dom Cobley +Date: Wed, 31 May 2023 14:19:20 +0100 +Subject: [PATCH 08/12] DVDVideoCodecDRMPRIME: Rework filtering code to handle + sw deinterlace + +--- + .../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 134 +++++++++--------- + .../DVDCodecs/Video/DVDVideoCodecDRMPRIME.h | 4 +- + 2 files changed, 68 insertions(+), 70 deletions(-) + +diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +index 
f515c5d5f1..b614312a77 100644 +--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp ++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +@@ -207,11 +207,7 @@ static const AVCodec* FindDecoder(CDVDStreamInfo& hints) + return codec; + } + +- codec = avcodec_find_decoder(hints.codec); +- if (codec && (codec->capabilities & AV_CODEC_CAP_DR1) == AV_CODEC_CAP_DR1) +- return codec; +- +- return nullptr; ++ return avcodec_find_decoder(hints.codec); + } + + enum AVPixelFormat CDVDVideoCodecDRMPRIME::GetFormat(struct AVCodecContext* avctx, +@@ -645,27 +641,33 @@ bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture) + return true; + } + +-void CDVDVideoCodecDRMPRIME::FilterTest() ++void CDVDVideoCodecDRMPRIME::FilterTest(AVPixelFormat pix_fmt) + { +- const AVFilter* filter; +- void* opaque{}; +- + m_deintFilterName.clear(); + +- while ((filter = av_filter_iterate(&opaque)) != nullptr) ++ // look twice, first for DRM_PRIME support, then for actual pixel format ++ for (int i=0; i < 2; i++) + { +- std::string name(filter->name); ++ const AVFilter* filter; ++ void* opaque{}; + +- if (name.find("deinterlace") != std::string::npos) ++ while ((filter = av_filter_iterate(&opaque)) != nullptr) + { +- bool ret = FilterOpen(name, false, true); +- FilterClose(); +- if (ret) ++ std::string name(filter->name); ++ ++ if (name.find(i == 0 ? 
"deinterlace" : "bwdif") != std::string::npos) + { +- m_deintFilterName = name; +- CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::{} - found deinterlacing filter {}", +- __FUNCTION__, name); +- return; ++ bool ret = FilterOpen(name, pix_fmt, true); ++ FilterClose(); ++ if (ret) ++ { ++ m_deintFilterName = name; ++ if (name == "bwdif" || name == "yadif") ++ m_deintFilterName += "=1:-1:1"; ++ CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::{} - found deinterlacing filter {}", ++ __FUNCTION__, name); ++ return; ++ } + } + } + } +@@ -691,14 +693,17 @@ AVFrame *CDVDVideoCodecDRMPRIME::alloc_filter_frame(AVFilterContext * ctx, void + return frame; + } + +-bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale, bool test) ++bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, AVPixelFormat pix_fmt, bool test) + { + int result; + ++ if (filters.find("deinterlace") != std::string::npos && pix_fmt == AV_PIX_FMT_YUV420P) ++ pix_fmt = AV_PIX_FMT_DRM_PRIME; ++ + if (m_pFilterGraph) + FilterClose(); + +- if (filters.empty() && !scale) ++ if (filters.empty()) + return true; + + if (!(m_pFilterGraph = avfilter_graph_alloc())) +@@ -709,13 +714,12 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale, + + const AVFilter* srcFilter = avfilter_get_by_name("buffer"); + const AVFilter* outFilter = avfilter_get_by_name("buffersink"); +- enum AVPixelFormat pix_fmts[] = { scale ? AV_PIX_FMT_YUV420P : AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_NONE }; + + std::string args = StringUtils::Format("video_size={}x{}:pix_fmt={}:time_base={}/{}:" + "pixel_aspect={}/{}", + m_pCodecContext->width, + m_pCodecContext->height, +- scale ? m_pCodecContext->pix_fmt : AV_PIX_FMT_DRM_PRIME, ++ pix_fmt, + m_pCodecContext->time_base.num ? + m_pCodecContext->time_base.num : 1, + m_pCodecContext->time_base.num ? 
+@@ -772,6 +776,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale, + return false; + } + ++ enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE }; + result = av_opt_set_int_list(m_pFilterOut, "pix_fmts", &pix_fmts[0], + AV_PIX_FMT_NONE, AV_OPT_SEARCH_CHILDREN); + if (result < 0) +@@ -780,43 +785,32 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale, + return false; + } + +- if (!filters.empty()) ++ if ((result = av_buffersink_set_alloc_video_frame(m_pFilterOut, alloc_filter_frame, static_cast(this))) < 0) + { +- AVFilterInOut* outputs = avfilter_inout_alloc(); +- AVFilterInOut* inputs = avfilter_inout_alloc(); ++ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - av_buffersink_set_alloc_video_frame = {}", result); ++ return result; ++ } ++ AVFilterInOut* outputs = avfilter_inout_alloc(); ++ AVFilterInOut* inputs = avfilter_inout_alloc(); + +- outputs->name = av_strdup("in"); +- outputs->filter_ctx = m_pFilterIn; +- outputs->pad_idx = 0; +- outputs->next = nullptr; ++ outputs->name = av_strdup("in"); ++ outputs->filter_ctx = m_pFilterIn; ++ outputs->pad_idx = 0; ++ outputs->next = nullptr; + +- inputs->name = av_strdup("out"); +- inputs->filter_ctx = m_pFilterOut; +- inputs->pad_idx = 0; +- inputs->next = nullptr; ++ inputs->name = av_strdup("out"); ++ inputs->filter_ctx = m_pFilterOut; ++ inputs->pad_idx = 0; ++ inputs->next = nullptr; + +- result = avfilter_graph_parse_ptr(m_pFilterGraph, filters.c_str(), &inputs, &outputs, NULL); +- avfilter_inout_free(&outputs); +- avfilter_inout_free(&inputs); ++ result = avfilter_graph_parse_ptr(m_pFilterGraph, filters.c_str(), &inputs, &outputs, NULL); ++ avfilter_inout_free(&outputs); ++ avfilter_inout_free(&inputs); + +- if (result < 0) +- { +- CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_parse"); +- return false; +- } +- } +- else ++ if (result < 0) + { +- if ((result = 
av_buffersink_set_alloc_video_frame(m_pFilterOut, alloc_filter_frame, static_cast(this))) < 0) +- { +- CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - av_buffersink_set_alloc_video_frame = {}", result); +- return result; +- } +- if ((result = avfilter_link(m_pFilterIn, 0, m_pFilterOut, 0)) < 0) +- { +- CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_link"); +- return false; +- } ++ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_parse"); ++ return false; + } + + if ((result = avfilter_graph_config(m_pFilterGraph, nullptr)) < 0) +@@ -831,8 +825,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale, + if (test) + return true; + +- m_processInfo.SetVideoDeintMethod(filters); +- + if (CServiceBroker::GetLogging().CanLogComponent(LOGVIDEO)) + { + char* graphDump = avfilter_graph_dump(m_pFilterGraph, nullptr); +@@ -864,8 +856,8 @@ void CDVDVideoCodecDRMPRIME::FilterClose() + + CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn() + { +- // sw decoded buffers need cache flush and for descripter to be set +- if (!IsSupportedHwFormat(static_cast(m_pFrame->format)) && IsSupportedSwFormat(static_cast(m_pFrame->format))) ++ // sw decoded buffers submitted to hw decoder need cache flush and for descripter to be set ++ if (m_pFrame->format != AV_PIX_FMT_DRM_PRIME && m_pFilterGraph && m_pFilterIn->outputs[0]->format == AV_PIX_FMT_DRM_PRIME) + { + CVideoBufferDMA* buffer = static_cast(av_buffer_get_opaque(m_pFrame->buf[0])); + buffer->SetDimensions(m_pFrame->width, m_pFrame->height); +@@ -975,9 +967,10 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo + return VC_ERROR; + } + ++ AVPixelFormat pix_fmt = static_cast(m_pFrame->format); + if (!m_checkedDeinterlace) + { +- FilterTest(); ++ FilterTest(pix_fmt); + + if (!m_deintFilterName.empty()) + { +@@ -989,28 +982,33 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo + 
m_checkedDeinterlace = true; + } + +- // we need to scale if the buffer isn't in DRM_PRIME format +- bool need_scale = !IsSupportedSwFormat(static_cast(m_pFrame->format)) && !IsSupportedHwFormat(static_cast(m_pFrame->format)); +- + if (!m_processInfo.GetVideoInterlaced() && m_pFrame->interlaced_frame) + m_processInfo.SetVideoInterlaced(true); + + std::string filterChain = GetFilterChain(m_pFrame->interlaced_frame); +- if (!filterChain.empty() || need_scale) ++ ++ // we need to scale if the buffer isn't in DRM_PRIME format ++ if (!IsSupportedSwFormat(pix_fmt) && !IsSupportedHwFormat(pix_fmt)) ++ filterChain = "scale"; ++ // we need to copy if the buffer wasn't allocated by us ++ else if (!IsSupportedHwFormat(pix_fmt) && !(m_pCodecContext->codec->capabilities & AV_CODEC_CAP_DR1)) ++ filterChain = "copy"; ++ ++ if (!filterChain.empty()) + { +- bool reopenFilter = false; +- if (m_filters != filterChain) +- reopenFilter = true; ++ bool reopenFilter = m_filters != filterChain; + + if (m_pFilterGraph && + (m_pFilterIn->outputs[0]->w != m_pFrame->width || + m_pFilterIn->outputs[0]->h != m_pFrame->height)) + reopenFilter = true; + +- if (reopenFilter || (need_scale && m_pFilterGraph == nullptr)) ++ if (reopenFilter) + { + m_filters = filterChain; +- if (!FilterOpen(filterChain, need_scale, false)) ++ m_processInfo.SetVideoDeintMethod(m_filters); ++ ++ if (!FilterOpen(filterChain, pix_fmt, false)) + FilterClose(); + } + +diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h +index df17f89b96..55675c3c2e 100644 +--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h ++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h +@@ -45,9 +45,9 @@ protected: + static enum AVPixelFormat GetFormat(struct AVCodecContext* avctx, const enum AVPixelFormat* fmt); + static int GetBuffer(struct AVCodecContext* avctx, AVFrame* frame, int flags); + static AVFrame 
*alloc_filter_frame(AVFilterContext * ctx, void * v, int w, int h); +- bool FilterOpen(const std::string& filters, bool scale, bool test); ++ bool FilterOpen(const std::string& filters, AVPixelFormat pix_fmt, bool test); + void FilterClose(); +- void FilterTest(); ++ void FilterTest(AVPixelFormat pix_fmt); + std::string GetFilterChain(bool interlaced); + + std::string m_name; +-- +2.39.2 + diff --git a/projects/RPi/patches/kodi/0009-VideoPlayer-ffmpeg-change-default-software-deinterla.patch b/projects/RPi/patches/kodi/0009-VideoPlayer-ffmpeg-change-default-software-deinterla.patch new file mode 100644 index 0000000000..141aa2c5cd --- /dev/null +++ b/projects/RPi/patches/kodi/0009-VideoPlayer-ffmpeg-change-default-software-deinterla.patch @@ -0,0 +1,74 @@ +From c9a70db5879a6ac37b5840621aa102812104087f Mon Sep 17 00:00:00 2001 +From: Alan Swanson +Date: Thu, 18 May 2023 16:12:43 +0100 +Subject: [PATCH 09/12] VideoPlayer: ffmpeg change default software + deinterlacer from yadif to bwdif + +--- + .../DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 10 +++++----- + xbmc/cores/VideoPlayer/DVDCodecs/Video/VAAPI.cpp | 4 ++-- + 2 files changed, 7 insertions(+), 7 deletions(-) + +diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp +index 032ee16454..5d1b7162f9 100644 +--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp ++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp +@@ -55,7 +55,7 @@ enum DecoderState + + enum EFilterFlags { + FILTER_NONE = 0x0, +- FILTER_DEINTERLACE_YADIF = 0x1, //< use first deinterlace mode ++ FILTER_DEINTERLACE_BWDIF = 0x1, //< use first deinterlace mode + FILTER_DEINTERLACE_ANY = 0xf, //< use any deinterlace mode + FILTER_DEINTERLACE_FLAGGED = 0x10, //< only deinterlace flagged frames + FILTER_DEINTERLACE_HALFED = 0x20, //< do half rate deinterlacing +@@ -526,12 +526,12 @@ void CDVDVideoCodecFFmpeg::SetFilters() + } + } + +- if 
(filters & FILTER_DEINTERLACE_YADIF) ++ if (filters & FILTER_DEINTERLACE_BWDIF) + { + if (filters & FILTER_DEINTERLACE_HALFED) +- m_filters_next = "yadif=0:-1"; ++ m_filters_next = "bwdif=0:-1"; + else +- m_filters_next = "yadif=1:-1"; ++ m_filters_next = "bwdif=1:-1"; + + if (filters & FILTER_DEINTERLACE_FLAGGED) + m_filters_next += ":1"; +@@ -1226,7 +1226,7 @@ int CDVDVideoCodecFFmpeg::FilterOpen(const std::string& filters, bool scale) + return result; + } + +- if (filters.compare(0,5,"yadif") == 0) ++ if (filters.compare(0,5,"bwdif") == 0) + { + m_processInfo.SetVideoDeintMethod(filters); + } +diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/VAAPI.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/VAAPI.cpp +index 24edd058e9..f9b6f17824 100644 +--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/VAAPI.cpp ++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/VAAPI.cpp +@@ -3007,7 +3007,7 @@ bool CFFmpegPostproc::Init(EINTERLACEMETHOD method) + { + std::string filter; + +- filter = "yadif=1:-1"; ++ filter = "bwdif=1:-1"; + + if (avfilter_graph_parse_ptr(m_pFilterGraph, filter.c_str(), &inputs, &outputs, NULL) < 0) + { +@@ -3026,7 +3026,7 @@ bool CFFmpegPostproc::Init(EINTERLACEMETHOD method) + return false; + } + +- m_config.processInfo->SetVideoDeintMethod("yadif"); ++ m_config.processInfo->SetVideoDeintMethod("bwdif"); + } + else if (method == VS_INTERLACEMETHOD_RENDER_BOB || + method == VS_INTERLACEMETHOD_NONE) +-- +2.39.2 + diff --git a/projects/RPi/patches/kodi/0010-CDVDVideoCodecDRMPRIME-Support-decoding-to-DRMPRIME-.patch b/projects/RPi/patches/kodi/0010-CDVDVideoCodecDRMPRIME-Support-decoding-to-DRMPRIME-.patch new file mode 100644 index 0000000000..97f403f138 --- /dev/null +++ b/projects/RPi/patches/kodi/0010-CDVDVideoCodecDRMPRIME-Support-decoding-to-DRMPRIME-.patch @@ -0,0 +1,55 @@ +From 88d0dd1bb5be849f2066f92f55bd7d8c80eb7edf Mon Sep 17 00:00:00 2001 +From: Dom Cobley +Date: Tue, 20 Jun 2023 15:13:09 +0100 +Subject: [PATCH 10/12] CDVDVideoCodecDRMPRIME: Support 
decoding to DRMPRIME + with sw deinterlace + +We can map a YUV style DRM_PRIME buffer back to AV_PIX_FMT_YUV420P +to allow subsequent sw deinterlace +--- + .../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 22 +++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +index b614312a77..023334f5db 100644 +--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp ++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +@@ -700,6 +700,9 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, AVPixelForma + if (filters.find("deinterlace") != std::string::npos && pix_fmt == AV_PIX_FMT_YUV420P) + pix_fmt = AV_PIX_FMT_DRM_PRIME; + ++ if (filters.find("bwdif") != std::string::npos && pix_fmt == AV_PIX_FMT_DRM_PRIME) ++ pix_fmt = AV_PIX_FMT_YUV420P; ++ + if (m_pFilterGraph) + FilterClose(); + +@@ -866,6 +869,25 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn() + m_pFrame->data[0] = reinterpret_cast(descriptor); + m_pFrame->format = AV_PIX_FMT_DRM_PRIME; + } ++ // hw decoded buffers submitted to sw decoder need mapping of planes for cpu to access ++ else if (m_pFrame->format == AV_PIX_FMT_DRM_PRIME && m_pFilterGraph && m_pFilterIn->outputs[0]->format == AV_PIX_FMT_YUV420P) ++ { ++ AVFrame *frame = av_frame_alloc(); ++ frame->width = m_pFrame->width; ++ frame->height = m_pFrame->height; ++ frame->format = AV_PIX_FMT_YUV420P; ++ int ret = av_hwframe_map(frame, m_pFrame, (int)AV_HWFRAME_MAP_READ); ++ if (ret < 0) ++ { ++ char err[AV_ERROR_MAX_STRING_SIZE] = {}; ++ av_strerror(ret, err, AV_ERROR_MAX_STRING_SIZE); ++ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::{} - av_hwframe_map failed: {} ({})", ++ __FUNCTION__, err, ret); ++ return VC_ERROR; ++ } ++ av_frame_unref(m_pFrame); ++ av_frame_move_ref(m_pFrame, frame); ++ } + + int ret = av_buffersrc_add_frame(m_pFilterIn, 
m_pFrame); + if (ret < 0) +-- +2.39.2 + diff --git a/projects/RPi/patches/kodi/0011-DVDVideoCodecDRMPRIME-Request-v4l2-buffers-be-alloca.patch b/projects/RPi/patches/kodi/0011-DVDVideoCodecDRMPRIME-Request-v4l2-buffers-be-alloca.patch new file mode 100644 index 0000000000..c75d4c73dc --- /dev/null +++ b/projects/RPi/patches/kodi/0011-DVDVideoCodecDRMPRIME-Request-v4l2-buffers-be-alloca.patch @@ -0,0 +1,30 @@ +From c2ced5695054a42fe4ba8520669d7c69e583e2a1 Mon Sep 17 00:00:00 2001 +From: Dom Cobley +Date: Tue, 20 Jun 2023 15:14:02 +0100 +Subject: [PATCH 11/12] DVDVideoCodecDRMPRIME: Request v4l2 buffers be + allocated through cache + +This is an optional request, but will improve performance of sw deinterlace +if supported. +--- + .../VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +index 023334f5db..0182f30a61 100644 +--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp ++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +@@ -367,6 +367,10 @@ bool CDVDVideoCodecDRMPRIME::Open(CDVDStreamInfo& hints, CDVDCodecOptions& optio + for (auto&& option : options.m_keys) + av_opt_set(m_pCodecContext, option.m_name.c_str(), option.m_value.c_str(), 0); + ++ // this requests v4l2 buffers are allocated through cache. 
It will work if this is not supported, ++ // but subsequent operations like deinterlace may be less efficient ++ av_opt_set(m_pCodecContext->priv_data, "dmabuf_alloc", "cma", 0); ++ + if (avcodec_open2(m_pCodecContext, pCodec, nullptr) < 0) + { + CLog::Log(LOGINFO, "CDVDVideoCodecDRMPRIME::{} - unable to open codec", __FUNCTION__); +-- +2.39.2 + diff --git a/projects/RPi/patches/kodi/0012-DVDVideoCodecDRMPRIME-Add-setting-to-enable-hw-deint.patch b/projects/RPi/patches/kodi/0012-DVDVideoCodecDRMPRIME-Add-setting-to-enable-hw-deint.patch new file mode 100644 index 0000000000..c3534352b4 --- /dev/null +++ b/projects/RPi/patches/kodi/0012-DVDVideoCodecDRMPRIME-Add-setting-to-enable-hw-deint.patch @@ -0,0 +1,123 @@ +From 4eded8af13fe44c12ed2c26e40abfe9e9d08281f Mon Sep 17 00:00:00 2001 +From: Dom Cobley +Date: Wed, 21 Jun 2023 13:16:01 +0100 +Subject: [PATCH 12/12] DVDVideoCodecDRMPRIME: Add setting to enable hw + deinterlace + +HW deinterlace has lower cpu, but may have lower quality, +so allow user to choose appropriate setting. 
+--- + .../resource.language.en_gb/resources/strings.po | 11 +++++++++++ + system/settings/linux.xml | 12 ++++++++++++ + .../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 16 +++++++++++++++- + xbmc/settings/Settings.h | 1 + + 4 files changed, 39 insertions(+), 1 deletion(-) + +diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po +index 062d3afd2b..8c6e31965a 100644 +--- a/addons/resource.language.en_gb/resources/strings.po ++++ b/addons/resource.language.en_gb/resources/strings.po +@@ -7311,6 +7311,11 @@ msgctxt "#13438" + msgid "Allow hardware acceleration with DRM PRIME" + msgstr "" + ++#: system/settings/settings.xml ++msgctxt "#13500" ++msgid "Allow hardware deinterlace with DRM PRIME" ++msgstr "" ++ + #: system/settings/settings.xml + msgctxt "#13439" + msgid "Allow hardware acceleration - MediaCodec" +@@ -19424,6 +19429,12 @@ msgctxt "#36172" + msgid "Enable PRIME decoding of video files" + msgstr "" + ++#. Description of setting with label #13500 "Allow hardware deinterlace - PRIME" ++#: system/settings/settings.xml ++msgctxt "#36290" ++msgid "Enable PRIME hardware deinterlace of video files" ++msgstr "" ++ + #. 
Description of setting with label #14109 "Short date format" + #: system/settings/settings.xml + msgctxt "#36173" +diff --git a/system/settings/linux.xml b/system/settings/linux.xml +index 531974f3f4..c2df62c047 100644 +--- a/system/settings/linux.xml ++++ b/system/settings/linux.xml +@@ -180,6 +180,18 @@ + true + + ++ ++ HAS_GLES ++ false ++ ++ ++ true ++ ++ ++ 3 ++ true ++ ++ + + HAS_GLES + false +diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +index 0182f30a61..cd3b4e89a2 100644 +--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp ++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp +@@ -41,6 +41,7 @@ namespace + { + + constexpr const char* SETTING_VIDEOPLAYER_USEPRIMEDECODERFORHW{"videoplayer.useprimedecoderforhw"}; ++constexpr const char* SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE{"videoplayer.primeallowhwdeinterlace"}; + + static void ReleaseBuffer(void* opaque, uint8_t* data) + { +@@ -149,6 +150,15 @@ void CDVDVideoCodecDRMPRIME::Register() + + setting->SetVisible(true); + ++ setting = settings->GetSetting(SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE); ++ if (!setting) ++ { ++ CLog::Log(LOGERROR, "Failed to load setting for: {}", SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE); ++ return; ++ } ++ ++ setting->SetVisible(true); ++ + CDVDFactoryCodec::RegisterHWVideoCodec("drm_prime", CDVDVideoCodecDRMPRIME::Create); + } + +@@ -650,7 +660,11 @@ void CDVDVideoCodecDRMPRIME::FilterTest(AVPixelFormat pix_fmt) + m_deintFilterName.clear(); + + // look twice, first for DRM_PRIME support, then for actual pixel format +- for (int i=0; i < 2; i++) ++ ++ bool hw = CServiceBroker::GetSettingsComponent()->GetSettings()->GetBool( ++ SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE); ++ ++ for (int i = hw ? 
0 : 1; i < 2; i++) + { + const AVFilter* filter; + void* opaque{}; +diff --git a/xbmc/settings/Settings.h b/xbmc/settings/Settings.h +index a4f91e9f92..e9cb3dc2be 100644 +--- a/xbmc/settings/Settings.h ++++ b/xbmc/settings/Settings.h +@@ -117,6 +117,7 @@ public: + static constexpr auto SETTING_VIDEOPLAYER_USEMEDIACODEC = "videoplayer.usemediacodec"; + static constexpr auto SETTING_VIDEOPLAYER_USEMEDIACODECSURFACE = + "videoplayer.usemediacodecsurface"; ++ static constexpr auto SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE = "videoplayer.primeallowhwdeinterlace"; + static constexpr auto SETTING_VIDEOPLAYER_USEVDPAU = "videoplayer.usevdpau"; + static constexpr auto SETTING_VIDEOPLAYER_USEVDPAUMIXER = "videoplayer.usevdpaumixer"; + static constexpr auto SETTING_VIDEOPLAYER_USEVDPAUMPEG2 = "videoplayer.usevdpaumpeg2"; +-- +2.39.2 + From 3ac1df1390a627fea6e7c7a88e2adfa5504edffc Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Mon, 10 Jul 2023 20:54:33 +0200 Subject: [PATCH 2/2] ffmpeg: update rpi patch Patch created using revisions ea3d24b..120058b from branch dev/6.0/rpi_import_1 of https://github.com/jc-kynesim/rpi-ffmpeg --- .../ffmpeg/patches/rpi/ffmpeg-001-rpi.patch | 3107 ++++++++++++++++- 1 file changed, 2971 insertions(+), 136 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch index 27c1326476..72cacc605c 100644 --- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch +++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch @@ -1,7 +1,7 @@ From 504df93cfe5416b394755e79b7b81ee0119cf09c Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 26 Apr 2021 12:34:50 +0100 -Subject: [PATCH 001/136] Add pi configs and scripts +Subject: [PATCH 001/151] Add pi configs and scripts --- pi-util/BUILD.txt | 59 ++++++++ @@ -1682,7 +1682,7 @@ index 0000000000..5935a11ca5 From f3eaadb27a5bc6db07d33ce0814d796e8cee623e Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 
2021 11:27:39 +0100 -Subject: [PATCH 002/136] Add sand pix fmts & conversion fns +Subject: [PATCH 002/151] Add sand pix fmts & conversion fns --- configure | 3 + @@ -3503,7 +3503,7 @@ index 0000000000..634b55e800 From 89b8d6ac2a886749d4594656083753e682de05a7 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 11:36:47 +0100 -Subject: [PATCH 003/136] Add aarch64 asm sand conv functions +Subject: [PATCH 003/151] Add aarch64 asm sand conv functions Many thanks to eiler.mike@gmail.com (Michael Eiler) for these optimizations @@ -4310,7 +4310,7 @@ index ed0261b02f..1f543e9357 100644 From 247025a42ae09d6c9c5d4128a5e4b288b7b3047c Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 11:56:02 +0100 -Subject: [PATCH 004/136] Add raw encoding for sand +Subject: [PATCH 004/151] Add raw encoding for sand --- libavcodec/raw.c | 6 +++ @@ -4459,7 +4459,7 @@ index 8c577006d9..594a77c42a 100644 From ac6961f424b56563dc793b6bc002a8c04cb1bc36 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 12:02:09 +0100 -Subject: [PATCH 005/136] Deal with the lack of trivial sand cropping +Subject: [PATCH 005/151] Deal with the lack of trivial sand cropping --- fftools/ffmpeg.c | 4 ++-- @@ -4559,7 +4559,7 @@ index 2580269549..3a9d323325 100644 From 9a08431f7790507b0374d9585dfc736000c1bd42 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 12:31:16 +0100 -Subject: [PATCH 006/136] Add an unsand filter +Subject: [PATCH 006/151] Add an unsand filter --- configure | 1 + @@ -4857,7 +4857,7 @@ index 0000000000..7100f2fc9b From 6e61007b19544c573f1c2a4c6060d3d24b8d500e Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 12:37:07 +0100 -Subject: [PATCH 007/136] Reduce mmal compile warnings +Subject: [PATCH 007/151] Reduce mmal compile warnings --- libavcodec/mmaldec.c | 4 ++++ @@ -4889,7 +4889,7 @@ index 3092f58510..6f41b41ac4 100644 From 01aff455665e8f889330519096912ad0005add3c Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 17:56:16 
+0100 -Subject: [PATCH 008/136] Add chroma location to hevc parse +Subject: [PATCH 008/151] Add chroma location to hevc parse --- libavcodec/hevc_parser.c | 13 +++++++++++++ @@ -4948,7 +4948,7 @@ index 567e8d81d4..b6cfea64d3 100644 From c80aad5d2fb373f7564e4257b1272f2decb06dd0 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 26 Sep 2022 18:20:50 +0100 -Subject: [PATCH 009/136] hwaccel: Add .abort_frame & use in hevcdec +Subject: [PATCH 009/151] hwaccel: Add .abort_frame & use in hevcdec --- libavcodec/avcodec.h | 11 +++++++++++ @@ -5000,7 +5000,7 @@ index b6cfea64d3..8a0246fa21 100644 From 317722fd652d9a1c1700319c80fc71acf68ddde6 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 26 Sep 2022 18:26:17 +0100 -Subject: [PATCH 010/136] hwaccel: Add CAP_MT_SAFE for accels that can use +Subject: [PATCH 010/151] hwaccel: Add CAP_MT_SAFE for accels that can use multi-thread --- @@ -5049,7 +5049,7 @@ index d9d5afaa82..2cc89a41f5 100644 From 9005b263450e154a5ec5258fda17d5998fe7896b Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 17:59:08 +0100 -Subject: [PATCH 011/136] Weak link utils +Subject: [PATCH 011/151] Weak link utils --- libavcodec/weak_link.c | 102 +++++++++++++++++++++++++++++++++++++++++ @@ -5199,7 +5199,7 @@ index 0000000000..415b6a27a0 From 824be1710ca96d97c86836fdac0e7dcd28a4b92e Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 19:23:26 +0100 -Subject: [PATCH 012/136] Add v4l2_req V4L2 request H265 drm_prime decode +Subject: [PATCH 012/151] Add v4l2_req V4L2 request H265 drm_prime decode Has the abiliy to switch between kernel API versions at runtime. 
This could be removed later once teher is no chance of usage on an old @@ -10674,7 +10674,7 @@ index 0000000000..f14f594564 From c99a0fe4d59212079de9bed222114abf95f7c989 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 27 Apr 2021 19:30:36 +0100 -Subject: [PATCH 013/136] Add no_cvt_hw option to ffmpeg +Subject: [PATCH 013/151] Add no_cvt_hw option to ffmpeg --- fftools/ffmpeg.c | 6 ++++-- @@ -10744,7 +10744,7 @@ index 055275d813..761db36588 100644 From 27e0c78a2df53fb2337bee4c383cdb58cbbc717e Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 28 Apr 2021 10:16:39 +0100 -Subject: [PATCH 014/136] Add vout_drm +Subject: [PATCH 014/151] Add vout_drm --- configure | 4 + @@ -11457,7 +11457,7 @@ index 0000000000..cfb33ce7c3 From cc536672adf4eefeaec16e9808f583c693ad7819 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 28 Apr 2021 11:34:18 +0100 -Subject: [PATCH 015/136] Add vout_egl +Subject: [PATCH 015/151] Add vout_egl --- configure | 6 + @@ -12357,7 +12357,7 @@ index 0000000000..7b9c610ace From 867bd7c243e66a1c1756878e20df8f35db8025ec Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 28 Apr 2021 12:51:22 +0100 -Subject: [PATCH 016/136] V4L2 stateful rework +Subject: [PATCH 016/151] V4L2 stateful rework --- libavcodec/Makefile | 3 +- @@ -14780,7 +14780,7 @@ index 4944d08511..7f6033ac2c 100644 From 12f8f12326b83dd3c22084f8922705d79a13d195 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 10 Jun 2021 18:46:21 +0100 -Subject: [PATCH 017/136] Fix crash in hw_device_default_name if type not found +Subject: [PATCH 017/151] Fix crash in hw_device_default_name if type not found (NONE) --- @@ -14804,7 +14804,7 @@ index 88fa782470..740a5e7153 100644 From 7f6bce459e683bff3a0b972922fbcc808e9177a6 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 10 Jun 2021 18:59:18 +0100 -Subject: [PATCH 018/136] Allow v4l2m2m to select non-drm_prime output formats +Subject: [PATCH 018/151] Allow v4l2m2m to select non-drm_prime output formats --- libavcodec/v4l2_buffers.c | 2 +- @@ -14871,7 
+14871,7 @@ index 7f6033ac2c..a4b5a4e7e9 100644 From 9b0d964b727d98271f7f2f4dcdbcb1b41a429e2b Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 10 Jun 2021 18:59:38 +0100 -Subject: [PATCH 019/136] Fix YUV420P output from v4l2m2m +Subject: [PATCH 019/151] Fix YUV420P output from v4l2m2m Also put get_width get_height inlines in header as they are generally useful. @@ -14988,7 +14988,7 @@ index 24a9c94864..8f054f2f50 100644 From 14e9b4bf1b34b3d1e1e6a4fc755cc595416e7d7b Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 10 Jun 2021 19:23:44 +0100 -Subject: [PATCH 020/136] Report buffer overflows in v4l2m2m +Subject: [PATCH 020/151] Report buffer overflows in v4l2m2m --- libavcodec/v4l2_buffers.c | 14 ++++++++++---- @@ -15064,7 +15064,7 @@ index 6fe2586627..81aced0c2b 100644 From 072907a7fcf160d12972997d24fdf62641687ea4 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 14 Jun 2021 11:55:16 +0100 -Subject: [PATCH 021/136] Increase V4L2 H264 stateful coded buffer size +Subject: [PATCH 021/151] Increase V4L2 H264 stateful coded buffer size Try to set a min size of frame size / 2 for bitbuffers passed to V4l2. This fixes a few streams that have large I-frames. You would hope @@ -15188,7 +15188,7 @@ index a4b5a4e7e9..1851acbc93 100644 From 6087c8c054e1ff3d2e6e62d5e32705d079928b64 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 28 Jun 2021 12:13:35 +0100 -Subject: [PATCH 022/136] Fix raw video s.t. it respects any remaining cropping +Subject: [PATCH 022/151] Fix raw video s.t. it respects any remaining cropping This fixes the long standing CONFWIN_A conformance test failure for drm. 
--- @@ -15458,7 +15458,7 @@ index 7a9fdbd263..baf18920fa 100644 From 597858c11fbfbe0f54c1b68d9683025929258bc1 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 13 Aug 2021 15:38:28 +0100 -Subject: [PATCH 023/136] Set frame interlace from V4L2 buffer field +Subject: [PATCH 023/151] Set frame interlace from V4L2 buffer field --- libavcodec/v4l2_buffers.c | 12 ++++++++++++ @@ -15498,7 +15498,7 @@ index de31f7ced9..97b8eb1db3 100644 From 05906e2086b5087d615485ec9a09b1493dbb32e1 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 13 Aug 2021 16:11:53 +0100 -Subject: [PATCH 024/136] Fix V4L2 stateful to avoid crash if flush before +Subject: [PATCH 024/151] Fix V4L2 stateful to avoid crash if flush before start --- @@ -15524,7 +15524,7 @@ index a17ae027a6..eb901e8fab 100644 From 7157b6032e759078a7d751e5dd5762970f3d1e8c Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 9 Sep 2021 17:44:13 +0100 -Subject: [PATCH 025/136] Copy properties from frame to v4l2 buffer +Subject: [PATCH 025/151] Copy properties from frame to v4l2 buffer Now copies all the properties in ff_v4l2_buffer_avframe_to_buf that ff_v4l2_buffer_buf_to_avframe copies @@ -15695,7 +15695,7 @@ index 97b8eb1db3..126d2a17f4 100644 From 15415ab226f966fd12e70d79fde3cb80f3d09144 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 17 Nov 2021 16:49:01 +0000 -Subject: [PATCH 026/136] ffmpeg: Do not inc DTS on no decode output +Subject: [PATCH 026/151] ffmpeg: Do not inc DTS on no decode output V4L2 H264 decode has long latency and sometimes spits out a long stream of output without input. In this case incrementing DTS is wrong. 
There @@ -15727,7 +15727,7 @@ index 5dc2cd73c1..ba0c1898cf 100644 From 7bf6c062ed8a1e635aa5722c0072724f236daf00 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 17 Nov 2021 17:32:59 +0000 -Subject: [PATCH 027/136] v4l2_m2m_dec: Adjust timebase if H264 +Subject: [PATCH 027/151] v4l2_m2m_dec: Adjust timebase if H264 Adjust AVCodecContext time_base if H264 in the same way that the software decoder does. @@ -15760,7 +15760,7 @@ index 1851acbc93..aa1e5c1597 100644 From 3cd23a761397ae75ed032c1687da5d6b76ddaaaa Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 17 Nov 2021 17:38:27 +0000 -Subject: [PATCH 028/136] v4l2_m2m_dec: Produce best guess PTSs if none +Subject: [PATCH 028/151] v4l2_m2m_dec: Produce best guess PTSs if none supplied Filter scheduling gets confused by missing PTSs and makes poor guesses @@ -15895,7 +15895,7 @@ index aa1e5c1597..a5a2afbd27 100644 From ee8be1e900f98212b6c4940980cc7a80becfc07c Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 17 Nov 2021 17:59:27 +0000 -Subject: [PATCH 029/136] v4l2_m2m_dec: Try harder to get an initial frame +Subject: [PATCH 029/151] v4l2_m2m_dec: Try harder to get an initial frame If the input Q is full then wait on a short timeout for a capture frame rather than stuffing yet still another frame into the input if we could @@ -15936,7 +15936,7 @@ index a5a2afbd27..b49f470c0a 100644 From 72da14331c2160a12b69d666d493e0e74c5e8914 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 17 Nov 2021 18:04:56 +0000 -Subject: [PATCH 030/136] Add a V4L2 M2M deinterlace filter +Subject: [PATCH 030/151] Add a V4L2 M2M deinterlace filter Add a V4L2 deinterlace filter that will accept DRMPRIME frames. 
@@ -17277,7 +17277,7 @@ index 0000000000..1a933b7e0a From 0fb00e51d1ca40eed22bfc66b7f309fdc56229bc Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 2 Dec 2021 17:49:55 +0000 -Subject: [PATCH 031/136] Put no_pts_rescale in context which makes more sense +Subject: [PATCH 031/151] Put no_pts_rescale in context which makes more sense than an arg --- @@ -17558,7 +17558,7 @@ index b49f470c0a..36754b314a 100644 From 5e36908e6f2f06b68e85873cbcd421c0973f6409 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 8 Dec 2021 15:00:37 +0000 -Subject: [PATCH 032/136] Use bitbuf min size for all streams +Subject: [PATCH 032/151] Use bitbuf min size for all streams --- libavcodec/v4l2_m2m_dec.c | 5 +---- @@ -17589,7 +17589,7 @@ index 36754b314a..48a6810d18 100644 From 5fcbcd31761eea31dc0157793f558eaaadfe2ac3 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 3 Dec 2021 12:54:18 +0000 -Subject: [PATCH 033/136] Track pending frames in v4l2 stateful +Subject: [PATCH 033/151] Track pending frames in v4l2 stateful Track which frames are pending decode in the v4l2 stateful decoder. 
This relies on DTS & PTS having some relationship to reality, so @@ -17847,7 +17847,7 @@ index 48a6810d18..d8ebb466cd 100644 From 6fae7b3f42c8e9e431a59323c0faa6c88fe951d9 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 15 Dec 2021 17:58:21 +0000 -Subject: [PATCH 034/136] Use pending tracking to reduce v4l2 latency +Subject: [PATCH 034/151] Use pending tracking to reduce v4l2 latency If there are more than 5 pending decodes outstanding then add a small timeout to the capture poll to reduce the rate at which frames are @@ -17970,7 +17970,7 @@ index d8ebb466cd..7e7e4729d0 100644 From 175abd2eb961a3718a660e1f9eda08b37b01b309 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 15 Dec 2021 12:23:54 +0000 -Subject: [PATCH 035/136] Allow logger() to take const ctx +Subject: [PATCH 035/151] Allow logger() to take const ctx --- libavcodec/v4l2_buffers.c | 2 +- @@ -18015,7 +18015,7 @@ index 64540a37b3..d3df48aed4 100644 From 21d4f3f644c45084c621cb5aa577169bf5c15017 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 15 Dec 2021 13:00:27 +0000 -Subject: [PATCH 036/136] Track numbere of bufs qed with an atomic +Subject: [PATCH 036/151] Track numbere of bufs qed with an atomic Safer and faster than counting status --- @@ -18089,7 +18089,7 @@ index 4cc164886c..a4176448d5 100644 From b2fa4ab3d63924597b8c3659123b145a786a2c13 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 9 Dec 2021 12:01:25 +0000 -Subject: [PATCH 037/136] Clear pkt_buf on flush +Subject: [PATCH 037/151] Clear pkt_buf on flush --- libavcodec/v4l2_m2m_dec.c | 3 +++ @@ -18113,7 +18113,7 @@ index 7e7e4729d0..09ec496351 100644 From 16cf94cb5e1d11f4c3a6b8a43557383ce78112e0 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 15 Dec 2021 12:52:56 +0000 -Subject: [PATCH 038/136] Rework v4l2 buffer dequeue +Subject: [PATCH 038/151] Rework v4l2 buffer dequeue --- libavcodec/v4l2_context.c | 543 ++++++++++++++++++-------------------- @@ -19150,7 +19150,7 @@ index 09ec496351..e4b6569ba5 100644 From 
a2519f7a512edde7433aced70de4464e21805693 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 9 Dec 2021 18:51:00 +0000 -Subject: [PATCH 039/136] Honor result of ff_get_format if possible +Subject: [PATCH 039/151] Honor result of ff_get_format if possible --- libavcodec/v4l2_m2m_dec.c | 6 +++++- @@ -19185,7 +19185,7 @@ index e4b6569ba5..c9655bcc3b 100644 From a1cd1cb98e48c631392b385ccac5ab7b09bb5ee9 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 14 Dec 2021 16:11:10 +0000 -Subject: [PATCH 040/136] Add an always-reinit quirk +Subject: [PATCH 040/151] Add an always-reinit quirk --- libavcodec/v4l2_context.c | 7 +++++-- @@ -19291,7 +19291,7 @@ index c9655bcc3b..e2b10f5e3a 100644 From 2470968adf0d28bbaf310e782720dd00d57d7bf6 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 4 Jan 2022 16:58:31 +0000 -Subject: [PATCH 041/136] v4l2_buffers: rework flags for keyframe +Subject: [PATCH 041/151] v4l2_buffers: rework flags for keyframe Previously flags could become confused and keyframe info could be lost. This fixes that and removes the duplicate flags field in V4L2Buffer. @@ -19400,7 +19400,7 @@ index c11b5e6863..53b522d43e 100644 From 5dc38f5d088beea4da57e82969643cc831c40cf0 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 22 Mar 2022 11:44:30 +0000 -Subject: [PATCH 042/136] v4l2m2m: Rework decode to wait for missing buffer, +Subject: [PATCH 042/151] v4l2m2m: Rework decode to wait for missing buffer, add dynamic pending Previously receive_frame exited with EAGAIN if no capture buffer @@ -19620,7 +19620,7 @@ index e2b10f5e3a..2e30449dfc 100644 From 33765b769b4301e03f31b65e225fcdb0eff4c0e4 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 25 Mar 2022 15:37:58 +0000 -Subject: [PATCH 043/136] v4l2_m2m2_dec: Avoid loop if unable to resize buffers +Subject: [PATCH 043/151] v4l2_m2m2_dec: Avoid loop if unable to resize buffers If source change signals a buffer size that cannot be honored give up rather than looping indefinitely. 
This happens on Pi if (say) a @@ -19667,7 +19667,7 @@ index 7ddb759810..007a58c8f1 100644 From bb7ad2392ce83149a1ba40ecacb36e051b6bf785 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 25 Mar 2022 18:14:40 +0000 -Subject: [PATCH 044/136] v4l2dec: Improve size/format validation on init +Subject: [PATCH 044/151] v4l2dec: Improve size/format validation on init --- libavcodec/v4l2_m2m_dec.c | 84 ++++++++++++++++++++++++++++++++-- @@ -19809,7 +19809,7 @@ index b0a5930844..76ab0916cd 100644 From 4646b558c0e45f506578a5a452820f55983abc82 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 13 Apr 2022 16:05:56 +0000 -Subject: [PATCH 045/136] v4l2 stateless hevc: Add another API variation for +Subject: [PATCH 045/151] v4l2 stateless hevc: Add another API variation for linux 5.18 This is probably going to be a short lived variation and may end up @@ -20255,7 +20255,7 @@ index f14f594564..ed48d62e2d 100644 From 92160173e701aa7e2f1011e63596e48d15e691a9 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 3 May 2022 12:44:42 +0000 -Subject: [PATCH 046/136] Remove V4l2 frame size check for meson-vdec +Subject: [PATCH 046/151] Remove V4l2 frame size check for meson-vdec --- libavcodec/v4l2_m2m.h | 3 ++- @@ -20315,7 +20315,7 @@ index 8dcadf461b..888ba67fea 100644 From 8ba5576e7fcd24c2f450f0295cc3b6d8e82e8649 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 23 May 2022 18:05:20 +0100 -Subject: [PATCH 047/136] v4l2m2m_dec: Make some error rturns a bit more robust +Subject: [PATCH 047/151] v4l2m2m_dec: Make some error rturns a bit more robust --- libavcodec/v4l2_context.c | 5 ++--- @@ -20384,7 +20384,7 @@ index 888ba67fea..88a341aae2 100644 From aafa5968f8713319be35cf26069c98566d5bf59b Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 24 May 2022 17:02:58 +0000 -Subject: [PATCH 048/136] v4l2m2m_dec: Support in-pkt AV_PKT_DATA_NEW_EXTRADATA +Subject: [PATCH 048/151] v4l2m2m_dec: Support in-pkt AV_PKT_DATA_NEW_EXTRADATA Support packet side-data containing AV_PKT_DATA_NEW_EXTRADATA. 
Should also detect and complain about unexpected streams of empty packets. @@ -20494,7 +20494,7 @@ index 88a341aae2..392a68f0c7 100644 From e9bced67bdb40096d31067d41956276e9e1af11a Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 24 May 2022 20:02:48 +0000 -Subject: [PATCH 049/136] v4l2m2m_dec: Catch repeated Q fulls +Subject: [PATCH 049/151] v4l2m2m_dec: Catch repeated Q fulls --- libavcodec/v4l2_m2m_dec.c | 8 +++++++- @@ -20536,7 +20536,7 @@ index 392a68f0c7..7e17044706 100644 From 0c974e4da2c0311836145f2fd42081d40eb15998 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 25 May 2022 15:22:12 +0000 -Subject: [PATCH 050/136] Remove requirement for epoxy & libudev config options +Subject: [PATCH 050/151] Remove requirement for epoxy & libudev config options --- configure | 26 +++++++++++++++++--------- @@ -20663,7 +20663,7 @@ index 65576846e8..37cea71756 100755 From 9f234d8cbde2829e6a70fd3cb6324998df8a31f3 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 27 May 2022 09:36:51 +0000 -Subject: [PATCH 051/136] hevc: If hwaccel avoid creation of s/w only vars +Subject: [PATCH 051/151] hevc: If hwaccel avoid creation of s/w only vars --- libavcodec/hevc_refs.c | 35 +++++++++++++++++++++-------------- @@ -20801,7 +20801,7 @@ index 2867cb2e16..17f53322fb 100644 From bb2ddc480634141bed9afd3f66e7f63f5091bb2f Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 30 May 2022 17:51:44 +0100 -Subject: [PATCH 052/136] rpi_sand: Add SAND30->NV12 conversion +Subject: [PATCH 052/151] rpi_sand: Add SAND30->NV12 conversion C code only. 
Reworks the hwcontext_drm conversion to use the rpi_sand_fns generic frame convert fn rather than calling the @@ -21023,7 +21023,7 @@ index 634b55e800..462ccb8abd 100644 From b55c351e6954c800229d97dc6c982ca8f998c848 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 1 Jun 2022 17:49:26 +0000 -Subject: [PATCH 053/136] rpi_sand: Add SAND30->NV12 asm for Armv7 & Armv8 +Subject: [PATCH 053/151] rpi_sand: Add SAND30->NV12 asm for Armv7 & Armv8 Also reworks the previous Armv8 SAND30->Y16 function in a slightly more efficient way that makes it look more like the Armv7 version. @@ -21962,7 +21962,7 @@ index 256c3d532f..b6071e2928 100644 From 24c3eef4487a36d5189ecd934b65a7c6a0b53d03 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 7 Jun 2022 14:46:12 +0000 -Subject: [PATCH 054/136] v4l2_m2m_enc: Add the ability to encode DRM_PRIME +Subject: [PATCH 054/151] v4l2_m2m_enc: Add the ability to encode DRM_PRIME frames --- @@ -23337,7 +23337,7 @@ index 9a0837ecf3..05ff6ba726 100644 From 6b437ce70582c67971aa81871a6694a08b709784 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 8 Jun 2022 16:13:31 +0000 -Subject: [PATCH 055/136] v4l2_m2m_dec: Use DTS for best effort PTS if PTS is +Subject: [PATCH 055/151] v4l2_m2m_dec: Use DTS for best effort PTS if PTS is always NO_PTS If we do have DTS but don't have PTS then assume PTS=DTS. @@ -23422,7 +23422,7 @@ index fbbfc81342..485a96f4b4 100644 From ec8d1c2c0b6bd3544e5e30500a167fc31abde17a Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 30 Jun 2022 15:59:23 +0000 -Subject: [PATCH 056/136] v4l2: Update H265 request for current API +Subject: [PATCH 056/151] v4l2: Update H265 request for current API This works with v9 of the H265 patch set which hopefully will be the last one. 
Hevc controls extracted from patched v4l2-controls into @@ -24211,7 +24211,7 @@ index ed48d62e2d..d4adb3f812 100644 From 21a348ae3282318fa96d3a6e2c70f3d4b90a7d52 Mon Sep 17 00:00:00 2001 From: John Cox Date: Sun, 3 Jul 2022 13:40:41 +0000 -Subject: [PATCH 057/136] v4l2_req: Observe limit on size of slice_array +Subject: [PATCH 057/151] v4l2_req: Observe limit on size of slice_array This in fact provides some minor simplifications by combing the multi-slice and single-slice paths. @@ -24342,7 +24342,7 @@ index d4adb3f812..0029e23309 100644 From 4f1d74cc8eea6a1bd6f2317a10c0ecf620315dec Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 4 Jul 2022 14:43:20 +0100 -Subject: [PATCH 058/136] v4l2_req: Add entry point offsets array control +Subject: [PATCH 058/151] v4l2_req: Add entry point offsets array control --- libavcodec/v4l2_req_hevc_vx.c | 88 +++++++++++++++++++++++++++------- @@ -24580,7 +24580,7 @@ index 0029e23309..99c90064ea 100644 From d0e5ed2dff1b8f8909ceb968cb3afe2b20093fda Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 4 Jul 2022 16:22:54 +0100 -Subject: [PATCH 059/136] v4l2_req: Support Annex B +Subject: [PATCH 059/151] v4l2_req: Support Annex B --- libavcodec/v4l2_req_hevc_vx.c | 61 +++++++++++++++++++++++------------ @@ -24694,7 +24694,7 @@ index 43ef6631ed..5e0db9850a 100644 From a75506e18a964c9f50efa224a3fa4179c9ef2127 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 4 Jul 2022 18:24:03 +0100 -Subject: [PATCH 060/136] v4l2_req: Add frame mode decode +Subject: [PATCH 060/151] v4l2_req: Add frame mode decode --- libavcodec/v4l2_req_hevc_vx.c | 69 +++++++++++++++++++++++------------ @@ -24820,7 +24820,7 @@ index 5e0db9850a..ada53d0d44 100644 From 9cf01f1485dcf71bcad7981d45029425d9abf115 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 5 Jul 2022 12:54:22 +0000 -Subject: [PATCH 061/136] v4l2_req: Fix probe for frame based decode +Subject: [PATCH 061/151] v4l2_req: Fix probe for frame based decode --- libavcodec/v4l2_req_hevc_vx.c | 33 
+++++++++++++++++++++++---------- @@ -24903,7 +24903,7 @@ index ada53d0d44..5d083016f8 100644 From e7a62226f26073149d35c89268f56e17c8f45d76 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 26 Jul 2022 15:46:14 +0000 -Subject: [PATCH 062/136] vf_deinterlace_v4l2m2m: Support NV12 through +Subject: [PATCH 062/151] vf_deinterlace_v4l2m2m: Support NV12 through deinterlace Supports NV12 (though not yet NV12M) through deinterlace. @@ -25229,7 +25229,7 @@ index 1a933b7e0a..1a3bef5bcb 100644 From 3d07826bcf588ad0384d00b210415664aa4489fb Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 19 Aug 2022 15:29:11 +0000 -Subject: [PATCH 063/136] v4l2_req: Enable use of MMAP for buffer alloc +Subject: [PATCH 063/151] v4l2_req: Enable use of MMAP for buffer alloc Use MMAP rather than DMABUF if either the dmabuf device can't be opened or create_buf doesn't set the capability. @@ -25961,7 +25961,7 @@ index cd79aad563..5cf17dd5e3 100644 From 79c2fcac56586ce9eea0cc8c6b13d2cd54f3e468 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 22 Aug 2022 12:35:40 +0000 -Subject: [PATCH 064/136] Set buffer lengths on DQ +Subject: [PATCH 064/151] Set buffer lengths on DQ --- libavcodec/v4l2_req_media.c | 8 ++++++++ @@ -25990,7 +25990,7 @@ index 910ac77bb6..1a9944774a 100644 From 8f3245ca1e4b2ec7e13fc2f3bffbc964ee8fc290 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 22 Aug 2022 17:11:24 +0000 -Subject: [PATCH 065/136] Fix compile if videodev2.h defines V4L2 HEVC request +Subject: [PATCH 065/151] Fix compile if videodev2.h defines V4L2 HEVC request API If videodev2.h does define the HEVC request API it is really hard to @@ -26117,7 +26117,7 @@ index 5cf17dd5e3..614a1b4d99 100644 From 35ec6af32c4f05b076f84ab343a8fc0d3263ba44 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 12 Sep 2022 17:59:22 +0100 -Subject: [PATCH 066/136] v4l2_m2m_enc: Send headers in in pkt side_data +Subject: [PATCH 066/151] v4l2_m2m_enc: Send headers in in pkt side_data If GLOBAL_HEADERS are requested then we can't 
provide them at init time so send as NEW_EXTRADATA side data in a similar way to some AV1 @@ -26198,7 +26198,7 @@ index 05ff6ba726..099ad23928 100644 From dfc754491cea9192945b92ca9c8d3919321e30ad Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 14 Sep 2022 15:44:10 +0000 -Subject: [PATCH 067/136] matroskaenc: Allow H264 SPS/PPS headers in packet +Subject: [PATCH 067/151] matroskaenc: Allow H264 SPS/PPS headers in packet sidedata --- @@ -26267,7 +26267,7 @@ index 113541bd9a..61e4c976ef 100644 From 30c6ca4e24ae2acbd7f7f122f5275beb62b625c6 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 14 Sep 2022 15:55:15 +0000 -Subject: [PATCH 068/136] movenc: Allow H264 SPS/PPS headers in packet sidedata +Subject: [PATCH 068/151] movenc: Allow H264 SPS/PPS headers in packet sidedata --- libavformat/movenc.c | 1 + @@ -26289,7 +26289,7 @@ index c4fcb5f8b1..891adbf7b2 100644 From 1c7c3e99e9ed90f241aecbe7b2269229587d1e03 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 26 Sep 2022 12:45:05 +0100 -Subject: [PATCH 069/136] Allow ffmpeg to select codec internal hwfmts if +Subject: [PATCH 069/151] Allow ffmpeg to select codec internal hwfmts if no_cvt_hw This allows the selection of DRM_PRIME from v4l2m2m without forcing it @@ -26326,7 +26326,7 @@ index ba0c1898cf..839da7b472 100644 From ecf273fd02e8aafe8775b1f291b9664b1b49572e Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 1 Sep 2022 11:42:41 +0000 -Subject: [PATCH 070/136] vf_deinterlace_v4l2m2m: Add a v4l2m2m scaler +Subject: [PATCH 070/151] vf_deinterlace_v4l2m2m: Add a v4l2m2m scaler The logic for running an isp based scaler is pretty much identical to that for the deinterlacer so add to the deinterlacer. 
This requires @@ -27809,7 +27809,7 @@ index 1a3bef5bcb..2df39ec0f1 100644 From 7e7147d50bc6e3f13834525dba3a47d170422f07 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 22 Sep 2022 14:54:46 +0000 -Subject: [PATCH 071/136] v4l2_m2m: Adjust buffer allocation based on min/max +Subject: [PATCH 071/151] v4l2_m2m: Adjust buffer allocation based on min/max controls Clip requested buffer count to min/max declared by driver. @@ -27861,7 +27861,7 @@ index 6b97eab41e..ba36689ff3 100644 From b69a2707a192ac509174899233a094373a3f5dc9 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 22 Sep 2022 15:00:12 +0000 -Subject: [PATCH 072/136] v4l2_m2m_dec: If src Q is full then wait indefinitely +Subject: [PATCH 072/151] v4l2_m2m_dec: If src Q is full then wait indefinitely for buffer If it is not possible to add another buffer to the src Q then alawys @@ -27894,7 +27894,7 @@ index 485a96f4b4..bb183097f6 100644 From b1d37be81bbf683a0eb16923c9b9f045fd0ea0c0 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 22 Sep 2022 15:12:27 +0000 -Subject: [PATCH 073/136] vf_deinterlace_v4l2m2m: Add Q name to structure for +Subject: [PATCH 073/151] vf_deinterlace_v4l2m2m: Add Q name to structure for debug --- @@ -27928,7 +27928,7 @@ index 2df39ec0f1..4edecc02bf 100644 From 794a5bfc3ec74fdc7664508a287a075708d5deef Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 22 Sep 2022 16:08:42 +0000 -Subject: [PATCH 074/136] v4l2_m2m_enc: Set src buffer count to min+2 by +Subject: [PATCH 074/151] v4l2_m2m_enc: Set src buffer count to min+2 by default Set output.num_buffers to 0 by default which will then be set to min+2 @@ -27960,7 +27960,7 @@ index 099ad23928..b8ba815c37 100644 From 85c42743046a05b347f33b1933e6d52ea1d17e00 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 22 Sep 2022 16:13:57 +0000 -Subject: [PATCH 075/136] vf_deinterlace_m2m: For deinterlace set outlink FR to +Subject: [PATCH 075/151] vf_deinterlace_m2m: For deinterlace set outlink FR to twice inlink We used to set the outlink framerate 
to unknown but it turns out that @@ -27997,7 +27997,7 @@ index 4edecc02bf..c52dae1c44 100644 From 34a24bc0b0d427c75659d3907cb75afb6a9dc255 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 23 Sep 2022 11:30:56 +0000 -Subject: [PATCH 076/136] v4l2m2m: Add ff_v4l2_dq_all to drain all buffers from +Subject: [PATCH 076/151] v4l2m2m: Add ff_v4l2_dq_all to drain all buffers from a Q Useful for where (encode) we might have drmprime buffers that we want to @@ -28055,7 +28055,7 @@ index 21265f1bd7..523c53e97d 100644 From 95dfc168c74f7b0f282c1b2ad9deb8fba10a7ce5 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 23 Sep 2022 11:38:36 +0000 -Subject: [PATCH 077/136] v4l2_m2m_enc: DQ output more frequently +Subject: [PATCH 077/151] v4l2_m2m_enc: DQ output more frequently Ensure that we DQ any released src buffers on every op to avoid deadlock with source. @@ -28114,7 +28114,7 @@ index b8ba815c37..a992a3cccc 100644 From a40b1c38b0615fce0c0d9eb97510ab9e77b3e1ac Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 26 Sep 2022 18:20:00 +0100 -Subject: [PATCH 078/136] conf_native: Remove --enable-rpi from all builds +Subject: [PATCH 078/151] conf_native: Remove --enable-rpi from all builds --- pi-util/conf_native.sh | 5 +++-- @@ -28148,7 +28148,7 @@ index 37cea71756..f22d531ca4 100755 From 8fddfc8f1e3c95caded18705ed29be0ae95517bc Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 29 Sep 2022 19:48:08 +0000 -Subject: [PATCH 079/136] v4l2_m2m_dec: Deal correctly with avcC H264 data in +Subject: [PATCH 079/151] v4l2_m2m_dec: Deal correctly with avcC H264 data in extradata Decoders expect AnnexB style headers, mkv and similar formats have @@ -28391,7 +28391,7 @@ index bb183097f6..6bd9926b3f 100644 From 70227ebbc2999bc49075a3b683392d94618ecd89 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 30 Sep 2022 14:20:23 +0000 -Subject: [PATCH 080/136] v4l2_request_hevc: Fix up +Subject: [PATCH 080/151] v4l2_request_hevc: Fix up V4L2_CID_CODEC_STATELESS_BASE if missing --- @@ -28420,7 +28420,7 
@@ index 7829d82084..c02fdbe5a8 100644 From 22d2000382839dbd04588af1bb20cc9d9b3a4362 Mon Sep 17 00:00:00 2001 From: John Cox Date: Sat, 1 Oct 2022 13:40:57 +0000 -Subject: [PATCH 081/136] vf_deinterlace_v4l2m2m: Fix compile on m/c without +Subject: [PATCH 081/151] vf_deinterlace_v4l2m2m: Fix compile on m/c without V4L2 SAND --- @@ -28554,7 +28554,7 @@ index c52dae1c44..716789f988 100644 From f06f9ee41bf0f6f74240503f0cb427328cf6792f Mon Sep 17 00:00:00 2001 From: John Cox Date: Sun, 2 Oct 2022 12:36:43 +0000 -Subject: [PATCH 082/136] configure: Fix v4l2_req_hevc_vx setup; set after deps +Subject: [PATCH 082/151] configure: Fix v4l2_req_hevc_vx setup; set after deps fixups --- @@ -28592,7 +28592,7 @@ index 5c00a183e3..94c8161b91 100755 From 7d7709fb68561711f893269227147974fd6a46f3 Mon Sep 17 00:00:00 2001 From: John Cox Date: Sat, 1 Oct 2022 12:39:45 +0000 -Subject: [PATCH 083/136] vf_deinterlace_v4l2m2m: Ensure we get consistent +Subject: [PATCH 083/151] vf_deinterlace_v4l2m2m: Ensure we get consistent final frames On getting EOS at the input of the filster do not simply drop everything @@ -28944,7 +28944,7 @@ index 716789f988..ce875c2c61 100644 From f893891df8f4e7738b2d9b49df4386fb160eb25f Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 5 Oct 2022 16:12:02 +0000 -Subject: [PATCH 084/136] v4l2_m2m_dec: Rework decode pending heuristic +Subject: [PATCH 084/151] v4l2_m2m_dec: Rework decode pending heuristic The old code measured the length of the entire Q in the decoder and attempted to dynamically guess an appropriate length. This was prone to @@ -29115,7 +29115,7 @@ index 6bd9926b3f..bec9b22fcf 100644 From 7048e7e6b8621cf09b96cc7e44b8d82ba8619913 Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 21 Oct 2022 13:48:07 +0000 -Subject: [PATCH 085/136] pthread_frame: Fix MT hwaccel. Recent change broke +Subject: [PATCH 085/151] pthread_frame: Fix MT hwaccel. Recent change broke it. 
Revert the effects of 35aa7e70e7ec350319e7634a30d8d8aa1e6ecdda if the @@ -29222,7 +29222,7 @@ index 2cc89a41f5..b14f8e9360 100644 From 033056bd8ec63b16fe081446f70f41b5d5789b81 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 18 Oct 2022 13:18:27 +0000 -Subject: [PATCH 086/136] v4l2_req: Add swfmt to init logging +Subject: [PATCH 086/151] v4l2_req: Add swfmt to init logging (cherry picked from commit dfa03b702baaf2952bcd2bbf8badcc2f9c961ddf) --- @@ -29259,7 +29259,7 @@ index 614a1b4d99..767ecb036a 100644 From 70779e742b93015e3e8aaa8f945a12d35917844d Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 18 Oct 2022 13:39:54 +0000 -Subject: [PATCH 087/136] v4l2_m2m: Avoid polling on a queue that is streamoff +Subject: [PATCH 087/151] v4l2_m2m: Avoid polling on a queue that is streamoff (cherry picked from commit b2658bc56d3034a17db7f39597fc7d71bfe9a43b) --- @@ -29304,7 +29304,7 @@ index 4a359bf45e..b296dc111c 100644 From 438fed3702eb689f836c885ebbd813e48d4d4c4a Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 18 Oct 2022 14:07:04 +0000 -Subject: [PATCH 088/136] v4l2_m2m: Add function to get number of queued +Subject: [PATCH 088/151] v4l2_m2m: Add function to get number of queued buffers (cherry picked from commit f9ac6485c00b4531dcff354222aef450b29728f4) @@ -29336,7 +29336,7 @@ index 523c53e97d..8e4f681643 100644 From 95ff4a65ed4c88ea7e02ee55e260e37a0ce2ba88 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 18 Oct 2022 14:48:20 +0000 -Subject: [PATCH 089/136] v4l2_m2m: Add timeouts to dq_all and dequeue_packet +Subject: [PATCH 089/151] v4l2_m2m: Add timeouts to dq_all and dequeue_packet Add timeouts and use them to have better flow control in encode @@ -29505,7 +29505,7 @@ index a992a3cccc..d0d27e5bc2 100644 From e6654c1997a6f4dfd43b0f74b0168f5d644c1c74 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 18 Oct 2022 14:23:32 +0000 -Subject: [PATCH 090/136] v4l2_m2m_enc: Improve debug trace +Subject: [PATCH 090/151] v4l2_m2m_enc: Improve debug trace (cherry picked 
from commit 113e89daffb329a0cd3d920abd483a4025664bf5) --- @@ -29565,7 +29565,7 @@ index d0d27e5bc2..c8c2de3d47 100644 From 02dca2b845125af7ec6dfb68bdc34726a45fee9c Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 18 Oct 2022 13:22:36 +0000 -Subject: [PATCH 091/136] v4l2_m2m_enc: Copy dest packets to memory if short of +Subject: [PATCH 091/151] v4l2_m2m_enc: Copy dest packets to memory if short of v4l2 buffers (cherry picked from commit aa4ebbda400b42db952fc713b26927fc8636b0e5) @@ -29604,7 +29604,7 @@ index c8c2de3d47..c23187e6e6 100644 From ced9a7d442a04be08fc23e0af310312299a5d5a0 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 19 Oct 2022 11:00:16 +0000 -Subject: [PATCH 092/136] v4l2_m2m_dec: Fix pts_best_effort guessing for +Subject: [PATCH 092/151] v4l2_m2m_dec: Fix pts_best_effort guessing for initial pts (cherry picked from commit 1af32e5c87586a0f7e76cdf19a012ddbcf3eac67) @@ -29629,7 +29629,7 @@ index bec9b22fcf..47b2735f82 100644 From 3e3cf6ed7280d8ad4f3eed17a6d18c2df3c0cd31 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 19 Oct 2022 14:47:04 +0000 -Subject: [PATCH 093/136] v4l2_m2m_enc: Wait for frame or space in src Q in +Subject: [PATCH 093/151] v4l2_m2m_enc: Wait for frame or space in src Q in rx_pkt If receive_packet we should ensure that there is space in the source Q @@ -29691,7 +29691,7 @@ index c23187e6e6..524e9424a5 100644 From de9ec2bf6421b199aad9ea9dc7896a46c8813d94 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 19 Oct 2022 14:54:29 +0000 -Subject: [PATCH 094/136] vf_deinterlace_v4l2m2m: Print dts rather that NOPTS +Subject: [PATCH 094/151] vf_deinterlace_v4l2m2m: Print dts rather that NOPTS in trace (cherry picked from commit e9b468f35f0c6ad9bfe96f5a05e449afa8ae074a) @@ -29718,7 +29718,7 @@ index ce875c2c61..7c6751b69c 100644 From d71a0a173240e18d518ae0b921ac43849524bd66 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 19 Oct 2022 14:55:21 +0000 -Subject: [PATCH 095/136] vf_deinterlace_v4l2m2m: Ignore "wanted" when +Subject: [PATCH 
095/151] vf_deinterlace_v4l2m2m: Ignore "wanted" when processing input If we gate send a frame to the outlink on its frame_wanted flag then we @@ -29751,7 +29751,7 @@ index 7c6751b69c..a173a291f8 100644 From 842e0a00288f9a2a862720990791b8eca9546955 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 19 Oct 2022 15:00:43 +0000 -Subject: [PATCH 096/136] conf_native: Add --enable-gpl +Subject: [PATCH 096/151] conf_native: Add --enable-gpl (cherry picked from commit bab9bf4a2e39391940d88af2ce5d70236ac21f15) --- @@ -29774,7 +29774,7 @@ index f22d531ca4..082d9b5832 100755 From bf9aaf30818308a4651e00a2a64a0f65dc9a36e5 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 15 Nov 2022 13:33:00 +0000 -Subject: [PATCH 097/136] egl_vout: Make formatting consistent - no code +Subject: [PATCH 097/151] egl_vout: Make formatting consistent - no code changes --- @@ -30758,7 +30758,7 @@ index 7b9c610ace..a52cabb082 100644 From 4d3a3973a07994b0a6ec35626e514fc40f439fe3 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 12 Dec 2022 16:49:43 +0000 -Subject: [PATCH 098/136] v4l2m2m: reporganise get_raw_format for loop logic +Subject: [PATCH 098/151] v4l2m2m: reporganise get_raw_format for loop logic --- libavcodec/v4l2_context.c | 16 +++++----------- @@ -30806,7 +30806,7 @@ index 7031f3d340..79a31cf930 100644 From 123c5ef429ec6bd7d1875d621df88bb2ad7af0bd Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 12 Dec 2022 17:49:12 +0000 -Subject: [PATCH 099/136] drm_vout: Set zpos on the plane we pick to ensure it +Subject: [PATCH 099/151] drm_vout: Set zpos on the plane we pick to ensure it is at the front --- @@ -30876,7 +30876,7 @@ index cfb33ce7c3..9bd9e04421 100644 From 0ee1c3b41774d05595376f8d25de2a901dbb12c7 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 12 Dec 2022 17:51:46 +0000 -Subject: [PATCH 100/136] drm_vout: Only set modifier flag and pass modifiers +Subject: [PATCH 100/151] drm_vout: Only set modifier flag and pass modifiers if there are some --- @@ -30936,7 +30936,7 @@ index 
9bd9e04421..a56adea866 100644 From 4534e6981c1718eaeec4c5f58cdf5592ee7f0329 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 12 Dec 2022 17:52:58 +0000 -Subject: [PATCH 101/136] drm_vout: Fix typo in error message +Subject: [PATCH 101/151] drm_vout: Fix typo in error message --- libavdevice/drm_vout.c | 2 +- @@ -30959,7 +30959,7 @@ index a56adea866..351abf1d60 100644 From 0469d1fb132a0d55593611c56e83733efe58045b Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 12 Dec 2022 18:00:41 +0000 -Subject: [PATCH 102/136] drm_vout: Add option to name the drm_module to use +Subject: [PATCH 102/151] drm_vout: Add option to name the drm_module to use --- libavdevice/drm_vout.c | 8 +++++--- @@ -31012,7 +31012,7 @@ index 351abf1d60..491e1dc608 100644 From 61cb9fc3ce06e0ecaeeec3add143bc3a82956853 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 13:01:00 +0000 -Subject: [PATCH 103/136] dmabufs: Rework to allow for non-CMA backends +Subject: [PATCH 103/151] dmabufs: Rework to allow for non-CMA backends --- libavcodec/v4l2_req_dmabufs.c | 161 ++++++++++++++++++++++++---------- @@ -31266,7 +31266,7 @@ index c4bbed18c6..1c3a5e861f 100644 From 288807720443bbddf4c83c3589d1877c7fd418c3 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 13:07:58 +0000 -Subject: [PATCH 104/136] dmabufs: Use unref rather than deleet on cmabufs_ctl +Subject: [PATCH 104/151] dmabufs: Use unref rather than deleet on cmabufs_ctl --- libavcodec/v4l2_req_dmabufs.c | 12 +++++++++++- @@ -31354,7 +31354,7 @@ index 767ecb036a..db7ed13b6d 100644 From 9115f40c5f55873102312085f2e328d1a2101ae4 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 14:21:40 +0000 -Subject: [PATCH 105/136] egl_vout: Remove redundant & completely broken debug +Subject: [PATCH 105/151] egl_vout: Remove redundant & completely broken debug --- libavdevice/egl_vout.c | 25 ------------------------- @@ -31400,7 +31400,7 @@ index a52cabb082..afc7afd13e 100644 From 34711d5a1429213b6f4cf8ad163e8e8d108626e7 Mon 
Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 16:12:12 +0000 -Subject: [PATCH 106/136] v4l2m2m: Use offset from querybuf rather than always +Subject: [PATCH 106/151] v4l2m2m: Use offset from querybuf rather than always 0 --- @@ -31455,7 +31455,7 @@ index 1ac32c5989..d91d5d1dd0 100644 From 15458be3fe79c14f4fdcc2ad786508d1b647c914 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 17:57:27 +0000 -Subject: [PATCH 107/136] v4l2m2m: Fix crash if init errors out before setting +Subject: [PATCH 107/151] v4l2m2m: Fix crash if init errors out before setting avctx --- @@ -31479,7 +31479,7 @@ index 1e30d15fd8..ac6bae0dc3 100644 From 9f7f94c680b8aaedede9b3bcad37b645216cfcff Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 18:10:30 +0000 -Subject: [PATCH 108/136] v4l2_buffers: Add and use ctx_to_m2mctx + error debug +Subject: [PATCH 108/151] v4l2_buffers: Add and use ctx_to_m2mctx + error debug --- libavcodec/v4l2_buffers.c | 22 +++++++++++++++------- @@ -31546,7 +31546,7 @@ index 5ca58ea593..e28ef2d1e8 100644 From 6b8bb2c41828351cd3a6f40be353696ae36450b7 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 18:53:22 +0000 -Subject: [PATCH 109/136] v4l2m2m: Add ability to use cma alloced dmabufs as +Subject: [PATCH 109/151] v4l2m2m: Add ability to use cma alloced dmabufs as well as v4l2 mmap --- @@ -31807,7 +31807,7 @@ index 47b2735f82..4d17057298 100644 From 499bcdc4ed82c737ceab166a07b46e8ed8ccbc88 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Dec 2022 19:05:47 +0000 -Subject: [PATCH 110/136] testfilt: Skeleton of hw filter test code +Subject: [PATCH 110/151] testfilt: Skeleton of hw filter test code --- pi-util/testfilt.py | 83 +++++++++++++++++++++++++++++++++++++++++++++ @@ -31907,7 +31907,7 @@ index 0000000000..b322dac0c2 From 50ac318a472fd98e1e58605316ea6a2e8cde0a04 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 5 Jan 2023 14:39:30 +0000 -Subject: [PATCH 111/136] pixfmt: Add a #define to indicate presence of SAND 
+Subject: [PATCH 111/151] pixfmt: Add a #define to indicate presence of SAND formats --- @@ -31931,7 +31931,7 @@ index 22f70007c3..5cc780e7d5 100644 From 23a3132e094d449ea05657704c0cffc3f0762c28 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 11 Jan 2023 16:30:37 +0000 -Subject: [PATCH 112/136] v4l2_m2m_dec: Fix initial pkt send if no extradata +Subject: [PATCH 112/151] v4l2_m2m_dec: Fix initial pkt send if no extradata --- libavcodec/v4l2_m2m_dec.c | 4 ++-- @@ -31963,7 +31963,7 @@ index 4d17057298..9daf05adfe 100644 From f4f6b9f1af137153e574c704804033e83f2ed1a8 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 16 Jan 2023 16:05:09 +0000 -Subject: [PATCH 113/136] v4l2m2m_dec: Make capture timeout long once pending +Subject: [PATCH 113/151] v4l2m2m_dec: Make capture timeout long once pending count > 31 For some applications (ffmpeg command line) the current heuristic of adding @@ -32060,7 +32060,7 @@ index 9daf05adfe..c8ab883d7e 100644 From 39f49cdaefa4483914f703c3f352c8894b3b81fd Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 6 Feb 2023 19:23:16 +0000 -Subject: [PATCH 114/136] Initial buffersink alloc callback code +Subject: [PATCH 114/151] Initial buffersink alloc callback code (cherry picked from commit dde8d3c8f3cc279b9b92ed4f10a2e3990f4aadeb) --- @@ -32155,7 +32155,7 @@ index 64e08de53e..09737d322f 100644 From a63ae21e74ae48f1aedac53c18142b7596d041ad Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 30 Jan 2023 17:23:12 +0000 -Subject: [PATCH 115/136] v4l2_m2m_dec: Add a profile check +Subject: [PATCH 115/151] v4l2_m2m_dec: Add a profile check Check the profile in avctx aginst what the v4l2 driver advertises. If the driver doesn't support the check then just accept anything. 
@@ -32312,7 +32312,7 @@ index c8ab883d7e..098adf4821 100644 From f734a6ead04a8381fccfae53066866a02a9516d2 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 1 Feb 2023 17:24:39 +0000 -Subject: [PATCH 116/136] v4l2_m2m_dec: Add extradata parse for h264 & hevc +Subject: [PATCH 116/151] v4l2_m2m_dec: Add extradata parse for h264 & hevc If we have extradata we can extract profile & level and potentailly other useful info from it. Use the codec parser to get it if the decoder @@ -32443,7 +32443,7 @@ index 098adf4821..e64bc707d3 100644 From e28421e397743a94f5e37327ad234f59b6ae613d Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 20 Mar 2023 18:12:51 +0000 -Subject: [PATCH 117/136] clean_usr_libs: Now wipes the include files too +Subject: [PATCH 117/151] clean_usr_libs: Now wipes the include files too When swapping ffmpeg versions obsolete makefiles could confuse configure utilities. @@ -32480,7 +32480,7 @@ index b3b2d5509d..01bd6a6a22 100755 From dcabd30310b88b45359609bac27d5d0f9bbc6dc1 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 20 Mar 2023 18:15:08 +0000 -Subject: [PATCH 118/136] vulkan: Add missing decode extension defines +Subject: [PATCH 118/151] vulkan: Add missing decode extension defines When building on bookworm the video decode extension names were missing. This adds them. I expect this patch will be @@ -32512,7 +32512,7 @@ index 2a9b5f4aac..11e7945f18 100644 From 0231c208843a5badc799590eb5b9de907d1c26b2 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 21 Mar 2023 14:20:05 +0000 -Subject: [PATCH 119/136] v4l2_m2m_dec: Fix config file for finding if decoder +Subject: [PATCH 119/151] v4l2_m2m_dec: Fix config file for finding if decoder enabled Fixes parsing of extradata for profile testing. 
5.x changed where that @@ -32538,7 +32538,7 @@ index e64bc707d3..91136f03da 100644 From 822baefed69372b3380144ab44226e2c6ad3e298 Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 21 Mar 2023 14:23:20 +0000 -Subject: [PATCH 120/136] v4l2_m2m_dec: Display profile given if skipped in +Subject: [PATCH 120/151] v4l2_m2m_dec: Display profile given if skipped in debug --- @@ -32562,7 +32562,7 @@ index 91136f03da..d124c7b1fc 100644 From 6859fc2a8791c0fcc25851b77fed15a691ceb332 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 22 Mar 2023 16:08:08 +0000 -Subject: [PATCH 121/136] conf_native: Fix for 64-bit kernel with 32-bit +Subject: [PATCH 121/151] conf_native: Fix for 64-bit kernel with 32-bit userspace (cherry picked from commit 5bb1e09cea95b4215c6904b9b1a726e83bc5d327) @@ -32618,7 +32618,7 @@ index 082d9b5832..0a7d230f1b 100755 From c35f074854a922c0c025159ddddd1abfc562a3d2 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 20 Apr 2023 11:48:25 +0000 -Subject: [PATCH 122/136] conf_native: Add install prefix variation +Subject: [PATCH 122/151] conf_native: Add install prefix variation (cherry picked from commit 73c3019b534cb8f4b4e4c21995653f6ce440086d) --- @@ -32732,7 +32732,7 @@ index 0a7d230f1b..f0ed159594 100755 From 91ea652a95370a428f1353932b2a55dae7158acc Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 19 Apr 2023 10:47:58 +0000 -Subject: [PATCH 123/136] swcale: Add explicit bgr24->yv12 conversion +Subject: [PATCH 123/151] swcale: Add explicit bgr24->yv12 conversion (cherry picked from commit 9a22d429f46a038321c66a0cd54737177641b434) --- @@ -32890,7 +32890,7 @@ index 9af2e7ecc3..9047030ae4 100644 From 207ea47b2153b276b53cd5a87528dbc532a9f551 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 20 Apr 2023 11:26:10 +0000 -Subject: [PATCH 124/136] swscale: Add unscaled XRGB->YUV420P functions +Subject: [PATCH 124/151] swscale: Add unscaled XRGB->YUV420P functions (cherry picked from commit 04cc32ee3f390de513ad8c6156c0c66b2c60abc8) --- @@ -33222,7 +33222,7 @@ index 
9047030ae4..053c06adf5 100644 From b5672a2d361ec4f064ae116a3452282996cc87a0 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 20 Apr 2023 11:35:44 +0000 -Subject: [PATCH 125/136] swscale: Add aarch64 unscaled RGB24->YUV420P +Subject: [PATCH 125/151] swscale: Add aarch64 unscaled RGB24->YUV420P (cherry picked from commit 0cf416312095ce5bea3d2f7e9b14736d4b3ed160) --- @@ -33480,7 +33480,7 @@ index d81110ec57..8cf40b65f5 100644 From f62603136ee2eaf781519bd70e445b03f80960da Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 27 Apr 2023 13:03:52 +0000 -Subject: [PATCH 126/136] rgb2rgb: Fix rgb24->yuv420p with arbitrary wxh +Subject: [PATCH 126/151] rgb2rgb: Fix rgb24->yuv420p with arbitrary wxh (cherry picked from commit 58771fdf0218dc670d8a343824f540e2f6e8785d) --- @@ -34010,7 +34010,7 @@ index 8cf40b65f5..978ab443ea 100644 From cf020c89ac47620c4a5390d0333e9ea70fbfa7b8 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 26 Apr 2023 15:36:07 +0000 -Subject: [PATCH 127/136] rgb2rgb: Use asm unconditionally +Subject: [PATCH 127/151] rgb2rgb: Use asm unconditionally (cherry picked from commit 7c216c0804836b31c0ea093bb1dde5ab387724b1) --- @@ -34074,7 +34074,7 @@ index f10c4ef2de..6a0e2dcc09 100644 From 1895fdcaf403f403736ab52d1cb69dce7c964b66 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 27 Apr 2023 13:01:43 +0000 -Subject: [PATCH 128/136] tests/swscale: Add options for width and height on +Subject: [PATCH 128/151] tests/swscale: Add options for width and height on the command line (cherry picked from commit eb8a09779688fc05bf204fdfcd063b04cda07271) @@ -34233,7 +34233,7 @@ index 6c38041ddb..4cf41d9f64 100644 From 94e48653a6bd1b8438887b486927e87b56651455 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 26 Apr 2023 16:31:23 +0000 -Subject: [PATCH 129/136] tests/swscale: Add a timing option +Subject: [PATCH 129/151] tests/swscale: Add a timing option -t Where n is the number of time to loop the scale op. 
Often useful to do it 10 times or so for better resolution @@ -34318,7 +34318,7 @@ index 4cf41d9f64..12776ffec7 100644 From 406806d0b9d9cb113deb0d083a28cbccabab6825 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 20 Apr 2023 13:40:36 +0000 -Subject: [PATCH 130/136] swscale: RGB->YUV420 fix C template to allow odd +Subject: [PATCH 130/151] swscale: RGB->YUV420 fix C template to allow odd widths (cherry picked from commit 08b2023e7b5292df0adc6593e4d20087f9cef5c8) @@ -34455,7 +34455,7 @@ index 053c06adf5..52469b2e4a 100644 From 68c6482d9473ce774e87cac2455a8c7b3e2d99b4 Mon Sep 17 00:00:00 2001 From: John Cox Date: Thu, 4 May 2023 14:26:14 +0000 -Subject: [PATCH 131/136] rtpenc: Add code to send H264 new extradata in +Subject: [PATCH 131/151] rtpenc: Add code to send H264 new extradata in sidedata Fixes issue with pi V4L2 H264 encode which cannot create extradata @@ -34508,7 +34508,7 @@ index a8d296a154..f67dc2a15a 100644 From 5240cc7fc3abed8af5f178c5461ca9fe11a7d5e4 Mon Sep 17 00:00:00 2001 From: John Cox Date: Mon, 5 Jun 2023 08:34:38 +0000 -Subject: [PATCH 132/136] rgb2rgb: Fix luma narrow+saturation instruction +Subject: [PATCH 132/151] rgb2rgb: Fix luma narrow+saturation instruction (cherry picked from commit 9cdac1c08ad5c0aea28907d1d3fd0bdda387955a) --- @@ -34579,7 +34579,7 @@ index 978ab443ea..476ca723a0 100644 From 9474d9d227f2af488d5d2bd614c5c707479ca3c3 Mon Sep 17 00:00:00 2001 From: John Cox Date: Sun, 4 Jun 2023 13:37:59 +0000 -Subject: [PATCH 133/136] v4l2_m2m_dec: Tweak pending count to use dts & +Subject: [PATCH 133/151] v4l2_m2m_dec: Tweak pending count to use dts & reorder size (cherry picked from commit ca438b382c90f9a5f58f4708205e6ac25395db2a) @@ -34744,7 +34744,7 @@ index d124c7b1fc..13af62e819 100644 From 2145b9c9177f0fe9569ce39e2d4eb629caf8bd47 Mon Sep 17 00:00:00 2001 From: John Cox Date: Wed, 7 Jun 2023 11:14:52 +0000 -Subject: [PATCH 134/136] v4l2_m2m: Add encode size check +Subject: [PATCH 134/151] v4l2_m2m: Add encode size check Previously 
an out of bounds size would fail whilst trying to copy the buffer with an unhelpful message. This produces a better error at init @@ -34820,7 +34820,7 @@ index f802687b1b..28d9ed4988 100644 From 805985ea191c98885a74dbf994b1ca11551cd81e Mon Sep 17 00:00:00 2001 From: John Cox Date: Fri, 9 Jun 2023 10:28:12 +0000 -Subject: [PATCH 135/136] vf_bwdif: Add attributes to ask for vectorization +Subject: [PATCH 135/151] vf_bwdif: Add attributes to ask for vectorization (cherry picked from commit 281250290ba5c2dcd8676e9a261050e65c10bcb7) --- @@ -34933,7 +34933,7 @@ index 65c617ebb3..09e68523bb 100644 From f4012f09da1c57a0aa5db01f9096992d0c385f7b Mon Sep 17 00:00:00 2001 From: John Cox Date: Tue, 13 Jun 2023 13:07:55 +0000 -Subject: [PATCH 136/136] v4l2m2m_dec: Fix h264 reorder size if no sps +Subject: [PATCH 136/151] v4l2m2m_dec: Fix h264 reorder size if no sps initially (cherry picked from commit 8832f7924bf47cbca0de251d7b406917f958ebf4) @@ -34955,3 +34955,2838 @@ index 13af62e819..11c83b2d66 100644 } ff_h264_ps_uninit(&ps); break; + +From fd31937e4befa2368d48e234d66fb962246bf777 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Fri, 30 Jun 2023 18:03:29 +0000 +Subject: [PATCH 137/151] sand_fns: Add missing uxtw for neon stride + +--- + libavutil/aarch64/rpi_sand_neon.S | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S +index 2f07d9674c..19411cf3f1 100644 +--- a/libavutil/aarch64/rpi_sand_neon.S ++++ b/libavutil/aarch64/rpi_sand_neon.S +@@ -469,6 +469,7 @@ endfunc + function ff_rpi_sand30_lines_to_planar_y16, export=1 + lsl w4, w4, #7 + sub w4, w4, #64 ++ uxtw x4, w4 + sub w1, w1, w7, lsl #1 + uxtw x6, w6 + add x8, x2, x6, lsl #7 +@@ -634,6 +635,7 @@ endfunc + function ff_rpi_sand30_lines_to_planar_y8, export=1 + lsl w4, w4, #7 + sub w4, w4, #64 ++ uxtw x4, w4 + sub w1, w1, w7 + uxtw x6, w6 + add x8, x2, x6, lsl #7 + +From f6a19a36ffe0dbe0a6e2e450dafec6711db19057 Mon Sep 17 00:00:00 2001 +From: John 
Cox +Date: Fri, 30 Jun 2023 18:12:16 +0000 +Subject: [PATCH 138/151] sand_fns: Rework aarch64 neon + sand30_lines_to_planar_c16 + +Previous version could overflow its write buffer on small buffers +which sometimes crashed WPP_F_ericsson_MAIN10_2. + +This version is probably faster too +--- + libavutil/aarch64/rpi_sand_neon.S | 329 ++++++++++++++---------------- + 1 file changed, 151 insertions(+), 178 deletions(-) + +diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S +index 19411cf3f1..af7e2a88c4 100644 +--- a/libavutil/aarch64/rpi_sand_neon.S ++++ b/libavutil/aarch64/rpi_sand_neon.S +@@ -248,199 +248,172 @@ incomplete_block_loop_end_c8: + ret + endfunc + +-//void ff_rpi_sand30_lines_to_planar_c16( +-// uint8_t * dst_u, // [x0] +-// unsigned int dst_stride_u, // [w1] == _w*2 +-// uint8_t * dst_v, // [x2] +-// unsigned int dst_stride_v, // [w3] == _w*2 +-// const uint8_t * src, // [x4] +-// unsigned int stride1, // [w5] == 128 +-// unsigned int stride2, // [w6] +-// unsigned int _x, // [w7] == 0 +-// unsigned int y, // [sp, #0] == 0 +-// unsigned int _w, // [sp, #8] -> w3 +-// unsigned int h); // [sp, #16] -> w7 +- +-.macro rpi_sand30_lines_to_planar_c16_block_half +- ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 +- +- xtn v4.4h, v0.4s +- ushr v0.4s, v0.4s, #10 +- xtn v5.4h, v0.4s +- ushr v0.4s, v0.4s, #10 +- xtn v6.4h, v0.4s +- xtn2 v4.8h, v1.4s +- ushr v1.4s, v1.4s, #10 +- xtn2 v5.8h, v1.4s +- ushr v1.4s, v1.4s, #10 +- xtn2 v6.8h, v1.4s +- and v4.16b, v4.16b, v16.16b +- and v5.16b, v5.16b, v16.16b +- and v6.16b, v6.16b, v16.16b +- st3 { v4.8h, v5.8h, v6.8h }, [sp], #48 +- +- xtn v4.4h, v2.4s +- ushr v2.4s, v2.4s, #10 +- xtn v5.4h, v2.4s +- ushr v2.4s, v2.4s, #10 +- xtn v6.4h, v2.4s +- xtn2 v4.8h, v3.4s +- ushr v3.4s, v3.4s, #10 +- xtn2 v5.8h, v3.4s +- ushr v3.4s, v3.4s, #10 +- xtn2 v6.8h, v3.4s +- and v4.16b, v4.16b, v16.16b +- and v5.16b, v5.16b, v16.16b +- and v6.16b, v6.16b, v16.16b +- st3 { v4.8h, v5.8h, v6.8h }, [sp] +- sub 
sp, sp, #48 +-.endm +- +-function ff_rpi_sand30_lines_to_planar_c16, export=1 +- stp x19, x20, [sp, #-48]! +- stp x21, x22, [sp, #16] +- stp x23, x24, [sp, #32] +- +- ldr w3, [sp, #48+8] // w3 = width +- ldr w7, [sp, #48+16] // w7 = height +- +- // reserve space on the stack for intermediate results +- sub sp, sp, #256 ++// Unzip chroma ++// ++// On entry: ++// a0 = V0, U2, ... ++// a1 = U0, V1, ... ++// a2 = U1, V2, ... ++// b0 = V8, U10, ... ++// b1 = U8, V9, ... ++// b2 = U9, V10, ... ++// ++// On exit: ++// d0 = U0, U3, ... ++// ... ++// a0 = V0, V3, .. ++// ... ++// ++// Reg order for USAND is a1, a0, a2 (i.e. swap natural order of 1st 2 dest regs) + +- // number of 128byte blocks per row, w8 = width / 48 +- mov w9, #48 +- udiv w8, w3, w9 ++.macro UZPH_C d0, d1, d2, a0, a1, a2, b0, b1, b2 ++ uzp1 \d0\().8h, \a1\().8h, \b1\().8h ++ uzp1 \d1\().8h, \a2\().8h, \b2\().8h ++ uzp2 \d2\().8h, \a0\().8h, \b0\().8h + +- // remaining pixels (rem_pix) per row, w9 = width - w8 * 48 +- mul w9, w8, w9 +- sub w9, w3, w9 ++ uzp1 \a0\().8h, \a0\().8h, \b0\().8h ++ uzp2 \a1\().8h, \a1\().8h, \b1\().8h ++ uzp2 \a2\().8h, \a2\().8h, \b2\().8h ++.endm + +- // row offset, the beginning of the next row to process +- eor w10, w10, w10 ++// SAND30 -> 10bit ++.macro USAND10 d0, d1, d2, a0, a1 ++ shrn \d2\().4h, \a0\().4s, #14 ++ xtn \d0\().4h, \a0\().4s ++ shrn \d1\().4h, \a0\().4s, #10 + +- // offset to the beginning of the next block, w11 = stride2 * 128 - 128 +- lsl w11, w6, #7 +- sub w11, w11, #128 ++ shrn2 \d2\().8h, \a1\().4s, #14 ++ xtn2 \d0\().8h, \a1\().4s ++ shrn2 \d1\().8h, \a1\().4s, #10 + +- // decrease the height by one and in case of remaining pixels increase the block count by one +- sub w7, w7, #1 +- cmp w9, #0 +- cset w19, ne // w19 == 1 iff reamining pixels != 0 +- add w8, w8, w19 ++ ushr \d2\().8h, \d2\().8h, #6 ++ bic \d0\().8h, #0xfc, lsl #8 ++ bic \d1\().8h, #0xfc, lsl #8 ++.endm + +- // bytes we have to move dst back by at the end of every row +- mov w21, #48*2 
+- mul w21, w21, w8 +- sub w21, w1, w21 ++// void ff_rpi_sand30_lines_to_planar_c16( ++// uint8_t * dst_u, // [x0] ++// unsigned int dst_stride_u, // [w1] ++// uint8_t * dst_v, // [x2] ++// unsigned int dst_stride_v, // [w3] ++// const uint8_t * src, // [x4] ++// unsigned int stride1, // [w5] 128 ++// unsigned int stride2, // [w6] ++// unsigned int _x, // [w7] 0 ++// unsigned int y, // [sp, #0] ++// unsigned int _w, // [sp, #8] w9 ++// unsigned int h); // [sp, #16] w10 + +- mov w20, #0 // w20 = flag, last row processed ++function ff_rpi_sand30_lines_to_planar_c16, export=1 ++ ldr w7, [sp, #0] // y ++ ldr w8, [sp, #8] // _w ++ ldr w10, [sp, #16] // h ++ lsl w6, w6, #7 // Fixup stride2 ++ sub w6, w6, #64 ++ uxtw x6, w6 ++ sub w1, w1, w8, LSL #1 // Fixup chroma strides ++ sub w3, w3, w8, LSL #1 ++ lsl w7, w7, #7 // Add y to src ++ add x4, x4, w7, UXTW ++10: ++ mov w13, #0 ++ mov x5, x4 ++ mov w9, w8 ++1: ++ ld1 {v0.4s-v3.4s}, [x5], #64 ++ ld1 {v4.4s-v7.4s}, [x5], x6 + +- mov x12, #0x03ff03ff03ff03ff +- dup v16.2d, x12 ++ USAND10 v17, v16, v18, v0, v1 ++ USAND10 v20, v19, v21, v2, v3 ++ UZPH_C v0, v1, v2, v16, v17, v18, v19, v20, v21 ++ USAND10 v23, v22, v24, v4, v5 ++ USAND10 v26, v25, v27, v6, v7 ++ UZPH_C v4, v5, v6, v22, v23, v24, v25, v26, v27 + +- // iterate through rows, row counter = w12 = 0 +- eor w12, w12, w12 +-row_loop_c16: +- cmp w12, w7 +- bge row_loop_c16_fin ++ subs w9, w9, #48 ++ blt 2f + +- // address of row data = src + row_offset +- mov x13, x4 +- add x13, x13, x10 ++ st3 {v0.8h-v2.8h}, [x0], #48 ++ st3 {v4.8h-v6.8h}, [x0], #48 ++ st3 {v16.8h-v18.8h}, [x2], #48 ++ st3 {v22.8h-v24.8h}, [x2], #48 + +- eor w14, w14, w14 +-block_loop_c16: +- cmp w14, w8 +- bge block_loop_c16_fin +- +- rpi_sand30_lines_to_planar_c16_block_half +- +- ld2 { v0.8h, v1.8h }, [sp], #32 +- ld2 { v2.8h, v3.8h }, [sp], #32 +- ld2 { v4.8h, v5.8h }, [sp] +- sub sp, sp, #64 +- +- st1 { v0.8h }, [x0], #16 +- st1 { v2.8h }, [x0], #16 +- st1 { v4.8h }, [x0], #16 +- st1 { v1.8h }, 
[x2], #16 +- st1 { v3.8h }, [x2], #16 +- st1 { v5.8h }, [x2], #16 +- +- rpi_sand30_lines_to_planar_c16_block_half +- +- ld2 { v0.8h, v1.8h }, [sp], #32 +- ld2 { v2.8h, v3.8h }, [sp], #32 +- ld2 { v4.8h, v5.8h }, [sp] +- sub sp, sp, #64 +- +- st1 { v0.8h }, [x0], #16 +- st1 { v2.8h }, [x0], #16 +- st1 { v4.8h }, [x0], #16 +- st1 { v1.8h }, [x2], #16 +- st1 { v3.8h }, [x2], #16 +- st1 { v5.8h }, [x2], #16 +- +- add x13, x13, x11 // offset to next block +- add w14, w14, #1 +- b block_loop_c16 +-block_loop_c16_fin: ++ bne 1b ++11: ++ subs w10, w10, #1 ++ add x4, x4, #128 ++ add x0, x0, w1, UXTW ++ add x2, x2, w3, UXTW ++ bne 10b ++99: ++ ret + +- add w10, w10, #128 +- add w12, w12, #1 +- add x0, x0, w21, sxtw // move dst pointers back by x21 +- add x2, x2, w21, sxtw +- b row_loop_c16 +-row_loop_c16_fin: +- +- cmp w20, #1 +- beq row_loop_c16_fin2 +- mov w20, #1 +- sub w8, w8, w19 // decrease block count by w19 +- add w7, w7, #1 // increase height +- b row_loop_c16 +- +-row_loop_c16_fin2: +- sub x0, x0, w21, sxtw // readd x21 in case of the last row +- sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels +- +- // last incomplete block to be finished +- // read operations are fine, stride2 is more than large enough even if rem_pix is 0 +- rpi_sand30_lines_to_planar_c16_block_half +- ld2 { v0.8h, v1.8h }, [sp], #32 +- ld2 { v2.8h, v3.8h }, [sp], #32 +- ld2 { v4.8h, v5.8h }, [sp], #32 +- rpi_sand30_lines_to_planar_c16_block_half +- ld2 { v0.8h, v1.8h }, [sp], #32 +- ld2 { v2.8h, v3.8h }, [sp], #32 +- ld2 { v4.8h, v5.8h }, [sp] +- sub sp, sp, #160 +- +- mov x4, sp +- eor w20, w20, w20 +-rem_pix_c16_loop: +- cmp w20, w9 +- bge rem_pix_c16_fin +- +- ldr w22, [x4], #4 +- str w22, [x0], #2 +- lsr w22, w22, #16 +- str w22, [x2], #2 +- +- add w20, w20, #1 +- b rem_pix_c16_loop +-rem_pix_c16_fin: +- +- add sp, sp, #256 +- +- ldp x23, x24, [sp, #32] +- ldp x21, x22, [sp, #16] +- ldp x19, x20, [sp], #48 +- ret ++// Partial final write ++2: ++ cmp w9, #24-48 ++ 
blt 1f ++ st3 {v0.8h - v2.8h}, [x0], #48 ++ st3 {v16.8h - v18.8h}, [x2], #48 ++ beq 11b ++ mov v0.16b, v4.16b ++ mov v1.16b, v5.16b ++ sub w9, w9, #24 ++ mov v2.16b, v6.16b ++ mov v16.16b, v22.16b ++ mov v17.16b, v23.16b ++ mov v18.16b, v24.16b ++1: ++ cmp w9, #12-48 ++ blt 1f ++ st3 {v0.4h - v2.4h}, [x0], #24 ++ st3 {v16.4h - v18.4h}, [x2], #24 ++ beq 11b ++ mov v0.2d[0], v0.2d[1] ++ sub w9, w9, #12 ++ mov v1.2d[0], v1.2d[1] ++ mov v2.2d[0], v2.2d[1] ++ mov v16.2d[0], v16.2d[1] ++ mov v17.2d[0], v17.2d[1] ++ mov v18.2d[0], v18.2d[1] ++1: ++ cmp w9, #6-48 ++ blt 1f ++ st3 {v0.h - v2.h}[0], [x0], #6 ++ st3 {v0.h - v2.h}[1], [x0], #6 ++ st3 {v16.h - v18.h}[0], [x2], #6 ++ st3 {v16.h - v18.h}[1], [x2], #6 ++ beq 11b ++ mov v0.s[0], v0.s[1] ++ sub w9, w9, #6 ++ mov v1.s[0], v1.s[1] ++ mov v2.s[0], v2.s[1] ++ mov v16.s[0], v16.s[1] ++ mov v17.s[0], v17.s[1] ++ mov v18.s[0], v18.s[1] ++1: ++ cmp w9, #3-48 ++ blt 1f ++ st3 {v0.h - v2.h}[0], [x0], #6 ++ st3 {v16.h - v18.h}[0], [x2], #6 ++ beq 11b ++ mov v0.h[0], v0.h[1] ++ sub w9, w9, #3 ++ mov v1.h[0], v1.h[1] ++ mov v16.h[0], v16.h[1] ++ mov v17.h[0], v17.h[1] ++1: ++ cmp w9, #2-48 ++ blt 1f ++ st2 {v0.h - v1.h}[0], [x0], #4 ++ st2 {v16.h - v17.h}[0], [x2], #4 ++ b 11b ++1: ++ st1 {v0.h}[0], [x0], #2 ++ st1 {v16.h}[0], [x2], #2 ++ b 11b + endfunc + + +- + //void ff_rpi_sand30_lines_to_planar_p010( + // uint8_t * dest, + // unsigned int dst_stride, + +From 68356e594ff32e18e419a476889d958dc24af4b2 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Fri, 30 Jun 2023 19:41:06 +0000 +Subject: [PATCH 139/151] sand_fns: Minor optimisations to aarch64 neon + +--- + libavutil/aarch64/rpi_sand_neon.S | 140 ++++++------------------------ + 1 file changed, 28 insertions(+), 112 deletions(-) + +diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S +index af7e2a88c4..11658de0c8 100644 +--- a/libavutil/aarch64/rpi_sand_neon.S ++++ b/libavutil/aarch64/rpi_sand_neon.S +@@ -279,18 +279,37 @@ endfunc + // SAND30 -> 
10bit + .macro USAND10 d0, d1, d2, a0, a1 + shrn \d2\().4h, \a0\().4s, #14 +- xtn \d0\().4h, \a0\().4s + shrn \d1\().4h, \a0\().4s, #10 + + shrn2 \d2\().8h, \a1\().4s, #14 +- xtn2 \d0\().8h, \a1\().4s + shrn2 \d1\().8h, \a1\().4s, #10 ++ uzp1 \d0\().8h, \a0\().8h, \a1\().8h + + ushr \d2\().8h, \d2\().8h, #6 + bic \d0\().8h, #0xfc, lsl #8 + bic \d1\().8h, #0xfc, lsl #8 + .endm + ++// SAND30 -> 8bit ++.macro USAND8 d0, d1, d2, a0, a1, a2, a3, t0, t1, t2 ++ shrn \d1\().4h, \a0\().4s, #12 ++ shrn2 \d1\().8h, \a1\().4s, #12 ++ uzp1 \d0\().8h, \a0\().8h, \a1\().8h ++ uzp2 \d2\().8h, \a0\().8h, \a1\().8h ++ ++ shrn \t1\().4h, \a2\().4s, #12 ++ shrn2 \t1\().8h, \a3\().4s, #12 ++ uzp1 \t0\().8h, \a2\().8h, \a3\().8h ++ uzp2 \t2\().8h, \a2\().8h, \a3\().8h ++ ++ shrn \d0\().8b, \d0\().8h, #2 ++ shrn2 \d0\().16b, \t0\().8h, #2 ++ shrn \d2\().8b, \d2\().8h, #6 ++ shrn2 \d2\().16b, \t2\().8h, #6 ++ uzp1 \d1\().16b, \d1\().16b, \t1\().16b ++.endm ++ ++ + // void ff_rpi_sand30_lines_to_planar_c16( + // uint8_t * dst_u, // [x0] + // unsigned int dst_stride_u, // [w1] +@@ -322,6 +341,7 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1 + 1: + ld1 {v0.4s-v3.4s}, [x5], #64 + ld1 {v4.4s-v7.4s}, [x5], x6 ++ subs w9, w9, #48 + + USAND10 v17, v16, v18, v0, v1 + USAND10 v20, v19, v21, v2, v3 +@@ -330,7 +350,6 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1 + USAND10 v26, v25, v27, v6, v7 + UZPH_C v4, v5, v6, v22, v23, v24, v25, v26, v27 + +- subs w9, w9, #48 + blt 2f + + st3 {v0.8h-v2.8h}, [x0], #48 +@@ -457,61 +476,10 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1 + + subs w5, w5, #96 + +- // v0, v1 +- +- shrn v18.4h, v0.4s, #14 +- xtn v16.4h, v0.4s +- shrn v17.4h, v0.4s, #10 +- +- shrn2 v18.8h, v1.4s, #14 +- xtn2 v16.8h, v1.4s +- shrn2 v17.8h, v1.4s, #10 +- +- ushr v18.8h, v18.8h, #6 +- bic v16.8h, #0xfc, lsl #8 +- bic v17.8h, #0xfc, lsl #8 +- +- // v2, v3 +- +- shrn v21.4h, v2.4s, #14 +- xtn v19.4h, v2.4s +- shrn v20.4h, v2.4s, #10 +- +- shrn2 v21.8h, v3.4s, #14 
+- xtn2 v19.8h, v3.4s +- shrn2 v20.8h, v3.4s, #10 +- +- ushr v21.8h, v21.8h, #6 +- bic v19.8h, #0xfc, lsl #8 +- bic v20.8h, #0xfc, lsl #8 +- +- // v4, v5 +- +- shrn v24.4h, v4.4s, #14 +- xtn v22.4h, v4.4s +- shrn v23.4h, v4.4s, #10 +- +- shrn2 v24.8h, v5.4s, #14 +- xtn2 v22.8h, v5.4s +- shrn2 v23.8h, v5.4s, #10 +- +- ushr v24.8h, v24.8h, #6 +- bic v22.8h, #0xfc, lsl #8 +- bic v23.8h, #0xfc, lsl #8 +- +- // v6, v7 +- +- shrn v27.4h, v6.4s, #14 +- xtn v25.4h, v6.4s +- shrn v26.4h, v6.4s, #10 +- +- shrn2 v27.8h, v7.4s, #14 +- xtn2 v25.8h, v7.4s +- shrn2 v26.8h, v7.4s, #10 +- +- ushr v27.8h, v27.8h, #6 +- bic v25.8h, #0xfc, lsl #8 +- bic v26.8h, #0xfc, lsl #8 ++ USAND10 v16, v17, v18, v0, v1 ++ USAND10 v19, v20, v21, v2, v3 ++ USAND10 v22, v23, v24, v4, v5 ++ USAND10 v25, v26, v27, v6, v7 + + blt 2f + +@@ -624,60 +592,8 @@ function ff_rpi_sand30_lines_to_planar_y8, export=1 + subs w5, w5, #96 + + // v0, v1 +- +- shrn v18.4h, v0.4s, #16 +- xtn v16.4h, v0.4s +- shrn v17.4h, v0.4s, #12 +- +- shrn2 v18.8h, v1.4s, #16 +- xtn2 v16.8h, v1.4s +- shrn2 v17.8h, v1.4s, #12 +- +- shrn v18.8b, v18.8h, #6 +- shrn v16.8b, v16.8h, #2 +- xtn v17.8b, v17.8h +- +- // v2, v3 +- +- shrn v21.4h, v2.4s, #16 +- xtn v19.4h, v2.4s +- shrn v20.4h, v2.4s, #12 +- +- shrn2 v21.8h, v3.4s, #16 +- xtn2 v19.8h, v3.4s +- shrn2 v20.8h, v3.4s, #12 +- +- shrn2 v18.16b, v21.8h, #6 +- shrn2 v16.16b, v19.8h, #2 +- xtn2 v17.16b, v20.8h +- +- // v4, v5 +- +- shrn v24.4h, v4.4s, #16 +- xtn v22.4h, v4.4s +- shrn v23.4h, v4.4s, #12 +- +- shrn2 v24.8h, v5.4s, #16 +- xtn2 v22.8h, v5.4s +- shrn2 v23.8h, v5.4s, #12 +- +- shrn v21.8b, v24.8h, #6 +- shrn v19.8b, v22.8h, #2 +- xtn v20.8b, v23.8h +- +- // v6, v7 +- +- shrn v27.4h, v6.4s, #16 +- xtn v25.4h, v6.4s +- shrn v26.4h, v6.4s, #12 +- +- shrn2 v27.8h, v7.4s, #16 +- xtn2 v25.8h, v7.4s +- shrn2 v26.8h, v7.4s, #12 +- +- shrn2 v21.16b, v27.8h, #6 +- shrn2 v19.16b, v25.8h, #2 +- xtn2 v20.16b, v26.8h ++ USAND8 v16, v17, v18, v0, v1, v2, v3, v22, v23, v24 ++ USAND8 v19, 
v20, v21, v4, v5, v6, v7, v22, v23, v24 + + blt 2f + + +From 3abb0dcc453aba0a069bc1a8f26ba77913c5ef2b Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Sat, 1 Jul 2023 18:43:32 +0000 +Subject: [PATCH 140/151] sand_fns: Add test for neon to sand30 fns so they can + be tested by checkasm + +--- + libavutil/rpi_sand_fns.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c +index b6071e2928..0626bb06cb 100644 +--- a/libavutil/rpi_sand_fns.c ++++ b/libavutil/rpi_sand_fns.c +@@ -35,10 +35,12 @@ Authors: John Cox + #include "frame.h" + + #if ARCH_ARM && HAVE_NEON +-#include "arm/rpi_sand_neon.h" ++#include "libavutil/arm/cpu.h" ++#include "libavutil/arm/rpi_sand_neon.h" + #define HAVE_SAND_ASM 1 + #elif ARCH_AARCH64 && HAVE_NEON +-#include "aarch64/rpi_sand_neon.h" ++#include "libavutil/aarch64/cpu.h" ++#include "libavutil/aarch64/rpi_sand_neon.h" + #define HAVE_SAND_ASM 1 + #else + #define HAVE_SAND_ASM 0 +@@ -97,7 +99,7 @@ void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, + const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words + + #if HAVE_SAND_ASM +- if (_x == 0) { ++ if (_x == 0 && have_neon(av_get_cpu_flags())) { + ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); + return; + } +@@ -163,7 +165,7 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_ + const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words + + #if HAVE_SAND_ASM +- if (_x == 0) { ++ if (_x == 0 && have_neon(av_get_cpu_flags())) { + ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v, + src, stride1, stride2, _x, y, _w, h); + return; + +From fb72aa34ec2c42fc595bb1a6c32b599da870fa2b Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Sat, 1 Jul 2023 18:43:57 +0000 +Subject: [PATCH 141/151] checkasm: Add tests 
for rpi_sand sand30 fns + +Something of a kludge for function selection as, at the moment, the +rpi_sand fns don't have a jump table that we could use for selection. +--- + tests/checkasm/Makefile | 3 +- + tests/checkasm/checkasm.c | 3 + + tests/checkasm/checkasm.h | 1 + + tests/checkasm/rpi_sand.c | 118 ++++++++++++++++++++++++++++++++++++++ + tests/fate/checkasm.mak | 1 + + 5 files changed, 125 insertions(+), 1 deletion(-) + create mode 100644 tests/checkasm/rpi_sand.c + +diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile +index a6f06c7007..66291baf33 100644 +--- a/tests/checkasm/Makefile ++++ b/tests/checkasm/Makefile +@@ -59,8 +59,9 @@ CHECKASMOBJS-$(CONFIG_SWSCALE) += $(SWSCALEOBJS) + AVUTILOBJS += av_tx.o + AVUTILOBJS += fixed_dsp.o + AVUTILOBJS += float_dsp.o ++AVUTILOBJS-$(CONFIG_SAND) += rpi_sand.o + +-CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) ++CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) $(AVUTILOBJS-yes) + + CHECKASMOBJS-$(ARCH_AARCH64) += aarch64/checkasm.o + CHECKASMOBJS-$(HAVE_ARMV5TE_EXTERNAL) += arm/checkasm.o +diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c +index e96d84a7da..57e0091b80 100644 +--- a/tests/checkasm/checkasm.c ++++ b/tests/checkasm/checkasm.c +@@ -210,6 +210,9 @@ static const struct { + { "fixed_dsp", checkasm_check_fixed_dsp }, + { "float_dsp", checkasm_check_float_dsp }, + { "av_tx", checkasm_check_av_tx }, ++ #if CONFIG_SAND ++ { "rpi_sand", checkasm_check_rpi_sand }, ++ #endif + #endif + { NULL } + }; +diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h +index 8744a81218..f4a0d20358 100644 +--- a/tests/checkasm/checkasm.h ++++ b/tests/checkasm/checkasm.h +@@ -73,6 +73,7 @@ void checkasm_check_motion(void); + void checkasm_check_nlmeans(void); + void checkasm_check_opusdsp(void); + void checkasm_check_pixblockdsp(void); ++void checkasm_check_rpi_sand(void); + void checkasm_check_sbrdsp(void); + void checkasm_check_synth_filter(void); + void checkasm_check_sw_gbrp(void); +diff 
--git a/tests/checkasm/rpi_sand.c b/tests/checkasm/rpi_sand.c +new file mode 100644 +index 0000000000..0888714c4c +--- /dev/null ++++ b/tests/checkasm/rpi_sand.c +@@ -0,0 +1,118 @@ ++/* ++ * Copyright (c) 2023 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#include ++#include "checkasm.h" ++#include "libavutil/common.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#if ARCH_ARM ++#include "libavutil/arm/cpu.h" ++#include "libavutil/arm/rpi_sand_neon.h" ++#elif ARCH_AARCH64 ++#include "libavutil/aarch64/cpu.h" ++#include "libavutil/aarch64/rpi_sand_neon.h" ++#endif ++ ++static inline uint32_t pack30(unsigned int a, unsigned int b, unsigned int c) ++{ ++ return (a & 0x3ff) | ((b & 0x3ff) << 10) | ((c & 0x3ff) << 20); ++} ++ ++void checkasm_check_rpi_sand(void) ++{ ++ const unsigned int w = 1280; ++ const unsigned int h = 66; ++ const unsigned int stride1 = 128; ++ const unsigned int stride2 = h*3/2; ++ const unsigned int ssize = ((w+95)/96)*128*h*3/2; ++ const unsigned int ysize = ((w + 32) * (h + 32) * 2); ++ ++ uint8_t * sbuf0 = malloc(ssize); ++ uint8_t * sbuf1 = malloc(ssize); ++ uint8_t * ybuf0 = malloc(ysize); ++ uint8_t * ybuf1 = malloc(ysize); ++ uint8_t * vbuf0 = malloc(ysize); ++ uint8_t * vbuf1 = malloc(ysize); ++ uint8_t * yframe0 
= (w + 32) * 16 + ybuf0; ++ uint8_t * yframe1 = (w + 32) * 16 + ybuf1; ++ uint8_t * vframe0 = (w + 32) * 16 + vbuf0; ++ uint8_t * vframe1 = (w + 32) * 16 + vbuf1; ++ unsigned int i; ++ ++ for (i = 0; i != ssize; i += 4) ++ *(uint32_t*)(sbuf0 + i) = rnd(); ++ memcpy(sbuf1, sbuf0, ssize); ++ ++ if (check_func(have_neon(av_get_cpu_flags()) ? ff_rpi_sand30_lines_to_planar_y16 : av_rpi_sand30_to_planar_y16, "rpi_sand30_to_planar_y16")) { ++ declare_func(void, uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++ memset(ybuf0, 0xbb, ysize); ++ memset(ybuf1, 0xbb, ysize); ++ ++ call_ref(yframe0, (w + 32) * 2, sbuf0, stride1, stride2, 0, 0, w, h); ++ call_new(yframe1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h); ++ ++ if (memcmp(sbuf0, sbuf1, ssize) ++ || memcmp(ybuf0, ybuf1, ysize)) ++ fail(); ++ ++ bench_new(ybuf1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h); ++ } ++ ++ if (check_func(have_neon(av_get_cpu_flags()) ? 
ff_rpi_sand30_lines_to_planar_c16 : av_rpi_sand30_to_planar_c16, "rpi_sand30_to_planar_c16")) { ++ declare_func(void, uint8_t * u_dst, const unsigned int u_stride, ++ uint8_t * v_dst, const unsigned int v_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++ memset(ybuf0, 0xbb, ysize); ++ memset(ybuf1, 0xbb, ysize); ++ memset(vbuf0, 0xbb, ysize); ++ memset(vbuf1, 0xbb, ysize); ++ ++ call_ref(yframe0, (w + 32), vframe0, (w + 32), sbuf0, stride1, stride2, 0, 0, w/2, h/2); ++ call_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2); ++ ++ if (memcmp(sbuf0, sbuf1, ssize) ++ || memcmp(ybuf0, ybuf1, ysize) ++ || memcmp(vbuf0, vbuf1, ysize)) ++ fail(); ++ ++ bench_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2); ++ } ++ ++ ++ report("sand30"); ++ ++ free(sbuf0); ++ free(sbuf1); ++ free(ybuf0); ++ free(ybuf1); ++ free(vbuf0); ++ free(vbuf1); ++} ++ +diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak +index a4e95541f5..6fda6d227e 100644 +--- a/tests/fate/checkasm.mak ++++ b/tests/fate/checkasm.mak +@@ -27,6 +27,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp \ + fate-checkasm-motion \ + fate-checkasm-opusdsp \ + fate-checkasm-pixblockdsp \ ++ fate-checkasm-rpi_sand \ + fate-checkasm-sbrdsp \ + fate-checkasm-synth_filter \ + fate-checkasm-sw_gbrp \ + +From d798c7b90dd63bca6f9878b1fb30ec1d8f0b9a5e Mon Sep 17 00:00:00 2001 +From: James Darnley +Date: Mon, 20 Feb 2023 20:55:08 +0100 +Subject: [PATCH 142/151] avfilter/bwdif: move filter_line init to a dedicated + function + +(cherry picked from commit b503b5a0cf80f38ecf4737c012b621b7e94f242a) +--- + libavfilter/bwdif.h | 3 ++- + libavfilter/vf_bwdif.c | 13 +++++++++---- + libavfilter/x86/vf_bwdif_init.c | 4 +--- + 3 files changed, 12 insertions(+), 8 deletions(-) + +diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h +index 889ff772ed..5749345f78 
100644 +--- a/libavfilter/bwdif.h ++++ b/libavfilter/bwdif.h +@@ -37,6 +37,7 @@ typedef struct BWDIFContext { + int parity, int clip_max, int spat); + } BWDIFContext; + +-void ff_bwdif_init_x86(BWDIFContext *bwdif); ++void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth); ++void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth); + + #endif /* AVFILTER_BWDIF_H */ +diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c +index 09e68523bb..539fabbd46 100644 +--- a/libavfilter/vf_bwdif.c ++++ b/libavfilter/vf_bwdif.c +@@ -341,7 +341,14 @@ static int config_props(AVFilterLink *link) + + yadif->csp = av_pix_fmt_desc_get(link->format); + yadif->filter = filter; +- if (yadif->csp->comp[0].depth > 8) { ++ ff_bwdif_init_filter_line(s, yadif->csp->comp[0].depth); ++ ++ return 0; ++} ++ ++av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) ++{ ++ if (bit_depth > 8) { + s->filter_intra = filter_intra_16bit; + s->filter_line = filter_line_c_16bit; + s->filter_edge = filter_edge_16bit; +@@ -352,10 +359,8 @@ static int config_props(AVFilterLink *link) + } + + #if ARCH_X86 +- ff_bwdif_init_x86(s); ++ ff_bwdif_init_x86(s, bit_depth); + #endif +- +- return 0; + } + + +diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c +index e24e5cd9b1..ba7bc40c3d 100644 +--- a/libavfilter/x86/vf_bwdif_init.c ++++ b/libavfilter/x86/vf_bwdif_init.c +@@ -42,11 +42,9 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); + +-av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif) ++av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth) + { +- YADIFContext *yadif = &bwdif->yadif; + int cpu_flags = av_get_cpu_flags(); +- int bit_depth = (!yadif->csp) ? 
8 : yadif->csp->comp[0].depth; + + if (bit_depth <= 8) { + if (EXTERNAL_SSE2(cpu_flags)) + +From 0eb9c627c07931cf93c4932e07e0df6c0ce860fd Mon Sep 17 00:00:00 2001 +From: James Darnley +Date: Mon, 20 Feb 2023 20:55:08 +0100 +Subject: [PATCH 143/151] checkasm: add test for bwdif + +(cherry picked from commit 087faf8cac51e5e20a5f41b36b8d4c2705a10039) +--- + tests/checkasm/Makefile | 1 + + tests/checkasm/checkasm.c | 3 ++ + tests/checkasm/checkasm.h | 1 + + tests/checkasm/vf_bwdif.c | 84 +++++++++++++++++++++++++++++++++++++++ + tests/fate/checkasm.mak | 1 + + 5 files changed, 90 insertions(+) + create mode 100644 tests/checkasm/vf_bwdif.c + +diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile +index 66291baf33..2c80d8e661 100644 +--- a/tests/checkasm/Makefile ++++ b/tests/checkasm/Makefile +@@ -40,6 +40,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) + # libavfilter tests + AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o + AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o ++AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o + AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o + AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o + AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o +diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c +index 57e0091b80..4f983d7fbc 100644 +--- a/tests/checkasm/checkasm.c ++++ b/tests/checkasm/checkasm.c +@@ -179,6 +179,9 @@ static const struct { + #if CONFIG_BLEND_FILTER + { "vf_blend", checkasm_check_blend }, + #endif ++ #if CONFIG_BWDIF_FILTER ++ { "vf_bwdif", checkasm_check_vf_bwdif }, ++ #endif + #if CONFIG_COLORSPACE_FILTER + { "vf_colorspace", checkasm_check_colorspace }, + #endif +diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h +index f4a0d20358..d69bc43999 100644 +--- a/tests/checkasm/checkasm.h ++++ b/tests/checkasm/checkasm.h +@@ -83,6 +83,7 @@ void checkasm_check_utvideodsp(void); + void checkasm_check_v210dec(void); + void checkasm_check_v210enc(void); + void 
checkasm_check_vc1dsp(void); ++void checkasm_check_vf_bwdif(void); + void checkasm_check_vf_eq(void); + void checkasm_check_vf_gblur(void); + void checkasm_check_vf_hflip(void); +diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c +new file mode 100644 +index 0000000000..46224bb575 +--- /dev/null ++++ b/tests/checkasm/vf_bwdif.c +@@ -0,0 +1,84 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#include ++#include "checkasm.h" ++#include "libavcodec/internal.h" ++#include "libavfilter/bwdif.h" ++ ++#define WIDTH 256 ++ ++#define randomize_buffers(buf0, buf1, mask, count) \ ++ for (size_t i = 0; i < count; i++) \ ++ buf0[i] = buf1[i] = rnd() & mask ++ ++#define BODY(type, depth) \ ++ do { \ ++ type prev0[9*WIDTH], prev1[9*WIDTH]; \ ++ type next0[9*WIDTH], next1[9*WIDTH]; \ ++ type cur0[9*WIDTH], cur1[9*WIDTH]; \ ++ type dst0[WIDTH], dst1[WIDTH]; \ ++ const int stride = WIDTH; \ ++ const int mask = (1< +Date: Thu, 6 Jul 2023 13:56:18 +0000 +Subject: [PATCH 144/151] Revert "vf_bwdif: Add attributes to ask for + vectorization" + +This reverts commit 281250290ba5c2dcd8676e9a261050e65c10bcb7. 
+Will be replaced by hand coded asm as on upstream +--- + libavfilter/vf_bwdif.c | 29 ++++++++++++++--------------- + 1 file changed, 14 insertions(+), 15 deletions(-) + +diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c +index 539fabbd46..34e8c5e234 100644 +--- a/libavfilter/vf_bwdif.c ++++ b/libavfilter/vf_bwdif.c +@@ -74,10 +74,10 @@ typedef struct ThreadData { + int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e)) >> 1; \ + int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e)) >> 1; \ + int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \ +- {/*\ ++ \ + if (!diff) { \ + dst[0] = d; \ +- } else {*/ ++ } else { + + #define SPAT_CHECK() \ + int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c; \ +@@ -89,16 +89,15 @@ typedef struct ThreadData { + diff = FFMAX3(diff, min, -max); + + #define FILTER_LINE() \ +- int i1, i2; \ + SPAT_CHECK() \ +- /*if (FFABS(c - e) > temporal_diff0)*/ { \ +- i1 = (((coef_hf[0] * (prev2[0] + next2[0]) \ ++ if (FFABS(c - e) > temporal_diff0) { \ ++ interpol = (((coef_hf[0] * (prev2[0] + next2[0]) \ + - coef_hf[1] * (prev2[mrefs2] + next2[mrefs2] + prev2[prefs2] + next2[prefs2]) \ + + coef_hf[2] * (prev2[mrefs4] + next2[mrefs4] + prev2[prefs4] + next2[prefs4])) >> 2) \ + + coef_lf[0] * (c + e) - coef_lf[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ +- } /*else*/ { \ +- i2 = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ +- }interpol = FFABS(c - e) > temporal_diff0 ? i1:i2;\ ++ } else { \ ++ interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ ++ } + + #define FILTER_EDGE() \ + if (spat) { \ +@@ -112,7 +111,7 @@ typedef struct ThreadData { + else if (interpol < d - diff) \ + interpol = d - diff; \ + \ +- dst[0] = !diff ? 
d : av_clip(interpol, 0, clip_max); \ ++ dst[0] = av_clip(interpol, 0, clip_max); \ + } \ + \ + dst++; \ +@@ -123,7 +122,7 @@ typedef struct ThreadData { + next2++; \ + } + +-static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, ++static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max) + { + uint8_t *dst = dst1; +@@ -133,7 +132,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restr + FILTER_INTRA() + } + +-static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, ++static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max) +@@ -151,7 +150,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *rest + FILTER2() + } + +-static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, ++static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat) + { +@@ -168,7 +167,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restri + FILTER2() + } + +-static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, ++static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max) + { + uint16_t *dst = dst1; +@@ -178,7 +177,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void + FILTER_INTRA() + } + +-static void 
__attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, ++static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max) +@@ -196,7 +195,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void + FILTER2() + } + +-static void __attribute__((optimize("tree-vectorize"))) filter_edge_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, ++static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat) + { + +From 093eddd9ef66a7db9e637f3acfe51d950c87f613 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 4 Jul 2023 14:04:39 +0000 +Subject: [PATCH 145/151] tests/checkasm: Add test for vf_bwdif filter_intra +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: John Cox +Signed-off-by: Martin Storsjö +(cherry picked from commit 7caa8d6b91e738ad2c1ea61746b6c062c470f7d3) +--- + tests/checkasm/vf_bwdif.c | 37 +++++++++++++++++++++++++++++++++++++ + 1 file changed, 37 insertions(+) + +diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c +index 46224bb575..034bbabb4c 100644 +--- a/tests/checkasm/vf_bwdif.c ++++ b/tests/checkasm/vf_bwdif.c +@@ -20,6 +20,7 @@ + #include "checkasm.h" + #include "libavcodec/internal.h" + #include "libavfilter/bwdif.h" ++#include "libavutil/mem_internal.h" + + #define WIDTH 256 + +@@ -81,4 +82,40 @@ void checkasm_check_vf_bwdif(void) + BODY(uint16_t, 10); + report("bwdif10"); + } ++ ++ if (check_func(ctx_8.filter_intra, "bwdif8.intra")) { ++ LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, 
dst0, [WIDTH*3]); ++ LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]); ++ const int stride = WIDTH; ++ const int mask = (1<<8)-1; ++ ++ declare_func(void, void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max); ++ ++ randomize_buffers( cur0, cur1, mask, 11*WIDTH); ++ memset(dst0, 0xba, WIDTH * 3); ++ memset(dst1, 0xba, WIDTH * 3); ++ ++ call_ref(dst0 + stride, ++ cur0 + stride * 4, WIDTH, ++ stride, -stride, stride * 3, -stride * 3, ++ 0, mask); ++ call_new(dst1 + stride, ++ cur0 + stride * 4, WIDTH, ++ stride, -stride, stride * 3, -stride * 3, ++ 0, mask); ++ ++ if (memcmp(dst0, dst1, WIDTH*3) ++ || memcmp( cur0, cur1, WIDTH*11)) ++ fail(); ++ ++ bench_new(dst1 + stride, ++ cur0 + stride * 4, WIDTH, ++ stride, -stride, stride * 3, -stride * 3, ++ 0, mask); ++ ++ report("bwdif8.intra"); ++ } + } + +From 28ef7402381b6fe241f81e21f302a23f8af674bf Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 4 Jul 2023 14:04:40 +0000 +Subject: [PATCH 146/151] avfilter/vf_bwdif: Add neon for filter_intra +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Adds an outline for aarch neon functions +Adds common macros and consts for aarch64 neon +Exports C filter_intra needed for tail fixup of neon code +Adds neon for filter_intra + +Signed-off-by: John Cox +Signed-off-by: Martin Storsjö +(cherry picked from commit 5075cfb4e6a21f6b4da9e62bdb0bad4cb32a4673) +--- + libavfilter/aarch64/Makefile | 2 + + libavfilter/aarch64/vf_bwdif_init_aarch64.c | 56 ++++++++ + libavfilter/aarch64/vf_bwdif_neon.S | 136 ++++++++++++++++++++ + libavfilter/bwdif.h | 4 + + libavfilter/vf_bwdif.c | 8 +- + 5 files changed, 203 insertions(+), 3 deletions(-) + create mode 100644 libavfilter/aarch64/vf_bwdif_init_aarch64.c + create mode 100644 libavfilter/aarch64/vf_bwdif_neon.S + +diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile +index b58daa3a3f..b68209bc94 100644 +--- 
a/libavfilter/aarch64/Makefile ++++ b/libavfilter/aarch64/Makefile +@@ -1,3 +1,5 @@ ++OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_init_aarch64.o + OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o + ++NEON-OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_neon.o + NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o +diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +new file mode 100644 +index 0000000000..3ffaa07ab3 +--- /dev/null ++++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +@@ -0,0 +1,56 @@ ++/* ++ * bwdif aarch64 NEON optimisations ++ * ++ * Copyright (c) 2023 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/common.h" ++#include "libavfilter/bwdif.h" ++#include "libavutil/aarch64/cpu.h" ++ ++void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max); ++ ++ ++static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max) ++{ ++ const int w0 = clip_max != 255 ? 
0 : w & ~15; ++ ++ ff_bwdif_filter_intra_neon(dst1, cur1, w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max); ++ ++ if (w0 < w) ++ ff_bwdif_filter_intra_c((char *)dst1 + w0, (char *)cur1 + w0, ++ w - w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max); ++} ++ ++void ++ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) ++{ ++ const int cpu_flags = av_get_cpu_flags(); ++ ++ if (bit_depth != 8) ++ return; ++ ++ if (!have_neon(cpu_flags)) ++ return; ++ ++ s->filter_intra = filter_intra_helper; ++} ++ +diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S +new file mode 100644 +index 0000000000..e288efbe6c +--- /dev/null ++++ b/libavfilter/aarch64/vf_bwdif_neon.S +@@ -0,0 +1,136 @@ ++/* ++ * bwdif aarch64 NEON optimisations ++ * ++ * Copyright (c) 2023 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++#include "libavutil/aarch64/asm.S" ++ ++// Space taken on the stack by an int (32-bit) ++#ifdef __APPLE__ ++.set SP_INT, 4 ++#else ++.set SP_INT, 8 ++#endif ++ ++.macro SQSHRUNN b, s0, s1, s2, s3, n ++ sqshrun \s0\().4h, \s0\().4s, #\n - 8 ++ sqshrun2 \s0\().8h, \s1\().4s, #\n - 8 ++ sqshrun \s1\().4h, \s2\().4s, #\n - 8 ++ sqshrun2 \s1\().8h, \s3\().4s, #\n - 8 ++ uzp2 \b\().16b, \s0\().16b, \s1\().16b ++.endm ++ ++.macro SMULL4K a0, a1, a2, a3, s0, s1, k ++ smull \a0\().4s, \s0\().4h, \k ++ smull2 \a1\().4s, \s0\().8h, \k ++ smull \a2\().4s, \s1\().4h, \k ++ smull2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++.macro UMULL4K a0, a1, a2, a3, s0, s1, k ++ umull \a0\().4s, \s0\().4h, \k ++ umull2 \a1\().4s, \s0\().8h, \k ++ umull \a2\().4s, \s1\().4h, \k ++ umull2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++.macro UMLAL4K a0, a1, a2, a3, s0, s1, k ++ umlal \a0\().4s, \s0\().4h, \k ++ umlal2 \a1\().4s, \s0\().8h, \k ++ umlal \a2\().4s, \s1\().4h, \k ++ umlal2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++.macro UMLSL4K a0, a1, a2, a3, s0, s1, k ++ umlsl \a0\().4s, \s0\().4h, \k ++ umlsl2 \a1\().4s, \s0\().8h, \k ++ umlsl \a2\().4s, \s1\().4h, \k ++ umlsl2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++.macro LDR_COEFFS d, t0 ++ movrel \t0, coeffs, 0 ++ ld1 {\d\().8h}, [\t0] ++.endm ++ ++// static const uint16_t coef_lf[2] = { 4309, 213 }; ++// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 }; ++// static const uint16_t coef_sp[2] = { 5077, 981 }; ++ ++const coeffs, align=4 // align 4 means align on 2^4 boundry ++ .hword 4309 * 4, 213 * 4 // lf[0]*4 = v0.h[0] ++ .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] ++ .hword 5077, 981 // sp[0] = v0.h[6] ++endconst ++ ++// 
============================================================================ ++// ++// void ff_bwdif_filter_intra_neon( ++// void *dst1, // x0 ++// void *cur1, // x1 ++// int w, // w2 ++// int prefs, // w3 ++// int mrefs, // w4 ++// int prefs3, // w5 ++// int mrefs3, // w6 ++// int parity, // w7 unused ++// int clip_max) // [sp, #0] unused ++ ++function ff_bwdif_filter_intra_neon, export=1 ++ cmp w2, #0 ++ ble 99f ++ ++ LDR_COEFFS v0, x17 ++ ++// for (x = 0; x < w; x++) { ++10: ++ ++// interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; ++ ldr q31, [x1, w4, sxtw] ++ ldr q30, [x1, w3, sxtw] ++ ldr q29, [x1, w6, sxtw] ++ ldr q28, [x1, w5, sxtw] ++ ++ uaddl v20.8h, v31.8b, v30.8b ++ uaddl2 v21.8h, v31.16b, v30.16b ++ ++ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[6] ++ ++ uaddl v20.8h, v29.8b, v28.8b ++ uaddl2 v21.8h, v29.16b, v28.16b ++ ++ UMLSL4K v2, v3, v4, v5, v20, v21, v0.h[7] ++ ++// dst[0] = av_clip(interpol, 0, clip_max); ++ SQSHRUNN v2, v2, v3, v4, v5, 13 ++ str q2, [x0], #16 ++ ++// dst++; ++// cur++; ++// } ++ ++ subs w2, w2, #16 ++ add x1, x1, #16 ++ bgt 10b ++ ++99: ++ ret ++endfunc +diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h +index 5749345f78..ae6f6ce223 100644 +--- a/libavfilter/bwdif.h ++++ b/libavfilter/bwdif.h +@@ -39,5 +39,9 @@ typedef struct BWDIFContext { + + void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth); + void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth); ++void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth); ++ ++void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max); + + #endif /* AVFILTER_BWDIF_H */ +diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c +index 34e8c5e234..6ec8bbab5d 100644 +--- a/libavfilter/vf_bwdif.c ++++ b/libavfilter/vf_bwdif.c +@@ -122,8 +122,8 @@ typedef struct ThreadData { + next2++; \ + } + +-static void filter_intra(void *dst1, void 
*cur1, int w, int prefs, int mrefs, +- int prefs3, int mrefs3, int parity, int clip_max) ++void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max) + { + uint8_t *dst = dst1; + uint8_t *cur = cur1; +@@ -352,13 +352,15 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) + s->filter_line = filter_line_c_16bit; + s->filter_edge = filter_edge_16bit; + } else { +- s->filter_intra = filter_intra; ++ s->filter_intra = ff_bwdif_filter_intra_c; + s->filter_line = filter_line_c; + s->filter_edge = filter_edge; + } + + #if ARCH_X86 + ff_bwdif_init_x86(s, bit_depth); ++#elif ARCH_AARCH64 ++ ff_bwdif_init_aarch64(s, bit_depth); + #endif + } + + +From 2f8199a41cfd43595352899e722646052b0db2ee Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 4 Jul 2023 14:04:41 +0000 +Subject: [PATCH 147/151] tests/checkasm: Add test for vf_bwdif filter_edge +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: John Cox +Signed-off-by: Martin Storsjö +(cherry picked from commit 7ed7c00f55a50ac88589f9e17c172d4a4fce0581) +--- + tests/checkasm/vf_bwdif.c | 54 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 54 insertions(+) + +diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c +index 034bbabb4c..5fdba09fdc 100644 +--- a/tests/checkasm/vf_bwdif.c ++++ b/tests/checkasm/vf_bwdif.c +@@ -83,6 +83,60 @@ void checkasm_check_vf_bwdif(void) + report("bwdif10"); + } + ++ { ++ LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]); ++ LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]); ++ const int stride = WIDTH; ++ const int mask = (1<<8)-1; ++ int spat; ++ 
int parity; ++ ++ for (spat = 0; spat != 2; ++spat) { ++ for (parity = 0; parity != 2; ++parity) { ++ if (check_func(ctx_8.filter_edge, "bwdif8.edge.s%d.p%d", spat, parity)) { ++ ++ declare_func(void, void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat); ++ ++ randomize_buffers(prev0, prev1, mask, 11*WIDTH); ++ randomize_buffers(next0, next1, mask, 11*WIDTH); ++ randomize_buffers( cur0, cur1, mask, 11*WIDTH); ++ memset(dst0, 0xba, WIDTH * 3); ++ memset(dst1, 0xba, WIDTH * 3); ++ ++ call_ref(dst0 + stride, ++ prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, WIDTH, ++ stride, -stride, stride * 2, -stride * 2, ++ parity, mask, spat); ++ call_new(dst1 + stride, ++ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH, ++ stride, -stride, stride * 2, -stride * 2, ++ parity, mask, spat); ++ ++ if (memcmp(dst0, dst1, WIDTH*3) ++ || memcmp(prev0, prev1, WIDTH*11) ++ || memcmp(next0, next1, WIDTH*11) ++ || memcmp( cur0, cur1, WIDTH*11)) ++ fail(); ++ ++ bench_new(dst1 + stride, ++ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH, ++ stride, -stride, stride * 2, -stride * 2, ++ parity, mask, spat); ++ } ++ } ++ } ++ ++ report("bwdif8.edge"); ++ } ++ + if (check_func(ctx_8.filter_intra, "bwdif8.intra")) { + LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]); + +From 171d7f201503812617b8e320c83cc33120425923 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 4 Jul 2023 14:04:42 +0000 +Subject: [PATCH 148/151] avfilter/vf_bwdif: Add neon for filter_edge +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Adds clip and spatial macros for aarch64 neon +Exports C filter_edge needed for tail fixup of neon code +Adds neon for filter_edge + +Signed-off-by: John Cox +Signed-off-by: Martin Storsjö +(cherry picked from commit 8130df83e0fbd3264fe990fb4e084ecbd452d0b1) +--- + 
libavfilter/aarch64/vf_bwdif_init_aarch64.c | 20 +++ + libavfilter/aarch64/vf_bwdif_neon.S | 177 ++++++++++++++++++++ + libavfilter/bwdif.h | 4 + + libavfilter/vf_bwdif.c | 8 +- + 4 files changed, 205 insertions(+), 4 deletions(-) + +diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +index 3ffaa07ab3..e75cf2f204 100644 +--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c ++++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +@@ -24,10 +24,29 @@ + #include "libavfilter/bwdif.h" + #include "libavutil/aarch64/cpu.h" + ++void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat); ++ + void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max); + + ++static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat) ++{ ++ const int w0 = clip_max != 255 ? 
0 : w & ~15; ++ ++ ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2, ++ parity, clip_max, spat); ++ ++ if (w0 < w) ++ ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, ++ w - w0, prefs, mrefs, prefs2, mrefs2, ++ parity, clip_max, spat); ++} ++ + static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max) + { +@@ -52,5 +71,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) + return; + + s->filter_intra = filter_intra_helper; ++ s->filter_edge = filter_edge_helper; + } + +diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S +index e288efbe6c..389302b813 100644 +--- a/libavfilter/aarch64/vf_bwdif_neon.S ++++ b/libavfilter/aarch64/vf_bwdif_neon.S +@@ -66,6 +66,79 @@ + umlsl2 \a3\().4s, \s1\().8h, \k + .endm + ++// int b = m2s1 - m1; ++// int f = p2s1 - p1; ++// int dc = c0s1 - m1; ++// int de = c0s1 - p1; ++// int sp_max = FFMIN(p1 - c0s1, m1 - c0s1); ++// sp_max = FFMIN(sp_max, FFMAX(-b,-f)); ++// int sp_min = FFMIN(c0s1 - p1, c0s1 - m1); ++// sp_min = FFMIN(sp_min, FFMAX(b,f)); ++// diff = diff == 0 ? 
0 : FFMAX3(diff, sp_min, sp_max); ++.macro SPAT_CHECK diff, m2s1, m1, c0s1, p1, p2s1, t0, t1, t2, t3 ++ uqsub \t0\().16b, \p1\().16b, \c0s1\().16b ++ uqsub \t2\().16b, \m1\().16b, \c0s1\().16b ++ umin \t2\().16b, \t0\().16b, \t2\().16b ++ ++ uqsub \t1\().16b, \m1\().16b, \m2s1\().16b ++ uqsub \t3\().16b, \p1\().16b, \p2s1\().16b ++ umax \t3\().16b, \t3\().16b, \t1\().16b ++ umin \t3\().16b, \t3\().16b, \t2\().16b ++ ++ uqsub \t0\().16b, \c0s1\().16b, \p1\().16b ++ uqsub \t2\().16b, \c0s1\().16b, \m1\().16b ++ umin \t2\().16b, \t0\().16b, \t2\().16b ++ ++ uqsub \t1\().16b, \m2s1\().16b, \m1\().16b ++ uqsub \t0\().16b, \p2s1\().16b, \p1\().16b ++ umax \t0\().16b, \t0\().16b, \t1\().16b ++ umin \t2\().16b, \t2\().16b, \t0\().16b ++ ++ cmeq \t1\().16b, \diff\().16b, #0 ++ umax \diff\().16b, \diff\().16b, \t3\().16b ++ umax \diff\().16b, \diff\().16b, \t2\().16b ++ bic \diff\().16b, \diff\().16b, \t1\().16b ++.endm ++ ++// i0 = s0; ++// if (i0 > d0 + diff0) ++// i0 = d0 + diff0; ++// else if (i0 < d0 - diff0) ++// i0 = d0 - diff0; ++// ++// i0 = s0 is safe ++.macro DIFF_CLIP i0, s0, d0, diff, t0, t1 ++ uqadd \t0\().16b, \d0\().16b, \diff\().16b ++ uqsub \t1\().16b, \d0\().16b, \diff\().16b ++ umin \i0\().16b, \s0\().16b, \t0\().16b ++ umax \i0\().16b, \i0\().16b, \t1\().16b ++.endm ++ ++// i0 = FFABS(m1 - p1) > td0 ? i1 : i2; ++// DIFF_CLIP ++// ++// i0 = i1 is safe ++.macro INTERPOL i0, i1, i2, m1, d0, p1, td0, diff, t0, t1, t2 ++ uabd \t0\().16b, \m1\().16b, \p1\().16b ++ cmhi \t0\().16b, \t0\().16b, \td0\().16b ++ bsl \t0\().16b, \i1\().16b, \i2\().16b ++ DIFF_CLIP \i0, \t0, \d0, \diff, \t1, \t2 ++.endm ++ ++.macro PUSH_VREGS ++ stp d8, d9, [sp, #-64]! 
++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++.endm ++ ++.macro POP_VREGS ++ ldp d14, d15, [sp, #48] ++ ldp d12, d13, [sp, #32] ++ ldp d10, d11, [sp, #16] ++ ldp d8, d9, [sp], #64 ++.endm ++ + .macro LDR_COEFFS d, t0 + movrel \t0, coeffs, 0 + ld1 {\d\().8h}, [\t0] +@@ -81,6 +154,110 @@ const coeffs, align=4 // align 4 means align on 2^4 boundry + .hword 5077, 981 // sp[0] = v0.h[6] + endconst + ++// ============================================================================ ++// ++// void ff_bwdif_filter_edge_neon( ++// void *dst1, // x0 ++// void *prev1, // x1 ++// void *cur1, // x2 ++// void *next1, // x3 ++// int w, // w4 ++// int prefs, // w5 ++// int mrefs, // w6 ++// int prefs2, // w7 ++// int mrefs2, // [sp, #0] ++// int parity, // [sp, #SP_INT] ++// int clip_max, // [sp, #SP_INT*2] unused ++// int spat); // [sp, #SP_INT*3] ++ ++function ff_bwdif_filter_edge_neon, export=1 ++ // Sanity check w ++ cmp w4, #0 ++ ble 99f ++ ++// #define prev2 cur ++// const uint8_t * restrict next2 = parity ? 
prev : next; ++ ++ ldr w8, [sp, #0] // mrefs2 ++ ++ ldr w17, [sp, #SP_INT] // parity ++ ldr w16, [sp, #SP_INT*3] // spat ++ cmp w17, #0 ++ csel x17, x1, x3, ne ++ ++// for (x = 0; x < w; x++) { ++ ++10: ++// int m1 = cur[mrefs]; ++// int d = (prev2[0] + next2[0]) >> 1; ++// int p1 = cur[prefs]; ++// int temporal_diff0 = FFABS(prev2[0] - next2[0]); ++// int temporal_diff1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; ++// int temporal_diff2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; ++// int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); ++ ldr q31, [x2] ++ ldr q21, [x17] ++ uhadd v16.16b, v31.16b, v21.16b // d0 = v16 ++ uabd v17.16b, v31.16b, v21.16b // td0 = v17 ++ ldr q24, [x2, w6, sxtw] // m1 = v24 ++ ldr q22, [x2, w5, sxtw] // p1 = v22 ++ ++ ldr q0, [x1, w6, sxtw] // prev[mrefs] ++ ldr q2, [x1, w5, sxtw] // prev[prefs] ++ ldr q1, [x3, w6, sxtw] // next[mrefs] ++ ldr q3, [x3, w5, sxtw] // next[prefs] ++ ++ ushr v29.16b, v17.16b, #1 ++ ++ uabd v31.16b, v0.16b, v24.16b ++ uabd v30.16b, v2.16b, v22.16b ++ uhadd v0.16b, v31.16b, v30.16b // td1 = q0 ++ ++ uabd v31.16b, v1.16b, v24.16b ++ uabd v30.16b, v3.16b, v22.16b ++ uhadd v1.16b, v31.16b, v30.16b // td2 = q1 ++ ++ umax v0.16b, v0.16b, v29.16b ++ umax v0.16b, v0.16b, v1.16b // diff = v0 ++ ++// if (spat) { ++// SPAT_CHECK() ++// } ++// i0 = (m1 + p1) >> 1; ++ cbz w16, 1f ++ ++ ldr q31, [x2, w8, sxtw] ++ ldr q18, [x17, w8, sxtw] ++ ldr q30, [x2, w7, sxtw] ++ ldr q19, [x17, w7, sxtw] ++ uhadd v18.16b, v18.16b, v31.16b ++ uhadd v19.16b, v19.16b, v30.16b ++ ++ SPAT_CHECK v0, v18, v24, v16, v22, v19, v31, v30, v29, v28 ++ ++1: ++ uhadd v2.16b, v22.16b, v24.16b ++ ++ // i0 = v2, s0 = v2, d0 = v16, diff = v0, t0 = v31, t1 = v30 ++ DIFF_CLIP v2, v2, v16, v0, v31, v30 ++ ++// dst[0] = av_clip(interpol, 0, clip_max); ++ str q2, [x0], #16 ++ ++// dst++; ++// cur++; ++// } ++ subs w4, w4, #16 ++ add x1, x1, #16 ++ add x2, x2, #16 ++ add x3, x3, #16 ++ add x17, x17, #16 ++ bgt 
10b ++ ++99: ++ ret ++endfunc ++ + // ============================================================================ + // + // void ff_bwdif_filter_intra_neon( +diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h +index ae6f6ce223..ae1616d366 100644 +--- a/libavfilter/bwdif.h ++++ b/libavfilter/bwdif.h +@@ -41,6 +41,10 @@ void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth); + void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth); + void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth); + ++void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat); ++ + void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max); + +diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c +index 6ec8bbab5d..688c2d2572 100644 +--- a/libavfilter/vf_bwdif.c ++++ b/libavfilter/vf_bwdif.c +@@ -150,9 +150,9 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, + FILTER2() + } + +-static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, +- int w, int prefs, int mrefs, int prefs2, int mrefs2, +- int parity, int clip_max, int spat) ++void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat) + { + uint8_t *dst = dst1; + uint8_t *prev = prev1; +@@ -354,7 +354,7 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) + } else { + s->filter_intra = ff_bwdif_filter_intra_c; + s->filter_line = filter_line_c; +- s->filter_edge = filter_edge; ++ s->filter_edge = ff_bwdif_filter_edge_c; + } + + #if ARCH_X86 + +From abf6588935bce275ba302766bcd8c3bb7a523d3c Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 4 Jul 2023 14:04:43 +0000 +Subject: [PATCH 149/151] avfilter/vf_bwdif: Add neon for filter_line 
+MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Exports C filter_line needed for tail fixup of neon code +Adds neon for filter_line + +Signed-off-by: John Cox +Signed-off-by: Martin Storsjö +(cherry picked from commit 94cb94a2c0910d364a7181fc5cc0e9556b777d0a) +--- + libavfilter/aarch64/vf_bwdif_init_aarch64.c | 21 ++ + libavfilter/aarch64/vf_bwdif_neon.S | 203 ++++++++++++++++++++ + libavfilter/bwdif.h | 5 + + libavfilter/vf_bwdif.c | 10 +- + 4 files changed, 234 insertions(+), 5 deletions(-) + +diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +index e75cf2f204..21e67884ab 100644 +--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c ++++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +@@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, + void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max); + ++void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max); ++ ++ ++static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max) ++{ ++ const int w0 = clip_max != 255 ? 
0 : w & ~15; ++ ++ ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1, ++ w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); ++ ++ if (w0 < w) ++ ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, ++ w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); ++} + + static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, +@@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) + return; + + s->filter_intra = filter_intra_helper; ++ s->filter_line = filter_line_helper; + s->filter_edge = filter_edge_helper; + } + +diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S +index 389302b813..f185e94e3c 100644 +--- a/libavfilter/aarch64/vf_bwdif_neon.S ++++ b/libavfilter/aarch64/vf_bwdif_neon.S +@@ -154,6 +154,209 @@ const coeffs, align=4 // align 4 means align on 2^4 boundry + .hword 5077, 981 // sp[0] = v0.h[6] + endconst + ++// =========================================================================== ++// ++// void filter_line( ++// void *dst1, // x0 ++// void *prev1, // x1 ++// void *cur1, // x2 ++// void *next1, // x3 ++// int w, // w4 ++// int prefs, // w5 ++// int mrefs, // w6 ++// int prefs2, // w7 ++// int mrefs2, // [sp, #0] ++// int prefs3, // [sp, #SP_INT] ++// int mrefs3, // [sp, #SP_INT*2] ++// int prefs4, // [sp, #SP_INT*3] ++// int mrefs4, // [sp, #SP_INT*4] ++// int parity, // [sp, #SP_INT*5] ++// int clip_max) // [sp, #SP_INT*6] ++ ++function ff_bwdif_filter_line_neon, export=1 ++ // Sanity check w ++ cmp w4, #0 ++ ble 99f ++ ++ // Rearrange regs to be the same as line3 for ease of debug! 
++ mov w10, w4 // w10 = loop count ++ mov w9, w6 // w9 = mref ++ mov w12, w7 // w12 = pref2 ++ mov w11, w5 // w11 = pref ++ ldr w8, [sp, #0] // w8 = mref2 ++ ldr w7, [sp, #SP_INT*2] // w7 = mref3 ++ ldr w6, [sp, #SP_INT*4] // w6 = mref4 ++ ldr w13, [sp, #SP_INT] // w13 = pref3 ++ ldr w14, [sp, #SP_INT*3] // w14 = pref4 ++ ++ mov x4, x3 ++ mov x3, x2 ++ mov x2, x1 ++ ++ LDR_COEFFS v0, x17 ++ ++// #define prev2 cur ++// const uint8_t * restrict next2 = parity ? prev : next; ++ ldr w17, [sp, #SP_INT*5] // parity ++ cmp w17, #0 ++ csel x17, x2, x4, ne ++ ++ PUSH_VREGS ++ ++// for (x = 0; x < w; x++) { ++// int diff0, diff2; ++// int d0, d2; ++// int temporal_diff0, temporal_diff2; ++// ++// int i1, i2; ++// int j1, j2; ++// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4; ++ ++10: ++// c0 = prev2[0] + next2[0]; // c0 = v20, v21 ++// d0 = c0 >> 1; // d0 = v10 ++// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11 ++ ldr q31, [x3] ++ ldr q21, [x17] ++ uhadd v10.16b, v31.16b, v21.16b ++ uabd v11.16b, v31.16b, v21.16b ++ uaddl v20.8h, v21.8b, v31.8b ++ uaddl2 v21.8h, v21.16b, v31.16b ++ ++ ldr q31, [x3, w6, sxtw] ++ ldr q23, [x17, w6, sxtw] ++ ++// i1 = coef_hf[0] * c0; // i1 = v2-v5 ++ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2] ++ ++ ldr q30, [x3, w14, sxtw] ++ ldr q25, [x17, w14, sxtw] ++ ++// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23 ++ uaddl v22.8h, v23.8b, v31.8b ++ uaddl2 v23.8h, v23.16b, v31.16b ++ ++// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12 ++ uhadd v12.16b, v25.16b, v30.16b ++ uaddl v24.8h, v25.8b, v30.8b ++ uaddl2 v25.8h, v25.16b, v30.16b ++ ++// m3 = cur[mrefs3]; // m3 = v20 ++ ldr q20, [x3, w7, sxtw] ++ ++// p3 = cur[prefs3]; // p3 = v21 ++ ldr q21, [x3, w13, sxtw] ++ ++// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25) ++ add v22.8h, v22.8h, v24.8h ++ add v23.8h, v23.8h, v25.8h ++ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4] ++ ++ ldr q29, [x3, w8, sxtw] ++ ldr q23, [x17, w8, sxtw] ++ ++// i1 -= 
coef_lf[1] * 4 * (m3 + p3); // - ++ uaddl v30.8h, v20.8b, v21.8b ++ uaddl2 v31.8h, v20.16b, v21.16b ++ ++ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1] ++ ++ ldr q31, [x3, w12, sxtw] ++ ldr q27, [x17, w12, sxtw] ++ ++// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13 ++ uhadd v13.16b, v23.16b, v29.16b ++ uaddl v22.8h, v23.8b, v29.8b ++ uaddl2 v23.8h, v23.16b, v29.16b ++ ++// m1 = cur[mrefs]; // m1 = v24 ++ ldr q24, [x3, w9, sxtw] ++ ++// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27 ++// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14 ++// d2 = p2 >> 1; // d2 = v15 ++ uabd v14.16b, v31.16b, v27.16b ++ uhadd v15.16b, v31.16b, v27.16b ++ uaddl v26.8h, v27.8b, v31.8b ++ uaddl2 v27.8h, v27.16b, v31.16b ++ ++// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*) ++ add v22.8h, v22.8h, v26.8h ++ add v23.8h, v23.8h, v27.8h ++ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3] ++ ++// p1 = cur[prefs]; // p1 = v22 ++ ldr q22, [x3, w11, sxtw] ++ ++// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17 ++ uaddl v18.8h, v22.8b, v24.8b ++ uaddl2 v19.8h, v22.16b, v24.16b ++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] ++ ++ uaddl v18.8h, v20.8b, v21.8b ++ uaddl2 v19.8h, v20.16b, v21.16b ++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] ++ ++ SQSHRUNN v17, v28, v29, v30, v31, 13 ++ ++// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24 ++ uaddl v26.8h, v24.8b, v22.8b ++ uaddl2 v27.8h, v24.16b, v22.16b ++ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0] ++ ++ ldr q31, [x2, w9, sxtw] ++ ldr q29, [x4, w9, sxtw] ++ ++ ldr q30, [x2, w11, sxtw] ++ ldr q28, [x4, w11, sxtw] ++ ++// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5* ++ SQSHRUNN v2, v2, v3, v4, v5, 15 ++ ++// { ++// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; ++// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; ++ uabd v30.16b, v22.16b, v30.16b ++ uabd v31.16b, v24.16b, v31.16b ++ uabd v28.16b, v22.16b, v28.16b ++ uabd 
v29.16b, v24.16b, v29.16b ++ uhadd v31.16b, v31.16b, v30.16b ++ uhadd v29.16b, v29.16b, v28.16b ++ ++// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18 ++ ushr v18.16b, v11.16b, #1 ++ umax v18.16b, v18.16b, v31.16b ++ umax v18.16b, v18.16b, v29.16b ++ ++ // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15 ++ SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28 ++ ++ // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18 ++ INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29 ++ ++// dst[0] = av_clip_uint8(interpol); ++ str q2, [x0], #16 ++// } ++// ++// dst++; ++// cur++; ++// prev++; ++// prev2++; ++// next++; ++// } ++ ++ subs w10, w10, #16 ++ add x2, x2, #16 ++ add x3, x3, #16 ++ add x4, x4, #16 ++ add x17, x17, #16 ++ bgt 10b ++ ++ POP_VREGS ++99: ++ ret ++endfunc ++ + // ============================================================================ + // + // void ff_bwdif_filter_edge_neon( +diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h +index ae1616d366..cce99953f3 100644 +--- a/libavfilter/bwdif.h ++++ b/libavfilter/bwdif.h +@@ -48,4 +48,9 @@ void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, + void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max); + ++void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max); ++ + #endif /* AVFILTER_BWDIF_H */ +diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c +index 688c2d2572..2dc47f9614 100644 +--- a/libavfilter/vf_bwdif.c ++++ b/libavfilter/vf_bwdif.c +@@ -132,10 +132,10 @@ void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs + FILTER_INTRA() + } + +-static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, +- int w, int prefs, int 
mrefs, int prefs2, int mrefs2, +- int prefs3, int mrefs3, int prefs4, int mrefs4, +- int parity, int clip_max) ++void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max) + { + uint8_t *dst = dst1; + uint8_t *prev = prev1; +@@ -353,7 +353,7 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) + s->filter_edge = filter_edge_16bit; + } else { + s->filter_intra = ff_bwdif_filter_intra_c; +- s->filter_line = filter_line_c; ++ s->filter_line = ff_bwdif_filter_line_c; + s->filter_edge = ff_bwdif_filter_edge_c; + } + + +From 7601de6ab2604d1f530e4b8f20f409d1ec2ae6a4 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 4 Jul 2023 14:04:44 +0000 +Subject: [PATCH 150/151] avfilter/vf_bwdif: Add a filter_line3 method for + optimisation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add an optional filter_line3 to the available optimisations. + +filter_line3 is equivalent to filter_line, memcpy, filter_line + +filter_line shares quite a number of loads and some calculations in +common with its next iteration and testing shows that using aarch64 +neon filter_line3s performance is 30% better than two filter_lines +and a memcpy. + +Adds a test for vf_bwdif filter_line3 to checkasm + +Rounds job start lines down to a multiple of 4. This means that if +filter_line3 exists then filter_line will not sometimes be called +once at the end of a slice depending on thread count. The final slice +may do up to 3 extra lines but filter_edge is faster than filter_line +so it is unlikely to create any noticeable thread load variation. 
+ +Signed-off-by: John Cox +Signed-off-by: Martin Storsjö +(cherry picked from commit 697533e76dbea8cc7fd6a0642bc60050cc05ead8) +--- + libavfilter/bwdif.h | 7 ++++ + libavfilter/vf_bwdif.c | 44 +++++++++++++++++++-- + tests/checkasm/vf_bwdif.c | 81 +++++++++++++++++++++++++++++++++++++++ + 3 files changed, 129 insertions(+), 3 deletions(-) + +diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h +index cce99953f3..496cec72ef 100644 +--- a/libavfilter/bwdif.h ++++ b/libavfilter/bwdif.h +@@ -35,6 +35,9 @@ typedef struct BWDIFContext { + void (*filter_edge)(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat); ++ void (*filter_line3)(void *dst, int dstride, ++ const void *prev, const void *cur, const void *next, int prefs, ++ int w, int parity, int clip_max); + } BWDIFContext; + + void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth); +@@ -53,4 +56,8 @@ void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max); + ++void ff_bwdif_filter_line3_c(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max); ++ + #endif /* AVFILTER_BWDIF_H */ +diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c +index 2dc47f9614..9847d38b6a 100644 +--- a/libavfilter/vf_bwdif.c ++++ b/libavfilter/vf_bwdif.c +@@ -150,6 +150,31 @@ void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, + FILTER2() + } + ++#define NEXT_LINE()\ ++ dst += d_stride; \ ++ prev += prefs; \ ++ cur += prefs; \ ++ next += prefs; ++ ++void ff_bwdif_filter_line3_c(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max) ++{ ++ const int prefs = s_stride; ++ uint8_t * dst = dst1; ++ const uint8_t * prev = prev1; ++ const uint8_t * cur = 
cur1; ++ const uint8_t * next = next1; ++ ++ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w, ++ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max); ++ NEXT_LINE(); ++ memcpy(dst, cur, w); ++ NEXT_LINE(); ++ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w, ++ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max); ++} ++ + void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat) +@@ -212,6 +237,13 @@ static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1, + FILTER2() + } + ++// Round job start line down to multiple of 4 so that if filter_line3 exists ++// and the frame is a multiple of 4 high then filter_line will never be called ++static inline int job_start(const int jobnr, const int nb_jobs, const int h) ++{ ++ return jobnr >= nb_jobs ? h : ((h * jobnr) / nb_jobs) & ~3; ++} ++ + static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) + { + BWDIFContext *s = ctx->priv; +@@ -221,8 +253,8 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) + int clip_max = (1 << (yadif->csp->comp[td->plane].depth)) - 1; + int df = (yadif->csp->comp[td->plane].depth + 7) / 8; + int refs = linesize / df; +- int slice_start = (td->h * jobnr ) / nb_jobs; +- int slice_end = (td->h * (jobnr+1)) / nb_jobs; ++ int slice_start = job_start(jobnr, nb_jobs, td->h); ++ int slice_end = job_start(jobnr + 1, nb_jobs, td->h); + int y; + + for (y = slice_start; y < slice_end; y++) { +@@ -244,6 +276,11 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) + refs << 1, -(refs << 1), + td->parity ^ td->tff, clip_max, + (y < 2) || ((y + 3) > td->h) ? 
0 : 1); ++ } else if (s->filter_line3 && y + 2 < slice_end && y + 6 < td->h) { ++ s->filter_line3(dst, td->frame->linesize[td->plane], ++ prev, cur, next, linesize, td->w, ++ td->parity ^ td->tff, clip_max); ++ y += 2; + } else { + s->filter_line(dst, prev, cur, next, td->w, + refs, -refs, refs << 1, -(refs << 1), +@@ -280,7 +317,7 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic, + td.plane = i; + + ff_filter_execute(ctx, filter_slice, &td, NULL, +- FFMIN(h, ff_filter_get_nb_threads(ctx))); ++ FFMIN((h+3)/4, ff_filter_get_nb_threads(ctx))); + } + if (yadif->current_field == YADIF_FIELD_END) { + yadif->current_field = YADIF_FIELD_NORMAL; +@@ -347,6 +384,7 @@ static int config_props(AVFilterLink *link) + + av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) + { ++ s->filter_line3 = 0; + if (bit_depth > 8) { + s->filter_intra = filter_intra_16bit; + s->filter_line = filter_line_c_16bit; +diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c +index 5fdba09fdc..3399cacdf7 100644 +--- a/tests/checkasm/vf_bwdif.c ++++ b/tests/checkasm/vf_bwdif.c +@@ -28,6 +28,10 @@ + for (size_t i = 0; i < count; i++) \ + buf0[i] = buf1[i] = rnd() & mask + ++#define randomize_overflow_check(buf0, buf1, mask, count) \ ++ for (size_t i = 0; i < count; i++) \ ++ buf0[i] = buf1[i] = (rnd() & 1) != 0 ? 
mask : 0; ++ + #define BODY(type, depth) \ + do { \ + type prev0[9*WIDTH], prev1[9*WIDTH]; \ +@@ -83,6 +87,83 @@ void checkasm_check_vf_bwdif(void) + report("bwdif10"); + } + ++ if (!ctx_8.filter_line3) ++ ctx_8.filter_line3 = ff_bwdif_filter_line3_c; ++ ++ { ++ LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]); ++ LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]); ++ LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]); ++ const int stride = WIDTH; ++ const int mask = (1<<8)-1; ++ int parity; ++ ++ for (parity = 0; parity != 2; ++parity) { ++ if (check_func(ctx_8.filter_line3, "bwdif8.line3.rnd.p%d", parity)) { ++ ++ declare_func(void, void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int prefs, ++ int w, int parity, int clip_max); ++ ++ randomize_buffers(prev0, prev1, mask, 11*WIDTH); ++ randomize_buffers(next0, next1, mask, 11*WIDTH); ++ randomize_buffers( cur0, cur1, mask, 11*WIDTH); ++ ++ call_ref(dst0, stride, ++ prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride, ++ WIDTH, parity, mask); ++ call_new(dst1, stride, ++ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride, ++ WIDTH, parity, mask); ++ ++ if (memcmp(dst0, dst1, WIDTH*3) ++ || memcmp(prev0, prev1, WIDTH*11) ++ || memcmp(next0, next1, WIDTH*11) ++ || memcmp( cur0, cur1, WIDTH*11)) ++ fail(); ++ ++ bench_new(dst1, stride, ++ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride, ++ WIDTH, parity, mask); ++ } ++ } ++ ++ // Use just 0s and ~0s to try to provoke bad cropping or overflow ++ // Parity makes no difference to this test so just test 0 ++ if (check_func(ctx_8.filter_line3, "bwdif8.line3.overflow")) { ++ ++ declare_func(void, void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const 
void * next1, int prefs, ++ int w, int parity, int clip_max); ++ ++ randomize_overflow_check(prev0, prev1, mask, 11*WIDTH); ++ randomize_overflow_check(next0, next1, mask, 11*WIDTH); ++ randomize_overflow_check( cur0, cur1, mask, 11*WIDTH); ++ ++ call_ref(dst0, stride, ++ prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride, ++ WIDTH, 0, mask); ++ call_new(dst1, stride, ++ prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride, ++ WIDTH, 0, mask); ++ ++ if (memcmp(dst0, dst1, WIDTH*3) ++ || memcmp(prev0, prev1, WIDTH*11) ++ || memcmp(next0, next1, WIDTH*11) ++ || memcmp( cur0, cur1, WIDTH*11)) ++ fail(); ++ ++ // No point to benching ++ } ++ ++ report("bwdif8.line3"); ++ } ++ + { + LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]); + LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]); + +From 120058b7abd0db1d222b1e197207de8226fdfd94 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 4 Jul 2023 14:04:45 +0000 +Subject: [PATCH 151/151] avfilter/vf_bwdif: Add neon for filter_line3 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: John Cox +Signed-off-by: Martin Storsjö +(cherry picked from commit f00222e81f7d6a59d977fbb280d67989818e0ad2) +--- + libavfilter/aarch64/vf_bwdif_init_aarch64.c | 28 ++ + libavfilter/aarch64/vf_bwdif_neon.S | 272 ++++++++++++++++++++ + 2 files changed, 300 insertions(+) + +diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +index 21e67884ab..f52bc4b9b4 100644 +--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c ++++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +@@ -36,6 +36,33 @@ void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max); + ++void ff_bwdif_filter_line3_neon(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max); ++ ++ 
++static void filter_line3_helper(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max) ++{ ++ // Asm works on 16 byte chunks of 8-bit samples (clip_max == 255 only). ++ // If w is a multiple of 16 then all is good - if not then, provided both ++ // the src & dst strides are at least w rounded up to 16, let the asm run ++ // over the padding bytes as that is almost certainly faster than having ++ // to invoke the C version to clean up the tail. ++ const int w1 = FFALIGN(w, 16); ++ const int w0 = clip_max != 255 ? 0 : ++ d_stride >= w1 && s_stride >= w1 ? w : w & ~15; ++ ++ ff_bwdif_filter_line3_neon(dst1, d_stride, ++ prev1, cur1, next1, s_stride, ++ w0, parity, clip_max); ++ ++ if (w0 < w) ++ ff_bwdif_filter_line3_c((char *)dst1 + w0, d_stride, ++ (const char *)prev1 + w0, (const char *)cur1 + w0, (const char *)next1 + w0, s_stride, ++ w - w0, parity, clip_max); ++} + + static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, +@@ -93,5 +120,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) + s->filter_intra = filter_intra_helper; + s->filter_line = filter_line_helper; + s->filter_edge = filter_edge_helper; ++ s->filter_line3 = filter_line3_helper; + } + +diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S +index f185e94e3c..ae9aab20cd 100644 +--- a/libavfilter/aarch64/vf_bwdif_neon.S ++++ b/libavfilter/aarch64/vf_bwdif_neon.S +@@ -154,6 +154,278 @@ const coeffs, align=4 // align 4 means align on 2^4 boundry + .hword 5077, 981 // sp[0] = v0.h[6] + endconst + ++// =========================================================================== ++// ++// void ff_bwdif_filter_line3_neon( ++// void * dst1, // x0 ++// int d_stride, // w1 ++// const void * prev1, // x2 ++// const void * cur1, // x3 ++// const void * next1, // x4 ++// int s_stride, // w5 ++// int w, // w6 ++// int parity, // w7 ++//
int clip_max); // [sp, #0] (Ignored) ++ ++function ff_bwdif_filter_line3_neon, export=1 ++ // Sanity check w ++ cmp w6, #0 ++ ble 99f ++ ++ LDR_COEFFS v0, x17 ++ ++// #define prev2 cur ++// const uint8_t * restrict next2 = parity ? prev : next; ++ cmp w7, #0 ++ csel x17, x2, x4, ne ++ ++ // We want all the V registers - save all the ones we must ++ PUSH_VREGS ++ ++ // Some rearrangement of initial values for nice layout of refs in regs ++ mov w10, w6 // w10 = loop count ++ neg w9, w5 // w9 = mref ++ lsl w8, w9, #1 // w8 = mref2 ++ add w7, w9, w9, LSL #1 // w7 = mref3 ++ lsl w6, w9, #2 // w6 = mref4 ++ mov w11, w5 // w11 = pref ++ lsl w12, w5, #1 // w12 = pref2 ++ add w13, w5, w5, LSL #1 // w13 = pref3 ++ lsl w14, w5, #2 // w14 = pref4 ++ add w15, w5, w5, LSL #2 // w15 = pref5 ++ add w16, w14, w12 // w16 = pref6 ++ ++ lsl w5, w1, #1 // w5 = d_stride * 2 ++ ++// for (x = 0; x < w; x++) { ++// int diff0, diff2; ++// int d0, d2; ++// int temporal_diff0, temporal_diff2; ++// ++// int i1, i2; ++// int j1, j2; ++// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4; ++ ++10: ++// c0 = prev2[0] + next2[0]; // c0 = v20, v21 ++// d0 = c0 >> 1; // d0 = v10 ++// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11 ++ ldr q31, [x3] ++ ldr q21, [x17] ++ uhadd v10.16b, v31.16b, v21.16b ++ uabd v11.16b, v31.16b, v21.16b ++ uaddl v20.8h, v21.8b, v31.8b ++ uaddl2 v21.8h, v21.16b, v31.16b ++ ++ ldr q31, [x3, w6, sxtw] ++ ldr q23, [x17, w6, sxtw] ++ ++// i1 = coef_hf[0] * c0; // i1 = v2-v5 ++ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2] ++ ++ ldr q30, [x3, w14, sxtw] ++ ldr q25, [x17, w14, sxtw] ++ ++// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23 ++ uaddl v22.8h, v23.8b, v31.8b ++ uaddl2 v23.8h, v23.16b, v31.16b ++ ++// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12 ++ uhadd v12.16b, v25.16b, v30.16b ++ uaddl v24.8h, v25.8b, v30.8b ++ uaddl2 v25.8h, v25.16b, v30.16b ++ ++// j1 = -coef_hf[1] * (c0 + p4); // j1 = v6-v9 (-c0:v20,v21) ++ add v20.8h, v20.8h, 
v24.8h ++ add v21.8h, v21.8h, v25.8h ++ SMULL4K v6, v7, v8, v9, v20, v21, v0.h[5] ++ ++// m3 = cur[mrefs3]; // m3 = v20 ++ ldr q20, [x3, w7, sxtw] ++ ++// p3 = cur[prefs3]; // p3 = v21 ++ ldr q21, [x3, w13, sxtw] ++ ++// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25) ++ add v22.8h, v22.8h, v24.8h ++ add v23.8h, v23.8h, v25.8h ++ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4] ++ ++ ldr q29, [x3, w8, sxtw] ++ ldr q23, [x17, w8, sxtw] ++ ++// i1 -= coef_lf[1] * 4 * (m3 + p3); // - ++ uaddl v30.8h, v20.8b, v21.8b ++ uaddl2 v31.8h, v20.16b, v21.16b ++ ++ ldr q28, [x3, w16, sxtw] ++ ldr q25, [x17, w16, sxtw] ++ ++ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1] ++ ++// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13 ++ uhadd v13.16b, v23.16b, v29.16b ++ uaddl v22.8h, v23.8b, v29.8b ++ uaddl2 v23.8h, v23.16b, v29.16b ++ ++ ldr q31, [x3, w12, sxtw] ++ ldr q27, [x17, w12, sxtw] ++ ++// p6 = prev2[prefs6] + next2[prefs6]; // p6 = v24,v25 ++ uaddl v24.8h, v25.8b, v28.8b ++ uaddl2 v25.8h, v25.16b, v28.16b ++ ++// j1 += coef_hf[2] * (m2 + p6); // (-p6:v24,v25) ++ add v24.8h, v24.8h, v22.8h ++ add v25.8h, v25.8h, v23.8h ++ UMLAL4K v6, v7, v8, v9, v24, v25, v0.h[4] ++ ++// m1 = cur[mrefs]; // m1 = v24 ++ ldr q24, [x3, w9, sxtw] ++ ++// p5 = cur[prefs5]; // p5 = v25 ++ ldr q25, [x3, w15, sxtw] ++ ++// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27 ++// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14 ++// d2 = p2 >> 1; // d2 = v15 ++ uabd v14.16b, v31.16b, v27.16b ++ uhadd v15.16b, v31.16b, v27.16b ++ uaddl v26.8h, v27.8b, v31.8b ++ uaddl2 v27.8h, v27.16b, v31.16b ++ ++// j1 += coef_hf[0] * p2; // - ++ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[2] ++ ++// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*) ++ add v22.8h, v22.8h, v26.8h ++ add v23.8h, v23.8h, v27.8h ++ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3] ++ ++// p1 = cur[prefs]; // p1 = v22 ++ ldr q22, [x3, w11, sxtw] ++ ++// j1 -= coef_lf[1] * 4 * (m1 + p5); // - ++ 
uaddl v26.8h, v24.8b, v25.8b ++ uaddl2 v27.8h, v24.16b, v25.16b ++ UMLSL4K v6, v7, v8, v9, v26, v27, v0.h[1] ++ ++// j2 = (coef_sp[0] * (p1 + p3) - coef_sp[1] * (m1 + p5)) >> 13; // (-p5:v25*) j2=v16 ++ uaddl v18.8h, v22.8b, v21.8b ++ uaddl2 v19.8h, v22.16b, v21.16b ++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] ++ ++ uaddl v18.8h, v24.8b, v25.8b ++ uaddl2 v19.8h, v24.16b, v25.16b ++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] ++ ++ SQSHRUNN v16, v28, v29, v30, v31, 13 ++ ++// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17 ++ uaddl v18.8h, v22.8b, v24.8b ++ uaddl2 v19.8h, v22.16b, v24.16b ++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] ++ ++ uaddl v18.8h, v20.8b, v21.8b ++ uaddl2 v19.8h, v20.16b, v21.16b ++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] ++ ++ SQSHRUNN v17, v28, v29, v30, v31, 13 ++ ++// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24 ++ uaddl v26.8h, v24.8b, v22.8b ++ uaddl2 v27.8h, v24.16b, v22.16b ++ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0] ++ ++ ldr q31, [x2, w9, sxtw] ++ ldr q29, [x4, w9, sxtw] ++ ++// j1 += coef_lf[0] * 4 * (p1 + p3); // p1 = v22, p3 = v21 ++ uaddl v26.8h, v21.8b, v22.8b ++ uaddl2 v27.8h, v21.16b, v22.16b ++ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[0] ++ ++ ldr q30, [x2, w11, sxtw] ++ ldr q28, [x4, w11, sxtw] ++ ++// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5* ++ SQSHRUNN v2, v2, v3, v4, v5, 15 ++ ++// j1 >>= 15; // j1 = v3, -v6*, -v7*, -v8*, -v9* ++ SQSHRUNN v3, v6, v7, v8, v9, 15 ++ ++// { ++// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; ++// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; ++ uabd v30.16b, v22.16b, v30.16b ++ uabd v31.16b, v24.16b, v31.16b ++ uabd v28.16b, v22.16b, v28.16b ++ uabd v29.16b, v24.16b, v29.16b ++ uhadd v31.16b, v31.16b, v30.16b ++ uhadd v29.16b, v29.16b, v28.16b ++ ++ ldr q27, [x2, w13, sxtw] ++ ldr q26, [x4, w13, sxtw] ++ ++// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18 ++ ushr v18.16b, v11.16b, #1 
++ umax v18.16b, v18.16b, v31.16b ++ umax v18.16b, v18.16b, v29.16b ++// } // v28, v30 preserved for next block ++// { // tdiff2 = v14 ++// int t1 =(FFABS(prev[prefs] - p1) + FFABS(prev[prefs3] - p3)) >> 1; ++// int t2 =(FFABS(next[prefs] - p1) + FFABS(next[prefs3] - p3)) >> 1; ++ uabd v31.16b, v21.16b, v27.16b ++ uabd v29.16b, v21.16b, v26.16b ++ uhadd v31.16b, v31.16b, v30.16b ++ uhadd v29.16b, v29.16b, v28.16b ++ ++// diff2 = FFMAX3(temporal_diff2 >> 1, t1, t2); // diff2=v19 ++ ushr v19.16b, v14.16b, #1 ++ umax v19.16b, v19.16b, v31.16b ++ umax v19.16b, v19.16b, v29.16b ++// } ++ ++ // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15 ++ SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28 ++ ++ // diff2 = v19, d0 = v10, p1 = v22, d2 = v15, p3 = v21, (p4 >> 1) = v12 ++ SPAT_CHECK v19, v10, v22, v15, v21, v12, v31, v30, v29, v28 ++ ++ // j1 = v3, j2 = v16, p1 = v22, d2 = v15, p3 = v21, td2 = v14, diff2 = v19 ++ INTERPOL v3, v3, v16, v22, v15, v21, v14, v19, v31, v30, v29 ++ ++// dst[d_stride * 2] = av_clip_uint8(interpol); ++ str q3, [x0, w5, sxtw] ++ ++// dst[d_stride] = p1; ++ str q22, [x0, w1, sxtw] ++ ++ // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td0 = v11, diff0 = v18 ++ INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29 ++ ++// dst[0] = av_clip_uint8(interpol); ++ str q2, [x0], #16 ++// } ++// ++// dst++; ++// cur++; ++// prev++; ++// prev2++; ++// next++; ++// } ++ subs w10, w10, #16 ++ add x2, x2, #16 ++ add x3, x3, #16 ++ add x4, x4, #16 ++ add x17, x17, #16 ++ bgt 10b ++ ++ POP_VREGS ++99: ++ ret ++endfunc ++ + // =========================================================================== + // + // void filter_line(