From 35f659918f75ee23eb6a52432d20726bdbad5533 Mon Sep 17 00:00:00 2001
From: Matthias Reichl <hias@horus.com>
Date: Fri, 21 Jul 2023 20:16:20 +0200
Subject: [PATCH 1/2] RPi: update kodi patch to support bwdif deinterlace

Signed-off-by: Matthias Reichl <hias@horus.com>
---
 ...et-max-bpc-for-high-bit-depth-videos.patch |   4 +-
 ...DRMPRIME-Also-support-YUV420-buffers.patch |  12 +-
 ...MPRIME-Adjust-av-formats-to-match-re.patch |  14 +-
 ...PRIME-Add-support-for-arbitrary-outp.patch |  30 +-
 ...PRIME-Remove-obsolete-thread_safe_ca.patch |   8 +-
 ...eoCodecDRMPRIME-Clear-m_pFilterGraph.patch |  24 ++
 ...PRIME-Move-FilterTest-from-open-to-f.patch |  70 +++++
 ...PRIME-Rework-filtering-code-to-handl.patch | 277 ++++++++++++++++++
 ...eg-change-default-software-deinterla.patch |  74 +++++
 ...MPRIME-Support-decoding-to-DRMPRIME-.patch |  55 ++++
 ...PRIME-Request-v4l2-buffers-be-alloca.patch |  30 ++
 ...PRIME-Add-setting-to-enable-hw-deint.patch | 123 ++++++++
 12 files changed, 687 insertions(+), 34 deletions(-)
 create mode 100644 projects/RPi/patches/kodi/0006-DVDVideoCodecDRMPRIME-Clear-m_pFilterGraph.patch
 create mode 100644 projects/RPi/patches/kodi/0007-DVDVideoCodecDRMPRIME-Move-FilterTest-from-open-to-f.patch
 create mode 100644 projects/RPi/patches/kodi/0008-DVDVideoCodecDRMPRIME-Rework-filtering-code-to-handl.patch
 create mode 100644 projects/RPi/patches/kodi/0009-VideoPlayer-ffmpeg-change-default-software-deinterla.patch
 create mode 100644 projects/RPi/patches/kodi/0010-CDVDVideoCodecDRMPRIME-Support-decoding-to-DRMPRIME-.patch
 create mode 100644 projects/RPi/patches/kodi/0011-DVDVideoCodecDRMPRIME-Request-v4l2-buffers-be-alloca.patch
 create mode 100644 projects/RPi/patches/kodi/0012-DVDVideoCodecDRMPRIME-Add-setting-to-enable-hw-deint.patch

diff --git a/projects/RPi/patches/kodi/0001-gbm-Set-max-bpc-for-high-bit-depth-videos.patch b/projects/RPi/patches/kodi/0001-gbm-Set-max-bpc-for-high-bit-depth-videos.patch
index 4e0946e41c..5861fab6c9 100644
--- a/projects/RPi/patches/kodi/0001-gbm-Set-max-bpc-for-high-bit-depth-videos.patch
+++ b/projects/RPi/patches/kodi/0001-gbm-Set-max-bpc-for-high-bit-depth-videos.patch
@@ -1,7 +1,7 @@
-From e181af5b2b97b3fbc69b9ad7318a3c02f6186ca5 Mon Sep 17 00:00:00 2001
+From 89a7f05ee85fca27f1140a035fec804d84959dbe Mon Sep 17 00:00:00 2001
 From: Dom Cobley <popcornmix@gmail.com>
 Date: Fri, 3 Dec 2021 16:00:50 +0000
-Subject: [PATCH 1/5] gbm: Set max bpc for high bit depth videos
+Subject: [PATCH 01/12] gbm: Set max bpc for high bit depth videos
 
 ---
  .../HwDecRender/VideoLayerBridgeDRMPRIME.cpp      | 15 +++++++++++++++
diff --git a/projects/RPi/patches/kodi/0002-CDVDVideoCodecDRMPRIME-Also-support-YUV420-buffers.patch b/projects/RPi/patches/kodi/0002-CDVDVideoCodecDRMPRIME-Also-support-YUV420-buffers.patch
index 991028312f..0bf0799852 100644
--- a/projects/RPi/patches/kodi/0002-CDVDVideoCodecDRMPRIME-Also-support-YUV420-buffers.patch
+++ b/projects/RPi/patches/kodi/0002-CDVDVideoCodecDRMPRIME-Also-support-YUV420-buffers.patch
@@ -1,7 +1,7 @@
-From 05fbbc78734827304edd3eb10de0a0117d10a8b9 Mon Sep 17 00:00:00 2001
+From 7d18280622c8ac12dbf1f6d4d5ca9589e1a61b02 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 11 Sep 2021 14:03:05 +0100
-Subject: [PATCH 2/5] CDVDVideoCodecDRMPRIME: Also support YUV420 buffers
+Subject: [PATCH 02/12] CDVDVideoCodecDRMPRIME: Also support YUV420 buffers
 
 CDVDVideoCodecDRMPRIME: Add support for deinterlace of sw decoded buffers
 
@@ -11,10 +11,10 @@ Need to call SetDimensions earlier and store the drm descriptor in expected plac
  1 file changed, 12 insertions(+), 2 deletions(-)
 
 diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
-index b221cdaf75..c0efa91770 100644
+index f5e26b203c..90f1fb07a9 100644
 --- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
 +++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
-@@ -619,7 +619,7 @@ bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
+@@ -622,7 +622,7 @@ bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
      pVideoPicture->videoBuffer = nullptr;
    }
  
@@ -23,7 +23,7 @@ index b221cdaf75..c0efa91770 100644
    {
      CVideoBufferDRMPRIMEFFmpeg* buffer =
          dynamic_cast<CVideoBufferDRMPRIMEFFmpeg*>(m_videoBufferPool->Get());
-@@ -697,7 +697,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
+@@ -700,7 +700,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
  
    const AVFilter* srcFilter = avfilter_get_by_name("buffer");
    const AVFilter* outFilter = avfilter_get_by_name("buffersink");
@@ -32,7 +32,7 @@ index b221cdaf75..c0efa91770 100644
  
    std::string args = StringUtils::Format("video_size={}x{}:pix_fmt={}:time_base={}/{}:"
                                           "pixel_aspect={}/{}",
-@@ -845,6 +845,16 @@ void CDVDVideoCodecDRMPRIME::FilterClose()
+@@ -848,6 +848,16 @@ void CDVDVideoCodecDRMPRIME::FilterClose()
  
  CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
  {
diff --git a/projects/RPi/patches/kodi/0003-CDVDVideoCodecDRMPRIME-Adjust-av-formats-to-match-re.patch b/projects/RPi/patches/kodi/0003-CDVDVideoCodecDRMPRIME-Adjust-av-formats-to-match-re.patch
index ad0b7598a0..f5ed2b0231 100644
--- a/projects/RPi/patches/kodi/0003-CDVDVideoCodecDRMPRIME-Adjust-av-formats-to-match-re.patch
+++ b/projects/RPi/patches/kodi/0003-CDVDVideoCodecDRMPRIME-Adjust-av-formats-to-match-re.patch
@@ -1,15 +1,15 @@
-From 8fbcf5fada25720b5c6f66959d5ee1c28cff04f9 Mon Sep 17 00:00:00 2001
+From e36845fd7e48b364f68a43bd8c66e06a570a6f4c Mon Sep 17 00:00:00 2001
 From: Dom Cobley <popcornmix@gmail.com>
 Date: Wed, 18 Jan 2023 16:41:00 +0000
-Subject: [PATCH 3/5] CDVDVideoCodecDRMPRIME: Adjust av formats to match recent
- ffmpeg changes
+Subject: [PATCH 03/12] CDVDVideoCodecDRMPRIME: Adjust av formats to match
+ recent ffmpeg changes
 
 ---
  .../VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp   | 6 ++++--
  1 file changed, 4 insertions(+), 2 deletions(-)
 
 diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
-index c0efa91770..29d38a3ec0 100644
+index 90f1fb07a9..169e8544de 100644
 --- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
 +++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
 @@ -355,6 +355,7 @@ bool CDVDVideoCodecDRMPRIME::Open(CDVDStreamInfo& hints, CDVDCodecOptions& optio
@@ -19,8 +19,8 @@ index c0efa91770..29d38a3ec0 100644
 +  m_pCodecContext->thread_safe_callbacks = 1;
    m_pCodecContext->thread_count = CServiceBroker::GetCPUInfo()->GetCPUCount();
  
-   if (hints.extradata && hints.extrasize > 0)
-@@ -697,13 +698,13 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
+   if (hints.extradata)
+@@ -700,13 +701,13 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
  
    const AVFilter* srcFilter = avfilter_get_by_name("buffer");
    const AVFilter* outFilter = avfilter_get_by_name("buffersink");
@@ -36,7 +36,7 @@ index c0efa91770..29d38a3ec0 100644
                                           m_pCodecContext->time_base.num ?
                                             m_pCodecContext->time_base.num : 1,
                                           m_pCodecContext->time_base.num ?
-@@ -855,6 +856,7 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
+@@ -858,6 +859,7 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
      m_pFrame->data[0] = reinterpret_cast<uint8_t*>(descriptor);
    }
  
diff --git a/projects/RPi/patches/kodi/0004-DVDVideoCodecDRMPRIME-Add-support-for-arbitrary-outp.patch b/projects/RPi/patches/kodi/0004-DVDVideoCodecDRMPRIME-Add-support-for-arbitrary-outp.patch
index 7e4ec85ce4..9a03fdee20 100644
--- a/projects/RPi/patches/kodi/0004-DVDVideoCodecDRMPRIME-Add-support-for-arbitrary-outp.patch
+++ b/projects/RPi/patches/kodi/0004-DVDVideoCodecDRMPRIME-Add-support-for-arbitrary-outp.patch
@@ -1,7 +1,7 @@
-From 56117d2874dcc36ac779609c63f1a8b0bace5366 Mon Sep 17 00:00:00 2001
+From 092ae2d56a5b8ed1558e82c2beae6e4223df57ff Mon Sep 17 00:00:00 2001
 From: Dom Cobley <popcornmix@gmail.com>
 Date: Mon, 6 Feb 2023 15:19:51 +0000
-Subject: [PATCH 4/5] DVDVideoCodecDRMPRIME: Add support for arbitrary output
+Subject: [PATCH 04/12] DVDVideoCodecDRMPRIME: Add support for arbitrary output
  pixel formats
 
 This enables any ffmpeg pixel formats to be supported by DRMPRIME decoder
@@ -20,7 +20,7 @@ And it happens automatically without requiring user video settings
  2 files changed, 77 insertions(+), 50 deletions(-)
 
 diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
-index 29d38a3ec0..611876ba8d 100644
+index 169e8544de..28bd0a9bc7 100644
 --- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
 +++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
 @@ -219,7 +219,7 @@ enum AVPixelFormat CDVDVideoCodecDRMPRIME::GetFormat(struct AVCodecContext* avct
@@ -67,7 +67,7 @@ index 29d38a3ec0..611876ba8d 100644
  
      buffer->Export(frame, width, height);
      buffer->SyncStart();
-@@ -628,9 +628,9 @@ bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
+@@ -631,9 +631,9 @@ bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
      buffer->SetRef(m_pFrame);
      pVideoPicture->videoBuffer = buffer;
    }
@@ -79,7 +79,7 @@ index 29d38a3ec0..611876ba8d 100644
      buffer->SetPictureParams(*pVideoPicture);
      buffer->Acquire();
      buffer->SyncEnd();
-@@ -664,13 +664,13 @@ void CDVDVideoCodecDRMPRIME::FilterTest()
+@@ -667,13 +667,13 @@ void CDVDVideoCodecDRMPRIME::FilterTest()
  
      if (name.find("deinterlace") != std::string::npos)
      {
@@ -96,7 +96,7 @@ index 29d38a3ec0..611876ba8d 100644
          return;
        }
      }
-@@ -680,14 +680,31 @@ void CDVDVideoCodecDRMPRIME::FilterTest()
+@@ -683,14 +683,31 @@ void CDVDVideoCodecDRMPRIME::FilterTest()
              __FUNCTION__);
  }
  
@@ -130,7 +130,7 @@ index 29d38a3ec0..611876ba8d 100644
      return true;
  
    if (!(m_pFilterGraph = avfilter_graph_alloc()))
-@@ -698,13 +715,13 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
+@@ -701,13 +718,13 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
  
    const AVFilter* srcFilter = avfilter_get_by_name("buffer");
    const AVFilter* outFilter = avfilter_get_by_name("buffersink");
@@ -146,7 +146,7 @@ index 29d38a3ec0..611876ba8d 100644
                                           m_pCodecContext->time_base.num ?
                                             m_pCodecContext->time_base.num : 1,
                                           m_pCodecContext->time_base.num ?
-@@ -723,7 +740,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
+@@ -726,7 +743,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
      CLog::Log(LOGERROR,
                "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_create_filter: src: {} ({})",
                err, result);
@@ -154,7 +154,7 @@ index 29d38a3ec0..611876ba8d 100644
      return false;
    }
  
-@@ -731,7 +747,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
+@@ -734,7 +750,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
    if (!par)
    {
      CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - unable to alloc buffersrc");
@@ -162,7 +162,7 @@ index 29d38a3ec0..611876ba8d 100644
      return false;
    }
  
-@@ -747,7 +762,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
+@@ -750,7 +765,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
      CLog::Log(LOGERROR,
                "CDVDVideoCodecDRMPRIME::FilterOpen - av_buffersrc_parameters_set:  {} ({})",
                err, result);
@@ -170,7 +170,7 @@ index 29d38a3ec0..611876ba8d 100644
      return false;
    }
    av_freep(&par);
-@@ -761,7 +775,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
+@@ -764,7 +778,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
      CLog::Log(LOGERROR,
                "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_create_filter: out: {} ({})",
                err, result);
@@ -178,7 +178,7 @@ index 29d38a3ec0..611876ba8d 100644
      return false;
    }
  
-@@ -770,32 +783,46 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
+@@ -773,32 +786,46 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
    if (result < 0)
    {
      CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - failed settings pix formats");
@@ -243,7 +243,7 @@ index 29d38a3ec0..611876ba8d 100644
    }
  
    if ((result = avfilter_graph_config(m_pFilterGraph,  nullptr)) < 0)
-@@ -804,15 +831,11 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
+@@ -807,15 +834,11 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
      av_strerror(result, err, AV_ERROR_MAX_STRING_SIZE);
      CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_config:  {} ({})",
                err, result);
@@ -259,7 +259,7 @@ index 29d38a3ec0..611876ba8d 100644
  
    m_processInfo.SetVideoDeintMethod(filters);
  
-@@ -847,16 +870,16 @@ void CDVDVideoCodecDRMPRIME::FilterClose()
+@@ -850,16 +873,16 @@ void CDVDVideoCodecDRMPRIME::FilterClose()
  CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
  {
    // sw decoded buffers need cache flush and for descripter to be set
@@ -279,7 +279,7 @@ index 29d38a3ec0..611876ba8d 100644
    int ret = av_buffersrc_add_frame(m_pFilterIn, m_pFrame);
    if (ret < 0)
    {
-@@ -949,25 +972,28 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo
+@@ -960,25 +983,28 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo
      return VC_ERROR;
    }
  
diff --git a/projects/RPi/patches/kodi/0005-DVDVideoCodecDRMPRIME-Remove-obsolete-thread_safe_ca.patch b/projects/RPi/patches/kodi/0005-DVDVideoCodecDRMPRIME-Remove-obsolete-thread_safe_ca.patch
index 468e4c0bfb..64e5f3d123 100644
--- a/projects/RPi/patches/kodi/0005-DVDVideoCodecDRMPRIME-Remove-obsolete-thread_safe_ca.patch
+++ b/projects/RPi/patches/kodi/0005-DVDVideoCodecDRMPRIME-Remove-obsolete-thread_safe_ca.patch
@@ -1,7 +1,7 @@
-From 85c8218d79f042c6d16b1d1ff6479743f095994e Mon Sep 17 00:00:00 2001
+From 4a3cb2af8b0751807d212044ba424d07f2a7ba55 Mon Sep 17 00:00:00 2001
 From: Dom Cobley <popcornmix@gmail.com>
 Date: Fri, 14 Apr 2023 19:59:42 +0100
-Subject: [PATCH 5/5] DVDVideoCodecDRMPRIME: Remove obsolete
+Subject: [PATCH 05/12] DVDVideoCodecDRMPRIME: Remove obsolete
  thread_safe_callbacks
 
 ---
@@ -9,7 +9,7 @@ Subject: [PATCH 5/5] DVDVideoCodecDRMPRIME: Remove obsolete
  1 file changed, 1 deletion(-)
 
 diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
-index 611876ba8d..f7022e1854 100644
+index 28bd0a9bc7..670b5f22ce 100644
 --- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
 +++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
 @@ -355,7 +355,6 @@ bool CDVDVideoCodecDRMPRIME::Open(CDVDStreamInfo& hints, CDVDCodecOptions& optio
@@ -19,7 +19,7 @@ index 611876ba8d..f7022e1854 100644
 -  m_pCodecContext->thread_safe_callbacks = 1;
    m_pCodecContext->thread_count = CServiceBroker::GetCPUInfo()->GetCPUCount();
  
-   if (hints.extradata && hints.extrasize > 0)
+   if (hints.extradata)
 -- 
 2.39.2
 
diff --git a/projects/RPi/patches/kodi/0006-DVDVideoCodecDRMPRIME-Clear-m_pFilterGraph.patch b/projects/RPi/patches/kodi/0006-DVDVideoCodecDRMPRIME-Clear-m_pFilterGraph.patch
new file mode 100644
index 0000000000..3751c0d06e
--- /dev/null
+++ b/projects/RPi/patches/kodi/0006-DVDVideoCodecDRMPRIME-Clear-m_pFilterGraph.patch
@@ -0,0 +1,24 @@
+From 018e080fb3fea185df01d2659d59231aef787759 Mon Sep 17 00:00:00 2001
+From: Dom Cobley <popcornmix@gmail.com>
+Date: Wed, 31 May 2023 19:40:37 +0100
+Subject: [PATCH 06/12] DVDVideoCodecDRMPRIME: Clear m_pFilterGraph
+
+---
+ xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+index 670b5f22ce..8568f162ae 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+@@ -866,6 +866,7 @@ void CDVDVideoCodecDRMPRIME::FilterClose()
+     // Disposed by above code
+     m_pFilterIn = nullptr;
+     m_pFilterOut = nullptr;
++    m_pFilterGraph = nullptr;
+   }
+ }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/patches/kodi/0007-DVDVideoCodecDRMPRIME-Move-FilterTest-from-open-to-f.patch b/projects/RPi/patches/kodi/0007-DVDVideoCodecDRMPRIME-Move-FilterTest-from-open-to-f.patch
new file mode 100644
index 0000000000..32af8bc1bb
--- /dev/null
+++ b/projects/RPi/patches/kodi/0007-DVDVideoCodecDRMPRIME-Move-FilterTest-from-open-to-f.patch
@@ -0,0 +1,70 @@
+From b62d5e56d76ce179e3a1169566aa2146da48b147 Mon Sep 17 00:00:00 2001
+From: Dom Cobley <popcornmix@gmail.com>
+Date: Fri, 2 Jun 2023 11:34:22 +0100
+Subject: [PATCH 07/12] DVDVideoCodecDRMPRIME: Move FilterTest from open to
+ first frame returned
+
+The pixel format is not accurate until the first frame is returned
+and it may (later) influence the choice of deinterlacers available.
+---
+ .../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 24 ++++++++++++-------
+ .../DVDCodecs/Video/DVDVideoCodecDRMPRIME.h   |  1 +
+ 2 files changed, 16 insertions(+), 9 deletions(-)
+
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+index 8568f162ae..f515c5d5f1 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+@@ -387,15 +387,7 @@ bool CDVDVideoCodecDRMPRIME::Open(CDVDStreamInfo& hints, CDVDCodecOptions& optio
+   m_processInfo.SetVideoDAR(hints.aspect);
+   m_processInfo.SetVideoDeintMethod("none");
+ 
+-  FilterTest();
+-
+-  if (!m_deintFilterName.empty())
+-  {
+-    std::list<EINTERLACEMETHOD> methods;
+-    methods.push_back(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE);
+-    m_processInfo.UpdateDeinterlacingMethods(methods);
+-    m_processInfo.SetDeinterlacingMethodDefault(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE);
+-  }
++  m_checkedDeinterlace = false;
+ 
+   return true;
+ }
+@@ -983,6 +975,20 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo
+     return VC_ERROR;
+   }
+ 
++  if (!m_checkedDeinterlace)
++  {
++    FilterTest();
++
++    if (!m_deintFilterName.empty())
++    {
++      std::list<EINTERLACEMETHOD> methods;
++      methods.push_back(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE);
++      m_processInfo.UpdateDeinterlacingMethods(methods);
++      m_processInfo.SetDeinterlacingMethodDefault(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE);
++    }
++    m_checkedDeinterlace = true;
++  }
++
+   // we need to scale if the buffer isn't in DRM_PRIME format
+   bool need_scale = !IsSupportedSwFormat(static_cast<AVPixelFormat>(m_pFrame->format)) && !IsSupportedHwFormat(static_cast<AVPixelFormat>(m_pFrame->format));
+ 
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
+index bb88fde1f9..df17f89b96 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
+@@ -56,6 +56,7 @@ protected:
+   int m_codecControlFlags = 0;
+   CDVDStreamInfo m_hints;
+   double m_DAR = 1.0;
++  bool m_checkedDeinterlace = false;
+   AVCodecContext* m_pCodecContext = nullptr;
+   AVFrame* m_pFrame = nullptr;
+   AVFrame* m_pFilterFrame = nullptr;
+-- 
+2.39.2
+
diff --git a/projects/RPi/patches/kodi/0008-DVDVideoCodecDRMPRIME-Rework-filtering-code-to-handl.patch b/projects/RPi/patches/kodi/0008-DVDVideoCodecDRMPRIME-Rework-filtering-code-to-handl.patch
new file mode 100644
index 0000000000..a62af7f15d
--- /dev/null
+++ b/projects/RPi/patches/kodi/0008-DVDVideoCodecDRMPRIME-Rework-filtering-code-to-handl.patch
@@ -0,0 +1,277 @@
+From b359d89684418cc3a6f894434d212611c7c12cd5 Mon Sep 17 00:00:00 2001
+From: Dom Cobley <popcornmix@gmail.com>
+Date: Wed, 31 May 2023 14:19:20 +0100
+Subject: [PATCH 08/12] DVDVideoCodecDRMPRIME: Rework filtering code to handle
+ sw deinterlace
+
+---
+ .../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 134 +++++++++---------
+ .../DVDCodecs/Video/DVDVideoCodecDRMPRIME.h   |   4 +-
+ 2 files changed, 68 insertions(+), 70 deletions(-)
+
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+index f515c5d5f1..b614312a77 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+@@ -207,11 +207,7 @@ static const AVCodec* FindDecoder(CDVDStreamInfo& hints)
+         return codec;
+     }
+ 
+-  codec = avcodec_find_decoder(hints.codec);
+-  if (codec && (codec->capabilities & AV_CODEC_CAP_DR1) == AV_CODEC_CAP_DR1)
+-    return codec;
+-
+-  return nullptr;
++  return avcodec_find_decoder(hints.codec);
+ }
+ 
+ enum AVPixelFormat CDVDVideoCodecDRMPRIME::GetFormat(struct AVCodecContext* avctx,
+@@ -645,27 +641,33 @@ bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
+   return true;
+ }
+ 
+-void CDVDVideoCodecDRMPRIME::FilterTest()
++void CDVDVideoCodecDRMPRIME::FilterTest(AVPixelFormat pix_fmt)
+ {
+-  const AVFilter* filter;
+-  void* opaque{};
+-
+   m_deintFilterName.clear();
+ 
+-  while ((filter = av_filter_iterate(&opaque)) != nullptr)
++  // look twice, first for DRM_PRIME support, then for actual pixel format
++  for (int i=0; i < 2; i++)
+   {
+-    std::string name(filter->name);
++    const AVFilter* filter;
++    void* opaque{};
+ 
+-    if (name.find("deinterlace") != std::string::npos)
++    while ((filter = av_filter_iterate(&opaque)) != nullptr)
+     {
+-      bool ret = FilterOpen(name, false, true);
+-      FilterClose();
+-      if (ret)
++      std::string name(filter->name);
++
++      if (name.find(i == 0 ? "deinterlace" : "bwdif") != std::string::npos)
+       {
+-        m_deintFilterName = name;
+-        CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::{} - found deinterlacing filter {}",
+-                  __FUNCTION__, name);
+-        return;
++        bool ret = FilterOpen(name, pix_fmt, true);
++        FilterClose();
++        if (ret)
++        {
++          m_deintFilterName = name;
++          if (name == "bwdif" || name == "yadif")
++            m_deintFilterName += "=1:-1:1";
++          CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::{} - found deinterlacing filter {}",
++                    __FUNCTION__, name);
++          return;
++        }
+       }
+     }
+   }
+@@ -691,14 +693,17 @@ AVFrame *CDVDVideoCodecDRMPRIME::alloc_filter_frame(AVFilterContext * ctx, void
+   return frame;
+ }
+ 
+-bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale, bool test)
++bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, AVPixelFormat pix_fmt, bool test)
+ {
+   int result;
+ 
++  if (filters.find("deinterlace") != std::string::npos && pix_fmt == AV_PIX_FMT_YUV420P)
++     pix_fmt = AV_PIX_FMT_DRM_PRIME;
++
+   if (m_pFilterGraph)
+     FilterClose();
+ 
+-  if (filters.empty() && !scale)
++  if (filters.empty())
+     return true;
+ 
+   if (!(m_pFilterGraph = avfilter_graph_alloc()))
+@@ -709,13 +714,12 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale,
+ 
+   const AVFilter* srcFilter = avfilter_get_by_name("buffer");
+   const AVFilter* outFilter = avfilter_get_by_name("buffersink");
+-  enum AVPixelFormat pix_fmts[] = { scale ? AV_PIX_FMT_YUV420P : AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_NONE };
+ 
+   std::string args = StringUtils::Format("video_size={}x{}:pix_fmt={}:time_base={}/{}:"
+                                          "pixel_aspect={}/{}",
+                                          m_pCodecContext->width,
+                                          m_pCodecContext->height,
+-                                         scale ? m_pCodecContext->pix_fmt : AV_PIX_FMT_DRM_PRIME,
++                                         pix_fmt,
+                                          m_pCodecContext->time_base.num ?
+                                            m_pCodecContext->time_base.num : 1,
+                                          m_pCodecContext->time_base.num ?
+@@ -772,6 +776,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale,
+     return false;
+   }
+ 
++  enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE };
+   result = av_opt_set_int_list(m_pFilterOut, "pix_fmts", &pix_fmts[0],
+                                AV_PIX_FMT_NONE, AV_OPT_SEARCH_CHILDREN);
+   if (result < 0)
+@@ -780,43 +785,32 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale,
+     return false;
+   }
+ 
+-  if (!filters.empty())
++  if ((result = av_buffersink_set_alloc_video_frame(m_pFilterOut, alloc_filter_frame, static_cast<void*>(this))) < 0)
+   {
+-    AVFilterInOut* outputs = avfilter_inout_alloc();
+-    AVFilterInOut* inputs  = avfilter_inout_alloc();
++    CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - av_buffersink_set_alloc_video_frame = {}", result);
++    return result;
++  }
++  AVFilterInOut* outputs = avfilter_inout_alloc();
++  AVFilterInOut* inputs  = avfilter_inout_alloc();
+ 
+-    outputs->name = av_strdup("in");
+-    outputs->filter_ctx = m_pFilterIn;
+-    outputs->pad_idx = 0;
+-    outputs->next = nullptr;
++  outputs->name = av_strdup("in");
++  outputs->filter_ctx = m_pFilterIn;
++  outputs->pad_idx = 0;
++  outputs->next = nullptr;
+ 
+-    inputs->name = av_strdup("out");
+-    inputs->filter_ctx = m_pFilterOut;
+-    inputs->pad_idx = 0;
+-    inputs->next = nullptr;
++  inputs->name = av_strdup("out");
++  inputs->filter_ctx = m_pFilterOut;
++  inputs->pad_idx = 0;
++  inputs->next = nullptr;
+ 
+-    result = avfilter_graph_parse_ptr(m_pFilterGraph, filters.c_str(), &inputs, &outputs, NULL);
+-    avfilter_inout_free(&outputs);
+-    avfilter_inout_free(&inputs);
++  result = avfilter_graph_parse_ptr(m_pFilterGraph, filters.c_str(), &inputs, &outputs, NULL);
++  avfilter_inout_free(&outputs);
++  avfilter_inout_free(&inputs);
+ 
+-    if (result < 0)
+-    {
+-      CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_parse");
+-      return false;
+-    }
+-  }
+-  else
++  if (result < 0)
+   {
+-    if ((result = av_buffersink_set_alloc_video_frame(m_pFilterOut, alloc_filter_frame, static_cast<void*>(this))) < 0)
+-    {
+-      CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - av_buffersink_set_alloc_video_frame = {}", result);
+-      return result;
+-    }
+-    if ((result = avfilter_link(m_pFilterIn, 0, m_pFilterOut, 0)) < 0)
+-    {
+-      CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_link");
+-      return false;
+-    }
++    CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_parse");
++    return false;
+   }
+ 
+   if ((result = avfilter_graph_config(m_pFilterGraph,  nullptr)) < 0)
+@@ -831,8 +825,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale,
+   if (test)
+     return true;
+ 
+-  m_processInfo.SetVideoDeintMethod(filters);
+-
+   if (CServiceBroker::GetLogging().CanLogComponent(LOGVIDEO))
+   {
+     char* graphDump = avfilter_graph_dump(m_pFilterGraph, nullptr);
+@@ -864,8 +856,8 @@ void CDVDVideoCodecDRMPRIME::FilterClose()
+ 
+ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
+ {
+-  // sw decoded buffers need cache flush and for descripter to be set
+-  if (!IsSupportedHwFormat(static_cast<AVPixelFormat>(m_pFrame->format)) && IsSupportedSwFormat(static_cast<AVPixelFormat>(m_pFrame->format)))
++  // sw decoded buffers submitted to hw decoder need cache flush and for descripter to be set
++  if (m_pFrame->format != AV_PIX_FMT_DRM_PRIME && m_pFilterGraph && m_pFilterIn->outputs[0]->format == AV_PIX_FMT_DRM_PRIME)
+   {
+     CVideoBufferDMA* buffer = static_cast<CVideoBufferDMA*>(av_buffer_get_opaque(m_pFrame->buf[0]));
+     buffer->SetDimensions(m_pFrame->width, m_pFrame->height);
+@@ -975,9 +967,10 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo
+     return VC_ERROR;
+   }
+ 
++  AVPixelFormat pix_fmt = static_cast<AVPixelFormat>(m_pFrame->format);
+   if (!m_checkedDeinterlace)
+   {
+-    FilterTest();
++    FilterTest(pix_fmt);
+ 
+     if (!m_deintFilterName.empty())
+     {
+@@ -989,28 +982,33 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo
+     m_checkedDeinterlace = true;
+   }
+ 
+-  // we need to scale if the buffer isn't in DRM_PRIME format
+-  bool need_scale = !IsSupportedSwFormat(static_cast<AVPixelFormat>(m_pFrame->format)) && !IsSupportedHwFormat(static_cast<AVPixelFormat>(m_pFrame->format));
+-
+   if (!m_processInfo.GetVideoInterlaced() && m_pFrame->interlaced_frame)
+     m_processInfo.SetVideoInterlaced(true);
+ 
+   std::string filterChain = GetFilterChain(m_pFrame->interlaced_frame);
+-  if (!filterChain.empty() || need_scale)
++
++  // we need to scale if the buffer isn't in DRM_PRIME format
++  if (!IsSupportedSwFormat(pix_fmt) && !IsSupportedHwFormat(pix_fmt))
++    filterChain = "scale";
++  // we need to copy if the buffer wasn't allocated by us
++  else if (!IsSupportedHwFormat(pix_fmt) && !(m_pCodecContext->codec->capabilities & AV_CODEC_CAP_DR1))
++    filterChain = "copy";
++
++  if (!filterChain.empty())
+   {
+-    bool reopenFilter = false;
+-    if (m_filters != filterChain)
+-      reopenFilter = true;
++    bool reopenFilter = m_filters != filterChain;
+ 
+     if (m_pFilterGraph &&
+         (m_pFilterIn->outputs[0]->w != m_pFrame->width ||
+          m_pFilterIn->outputs[0]->h != m_pFrame->height))
+       reopenFilter = true;
+ 
+-    if (reopenFilter || (need_scale && m_pFilterGraph == nullptr))
++    if (reopenFilter)
+     {
+       m_filters = filterChain;
+-      if (!FilterOpen(filterChain, need_scale, false))
++      m_processInfo.SetVideoDeintMethod(m_filters);
++
++      if (!FilterOpen(filterChain, pix_fmt, false))
+         FilterClose();
+     }
+ 
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
+index df17f89b96..55675c3c2e 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
+@@ -45,9 +45,9 @@ protected:
+   static enum AVPixelFormat GetFormat(struct AVCodecContext* avctx, const enum AVPixelFormat* fmt);
+   static int GetBuffer(struct AVCodecContext* avctx, AVFrame* frame, int flags);
+   static AVFrame *alloc_filter_frame(AVFilterContext * ctx, void * v, int w, int h);
+-  bool FilterOpen(const std::string& filters, bool scale, bool test);
++  bool FilterOpen(const std::string& filters, AVPixelFormat pix_fmt, bool test);
+   void FilterClose();
+-  void FilterTest();
++  void FilterTest(AVPixelFormat pix_fmt);
+   std::string GetFilterChain(bool interlaced);
+ 
+   std::string m_name;
+-- 
+2.39.2
+
diff --git a/projects/RPi/patches/kodi/0009-VideoPlayer-ffmpeg-change-default-software-deinterla.patch b/projects/RPi/patches/kodi/0009-VideoPlayer-ffmpeg-change-default-software-deinterla.patch
new file mode 100644
index 0000000000..141aa2c5cd
--- /dev/null
+++ b/projects/RPi/patches/kodi/0009-VideoPlayer-ffmpeg-change-default-software-deinterla.patch
@@ -0,0 +1,74 @@
+From c9a70db5879a6ac37b5840621aa102812104087f Mon Sep 17 00:00:00 2001
+From: Alan Swanson <reiver@improbability.net>
+Date: Thu, 18 May 2023 16:12:43 +0100
+Subject: [PATCH 09/12] VideoPlayer: ffmpeg change default software
+ deinterlacer from yadif to bwdif
+
+---
+ .../DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp            | 10 +++++-----
+ xbmc/cores/VideoPlayer/DVDCodecs/Video/VAAPI.cpp       |  4 ++--
+ 2 files changed, 7 insertions(+), 7 deletions(-)
+
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
+index 032ee16454..5d1b7162f9 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
+@@ -55,7 +55,7 @@ enum DecoderState
+ 
+ enum EFilterFlags {
+   FILTER_NONE                =  0x0,
+-  FILTER_DEINTERLACE_YADIF   =  0x1,  //< use first deinterlace mode
++  FILTER_DEINTERLACE_BWDIF   =  0x1,  //< use first deinterlace mode
+   FILTER_DEINTERLACE_ANY     =  0xf,  //< use any deinterlace mode
+   FILTER_DEINTERLACE_FLAGGED = 0x10,  //< only deinterlace flagged frames
+   FILTER_DEINTERLACE_HALFED  = 0x20,  //< do half rate deinterlacing
+@@ -526,12 +526,12 @@ void CDVDVideoCodecFFmpeg::SetFilters()
+       }
+   }
+ 
+-  if (filters & FILTER_DEINTERLACE_YADIF)
++  if (filters & FILTER_DEINTERLACE_BWDIF)
+   {
+     if (filters & FILTER_DEINTERLACE_HALFED)
+-      m_filters_next = "yadif=0:-1";
++      m_filters_next = "bwdif=0:-1";
+     else
+-      m_filters_next = "yadif=1:-1";
++      m_filters_next = "bwdif=1:-1";
+ 
+     if (filters & FILTER_DEINTERLACE_FLAGGED)
+       m_filters_next += ":1";
+@@ -1226,7 +1226,7 @@ int CDVDVideoCodecFFmpeg::FilterOpen(const std::string& filters, bool scale)
+       return result;
+     }
+ 
+-    if (filters.compare(0,5,"yadif") == 0)
++    if (filters.compare(0,5,"bwdif") == 0)
+     {
+       m_processInfo.SetVideoDeintMethod(filters);
+     }
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/VAAPI.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/VAAPI.cpp
+index 24edd058e9..f9b6f17824 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/VAAPI.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/VAAPI.cpp
+@@ -3007,7 +3007,7 @@ bool CFFmpegPostproc::Init(EINTERLACEMETHOD method)
+   {
+     std::string filter;
+ 
+-    filter = "yadif=1:-1";
++    filter = "bwdif=1:-1";
+ 
+     if (avfilter_graph_parse_ptr(m_pFilterGraph, filter.c_str(), &inputs, &outputs, NULL) < 0)
+     {
+@@ -3026,7 +3026,7 @@ bool CFFmpegPostproc::Init(EINTERLACEMETHOD method)
+       return false;
+     }
+ 
+-    m_config.processInfo->SetVideoDeintMethod("yadif");
++    m_config.processInfo->SetVideoDeintMethod("bwdif");
+   }
+   else if (method == VS_INTERLACEMETHOD_RENDER_BOB ||
+            method == VS_INTERLACEMETHOD_NONE)
+-- 
+2.39.2
+
diff --git a/projects/RPi/patches/kodi/0010-CDVDVideoCodecDRMPRIME-Support-decoding-to-DRMPRIME-.patch b/projects/RPi/patches/kodi/0010-CDVDVideoCodecDRMPRIME-Support-decoding-to-DRMPRIME-.patch
new file mode 100644
index 0000000000..97f403f138
--- /dev/null
+++ b/projects/RPi/patches/kodi/0010-CDVDVideoCodecDRMPRIME-Support-decoding-to-DRMPRIME-.patch
@@ -0,0 +1,55 @@
+From 88d0dd1bb5be849f2066f92f55bd7d8c80eb7edf Mon Sep 17 00:00:00 2001
+From: Dom Cobley <popcornmix@gmail.com>
+Date: Tue, 20 Jun 2023 15:13:09 +0100
+Subject: [PATCH 10/12] CDVDVideoCodecDRMPRIME: Support decoding to DRMPRIME
+ with sw deinterlace
+
+We can map a YUV style DRM_PRIME buffer back to AV_PIX_FMT_YUV420P
+to allow subsequent sw deinterlace
+---
+ .../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 22 +++++++++++++++++++
+ 1 file changed, 22 insertions(+)
+
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+index b614312a77..023334f5db 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+@@ -700,6 +700,9 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, AVPixelForma
+   if (filters.find("deinterlace") != std::string::npos && pix_fmt == AV_PIX_FMT_YUV420P)
+      pix_fmt = AV_PIX_FMT_DRM_PRIME;
+ 
++  if (filters.find("bwdif") != std::string::npos && pix_fmt == AV_PIX_FMT_DRM_PRIME)
++     pix_fmt = AV_PIX_FMT_YUV420P;
++
+   if (m_pFilterGraph)
+     FilterClose();
+ 
+@@ -866,6 +869,25 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
+     m_pFrame->data[0] = reinterpret_cast<uint8_t*>(descriptor);
+     m_pFrame->format = AV_PIX_FMT_DRM_PRIME;
+   }
+  // hw decoded buffers submitted to a sw filter need mapping of planes for cpu to access
++  else if (m_pFrame->format == AV_PIX_FMT_DRM_PRIME && m_pFilterGraph && m_pFilterIn->outputs[0]->format == AV_PIX_FMT_YUV420P)
++  {
++    AVFrame *frame = av_frame_alloc();
++    frame->width = m_pFrame->width;
++    frame->height = m_pFrame->height;
++    frame->format = AV_PIX_FMT_YUV420P;
++    int ret = av_hwframe_map(frame, m_pFrame, (int)AV_HWFRAME_MAP_READ);
++    if (ret < 0)
++    {
++      char err[AV_ERROR_MAX_STRING_SIZE] = {};
++      av_strerror(ret, err, AV_ERROR_MAX_STRING_SIZE);
++      CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::{} - av_hwframe_map failed: {} ({})",
++                __FUNCTION__, err, ret);
++      return VC_ERROR;
++    }
++    av_frame_unref(m_pFrame);
++    av_frame_move_ref(m_pFrame, frame);
++  }
+ 
+   int ret = av_buffersrc_add_frame(m_pFilterIn, m_pFrame);
+   if (ret < 0)
+-- 
+2.39.2
+
diff --git a/projects/RPi/patches/kodi/0011-DVDVideoCodecDRMPRIME-Request-v4l2-buffers-be-alloca.patch b/projects/RPi/patches/kodi/0011-DVDVideoCodecDRMPRIME-Request-v4l2-buffers-be-alloca.patch
new file mode 100644
index 0000000000..c75d4c73dc
--- /dev/null
+++ b/projects/RPi/patches/kodi/0011-DVDVideoCodecDRMPRIME-Request-v4l2-buffers-be-alloca.patch
@@ -0,0 +1,30 @@
+From c2ced5695054a42fe4ba8520669d7c69e583e2a1 Mon Sep 17 00:00:00 2001
+From: Dom Cobley <popcornmix@gmail.com>
+Date: Tue, 20 Jun 2023 15:14:02 +0100
+Subject: [PATCH 11/12] DVDVideoCodecDRMPRIME: Request v4l2 buffers be
+ allocated through cache
+
+This is an optional request, but will improve performance of sw deinterlace
+if supported.
+---
+ .../VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp     | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+index 023334f5db..0182f30a61 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+@@ -367,6 +367,10 @@ bool CDVDVideoCodecDRMPRIME::Open(CDVDStreamInfo& hints, CDVDCodecOptions& optio
+   for (auto&& option : options.m_keys)
+     av_opt_set(m_pCodecContext, option.m_name.c_str(), option.m_value.c_str(), 0);
+ 
++  // this requests v4l2 buffers are allocated through cache. It will work if this is not supported,
++  // but subsequent operations like deinterlace may be less efficient
++  av_opt_set(m_pCodecContext->priv_data, "dmabuf_alloc", "cma", 0);
++
+   if (avcodec_open2(m_pCodecContext, pCodec, nullptr) < 0)
+   {
+     CLog::Log(LOGINFO, "CDVDVideoCodecDRMPRIME::{} - unable to open codec", __FUNCTION__);
+-- 
+2.39.2
+
diff --git a/projects/RPi/patches/kodi/0012-DVDVideoCodecDRMPRIME-Add-setting-to-enable-hw-deint.patch b/projects/RPi/patches/kodi/0012-DVDVideoCodecDRMPRIME-Add-setting-to-enable-hw-deint.patch
new file mode 100644
index 0000000000..c3534352b4
--- /dev/null
+++ b/projects/RPi/patches/kodi/0012-DVDVideoCodecDRMPRIME-Add-setting-to-enable-hw-deint.patch
@@ -0,0 +1,123 @@
+From 4eded8af13fe44c12ed2c26e40abfe9e9d08281f Mon Sep 17 00:00:00 2001
+From: Dom Cobley <popcornmix@gmail.com>
+Date: Wed, 21 Jun 2023 13:16:01 +0100
+Subject: [PATCH 12/12] DVDVideoCodecDRMPRIME: Add setting to enable hw
+ deinterlace
+
+HW deinterlace has lower cpu usage, but may have lower quality,
+so allow user to choose appropriate setting.
+---
+ .../resource.language.en_gb/resources/strings.po | 11 +++++++++++
+ system/settings/linux.xml                        | 12 ++++++++++++
+ .../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp    | 16 +++++++++++++++-
+ xbmc/settings/Settings.h                         |  1 +
+ 4 files changed, 39 insertions(+), 1 deletion(-)
+
+diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
+index 062d3afd2b..8c6e31965a 100644
+--- a/addons/resource.language.en_gb/resources/strings.po
++++ b/addons/resource.language.en_gb/resources/strings.po
+@@ -7311,6 +7311,11 @@ msgctxt "#13438"
+ msgid "Allow hardware acceleration with DRM PRIME"
+ msgstr ""
+ 
++#: system/settings/settings.xml
++msgctxt "#13500"
++msgid "Allow hardware deinterlace with DRM PRIME"
++msgstr ""
++
+ #: system/settings/settings.xml
+ msgctxt "#13439"
+ msgid "Allow hardware acceleration - MediaCodec"
+@@ -19424,6 +19429,12 @@ msgctxt "#36172"
+ msgid "Enable PRIME decoding of video files"
+ msgstr ""
+ 
+#. Description of setting with label #13500 "Allow hardware deinterlace with DRM PRIME"
++#: system/settings/settings.xml
++msgctxt "#36290"
++msgid "Enable PRIME hardware deinterlace of video files"
++msgstr ""
++
+ #. Description of setting with label #14109 "Short date format"
+ #: system/settings/settings.xml
+ msgctxt "#36173"
+diff --git a/system/settings/linux.xml b/system/settings/linux.xml
+index 531974f3f4..c2df62c047 100644
+--- a/system/settings/linux.xml
++++ b/system/settings/linux.xml
+@@ -180,6 +180,18 @@
+           <default>true</default>
+           <control type="toggle" />
+         </setting>
++        <setting id="videoplayer.primeallowhwdeinterlace" type="boolean" parent="videoplayer.useprimedecoder" label="13500" help="36290">
++          <requirement>HAS_GLES</requirement>
++          <visible>false</visible>
++          <dependencies>
++            <dependency type="enable">
++              <condition setting="videoplayer.useprimedecoder" operator="is">true</condition>
++            </dependency>
++          </dependencies>
++          <level>3</level>
++          <default>true</default>
++          <control type="toggle" />
++        </setting>
+         <setting id="videoplayer.useprimerenderer" type="integer" label="13462" help="13463">
+           <requirement>HAS_GLES</requirement>
+           <visible>false</visible>
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+index 0182f30a61..cd3b4e89a2 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+@@ -41,6 +41,7 @@ namespace
+ {
+ 
+ constexpr const char* SETTING_VIDEOPLAYER_USEPRIMEDECODERFORHW{"videoplayer.useprimedecoderforhw"};
++constexpr const char* SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE{"videoplayer.primeallowhwdeinterlace"};
+ 
+ static void ReleaseBuffer(void* opaque, uint8_t* data)
+ {
+@@ -149,6 +150,15 @@ void CDVDVideoCodecDRMPRIME::Register()
+ 
+   setting->SetVisible(true);
+ 
++  setting = settings->GetSetting(SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE);
++  if (!setting)
++  {
++    CLog::Log(LOGERROR, "Failed to load setting for: {}", SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE);
++    return;
++  }
++
++  setting->SetVisible(true);
++
+   CDVDFactoryCodec::RegisterHWVideoCodec("drm_prime", CDVDVideoCodecDRMPRIME::Create);
+ }
+ 
+@@ -650,7 +660,11 @@ void CDVDVideoCodecDRMPRIME::FilterTest(AVPixelFormat pix_fmt)
+   m_deintFilterName.clear();
+ 
+   // look twice, first for DRM_PRIME support, then for actual pixel format
+-  for (int i=0; i < 2; i++)
++
++  bool hw = CServiceBroker::GetSettingsComponent()->GetSettings()->GetBool(
++      SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE);
++
++  for (int i = hw ? 0 : 1; i < 2; i++)
+   {
+     const AVFilter* filter;
+     void* opaque{};
+diff --git a/xbmc/settings/Settings.h b/xbmc/settings/Settings.h
+index a4f91e9f92..e9cb3dc2be 100644
+--- a/xbmc/settings/Settings.h
++++ b/xbmc/settings/Settings.h
+@@ -117,6 +117,7 @@ public:
+   static constexpr auto SETTING_VIDEOPLAYER_USEMEDIACODEC = "videoplayer.usemediacodec";
+   static constexpr auto SETTING_VIDEOPLAYER_USEMEDIACODECSURFACE =
+       "videoplayer.usemediacodecsurface";
++  static constexpr auto SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE = "videoplayer.primeallowhwdeinterlace";
+   static constexpr auto SETTING_VIDEOPLAYER_USEVDPAU = "videoplayer.usevdpau";
+   static constexpr auto SETTING_VIDEOPLAYER_USEVDPAUMIXER = "videoplayer.usevdpaumixer";
+   static constexpr auto SETTING_VIDEOPLAYER_USEVDPAUMPEG2 = "videoplayer.usevdpaumpeg2";
+-- 
+2.39.2
+

From 3ac1df1390a627fea6e7c7a88e2adfa5504edffc Mon Sep 17 00:00:00 2001
From: Matthias Reichl <hias@horus.com>
Date: Mon, 10 Jul 2023 20:54:33 +0200
Subject: [PATCH 2/2] ffmpeg: update rpi patch

Patch created using revisions ea3d24b..120058b
from branch dev/6.0/rpi_import_1 of https://github.com/jc-kynesim/rpi-ffmpeg
---
 .../ffmpeg/patches/rpi/ffmpeg-001-rpi.patch   | 3107 ++++++++++++++++-
 1 file changed, 2971 insertions(+), 136 deletions(-)

diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
index 27c1326476..72cacc605c 100644
--- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
+++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
@@ -1,7 +1,7 @@
 From 504df93cfe5416b394755e79b7b81ee0119cf09c Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 26 Apr 2021 12:34:50 +0100
-Subject: [PATCH 001/136] Add pi configs and scripts
+Subject: [PATCH 001/151] Add pi configs and scripts
 
 ---
  pi-util/BUILD.txt                  |  59 ++++++++
@@ -1682,7 +1682,7 @@ index 0000000000..5935a11ca5
 From f3eaadb27a5bc6db07d33ce0814d796e8cee623e Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 11:27:39 +0100
-Subject: [PATCH 002/136] Add sand pix fmts & conversion fns
+Subject: [PATCH 002/151] Add sand pix fmts & conversion fns
 
 ---
  configure                     |   3 +
@@ -3503,7 +3503,7 @@ index 0000000000..634b55e800
 From 89b8d6ac2a886749d4594656083753e682de05a7 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 11:36:47 +0100
-Subject: [PATCH 003/136] Add aarch64 asm sand conv functions
+Subject: [PATCH 003/151] Add aarch64 asm sand conv functions
 
 Many thanks to eiler.mike@gmail.com (Michael Eiler) for these
 optimizations
@@ -4310,7 +4310,7 @@ index ed0261b02f..1f543e9357 100644
 From 247025a42ae09d6c9c5d4128a5e4b288b7b3047c Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 11:56:02 +0100
-Subject: [PATCH 004/136] Add raw encoding for sand
+Subject: [PATCH 004/151] Add raw encoding for sand
 
 ---
  libavcodec/raw.c    |  6 +++
@@ -4459,7 +4459,7 @@ index 8c577006d9..594a77c42a 100644
 From ac6961f424b56563dc793b6bc002a8c04cb1bc36 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 12:02:09 +0100
-Subject: [PATCH 005/136] Deal with the lack of trivial sand cropping
+Subject: [PATCH 005/151] Deal with the lack of trivial sand cropping
 
 ---
  fftools/ffmpeg.c        |  4 ++--
@@ -4559,7 +4559,7 @@ index 2580269549..3a9d323325 100644
 From 9a08431f7790507b0374d9585dfc736000c1bd42 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 12:31:16 +0100
-Subject: [PATCH 006/136] Add an unsand filter
+Subject: [PATCH 006/151] Add an unsand filter
 
 ---
  configure                |   1 +
@@ -4857,7 +4857,7 @@ index 0000000000..7100f2fc9b
 From 6e61007b19544c573f1c2a4c6060d3d24b8d500e Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 12:37:07 +0100
-Subject: [PATCH 007/136] Reduce mmal compile warnings
+Subject: [PATCH 007/151] Reduce mmal compile warnings
 
 ---
  libavcodec/mmaldec.c | 4 ++++
@@ -4889,7 +4889,7 @@ index 3092f58510..6f41b41ac4 100644
 From 01aff455665e8f889330519096912ad0005add3c Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 17:56:16 +0100
-Subject: [PATCH 008/136] Add chroma location to hevc parse
+Subject: [PATCH 008/151] Add chroma location to hevc parse
 
 ---
  libavcodec/hevc_parser.c | 13 +++++++++++++
@@ -4948,7 +4948,7 @@ index 567e8d81d4..b6cfea64d3 100644
 From c80aad5d2fb373f7564e4257b1272f2decb06dd0 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 26 Sep 2022 18:20:50 +0100
-Subject: [PATCH 009/136] hwaccel: Add .abort_frame & use in hevcdec
+Subject: [PATCH 009/151] hwaccel: Add .abort_frame & use in hevcdec
 
 ---
  libavcodec/avcodec.h | 11 +++++++++++
@@ -5000,7 +5000,7 @@ index b6cfea64d3..8a0246fa21 100644
 From 317722fd652d9a1c1700319c80fc71acf68ddde6 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 26 Sep 2022 18:26:17 +0100
-Subject: [PATCH 010/136] hwaccel: Add CAP_MT_SAFE for accels that can use
+Subject: [PATCH 010/151] hwaccel: Add CAP_MT_SAFE for accels that can use
  multi-thread
 
 ---
@@ -5049,7 +5049,7 @@ index d9d5afaa82..2cc89a41f5 100644
 From 9005b263450e154a5ec5258fda17d5998fe7896b Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 17:59:08 +0100
-Subject: [PATCH 011/136] Weak link utils
+Subject: [PATCH 011/151] Weak link utils
 
 ---
  libavcodec/weak_link.c | 102 +++++++++++++++++++++++++++++++++++++++++
@@ -5199,7 +5199,7 @@ index 0000000000..415b6a27a0
 From 824be1710ca96d97c86836fdac0e7dcd28a4b92e Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 19:23:26 +0100
-Subject: [PATCH 012/136] Add v4l2_req V4L2 request H265 drm_prime decode
+Subject: [PATCH 012/151] Add v4l2_req V4L2 request H265 drm_prime decode
 
 Has the abiliy to switch between kernel API versions at runtime. This
 could be removed later once teher is no chance of usage on an old
@@ -10674,7 +10674,7 @@ index 0000000000..f14f594564
 From c99a0fe4d59212079de9bed222114abf95f7c989 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 19:30:36 +0100
-Subject: [PATCH 013/136] Add no_cvt_hw option to ffmpeg
+Subject: [PATCH 013/151] Add no_cvt_hw option to ffmpeg
 
 ---
  fftools/ffmpeg.c     | 6 ++++--
@@ -10744,7 +10744,7 @@ index 055275d813..761db36588 100644
 From 27e0c78a2df53fb2337bee4c383cdb58cbbc717e Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 28 Apr 2021 10:16:39 +0100
-Subject: [PATCH 014/136] Add vout_drm
+Subject: [PATCH 014/151] Add vout_drm
 
 ---
  configure                |   4 +
@@ -11457,7 +11457,7 @@ index 0000000000..cfb33ce7c3
 From cc536672adf4eefeaec16e9808f583c693ad7819 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 28 Apr 2021 11:34:18 +0100
-Subject: [PATCH 015/136] Add vout_egl
+Subject: [PATCH 015/151] Add vout_egl
 
 ---
  configure                |   6 +
@@ -12357,7 +12357,7 @@ index 0000000000..7b9c610ace
 From 867bd7c243e66a1c1756878e20df8f35db8025ec Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 28 Apr 2021 12:51:22 +0100
-Subject: [PATCH 016/136] V4L2 stateful rework
+Subject: [PATCH 016/151] V4L2 stateful rework
 
 ---
  libavcodec/Makefile       |   3 +-
@@ -14780,7 +14780,7 @@ index 4944d08511..7f6033ac2c 100644
 From 12f8f12326b83dd3c22084f8922705d79a13d195 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 10 Jun 2021 18:46:21 +0100
-Subject: [PATCH 017/136] Fix crash in hw_device_default_name if type not found
+Subject: [PATCH 017/151] Fix crash in hw_device_default_name if type not found
  (NONE)
 
 ---
@@ -14804,7 +14804,7 @@ index 88fa782470..740a5e7153 100644
 From 7f6bce459e683bff3a0b972922fbcc808e9177a6 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 10 Jun 2021 18:59:18 +0100
-Subject: [PATCH 018/136] Allow v4l2m2m to select non-drm_prime output formats
+Subject: [PATCH 018/151] Allow v4l2m2m to select non-drm_prime output formats
 
 ---
  libavcodec/v4l2_buffers.c |  2 +-
@@ -14871,7 +14871,7 @@ index 7f6033ac2c..a4b5a4e7e9 100644
 From 9b0d964b727d98271f7f2f4dcdbcb1b41a429e2b Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 10 Jun 2021 18:59:38 +0100
-Subject: [PATCH 019/136] Fix YUV420P output from v4l2m2m
+Subject: [PATCH 019/151] Fix YUV420P output from v4l2m2m
 
 Also put get_width get_height inlines in header as they are generally
 useful.
@@ -14988,7 +14988,7 @@ index 24a9c94864..8f054f2f50 100644
 From 14e9b4bf1b34b3d1e1e6a4fc755cc595416e7d7b Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 10 Jun 2021 19:23:44 +0100
-Subject: [PATCH 020/136] Report buffer overflows in v4l2m2m
+Subject: [PATCH 020/151] Report buffer overflows in v4l2m2m
 
 ---
  libavcodec/v4l2_buffers.c | 14 ++++++++++----
@@ -15064,7 +15064,7 @@ index 6fe2586627..81aced0c2b 100644
 From 072907a7fcf160d12972997d24fdf62641687ea4 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 14 Jun 2021 11:55:16 +0100
-Subject: [PATCH 021/136] Increase V4L2 H264 stateful coded buffer size
+Subject: [PATCH 021/151] Increase V4L2 H264 stateful coded buffer size
 
 Try to set a min size of frame size / 2 for bitbuffers passed to V4l2.
 This fixes a few streams that have large I-frames.  You would hope
@@ -15188,7 +15188,7 @@ index a4b5a4e7e9..1851acbc93 100644
 From 6087c8c054e1ff3d2e6e62d5e32705d079928b64 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 28 Jun 2021 12:13:35 +0100
-Subject: [PATCH 022/136] Fix raw video s.t. it respects any remaining cropping
+Subject: [PATCH 022/151] Fix raw video s.t. it respects any remaining cropping
 
 This fixes the long standing CONFWIN_A conformance test failure for drm.
 ---
@@ -15458,7 +15458,7 @@ index 7a9fdbd263..baf18920fa 100644
 From 597858c11fbfbe0f54c1b68d9683025929258bc1 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 13 Aug 2021 15:38:28 +0100
-Subject: [PATCH 023/136] Set frame interlace from V4L2 buffer field
+Subject: [PATCH 023/151] Set frame interlace from V4L2 buffer field
 
 ---
  libavcodec/v4l2_buffers.c | 12 ++++++++++++
@@ -15498,7 +15498,7 @@ index de31f7ced9..97b8eb1db3 100644
 From 05906e2086b5087d615485ec9a09b1493dbb32e1 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 13 Aug 2021 16:11:53 +0100
-Subject: [PATCH 024/136] Fix V4L2 stateful to avoid crash if flush before
+Subject: [PATCH 024/151] Fix V4L2 stateful to avoid crash if flush before
  start
 
 ---
@@ -15524,7 +15524,7 @@ index a17ae027a6..eb901e8fab 100644
 From 7157b6032e759078a7d751e5dd5762970f3d1e8c Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 9 Sep 2021 17:44:13 +0100
-Subject: [PATCH 025/136] Copy properties from frame to v4l2 buffer
+Subject: [PATCH 025/151] Copy properties from frame to v4l2 buffer
 
 Now copies all the properties in ff_v4l2_buffer_avframe_to_buf that
 ff_v4l2_buffer_buf_to_avframe copies
@@ -15695,7 +15695,7 @@ index 97b8eb1db3..126d2a17f4 100644
 From 15415ab226f966fd12e70d79fde3cb80f3d09144 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 17 Nov 2021 16:49:01 +0000
-Subject: [PATCH 026/136] ffmpeg: Do not inc DTS on no decode output
+Subject: [PATCH 026/151] ffmpeg: Do not inc DTS on no decode output
 
 V4L2 H264 decode has long latency and sometimes spits out a long stream
 of output without input. In this case incrementing DTS is wrong. There
@@ -15727,7 +15727,7 @@ index 5dc2cd73c1..ba0c1898cf 100644
 From 7bf6c062ed8a1e635aa5722c0072724f236daf00 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 17 Nov 2021 17:32:59 +0000
-Subject: [PATCH 027/136] v4l2_m2m_dec: Adjust timebase if H264
+Subject: [PATCH 027/151] v4l2_m2m_dec: Adjust timebase if H264
 
 Adjust AVCodecContext time_base if H264 in the same way that the
 software decoder does.
@@ -15760,7 +15760,7 @@ index 1851acbc93..aa1e5c1597 100644
 From 3cd23a761397ae75ed032c1687da5d6b76ddaaaa Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 17 Nov 2021 17:38:27 +0000
-Subject: [PATCH 028/136] v4l2_m2m_dec: Produce best guess PTSs if none
+Subject: [PATCH 028/151] v4l2_m2m_dec: Produce best guess PTSs if none
  supplied
 
 Filter scheduling gets confused by missing PTSs and makes poor guesses
@@ -15895,7 +15895,7 @@ index aa1e5c1597..a5a2afbd27 100644
 From ee8be1e900f98212b6c4940980cc7a80becfc07c Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 17 Nov 2021 17:59:27 +0000
-Subject: [PATCH 029/136] v4l2_m2m_dec: Try harder to get an initial frame
+Subject: [PATCH 029/151] v4l2_m2m_dec: Try harder to get an initial frame
 
 If the input Q is full then wait on a short timeout for a capture frame
 rather than stuffing yet still another frame into the input if we could
@@ -15936,7 +15936,7 @@ index a5a2afbd27..b49f470c0a 100644
 From 72da14331c2160a12b69d666d493e0e74c5e8914 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 17 Nov 2021 18:04:56 +0000
-Subject: [PATCH 030/136] Add a V4L2 M2M deinterlace filter
+Subject: [PATCH 030/151] Add a V4L2 M2M deinterlace filter
 
 Add a V4L2 deinterlace filter that will accept DRMPRIME frames.
 
@@ -17277,7 +17277,7 @@ index 0000000000..1a933b7e0a
 From 0fb00e51d1ca40eed22bfc66b7f309fdc56229bc Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 2 Dec 2021 17:49:55 +0000
-Subject: [PATCH 031/136] Put no_pts_rescale in context which makes more sense
+Subject: [PATCH 031/151] Put no_pts_rescale in context which makes more sense
  than an arg
 
 ---
@@ -17558,7 +17558,7 @@ index b49f470c0a..36754b314a 100644
 From 5e36908e6f2f06b68e85873cbcd421c0973f6409 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 8 Dec 2021 15:00:37 +0000
-Subject: [PATCH 032/136] Use bitbuf min size for all streams
+Subject: [PATCH 032/151] Use bitbuf min size for all streams
 
 ---
  libavcodec/v4l2_m2m_dec.c | 5 +----
@@ -17589,7 +17589,7 @@ index 36754b314a..48a6810d18 100644
 From 5fcbcd31761eea31dc0157793f558eaaadfe2ac3 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 3 Dec 2021 12:54:18 +0000
-Subject: [PATCH 033/136] Track pending frames in v4l2 stateful
+Subject: [PATCH 033/151] Track pending frames in v4l2 stateful
 
 Track which frames are pending decode in the v4l2 stateful decoder.
 This relies on DTS & PTS having some relationship to reality, so
@@ -17847,7 +17847,7 @@ index 48a6810d18..d8ebb466cd 100644
 From 6fae7b3f42c8e9e431a59323c0faa6c88fe951d9 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 15 Dec 2021 17:58:21 +0000
-Subject: [PATCH 034/136] Use pending tracking to reduce v4l2 latency
+Subject: [PATCH 034/151] Use pending tracking to reduce v4l2 latency
 
 If there are more than 5 pending decodes outstanding then add a small
 timeout to the capture poll to reduce the rate at which frames are
@@ -17970,7 +17970,7 @@ index d8ebb466cd..7e7e4729d0 100644
 From 175abd2eb961a3718a660e1f9eda08b37b01b309 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 15 Dec 2021 12:23:54 +0000
-Subject: [PATCH 035/136] Allow logger() to take const ctx
+Subject: [PATCH 035/151] Allow logger() to take const ctx
 
 ---
  libavcodec/v4l2_buffers.c | 2 +-
@@ -18015,7 +18015,7 @@ index 64540a37b3..d3df48aed4 100644
 From 21d4f3f644c45084c621cb5aa577169bf5c15017 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 15 Dec 2021 13:00:27 +0000
-Subject: [PATCH 036/136] Track numbere of bufs qed with an atomic
+Subject: [PATCH 036/151] Track numbere of bufs qed with an atomic
 
 Safer and faster than counting status
 ---
@@ -18089,7 +18089,7 @@ index 4cc164886c..a4176448d5 100644
 From b2fa4ab3d63924597b8c3659123b145a786a2c13 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 9 Dec 2021 12:01:25 +0000
-Subject: [PATCH 037/136] Clear pkt_buf on flush
+Subject: [PATCH 037/151] Clear pkt_buf on flush
 
 ---
  libavcodec/v4l2_m2m_dec.c | 3 +++
@@ -18113,7 +18113,7 @@ index 7e7e4729d0..09ec496351 100644
 From 16cf94cb5e1d11f4c3a6b8a43557383ce78112e0 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 15 Dec 2021 12:52:56 +0000
-Subject: [PATCH 038/136] Rework v4l2 buffer dequeue
+Subject: [PATCH 038/151] Rework v4l2 buffer dequeue
 
 ---
  libavcodec/v4l2_context.c | 543 ++++++++++++++++++--------------------
@@ -19150,7 +19150,7 @@ index 09ec496351..e4b6569ba5 100644
 From a2519f7a512edde7433aced70de4464e21805693 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 9 Dec 2021 18:51:00 +0000
-Subject: [PATCH 039/136] Honor result of ff_get_format if possible
+Subject: [PATCH 039/151] Honor result of ff_get_format if possible
 
 ---
  libavcodec/v4l2_m2m_dec.c | 6 +++++-
@@ -19185,7 +19185,7 @@ index e4b6569ba5..c9655bcc3b 100644
 From a1cd1cb98e48c631392b385ccac5ab7b09bb5ee9 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 14 Dec 2021 16:11:10 +0000
-Subject: [PATCH 040/136] Add an always-reinit quirk
+Subject: [PATCH 040/151] Add an always-reinit quirk
 
 ---
  libavcodec/v4l2_context.c |  7 +++++--
@@ -19291,7 +19291,7 @@ index c9655bcc3b..e2b10f5e3a 100644
 From 2470968adf0d28bbaf310e782720dd00d57d7bf6 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 4 Jan 2022 16:58:31 +0000
-Subject: [PATCH 041/136] v4l2_buffers: rework flags for keyframe
+Subject: [PATCH 041/151] v4l2_buffers: rework flags for keyframe
 
 Previously flags could become confused and keyframe info could be lost.
 This fixes that and removes the duplicate flags field in V4L2Buffer.
@@ -19400,7 +19400,7 @@ index c11b5e6863..53b522d43e 100644
 From 5dc38f5d088beea4da57e82969643cc831c40cf0 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 22 Mar 2022 11:44:30 +0000
-Subject: [PATCH 042/136] v4l2m2m: Rework decode to wait for missing buffer,
+Subject: [PATCH 042/151] v4l2m2m: Rework decode to wait for missing buffer,
  add dynamic pending
 
 Previously receive_frame exited with EAGAIN if no capture buffer
@@ -19620,7 +19620,7 @@ index e2b10f5e3a..2e30449dfc 100644
 From 33765b769b4301e03f31b65e225fcdb0eff4c0e4 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 25 Mar 2022 15:37:58 +0000
-Subject: [PATCH 043/136] v4l2_m2m2_dec: Avoid loop if unable to resize buffers
+Subject: [PATCH 043/151] v4l2_m2m2_dec: Avoid loop if unable to resize buffers
 
 If source change signals a buffer size that cannot be honored give up
 rather than looping indefinitely.  This happens on Pi if (say) a
@@ -19667,7 +19667,7 @@ index 7ddb759810..007a58c8f1 100644
 From bb7ad2392ce83149a1ba40ecacb36e051b6bf785 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 25 Mar 2022 18:14:40 +0000
-Subject: [PATCH 044/136] v4l2dec: Improve size/format validation on init
+Subject: [PATCH 044/151] v4l2dec: Improve size/format validation on init
 
 ---
  libavcodec/v4l2_m2m_dec.c      | 84 ++++++++++++++++++++++++++++++++--
@@ -19809,7 +19809,7 @@ index b0a5930844..76ab0916cd 100644
 From 4646b558c0e45f506578a5a452820f55983abc82 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 13 Apr 2022 16:05:56 +0000
-Subject: [PATCH 045/136] v4l2 stateless hevc: Add another API variation for
+Subject: [PATCH 045/151] v4l2 stateless hevc: Add another API variation for
  linux 5.18
 
 This is probably going to be a short lived variation and may end up
@@ -20255,7 +20255,7 @@ index f14f594564..ed48d62e2d 100644
 From 92160173e701aa7e2f1011e63596e48d15e691a9 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 3 May 2022 12:44:42 +0000
-Subject: [PATCH 046/136] Remove V4l2 frame size check for meson-vdec
+Subject: [PATCH 046/151] Remove V4l2 frame size check for meson-vdec
 
 ---
  libavcodec/v4l2_m2m.h     |  3 ++-
@@ -20315,7 +20315,7 @@ index 8dcadf461b..888ba67fea 100644
 From 8ba5576e7fcd24c2f450f0295cc3b6d8e82e8649 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 23 May 2022 18:05:20 +0100
-Subject: [PATCH 047/136] v4l2m2m_dec: Make some error rturns a bit more robust
+Subject: [PATCH 047/151] v4l2m2m_dec: Make some error rturns a bit more robust
 
 ---
  libavcodec/v4l2_context.c |  5 ++---
@@ -20384,7 +20384,7 @@ index 888ba67fea..88a341aae2 100644
 From aafa5968f8713319be35cf26069c98566d5bf59b Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 24 May 2022 17:02:58 +0000
-Subject: [PATCH 048/136] v4l2m2m_dec: Support in-pkt AV_PKT_DATA_NEW_EXTRADATA
+Subject: [PATCH 048/151] v4l2m2m_dec: Support in-pkt AV_PKT_DATA_NEW_EXTRADATA
 
 Support packet side-data containing AV_PKT_DATA_NEW_EXTRADATA.  Should
 also detect and complain about unexpected streams of empty packets.
@@ -20494,7 +20494,7 @@ index 88a341aae2..392a68f0c7 100644
 From e9bced67bdb40096d31067d41956276e9e1af11a Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 24 May 2022 20:02:48 +0000
-Subject: [PATCH 049/136] v4l2m2m_dec: Catch repeated Q fulls
+Subject: [PATCH 049/151] v4l2m2m_dec: Catch repeated Q fulls
 
 ---
  libavcodec/v4l2_m2m_dec.c | 8 +++++++-
@@ -20536,7 +20536,7 @@ index 392a68f0c7..7e17044706 100644
 From 0c974e4da2c0311836145f2fd42081d40eb15998 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 25 May 2022 15:22:12 +0000
-Subject: [PATCH 050/136] Remove requirement for epoxy & libudev config options
+Subject: [PATCH 050/151] Remove requirement for epoxy & libudev config options
 
 ---
  configure              | 26 +++++++++++++++++---------
@@ -20663,7 +20663,7 @@ index 65576846e8..37cea71756 100755
 From 9f234d8cbde2829e6a70fd3cb6324998df8a31f3 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 27 May 2022 09:36:51 +0000
-Subject: [PATCH 051/136] hevc: If hwaccel avoid creation of s/w only vars
+Subject: [PATCH 051/151] hevc: If hwaccel avoid creation of s/w only vars
 
 ---
  libavcodec/hevc_refs.c | 35 +++++++++++++++++++++--------------
@@ -20801,7 +20801,7 @@ index 2867cb2e16..17f53322fb 100644
 From bb2ddc480634141bed9afd3f66e7f63f5091bb2f Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 30 May 2022 17:51:44 +0100
-Subject: [PATCH 052/136] rpi_sand: Add SAND30->NV12 conversion
+Subject: [PATCH 052/151] rpi_sand: Add SAND30->NV12 conversion
 
 C code only. Reworks the hwcontext_drm conversion to use the
 rpi_sand_fns generic frame convert fn rather than calling the
@@ -21023,7 +21023,7 @@ index 634b55e800..462ccb8abd 100644
 From b55c351e6954c800229d97dc6c982ca8f998c848 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 1 Jun 2022 17:49:26 +0000
-Subject: [PATCH 053/136] rpi_sand: Add SAND30->NV12 asm for Armv7 & Armv8
+Subject: [PATCH 053/151] rpi_sand: Add SAND30->NV12 asm for Armv7 & Armv8
 
 Also reworks the previous Armv8 SAND30->Y16 function in a slightly more
 efficient way that makes it look more like the Armv7 version.
@@ -21962,7 +21962,7 @@ index 256c3d532f..b6071e2928 100644
 From 24c3eef4487a36d5189ecd934b65a7c6a0b53d03 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 7 Jun 2022 14:46:12 +0000
-Subject: [PATCH 054/136] v4l2_m2m_enc: Add the ability to encode DRM_PRIME
+Subject: [PATCH 054/151] v4l2_m2m_enc: Add the ability to encode DRM_PRIME
  frames
 
 ---
@@ -23337,7 +23337,7 @@ index 9a0837ecf3..05ff6ba726 100644
 From 6b437ce70582c67971aa81871a6694a08b709784 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 8 Jun 2022 16:13:31 +0000
-Subject: [PATCH 055/136] v4l2_m2m_dec: Use DTS for best effort PTS if PTS is
+Subject: [PATCH 055/151] v4l2_m2m_dec: Use DTS for best effort PTS if PTS is
  always NO_PTS
 
 If we do have DTS but don't have PTS then assume PTS=DTS.
@@ -23422,7 +23422,7 @@ index fbbfc81342..485a96f4b4 100644
 From ec8d1c2c0b6bd3544e5e30500a167fc31abde17a Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 30 Jun 2022 15:59:23 +0000
-Subject: [PATCH 056/136] v4l2: Update H265 request for current API
+Subject: [PATCH 056/151] v4l2: Update H265 request for current API
 
 This works with v9 of the H265 patch set which hopefully will be the
 last one. Hevc controls extracted from patched v4l2-controls into
@@ -24211,7 +24211,7 @@ index ed48d62e2d..d4adb3f812 100644
 From 21a348ae3282318fa96d3a6e2c70f3d4b90a7d52 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Sun, 3 Jul 2022 13:40:41 +0000
-Subject: [PATCH 057/136] v4l2_req: Observe limit on size of slice_array
+Subject: [PATCH 057/151] v4l2_req: Observe limit on size of slice_array
 
 This in fact provides some minor simplifications by combing the
 multi-slice and single-slice paths.
@@ -24342,7 +24342,7 @@ index d4adb3f812..0029e23309 100644
 From 4f1d74cc8eea6a1bd6f2317a10c0ecf620315dec Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 4 Jul 2022 14:43:20 +0100
-Subject: [PATCH 058/136] v4l2_req: Add entry point offsets array control
+Subject: [PATCH 058/151] v4l2_req: Add entry point offsets array control
 
 ---
  libavcodec/v4l2_req_hevc_vx.c  | 88 +++++++++++++++++++++++++++-------
@@ -24580,7 +24580,7 @@ index 0029e23309..99c90064ea 100644
 From d0e5ed2dff1b8f8909ceb968cb3afe2b20093fda Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 4 Jul 2022 16:22:54 +0100
-Subject: [PATCH 059/136] v4l2_req: Support Annex B
+Subject: [PATCH 059/151] v4l2_req: Support Annex B
 
 ---
  libavcodec/v4l2_req_hevc_vx.c | 61 +++++++++++++++++++++++------------
@@ -24694,7 +24694,7 @@ index 43ef6631ed..5e0db9850a 100644
 From a75506e18a964c9f50efa224a3fa4179c9ef2127 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 4 Jul 2022 18:24:03 +0100
-Subject: [PATCH 060/136] v4l2_req: Add frame mode decode
+Subject: [PATCH 060/151] v4l2_req: Add frame mode decode
 
 ---
  libavcodec/v4l2_req_hevc_vx.c | 69 +++++++++++++++++++++++------------
@@ -24820,7 +24820,7 @@ index 5e0db9850a..ada53d0d44 100644
 From 9cf01f1485dcf71bcad7981d45029425d9abf115 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 5 Jul 2022 12:54:22 +0000
-Subject: [PATCH 061/136] v4l2_req: Fix probe for frame based decode
+Subject: [PATCH 061/151] v4l2_req: Fix probe for frame based decode
 
 ---
  libavcodec/v4l2_req_hevc_vx.c | 33 +++++++++++++++++++++++----------
@@ -24903,7 +24903,7 @@ index ada53d0d44..5d083016f8 100644
 From e7a62226f26073149d35c89268f56e17c8f45d76 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 26 Jul 2022 15:46:14 +0000
-Subject: [PATCH 062/136] vf_deinterlace_v4l2m2m: Support NV12 through
+Subject: [PATCH 062/151] vf_deinterlace_v4l2m2m: Support NV12 through
  deinterlace
 
 Supports NV12 (though not yet NV12M) through deinterlace.
@@ -25229,7 +25229,7 @@ index 1a933b7e0a..1a3bef5bcb 100644
 From 3d07826bcf588ad0384d00b210415664aa4489fb Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 19 Aug 2022 15:29:11 +0000
-Subject: [PATCH 063/136] v4l2_req: Enable use of MMAP for buffer alloc
+Subject: [PATCH 063/151] v4l2_req: Enable use of MMAP for buffer alloc
 
 Use MMAP rather than DMABUF if either the dmabuf device can't be opened
 or create_buf doesn't set the capability.
@@ -25961,7 +25961,7 @@ index cd79aad563..5cf17dd5e3 100644
 From 79c2fcac56586ce9eea0cc8c6b13d2cd54f3e468 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 22 Aug 2022 12:35:40 +0000
-Subject: [PATCH 064/136] Set buffer lengths on DQ
+Subject: [PATCH 064/151] Set buffer lengths on DQ
 
 ---
  libavcodec/v4l2_req_media.c | 8 ++++++++
@@ -25990,7 +25990,7 @@ index 910ac77bb6..1a9944774a 100644
 From 8f3245ca1e4b2ec7e13fc2f3bffbc964ee8fc290 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 22 Aug 2022 17:11:24 +0000
-Subject: [PATCH 065/136] Fix compile if videodev2.h defines V4L2 HEVC request
+Subject: [PATCH 065/151] Fix compile if videodev2.h defines V4L2 HEVC request
  API
 
 If videodev2.h does define the HEVC request API it is really hard to
@@ -26117,7 +26117,7 @@ index 5cf17dd5e3..614a1b4d99 100644
 From 35ec6af32c4f05b076f84ab343a8fc0d3263ba44 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 12 Sep 2022 17:59:22 +0100
-Subject: [PATCH 066/136] v4l2_m2m_enc: Send headers in in pkt side_data
+Subject: [PATCH 066/151] v4l2_m2m_enc: Send headers in in pkt side_data
 
 If GLOBAL_HEADERS are requested then we can't provide them at init time
 so send as NEW_EXTRADATA side data in a similar way to some AV1
@@ -26198,7 +26198,7 @@ index 05ff6ba726..099ad23928 100644
 From dfc754491cea9192945b92ca9c8d3919321e30ad Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 14 Sep 2022 15:44:10 +0000
-Subject: [PATCH 067/136] matroskaenc: Allow H264 SPS/PPS headers in packet
+Subject: [PATCH 067/151] matroskaenc: Allow H264 SPS/PPS headers in packet
  sidedata
 
 ---
@@ -26267,7 +26267,7 @@ index 113541bd9a..61e4c976ef 100644
 From 30c6ca4e24ae2acbd7f7f122f5275beb62b625c6 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 14 Sep 2022 15:55:15 +0000
-Subject: [PATCH 068/136] movenc: Allow H264 SPS/PPS headers in packet sidedata
+Subject: [PATCH 068/151] movenc: Allow H264 SPS/PPS headers in packet sidedata
 
 ---
  libavformat/movenc.c | 1 +
@@ -26289,7 +26289,7 @@ index c4fcb5f8b1..891adbf7b2 100644
 From 1c7c3e99e9ed90f241aecbe7b2269229587d1e03 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 26 Sep 2022 12:45:05 +0100
-Subject: [PATCH 069/136] Allow ffmpeg to select codec internal hwfmts if
+Subject: [PATCH 069/151] Allow ffmpeg to select codec internal hwfmts if
  no_cvt_hw
 
 This allows the selection of DRM_PRIME from v4l2m2m without forcing it
@@ -26326,7 +26326,7 @@ index ba0c1898cf..839da7b472 100644
 From ecf273fd02e8aafe8775b1f291b9664b1b49572e Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 1 Sep 2022 11:42:41 +0000
-Subject: [PATCH 070/136] vf_deinterlace_v4l2m2m: Add a v4l2m2m scaler
+Subject: [PATCH 070/151] vf_deinterlace_v4l2m2m: Add a v4l2m2m scaler
 
 The logic for running an isp based scaler is pretty much identical to
 that for the deinterlacer so add to the deinterlacer. This requires
@@ -27809,7 +27809,7 @@ index 1a3bef5bcb..2df39ec0f1 100644
 From 7e7147d50bc6e3f13834525dba3a47d170422f07 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 22 Sep 2022 14:54:46 +0000
-Subject: [PATCH 071/136] v4l2_m2m: Adjust buffer allocation based on min/max
+Subject: [PATCH 071/151] v4l2_m2m: Adjust buffer allocation based on min/max
  controls
 
 Clip requested buffer count to min/max declared by driver.
@@ -27861,7 +27861,7 @@ index 6b97eab41e..ba36689ff3 100644
 From b69a2707a192ac509174899233a094373a3f5dc9 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 22 Sep 2022 15:00:12 +0000
-Subject: [PATCH 072/136] v4l2_m2m_dec: If src Q is full then wait indefinitely
+Subject: [PATCH 072/151] v4l2_m2m_dec: If src Q is full then wait indefinitely
  for buffer
 
 If it is not possible to add another buffer to the src Q then alawys
@@ -27894,7 +27894,7 @@ index 485a96f4b4..bb183097f6 100644
 From b1d37be81bbf683a0eb16923c9b9f045fd0ea0c0 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 22 Sep 2022 15:12:27 +0000
-Subject: [PATCH 073/136] vf_deinterlace_v4l2m2m: Add Q name to structure for
+Subject: [PATCH 073/151] vf_deinterlace_v4l2m2m: Add Q name to structure for
  debug
 
 ---
@@ -27928,7 +27928,7 @@ index 2df39ec0f1..4edecc02bf 100644
 From 794a5bfc3ec74fdc7664508a287a075708d5deef Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 22 Sep 2022 16:08:42 +0000
-Subject: [PATCH 074/136] v4l2_m2m_enc: Set src buffer count to min+2 by
+Subject: [PATCH 074/151] v4l2_m2m_enc: Set src buffer count to min+2 by
  default
 
 Set output.num_buffers to 0 by default which will then be set to min+2
@@ -27960,7 +27960,7 @@ index 099ad23928..b8ba815c37 100644
 From 85c42743046a05b347f33b1933e6d52ea1d17e00 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 22 Sep 2022 16:13:57 +0000
-Subject: [PATCH 075/136] vf_deinterlace_m2m: For deinterlace set outlink FR to
+Subject: [PATCH 075/151] vf_deinterlace_m2m: For deinterlace set outlink FR to
  twice inlink
 
 We used to set the outlink framerate to unknown but it turns out that
@@ -27997,7 +27997,7 @@ index 4edecc02bf..c52dae1c44 100644
 From 34a24bc0b0d427c75659d3907cb75afb6a9dc255 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 23 Sep 2022 11:30:56 +0000
-Subject: [PATCH 076/136] v4l2m2m: Add ff_v4l2_dq_all to drain all buffers from
+Subject: [PATCH 076/151] v4l2m2m: Add ff_v4l2_dq_all to drain all buffers from
  a Q
 
 Useful for where (encode) we might have drmprime buffers that we want to
@@ -28055,7 +28055,7 @@ index 21265f1bd7..523c53e97d 100644
 From 95dfc168c74f7b0f282c1b2ad9deb8fba10a7ce5 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 23 Sep 2022 11:38:36 +0000
-Subject: [PATCH 077/136] v4l2_m2m_enc: DQ output more frequently
+Subject: [PATCH 077/151] v4l2_m2m_enc: DQ output more frequently
 
 Ensure that we DQ any released src buffers on every op to avoid deadlock
 with source.
@@ -28114,7 +28114,7 @@ index b8ba815c37..a992a3cccc 100644
 From a40b1c38b0615fce0c0d9eb97510ab9e77b3e1ac Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 26 Sep 2022 18:20:00 +0100
-Subject: [PATCH 078/136] conf_native: Remove --enable-rpi from all builds
+Subject: [PATCH 078/151] conf_native: Remove --enable-rpi from all builds
 
 ---
  pi-util/conf_native.sh | 5 +++--
@@ -28148,7 +28148,7 @@ index 37cea71756..f22d531ca4 100755
 From 8fddfc8f1e3c95caded18705ed29be0ae95517bc Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 29 Sep 2022 19:48:08 +0000
-Subject: [PATCH 079/136] v4l2_m2m_dec: Deal correctly with avcC H264 data in
+Subject: [PATCH 079/151] v4l2_m2m_dec: Deal correctly with avcC H264 data in
  extradata
 
 Decoders expect AnnexB style headers, mkv and similar formats have
@@ -28391,7 +28391,7 @@ index bb183097f6..6bd9926b3f 100644
 From 70227ebbc2999bc49075a3b683392d94618ecd89 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 30 Sep 2022 14:20:23 +0000
-Subject: [PATCH 080/136] v4l2_request_hevc: Fix up
+Subject: [PATCH 080/151] v4l2_request_hevc: Fix up
  V4L2_CID_CODEC_STATELESS_BASE if missing
 
 ---
@@ -28420,7 +28420,7 @@ index 7829d82084..c02fdbe5a8 100644
 From 22d2000382839dbd04588af1bb20cc9d9b3a4362 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Sat, 1 Oct 2022 13:40:57 +0000
-Subject: [PATCH 081/136] vf_deinterlace_v4l2m2m: Fix compile on m/c without
+Subject: [PATCH 081/151] vf_deinterlace_v4l2m2m: Fix compile on m/c without
  V4L2 SAND
 
 ---
@@ -28554,7 +28554,7 @@ index c52dae1c44..716789f988 100644
 From f06f9ee41bf0f6f74240503f0cb427328cf6792f Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Sun, 2 Oct 2022 12:36:43 +0000
-Subject: [PATCH 082/136] configure: Fix v4l2_req_hevc_vx setup; set after deps
+Subject: [PATCH 082/151] configure: Fix v4l2_req_hevc_vx setup; set after deps
  fixups
 
 ---
@@ -28592,7 +28592,7 @@ index 5c00a183e3..94c8161b91 100755
 From 7d7709fb68561711f893269227147974fd6a46f3 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Sat, 1 Oct 2022 12:39:45 +0000
-Subject: [PATCH 083/136] vf_deinterlace_v4l2m2m: Ensure we get consistent
+Subject: [PATCH 083/151] vf_deinterlace_v4l2m2m: Ensure we get consistent
  final frames
 
 On getting EOS at the input of the filster do not simply drop everything
@@ -28944,7 +28944,7 @@ index 716789f988..ce875c2c61 100644
 From f893891df8f4e7738b2d9b49df4386fb160eb25f Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 5 Oct 2022 16:12:02 +0000
-Subject: [PATCH 084/136] v4l2_m2m_dec: Rework decode pending heuristic
+Subject: [PATCH 084/151] v4l2_m2m_dec: Rework decode pending heuristic
 
 The old code measured the length of the entire Q in the decoder and
 attempted to dynamically guess an appropriate length. This was prone to
@@ -29115,7 +29115,7 @@ index 6bd9926b3f..bec9b22fcf 100644
 From 7048e7e6b8621cf09b96cc7e44b8d82ba8619913 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 21 Oct 2022 13:48:07 +0000
-Subject: [PATCH 085/136] pthread_frame: Fix MT hwaccel. Recent change broke
+Subject: [PATCH 085/151] pthread_frame: Fix MT hwaccel. Recent change broke
  it.
 
 Revert the effects of 35aa7e70e7ec350319e7634a30d8d8aa1e6ecdda if the
@@ -29222,7 +29222,7 @@ index 2cc89a41f5..b14f8e9360 100644
 From 033056bd8ec63b16fe081446f70f41b5d5789b81 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 18 Oct 2022 13:18:27 +0000
-Subject: [PATCH 086/136] v4l2_req: Add swfmt to init logging
+Subject: [PATCH 086/151] v4l2_req: Add swfmt to init logging
 
 (cherry picked from commit dfa03b702baaf2952bcd2bbf8badcc2f9c961ddf)
 ---
@@ -29259,7 +29259,7 @@ index 614a1b4d99..767ecb036a 100644
 From 70779e742b93015e3e8aaa8f945a12d35917844d Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 18 Oct 2022 13:39:54 +0000
-Subject: [PATCH 087/136] v4l2_m2m: Avoid polling on a queue that is streamoff
+Subject: [PATCH 087/151] v4l2_m2m: Avoid polling on a queue that is streamoff
 
 (cherry picked from commit b2658bc56d3034a17db7f39597fc7d71bfe9a43b)
 ---
@@ -29304,7 +29304,7 @@ index 4a359bf45e..b296dc111c 100644
 From 438fed3702eb689f836c885ebbd813e48d4d4c4a Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 18 Oct 2022 14:07:04 +0000
-Subject: [PATCH 088/136] v4l2_m2m: Add function to get number of queued
+Subject: [PATCH 088/151] v4l2_m2m: Add function to get number of queued
  buffers
 
 (cherry picked from commit f9ac6485c00b4531dcff354222aef450b29728f4)
@@ -29336,7 +29336,7 @@ index 523c53e97d..8e4f681643 100644
 From 95ff4a65ed4c88ea7e02ee55e260e37a0ce2ba88 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 18 Oct 2022 14:48:20 +0000
-Subject: [PATCH 089/136] v4l2_m2m: Add timeouts to dq_all and dequeue_packet
+Subject: [PATCH 089/151] v4l2_m2m: Add timeouts to dq_all and dequeue_packet
 
 Add timeouts and use them to have better flow control in encode
 
@@ -29505,7 +29505,7 @@ index a992a3cccc..d0d27e5bc2 100644
 From e6654c1997a6f4dfd43b0f74b0168f5d644c1c74 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 18 Oct 2022 14:23:32 +0000
-Subject: [PATCH 090/136] v4l2_m2m_enc: Improve debug trace
+Subject: [PATCH 090/151] v4l2_m2m_enc: Improve debug trace
 
 (cherry picked from commit 113e89daffb329a0cd3d920abd483a4025664bf5)
 ---
@@ -29565,7 +29565,7 @@ index d0d27e5bc2..c8c2de3d47 100644
 From 02dca2b845125af7ec6dfb68bdc34726a45fee9c Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 18 Oct 2022 13:22:36 +0000
-Subject: [PATCH 091/136] v4l2_m2m_enc: Copy dest packets to memory if short of
+Subject: [PATCH 091/151] v4l2_m2m_enc: Copy dest packets to memory if short of
  v4l2 buffers
 
 (cherry picked from commit aa4ebbda400b42db952fc713b26927fc8636b0e5)
@@ -29604,7 +29604,7 @@ index c8c2de3d47..c23187e6e6 100644
 From ced9a7d442a04be08fc23e0af310312299a5d5a0 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 19 Oct 2022 11:00:16 +0000
-Subject: [PATCH 092/136] v4l2_m2m_dec: Fix pts_best_effort guessing for
+Subject: [PATCH 092/151] v4l2_m2m_dec: Fix pts_best_effort guessing for
  initial pts
 
 (cherry picked from commit 1af32e5c87586a0f7e76cdf19a012ddbcf3eac67)
@@ -29629,7 +29629,7 @@ index bec9b22fcf..47b2735f82 100644
 From 3e3cf6ed7280d8ad4f3eed17a6d18c2df3c0cd31 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 19 Oct 2022 14:47:04 +0000
-Subject: [PATCH 093/136] v4l2_m2m_enc: Wait for frame or space in src Q in
+Subject: [PATCH 093/151] v4l2_m2m_enc: Wait for frame or space in src Q in
  rx_pkt
 
 If receive_packet we should ensure that there is space in the source Q
@@ -29691,7 +29691,7 @@ index c23187e6e6..524e9424a5 100644
 From de9ec2bf6421b199aad9ea9dc7896a46c8813d94 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 19 Oct 2022 14:54:29 +0000
-Subject: [PATCH 094/136] vf_deinterlace_v4l2m2m: Print dts rather that NOPTS
+Subject: [PATCH 094/151] vf_deinterlace_v4l2m2m: Print dts rather that NOPTS
  in trace
 
 (cherry picked from commit e9b468f35f0c6ad9bfe96f5a05e449afa8ae074a)
@@ -29718,7 +29718,7 @@ index ce875c2c61..7c6751b69c 100644
 From d71a0a173240e18d518ae0b921ac43849524bd66 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 19 Oct 2022 14:55:21 +0000
-Subject: [PATCH 095/136] vf_deinterlace_v4l2m2m: Ignore "wanted" when
+Subject: [PATCH 095/151] vf_deinterlace_v4l2m2m: Ignore "wanted" when
  processing input
 
 If we gate send a frame to the outlink on its frame_wanted flag then we
@@ -29751,7 +29751,7 @@ index 7c6751b69c..a173a291f8 100644
 From 842e0a00288f9a2a862720990791b8eca9546955 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 19 Oct 2022 15:00:43 +0000
-Subject: [PATCH 096/136] conf_native: Add --enable-gpl
+Subject: [PATCH 096/151] conf_native: Add --enable-gpl
 
 (cherry picked from commit bab9bf4a2e39391940d88af2ce5d70236ac21f15)
 ---
@@ -29774,7 +29774,7 @@ index f22d531ca4..082d9b5832 100755
 From bf9aaf30818308a4651e00a2a64a0f65dc9a36e5 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 15 Nov 2022 13:33:00 +0000
-Subject: [PATCH 097/136] egl_vout: Make formatting consistent - no code
+Subject: [PATCH 097/151] egl_vout: Make formatting consistent - no code
  changes
 
 ---
@@ -30758,7 +30758,7 @@ index 7b9c610ace..a52cabb082 100644
 From 4d3a3973a07994b0a6ec35626e514fc40f439fe3 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 12 Dec 2022 16:49:43 +0000
-Subject: [PATCH 098/136] v4l2m2m: reporganise get_raw_format for loop logic
+Subject: [PATCH 098/151] v4l2m2m: reporganise get_raw_format for loop logic
 
 ---
  libavcodec/v4l2_context.c | 16 +++++-----------
@@ -30806,7 +30806,7 @@ index 7031f3d340..79a31cf930 100644
 From 123c5ef429ec6bd7d1875d621df88bb2ad7af0bd Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 12 Dec 2022 17:49:12 +0000
-Subject: [PATCH 099/136] drm_vout: Set zpos on the plane we pick to ensure it
+Subject: [PATCH 099/151] drm_vout: Set zpos on the plane we pick to ensure it
  is at the front
 
 ---
@@ -30876,7 +30876,7 @@ index cfb33ce7c3..9bd9e04421 100644
 From 0ee1c3b41774d05595376f8d25de2a901dbb12c7 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 12 Dec 2022 17:51:46 +0000
-Subject: [PATCH 100/136] drm_vout: Only set modifier flag and pass modifiers
+Subject: [PATCH 100/151] drm_vout: Only set modifier flag and pass modifiers
  if there are some
 
 ---
@@ -30936,7 +30936,7 @@ index 9bd9e04421..a56adea866 100644
 From 4534e6981c1718eaeec4c5f58cdf5592ee7f0329 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 12 Dec 2022 17:52:58 +0000
-Subject: [PATCH 101/136] drm_vout: Fix typo in error message
+Subject: [PATCH 101/151] drm_vout: Fix typo in error message
 
 ---
  libavdevice/drm_vout.c | 2 +-
@@ -30959,7 +30959,7 @@ index a56adea866..351abf1d60 100644
 From 0469d1fb132a0d55593611c56e83733efe58045b Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 12 Dec 2022 18:00:41 +0000
-Subject: [PATCH 102/136] drm_vout: Add option to name the drm_module to use
+Subject: [PATCH 102/151] drm_vout: Add option to name the drm_module to use
 
 ---
  libavdevice/drm_vout.c | 8 +++++---
@@ -31012,7 +31012,7 @@ index 351abf1d60..491e1dc608 100644
 From 61cb9fc3ce06e0ecaeeec3add143bc3a82956853 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 13:01:00 +0000
-Subject: [PATCH 103/136] dmabufs: Rework to allow for non-CMA backends
+Subject: [PATCH 103/151] dmabufs: Rework to allow for non-CMA backends
 
 ---
  libavcodec/v4l2_req_dmabufs.c | 161 ++++++++++++++++++++++++----------
@@ -31266,7 +31266,7 @@ index c4bbed18c6..1c3a5e861f 100644
 From 288807720443bbddf4c83c3589d1877c7fd418c3 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 13:07:58 +0000
-Subject: [PATCH 104/136] dmabufs: Use unref rather than deleet on cmabufs_ctl
+Subject: [PATCH 104/151] dmabufs: Use unref rather than deleet on cmabufs_ctl
 
 ---
  libavcodec/v4l2_req_dmabufs.c  | 12 +++++++++++-
@@ -31354,7 +31354,7 @@ index 767ecb036a..db7ed13b6d 100644
 From 9115f40c5f55873102312085f2e328d1a2101ae4 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 14:21:40 +0000
-Subject: [PATCH 105/136] egl_vout: Remove redundant & completely broken debug
+Subject: [PATCH 105/151] egl_vout: Remove redundant & completely broken debug
 
 ---
  libavdevice/egl_vout.c | 25 -------------------------
@@ -31400,7 +31400,7 @@ index a52cabb082..afc7afd13e 100644
 From 34711d5a1429213b6f4cf8ad163e8e8d108626e7 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 16:12:12 +0000
-Subject: [PATCH 106/136] v4l2m2m: Use offset from querybuf rather than always
+Subject: [PATCH 106/151] v4l2m2m: Use offset from querybuf rather than always
  0
 
 ---
@@ -31455,7 +31455,7 @@ index 1ac32c5989..d91d5d1dd0 100644
 From 15458be3fe79c14f4fdcc2ad786508d1b647c914 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 17:57:27 +0000
-Subject: [PATCH 107/136] v4l2m2m: Fix crash if init errors out before setting
+Subject: [PATCH 107/151] v4l2m2m: Fix crash if init errors out before setting
  avctx
 
 ---
@@ -31479,7 +31479,7 @@ index 1e30d15fd8..ac6bae0dc3 100644
 From 9f7f94c680b8aaedede9b3bcad37b645216cfcff Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 18:10:30 +0000
-Subject: [PATCH 108/136] v4l2_buffers: Add and use ctx_to_m2mctx + error debug
+Subject: [PATCH 108/151] v4l2_buffers: Add and use ctx_to_m2mctx + error debug
 
 ---
  libavcodec/v4l2_buffers.c | 22 +++++++++++++++-------
@@ -31546,7 +31546,7 @@ index 5ca58ea593..e28ef2d1e8 100644
 From 6b8bb2c41828351cd3a6f40be353696ae36450b7 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 18:53:22 +0000
-Subject: [PATCH 109/136] v4l2m2m: Add ability to use cma alloced dmabufs as
+Subject: [PATCH 109/151] v4l2m2m: Add ability to use cma alloced dmabufs as
  well as v4l2 mmap
 
 ---
@@ -31807,7 +31807,7 @@ index 47b2735f82..4d17057298 100644
 From 499bcdc4ed82c737ceab166a07b46e8ed8ccbc88 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 19:05:47 +0000
-Subject: [PATCH 110/136] testfilt: Skeleton of hw filter test code
+Subject: [PATCH 110/151] testfilt: Skeleton of hw filter test code
 
 ---
  pi-util/testfilt.py | 83 +++++++++++++++++++++++++++++++++++++++++++++
@@ -31907,7 +31907,7 @@ index 0000000000..b322dac0c2
 From 50ac318a472fd98e1e58605316ea6a2e8cde0a04 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 5 Jan 2023 14:39:30 +0000
-Subject: [PATCH 111/136] pixfmt: Add a #define to indicate presence of SAND
+Subject: [PATCH 111/151] pixfmt: Add a #define to indicate presence of SAND
  formats
 
 ---
@@ -31931,7 +31931,7 @@ index 22f70007c3..5cc780e7d5 100644
 From 23a3132e094d449ea05657704c0cffc3f0762c28 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 11 Jan 2023 16:30:37 +0000
-Subject: [PATCH 112/136] v4l2_m2m_dec: Fix initial pkt send if no extradata
+Subject: [PATCH 112/151] v4l2_m2m_dec: Fix initial pkt send if no extradata
 
 ---
  libavcodec/v4l2_m2m_dec.c | 4 ++--
@@ -31963,7 +31963,7 @@ index 4d17057298..9daf05adfe 100644
 From f4f6b9f1af137153e574c704804033e83f2ed1a8 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 16 Jan 2023 16:05:09 +0000
-Subject: [PATCH 113/136] v4l2m2m_dec: Make capture timeout long once pending
+Subject: [PATCH 113/151] v4l2m2m_dec: Make capture timeout long once pending
  count > 31
 
 For some applications (ffmpeg command line) the current heuristic of adding
@@ -32060,7 +32060,7 @@ index 9daf05adfe..c8ab883d7e 100644
 From 39f49cdaefa4483914f703c3f352c8894b3b81fd Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 6 Feb 2023 19:23:16 +0000
-Subject: [PATCH 114/136] Initial buffersink alloc callback code
+Subject: [PATCH 114/151] Initial buffersink alloc callback code
 
 (cherry picked from commit dde8d3c8f3cc279b9b92ed4f10a2e3990f4aadeb)
 ---
@@ -32155,7 +32155,7 @@ index 64e08de53e..09737d322f 100644
 From a63ae21e74ae48f1aedac53c18142b7596d041ad Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 30 Jan 2023 17:23:12 +0000
-Subject: [PATCH 115/136] v4l2_m2m_dec: Add a profile check
+Subject: [PATCH 115/151] v4l2_m2m_dec: Add a profile check
 
 Check the profile in avctx aginst what the v4l2 driver advertises. If
 the driver doesn't support the check then just accept anything.
@@ -32312,7 +32312,7 @@ index c8ab883d7e..098adf4821 100644
 From f734a6ead04a8381fccfae53066866a02a9516d2 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 1 Feb 2023 17:24:39 +0000
-Subject: [PATCH 116/136] v4l2_m2m_dec: Add extradata parse for h264 & hevc
+Subject: [PATCH 116/151] v4l2_m2m_dec: Add extradata parse for h264 & hevc
 
 If we have extradata we can extract profile & level and potentailly
 other useful info from it. Use the codec parser to get it if the decoder
@@ -32443,7 +32443,7 @@ index 098adf4821..e64bc707d3 100644
 From e28421e397743a94f5e37327ad234f59b6ae613d Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 20 Mar 2023 18:12:51 +0000
-Subject: [PATCH 117/136] clean_usr_libs: Now wipes the include files too
+Subject: [PATCH 117/151] clean_usr_libs: Now wipes the include files too
 
 When swapping ffmpeg versions obsolete makefiles could confuse
 configure utilities.
@@ -32480,7 +32480,7 @@ index b3b2d5509d..01bd6a6a22 100755
 From dcabd30310b88b45359609bac27d5d0f9bbc6dc1 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 20 Mar 2023 18:15:08 +0000
-Subject: [PATCH 118/136] vulkan: Add missing decode extension defines
+Subject: [PATCH 118/151] vulkan: Add missing decode extension defines
 
 When building on bookworm the video decode extension names
 were missing. This adds them. I expect this patch will be
@@ -32512,7 +32512,7 @@ index 2a9b5f4aac..11e7945f18 100644
 From 0231c208843a5badc799590eb5b9de907d1c26b2 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 21 Mar 2023 14:20:05 +0000
-Subject: [PATCH 119/136] v4l2_m2m_dec: Fix config file for finding if decoder
+Subject: [PATCH 119/151] v4l2_m2m_dec: Fix config file for finding if decoder
  enabled
 
 Fixes parsing of extradata for profile testing. 5.x changed where that
@@ -32538,7 +32538,7 @@ index e64bc707d3..91136f03da 100644
 From 822baefed69372b3380144ab44226e2c6ad3e298 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 21 Mar 2023 14:23:20 +0000
-Subject: [PATCH 120/136] v4l2_m2m_dec: Display profile given if skipped in
+Subject: [PATCH 120/151] v4l2_m2m_dec: Display profile given if skipped in
  debug
 
 ---
@@ -32562,7 +32562,7 @@ index 91136f03da..d124c7b1fc 100644
 From 6859fc2a8791c0fcc25851b77fed15a691ceb332 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 22 Mar 2023 16:08:08 +0000
-Subject: [PATCH 121/136] conf_native: Fix for 64-bit kernel with 32-bit
+Subject: [PATCH 121/151] conf_native: Fix for 64-bit kernel with 32-bit
  userspace
 
 (cherry picked from commit 5bb1e09cea95b4215c6904b9b1a726e83bc5d327)
@@ -32618,7 +32618,7 @@ index 082d9b5832..0a7d230f1b 100755
 From c35f074854a922c0c025159ddddd1abfc562a3d2 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 20 Apr 2023 11:48:25 +0000
-Subject: [PATCH 122/136] conf_native: Add install prefix variation
+Subject: [PATCH 122/151] conf_native: Add install prefix variation
 
 (cherry picked from commit 73c3019b534cb8f4b4e4c21995653f6ce440086d)
 ---
@@ -32732,7 +32732,7 @@ index 0a7d230f1b..f0ed159594 100755
 From 91ea652a95370a428f1353932b2a55dae7158acc Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 19 Apr 2023 10:47:58 +0000
-Subject: [PATCH 123/136] swcale: Add explicit bgr24->yv12 conversion
+Subject: [PATCH 123/151] swcale: Add explicit bgr24->yv12 conversion
 
 (cherry picked from commit 9a22d429f46a038321c66a0cd54737177641b434)
 ---
@@ -32890,7 +32890,7 @@ index 9af2e7ecc3..9047030ae4 100644
 From 207ea47b2153b276b53cd5a87528dbc532a9f551 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 20 Apr 2023 11:26:10 +0000
-Subject: [PATCH 124/136] swscale: Add unscaled XRGB->YUV420P functions
+Subject: [PATCH 124/151] swscale: Add unscaled XRGB->YUV420P functions
 
 (cherry picked from commit 04cc32ee3f390de513ad8c6156c0c66b2c60abc8)
 ---
@@ -33222,7 +33222,7 @@ index 9047030ae4..053c06adf5 100644
 From b5672a2d361ec4f064ae116a3452282996cc87a0 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 20 Apr 2023 11:35:44 +0000
-Subject: [PATCH 125/136] swscale: Add aarch64 unscaled RGB24->YUV420P
+Subject: [PATCH 125/151] swscale: Add aarch64 unscaled RGB24->YUV420P
 
 (cherry picked from commit 0cf416312095ce5bea3d2f7e9b14736d4b3ed160)
 ---
@@ -33480,7 +33480,7 @@ index d81110ec57..8cf40b65f5 100644
 From f62603136ee2eaf781519bd70e445b03f80960da Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 27 Apr 2023 13:03:52 +0000
-Subject: [PATCH 126/136] rgb2rgb: Fix rgb24->yuv420p with arbitrary wxh
+Subject: [PATCH 126/151] rgb2rgb: Fix rgb24->yuv420p with arbitrary wxh
 
 (cherry picked from commit 58771fdf0218dc670d8a343824f540e2f6e8785d)
 ---
@@ -34010,7 +34010,7 @@ index 8cf40b65f5..978ab443ea 100644
 From cf020c89ac47620c4a5390d0333e9ea70fbfa7b8 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 26 Apr 2023 15:36:07 +0000
-Subject: [PATCH 127/136] rgb2rgb: Use asm unconditionally
+Subject: [PATCH 127/151] rgb2rgb: Use asm unconditionally
 
 (cherry picked from commit 7c216c0804836b31c0ea093bb1dde5ab387724b1)
 ---
@@ -34074,7 +34074,7 @@ index f10c4ef2de..6a0e2dcc09 100644
 From 1895fdcaf403f403736ab52d1cb69dce7c964b66 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 27 Apr 2023 13:01:43 +0000
-Subject: [PATCH 128/136] tests/swscale: Add options for width and height on
+Subject: [PATCH 128/151] tests/swscale: Add options for width and height on
  the command line
 
 (cherry picked from commit eb8a09779688fc05bf204fdfcd063b04cda07271)
@@ -34233,7 +34233,7 @@ index 6c38041ddb..4cf41d9f64 100644
 From 94e48653a6bd1b8438887b486927e87b56651455 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 26 Apr 2023 16:31:23 +0000
-Subject: [PATCH 129/136] tests/swscale: Add a timing option
+Subject: [PATCH 129/151] tests/swscale: Add a timing option
 
 -t <n>   Where n is the number of time to loop the scale op.
          Often useful to do it 10 times or so for better resolution
@@ -34318,7 +34318,7 @@ index 4cf41d9f64..12776ffec7 100644
 From 406806d0b9d9cb113deb0d083a28cbccabab6825 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 20 Apr 2023 13:40:36 +0000
-Subject: [PATCH 130/136] swscale: RGB->YUV420 fix C template to allow odd
+Subject: [PATCH 130/151] swscale: RGB->YUV420 fix C template to allow odd
  widths
 
 (cherry picked from commit 08b2023e7b5292df0adc6593e4d20087f9cef5c8)
@@ -34455,7 +34455,7 @@ index 053c06adf5..52469b2e4a 100644
 From 68c6482d9473ce774e87cac2455a8c7b3e2d99b4 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 4 May 2023 14:26:14 +0000
-Subject: [PATCH 131/136] rtpenc: Add code to send H264 new extradata in
+Subject: [PATCH 131/151] rtpenc: Add code to send H264 new extradata in
  sidedata
 
 Fixes issue with pi V4L2 H264 encode which cannot create extradata
@@ -34508,7 +34508,7 @@ index a8d296a154..f67dc2a15a 100644
 From 5240cc7fc3abed8af5f178c5461ca9fe11a7d5e4 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 5 Jun 2023 08:34:38 +0000
-Subject: [PATCH 132/136] rgb2rgb: Fix luma narrow+saturation instruction
+Subject: [PATCH 132/151] rgb2rgb: Fix luma narrow+saturation instruction
 
 (cherry picked from commit 9cdac1c08ad5c0aea28907d1d3fd0bdda387955a)
 ---
@@ -34579,7 +34579,7 @@ index 978ab443ea..476ca723a0 100644
 From 9474d9d227f2af488d5d2bd614c5c707479ca3c3 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Sun, 4 Jun 2023 13:37:59 +0000
-Subject: [PATCH 133/136] v4l2_m2m_dec: Tweak pending count to use dts &
+Subject: [PATCH 133/151] v4l2_m2m_dec: Tweak pending count to use dts &
  reorder size
 
 (cherry picked from commit ca438b382c90f9a5f58f4708205e6ac25395db2a)
@@ -34744,7 +34744,7 @@ index d124c7b1fc..13af62e819 100644
 From 2145b9c9177f0fe9569ce39e2d4eb629caf8bd47 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 7 Jun 2023 11:14:52 +0000
-Subject: [PATCH 134/136] v4l2_m2m: Add encode size check
+Subject: [PATCH 134/151] v4l2_m2m: Add encode size check
 
 Previously an out of bounds size would fail whilst trying to copy the
 buffer with an unhelpful message. This produces a better error at init
@@ -34820,7 +34820,7 @@ index f802687b1b..28d9ed4988 100644
 From 805985ea191c98885a74dbf994b1ca11551cd81e Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 9 Jun 2023 10:28:12 +0000
-Subject: [PATCH 135/136] vf_bwdif: Add attributes to ask for vectorization
+Subject: [PATCH 135/151] vf_bwdif: Add attributes to ask for vectorization
 
 (cherry picked from commit 281250290ba5c2dcd8676e9a261050e65c10bcb7)
 ---
@@ -34933,7 +34933,7 @@ index 65c617ebb3..09e68523bb 100644
 From f4012f09da1c57a0aa5db01f9096992d0c385f7b Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Jun 2023 13:07:55 +0000
-Subject: [PATCH 136/136] v4l2m2m_dec: Fix h264 reorder size if no sps
+Subject: [PATCH 136/151] v4l2m2m_dec: Fix h264 reorder size if no sps
  initially
 
 (cherry picked from commit 8832f7924bf47cbca0de251d7b406917f958ebf4)
@@ -34955,3 +34955,2838 @@ index 13af62e819..11c83b2d66 100644
              }
              ff_h264_ps_uninit(&ps);
              break;
+
+From fd31937e4befa2368d48e234d66fb962246bf777 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Fri, 30 Jun 2023 18:03:29 +0000
+Subject: [PATCH 137/151] sand_fns: Add missing uxtw for neon stride
+
+---
+ libavutil/aarch64/rpi_sand_neon.S | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
+index 2f07d9674c..19411cf3f1 100644
+--- a/libavutil/aarch64/rpi_sand_neon.S
++++ b/libavutil/aarch64/rpi_sand_neon.S
+@@ -469,6 +469,7 @@ endfunc
+ function ff_rpi_sand30_lines_to_planar_y16, export=1
+                 lsl             w4,  w4,  #7
+                 sub             w4,  w4,  #64
++                uxtw            x4,  w4
+                 sub             w1,  w1,  w7, lsl #1
+                 uxtw            x6,  w6
+                 add             x8,  x2,  x6, lsl #7
+@@ -634,6 +635,7 @@ endfunc
+ function ff_rpi_sand30_lines_to_planar_y8, export=1
+                 lsl             w4,  w4,  #7
+                 sub             w4,  w4,  #64
++                uxtw            x4,  w4
+                 sub             w1,  w1,  w7
+                 uxtw            x6,  w6
+                 add             x8,  x2,  x6, lsl #7
+
+From f6a19a36ffe0dbe0a6e2e450dafec6711db19057 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Fri, 30 Jun 2023 18:12:16 +0000
+Subject: [PATCH 138/151] sand_fns: Rework aarch64 neon
+ sand30_lines_to_planar_c16
+
+Previous version could overflow its write buffer on small buffers
+which sometimes crashed WPP_F_ericsson_MAIN10_2.
+
+This version is probably faster too
+---
+ libavutil/aarch64/rpi_sand_neon.S | 329 ++++++++++++++----------------
+ 1 file changed, 151 insertions(+), 178 deletions(-)
+
+diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
+index 19411cf3f1..af7e2a88c4 100644
+--- a/libavutil/aarch64/rpi_sand_neon.S
++++ b/libavutil/aarch64/rpi_sand_neon.S
+@@ -248,199 +248,172 @@ incomplete_block_loop_end_c8:
+     ret
+ endfunc
+ 
+-//void ff_rpi_sand30_lines_to_planar_c16(
+-//  uint8_t * dst_u,            // [x0]
+-//  unsigned int dst_stride_u,  // [w1] == _w*2
+-//  uint8_t * dst_v,            // [x2]
+-//  unsigned int dst_stride_v,  // [w3] == _w*2
+-//  const uint8_t * src,        // [x4]
+-//  unsigned int stride1,       // [w5] == 128
+-//  unsigned int stride2,       // [w6] 
+-//  unsigned int _x,            // [w7] == 0
+-//  unsigned int y,             // [sp, #0] == 0
+-//  unsigned int _w,            // [sp, #8] -> w3
+-//  unsigned int h);            // [sp, #16] -> w7
+-
+-.macro rpi_sand30_lines_to_planar_c16_block_half
+-    ld1 { v0.4s,  v1.4s, v2.4s, v3.4s }, [x13], #64
+-
+-    xtn v4.4h, v0.4s
+-    ushr v0.4s, v0.4s, #10
+-    xtn v5.4h, v0.4s
+-    ushr v0.4s, v0.4s, #10
+-    xtn v6.4h, v0.4s
+-    xtn2 v4.8h, v1.4s
+-    ushr v1.4s, v1.4s, #10
+-    xtn2 v5.8h, v1.4s
+-    ushr v1.4s, v1.4s, #10
+-    xtn2 v6.8h, v1.4s
+-    and v4.16b, v4.16b, v16.16b
+-    and v5.16b, v5.16b, v16.16b
+-    and v6.16b, v6.16b, v16.16b
+-    st3 { v4.8h, v5.8h, v6.8h }, [sp], #48
+-    
+-    xtn v4.4h, v2.4s
+-    ushr v2.4s, v2.4s, #10
+-    xtn v5.4h, v2.4s
+-    ushr v2.4s, v2.4s, #10
+-    xtn v6.4h, v2.4s
+-    xtn2 v4.8h, v3.4s
+-    ushr v3.4s, v3.4s, #10
+-    xtn2 v5.8h, v3.4s
+-    ushr v3.4s, v3.4s, #10
+-    xtn2 v6.8h, v3.4s
+-    and v4.16b, v4.16b, v16.16b
+-    and v5.16b, v5.16b, v16.16b
+-    and v6.16b, v6.16b, v16.16b
+-    st3 { v4.8h, v5.8h, v6.8h }, [sp]
+-    sub sp, sp, #48
+-.endm
+-
+-function ff_rpi_sand30_lines_to_planar_c16, export=1
+-    stp x19, x20, [sp, #-48]!
+-    stp x21, x22, [sp, #16]
+-    stp x23, x24, [sp, #32]
+-
+-    ldr w3, [sp, #48+8]    // w3 = width
+-    ldr w7, [sp, #48+16]   // w7 = height
+-
+-    // reserve space on the stack for intermediate results
+-    sub sp, sp, #256
++// Unzip chroma
++//
++// On entry:
++// a0 = V0, U2,  ...
++// a1 = U0, V1,  ...
++// a2 = U1, V2,  ...
++// b0 = V8, U10, ...
++// b1 = U8, V9,  ...
++// b2 = U9, V10, ...
++//
++// On exit:
++// d0 = U0, U3, ...
++// ...
++// a0 = V0, V3, ..
++// ...
++//
++// Reg order for USAND is a1, a0, a2 (i.e. swap natural order of 1st 2 dest regs)
+ 
+-    // number of 128byte blocks per row, w8 = width / 48
+-    mov w9, #48
+-    udiv w8, w3, w9
++.macro UZPH_C d0, d1, d2, a0, a1, a2, b0, b1, b2
++                uzp1            \d0\().8h, \a1\().8h, \b1\().8h
++                uzp1            \d1\().8h, \a2\().8h, \b2\().8h
++                uzp2            \d2\().8h, \a0\().8h, \b0\().8h
+ 
+-    // remaining pixels (rem_pix) per row, w9 = width - w8 * 48
+-    mul w9, w8, w9
+-    sub w9, w3, w9
++                uzp1            \a0\().8h, \a0\().8h, \b0\().8h
++                uzp2            \a1\().8h, \a1\().8h, \b1\().8h
++                uzp2            \a2\().8h, \a2\().8h, \b2\().8h
++.endm
+ 
+-    // row offset, the beginning of the next row to process
+-    eor w10, w10, w10
++// SAND30 -> 10bit
++.macro USAND10 d0, d1, d2, a0, a1
++                shrn            \d2\().4h, \a0\().4s, #14
++                xtn             \d0\().4h, \a0\().4s
++                shrn            \d1\().4h, \a0\().4s, #10
+ 
+-    // offset to the beginning of the next block, w11 = stride2 * 128 - 128
+-    lsl w11, w6, #7
+-    sub w11, w11, #128
++                shrn2           \d2\().8h, \a1\().4s, #14
++                xtn2            \d0\().8h, \a1\().4s
++                shrn2           \d1\().8h, \a1\().4s, #10
+ 
+-    // decrease the height by one and in case of remaining pixels increase the block count by one
+-    sub w7, w7, #1
+-    cmp w9, #0
+-    cset w19, ne    // w19 == 1 iff reamining pixels != 0
+-    add w8, w8, w19
++                ushr            \d2\().8h, \d2\().8h, #6
++                bic             \d0\().8h, #0xfc,     lsl #8
++                bic             \d1\().8h, #0xfc,     lsl #8
++.endm
+ 
+-    // bytes we have to move dst back by at the end of every row
+-    mov w21, #48*2
+-    mul w21, w21, w8
+-    sub w21, w1, w21
++// void ff_rpi_sand30_lines_to_planar_c16(
++//   uint8_t * dst_u,            // [x0]
++//   unsigned int dst_stride_u,  // [w1]
++//   uint8_t * dst_v,            // [x2]
++//   unsigned int dst_stride_v,  // [w3]
++//   const uint8_t * src,        // [x4]
++//   unsigned int stride1,       // [w5]      128
++//   unsigned int stride2,       // [w6]
++//   unsigned int _x,            // [w7]      0
++//   unsigned int y,             // [sp, #0]
++//   unsigned int _w,            // [sp, #8]  w9
++//   unsigned int h);            // [sp, #16] w10
+ 
+-    mov w20, #0     // w20 = flag, last row processed
++function ff_rpi_sand30_lines_to_planar_c16, export=1
++                ldr             w7,  [sp, #0]                   // y
++                ldr             w8,  [sp, #8]                   // _w
++                ldr             w10, [sp, #16]                  // h
++                lsl             w6,  w6,  #7                    // Fixup stride2
++                sub             w6,  w6,  #64
++                uxtw            x6,  w6
++                sub             w1,  w1,  w8,  LSL #1           // Fixup chroma strides
++                sub             w3,  w3,  w8,  LSL #1
++                lsl             w7,  w7,  #7                    // Add y to src
++                add             x4,  x4,  w7,  UXTW
++10:
++                mov             w13, #0
++                mov             x5,  x4
++                mov             w9,  w8
++1:
++                ld1             {v0.4s-v3.4s}, [x5], #64
++                ld1             {v4.4s-v7.4s}, [x5], x6
+ 
+-    mov x12, #0x03ff03ff03ff03ff
+-    dup v16.2d, x12
++                USAND10         v17, v16, v18, v0, v1
++                USAND10         v20, v19, v21, v2, v3
++                UZPH_C          v0, v1, v2, v16, v17, v18, v19, v20, v21
++                USAND10         v23, v22, v24, v4, v5
++                USAND10         v26, v25, v27, v6, v7
++                UZPH_C          v4, v5, v6, v22, v23, v24, v25, v26, v27
+ 
+-    // iterate through rows, row counter = w12 = 0
+-    eor w12, w12, w12
+-row_loop_c16:
+-    cmp w12, w7
+-    bge row_loop_c16_fin
++                subs            w9,  w9,  #48
++                blt             2f
+ 
+-    // address of row data = src + row_offset
+-    mov x13, x4
+-    add x13, x13, x10
++                st3             {v0.8h-v2.8h},   [x0], #48
++                st3             {v4.8h-v6.8h},   [x0], #48
++                st3             {v16.8h-v18.8h}, [x2], #48
++                st3             {v22.8h-v24.8h}, [x2], #48
+ 
+-    eor w14, w14, w14
+-block_loop_c16:
+-    cmp w14, w8
+-    bge block_loop_c16_fin
+-
+-    rpi_sand30_lines_to_planar_c16_block_half
+-
+-    ld2 { v0.8h, v1.8h }, [sp], #32
+-    ld2 { v2.8h, v3.8h }, [sp], #32
+-    ld2 { v4.8h, v5.8h }, [sp]
+-    sub sp, sp, #64
+-
+-    st1 { v0.8h }, [x0], #16
+-    st1 { v2.8h }, [x0], #16
+-    st1 { v4.8h }, [x0], #16
+-    st1 { v1.8h }, [x2], #16
+-    st1 { v3.8h }, [x2], #16
+-    st1 { v5.8h }, [x2], #16
+-
+-    rpi_sand30_lines_to_planar_c16_block_half
+-
+-    ld2 { v0.8h, v1.8h }, [sp], #32
+-    ld2 { v2.8h, v3.8h }, [sp], #32
+-    ld2 { v4.8h, v5.8h }, [sp]
+-    sub sp, sp, #64
+-
+-    st1 { v0.8h }, [x0], #16
+-    st1 { v2.8h }, [x0], #16
+-    st1 { v4.8h }, [x0], #16
+-    st1 { v1.8h }, [x2], #16
+-    st1 { v3.8h }, [x2], #16
+-    st1 { v5.8h }, [x2], #16
+-
+-    add x13, x13, x11 // offset to next block
+-    add w14, w14, #1
+-    b block_loop_c16
+-block_loop_c16_fin:
++                bne             1b
++11:
++                subs            w10, w10, #1
++                add             x4,  x4,  #128
++                add             x0,  x0,  w1,  UXTW
++                add             x2,  x2,  w3,  UXTW
++                bne             10b
++99:
++                ret
+ 
+-    add w10, w10, #128
+-    add w12, w12, #1
+-    add x0, x0, w21, sxtw  // move dst pointers back by x21
+-    add x2, x2, w21, sxtw
+-    b row_loop_c16
+-row_loop_c16_fin:
+-
+-    cmp w20, #1
+-    beq row_loop_c16_fin2
+-    mov w20, #1
+-    sub w8, w8, w19 // decrease block count by w19
+-    add w7, w7, #1 // increase height
+-    b row_loop_c16
+-
+-row_loop_c16_fin2:
+-    sub x0, x0, w21, sxtw // readd x21 in case of the last row
+-    sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels
+-
+-    // last incomplete block to be finished
+-    // read operations are fine, stride2 is more than large enough even if rem_pix is 0
+-    rpi_sand30_lines_to_planar_c16_block_half
+-    ld2 { v0.8h, v1.8h }, [sp], #32
+-    ld2 { v2.8h, v3.8h }, [sp], #32
+-    ld2 { v4.8h, v5.8h }, [sp], #32
+-    rpi_sand30_lines_to_planar_c16_block_half
+-    ld2 { v0.8h, v1.8h }, [sp], #32
+-    ld2 { v2.8h, v3.8h }, [sp], #32
+-    ld2 { v4.8h, v5.8h }, [sp]
+-    sub sp, sp, #160
+-
+-    mov x4, sp
+-    eor w20, w20, w20
+-rem_pix_c16_loop:
+-    cmp w20, w9
+-    bge rem_pix_c16_fin
+-
+-    ldr w22, [x4], #4
+-    str w22, [x0], #2
+-    lsr w22, w22, #16
+-    str w22, [x2], #2 
+-
+-    add w20, w20, #1
+-    b rem_pix_c16_loop
+-rem_pix_c16_fin:
+-
+-    add sp, sp, #256
+-
+-    ldp x23, x24, [sp, #32]
+-    ldp x21, x22, [sp, #16]
+-    ldp x19, x20, [sp], #48
+-    ret
++// Partial final write
++2:
++                cmp             w9,  #24-48
++                blt             1f
++                st3             {v0.8h  - v2.8h},  [x0], #48
++                st3             {v16.8h - v18.8h}, [x2], #48
++                beq             11b
++                mov             v0.16b,  v4.16b
++                mov             v1.16b,  v5.16b
++                sub             w9,  w9,  #24
++                mov             v2.16b,  v6.16b
++                mov             v16.16b, v22.16b
++                mov             v17.16b, v23.16b
++                mov             v18.16b, v24.16b
++1:
++                cmp             w9,  #12-48
++                blt             1f
++                st3             {v0.4h  - v2.4h},  [x0], #24
++                st3             {v16.4h - v18.4h}, [x2], #24
++                beq             11b
++                mov             v0.2d[0],  v0.2d[1]
++                sub             w9,  w9,  #12
++                mov             v1.2d[0],  v1.2d[1]
++                mov             v2.2d[0],  v2.2d[1]
++                mov             v16.2d[0], v16.2d[1]
++                mov             v17.2d[0], v17.2d[1]
++                mov             v18.2d[0], v18.2d[1]
++1:
++                cmp             w9,  #6-48
++                blt             1f
++                st3             {v0.h  - v2.h}[0],  [x0], #6
++                st3             {v0.h  - v2.h}[1],  [x0], #6
++                st3             {v16.h - v18.h}[0], [x2], #6
++                st3             {v16.h - v18.h}[1], [x2], #6
++                beq             11b
++                mov             v0.s[0],  v0.s[1]
++                sub             w9,  w9,  #6
++                mov             v1.s[0],  v1.s[1]
++                mov             v2.s[0],  v2.s[1]
++                mov             v16.s[0], v16.s[1]
++                mov             v17.s[0], v17.s[1]
++                mov             v18.s[0], v18.s[1]
++1:
++                cmp             w9,  #3-48
++                blt             1f
++                st3             {v0.h  - v2.h}[0],  [x0], #6
++                st3             {v16.h - v18.h}[0], [x2], #6
++                beq             11b
++                mov             v0.h[0],  v0.h[1]
++                sub             w9,  w9,  #3
++                mov             v1.h[0],  v1.h[1]
++                mov             v16.h[0], v16.h[1]
++                mov             v17.h[0], v17.h[1]
++1:
++                cmp             w9,  #2-48
++                blt             1f
++                st2             {v0.h  - v1.h}[0],  [x0], #4
++                st2             {v16.h - v17.h}[0], [x2], #4
++                b               11b
++1:
++                st1             {v0.h}[0],  [x0], #2
++                st1             {v16.h}[0], [x2], #2
++                b               11b
+ endfunc
+ 
+ 
+-
+ //void ff_rpi_sand30_lines_to_planar_p010(
+ //  uint8_t * dest,
+ //  unsigned int dst_stride,
+
+From 68356e594ff32e18e419a476889d958dc24af4b2 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Fri, 30 Jun 2023 19:41:06 +0000
+Subject: [PATCH 139/151] sand_fns: Minor optimisations to aarch64 neon
+
+---
+ libavutil/aarch64/rpi_sand_neon.S | 140 ++++++------------------------
+ 1 file changed, 28 insertions(+), 112 deletions(-)
+
+diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
+index af7e2a88c4..11658de0c8 100644
+--- a/libavutil/aarch64/rpi_sand_neon.S
++++ b/libavutil/aarch64/rpi_sand_neon.S
+@@ -279,18 +279,37 @@ endfunc
+ // SAND30 -> 10bit
+ .macro USAND10 d0, d1, d2, a0, a1
+                 shrn            \d2\().4h, \a0\().4s, #14
+-                xtn             \d0\().4h, \a0\().4s
+                 shrn            \d1\().4h, \a0\().4s, #10
+ 
+                 shrn2           \d2\().8h, \a1\().4s, #14
+-                xtn2            \d0\().8h, \a1\().4s
+                 shrn2           \d1\().8h, \a1\().4s, #10
++                uzp1            \d0\().8h, \a0\().8h, \a1\().8h
+ 
+                 ushr            \d2\().8h, \d2\().8h, #6
+                 bic             \d0\().8h, #0xfc,     lsl #8
+                 bic             \d1\().8h, #0xfc,     lsl #8
+ .endm
+ 
++// SAND30 -> 8bit
++.macro USAND8 d0, d1, d2, a0, a1, a2, a3, t0, t1, t2
++                shrn            \d1\().4h,  \a0\().4s,  #12
++                shrn2           \d1\().8h,  \a1\().4s,  #12
++                uzp1            \d0\().8h,  \a0\().8h,  \a1\().8h
++                uzp2            \d2\().8h,  \a0\().8h,  \a1\().8h
++
++                shrn            \t1\().4h,  \a2\().4s,  #12
++                shrn2           \t1\().8h,  \a3\().4s,  #12
++                uzp1            \t0\().8h,  \a2\().8h,  \a3\().8h
++                uzp2            \t2\().8h,  \a2\().8h,  \a3\().8h
++
++                shrn            \d0\().8b,  \d0\().8h,  #2
++                shrn2           \d0\().16b, \t0\().8h,  #2
++                shrn            \d2\().8b,  \d2\().8h,  #6
++                shrn2           \d2\().16b, \t2\().8h,  #6
++                uzp1            \d1\().16b, \d1\().16b, \t1\().16b
++.endm
++
++
+ // void ff_rpi_sand30_lines_to_planar_c16(
+ //   uint8_t * dst_u,            // [x0]
+ //   unsigned int dst_stride_u,  // [w1]
+@@ -322,6 +341,7 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1
+ 1:
+                 ld1             {v0.4s-v3.4s}, [x5], #64
+                 ld1             {v4.4s-v7.4s}, [x5], x6
++                subs            w9,  w9,  #48
+ 
+                 USAND10         v17, v16, v18, v0, v1
+                 USAND10         v20, v19, v21, v2, v3
+@@ -330,7 +350,6 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1
+                 USAND10         v26, v25, v27, v6, v7
+                 UZPH_C          v4, v5, v6, v22, v23, v24, v25, v26, v27
+ 
+-                subs            w9,  w9,  #48
+                 blt             2f
+ 
+                 st3             {v0.8h-v2.8h},   [x0], #48
+@@ -457,61 +476,10 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1
+ 
+                 subs            w5,  w5,  #96
+ 
+-                // v0, v1
+-
+-                shrn            v18.4h,  v0.4s,   #14
+-                xtn             v16.4h,  v0.4s
+-                shrn            v17.4h,  v0.4s,   #10
+-
+-                shrn2           v18.8h,  v1.4s,   #14
+-                xtn2            v16.8h,  v1.4s
+-                shrn2           v17.8h,  v1.4s,   #10
+-
+-                ushr            v18.8h,  v18.8h,  #6
+-                bic             v16.8h,  #0xfc,   lsl #8
+-                bic             v17.8h,  #0xfc,   lsl #8
+-
+-                // v2, v3
+-
+-                shrn            v21.4h,  v2.4s,   #14
+-                xtn             v19.4h,  v2.4s
+-                shrn            v20.4h,  v2.4s,   #10
+-
+-                shrn2           v21.8h,  v3.4s,   #14
+-                xtn2            v19.8h,  v3.4s
+-                shrn2           v20.8h,  v3.4s,   #10
+-
+-                ushr            v21.8h,  v21.8h,  #6
+-                bic             v19.8h,  #0xfc,   lsl #8
+-                bic             v20.8h,  #0xfc,   lsl #8
+-
+-                // v4, v5
+-
+-                shrn            v24.4h,  v4.4s,   #14
+-                xtn             v22.4h,  v4.4s
+-                shrn            v23.4h,  v4.4s,   #10
+-
+-                shrn2           v24.8h,  v5.4s,   #14
+-                xtn2            v22.8h,  v5.4s
+-                shrn2           v23.8h,  v5.4s,   #10
+-
+-                ushr            v24.8h,  v24.8h,  #6
+-                bic             v22.8h,  #0xfc,   lsl #8
+-                bic             v23.8h,  #0xfc,   lsl #8
+-
+-                // v6, v7
+-
+-                shrn            v27.4h,  v6.4s,   #14
+-                xtn             v25.4h,  v6.4s
+-                shrn            v26.4h,  v6.4s,   #10
+-
+-                shrn2           v27.8h,  v7.4s,   #14
+-                xtn2            v25.8h,  v7.4s
+-                shrn2           v26.8h,  v7.4s,   #10
+-
+-                ushr            v27.8h,  v27.8h,  #6
+-                bic             v25.8h,  #0xfc,   lsl #8
+-                bic             v26.8h,  #0xfc,   lsl #8
++                USAND10         v16, v17, v18, v0, v1
++                USAND10         v19, v20, v21, v2, v3
++                USAND10         v22, v23, v24, v4, v5
++                USAND10         v25, v26, v27, v6, v7
+ 
+                 blt             2f
+ 
+@@ -624,60 +592,8 @@ function ff_rpi_sand30_lines_to_planar_y8, export=1
+                 subs            w5,  w5,  #96
+ 
+                 // v0, v1
+-
+-                shrn            v18.4h,  v0.4s,   #16
+-                xtn             v16.4h,  v0.4s
+-                shrn            v17.4h,  v0.4s,   #12
+-
+-                shrn2           v18.8h,  v1.4s,   #16
+-                xtn2            v16.8h,  v1.4s
+-                shrn2           v17.8h,  v1.4s,   #12
+-
+-                shrn            v18.8b,  v18.8h,  #6
+-                shrn            v16.8b,  v16.8h,  #2
+-                xtn             v17.8b,  v17.8h
+-
+-                // v2, v3
+-
+-                shrn            v21.4h,  v2.4s,   #16
+-                xtn             v19.4h,  v2.4s
+-                shrn            v20.4h,  v2.4s,   #12
+-
+-                shrn2           v21.8h,  v3.4s,   #16
+-                xtn2            v19.8h,  v3.4s
+-                shrn2           v20.8h,  v3.4s,   #12
+-
+-                shrn2           v18.16b, v21.8h,  #6
+-                shrn2           v16.16b, v19.8h,  #2
+-                xtn2            v17.16b, v20.8h
+-
+-                // v4, v5
+-
+-                shrn            v24.4h,  v4.4s,   #16
+-                xtn             v22.4h,  v4.4s
+-                shrn            v23.4h,  v4.4s,   #12
+-
+-                shrn2           v24.8h,  v5.4s,   #16
+-                xtn2            v22.8h,  v5.4s
+-                shrn2           v23.8h,  v5.4s,   #12
+-
+-                shrn            v21.8b,  v24.8h,  #6
+-                shrn            v19.8b,  v22.8h,  #2
+-                xtn             v20.8b,  v23.8h
+-
+-                // v6, v7
+-
+-                shrn            v27.4h,  v6.4s,   #16
+-                xtn             v25.4h,  v6.4s
+-                shrn            v26.4h,  v6.4s,   #12
+-
+-                shrn2           v27.8h,  v7.4s,   #16
+-                xtn2            v25.8h,  v7.4s
+-                shrn2           v26.8h,  v7.4s,   #12
+-
+-                shrn2           v21.16b, v27.8h,  #6
+-                shrn2           v19.16b, v25.8h,  #2
+-                xtn2            v20.16b, v26.8h
++                USAND8          v16, v17, v18, v0, v1, v2, v3, v22, v23, v24
++                USAND8          v19, v20, v21, v4, v5, v6, v7, v22, v23, v24
+ 
+                 blt             2f
+ 
+
+From 3abb0dcc453aba0a069bc1a8f26ba77913c5ef2b Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Sat, 1 Jul 2023 18:43:32 +0000
+Subject: [PATCH 140/151] sand_fns: Add test for neon to sand30 fns so they can
+ be tested by checkasm
+
+---
+ libavutil/rpi_sand_fns.c | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
+index b6071e2928..0626bb06cb 100644
+--- a/libavutil/rpi_sand_fns.c
++++ b/libavutil/rpi_sand_fns.c
+@@ -35,10 +35,12 @@ Authors: John Cox
+ #include "frame.h"
+ 
+ #if ARCH_ARM && HAVE_NEON
+-#include "arm/rpi_sand_neon.h"
++#include "libavutil/arm/cpu.h"
++#include "libavutil/arm/rpi_sand_neon.h"
+ #define HAVE_SAND_ASM 1
+ #elif ARCH_AARCH64 && HAVE_NEON
+-#include "aarch64/rpi_sand_neon.h"
++#include "libavutil/aarch64/cpu.h"
++#include "libavutil/aarch64/rpi_sand_neon.h"
+ #define HAVE_SAND_ASM 1
+ #else
+ #define HAVE_SAND_ASM 0
+@@ -97,7 +99,7 @@ void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
+     const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
+ 
+ #if HAVE_SAND_ASM
+-    if (_x == 0) {
++    if (_x == 0 && have_neon(av_get_cpu_flags())) {
+         ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
+         return;
+     }
+@@ -163,7 +165,7 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_
+     const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
+ 
+ #if HAVE_SAND_ASM
+-    if (_x == 0) {
++    if (_x == 0 && have_neon(av_get_cpu_flags())) {
+         ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v,
+                                        src, stride1, stride2, _x, y, _w, h);
+         return;
+
+From fb72aa34ec2c42fc595bb1a6c32b599da870fa2b Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Sat, 1 Jul 2023 18:43:57 +0000
+Subject: [PATCH 141/151] checkasm: Add tests for rpi_sand sand30 fns
+
+Something of a kludge for function selection as, at the moment, the
+rpi_sand fns don't have a jump table that we could use for selection.
+---
+ tests/checkasm/Makefile   |   3 +-
+ tests/checkasm/checkasm.c |   3 +
+ tests/checkasm/checkasm.h |   1 +
+ tests/checkasm/rpi_sand.c | 118 ++++++++++++++++++++++++++++++++++++++
+ tests/fate/checkasm.mak   |   1 +
+ 5 files changed, 125 insertions(+), 1 deletion(-)
+ create mode 100644 tests/checkasm/rpi_sand.c
+
+diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
+index a6f06c7007..66291baf33 100644
+--- a/tests/checkasm/Makefile
++++ b/tests/checkasm/Makefile
+@@ -59,8 +59,9 @@ CHECKASMOBJS-$(CONFIG_SWSCALE)  += $(SWSCALEOBJS)
+ AVUTILOBJS                              += av_tx.o
+ AVUTILOBJS                              += fixed_dsp.o
+ AVUTILOBJS                              += float_dsp.o
++AVUTILOBJS-$(CONFIG_SAND)               += rpi_sand.o
+ 
+-CHECKASMOBJS-$(CONFIG_AVUTIL)  += $(AVUTILOBJS)
++CHECKASMOBJS-$(CONFIG_AVUTIL)  += $(AVUTILOBJS) $(AVUTILOBJS-yes)
+ 
+ CHECKASMOBJS-$(ARCH_AARCH64)            += aarch64/checkasm.o
+ CHECKASMOBJS-$(HAVE_ARMV5TE_EXTERNAL)   += arm/checkasm.o
+diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
+index e96d84a7da..57e0091b80 100644
+--- a/tests/checkasm/checkasm.c
++++ b/tests/checkasm/checkasm.c
+@@ -210,6 +210,9 @@ static const struct {
+         { "fixed_dsp", checkasm_check_fixed_dsp },
+         { "float_dsp", checkasm_check_float_dsp },
+         { "av_tx",     checkasm_check_av_tx },
++    #if CONFIG_SAND
++        { "rpi_sand",  checkasm_check_rpi_sand },
++    #endif
+ #endif
+     { NULL }
+ };
+diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
+index 8744a81218..f4a0d20358 100644
+--- a/tests/checkasm/checkasm.h
++++ b/tests/checkasm/checkasm.h
+@@ -73,6 +73,7 @@ void checkasm_check_motion(void);
+ void checkasm_check_nlmeans(void);
+ void checkasm_check_opusdsp(void);
+ void checkasm_check_pixblockdsp(void);
++void checkasm_check_rpi_sand(void);
+ void checkasm_check_sbrdsp(void);
+ void checkasm_check_synth_filter(void);
+ void checkasm_check_sw_gbrp(void);
+diff --git a/tests/checkasm/rpi_sand.c b/tests/checkasm/rpi_sand.c
+new file mode 100644
+index 0000000000..0888714c4c
+--- /dev/null
++++ b/tests/checkasm/rpi_sand.c
+@@ -0,0 +1,118 @@
++/*
++ * Copyright (c) 2023 John Cox
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
++ */
++
++#include <string.h>
++#include "checkasm.h"
++#include "libavutil/common.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#if ARCH_ARM
++#include "libavutil/arm/cpu.h"
++#include "libavutil/arm/rpi_sand_neon.h"
++#elif ARCH_AARCH64
++#include "libavutil/aarch64/cpu.h"
++#include "libavutil/aarch64/rpi_sand_neon.h"
++#endif
++
++static inline uint32_t pack30(unsigned int a, unsigned int b, unsigned int c)
++{
++    return (a & 0x3ff) | ((b & 0x3ff) << 10) | ((c & 0x3ff) << 20);
++}
++
++void checkasm_check_rpi_sand(void)
++{
++    const unsigned int w = 1280;
++    const unsigned int h = 66;
++    const unsigned int stride1 = 128;
++    const unsigned int stride2 = h*3/2;
++    const unsigned int ssize = ((w+95)/96)*128*h*3/2;
++    const unsigned int ysize = ((w + 32) * (h + 32) * 2);
++
++    uint8_t * sbuf0 = malloc(ssize);
++    uint8_t * sbuf1 = malloc(ssize);
++    uint8_t * ybuf0 = malloc(ysize);
++    uint8_t * ybuf1 = malloc(ysize);
++    uint8_t * vbuf0 = malloc(ysize);
++    uint8_t * vbuf1 = malloc(ysize);
++    uint8_t * yframe0 = (w + 32) * 16 + ybuf0;
++    uint8_t * yframe1 = (w + 32) * 16 + ybuf1;
++    uint8_t * vframe0 = (w + 32) * 16 + vbuf0;
++    uint8_t * vframe1 = (w + 32) * 16 + vbuf1;
++    unsigned int i;
++
++    for (i = 0; i != ssize; i += 4)
++        *(uint32_t*)(sbuf0 + i) = rnd();
++    memcpy(sbuf1, sbuf0, ssize);
++
++    if (check_func(have_neon(av_get_cpu_flags()) ? ff_rpi_sand30_lines_to_planar_y16 : av_rpi_sand30_to_planar_y16, "rpi_sand30_to_planar_y16")) {
++        declare_func(void, uint8_t * dst, const unsigned int dst_stride,
++                     const uint8_t * src,
++                     unsigned int stride1, unsigned int stride2,
++                     unsigned int _x, unsigned int y,
++                     unsigned int _w, unsigned int h);
++
++        memset(ybuf0, 0xbb, ysize);
++        memset(ybuf1, 0xbb, ysize);
++
++        call_ref(yframe0, (w + 32) * 2, sbuf0, stride1, stride2, 0, 0, w, h);
++        call_new(yframe1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h);
++
++        if (memcmp(sbuf0, sbuf1, ssize)
++            || memcmp(ybuf0, ybuf1, ysize))
++            fail();
++
++        bench_new(ybuf1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h);
++    }
++
++    if (check_func(have_neon(av_get_cpu_flags()) ? ff_rpi_sand30_lines_to_planar_c16 : av_rpi_sand30_to_planar_c16, "rpi_sand30_to_planar_c16")) {
++        declare_func(void, uint8_t * u_dst, const unsigned int u_stride,
++                     uint8_t * v_dst, const unsigned int v_stride,
++                     const uint8_t * src,
++                     unsigned int stride1, unsigned int stride2,
++                     unsigned int _x, unsigned int y,
++                     unsigned int _w, unsigned int h);
++
++        memset(ybuf0, 0xbb, ysize);
++        memset(ybuf1, 0xbb, ysize);
++        memset(vbuf0, 0xbb, ysize);
++        memset(vbuf1, 0xbb, ysize);
++
++        call_ref(yframe0, (w + 32), vframe0, (w + 32), sbuf0, stride1, stride2, 0, 0, w/2, h/2);
++        call_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2);
++
++        if (memcmp(sbuf0, sbuf1, ssize)
++            || memcmp(ybuf0, ybuf1, ysize)
++            || memcmp(vbuf0, vbuf1, ysize))
++            fail();
++
++        bench_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2);
++    }
++
++
++    report("sand30");
++
++    free(sbuf0);
++    free(sbuf1);
++    free(ybuf0);
++    free(ybuf1);
++    free(vbuf0);
++    free(vbuf1);
++}
++
+diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
+index a4e95541f5..6fda6d227e 100644
+--- a/tests/fate/checkasm.mak
++++ b/tests/fate/checkasm.mak
+@@ -27,6 +27,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp                                  \
+                 fate-checkasm-motion                                    \
+                 fate-checkasm-opusdsp                                   \
+                 fate-checkasm-pixblockdsp                               \
++                fate-checkasm-rpi_sand                                  \
+                 fate-checkasm-sbrdsp                                    \
+                 fate-checkasm-synth_filter                              \
+                 fate-checkasm-sw_gbrp                                   \
+
+From d798c7b90dd63bca6f9878b1fb30ec1d8f0b9a5e Mon Sep 17 00:00:00 2001
+From: James Darnley <jdarnley@obe.tv>
+Date: Mon, 20 Feb 2023 20:55:08 +0100
+Subject: [PATCH 142/151] avfilter/bwdif: move filter_line init to a dedicated
+ function
+
+(cherry picked from commit b503b5a0cf80f38ecf4737c012b621b7e94f242a)
+---
+ libavfilter/bwdif.h             |  3 ++-
+ libavfilter/vf_bwdif.c          | 13 +++++++++----
+ libavfilter/x86/vf_bwdif_init.c |  4 +---
+ 3 files changed, 12 insertions(+), 8 deletions(-)
+
+diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
+index 889ff772ed..5749345f78 100644
+--- a/libavfilter/bwdif.h
++++ b/libavfilter/bwdif.h
+@@ -37,6 +37,7 @@ typedef struct BWDIFContext {
+                         int parity, int clip_max, int spat);
+ } BWDIFContext;
+ 
+-void ff_bwdif_init_x86(BWDIFContext *bwdif);
++void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
++void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth);
+ 
+ #endif /* AVFILTER_BWDIF_H */
+diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
+index 09e68523bb..539fabbd46 100644
+--- a/libavfilter/vf_bwdif.c
++++ b/libavfilter/vf_bwdif.c
+@@ -341,7 +341,14 @@ static int config_props(AVFilterLink *link)
+ 
+     yadif->csp = av_pix_fmt_desc_get(link->format);
+     yadif->filter = filter;
+-    if (yadif->csp->comp[0].depth > 8) {
++    ff_bwdif_init_filter_line(s, yadif->csp->comp[0].depth);
++
++    return 0;
++}
++
++av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
++{
++    if (bit_depth > 8) {
+         s->filter_intra = filter_intra_16bit;
+         s->filter_line  = filter_line_c_16bit;
+         s->filter_edge  = filter_edge_16bit;
+@@ -352,10 +359,8 @@ static int config_props(AVFilterLink *link)
+     }
+ 
+ #if ARCH_X86
+-    ff_bwdif_init_x86(s);
++    ff_bwdif_init_x86(s, bit_depth);
+ #endif
+-
+-    return 0;
+ }
+ 
+ 
+diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c
+index e24e5cd9b1..ba7bc40c3d 100644
+--- a/libavfilter/x86/vf_bwdif_init.c
++++ b/libavfilter/x86/vf_bwdif_init.c
+@@ -42,11 +42,9 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne
+                                       int mrefs2, int prefs3, int mrefs3, int prefs4,
+                                       int mrefs4, int parity, int clip_max);
+ 
+-av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif)
++av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
+ {
+-    YADIFContext *yadif = &bwdif->yadif;
+     int cpu_flags = av_get_cpu_flags();
+-    int bit_depth = (!yadif->csp) ? 8 : yadif->csp->comp[0].depth;
+ 
+     if (bit_depth <= 8) {
+         if (EXTERNAL_SSE2(cpu_flags))
+
+From 0eb9c627c07931cf93c4932e07e0df6c0ce860fd Mon Sep 17 00:00:00 2001
+From: James Darnley <jdarnley@obe.tv>
+Date: Mon, 20 Feb 2023 20:55:08 +0100
+Subject: [PATCH 143/151] checkasm: add test for bwdif
+
+(cherry picked from commit 087faf8cac51e5e20a5f41b36b8d4c2705a10039)
+---
+ tests/checkasm/Makefile   |  1 +
+ tests/checkasm/checkasm.c |  3 ++
+ tests/checkasm/checkasm.h |  1 +
+ tests/checkasm/vf_bwdif.c | 84 +++++++++++++++++++++++++++++++++++++++
+ tests/fate/checkasm.mak   |  1 +
+ 5 files changed, 90 insertions(+)
+ create mode 100644 tests/checkasm/vf_bwdif.c
+
+diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
+index 66291baf33..2c80d8e661 100644
+--- a/tests/checkasm/Makefile
++++ b/tests/checkasm/Makefile
+@@ -40,6 +40,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC)          += $(AVCODECOBJS-yes)
+ # libavfilter tests
+ AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o
+ AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o
++AVFILTEROBJS-$(CONFIG_BWDIF_FILTER)      += vf_bwdif.o
+ AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
+ AVFILTEROBJS-$(CONFIG_EQ_FILTER)         += vf_eq.o
+ AVFILTEROBJS-$(CONFIG_GBLUR_FILTER)      += vf_gblur.o
+diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
+index 57e0091b80..4f983d7fbc 100644
+--- a/tests/checkasm/checkasm.c
++++ b/tests/checkasm/checkasm.c
+@@ -179,6 +179,9 @@ static const struct {
+     #if CONFIG_BLEND_FILTER
+         { "vf_blend", checkasm_check_blend },
+     #endif
++    #if CONFIG_BWDIF_FILTER
++        { "vf_bwdif", checkasm_check_vf_bwdif },
++    #endif
+     #if CONFIG_COLORSPACE_FILTER
+         { "vf_colorspace", checkasm_check_colorspace },
+     #endif
+diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
+index f4a0d20358..d69bc43999 100644
+--- a/tests/checkasm/checkasm.h
++++ b/tests/checkasm/checkasm.h
+@@ -83,6 +83,7 @@ void checkasm_check_utvideodsp(void);
+ void checkasm_check_v210dec(void);
+ void checkasm_check_v210enc(void);
+ void checkasm_check_vc1dsp(void);
++void checkasm_check_vf_bwdif(void);
+ void checkasm_check_vf_eq(void);
+ void checkasm_check_vf_gblur(void);
+ void checkasm_check_vf_hflip(void);
+diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c
+new file mode 100644
+index 0000000000..46224bb575
+--- /dev/null
++++ b/tests/checkasm/vf_bwdif.c
+@@ -0,0 +1,84 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
++ */
++
++#include <string.h>
++#include "checkasm.h"
++#include "libavcodec/internal.h"
++#include "libavfilter/bwdif.h"
++
++#define WIDTH 256
++
++#define randomize_buffers(buf0, buf1, mask, count) \
++    for (size_t i = 0; i < count; i++) \
++        buf0[i] = buf1[i] = rnd() & mask
++
++#define BODY(type, depth)                                                      \
++    do {                                                                       \
++        type prev0[9*WIDTH], prev1[9*WIDTH];                                   \
++        type next0[9*WIDTH], next1[9*WIDTH];                                   \
++        type cur0[9*WIDTH], cur1[9*WIDTH];                                     \
++        type dst0[WIDTH], dst1[WIDTH];                                         \
++        const int stride = WIDTH;                                              \
++        const int mask = (1<<depth)-1;                                         \
++                                                                               \
++        declare_func(void, void *dst, void *prev, void *cur, void *next,       \
++                        int w, int prefs, int mrefs, int prefs2, int mrefs2,   \
++                        int prefs3, int mrefs3, int prefs4, int mrefs4,        \
++                        int parity, int clip_max);                             \
++                                                                               \
++        randomize_buffers(prev0, prev1, mask, 9*WIDTH);                        \
++        randomize_buffers(next0, next1, mask, 9*WIDTH);                        \
++        randomize_buffers( cur0,  cur1, mask, 9*WIDTH);                        \
++                                                                               \
++        call_ref(dst0, prev0 + 4*WIDTH, cur0 + 4*WIDTH, next0 + 4*WIDTH,       \
++                WIDTH, stride, -stride, 2*stride, -2*stride,                   \
++                3*stride, -3*stride, 4*stride, -4*stride,                      \
++                0, mask);                                                      \
++        call_new(dst1, prev1 + 4*WIDTH, cur1 + 4*WIDTH, next1 + 4*WIDTH,       \
++                WIDTH, stride, -stride, 2*stride, -2*stride,                   \
++                3*stride, -3*stride, 4*stride, -4*stride,                      \
++                0, mask);                                                      \
++                                                                               \
++        if (memcmp(dst0, dst1, sizeof dst0)                                    \
++                || memcmp(prev0, prev1, sizeof prev0)                          \
++                || memcmp(next0, next1, sizeof next0)                          \
++                || memcmp( cur0,  cur1, sizeof cur0))                          \
++            fail();                                                            \
++        bench_new(dst1, prev1 + 4*WIDTH, cur1 + 4*WIDTH, next1 + 4*WIDTH,      \
++                WIDTH, stride, -stride, 2*stride, -2*stride,                   \
++                3*stride, -3*stride, 4*stride, -4*stride,                      \
++                0, mask);                                                      \
++    } while (0)
++
++void checkasm_check_vf_bwdif(void)
++{
++    BWDIFContext ctx_8, ctx_10;
++
++    ff_bwdif_init_filter_line(&ctx_8, 8);
++    ff_bwdif_init_filter_line(&ctx_10, 10);
++
++    if (check_func(ctx_8.filter_line, "bwdif8")) {
++        BODY(uint8_t, 8);
++        report("bwdif8");
++    }
++
++    if (check_func(ctx_10.filter_line, "bwdif10")) {
++        BODY(uint16_t, 10);
++        report("bwdif10");
++    }
++}
+diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
+index 6fda6d227e..1620ab0be0 100644
+--- a/tests/fate/checkasm.mak
++++ b/tests/fate/checkasm.mak
+@@ -38,6 +38,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp                                  \
+                 fate-checkasm-v210enc                                   \
+                 fate-checkasm-vc1dsp                                    \
+                 fate-checkasm-vf_blend                                  \
++                fate-checkasm-vf_bwdif                                  \
+                 fate-checkasm-vf_colorspace                             \
+                 fate-checkasm-vf_eq                                     \
+                 fate-checkasm-vf_gblur                                  \
+
+From c19ab5f6e7f8dd9dff5704510db98a81a1304f80 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Thu, 6 Jul 2023 13:56:18 +0000
+Subject: [PATCH 144/151] Revert "vf_bwdif: Add attributes to ask for
+ vectorization"
+
+This reverts commit 281250290ba5c2dcd8676e9a261050e65c10bcb7.
+Will be replaced by hand coded asm as on upstream
+---
+ libavfilter/vf_bwdif.c | 29 ++++++++++++++---------------
+ 1 file changed, 14 insertions(+), 15 deletions(-)
+
+diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
+index 539fabbd46..34e8c5e234 100644
+--- a/libavfilter/vf_bwdif.c
++++ b/libavfilter/vf_bwdif.c
+@@ -74,10 +74,10 @@ typedef struct ThreadData {
+         int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e)) >> 1; \
+         int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e)) >> 1; \
+         int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \
+- {/*\
++ \
+         if (!diff) { \
+             dst[0] = d; \
+-        } else {*/
++        } else {
+ 
+ #define SPAT_CHECK() \
+             int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c; \
+@@ -89,16 +89,15 @@ typedef struct ThreadData {
+             diff = FFMAX3(diff, min, -max);
+ 
+ #define FILTER_LINE() \
+-            int i1, i2; \
+             SPAT_CHECK() \
+-            /*if (FFABS(c - e) > temporal_diff0)*/ { \
+-                i1 = (((coef_hf[0] * (prev2[0] + next2[0]) \
++            if (FFABS(c - e) > temporal_diff0) { \
++                interpol = (((coef_hf[0] * (prev2[0] + next2[0]) \
+                     - coef_hf[1] * (prev2[mrefs2] + next2[mrefs2] + prev2[prefs2] + next2[prefs2]) \
+                     + coef_hf[2] * (prev2[mrefs4] + next2[mrefs4] + prev2[prefs4] + next2[prefs4])) >> 2) \
+                     + coef_lf[0] * (c + e) - coef_lf[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
+-            } /*else*/ { \
+-                i2 = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
+-            }interpol = FFABS(c - e) > temporal_diff0 ? i1:i2;\
++            } else { \
++                interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
++            }
+ 
+ #define FILTER_EDGE() \
+             if (spat) { \
+@@ -112,7 +111,7 @@ typedef struct ThreadData {
+             else if (interpol < d - diff) \
+                 interpol = d - diff; \
+  \
+-            dst[0] = !diff ? d : av_clip(interpol, 0, clip_max); \
++            dst[0] = av_clip(interpol, 0, clip_max); \
+         } \
+  \
+         dst++; \
+@@ -123,7 +122,7 @@ typedef struct ThreadData {
+         next2++; \
+     }
+ 
+-static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs,
++static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                          int prefs3, int mrefs3, int parity, int clip_max)
+ {
+     uint8_t *dst = dst1;
+@@ -133,7 +132,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restr
+     FILTER_INTRA()
+ }
+ 
+-static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
++static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+                           int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                           int prefs3, int mrefs3, int prefs4, int mrefs4,
+                           int parity, int clip_max)
+@@ -151,7 +150,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *rest
+     FILTER2()
+ }
+ 
+-static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
++static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
+                         int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                         int parity, int clip_max, int spat)
+ {
+@@ -168,7 +167,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restri
+     FILTER2()
+ }
+ 
+-static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs,
++static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                                int prefs3, int mrefs3, int parity, int clip_max)
+ {
+     uint16_t *dst = dst1;
+@@ -178,7 +177,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void
+     FILTER_INTRA()
+ }
+ 
+-static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
++static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1,
+                                 int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                                 int prefs3, int mrefs3, int prefs4, int mrefs4,
+                                 int parity, int clip_max)
+@@ -196,7 +195,7 @@ static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void
+     FILTER2()
+ }
+ 
+-static void __attribute__((optimize("tree-vectorize"))) filter_edge_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
++static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1,
+                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                               int parity, int clip_max, int spat)
+ {
+
+From 093eddd9ef66a7db9e637f3acfe51d950c87f613 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Tue, 4 Jul 2023 14:04:39 +0000
+Subject: [PATCH 145/151] tests/checkasm: Add test for vf_bwdif filter_intra
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Signed-off-by: John Cox <jc@kynesim.co.uk>
+Signed-off-by: Martin Storsjö <martin@martin.st>
+(cherry picked from commit 7caa8d6b91e738ad2c1ea61746b6c062c470f7d3)
+---
+ tests/checkasm/vf_bwdif.c | 37 +++++++++++++++++++++++++++++++++++++
+ 1 file changed, 37 insertions(+)
+
+diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c
+index 46224bb575..034bbabb4c 100644
+--- a/tests/checkasm/vf_bwdif.c
++++ b/tests/checkasm/vf_bwdif.c
+@@ -20,6 +20,7 @@
+ #include "checkasm.h"
+ #include "libavcodec/internal.h"
+ #include "libavfilter/bwdif.h"
++#include "libavutil/mem_internal.h"
+ 
+ #define WIDTH 256
+ 
+@@ -81,4 +82,40 @@ void checkasm_check_vf_bwdif(void)
+         BODY(uint16_t, 10);
+         report("bwdif10");
+     }
++
++    if (check_func(ctx_8.filter_intra, "bwdif8.intra")) {
++        LOCAL_ALIGNED_16(uint8_t, cur0,  [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, cur1,  [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, dst0,  [WIDTH*3]);
++        LOCAL_ALIGNED_16(uint8_t, dst1,  [WIDTH*3]);
++        const int stride = WIDTH;
++        const int mask = (1<<8)-1;
++
++        declare_func(void, void *dst1, void *cur1, int w, int prefs, int mrefs,
++                     int prefs3, int mrefs3, int parity, int clip_max);
++
++        randomize_buffers( cur0,  cur1, mask, 11*WIDTH);
++        memset(dst0, 0xba, WIDTH * 3);
++        memset(dst1, 0xba, WIDTH * 3);
++
++        call_ref(dst0 + stride,
++                 cur0 + stride * 4, WIDTH,
++                 stride, -stride, stride * 3, -stride * 3,
++                 0, mask);
++        call_new(dst1 + stride,
++                 cur0 + stride * 4, WIDTH,
++                 stride, -stride, stride * 3, -stride * 3,
++                 0, mask);
++
++        if (memcmp(dst0, dst1, WIDTH*3)
++                || memcmp( cur0,  cur1, WIDTH*11))
++            fail();
++
++        bench_new(dst1 + stride,
++                  cur0 + stride * 4, WIDTH,
++                  stride, -stride, stride * 3, -stride * 3,
++                  0, mask);
++
++        report("bwdif8.intra");
++    }
+ }
+
+From 28ef7402381b6fe241f81e21f302a23f8af674bf Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Tue, 4 Jul 2023 14:04:40 +0000
+Subject: [PATCH 146/151] avfilter/vf_bwdif: Add neon for filter_intra
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Adds an outline for aarch64 neon functions
+Adds common macros and consts for aarch64 neon
+Exports C filter_intra needed for tail fixup of neon code
+Adds neon for filter_intra
+
+Signed-off-by: John Cox <jc@kynesim.co.uk>
+Signed-off-by: Martin Storsjö <martin@martin.st>
+(cherry picked from commit 5075cfb4e6a21f6b4da9e62bdb0bad4cb32a4673)
+---
+ libavfilter/aarch64/Makefile                |   2 +
+ libavfilter/aarch64/vf_bwdif_init_aarch64.c |  56 ++++++++
+ libavfilter/aarch64/vf_bwdif_neon.S         | 136 ++++++++++++++++++++
+ libavfilter/bwdif.h                         |   4 +
+ libavfilter/vf_bwdif.c                      |   8 +-
+ 5 files changed, 203 insertions(+), 3 deletions(-)
+ create mode 100644 libavfilter/aarch64/vf_bwdif_init_aarch64.c
+ create mode 100644 libavfilter/aarch64/vf_bwdif_neon.S
+
+diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
+index b58daa3a3f..b68209bc94 100644
+--- a/libavfilter/aarch64/Makefile
++++ b/libavfilter/aarch64/Makefile
+@@ -1,3 +1,5 @@
++OBJS-$(CONFIG_BWDIF_FILTER)                  += aarch64/vf_bwdif_init_aarch64.o
+ OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
+ 
++NEON-OBJS-$(CONFIG_BWDIF_FILTER)             += aarch64/vf_bwdif_neon.o
+ NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
+diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+new file mode 100644
+index 0000000000..3ffaa07ab3
+--- /dev/null
++++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+@@ -0,0 +1,56 @@
++/*
++ * bwdif aarch64 NEON optimisations
++ *
++ * Copyright (c) 2023 John Cox <jc@kynesim.co.uk>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/common.h"
++#include "libavfilter/bwdif.h"
++#include "libavutil/aarch64/cpu.h"
++
++void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
++                                int prefs3, int mrefs3, int parity, int clip_max);
++
++
++static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
++                                int prefs3, int mrefs3, int parity, int clip_max)
++{
++    const int w0 = clip_max != 255 ? 0 : w & ~15;
++
++    ff_bwdif_filter_intra_neon(dst1, cur1, w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
++
++    if (w0 < w)
++        ff_bwdif_filter_intra_c((char *)dst1 + w0, (char *)cur1 + w0,
++                                w - w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
++}
++
++void
++ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
++{
++    const int cpu_flags = av_get_cpu_flags();
++
++    if (bit_depth != 8)
++        return;
++
++    if (!have_neon(cpu_flags))
++        return;
++
++    s->filter_intra = filter_intra_helper;
++}
++
+diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
+new file mode 100644
+index 0000000000..e288efbe6c
+--- /dev/null
++++ b/libavfilter/aarch64/vf_bwdif_neon.S
+@@ -0,0 +1,136 @@
++/*
++ * bwdif aarch64 NEON optimisations
++ *
++ * Copyright (c) 2023 John Cox <jc@kynesim.co.uk>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++#include "libavutil/aarch64/asm.S"
++
++// Space taken on the stack by an int (32-bit)
++#ifdef __APPLE__
++.set    SP_INT, 4
++#else
++.set    SP_INT, 8
++#endif
++
++.macro SQSHRUNN b, s0, s1, s2, s3, n
++        sqshrun         \s0\().4h, \s0\().4s, #\n - 8
++        sqshrun2        \s0\().8h, \s1\().4s, #\n - 8
++        sqshrun         \s1\().4h, \s2\().4s, #\n - 8
++        sqshrun2        \s1\().8h, \s3\().4s, #\n - 8
++        uzp2            \b\().16b, \s0\().16b, \s1\().16b
++.endm
++
++.macro SMULL4K a0, a1, a2, a3, s0, s1, k
++        smull           \a0\().4s, \s0\().4h, \k
++        smull2          \a1\().4s, \s0\().8h, \k
++        smull           \a2\().4s, \s1\().4h, \k
++        smull2          \a3\().4s, \s1\().8h, \k
++.endm
++
++.macro UMULL4K a0, a1, a2, a3, s0, s1, k
++        umull           \a0\().4s, \s0\().4h, \k
++        umull2          \a1\().4s, \s0\().8h, \k
++        umull           \a2\().4s, \s1\().4h, \k
++        umull2          \a3\().4s, \s1\().8h, \k
++.endm
++
++.macro UMLAL4K a0, a1, a2, a3, s0, s1, k
++        umlal           \a0\().4s, \s0\().4h, \k
++        umlal2          \a1\().4s, \s0\().8h, \k
++        umlal           \a2\().4s, \s1\().4h, \k
++        umlal2          \a3\().4s, \s1\().8h, \k
++.endm
++
++.macro UMLSL4K a0, a1, a2, a3, s0, s1, k
++        umlsl           \a0\().4s, \s0\().4h, \k
++        umlsl2          \a1\().4s, \s0\().8h, \k
++        umlsl           \a2\().4s, \s1\().4h, \k
++        umlsl2          \a3\().4s, \s1\().8h, \k
++.endm
++
++.macro LDR_COEFFS d, t0
++        movrel          \t0, coeffs, 0
++        ld1             {\d\().8h}, [\t0]
++.endm
++
++// static const uint16_t coef_lf[2] = { 4309, 213 };
++// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
++// static const uint16_t coef_sp[2] = { 5077, 981 };
++
++const coeffs, align=4   // align 4 means align on 2^4 boundry
++        .hword          4309 * 4, 213 * 4               // lf[0]*4 = v0.h[0]
++        .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
++        .hword          5077, 981                       // sp[0] = v0.h[6]
++endconst
++
++// ============================================================================
++//
++// void ff_bwdif_filter_intra_neon(
++//      void *dst1,     // x0
++//      void *cur1,     // x1
++//      int w,          // w2
++//      int prefs,      // w3
++//      int mrefs,      // w4
++//      int prefs3,     // w5
++//      int mrefs3,     // w6
++//      int parity,     // w7       unused
++//      int clip_max)   // [sp, #0] unused
++
++function ff_bwdif_filter_intra_neon, export=1
++        cmp             w2, #0
++        ble             99f
++
++        LDR_COEFFS      v0, x17
++
++//    for (x = 0; x < w; x++) {
++10:
++
++//        interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13;
++        ldr             q31, [x1, w4, sxtw]
++        ldr             q30, [x1, w3, sxtw]
++        ldr             q29, [x1, w6, sxtw]
++        ldr             q28, [x1, w5, sxtw]
++
++        uaddl           v20.8h,  v31.8b,  v30.8b
++        uaddl2          v21.8h,  v31.16b, v30.16b
++
++        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[6]
++
++        uaddl           v20.8h,  v29.8b,  v28.8b
++        uaddl2          v21.8h,  v29.16b, v28.16b
++
++        UMLSL4K         v2, v3, v4, v5, v20, v21, v0.h[7]
++
++//        dst[0] = av_clip(interpol, 0, clip_max);
++        SQSHRUNN        v2, v2, v3, v4, v5, 13
++        str             q2, [x0], #16
++
++//        dst++;
++//        cur++;
++//    }
++
++        subs            w2,  w2,  #16
++        add             x1,  x1,  #16
++        bgt             10b
++
++99:
++        ret
++endfunc
+diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
+index 5749345f78..ae6f6ce223 100644
+--- a/libavfilter/bwdif.h
++++ b/libavfilter/bwdif.h
+@@ -39,5 +39,9 @@ typedef struct BWDIFContext {
+ 
+ void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
+ void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth);
++void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth);
++
++void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
++                             int prefs3, int mrefs3, int parity, int clip_max);
+ 
+ #endif /* AVFILTER_BWDIF_H */
+diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
+index 34e8c5e234..6ec8bbab5d 100644
+--- a/libavfilter/vf_bwdif.c
++++ b/libavfilter/vf_bwdif.c
+@@ -122,8 +122,8 @@ typedef struct ThreadData {
+         next2++; \
+     }
+ 
+-static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
+-                         int prefs3, int mrefs3, int parity, int clip_max)
++void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
++                             int prefs3, int mrefs3, int parity, int clip_max)
+ {
+     uint8_t *dst = dst1;
+     uint8_t *cur = cur1;
+@@ -352,13 +352,15 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
+         s->filter_line  = filter_line_c_16bit;
+         s->filter_edge  = filter_edge_16bit;
+     } else {
+-        s->filter_intra = filter_intra;
++        s->filter_intra = ff_bwdif_filter_intra_c;
+         s->filter_line  = filter_line_c;
+         s->filter_edge  = filter_edge;
+     }
+ 
+ #if ARCH_X86
+     ff_bwdif_init_x86(s, bit_depth);
++#elif ARCH_AARCH64
++    ff_bwdif_init_aarch64(s, bit_depth);
+ #endif
+ }
+ 
+
+From 2f8199a41cfd43595352899e722646052b0db2ee Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Tue, 4 Jul 2023 14:04:41 +0000
+Subject: [PATCH 147/151] tests/checkasm: Add test for vf_bwdif filter_edge
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Signed-off-by: John Cox <jc@kynesim.co.uk>
+Signed-off-by: Martin Storsjö <martin@martin.st>
+(cherry picked from commit 7ed7c00f55a50ac88589f9e17c172d4a4fce0581)
+---
+ tests/checkasm/vf_bwdif.c | 54 +++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 54 insertions(+)
+
+diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c
+index 034bbabb4c..5fdba09fdc 100644
+--- a/tests/checkasm/vf_bwdif.c
++++ b/tests/checkasm/vf_bwdif.c
+@@ -83,6 +83,60 @@ void checkasm_check_vf_bwdif(void)
+         report("bwdif10");
+     }
+ 
++    {
++        LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, cur0,  [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, cur1,  [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, dst0,  [WIDTH*3]);
++        LOCAL_ALIGNED_16(uint8_t, dst1,  [WIDTH*3]);
++        const int stride = WIDTH;
++        const int mask = (1<<8)-1;
++        int spat;
++        int parity;
++
++        for (spat = 0; spat != 2; ++spat) {
++            for (parity = 0; parity != 2; ++parity) {
++                if (check_func(ctx_8.filter_edge, "bwdif8.edge.s%d.p%d", spat, parity)) {
++
++                    declare_func(void, void *dst1, void *prev1, void *cur1, void *next1,
++                                            int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                                            int parity, int clip_max, int spat);
++
++                    randomize_buffers(prev0, prev1, mask, 11*WIDTH);
++                    randomize_buffers(next0, next1, mask, 11*WIDTH);
++                    randomize_buffers( cur0,  cur1, mask, 11*WIDTH);
++                    memset(dst0, 0xba, WIDTH * 3);
++                    memset(dst1, 0xba, WIDTH * 3);
++
++                    call_ref(dst0 + stride,
++                             prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, WIDTH,
++                             stride, -stride, stride * 2, -stride * 2,
++                             parity, mask, spat);
++                    call_new(dst1 + stride,
++                             prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH,
++                             stride, -stride, stride * 2, -stride * 2,
++                             parity, mask, spat);
++
++                    if (memcmp(dst0, dst1, WIDTH*3)
++                            || memcmp(prev0, prev1, WIDTH*11)
++                            || memcmp(next0, next1, WIDTH*11)
++                            || memcmp( cur0,  cur1, WIDTH*11))
++                        fail();
++
++                    bench_new(dst1 + stride,
++                             prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH,
++                             stride, -stride, stride * 2, -stride * 2,
++                             parity, mask, spat);
++                }
++            }
++        }
++
++        report("bwdif8.edge");
++    }
++
+     if (check_func(ctx_8.filter_intra, "bwdif8.intra")) {
+         LOCAL_ALIGNED_16(uint8_t, cur0,  [11*WIDTH]);
+         LOCAL_ALIGNED_16(uint8_t, cur1,  [11*WIDTH]);
+
+From 171d7f201503812617b8e320c83cc33120425923 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Tue, 4 Jul 2023 14:04:42 +0000
+Subject: [PATCH 148/151] avfilter/vf_bwdif: Add neon for filter_edge
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Adds clip and spatial macros for aarch64 neon
+Exports C filter_edge needed for tail fixup of neon code
+Adds neon for filter_edge
+
+Signed-off-by: John Cox <jc@kynesim.co.uk>
+Signed-off-by: Martin Storsjö <martin@martin.st>
+(cherry picked from commit 8130df83e0fbd3264fe990fb4e084ecbd452d0b1)
+---
+ libavfilter/aarch64/vf_bwdif_init_aarch64.c |  20 +++
+ libavfilter/aarch64/vf_bwdif_neon.S         | 177 ++++++++++++++++++++
+ libavfilter/bwdif.h                         |   4 +
+ libavfilter/vf_bwdif.c                      |   8 +-
+ 4 files changed, 205 insertions(+), 4 deletions(-)
+
+diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+index 3ffaa07ab3..e75cf2f204 100644
+--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
++++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+@@ -24,10 +24,29 @@
+ #include "libavfilter/bwdif.h"
+ #include "libavutil/aarch64/cpu.h"
+ 
++void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
++                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                               int parity, int clip_max, int spat);
++
+ void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                                 int prefs3, int mrefs3, int parity, int clip_max);
+ 
+ 
++static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
++                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                               int parity, int clip_max, int spat)
++{
++    const int w0 = clip_max != 255 ? 0 : w & ~15;
++
++    ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2,
++                              parity, clip_max, spat);
++
++    if (w0 < w)
++        ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
++                               w - w0, prefs, mrefs, prefs2, mrefs2,
++                               parity, clip_max, spat);
++}
++
+ static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                                 int prefs3, int mrefs3, int parity, int clip_max)
+ {
+@@ -52,5 +71,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
+         return;
+ 
+     s->filter_intra = filter_intra_helper;
++    s->filter_edge  = filter_edge_helper;
+ }
+ 
+diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
+index e288efbe6c..389302b813 100644
+--- a/libavfilter/aarch64/vf_bwdif_neon.S
++++ b/libavfilter/aarch64/vf_bwdif_neon.S
+@@ -66,6 +66,79 @@
+         umlsl2          \a3\().4s, \s1\().8h, \k
+ .endm
+ 
++//      int b = m2s1 - m1;
++//      int f = p2s1 - p1;
++//      int dc = c0s1 - m1;
++//      int de = c0s1 - p1;
++//      int sp_max = FFMIN(p1 - c0s1, m1 - c0s1);
++//      sp_max = FFMIN(sp_max, FFMAX(-b,-f));
++//      int sp_min = FFMIN(c0s1 - p1, c0s1 - m1);
++//      sp_min = FFMIN(sp_min, FFMAX(b,f));
++//      diff = diff == 0 ? 0 : FFMAX3(diff, sp_min, sp_max);
++.macro SPAT_CHECK diff, m2s1, m1, c0s1, p1, p2s1, t0, t1, t2, t3
++        uqsub           \t0\().16b, \p1\().16b, \c0s1\().16b
++        uqsub           \t2\().16b, \m1\().16b, \c0s1\().16b
++        umin            \t2\().16b, \t0\().16b, \t2\().16b
++
++        uqsub           \t1\().16b, \m1\().16b, \m2s1\().16b
++        uqsub           \t3\().16b, \p1\().16b, \p2s1\().16b
++        umax            \t3\().16b, \t3\().16b, \t1\().16b
++        umin            \t3\().16b, \t3\().16b, \t2\().16b
++
++        uqsub           \t0\().16b, \c0s1\().16b, \p1\().16b
++        uqsub           \t2\().16b, \c0s1\().16b, \m1\().16b
++        umin            \t2\().16b, \t0\().16b, \t2\().16b
++
++        uqsub           \t1\().16b, \m2s1\().16b, \m1\().16b
++        uqsub           \t0\().16b, \p2s1\().16b, \p1\().16b
++        umax            \t0\().16b, \t0\().16b, \t1\().16b
++        umin            \t2\().16b, \t2\().16b, \t0\().16b
++
++        cmeq            \t1\().16b, \diff\().16b, #0
++        umax            \diff\().16b, \diff\().16b, \t3\().16b
++        umax            \diff\().16b, \diff\().16b, \t2\().16b
++        bic             \diff\().16b, \diff\().16b, \t1\().16b
++.endm
++
++//      i0 = s0;
++//      if (i0 > d0 + diff0)
++//          i0 = d0 + diff0;
++//      else if (i0 < d0 - diff0)
++//          i0 = d0 - diff0;
++//
++// i0 = s0 is safe
++.macro DIFF_CLIP i0, s0, d0, diff, t0, t1
++        uqadd           \t0\().16b, \d0\().16b, \diff\().16b
++        uqsub           \t1\().16b, \d0\().16b, \diff\().16b
++        umin            \i0\().16b, \s0\().16b, \t0\().16b
++        umax            \i0\().16b, \i0\().16b, \t1\().16b
++.endm
++
++//      i0 = FFABS(m1 - p1) > td0 ? i1 : i2;
++//      DIFF_CLIP
++//
++// i0 = i1 is safe
++.macro INTERPOL i0, i1, i2, m1, d0, p1, td0, diff, t0, t1, t2
++        uabd            \t0\().16b, \m1\().16b, \p1\().16b
++        cmhi            \t0\().16b, \t0\().16b, \td0\().16b
++        bsl             \t0\().16b, \i1\().16b, \i2\().16b
++        DIFF_CLIP       \i0, \t0, \d0, \diff, \t1, \t2
++.endm
++
++.macro PUSH_VREGS
++        stp             d8,  d9,  [sp, #-64]!
++        stp             d10, d11, [sp, #16]
++        stp             d12, d13, [sp, #32]
++        stp             d14, d15, [sp, #48]
++.endm
++
++.macro POP_VREGS
++        ldp             d14, d15, [sp, #48]
++        ldp             d12, d13, [sp, #32]
++        ldp             d10, d11, [sp, #16]
++        ldp             d8,  d9,  [sp], #64
++.endm
++
+ .macro LDR_COEFFS d, t0
+         movrel          \t0, coeffs, 0
+         ld1             {\d\().8h}, [\t0]
+@@ -81,6 +154,110 @@ const coeffs, align=4   // align 4 means align on 2^4 boundry
+         .hword          5077, 981                       // sp[0] = v0.h[6]
+ endconst
+ 
++// ============================================================================
++//
++// void ff_bwdif_filter_edge_neon(
++//      void *dst1,     // x0
++//      void *prev1,    // x1
++//      void *cur1,     // x2
++//      void *next1,    // x3
++//      int w,          // w4
++//      int prefs,      // w5
++//      int mrefs,      // w6
++//      int prefs2,     // w7
++//      int mrefs2,     // [sp, #0]
++//      int parity,     // [sp, #SP_INT]
++//      int clip_max,   // [sp, #SP_INT*2]  unused
++//      int spat);      // [sp, #SP_INT*3]
++
++function ff_bwdif_filter_edge_neon, export=1
++        // Sanity check w
++        cmp             w4, #0
++        ble             99f
++
++// #define prev2 cur
++//     const uint8_t * restrict next2 = parity ? prev : next;
++
++        ldr             w8,  [sp, #0]                   // mrefs2
++
++        ldr             w17, [sp, #SP_INT]              // parity
++        ldr             w16, [sp, #SP_INT*3]            // spat
++        cmp             w17, #0
++        csel            x17, x1, x3, ne
++
++//     for (x = 0; x < w; x++) {
++
++10:
++//        int m1 = cur[mrefs];
++//        int d = (prev2[0] + next2[0]) >> 1;
++//        int p1 = cur[prefs];
++//        int temporal_diff0 = FFABS(prev2[0] - next2[0]);
++//        int temporal_diff1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
++//        int temporal_diff2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
++//        int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2);
++        ldr             q31, [x2]
++        ldr             q21, [x17]
++        uhadd           v16.16b, v31.16b, v21.16b       // d0 = v16
++        uabd            v17.16b, v31.16b, v21.16b       // td0 = v17
++        ldr             q24, [x2, w6, sxtw]             // m1 = v24
++        ldr             q22, [x2, w5, sxtw]             // p1 = v22
++
++        ldr             q0,  [x1, w6, sxtw]             // prev[mrefs]
++        ldr             q2,  [x1, w5, sxtw]             // prev[prefs]
++        ldr             q1,  [x3, w6, sxtw]             // next[mrefs]
++        ldr             q3,  [x3, w5, sxtw]             // next[prefs]
++
++        ushr            v29.16b, v17.16b, #1
++
++        uabd            v31.16b, v0.16b,  v24.16b
++        uabd            v30.16b, v2.16b,  v22.16b
++        uhadd           v0.16b,  v31.16b, v30.16b       // td1 = q0
++
++        uabd            v31.16b, v1.16b,  v24.16b
++        uabd            v30.16b, v3.16b,  v22.16b
++        uhadd           v1.16b,  v31.16b, v30.16b       // td2 = q1
++
++        umax            v0.16b,  v0.16b,  v29.16b
++        umax            v0.16b,  v0.16b,  v1.16b        // diff = v0
++
++//        if (spat) {
++//            SPAT_CHECK()
++//        }
++//        i0 = (m1 + p1) >> 1;
++        cbz             w16, 1f
++
++        ldr             q31, [x2,  w8, sxtw]
++        ldr             q18, [x17, w8, sxtw]
++        ldr             q30, [x2,  w7, sxtw]
++        ldr             q19, [x17, w7, sxtw]
++        uhadd           v18.16b, v18.16b, v31.16b
++        uhadd           v19.16b, v19.16b, v30.16b
++
++        SPAT_CHECK      v0, v18, v24, v16, v22, v19, v31, v30, v29, v28
++
++1:
++        uhadd           v2.16b,  v22.16b, v24.16b
++
++        // i0 = v2, s0 = v2, d0 = v16, diff = v0, t0 = v31, t1 = v30
++        DIFF_CLIP       v2, v2, v16, v0, v31, v30
++
++//        dst[0] = av_clip(interpol, 0, clip_max);
++        str             q2, [x0], #16
++
++//        dst++;
++//        cur++;
++//    }
++        subs            w4,  w4,  #16
++        add             x1,  x1,  #16
++        add             x2,  x2,  #16
++        add             x3,  x3,  #16
++        add             x17, x17, #16
++        bgt             10b
++
++99:
++        ret
++endfunc
++
+ // ============================================================================
+ //
+ // void ff_bwdif_filter_intra_neon(
+diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
+index ae6f6ce223..ae1616d366 100644
+--- a/libavfilter/bwdif.h
++++ b/libavfilter/bwdif.h
+@@ -41,6 +41,10 @@ void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
+ void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth);
+ void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth);
+ 
++void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
++                            int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                            int parity, int clip_max, int spat);
++
+ void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                              int prefs3, int mrefs3, int parity, int clip_max);
+ 
+diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
+index 6ec8bbab5d..688c2d2572 100644
+--- a/libavfilter/vf_bwdif.c
++++ b/libavfilter/vf_bwdif.c
+@@ -150,9 +150,9 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+     FILTER2()
+ }
+ 
+-static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
+-                        int w, int prefs, int mrefs, int prefs2, int mrefs2,
+-                        int parity, int clip_max, int spat)
++void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
++                            int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                            int parity, int clip_max, int spat)
+ {
+     uint8_t *dst   = dst1;
+     uint8_t *prev  = prev1;
+@@ -354,7 +354,7 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
+     } else {
+         s->filter_intra = ff_bwdif_filter_intra_c;
+         s->filter_line  = filter_line_c;
+-        s->filter_edge  = filter_edge;
++        s->filter_edge  = ff_bwdif_filter_edge_c;
+     }
+ 
+ #if ARCH_X86
+
+From abf6588935bce275ba302766bcd8c3bb7a523d3c Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Tue, 4 Jul 2023 14:04:43 +0000
+Subject: [PATCH 149/151] avfilter/vf_bwdif: Add neon for filter_line
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Exports C filter_line needed for tail fixup of neon code
+Adds neon for filter_line
+
+Signed-off-by: John Cox <jc@kynesim.co.uk>
+Signed-off-by: Martin Storsjö <martin@martin.st>
+(cherry picked from commit 94cb94a2c0910d364a7181fc5cc0e9556b777d0a)
+---
+ libavfilter/aarch64/vf_bwdif_init_aarch64.c |  21 ++
+ libavfilter/aarch64/vf_bwdif_neon.S         | 203 ++++++++++++++++++++
+ libavfilter/bwdif.h                         |   5 +
+ libavfilter/vf_bwdif.c                      |  10 +-
+ 4 files changed, 234 insertions(+), 5 deletions(-)
+
+diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+index e75cf2f204..21e67884ab 100644
+--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
++++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+@@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
+ void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                                 int prefs3, int mrefs3, int parity, int clip_max);
+ 
++void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
++                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                               int prefs3, int mrefs3, int prefs4, int mrefs4,
++                               int parity, int clip_max);
++
++
++static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
++                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                               int prefs3, int mrefs3, int prefs4, int mrefs4,
++                               int parity, int clip_max)
++{
++    const int w0 = clip_max != 255 ? 0 : w & ~15;
++
++    ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1,
++                              w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
++
++    if (w0 < w)
++        ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
++                               w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
++}
+ 
+ static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
+                                int w, int prefs, int mrefs, int prefs2, int mrefs2,
+@@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
+         return;
+ 
+     s->filter_intra = filter_intra_helper;
++    s->filter_line  = filter_line_helper;
+     s->filter_edge  = filter_edge_helper;
+ }
+ 
+diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
+index 389302b813..f185e94e3c 100644
+--- a/libavfilter/aarch64/vf_bwdif_neon.S
++++ b/libavfilter/aarch64/vf_bwdif_neon.S
+@@ -154,6 +154,209 @@ const coeffs, align=4   // align 4 means align on 2^4 boundry
+         .hword          5077, 981                       // sp[0] = v0.h[6]
+ endconst
+ 
++// ===========================================================================
++//
++// void filter_line(
++//      void *dst1,     // x0
++//      void *prev1,    // x1
++//      void *cur1,     // x2
++//      void *next1,    // x3
++//      int w,          // w4
++//      int prefs,      // w5
++//      int mrefs,      // w6
++//      int prefs2,     // w7
++//      int mrefs2,     // [sp, #0]
++//      int prefs3,     // [sp, #SP_INT]
++//      int mrefs3,     // [sp, #SP_INT*2]
++//      int prefs4,     // [sp, #SP_INT*3]
++//      int mrefs4,     // [sp, #SP_INT*4]
++//      int parity,     // [sp, #SP_INT*5]
++//      int clip_max)   // [sp, #SP_INT*6]
++
++function ff_bwdif_filter_line_neon, export=1
++        // Sanity check w
++        cmp             w4, #0
++        ble             99f
++
++        // Rearrange regs to be the same as line3 for ease of debug!
++        mov             w10, w4                         // w10 = loop count
++        mov             w9,  w6                         // w9  = mref
++        mov             w12, w7                         // w12 = pref2
++        mov             w11, w5                         // w11 = pref
++        ldr             w8,  [sp, #0]                   // w8 =  mref2
++        ldr             w7,  [sp, #SP_INT*2]            // w7  = mref3
++        ldr             w6,  [sp, #SP_INT*4]            // w6  = mref4
++        ldr             w13, [sp, #SP_INT]              // w13 = pref3
++        ldr             w14, [sp, #SP_INT*3]            // w14 = pref4
++
++        mov             x4,  x3
++        mov             x3,  x2
++        mov             x2,  x1
++
++        LDR_COEFFS      v0, x17
++
++// #define prev2 cur
++//        const uint8_t * restrict next2 = parity ? prev : next;
++        ldr             w17, [sp, #SP_INT*5]            // parity
++        cmp             w17, #0
++        csel            x17, x2, x4, ne
++
++        PUSH_VREGS
++
++//         for (x = 0; x < w; x++) {
++//             int diff0, diff2;
++//             int d0, d2;
++//             int temporal_diff0, temporal_diff2;
++//
++//             int i1, i2;
++//             int j1, j2;
++//             int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;
++
++10:
++//             c0 = prev2[0] + next2[0];            // c0 = v20, v21
++//             d0  = c0 >> 1;                       // d0 = v10
++//             temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
++        ldr             q31, [x3]
++        ldr             q21, [x17]
++        uhadd           v10.16b, v31.16b, v21.16b
++        uabd            v11.16b, v31.16b, v21.16b
++        uaddl           v20.8h,  v21.8b,  v31.8b
++        uaddl2          v21.8h,  v21.16b, v31.16b
++
++        ldr             q31, [x3, w6, sxtw]
++        ldr             q23, [x17, w6, sxtw]
++
++//             i1 = coef_hf[0] * c0;                // i1 = v2-v5
++        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[2]
++
++        ldr             q30, [x3, w14, sxtw]
++        ldr             q25, [x17, w14, sxtw]
++
++//             m4 = prev2[mrefs4] + next2[mrefs4];  // m4 = v22,v23
++        uaddl           v22.8h,  v23.8b,  v31.8b
++        uaddl2          v23.8h,  v23.16b, v31.16b
++
++//             p4 = prev2[prefs4] + next2[prefs4];  // p4 = v24,v25, (p4 >> 1) = v12
++        uhadd           v12.16b, v25.16b, v30.16b
++        uaddl           v24.8h,  v25.8b,  v30.8b
++        uaddl2          v25.8h,  v25.16b, v30.16b
++
++//             m3 = cur[mrefs3];                    // m3 = v20
++        ldr             q20, [x3, w7, sxtw]
++
++//             p3 = cur[prefs3];                    // p3 = v21
++        ldr             q21, [x3, w13, sxtw]
++
++//             i1 += coef_hf[2] * (m4 + p4);        // (-m4:v22,v23) (-p4:v24,v25)
++        add             v22.8h,  v22.8h,  v24.8h
++        add             v23.8h,  v23.8h,  v25.8h
++        UMLAL4K         v2, v3, v4, v5, v22, v23, v0.h[4]
++
++        ldr             q29, [x3, w8, sxtw]
++        ldr             q23, [x17, w8, sxtw]
++
++//             i1 -= coef_lf[1] * 4 * (m3 + p3);    // -
++        uaddl           v30.8h,  v20.8b,  v21.8b
++        uaddl2          v31.8h,  v20.16b, v21.16b
++
++        UMLSL4K         v2, v3, v4, v5, v30, v31, v0.h[1]
++
++        ldr             q31, [x3, w12, sxtw]
++        ldr             q27, [x17, w12, sxtw]
++
++//             m2 = prev2[mrefs2] + next2[mrefs2];  // m2 = v22,v23, (m2 >> 1) = v13
++        uhadd           v13.16b, v23.16b, v29.16b
++        uaddl           v22.8h,  v23.8b,  v29.8b
++        uaddl2          v23.8h,  v23.16b, v29.16b
++
++//             m1 = cur[mrefs];                     // m1 = v24
++        ldr             q24, [x3, w9, sxtw]
++
++//             p2 = prev2[prefs2] + next2[prefs2];  // p2 = v26, v27
++//             temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
++//             d2  = p2 >> 1;                       // d2 = v15
++        uabd            v14.16b, v31.16b, v27.16b
++        uhadd           v15.16b, v31.16b, v27.16b
++        uaddl           v26.8h,  v27.8b,  v31.8b
++        uaddl2          v27.8h,  v27.16b, v31.16b
++
++//             i1 -= coef_hf[1] * (m2 + p2);        // (-m2:v22,v23*) (-p2:v26*,v27*)
++        add             v22.8h,  v22.8h,  v26.8h
++        add             v23.8h,  v23.8h,  v27.8h
++        UMLSL4K         v2, v3, v4, v5, v22, v23, v0.h[3]
++
++//             p1 = cur[prefs];                     // p1 = v22
++        ldr             q22, [x3, w11, sxtw]
++
++//             i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
++        uaddl           v18.8h,  v22.8b,  v24.8b
++        uaddl2          v19.8h,  v22.16b, v24.16b
++        UMULL4K         v28, v29, v30, v31, v18, v19, v0.h[6]
++
++        uaddl           v18.8h,  v20.8b,  v21.8b
++        uaddl2          v19.8h,  v20.16b, v21.16b
++        UMLSL4K         v28, v29, v30, v31, v18, v19, v0.h[7]
++
++        SQSHRUNN        v17, v28, v29, v30, v31, 13
++
++//             i1 += coef_lf[0] * 4 * (m1 + p1);    // p1 = v22, m1 = v24
++        uaddl           v26.8h,  v24.8b,  v22.8b
++        uaddl2          v27.8h,  v24.16b, v22.16b
++        UMLAL4K         v2, v3, v4, v5, v26, v27, v0.h[0]
++
++        ldr             q31, [x2, w9, sxtw]
++        ldr             q29, [x4, w9, sxtw]
++
++        ldr             q30, [x2, w11, sxtw]
++        ldr             q28, [x4, w11, sxtw]
++
++//             i1 >>= 15;                            // i1 = v2, -v3, -v4*, -v5*
++        SQSHRUNN        v2, v2, v3, v4, v5, 15
++
++//             {
++//                 int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
++//                 int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
++        uabd            v30.16b, v22.16b, v30.16b
++        uabd            v31.16b, v24.16b, v31.16b
++        uabd            v28.16b, v22.16b, v28.16b
++        uabd            v29.16b, v24.16b, v29.16b
++        uhadd           v31.16b, v31.16b, v30.16b
++        uhadd           v29.16b, v29.16b, v28.16b
++
++//                 diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
++        ushr            v18.16b, v11.16b, #1
++        umax            v18.16b, v18.16b, v31.16b
++        umax            v18.16b, v18.16b, v29.16b
++
++        // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15
++        SPAT_CHECK      v18, v13, v24, v10, v22, v15, v31, v30, v29, v28
++
++        // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td0 = v11, diff0 = v18
++        INTERPOL        v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29
++
++//                 dst[0] = av_clip_uint8(interpol);
++        str             q2,  [x0], #16
++//             }
++//
++//             dst++;
++//             cur++;
++//             prev++;
++//             prev2++;
++//             next++;
++//         }
++
++        subs            w10, w10, #16
++        add             x2,  x2,  #16
++        add             x3,  x3,  #16
++        add             x4,  x4,  #16
++        add             x17, x17, #16
++        bgt             10b
++
++        POP_VREGS
++99:
++        ret
++endfunc
++
+ // ============================================================================
+ //
+ // void ff_bwdif_filter_edge_neon(
+diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
+index ae1616d366..cce99953f3 100644
+--- a/libavfilter/bwdif.h
++++ b/libavfilter/bwdif.h
+@@ -48,4 +48,9 @@ void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
+ void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                              int prefs3, int mrefs3, int parity, int clip_max);
+ 
++void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
++                            int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                            int prefs3, int mrefs3, int prefs4, int mrefs4,
++                            int parity, int clip_max);
++
+ #endif /* AVFILTER_BWDIF_H */
+diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
+index 688c2d2572..2dc47f9614 100644
+--- a/libavfilter/vf_bwdif.c
++++ b/libavfilter/vf_bwdif.c
+@@ -132,10 +132,10 @@ void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs
+     FILTER_INTRA()
+ }
+ 
+-static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+-                          int w, int prefs, int mrefs, int prefs2, int mrefs2,
+-                          int prefs3, int mrefs3, int prefs4, int mrefs4,
+-                          int parity, int clip_max)
++void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
++                            int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                            int prefs3, int mrefs3, int prefs4, int mrefs4,
++                            int parity, int clip_max)
+ {
+     uint8_t *dst   = dst1;
+     uint8_t *prev  = prev1;
+@@ -353,7 +353,7 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
+         s->filter_edge  = filter_edge_16bit;
+     } else {
+         s->filter_intra = ff_bwdif_filter_intra_c;
+-        s->filter_line  = filter_line_c;
++        s->filter_line  = ff_bwdif_filter_line_c;
+         s->filter_edge  = ff_bwdif_filter_edge_c;
+     }
+ 
+
+From 7601de6ab2604d1f530e4b8f20f409d1ec2ae6a4 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Tue, 4 Jul 2023 14:04:44 +0000
+Subject: [PATCH 150/151] avfilter/vf_bwdif: Add a filter_line3 method for
+ optimisation
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add an optional filter_line3 to the available optimisations.
+
+filter_line3 is equivalent to filter_line, memcpy, filter_line
+
+filter_line shares quite a number of loads and some calculations in
+common with its next iteration and testing shows that using aarch64
+neon filter_line3s performance is 30% better than two filter_lines
+and a memcpy.
+
+Adds a test for vf_bwdif filter_line3 to checkasm
+
+Rounds job start lines down to a multiple of 4. This means that if
+filter_line3 exists then filter_line will not sometimes be called
+once at the end of a slice depending on thread count. The final slice
+may do up to 3 extra lines but filter_edge is faster than filter_line
+so it is unlikely to create any noticeable thread load variation.
+
+Signed-off-by: John Cox <jc@kynesim.co.uk>
+Signed-off-by: Martin Storsjö <martin@martin.st>
+(cherry picked from commit 697533e76dbea8cc7fd6a0642bc60050cc05ead8)
+---
+ libavfilter/bwdif.h       |  7 ++++
+ libavfilter/vf_bwdif.c    | 44 +++++++++++++++++++--
+ tests/checkasm/vf_bwdif.c | 81 +++++++++++++++++++++++++++++++++++++++
+ 3 files changed, 129 insertions(+), 3 deletions(-)
+
+diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
+index cce99953f3..496cec72ef 100644
+--- a/libavfilter/bwdif.h
++++ b/libavfilter/bwdif.h
+@@ -35,6 +35,9 @@ typedef struct BWDIFContext {
+     void (*filter_edge)(void *dst, void *prev, void *cur, void *next,
+                         int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                         int parity, int clip_max, int spat);
++    void (*filter_line3)(void *dst, int dstride,
++                         const void *prev, const void *cur, const void *next, int prefs,
++                         int w, int parity, int clip_max);
+ } BWDIFContext;
+ 
+ void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
+@@ -53,4 +56,8 @@ void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+                             int prefs3, int mrefs3, int prefs4, int mrefs4,
+                             int parity, int clip_max);
+ 
++void ff_bwdif_filter_line3_c(void * dst1, int d_stride,
++                             const void * prev1, const void * cur1, const void * next1, int s_stride,
++                             int w, int parity, int clip_max);
++
+ #endif /* AVFILTER_BWDIF_H */
+diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
+index 2dc47f9614..9847d38b6a 100644
+--- a/libavfilter/vf_bwdif.c
++++ b/libavfilter/vf_bwdif.c
+@@ -150,6 +150,31 @@ void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+     FILTER2()
+ }
+ 
++#define NEXT_LINE()\
++    dst += d_stride; \
++    prev += prefs; \
++    cur  += prefs; \
++    next += prefs;
++
++void ff_bwdif_filter_line3_c(void * dst1, int d_stride,
++                             const void * prev1, const void * cur1, const void * next1, int s_stride,
++                             int w, int parity, int clip_max)
++{   // C reference for filter_line3: filter_line, memcpy, filter_line over 3 consecutive output lines
++    const int prefs = s_stride;            // one source stride = distance to the adjacent line
++    uint8_t * dst  = dst1;
++    const uint8_t * prev = prev1;
++    const uint8_t * cur  = cur1;
++    const uint8_t * next = next1;
++
++    ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w,   // output line 0: interpolated
++                           prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max);
++    NEXT_LINE();                           // dst advances by d_stride, prev/cur/next by s_stride
++    memcpy(dst, cur, w);                   // output line 1: w bytes copied unchanged from cur
++    NEXT_LINE();
++    ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w,   // output line 2: interpolated
++                           prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max);
++}
++
+ void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
+                             int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                             int parity, int clip_max, int spat)
+@@ -212,6 +237,13 @@ static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1,
+     FILTER2()
+ }
+ 
++// Round job start line down to multiple of 4 so that if filter_line3 exists
++// and the frame is a multiple of 4 high then filter_line will never be called
++static inline int job_start(const int jobnr, const int nb_jobs, const int h)
++{
++    return jobnr >= nb_jobs ? h : ((h * jobnr) / nb_jobs) & ~3;   // jobnr >= nb_jobs (end of last slice) gives h, unrounded
++}
++
+ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+ {
+     BWDIFContext *s = ctx->priv;
+@@ -221,8 +253,8 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+     int clip_max = (1 << (yadif->csp->comp[td->plane].depth)) - 1;
+     int df = (yadif->csp->comp[td->plane].depth + 7) / 8;
+     int refs = linesize / df;
+-    int slice_start = (td->h *  jobnr   ) / nb_jobs;
+-    int slice_end   = (td->h * (jobnr+1)) / nb_jobs;
++    int slice_start = job_start(jobnr, nb_jobs, td->h);
++    int slice_end   = job_start(jobnr + 1, nb_jobs, td->h);
+     int y;
+ 
+     for (y = slice_start; y < slice_end; y++) {
+@@ -244,6 +276,11 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+                                refs << 1, -(refs << 1),
+                                td->parity ^ td->tff, clip_max,
+                                (y < 2) || ((y + 3) > td->h) ? 0 : 1);
++            } else if (s->filter_line3 && y + 2 < slice_end && y + 6 < td->h) {
++                s->filter_line3(dst, td->frame->linesize[td->plane],
++                                prev, cur, next, linesize, td->w,
++                                td->parity ^ td->tff, clip_max);
++                y += 2;
+             } else {
+                 s->filter_line(dst, prev, cur, next, td->w,
+                                refs, -refs, refs << 1, -(refs << 1),
+@@ -280,7 +317,7 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic,
+         td.plane = i;
+ 
+         ff_filter_execute(ctx, filter_slice, &td, NULL,
+-                          FFMIN(h, ff_filter_get_nb_threads(ctx)));
++                          FFMIN((h+3)/4, ff_filter_get_nb_threads(ctx)));
+     }
+     if (yadif->current_field == YADIF_FIELD_END) {
+         yadif->current_field = YADIF_FIELD_NORMAL;
+@@ -347,6 +384,7 @@ static int config_props(AVFilterLink *link)
+ 
+ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
+ {
++    s->filter_line3 = 0;
+     if (bit_depth > 8) {
+         s->filter_intra = filter_intra_16bit;
+         s->filter_line  = filter_line_c_16bit;
+diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c
+index 5fdba09fdc..3399cacdf7 100644
+--- a/tests/checkasm/vf_bwdif.c
++++ b/tests/checkasm/vf_bwdif.c
+@@ -28,6 +28,10 @@
+     for (size_t i = 0; i < count; i++) \
+         buf0[i] = buf1[i] = rnd() & mask
+ 
++#define randomize_overflow_check(buf0, buf1, mask, count) /* fill both buffers with only 0 or mask */ \
++    for (size_t i = 0; i < count; i++) \
++        buf0[i] = buf1[i] = (rnd() & 1) != 0 ? mask : 0
++
+ #define BODY(type, depth)                                                      \
+     do {                                                                       \
+         type prev0[9*WIDTH], prev1[9*WIDTH];                                   \
+@@ -83,6 +87,83 @@ void checkasm_check_vf_bwdif(void)
+         report("bwdif10");
+     }
+ 
++    if (!ctx_8.filter_line3)
++        ctx_8.filter_line3 = ff_bwdif_filter_line3_c;
++
++    {
++        LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, cur0,  [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, cur1,  [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, dst0,  [WIDTH*3]);
++        LOCAL_ALIGNED_16(uint8_t, dst1,  [WIDTH*3]);
++        const int stride = WIDTH;
++        const int mask = (1<<8)-1;
++        int parity;
++
++        for (parity = 0; parity != 2; ++parity) {
++            if (check_func(ctx_8.filter_line3, "bwdif8.line3.rnd.p%d", parity)) {
++
++                declare_func(void, void * dst1, int d_stride,
++                                          const void * prev1, const void * cur1, const void * next1, int prefs,
++                                          int w, int parity, int clip_max);
++
++                randomize_buffers(prev0, prev1, mask, 11*WIDTH);
++                randomize_buffers(next0, next1, mask, 11*WIDTH);
++                randomize_buffers( cur0,  cur1, mask, 11*WIDTH);
++
++                call_ref(dst0, stride,
++                         prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride,
++                         WIDTH, parity, mask);
++                call_new(dst1, stride,
++                         prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
++                         WIDTH, parity, mask);
++
++                if (memcmp(dst0, dst1, WIDTH*3)
++                        || memcmp(prev0, prev1, WIDTH*11)
++                        || memcmp(next0, next1, WIDTH*11)
++                        || memcmp( cur0,  cur1, WIDTH*11))
++                    fail();
++
++                bench_new(dst1, stride,
++                         prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
++                         WIDTH, parity, mask);
++            }
++        }
++
++        // Use just 0s and ~0s to try to provoke bad cropping or overflow
++        // Parity makes no difference to this test so just test 0
++        if (check_func(ctx_8.filter_line3, "bwdif8.line3.overflow")) {
++
++            declare_func(void, void * dst1, int d_stride,
++                                      const void * prev1, const void * cur1, const void * next1, int prefs,
++                                      int w, int parity, int clip_max);
++
++            randomize_overflow_check(prev0, prev1, mask, 11*WIDTH);
++            randomize_overflow_check(next0, next1, mask, 11*WIDTH);
++            randomize_overflow_check( cur0,  cur1, mask, 11*WIDTH);
++
++            call_ref(dst0, stride,
++                     prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride,
++                     WIDTH, 0, mask);
++            call_new(dst1, stride,
++                     prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
++                     WIDTH, 0, mask);
++
++            if (memcmp(dst0, dst1, WIDTH*3)
++                    || memcmp(prev0, prev1, WIDTH*11)
++                    || memcmp(next0, next1, WIDTH*11)
++                    || memcmp( cur0,  cur1, WIDTH*11))
++                fail();
++
++            // No point to benching
++        }
++
++        report("bwdif8.line3");
++    }
++
+     {
+         LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]);
+         LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]);
+
+From 120058b7abd0db1d222b1e197207de8226fdfd94 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Tue, 4 Jul 2023 14:04:45 +0000
+Subject: [PATCH 151/151] avfilter/vf_bwdif: Add neon for filter_line3
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Signed-off-by: John Cox <jc@kynesim.co.uk>
+Signed-off-by: Martin Storsjö <martin@martin.st>
+(cherry picked from commit f00222e81f7d6a59d977fbb280d67989818e0ad2)
+---
+ libavfilter/aarch64/vf_bwdif_init_aarch64.c |  28 ++
+ libavfilter/aarch64/vf_bwdif_neon.S         | 272 ++++++++++++++++++++
+ 2 files changed, 300 insertions(+)
+
+diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+index 21e67884ab..f52bc4b9b4 100644
+--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
++++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+@@ -36,6 +36,33 @@ void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
+                                int prefs3, int mrefs3, int prefs4, int mrefs4,
+                                int parity, int clip_max);
+ 
++void ff_bwdif_filter_line3_neon(void * dst1, int d_stride,
++                                const void * prev1, const void * cur1, const void * next1, int s_stride,
++                                int w, int parity, int clip_max);
++
++
++static void filter_line3_helper(void * dst1, int d_stride,
++                                const void * prev1, const void * cur1, const void * next1, int s_stride,
++                                int w, int parity, int clip_max)
++{
++    // Asm works on 16 byte chunks and is only used when clip_max == 255.
++    // If w is a multiple of 16 then all is good - if not then if width rounded
++    // up to nearest 16 (w1) fits within both src & dst strides then let the asm
++    // run over the padding bytes as that is almost certainly faster than having
++    // to invoke the C version for the tail. Otherwise C does the last w % 16.
++    const int w1 = FFALIGN(w, 16);
++    const int w0 = clip_max != 255 ? 0 :
++                   w1 <= d_stride && w1 <= s_stride ? w : w & ~15;
++
++    ff_bwdif_filter_line3_neon(dst1, d_stride,   // returns immediately when w0 <= 0
++                               prev1, cur1, next1, s_stride,
++                               w0, parity, clip_max);
++
++    if (w0 < w)   // C reference finishes the columns the asm was not asked to cover
++        ff_bwdif_filter_line3_c((char *)dst1 + w0, d_stride,
++                                (const char *)prev1 + w0, (const char *)cur1 + w0, (const char *)next1 + w0, s_stride,
++                                w - w0, parity, clip_max);
++}
+ 
+ static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
+                                int w, int prefs, int mrefs, int prefs2, int mrefs2,
+@@ -93,5 +120,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
+     s->filter_intra = filter_intra_helper;
+     s->filter_line  = filter_line_helper;
+     s->filter_edge  = filter_edge_helper;
++    s->filter_line3 = filter_line3_helper;
+ }
+ 
+diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
+index f185e94e3c..ae9aab20cd 100644
+--- a/libavfilter/aarch64/vf_bwdif_neon.S
++++ b/libavfilter/aarch64/vf_bwdif_neon.S
+@@ -154,6 +154,278 @@ const coeffs, align=4   // align 4 means align on 2^4 boundry
+         .hword          5077, 981                       // sp[0] = v0.h[6]
+ endconst
+ 
++// ===========================================================================
++//
++// void ff_bwdif_filter_line3_neon(
++//         void * dst1,         // x0
++//         int d_stride,        // w1
++//         const void * prev1,  // x2
++//         const void * cur1,   // x3
++//         const void * next1,  // x4
++//         int s_stride,        // w5
++//         int w,               // w6
++//         int parity,          // w7
++//         int clip_max);       // [sp, #0] (Ignored)
++
++function ff_bwdif_filter_line3_neon, export=1
++        // Sanity check w
++        cmp             w6, #0
++        ble             99f
++
++        LDR_COEFFS      v0, x17
++
++// #define prev2 cur
++//        const uint8_t * restrict next2 = parity ? prev : next;
++        cmp             w7, #0
++        csel            x17, x2, x4, ne
++
++        // We want all the V registers - save all the ones we must
++        PUSH_VREGS
++
++        // Some rearrangement of initial values for nice layout of refs in regs
++        mov             w10, w6                         // w10 = loop count
++        neg             w9,  w5                         // w9  = mref
++        lsl             w8,  w9,  #1                    // w8 =  mref2
++        add             w7,  w9,  w9, LSL #1            // w7  = mref3
++        lsl             w6,  w9,  #2                    // w6  = mref4
++        mov             w11, w5                         // w11 = pref
++        lsl             w12, w5,  #1                    // w12 = pref2
++        add             w13, w5,  w5, LSL #1            // w13 = pref3
++        lsl             w14, w5,  #2                    // w14 = pref4
++        add             w15, w5,  w5, LSL #2            // w15 = pref5
++        add             w16, w14, w12                   // w16 = pref6
++
++        lsl             w5,  w1,  #1                    // w5 = d_stride * 2
++
++//         for (x = 0; x < w; x++) {
++//             int diff0, diff2;
++//             int d0, d2;
++//             int temporal_diff0, temporal_diff2;
++//
++//             int i1, i2;
++//             int j1, j2;
++//             int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;
++
++10:
++//             c0 = prev2[0] + next2[0];                // c0 = v20, v21
++//             d0  = c0 >> 1;                           // d0 = v10
++//             temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
++        ldr             q31, [x3]
++        ldr             q21, [x17]
++        uhadd           v10.16b, v31.16b, v21.16b
++        uabd            v11.16b, v31.16b, v21.16b
++        uaddl           v20.8h,  v21.8b,  v31.8b
++        uaddl2          v21.8h,  v21.16b, v31.16b
++
++        ldr             q31, [x3, w6, sxtw]
++        ldr             q23, [x17, w6, sxtw]
++
++//             i1 = coef_hf[0] * c0;                    // i1 = v2-v5
++        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[2]
++
++        ldr             q30, [x3, w14, sxtw]
++        ldr             q25, [x17, w14, sxtw]
++
++//             m4 = prev2[mrefs4] + next2[mrefs4];      // m4 = v22,v23
++        uaddl           v22.8h,  v23.8b,  v31.8b
++        uaddl2          v23.8h,  v23.16b, v31.16b
++
++//             p4 = prev2[prefs4] + next2[prefs4];      // p4 = v24,v25, (p4 >> 1) = v12
++        uhadd           v12.16b, v25.16b, v30.16b
++        uaddl           v24.8h,  v25.8b,  v30.8b
++        uaddl2          v25.8h,  v25.16b, v30.16b
++
++//             j1 = -coef_hf[1] * (c0 + p4);            // j1 = v6-v9  (-c0:v20,v21)
++        add             v20.8h,  v20.8h,  v24.8h
++        add             v21.8h,  v21.8h,  v25.8h
++        SMULL4K         v6, v7, v8, v9, v20, v21, v0.h[5]
++
++//             m3 = cur[mrefs3];                        // m3 = v20
++        ldr             q20, [x3, w7, sxtw]
++
++//             p3 = cur[prefs3];                        // p3 = v21
++        ldr             q21, [x3, w13, sxtw]
++
++//             i1 += coef_hf[2] * (m4 + p4);            // (-m4:v22,v23) (-p4:v24,v25)
++        add             v22.8h,  v22.8h,  v24.8h
++        add             v23.8h,  v23.8h,  v25.8h
++        UMLAL4K         v2, v3, v4, v5, v22, v23, v0.h[4]
++
++        ldr             q29, [x3, w8, sxtw]
++        ldr             q23, [x17, w8, sxtw]
++
++//             i1 -= coef_lf[1] * 4 * (m3 + p3);        // -
++        uaddl           v30.8h,  v20.8b,  v21.8b
++        uaddl2          v31.8h,  v20.16b, v21.16b
++
++        ldr             q28, [x3, w16, sxtw]
++        ldr             q25, [x17, w16, sxtw]
++
++        UMLSL4K         v2, v3, v4, v5, v30, v31, v0.h[1]
++
++//             m2 = prev2[mrefs2] + next2[mrefs2];      // m2 = v22,v23, (m2 >> 1) = v13
++        uhadd           v13.16b, v23.16b, v29.16b
++        uaddl           v22.8h,  v23.8b,  v29.8b
++        uaddl2          v23.8h,  v23.16b, v29.16b
++
++        ldr             q31, [x3, w12, sxtw]
++        ldr             q27, [x17, w12, sxtw]
++
++//             p6 = prev2[prefs6] + next2[prefs6];      // p6 = v24,v25
++        uaddl           v24.8h,  v25.8b,  v28.8b
++        uaddl2          v25.8h,  v25.16b, v28.16b
++
++//             j1 += coef_hf[2] * (m2 + p6);            // (-p6:v24,v25)
++        add             v24.8h,  v24.8h,  v22.8h
++        add             v25.8h,  v25.8h,  v23.8h
++        UMLAL4K         v6, v7, v8, v9, v24, v25, v0.h[4]
++
++//             m1 = cur[mrefs];                         // m1 = v24
++        ldr             q24, [x3, w9, sxtw]
++
++//             p5 = cur[prefs5];                        // p5 = v25
++        ldr             q25, [x3, w15, sxtw]
++
++//             p2 = prev2[prefs2] + next2[prefs2];      // p2 = v26, v27
++//             temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
++//             d2  = p2 >> 1;                           // d2 = v15
++        uabd            v14.16b, v31.16b, v27.16b
++        uhadd           v15.16b, v31.16b, v27.16b
++        uaddl           v26.8h,  v27.8b,  v31.8b
++        uaddl2          v27.8h,  v27.16b, v31.16b
++
++//             j1 += coef_hf[0] * p2;                   // -
++        UMLAL4K         v6, v7, v8, v9, v26, v27, v0.h[2]
++
++//             i1 -= coef_hf[1] * (m2 + p2);            // (-m2:v22,v23*) (-p2:v26*,v27*)
++        add             v22.8h,  v22.8h,  v26.8h
++        add             v23.8h,  v23.8h,  v27.8h
++        UMLSL4K         v2, v3, v4, v5, v22, v23, v0.h[3]
++
++//             p1 = cur[prefs];                         // p1 = v22
++        ldr             q22, [x3, w11, sxtw]
++
++//             j1 -= coef_lf[1] * 4 * (m1 + p5);        // -
++        uaddl           v26.8h,  v24.8b,  v25.8b
++        uaddl2          v27.8h,  v24.16b, v25.16b
++        UMLSL4K         v6, v7, v8, v9, v26, v27, v0.h[1]
++
++//             j2 = (coef_sp[0] * (p1 + p3) - coef_sp[1]  * (m1 + p5)) >> 13; // (-p5:v25*) j2=v16
++        uaddl           v18.8h,  v22.8b,  v21.8b
++        uaddl2          v19.8h,  v22.16b, v21.16b
++        UMULL4K         v28, v29, v30, v31, v18, v19, v0.h[6]
++
++        uaddl           v18.8h,  v24.8b,  v25.8b
++        uaddl2          v19.8h,  v24.16b, v25.16b
++        UMLSL4K         v28, v29, v30, v31, v18, v19, v0.h[7]
++
++        SQSHRUNN        v16, v28, v29, v30, v31, 13
++
++//             i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
++        uaddl           v18.8h,  v22.8b,  v24.8b
++        uaddl2          v19.8h,  v22.16b, v24.16b
++        UMULL4K         v28, v29, v30, v31, v18, v19, v0.h[6]
++
++        uaddl           v18.8h,  v20.8b,  v21.8b
++        uaddl2          v19.8h,  v20.16b, v21.16b
++        UMLSL4K         v28, v29, v30, v31, v18, v19, v0.h[7]
++
++        SQSHRUNN        v17, v28, v29, v30, v31, 13
++
++//             i1 += coef_lf[0] * 4 * (m1 + p1);        // p1 = v22, m1 = v24
++        uaddl           v26.8h,  v24.8b,  v22.8b
++        uaddl2          v27.8h,  v24.16b, v22.16b
++        UMLAL4K         v2, v3, v4, v5, v26, v27, v0.h[0]
++
++        ldr             q31, [x2, w9, sxtw]
++        ldr             q29, [x4, w9, sxtw]
++
++//             j1 += coef_lf[0] * 4 * (p1 + p3);        // p1 = v22, p3 = v21
++        uaddl           v26.8h,  v21.8b,  v22.8b
++        uaddl2          v27.8h,  v21.16b, v22.16b
++        UMLAL4K         v6, v7, v8, v9, v26, v27, v0.h[0]
++
++        ldr             q30, [x2, w11, sxtw]
++        ldr             q28, [x4, w11, sxtw]
++
++//             i1 >>= 15;                               // i1 = v2, -v3, -v4*, -v5*
++        SQSHRUNN        v2, v2, v3, v4, v5, 15
++
++//             j1 >>= 15;                               // j1 = v3, -v6*, -v7*, -v8*, -v9*
++        SQSHRUNN        v3, v6, v7, v8, v9, 15
++
++//             {
++//                 int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
++//                 int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
++        uabd            v30.16b, v22.16b, v30.16b
++        uabd            v31.16b, v24.16b, v31.16b
++        uabd            v28.16b, v22.16b, v28.16b
++        uabd            v29.16b, v24.16b, v29.16b
++        uhadd           v31.16b, v31.16b, v30.16b
++        uhadd           v29.16b, v29.16b, v28.16b
++
++        ldr             q27, [x2, w13, sxtw]
++        ldr             q26, [x4, w13, sxtw]
++
++//                 diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
++        ushr            v18.16b, v11.16b, #1
++        umax            v18.16b, v18.16b, v31.16b
++        umax            v18.16b, v18.16b, v29.16b
++//             }                                        // v28, v30 preserved for next block
++//             {  // tdiff2 = v14
++//                 int t1 =(FFABS(prev[prefs] - p1) + FFABS(prev[prefs3] - p3)) >> 1;
++//                 int t2 =(FFABS(next[prefs] - p1) + FFABS(next[prefs3] - p3)) >> 1;
++        uabd            v31.16b, v21.16b, v27.16b
++        uabd            v29.16b, v21.16b, v26.16b
++        uhadd           v31.16b, v31.16b, v30.16b
++        uhadd           v29.16b, v29.16b, v28.16b
++
++//                 diff2 = FFMAX3(temporal_diff2 >> 1, t1, t2); // diff2=v19
++        ushr            v19.16b, v14.16b, #1
++        umax            v19.16b, v19.16b, v31.16b
++        umax            v19.16b, v19.16b, v29.16b
++//             }
++
++        // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15
++        SPAT_CHECK      v18, v13, v24, v10, v22, v15, v31, v30, v29, v28
++
++        //  diff2 = v19, d0 = v10, p1 = v22, d2 = v15, p3 = v21, (p4 >> 1) = v12
++        SPAT_CHECK      v19, v10, v22, v15, v21, v12, v31, v30, v29, v28
++
++        // j1 = v3, j2 = v16, p1 = v22, d2 = v15, p3 = v21, td2 = v14, diff2 = v19
++        INTERPOL        v3, v3, v16, v22, v15, v21, v14, v19, v31, v30, v29
++
++//                 dst[d_stride * 2] = av_clip_uint8(interpol);
++        str             q3,  [x0, w5, sxtw]
++
++//             dst[d_stride] = p1;
++        str             q22, [x0, w1, sxtw]
++
++        // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td0 = v11, diff0 = v18
++        INTERPOL        v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29
++
++//                 dst[0] = av_clip_uint8(interpol);
++        str             q2,  [x0], #16
++//             }
++//
++//             dst++;
++//             cur++;
++//             prev++;
++//             prev2++;
++//             next++;
++//         }
++        subs            w10, w10, #16                   // 16 bytes per pass; a partial final chunk still runs as a full 16
++        add             x2,  x2,  #16
++        add             x3,  x3,  #16
++        add             x4,  x4,  #16
++        add             x17, x17, #16
++        bgt             10b
++
++        POP_VREGS
++99:
++        ret
++endfunc
++
+ // ===========================================================================
+ //
+ // void filter_line(