diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
index 1dbe304818..8dfcbe0fa6 100644
--- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
+++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
@@ -1,7 +1,7 @@
 From 504df93cfe5416b394755e79b7b81ee0119cf09c Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 26 Apr 2021 12:34:50 +0100
-Subject: [PATCH 001/121] Add pi configs and scripts
+Subject: [PATCH 001/135] Add pi configs and scripts
 
 ---
  pi-util/BUILD.txt                  |  59 ++++++++
@@ -1682,7 +1682,7 @@ index 0000000000..5935a11ca5
 From f3eaadb27a5bc6db07d33ce0814d796e8cee623e Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 11:27:39 +0100
-Subject: [PATCH 002/121] Add sand pix fmts & conversion fns
+Subject: [PATCH 002/135] Add sand pix fmts & conversion fns
 
 ---
  configure                     |   3 +
@@ -3503,7 +3503,7 @@ index 0000000000..634b55e800
 From 89b8d6ac2a886749d4594656083753e682de05a7 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 11:36:47 +0100
-Subject: [PATCH 003/121] Add aarch64 asm sand conv functions
+Subject: [PATCH 003/135] Add aarch64 asm sand conv functions
 
 Many thanks to eiler.mike@gmail.com (Michael Eiler) for these
 optimizations
@@ -4310,7 +4310,7 @@ index ed0261b02f..1f543e9357 100644
 From 247025a42ae09d6c9c5d4128a5e4b288b7b3047c Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 11:56:02 +0100
-Subject: [PATCH 004/121] Add raw encoding for sand
+Subject: [PATCH 004/135] Add raw encoding for sand
 
 ---
  libavcodec/raw.c    |  6 +++
@@ -4459,7 +4459,7 @@ index 8c577006d9..594a77c42a 100644
 From ac6961f424b56563dc793b6bc002a8c04cb1bc36 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 12:02:09 +0100
-Subject: [PATCH 005/121] Deal with the lack of trivial sand cropping
+Subject: [PATCH 005/135] Deal with the lack of trivial sand cropping
 
 ---
  fftools/ffmpeg.c        |  4 ++--
@@ -4559,7 +4559,7 @@ index 2580269549..3a9d323325 100644
 From 9a08431f7790507b0374d9585dfc736000c1bd42 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 12:31:16 +0100
-Subject: [PATCH 006/121] Add an unsand filter
+Subject: [PATCH 006/135] Add an unsand filter
 
 ---
  configure                |   1 +
@@ -4857,7 +4857,7 @@ index 0000000000..7100f2fc9b
 From 6e61007b19544c573f1c2a4c6060d3d24b8d500e Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 12:37:07 +0100
-Subject: [PATCH 007/121] Reduce mmal compile warnings
+Subject: [PATCH 007/135] Reduce mmal compile warnings
 
 ---
  libavcodec/mmaldec.c | 4 ++++
@@ -4889,7 +4889,7 @@ index 3092f58510..6f41b41ac4 100644
 From 01aff455665e8f889330519096912ad0005add3c Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 17:56:16 +0100
-Subject: [PATCH 008/121] Add chroma location to hevc parse
+Subject: [PATCH 008/135] Add chroma location to hevc parse
 
 ---
  libavcodec/hevc_parser.c | 13 +++++++++++++
@@ -4948,7 +4948,7 @@ index 567e8d81d4..b6cfea64d3 100644
 From c80aad5d2fb373f7564e4257b1272f2decb06dd0 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 26 Sep 2022 18:20:50 +0100
-Subject: [PATCH 009/121] hwaccel: Add .abort_frame & use in hevcdec
+Subject: [PATCH 009/135] hwaccel: Add .abort_frame & use in hevcdec
 
 ---
  libavcodec/avcodec.h | 11 +++++++++++
@@ -5000,7 +5000,7 @@ index b6cfea64d3..8a0246fa21 100644
 From 317722fd652d9a1c1700319c80fc71acf68ddde6 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 26 Sep 2022 18:26:17 +0100
-Subject: [PATCH 010/121] hwaccel: Add CAP_MT_SAFE for accels that can use
+Subject: [PATCH 010/135] hwaccel: Add CAP_MT_SAFE for accels that can use
  multi-thread
 
 ---
@@ -5049,7 +5049,7 @@ index d9d5afaa82..2cc89a41f5 100644
 From 9005b263450e154a5ec5258fda17d5998fe7896b Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 17:59:08 +0100
-Subject: [PATCH 011/121] Weak link utils
+Subject: [PATCH 011/135] Weak link utils
 
 ---
  libavcodec/weak_link.c | 102 +++++++++++++++++++++++++++++++++++++++++
@@ -5199,7 +5199,7 @@ index 0000000000..415b6a27a0
 From 824be1710ca96d97c86836fdac0e7dcd28a4b92e Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 19:23:26 +0100
-Subject: [PATCH 012/121] Add v4l2_req V4L2 request H265 drm_prime decode
+Subject: [PATCH 012/135] Add v4l2_req V4L2 request H265 drm_prime decode
 
 Has the abiliy to switch between kernel API versions at runtime. This
 could be removed later once teher is no chance of usage on an old
@@ -10674,7 +10674,7 @@ index 0000000000..f14f594564
 From c99a0fe4d59212079de9bed222114abf95f7c989 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 27 Apr 2021 19:30:36 +0100
-Subject: [PATCH 013/121] Add no_cvt_hw option to ffmpeg
+Subject: [PATCH 013/135] Add no_cvt_hw option to ffmpeg
 
 ---
  fftools/ffmpeg.c     | 6 ++++--
@@ -10744,7 +10744,7 @@ index 055275d813..761db36588 100644
 From 27e0c78a2df53fb2337bee4c383cdb58cbbc717e Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 28 Apr 2021 10:16:39 +0100
-Subject: [PATCH 014/121] Add vout_drm
+Subject: [PATCH 014/135] Add vout_drm
 
 ---
  configure                |   4 +
@@ -11457,7 +11457,7 @@ index 0000000000..cfb33ce7c3
 From cc536672adf4eefeaec16e9808f583c693ad7819 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 28 Apr 2021 11:34:18 +0100
-Subject: [PATCH 015/121] Add vout_egl
+Subject: [PATCH 015/135] Add vout_egl
 
 ---
  configure                |   6 +
@@ -12357,7 +12357,7 @@ index 0000000000..7b9c610ace
 From 867bd7c243e66a1c1756878e20df8f35db8025ec Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 28 Apr 2021 12:51:22 +0100
-Subject: [PATCH 016/121] V4L2 stateful rework
+Subject: [PATCH 016/135] V4L2 stateful rework
 
 ---
  libavcodec/Makefile       |   3 +-
@@ -14780,7 +14780,7 @@ index 4944d08511..7f6033ac2c 100644
 From 12f8f12326b83dd3c22084f8922705d79a13d195 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 10 Jun 2021 18:46:21 +0100
-Subject: [PATCH 017/121] Fix crash in hw_device_default_name if type not found
+Subject: [PATCH 017/135] Fix crash in hw_device_default_name if type not found
  (NONE)
 
 ---
@@ -14804,7 +14804,7 @@ index 88fa782470..740a5e7153 100644
 From 7f6bce459e683bff3a0b972922fbcc808e9177a6 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 10 Jun 2021 18:59:18 +0100
-Subject: [PATCH 018/121] Allow v4l2m2m to select non-drm_prime output formats
+Subject: [PATCH 018/135] Allow v4l2m2m to select non-drm_prime output formats
 
 ---
  libavcodec/v4l2_buffers.c |  2 +-
@@ -14871,7 +14871,7 @@ index 7f6033ac2c..a4b5a4e7e9 100644
 From 9b0d964b727d98271f7f2f4dcdbcb1b41a429e2b Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 10 Jun 2021 18:59:38 +0100
-Subject: [PATCH 019/121] Fix YUV420P output from v4l2m2m
+Subject: [PATCH 019/135] Fix YUV420P output from v4l2m2m
 
 Also put get_width get_height inlines in header as they are generally
 useful.
@@ -14988,7 +14988,7 @@ index 24a9c94864..8f054f2f50 100644
 From 14e9b4bf1b34b3d1e1e6a4fc755cc595416e7d7b Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 10 Jun 2021 19:23:44 +0100
-Subject: [PATCH 020/121] Report buffer overflows in v4l2m2m
+Subject: [PATCH 020/135] Report buffer overflows in v4l2m2m
 
 ---
  libavcodec/v4l2_buffers.c | 14 ++++++++++----
@@ -15064,7 +15064,7 @@ index 6fe2586627..81aced0c2b 100644
 From 072907a7fcf160d12972997d24fdf62641687ea4 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 14 Jun 2021 11:55:16 +0100
-Subject: [PATCH 021/121] Increase V4L2 H264 stateful coded buffer size
+Subject: [PATCH 021/135] Increase V4L2 H264 stateful coded buffer size
 
 Try to set a min size of frame size / 2 for bitbuffers passed to V4l2.
 This fixes a few streams that have large I-frames.  You would hope
@@ -15188,7 +15188,7 @@ index a4b5a4e7e9..1851acbc93 100644
 From 6087c8c054e1ff3d2e6e62d5e32705d079928b64 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 28 Jun 2021 12:13:35 +0100
-Subject: [PATCH 022/121] Fix raw video s.t. it respects any remaining cropping
+Subject: [PATCH 022/135] Fix raw video s.t. it respects any remaining cropping
 
 This fixes the long standing CONFWIN_A conformance test failure for drm.
 ---
@@ -15458,7 +15458,7 @@ index 7a9fdbd263..baf18920fa 100644
 From 597858c11fbfbe0f54c1b68d9683025929258bc1 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 13 Aug 2021 15:38:28 +0100
-Subject: [PATCH 023/121] Set frame interlace from V4L2 buffer field
+Subject: [PATCH 023/135] Set frame interlace from V4L2 buffer field
 
 ---
  libavcodec/v4l2_buffers.c | 12 ++++++++++++
@@ -15498,7 +15498,7 @@ index de31f7ced9..97b8eb1db3 100644
 From 05906e2086b5087d615485ec9a09b1493dbb32e1 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 13 Aug 2021 16:11:53 +0100
-Subject: [PATCH 024/121] Fix V4L2 stateful to avoid crash if flush before
+Subject: [PATCH 024/135] Fix V4L2 stateful to avoid crash if flush before
  start
 
 ---
@@ -15524,7 +15524,7 @@ index a17ae027a6..eb901e8fab 100644
 From 7157b6032e759078a7d751e5dd5762970f3d1e8c Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 9 Sep 2021 17:44:13 +0100
-Subject: [PATCH 025/121] Copy properties from frame to v4l2 buffer
+Subject: [PATCH 025/135] Copy properties from frame to v4l2 buffer
 
 Now copies all the properties in ff_v4l2_buffer_avframe_to_buf that
 ff_v4l2_buffer_buf_to_avframe copies
@@ -15695,7 +15695,7 @@ index 97b8eb1db3..126d2a17f4 100644
 From 15415ab226f966fd12e70d79fde3cb80f3d09144 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 17 Nov 2021 16:49:01 +0000
-Subject: [PATCH 026/121] ffmpeg: Do not inc DTS on no decode output
+Subject: [PATCH 026/135] ffmpeg: Do not inc DTS on no decode output
 
 V4L2 H264 decode has long latency and sometimes spits out a long stream
 of output without input. In this case incrementing DTS is wrong. There
@@ -15727,7 +15727,7 @@ index 5dc2cd73c1..ba0c1898cf 100644
 From 7bf6c062ed8a1e635aa5722c0072724f236daf00 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 17 Nov 2021 17:32:59 +0000
-Subject: [PATCH 027/121] v4l2_m2m_dec: Adjust timebase if H264
+Subject: [PATCH 027/135] v4l2_m2m_dec: Adjust timebase if H264
 
 Adjust AVCodecContext time_base if H264 in the same way that the
 software decoder does.
@@ -15760,7 +15760,7 @@ index 1851acbc93..aa1e5c1597 100644
 From 3cd23a761397ae75ed032c1687da5d6b76ddaaaa Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 17 Nov 2021 17:38:27 +0000
-Subject: [PATCH 028/121] v4l2_m2m_dec: Produce best guess PTSs if none
+Subject: [PATCH 028/135] v4l2_m2m_dec: Produce best guess PTSs if none
  supplied
 
 Filter scheduling gets confused by missing PTSs and makes poor guesses
@@ -15895,7 +15895,7 @@ index aa1e5c1597..a5a2afbd27 100644
 From ee8be1e900f98212b6c4940980cc7a80becfc07c Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 17 Nov 2021 17:59:27 +0000
-Subject: [PATCH 029/121] v4l2_m2m_dec: Try harder to get an initial frame
+Subject: [PATCH 029/135] v4l2_m2m_dec: Try harder to get an initial frame
 
 If the input Q is full then wait on a short timeout for a capture frame
 rather than stuffing yet still another frame into the input if we could
@@ -15936,7 +15936,7 @@ index a5a2afbd27..b49f470c0a 100644
 From 72da14331c2160a12b69d666d493e0e74c5e8914 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 17 Nov 2021 18:04:56 +0000
-Subject: [PATCH 030/121] Add a V4L2 M2M deinterlace filter
+Subject: [PATCH 030/135] Add a V4L2 M2M deinterlace filter
 
 Add a V4L2 deinterlace filter that will accept DRMPRIME frames.
 
@@ -17277,7 +17277,7 @@ index 0000000000..1a933b7e0a
 From 0fb00e51d1ca40eed22bfc66b7f309fdc56229bc Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 2 Dec 2021 17:49:55 +0000
-Subject: [PATCH 031/121] Put no_pts_rescale in context which makes more sense
+Subject: [PATCH 031/135] Put no_pts_rescale in context which makes more sense
  than an arg
 
 ---
@@ -17558,7 +17558,7 @@ index b49f470c0a..36754b314a 100644
 From 5e36908e6f2f06b68e85873cbcd421c0973f6409 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 8 Dec 2021 15:00:37 +0000
-Subject: [PATCH 032/121] Use bitbuf min size for all streams
+Subject: [PATCH 032/135] Use bitbuf min size for all streams
 
 ---
  libavcodec/v4l2_m2m_dec.c | 5 +----
@@ -17589,7 +17589,7 @@ index 36754b314a..48a6810d18 100644
 From 5fcbcd31761eea31dc0157793f558eaaadfe2ac3 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 3 Dec 2021 12:54:18 +0000
-Subject: [PATCH 033/121] Track pending frames in v4l2 stateful
+Subject: [PATCH 033/135] Track pending frames in v4l2 stateful
 
 Track which frames are pending decode in the v4l2 stateful decoder.
 This relies on DTS & PTS having some relationship to reality, so
@@ -17847,7 +17847,7 @@ index 48a6810d18..d8ebb466cd 100644
 From 6fae7b3f42c8e9e431a59323c0faa6c88fe951d9 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 15 Dec 2021 17:58:21 +0000
-Subject: [PATCH 034/121] Use pending tracking to reduce v4l2 latency
+Subject: [PATCH 034/135] Use pending tracking to reduce v4l2 latency
 
 If there are more than 5 pending decodes outstanding then add a small
 timeout to the capture poll to reduce the rate at which frames are
@@ -17970,7 +17970,7 @@ index d8ebb466cd..7e7e4729d0 100644
 From 175abd2eb961a3718a660e1f9eda08b37b01b309 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 15 Dec 2021 12:23:54 +0000
-Subject: [PATCH 035/121] Allow logger() to take const ctx
+Subject: [PATCH 035/135] Allow logger() to take const ctx
 
 ---
  libavcodec/v4l2_buffers.c | 2 +-
@@ -18015,7 +18015,7 @@ index 64540a37b3..d3df48aed4 100644
 From 21d4f3f644c45084c621cb5aa577169bf5c15017 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 15 Dec 2021 13:00:27 +0000
-Subject: [PATCH 036/121] Track numbere of bufs qed with an atomic
+Subject: [PATCH 036/135] Track numbere of bufs qed with an atomic
 
 Safer and faster than counting status
 ---
@@ -18089,7 +18089,7 @@ index 4cc164886c..a4176448d5 100644
 From b2fa4ab3d63924597b8c3659123b145a786a2c13 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 9 Dec 2021 12:01:25 +0000
-Subject: [PATCH 037/121] Clear pkt_buf on flush
+Subject: [PATCH 037/135] Clear pkt_buf on flush
 
 ---
  libavcodec/v4l2_m2m_dec.c | 3 +++
@@ -18113,7 +18113,7 @@ index 7e7e4729d0..09ec496351 100644
 From 16cf94cb5e1d11f4c3a6b8a43557383ce78112e0 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 15 Dec 2021 12:52:56 +0000
-Subject: [PATCH 038/121] Rework v4l2 buffer dequeue
+Subject: [PATCH 038/135] Rework v4l2 buffer dequeue
 
 ---
  libavcodec/v4l2_context.c | 543 ++++++++++++++++++--------------------
@@ -19150,7 +19150,7 @@ index 09ec496351..e4b6569ba5 100644
 From a2519f7a512edde7433aced70de4464e21805693 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 9 Dec 2021 18:51:00 +0000
-Subject: [PATCH 039/121] Honor result of ff_get_format if possible
+Subject: [PATCH 039/135] Honor result of ff_get_format if possible
 
 ---
  libavcodec/v4l2_m2m_dec.c | 6 +++++-
@@ -19185,7 +19185,7 @@ index e4b6569ba5..c9655bcc3b 100644
 From a1cd1cb98e48c631392b385ccac5ab7b09bb5ee9 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 14 Dec 2021 16:11:10 +0000
-Subject: [PATCH 040/121] Add an always-reinit quirk
+Subject: [PATCH 040/135] Add an always-reinit quirk
 
 ---
  libavcodec/v4l2_context.c |  7 +++++--
@@ -19291,7 +19291,7 @@ index c9655bcc3b..e2b10f5e3a 100644
 From 2470968adf0d28bbaf310e782720dd00d57d7bf6 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 4 Jan 2022 16:58:31 +0000
-Subject: [PATCH 041/121] v4l2_buffers: rework flags for keyframe
+Subject: [PATCH 041/135] v4l2_buffers: rework flags for keyframe
 
 Previously flags could become confused and keyframe info could be lost.
 This fixes that and removes the duplicate flags field in V4L2Buffer.
@@ -19400,7 +19400,7 @@ index c11b5e6863..53b522d43e 100644
 From 5dc38f5d088beea4da57e82969643cc831c40cf0 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 22 Mar 2022 11:44:30 +0000
-Subject: [PATCH 042/121] v4l2m2m: Rework decode to wait for missing buffer,
+Subject: [PATCH 042/135] v4l2m2m: Rework decode to wait for missing buffer,
  add dynamic pending
 
 Previously receive_frame exited with EAGAIN if no capture buffer
@@ -19620,7 +19620,7 @@ index e2b10f5e3a..2e30449dfc 100644
 From 33765b769b4301e03f31b65e225fcdb0eff4c0e4 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 25 Mar 2022 15:37:58 +0000
-Subject: [PATCH 043/121] v4l2_m2m2_dec: Avoid loop if unable to resize buffers
+Subject: [PATCH 043/135] v4l2_m2m2_dec: Avoid loop if unable to resize buffers
 
 If source change signals a buffer size that cannot be honored give up
 rather than looping indefinitely.  This happens on Pi if (say) a
@@ -19667,7 +19667,7 @@ index 7ddb759810..007a58c8f1 100644
 From bb7ad2392ce83149a1ba40ecacb36e051b6bf785 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 25 Mar 2022 18:14:40 +0000
-Subject: [PATCH 044/121] v4l2dec: Improve size/format validation on init
+Subject: [PATCH 044/135] v4l2dec: Improve size/format validation on init
 
 ---
  libavcodec/v4l2_m2m_dec.c      | 84 ++++++++++++++++++++++++++++++++--
@@ -19809,7 +19809,7 @@ index b0a5930844..76ab0916cd 100644
 From 4646b558c0e45f506578a5a452820f55983abc82 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 13 Apr 2022 16:05:56 +0000
-Subject: [PATCH 045/121] v4l2 stateless hevc: Add another API variation for
+Subject: [PATCH 045/135] v4l2 stateless hevc: Add another API variation for
  linux 5.18
 
 This is probably going to be a short lived variation and may end up
@@ -20255,7 +20255,7 @@ index f14f594564..ed48d62e2d 100644
 From 92160173e701aa7e2f1011e63596e48d15e691a9 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 3 May 2022 12:44:42 +0000
-Subject: [PATCH 046/121] Remove V4l2 frame size check for meson-vdec
+Subject: [PATCH 046/135] Remove V4l2 frame size check for meson-vdec
 
 ---
  libavcodec/v4l2_m2m.h     |  3 ++-
@@ -20315,7 +20315,7 @@ index 8dcadf461b..888ba67fea 100644
 From 8ba5576e7fcd24c2f450f0295cc3b6d8e82e8649 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 23 May 2022 18:05:20 +0100
-Subject: [PATCH 047/121] v4l2m2m_dec: Make some error rturns a bit more robust
+Subject: [PATCH 047/135] v4l2m2m_dec: Make some error rturns a bit more robust
 
 ---
  libavcodec/v4l2_context.c |  5 ++---
@@ -20384,7 +20384,7 @@ index 888ba67fea..88a341aae2 100644
 From aafa5968f8713319be35cf26069c98566d5bf59b Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 24 May 2022 17:02:58 +0000
-Subject: [PATCH 048/121] v4l2m2m_dec: Support in-pkt AV_PKT_DATA_NEW_EXTRADATA
+Subject: [PATCH 048/135] v4l2m2m_dec: Support in-pkt AV_PKT_DATA_NEW_EXTRADATA
 
 Support packet side-data containing AV_PKT_DATA_NEW_EXTRADATA.  Should
 also detect and complain about unexpected streams of empty packets.
@@ -20494,7 +20494,7 @@ index 88a341aae2..392a68f0c7 100644
 From e9bced67bdb40096d31067d41956276e9e1af11a Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 24 May 2022 20:02:48 +0000
-Subject: [PATCH 049/121] v4l2m2m_dec: Catch repeated Q fulls
+Subject: [PATCH 049/135] v4l2m2m_dec: Catch repeated Q fulls
 
 ---
  libavcodec/v4l2_m2m_dec.c | 8 +++++++-
@@ -20536,7 +20536,7 @@ index 392a68f0c7..7e17044706 100644
 From 0c974e4da2c0311836145f2fd42081d40eb15998 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 25 May 2022 15:22:12 +0000
-Subject: [PATCH 050/121] Remove requirement for epoxy & libudev config options
+Subject: [PATCH 050/135] Remove requirement for epoxy & libudev config options
 
 ---
  configure              | 26 +++++++++++++++++---------
@@ -20663,7 +20663,7 @@ index 65576846e8..37cea71756 100755
 From 9f234d8cbde2829e6a70fd3cb6324998df8a31f3 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 27 May 2022 09:36:51 +0000
-Subject: [PATCH 051/121] hevc: If hwaccel avoid creation of s/w only vars
+Subject: [PATCH 051/135] hevc: If hwaccel avoid creation of s/w only vars
 
 ---
  libavcodec/hevc_refs.c | 35 +++++++++++++++++++++--------------
@@ -20801,7 +20801,7 @@ index 2867cb2e16..17f53322fb 100644
 From bb2ddc480634141bed9afd3f66e7f63f5091bb2f Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 30 May 2022 17:51:44 +0100
-Subject: [PATCH 052/121] rpi_sand: Add SAND30->NV12 conversion
+Subject: [PATCH 052/135] rpi_sand: Add SAND30->NV12 conversion
 
 C code only. Reworks the hwcontext_drm conversion to use the
 rpi_sand_fns generic frame convert fn rather than calling the
@@ -21023,7 +21023,7 @@ index 634b55e800..462ccb8abd 100644
 From b55c351e6954c800229d97dc6c982ca8f998c848 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 1 Jun 2022 17:49:26 +0000
-Subject: [PATCH 053/121] rpi_sand: Add SAND30->NV12 asm for Armv7 & Armv8
+Subject: [PATCH 053/135] rpi_sand: Add SAND30->NV12 asm for Armv7 & Armv8
 
 Also reworks the previous Armv8 SAND30->Y16 function in a slightly more
 efficient way that makes it look more like the Armv7 version.
@@ -21962,7 +21962,7 @@ index 256c3d532f..b6071e2928 100644
 From 24c3eef4487a36d5189ecd934b65a7c6a0b53d03 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 7 Jun 2022 14:46:12 +0000
-Subject: [PATCH 054/121] v4l2_m2m_enc: Add the ability to encode DRM_PRIME
+Subject: [PATCH 054/135] v4l2_m2m_enc: Add the ability to encode DRM_PRIME
  frames
 
 ---
@@ -23337,7 +23337,7 @@ index 9a0837ecf3..05ff6ba726 100644
 From 6b437ce70582c67971aa81871a6694a08b709784 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 8 Jun 2022 16:13:31 +0000
-Subject: [PATCH 055/121] v4l2_m2m_dec: Use DTS for best effort PTS if PTS is
+Subject: [PATCH 055/135] v4l2_m2m_dec: Use DTS for best effort PTS if PTS is
  always NO_PTS
 
 If we do have DTS but don't have PTS then assume PTS=DTS.
@@ -23422,7 +23422,7 @@ index fbbfc81342..485a96f4b4 100644
 From ec8d1c2c0b6bd3544e5e30500a167fc31abde17a Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 30 Jun 2022 15:59:23 +0000
-Subject: [PATCH 056/121] v4l2: Update H265 request for current API
+Subject: [PATCH 056/135] v4l2: Update H265 request for current API
 
 This works with v9 of the H265 patch set which hopefully will be the
 last one. Hevc controls extracted from patched v4l2-controls into
@@ -24211,7 +24211,7 @@ index ed48d62e2d..d4adb3f812 100644
 From 21a348ae3282318fa96d3a6e2c70f3d4b90a7d52 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Sun, 3 Jul 2022 13:40:41 +0000
-Subject: [PATCH 057/121] v4l2_req: Observe limit on size of slice_array
+Subject: [PATCH 057/135] v4l2_req: Observe limit on size of slice_array
 
 This in fact provides some minor simplifications by combing the
 multi-slice and single-slice paths.
@@ -24342,7 +24342,7 @@ index d4adb3f812..0029e23309 100644
 From 4f1d74cc8eea6a1bd6f2317a10c0ecf620315dec Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 4 Jul 2022 14:43:20 +0100
-Subject: [PATCH 058/121] v4l2_req: Add entry point offsets array control
+Subject: [PATCH 058/135] v4l2_req: Add entry point offsets array control
 
 ---
  libavcodec/v4l2_req_hevc_vx.c  | 88 +++++++++++++++++++++++++++-------
@@ -24580,7 +24580,7 @@ index 0029e23309..99c90064ea 100644
 From d0e5ed2dff1b8f8909ceb968cb3afe2b20093fda Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 4 Jul 2022 16:22:54 +0100
-Subject: [PATCH 059/121] v4l2_req: Support Annex B
+Subject: [PATCH 059/135] v4l2_req: Support Annex B
 
 ---
  libavcodec/v4l2_req_hevc_vx.c | 61 +++++++++++++++++++++++------------
@@ -24694,7 +24694,7 @@ index 43ef6631ed..5e0db9850a 100644
 From a75506e18a964c9f50efa224a3fa4179c9ef2127 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 4 Jul 2022 18:24:03 +0100
-Subject: [PATCH 060/121] v4l2_req: Add frame mode decode
+Subject: [PATCH 060/135] v4l2_req: Add frame mode decode
 
 ---
  libavcodec/v4l2_req_hevc_vx.c | 69 +++++++++++++++++++++++------------
@@ -24820,7 +24820,7 @@ index 5e0db9850a..ada53d0d44 100644
 From 9cf01f1485dcf71bcad7981d45029425d9abf115 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 5 Jul 2022 12:54:22 +0000
-Subject: [PATCH 061/121] v4l2_req: Fix probe for frame based decode
+Subject: [PATCH 061/135] v4l2_req: Fix probe for frame based decode
 
 ---
  libavcodec/v4l2_req_hevc_vx.c | 33 +++++++++++++++++++++++----------
@@ -24903,7 +24903,7 @@ index ada53d0d44..5d083016f8 100644
 From e7a62226f26073149d35c89268f56e17c8f45d76 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 26 Jul 2022 15:46:14 +0000
-Subject: [PATCH 062/121] vf_deinterlace_v4l2m2m: Support NV12 through
+Subject: [PATCH 062/135] vf_deinterlace_v4l2m2m: Support NV12 through
  deinterlace
 
 Supports NV12 (though not yet NV12M) through deinterlace.
@@ -25229,7 +25229,7 @@ index 1a933b7e0a..1a3bef5bcb 100644
 From 3d07826bcf588ad0384d00b210415664aa4489fb Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 19 Aug 2022 15:29:11 +0000
-Subject: [PATCH 063/121] v4l2_req: Enable use of MMAP for buffer alloc
+Subject: [PATCH 063/135] v4l2_req: Enable use of MMAP for buffer alloc
 
 Use MMAP rather than DMABUF if either the dmabuf device can't be opened
 or create_buf doesn't set the capability.
@@ -25961,7 +25961,7 @@ index cd79aad563..5cf17dd5e3 100644
 From 79c2fcac56586ce9eea0cc8c6b13d2cd54f3e468 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 22 Aug 2022 12:35:40 +0000
-Subject: [PATCH 064/121] Set buffer lengths on DQ
+Subject: [PATCH 064/135] Set buffer lengths on DQ
 
 ---
  libavcodec/v4l2_req_media.c | 8 ++++++++
@@ -25990,7 +25990,7 @@ index 910ac77bb6..1a9944774a 100644
 From 8f3245ca1e4b2ec7e13fc2f3bffbc964ee8fc290 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 22 Aug 2022 17:11:24 +0000
-Subject: [PATCH 065/121] Fix compile if videodev2.h defines V4L2 HEVC request
+Subject: [PATCH 065/135] Fix compile if videodev2.h defines V4L2 HEVC request
  API
 
 If videodev2.h does define the HEVC request API it is really hard to
@@ -26117,7 +26117,7 @@ index 5cf17dd5e3..614a1b4d99 100644
 From 35ec6af32c4f05b076f84ab343a8fc0d3263ba44 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 12 Sep 2022 17:59:22 +0100
-Subject: [PATCH 066/121] v4l2_m2m_enc: Send headers in in pkt side_data
+Subject: [PATCH 066/135] v4l2_m2m_enc: Send headers in in pkt side_data
 
 If GLOBAL_HEADERS are requested then we can't provide them at init time
 so send as NEW_EXTRADATA side data in a similar way to some AV1
@@ -26198,7 +26198,7 @@ index 05ff6ba726..099ad23928 100644
 From dfc754491cea9192945b92ca9c8d3919321e30ad Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 14 Sep 2022 15:44:10 +0000
-Subject: [PATCH 067/121] matroskaenc: Allow H264 SPS/PPS headers in packet
+Subject: [PATCH 067/135] matroskaenc: Allow H264 SPS/PPS headers in packet
  sidedata
 
 ---
@@ -26267,7 +26267,7 @@ index 113541bd9a..61e4c976ef 100644
 From 30c6ca4e24ae2acbd7f7f122f5275beb62b625c6 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 14 Sep 2022 15:55:15 +0000
-Subject: [PATCH 068/121] movenc: Allow H264 SPS/PPS headers in packet sidedata
+Subject: [PATCH 068/135] movenc: Allow H264 SPS/PPS headers in packet sidedata
 
 ---
  libavformat/movenc.c | 1 +
@@ -26289,7 +26289,7 @@ index c4fcb5f8b1..891adbf7b2 100644
 From 1c7c3e99e9ed90f241aecbe7b2269229587d1e03 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 26 Sep 2022 12:45:05 +0100
-Subject: [PATCH 069/121] Allow ffmpeg to select codec internal hwfmts if
+Subject: [PATCH 069/135] Allow ffmpeg to select codec internal hwfmts if
  no_cvt_hw
 
 This allows the selection of DRM_PRIME from v4l2m2m without forcing it
@@ -26326,7 +26326,7 @@ index ba0c1898cf..839da7b472 100644
 From ecf273fd02e8aafe8775b1f291b9664b1b49572e Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 1 Sep 2022 11:42:41 +0000
-Subject: [PATCH 070/121] vf_deinterlace_v4l2m2m: Add a v4l2m2m scaler
+Subject: [PATCH 070/135] vf_deinterlace_v4l2m2m: Add a v4l2m2m scaler
 
 The logic for running an isp based scaler is pretty much identical to
 that for the deinterlacer so add to the deinterlacer. This requires
@@ -27809,7 +27809,7 @@ index 1a3bef5bcb..2df39ec0f1 100644
 From 7e7147d50bc6e3f13834525dba3a47d170422f07 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 22 Sep 2022 14:54:46 +0000
-Subject: [PATCH 071/121] v4l2_m2m: Adjust buffer allocation based on min/max
+Subject: [PATCH 071/135] v4l2_m2m: Adjust buffer allocation based on min/max
  controls
 
 Clip requested buffer count to min/max declared by driver.
@@ -27861,7 +27861,7 @@ index 6b97eab41e..ba36689ff3 100644
 From b69a2707a192ac509174899233a094373a3f5dc9 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 22 Sep 2022 15:00:12 +0000
-Subject: [PATCH 072/121] v4l2_m2m_dec: If src Q is full then wait indefinitely
+Subject: [PATCH 072/135] v4l2_m2m_dec: If src Q is full then wait indefinitely
  for buffer
 
 If it is not possible to add another buffer to the src Q then alawys
@@ -27894,7 +27894,7 @@ index 485a96f4b4..bb183097f6 100644
 From b1d37be81bbf683a0eb16923c9b9f045fd0ea0c0 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 22 Sep 2022 15:12:27 +0000
-Subject: [PATCH 073/121] vf_deinterlace_v4l2m2m: Add Q name to structure for
+Subject: [PATCH 073/135] vf_deinterlace_v4l2m2m: Add Q name to structure for
  debug
 
 ---
@@ -27928,7 +27928,7 @@ index 2df39ec0f1..4edecc02bf 100644
 From 794a5bfc3ec74fdc7664508a287a075708d5deef Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 22 Sep 2022 16:08:42 +0000
-Subject: [PATCH 074/121] v4l2_m2m_enc: Set src buffer count to min+2 by
+Subject: [PATCH 074/135] v4l2_m2m_enc: Set src buffer count to min+2 by
  default
 
 Set output.num_buffers to 0 by default which will then be set to min+2
@@ -27960,7 +27960,7 @@ index 099ad23928..b8ba815c37 100644
 From 85c42743046a05b347f33b1933e6d52ea1d17e00 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 22 Sep 2022 16:13:57 +0000
-Subject: [PATCH 075/121] vf_deinterlace_m2m: For deinterlace set outlink FR to
+Subject: [PATCH 075/135] vf_deinterlace_m2m: For deinterlace set outlink FR to
  twice inlink
 
 We used to set the outlink framerate to unknown but it turns out that
@@ -27997,7 +27997,7 @@ index 4edecc02bf..c52dae1c44 100644
 From 34a24bc0b0d427c75659d3907cb75afb6a9dc255 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 23 Sep 2022 11:30:56 +0000
-Subject: [PATCH 076/121] v4l2m2m: Add ff_v4l2_dq_all to drain all buffers from
+Subject: [PATCH 076/135] v4l2m2m: Add ff_v4l2_dq_all to drain all buffers from
  a Q
 
 Useful for where (encode) we might have drmprime buffers that we want to
@@ -28055,7 +28055,7 @@ index 21265f1bd7..523c53e97d 100644
 From 95dfc168c74f7b0f282c1b2ad9deb8fba10a7ce5 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 23 Sep 2022 11:38:36 +0000
-Subject: [PATCH 077/121] v4l2_m2m_enc: DQ output more frequently
+Subject: [PATCH 077/135] v4l2_m2m_enc: DQ output more frequently
 
 Ensure that we DQ any released src buffers on every op to avoid deadlock
 with source.
@@ -28114,7 +28114,7 @@ index b8ba815c37..a992a3cccc 100644
 From a40b1c38b0615fce0c0d9eb97510ab9e77b3e1ac Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 26 Sep 2022 18:20:00 +0100
-Subject: [PATCH 078/121] conf_native: Remove --enable-rpi from all builds
+Subject: [PATCH 078/135] conf_native: Remove --enable-rpi from all builds
 
 ---
  pi-util/conf_native.sh | 5 +++--
@@ -28148,7 +28148,7 @@ index 37cea71756..f22d531ca4 100755
 From 8fddfc8f1e3c95caded18705ed29be0ae95517bc Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 29 Sep 2022 19:48:08 +0000
-Subject: [PATCH 079/121] v4l2_m2m_dec: Deal correctly with avcC H264 data in
+Subject: [PATCH 079/135] v4l2_m2m_dec: Deal correctly with avcC H264 data in
  extradata
 
 Decoders expect AnnexB style headers, mkv and similar formats have
@@ -28391,7 +28391,7 @@ index bb183097f6..6bd9926b3f 100644
 From 70227ebbc2999bc49075a3b683392d94618ecd89 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 30 Sep 2022 14:20:23 +0000
-Subject: [PATCH 080/121] v4l2_request_hevc: Fix up
+Subject: [PATCH 080/135] v4l2_request_hevc: Fix up
  V4L2_CID_CODEC_STATELESS_BASE if missing
 
 ---
@@ -28420,7 +28420,7 @@ index 7829d82084..c02fdbe5a8 100644
 From 22d2000382839dbd04588af1bb20cc9d9b3a4362 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Sat, 1 Oct 2022 13:40:57 +0000
-Subject: [PATCH 081/121] vf_deinterlace_v4l2m2m: Fix compile on m/c without
+Subject: [PATCH 081/135] vf_deinterlace_v4l2m2m: Fix compile on m/c without
  V4L2 SAND
 
 ---
@@ -28554,7 +28554,7 @@ index c52dae1c44..716789f988 100644
 From f06f9ee41bf0f6f74240503f0cb427328cf6792f Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Sun, 2 Oct 2022 12:36:43 +0000
-Subject: [PATCH 082/121] configure: Fix v4l2_req_hevc_vx setup; set after deps
+Subject: [PATCH 082/135] configure: Fix v4l2_req_hevc_vx setup; set after deps
  fixups
 
 ---
@@ -28592,7 +28592,7 @@ index 5c00a183e3..94c8161b91 100755
 From 7d7709fb68561711f893269227147974fd6a46f3 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Sat, 1 Oct 2022 12:39:45 +0000
-Subject: [PATCH 083/121] vf_deinterlace_v4l2m2m: Ensure we get consistent
+Subject: [PATCH 083/135] vf_deinterlace_v4l2m2m: Ensure we get consistent
  final frames
 
 On getting EOS at the input of the filster do not simply drop everything
@@ -28944,7 +28944,7 @@ index 716789f988..ce875c2c61 100644
 From f893891df8f4e7738b2d9b49df4386fb160eb25f Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 5 Oct 2022 16:12:02 +0000
-Subject: [PATCH 084/121] v4l2_m2m_dec: Rework decode pending heuristic
+Subject: [PATCH 084/135] v4l2_m2m_dec: Rework decode pending heuristic
 
 The old code measured the length of the entire Q in the decoder and
 attempted to dynamically guess an appropriate length. This was prone to
@@ -29115,7 +29115,7 @@ index 6bd9926b3f..bec9b22fcf 100644
 From 7048e7e6b8621cf09b96cc7e44b8d82ba8619913 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Fri, 21 Oct 2022 13:48:07 +0000
-Subject: [PATCH 085/121] pthread_frame: Fix MT hwaccel. Recent change broke
+Subject: [PATCH 085/135] pthread_frame: Fix MT hwaccel. Recent change broke
  it.
 
 Revert the effects of 35aa7e70e7ec350319e7634a30d8d8aa1e6ecdda if the
@@ -29222,7 +29222,7 @@ index 2cc89a41f5..b14f8e9360 100644
 From 033056bd8ec63b16fe081446f70f41b5d5789b81 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 18 Oct 2022 13:18:27 +0000
-Subject: [PATCH 086/121] v4l2_req: Add swfmt to init logging
+Subject: [PATCH 086/135] v4l2_req: Add swfmt to init logging
 
 (cherry picked from commit dfa03b702baaf2952bcd2bbf8badcc2f9c961ddf)
 ---
@@ -29259,7 +29259,7 @@ index 614a1b4d99..767ecb036a 100644
 From 70779e742b93015e3e8aaa8f945a12d35917844d Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 18 Oct 2022 13:39:54 +0000
-Subject: [PATCH 087/121] v4l2_m2m: Avoid polling on a queue that is streamoff
+Subject: [PATCH 087/135] v4l2_m2m: Avoid polling on a queue that is streamoff
 
 (cherry picked from commit b2658bc56d3034a17db7f39597fc7d71bfe9a43b)
 ---
@@ -29304,7 +29304,7 @@ index 4a359bf45e..b296dc111c 100644
 From 438fed3702eb689f836c885ebbd813e48d4d4c4a Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 18 Oct 2022 14:07:04 +0000
-Subject: [PATCH 088/121] v4l2_m2m: Add function to get number of queued
+Subject: [PATCH 088/135] v4l2_m2m: Add function to get number of queued
  buffers
 
 (cherry picked from commit f9ac6485c00b4531dcff354222aef450b29728f4)
@@ -29336,7 +29336,7 @@ index 523c53e97d..8e4f681643 100644
 From 95ff4a65ed4c88ea7e02ee55e260e37a0ce2ba88 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 18 Oct 2022 14:48:20 +0000
-Subject: [PATCH 089/121] v4l2_m2m: Add timeouts to dq_all and dequeue_packet
+Subject: [PATCH 089/135] v4l2_m2m: Add timeouts to dq_all and dequeue_packet
 
 Add timeouts and use them to have better flow control in encode
 
@@ -29505,7 +29505,7 @@ index a992a3cccc..d0d27e5bc2 100644
 From e6654c1997a6f4dfd43b0f74b0168f5d644c1c74 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 18 Oct 2022 14:23:32 +0000
-Subject: [PATCH 090/121] v4l2_m2m_enc: Improve debug trace
+Subject: [PATCH 090/135] v4l2_m2m_enc: Improve debug trace
 
 (cherry picked from commit 113e89daffb329a0cd3d920abd483a4025664bf5)
 ---
@@ -29565,7 +29565,7 @@ index d0d27e5bc2..c8c2de3d47 100644
 From 02dca2b845125af7ec6dfb68bdc34726a45fee9c Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 18 Oct 2022 13:22:36 +0000
-Subject: [PATCH 091/121] v4l2_m2m_enc: Copy dest packets to memory if short of
+Subject: [PATCH 091/135] v4l2_m2m_enc: Copy dest packets to memory if short of
  v4l2 buffers
 
 (cherry picked from commit aa4ebbda400b42db952fc713b26927fc8636b0e5)
@@ -29604,7 +29604,7 @@ index c8c2de3d47..c23187e6e6 100644
 From ced9a7d442a04be08fc23e0af310312299a5d5a0 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 19 Oct 2022 11:00:16 +0000
-Subject: [PATCH 092/121] v4l2_m2m_dec: Fix pts_best_effort guessing for
+Subject: [PATCH 092/135] v4l2_m2m_dec: Fix pts_best_effort guessing for
  initial pts
 
 (cherry picked from commit 1af32e5c87586a0f7e76cdf19a012ddbcf3eac67)
@@ -29629,7 +29629,7 @@ index bec9b22fcf..47b2735f82 100644
 From 3e3cf6ed7280d8ad4f3eed17a6d18c2df3c0cd31 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 19 Oct 2022 14:47:04 +0000
-Subject: [PATCH 093/121] v4l2_m2m_enc: Wait for frame or space in src Q in
+Subject: [PATCH 093/135] v4l2_m2m_enc: Wait for frame or space in src Q in
  rx_pkt
 
 If receive_packet we should ensure that there is space in the source Q
@@ -29691,7 +29691,7 @@ index c23187e6e6..524e9424a5 100644
 From de9ec2bf6421b199aad9ea9dc7896a46c8813d94 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 19 Oct 2022 14:54:29 +0000
-Subject: [PATCH 094/121] vf_deinterlace_v4l2m2m: Print dts rather that NOPTS
+Subject: [PATCH 094/135] vf_deinterlace_v4l2m2m: Print dts rather that NOPTS
  in trace
 
 (cherry picked from commit e9b468f35f0c6ad9bfe96f5a05e449afa8ae074a)
@@ -29718,7 +29718,7 @@ index ce875c2c61..7c6751b69c 100644
 From d71a0a173240e18d518ae0b921ac43849524bd66 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 19 Oct 2022 14:55:21 +0000
-Subject: [PATCH 095/121] vf_deinterlace_v4l2m2m: Ignore "wanted" when
+Subject: [PATCH 095/135] vf_deinterlace_v4l2m2m: Ignore "wanted" when
  processing input
 
 If we gate send a frame to the outlink on its frame_wanted flag then we
@@ -29751,7 +29751,7 @@ index 7c6751b69c..a173a291f8 100644
 From 842e0a00288f9a2a862720990791b8eca9546955 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 19 Oct 2022 15:00:43 +0000
-Subject: [PATCH 096/121] conf_native: Add --enable-gpl
+Subject: [PATCH 096/135] conf_native: Add --enable-gpl
 
 (cherry picked from commit bab9bf4a2e39391940d88af2ce5d70236ac21f15)
 ---
@@ -29774,7 +29774,7 @@ index f22d531ca4..082d9b5832 100755
 From bf9aaf30818308a4651e00a2a64a0f65dc9a36e5 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 15 Nov 2022 13:33:00 +0000
-Subject: [PATCH 097/121] egl_vout: Make formatting consistent - no code
+Subject: [PATCH 097/135] egl_vout: Make formatting consistent - no code
  changes
 
 ---
@@ -30758,7 +30758,7 @@ index 7b9c610ace..a52cabb082 100644
 From 4d3a3973a07994b0a6ec35626e514fc40f439fe3 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 12 Dec 2022 16:49:43 +0000
-Subject: [PATCH 098/121] v4l2m2m: reporganise get_raw_format for loop logic
+Subject: [PATCH 098/135] v4l2m2m: reporganise get_raw_format for loop logic
 
 ---
  libavcodec/v4l2_context.c | 16 +++++-----------
@@ -30806,7 +30806,7 @@ index 7031f3d340..79a31cf930 100644
 From 123c5ef429ec6bd7d1875d621df88bb2ad7af0bd Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 12 Dec 2022 17:49:12 +0000
-Subject: [PATCH 099/121] drm_vout: Set zpos on the plane we pick to ensure it
+Subject: [PATCH 099/135] drm_vout: Set zpos on the plane we pick to ensure it
  is at the front
 
 ---
@@ -30876,7 +30876,7 @@ index cfb33ce7c3..9bd9e04421 100644
 From 0ee1c3b41774d05595376f8d25de2a901dbb12c7 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 12 Dec 2022 17:51:46 +0000
-Subject: [PATCH 100/121] drm_vout: Only set modifier flag and pass modifiers
+Subject: [PATCH 100/135] drm_vout: Only set modifier flag and pass modifiers
  if there are some
 
 ---
@@ -30936,7 +30936,7 @@ index 9bd9e04421..a56adea866 100644
 From 4534e6981c1718eaeec4c5f58cdf5592ee7f0329 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 12 Dec 2022 17:52:58 +0000
-Subject: [PATCH 101/121] drm_vout: Fix typo in error message
+Subject: [PATCH 101/135] drm_vout: Fix typo in error message
 
 ---
  libavdevice/drm_vout.c | 2 +-
@@ -30959,7 +30959,7 @@ index a56adea866..351abf1d60 100644
 From 0469d1fb132a0d55593611c56e83733efe58045b Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 12 Dec 2022 18:00:41 +0000
-Subject: [PATCH 102/121] drm_vout: Add option to name the drm_module to use
+Subject: [PATCH 102/135] drm_vout: Add option to name the drm_module to use
 
 ---
  libavdevice/drm_vout.c | 8 +++++---
@@ -31012,7 +31012,7 @@ index 351abf1d60..491e1dc608 100644
 From 61cb9fc3ce06e0ecaeeec3add143bc3a82956853 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 13:01:00 +0000
-Subject: [PATCH 103/121] dmabufs: Rework to allow for non-CMA backends
+Subject: [PATCH 103/135] dmabufs: Rework to allow for non-CMA backends
 
 ---
  libavcodec/v4l2_req_dmabufs.c | 161 ++++++++++++++++++++++++----------
@@ -31266,7 +31266,7 @@ index c4bbed18c6..1c3a5e861f 100644
 From 288807720443bbddf4c83c3589d1877c7fd418c3 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 13:07:58 +0000
-Subject: [PATCH 104/121] dmabufs: Use unref rather than deleet on cmabufs_ctl
+Subject: [PATCH 104/135] dmabufs: Use unref rather than deleet on cmabufs_ctl
 
 ---
  libavcodec/v4l2_req_dmabufs.c  | 12 +++++++++++-
@@ -31354,7 +31354,7 @@ index 767ecb036a..db7ed13b6d 100644
 From 9115f40c5f55873102312085f2e328d1a2101ae4 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 14:21:40 +0000
-Subject: [PATCH 105/121] egl_vout: Remove redundant & completely broken debug
+Subject: [PATCH 105/135] egl_vout: Remove redundant & completely broken debug
 
 ---
  libavdevice/egl_vout.c | 25 -------------------------
@@ -31400,7 +31400,7 @@ index a52cabb082..afc7afd13e 100644
 From 34711d5a1429213b6f4cf8ad163e8e8d108626e7 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 16:12:12 +0000
-Subject: [PATCH 106/121] v4l2m2m: Use offset from querybuf rather than always
+Subject: [PATCH 106/135] v4l2m2m: Use offset from querybuf rather than always
  0
 
 ---
@@ -31455,7 +31455,7 @@ index 1ac32c5989..d91d5d1dd0 100644
 From 15458be3fe79c14f4fdcc2ad786508d1b647c914 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 17:57:27 +0000
-Subject: [PATCH 107/121] v4l2m2m: Fix crash if init errors out before setting
+Subject: [PATCH 107/135] v4l2m2m: Fix crash if init errors out before setting
  avctx
 
 ---
@@ -31479,7 +31479,7 @@ index 1e30d15fd8..ac6bae0dc3 100644
 From 9f7f94c680b8aaedede9b3bcad37b645216cfcff Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 18:10:30 +0000
-Subject: [PATCH 108/121] v4l2_buffers: Add and use ctx_to_m2mctx + error debug
+Subject: [PATCH 108/135] v4l2_buffers: Add and use ctx_to_m2mctx + error debug
 
 ---
  libavcodec/v4l2_buffers.c | 22 +++++++++++++++-------
@@ -31546,7 +31546,7 @@ index 5ca58ea593..e28ef2d1e8 100644
 From 6b8bb2c41828351cd3a6f40be353696ae36450b7 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 18:53:22 +0000
-Subject: [PATCH 109/121] v4l2m2m: Add ability to use cma alloced dmabufs as
+Subject: [PATCH 109/135] v4l2m2m: Add ability to use cma alloced dmabufs as
  well as v4l2 mmap
 
 ---
@@ -31807,7 +31807,7 @@ index 47b2735f82..4d17057298 100644
 From 499bcdc4ed82c737ceab166a07b46e8ed8ccbc88 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 13 Dec 2022 19:05:47 +0000
-Subject: [PATCH 110/121] testfilt: Skeleton of hw filter test code
+Subject: [PATCH 110/135] testfilt: Skeleton of hw filter test code
 
 ---
  pi-util/testfilt.py | 83 +++++++++++++++++++++++++++++++++++++++++++++
@@ -31907,7 +31907,7 @@ index 0000000000..b322dac0c2
 From 50ac318a472fd98e1e58605316ea6a2e8cde0a04 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Thu, 5 Jan 2023 14:39:30 +0000
-Subject: [PATCH 111/121] pixfmt: Add a #define to indicate presence of SAND
+Subject: [PATCH 111/135] pixfmt: Add a #define to indicate presence of SAND
  formats
 
 ---
@@ -31931,7 +31931,7 @@ index 22f70007c3..5cc780e7d5 100644
 From 23a3132e094d449ea05657704c0cffc3f0762c28 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 11 Jan 2023 16:30:37 +0000
-Subject: [PATCH 112/121] v4l2_m2m_dec: Fix initial pkt send if no extradata
+Subject: [PATCH 112/135] v4l2_m2m_dec: Fix initial pkt send if no extradata
 
 ---
  libavcodec/v4l2_m2m_dec.c | 4 ++--
@@ -31963,7 +31963,7 @@ index 4d17057298..9daf05adfe 100644
 From f4f6b9f1af137153e574c704804033e83f2ed1a8 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 16 Jan 2023 16:05:09 +0000
-Subject: [PATCH 113/121] v4l2m2m_dec: Make capture timeout long once pending
+Subject: [PATCH 113/135] v4l2m2m_dec: Make capture timeout long once pending
  count > 31
 
 For some applications (ffmpeg command line) the current heuristic of adding
@@ -32060,7 +32060,7 @@ index 9daf05adfe..c8ab883d7e 100644
 From 39f49cdaefa4483914f703c3f352c8894b3b81fd Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 6 Feb 2023 19:23:16 +0000
-Subject: [PATCH 114/121] Initial buffersink alloc callback code
+Subject: [PATCH 114/135] Initial buffersink alloc callback code
 
 (cherry picked from commit dde8d3c8f3cc279b9b92ed4f10a2e3990f4aadeb)
 ---
@@ -32155,7 +32155,7 @@ index 64e08de53e..09737d322f 100644
 From a63ae21e74ae48f1aedac53c18142b7596d041ad Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 30 Jan 2023 17:23:12 +0000
-Subject: [PATCH 115/121] v4l2_m2m_dec: Add a profile check
+Subject: [PATCH 115/135] v4l2_m2m_dec: Add a profile check
 
 Check the profile in avctx aginst what the v4l2 driver advertises. If
 the driver doesn't support the check then just accept anything.
@@ -32312,7 +32312,7 @@ index c8ab883d7e..098adf4821 100644
 From f734a6ead04a8381fccfae53066866a02a9516d2 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 1 Feb 2023 17:24:39 +0000
-Subject: [PATCH 116/121] v4l2_m2m_dec: Add extradata parse for h264 & hevc
+Subject: [PATCH 116/135] v4l2_m2m_dec: Add extradata parse for h264 & hevc
 
 If we have extradata we can extract profile & level and potentailly
 other useful info from it. Use the codec parser to get it if the decoder
@@ -32443,7 +32443,7 @@ index 098adf4821..e64bc707d3 100644
 From e28421e397743a94f5e37327ad234f59b6ae613d Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 20 Mar 2023 18:12:51 +0000
-Subject: [PATCH 117/121] clean_usr_libs: Now wipes the include files too
+Subject: [PATCH 117/135] clean_usr_libs: Now wipes the include files too
 
 When swapping ffmpeg versions obsolete makefiles could confuse
 configure utilities.
@@ -32480,7 +32480,7 @@ index b3b2d5509d..01bd6a6a22 100755
 From dcabd30310b88b45359609bac27d5d0f9bbc6dc1 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Mon, 20 Mar 2023 18:15:08 +0000
-Subject: [PATCH 118/121] vulkan: Add missing decode extension defines
+Subject: [PATCH 118/135] vulkan: Add missing decode extension defines
 
 When building on bookworm the video decode extension names
 were missing. This adds them. I expect this patch will be
@@ -32512,7 +32512,7 @@ index 2a9b5f4aac..11e7945f18 100644
 From 0231c208843a5badc799590eb5b9de907d1c26b2 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 21 Mar 2023 14:20:05 +0000
-Subject: [PATCH 119/121] v4l2_m2m_dec: Fix config file for finding if decoder
+Subject: [PATCH 119/135] v4l2_m2m_dec: Fix config file for finding if decoder
  enabled
 
 Fixes parsing of extradata for profile testing. 5.x changed where that
@@ -32538,7 +32538,7 @@ index e64bc707d3..91136f03da 100644
 From 822baefed69372b3380144ab44226e2c6ad3e298 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Tue, 21 Mar 2023 14:23:20 +0000
-Subject: [PATCH 120/121] v4l2_m2m_dec: Display profile given if skipped in
+Subject: [PATCH 120/135] v4l2_m2m_dec: Display profile given if skipped in
  debug
 
 ---
@@ -32562,7 +32562,7 @@ index 91136f03da..d124c7b1fc 100644
 From 6859fc2a8791c0fcc25851b77fed15a691ceb332 Mon Sep 17 00:00:00 2001
 From: John Cox <jc@kynesim.co.uk>
 Date: Wed, 22 Mar 2023 16:08:08 +0000
-Subject: [PATCH 121/121] conf_native: Fix for 64-bit kernel with 32-bit
+Subject: [PATCH 121/135] conf_native: Fix for 64-bit kernel with 32-bit
  userspace
 
 (cherry picked from commit 5bb1e09cea95b4215c6904b9b1a726e83bc5d327)
@@ -32614,3 +32614,2318 @@ index 082d9b5832..0a7d230f1b 100755
    exit 1
  fi
  
+
+From c35f074854a922c0c025159ddddd1abfc562a3d2 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Thu, 20 Apr 2023 11:48:25 +0000
+Subject: [PATCH 122/135] conf_native: Add install prefix variation
+
+(cherry picked from commit 73c3019b534cb8f4b4e4c21995653f6ce440086d)
+---
+ pi-util/BUILD.txt      | 32 ++++++++++++++++++++------------
+ pi-util/conf_native.sh | 14 ++++++++++++--
+ 2 files changed, 32 insertions(+), 14 deletions(-)
+
+diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
+index b050971f63..2b62d660c0 100644
+--- a/pi-util/BUILD.txt
++++ b/pi-util/BUILD.txt
+@@ -24,6 +24,8 @@ There are a few choices here
+          paths being confused and therefore running the wrong code,  Shared
+          is what is needed, in most cases, when building for use by other
+          programs.
++ --usr   Set install dir to /usr (i.e. system default) rather than in
++         <builddir>/install
+ 
+ So for a static build
+ ---------------------
+@@ -37,23 +39,29 @@ You can now run ffmpeg directly from where it was built
+ For a shared build
+ ------------------
+ 
+-$ pi-util/conf_native.sh
+-
+-You will normally want an install target if shared. Note that the script has
+-set this up to be generated in out/<builddir>/install, you don't have to worry
+-about overwriting your system libs.
++There are two choices here
+ 
++$ pi-util/conf_native.sh
+ $ make -j8 -C out/<builddir> install
+ 
+-You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
+-built or install the image on the system - you have to be careful to get rid
+-of all other ffmpeg libs or confusion may result.  There is a little script
+-that wipes all other versions - obviously use with care!
++This sets the install prefix to <builddir>/install and is probably what you
++want if you don't want to overwrite the system files.
+ 
+-$ sudo pi-util/clean_usr_libs.sh
++You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
++built. You can copy the contents of <builddir>/install to /usr and that mostly
++works. The only downside is that paths in pkgconfig end up being set to the
++install directory in your build directory which may be less than ideal when
++building other packages.
+ 
+-Then simply copying from the install to /usr works
++The alternative if you just want to replace the system libs is:
+ 
+-$ sudo cp -r out/<builddir>/install/* /usr
++$ pi-util/conf_native.sh --usr
++$ make -j8 -C out/<builddir>
++$ sudo pi-util/clean_usr_libs.sh
++$ sudo make -j8 -C out/<builddir> install
+ 
++The clean_usr_libs.sh step wipes any existing libs & includes (for all
++architectures) from the system which helps avoid confusion when running other
++progs as you can be sure you're not running old code which is unfortunately
++easy to do otherwise.
+ 
+diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
+index 0a7d230f1b..f0ed159594 100755
+--- a/pi-util/conf_native.sh
++++ b/pi-util/conf_native.sh
+@@ -9,6 +9,7 @@ RPI_KEEPS=""
+ 
+ NOSHARED=
+ MMAL=
++USR_PREFIX=
+ 
+ while [ "$1" != "" ] ; do
+     case $1 in
+@@ -18,8 +19,14 @@ while [ "$1" != "" ] ; do
+ 	--mmal)
+ 	    MMAL=1
+ 	    ;;
++	--usr)
++	    USR_PREFIX=/usr
++	    ;;
+ 	*)
+-	    echo "Usage $0: [--noshared] [--mmal]"
++	    echo "Usage $0: [--noshared] [--mmal] [--usr]"
++	    echo "  noshared  Build static libs and executable - good for testing"
++	    echo "  mmal      Build mmal decoders"
++	    echo "  usr       Set install prefix to /usr [default=<build-dir>/install]"
+ 	    exit 1
+ 	    ;;
+     esac
+@@ -82,7 +89,9 @@ else
+   OUT=$BUILDBASE/$B-$C-$V-shared-rel
+ fi
+ 
+-USR_PREFIX=$OUT/install
++if [ ! $USR_PREFIX ]; then
++  USR_PREFIX=$OUT/install
++fi
+ LIB_PREFIX=$USR_PREFIX/lib/$A
+ INC_PREFIX=$USR_PREFIX/include/$A
+ 
+@@ -113,6 +122,7 @@ $FFSRC/configure \
+  --extra-libs="$RPI_EXTRALIBS"\
+  --extra-version="rpi"
+ 
++echo "Configured into $OUT"
+ 
+ # gcc option for getting asm listing
+ # -Wa,-ahls
+
+From 91ea652a95370a428f1353932b2a55dae7158acc Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Wed, 19 Apr 2023 10:47:58 +0000
+Subject: [PATCH 123/135] swscale: Add explicit bgr24->yv12 conversion
+
+(cherry picked from commit 9a22d429f46a038321c66a0cd54737177641b434)
+---
+ libswscale/rgb2rgb.c          |  5 +++++
+ libswscale/rgb2rgb.h          |  7 +++++++
+ libswscale/rgb2rgb_template.c | 36 ++++++++++++++++++++++++++++++-----
+ libswscale/swscale_unscaled.c | 22 +++++++++++++++++++++
+ 4 files changed, 65 insertions(+), 5 deletions(-)
+
+diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
+index e98fdac8ea..84bb56e60e 100644
+--- a/libswscale/rgb2rgb.c
++++ b/libswscale/rgb2rgb.c
+@@ -83,6 +83,11 @@ void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst,
+                        int width, int height,
+                        int lumStride, int chromStride, int srcStride,
+                        int32_t *rgb2yuv);
++void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst,
++                       uint8_t *udst, uint8_t *vdst,
++                       int width, int height,
++                       int lumStride, int chromStride, int srcStride,
++                       int32_t *rgb2yuv);
+ void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
+                  int srcStride, int dstStride);
+ void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
+diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
+index f3951d523e..0028ab345f 100644
+--- a/libswscale/rgb2rgb.h
++++ b/libswscale/rgb2rgb.h
+@@ -79,6 +79,9 @@ void    rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
+ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                       uint8_t *vdst, int width, int height, int lumStride,
+                       int chromStride, int srcStride, int32_t *rgb2yuv);
++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                      uint8_t *vdst, int width, int height, int lumStride,
++                      int chromStride, int srcStride, int32_t *rgb2yuv);
+ 
+ /**
+  * Height should be a multiple of 2 and width should be a multiple of 16.
+@@ -128,6 +131,10 @@ extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                               int width, int height,
+                               int lumStride, int chromStride, int srcStride,
+                               int32_t *rgb2yuv);
++extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                              int width, int height,
++                              int lumStride, int chromStride, int srcStride,
++                              int32_t *rgb2yuv);
+ extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
+                         int srcStride, int dstStride);
+ 
+diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
+index 42c69801ba..e2437826dd 100644
+--- a/libswscale/rgb2rgb_template.c
++++ b/libswscale/rgb2rgb_template.c
+@@ -646,13 +646,14 @@ static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst,
+  * others are ignored in the C version.
+  * FIXME: Write HQ version.
+  */
+-void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                    uint8_t *vdst, int width, int height, int lumStride,
+-                   int chromStride, int srcStride, int32_t *rgb2yuv)
++                   int chromStride, int srcStride, int32_t *rgb2yuv,
++                   const uint8_t x[9])
+ {
+-    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+-    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+-    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
++    int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]];
++    int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]];
++    int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]];
+     int y;
+     const int chromWidth = width >> 1;
+ 
+@@ -707,6 +708,30 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+     }
+ }
+ 
++void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{ /* Thin entry point: forwards every argument to rgb24toyv12_x() with the index table below. */
++    static const uint8_t x[9] = { /* rgb2yuv[] indices; rows feed the Y, U and V coefficients */
++        RY_IDX, GY_IDX, BY_IDX,
++        RU_IDX, GU_IDX, BU_IDX,
++        RV_IDX, GV_IDX, BV_IDX,
++    };
++    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x);
++}
++
++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{ /* Same as ff_rgb24toyv12_c() except that the R and B indices are swapped in each table row. */
++    static const uint8_t x[9] = { /* rgb2yuv[] indices; rows feed the Y, U and V coefficients */
++         BY_IDX, GY_IDX, RY_IDX,
++         BU_IDX, GU_IDX, RU_IDX,
++         BV_IDX, GV_IDX, RV_IDX,
++    };
++    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x);
++}
++
+ static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
+                               uint8_t *dest, int width, int height,
+                               int src1Stride, int src2Stride, int dstStride)
+@@ -980,6 +1005,7 @@ static av_cold void rgb2rgb_init_c(void)
+     yuy2toyv12         = yuy2toyv12_c;
+     planar2x           = planar2x_c;
+     ff_rgb24toyv12     = ff_rgb24toyv12_c;
++    ff_bgr24toyv12     = ff_bgr24toyv12_c;
+     interleaveBytes    = interleaveBytes_c;
+     deinterleaveBytes  = deinterleaveBytes_c;
+     vu9_to_vu12        = vu9_to_vu12_c;
+diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
+index 9af2e7ecc3..9047030ae4 100644
+--- a/libswscale/swscale_unscaled.c
++++ b/libswscale/swscale_unscaled.c
+@@ -1654,6 +1654,23 @@ static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+     return srcSliceH;
+ }
+ 
++static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                              int srcStride[], int srcSliceY, int srcSliceH,
++                              uint8_t *dst[], int dstStride[])
++{ /* Unscaled slice converter installed by ff_get_unscaled_swscale() for AV_PIX_FMT_RGB24 sources. */
++    ff_bgr24toyv12( /* NOTE(review): RGB24 input goes through the "bgr24" pointer; looks inverted, confirm intended */
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0], /* first output row of plane 0 for this slice */
++        dst[1] + (srcSliceY >> 1) * dstStride[1], /* planes 1 and 2 advance one row per two source rows */
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3]) /* fourth plane present (presumably alpha for YUVA420P): fill it with 255 */
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
+ static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+                              int srcStride[], int srcSliceY, int srcSliceH,
+                              uint8_t *dst[], int dstStride[])
+@@ -2037,6 +2054,11 @@ void ff_get_unscaled_swscale(SwsContext *c)
+         (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
+         !(flags & SWS_ACCURATE_RND) && !(dstW&1))
+         c->convert_unscaled = bgr24ToYv12Wrapper;
++    /* rgb24toYV12 */
++    if (srcFormat == AV_PIX_FMT_RGB24 &&
++        (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
++        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
++        c->convert_unscaled = rgb24ToYv12Wrapper;
+ 
+     /* RGB/BGR -> RGB/BGR (no dither needed forms) */
+     if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c)
+
+From 207ea47b2153b276b53cd5a87528dbc532a9f551 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Thu, 20 Apr 2023 11:26:10 +0000
+Subject: [PATCH 124/135] swscale: Add unscaled XRGB->YUV420P functions
+
+(cherry picked from commit 04cc32ee3f390de513ad8c6156c0c66b2c60abc8)
+---
+ libswscale/rgb2rgb.c          |  20 ++++++
+ libswscale/rgb2rgb.h          |  16 +++++
+ libswscale/rgb2rgb_template.c | 123 ++++++++++++++++++++++++++++++----
+ libswscale/swscale_unscaled.c |  89 ++++++++++++++++++++++++
+ 4 files changed, 236 insertions(+), 12 deletions(-)
+
+diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
+index 84bb56e60e..c3b9079d2b 100644
+--- a/libswscale/rgb2rgb.c
++++ b/libswscale/rgb2rgb.c
+@@ -88,6 +88,26 @@ void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst,
+                        int width, int height,
+                        int lumStride, int chromStride, int srcStride,
+                        int32_t *rgb2yuv);
++void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst,
++					  uint8_t *udst, uint8_t *vdst,
++					  int width, int height,
++					  int lumStride, int chromStride, int srcStride,
++					  int32_t *rgb2yuv);
++void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst,
++					  uint8_t *udst, uint8_t *vdst,
++					  int width, int height,
++					  int lumStride, int chromStride, int srcStride,
++					  int32_t *rgb2yuv);
++void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst,
++					  uint8_t *udst, uint8_t *vdst,
++					  int width, int height,
++					  int lumStride, int chromStride, int srcStride,
++					  int32_t *rgb2yuv);
++void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst,
++					  uint8_t *udst, uint8_t *vdst,
++					  int width, int height,
++					  int lumStride, int chromStride, int srcStride,
++					  int32_t *rgb2yuv);
+ void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
+                  int srcStride, int dstStride);
+ void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
+diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
+index 0028ab345f..a0dd3ffb79 100644
+--- a/libswscale/rgb2rgb.h
++++ b/libswscale/rgb2rgb.h
+@@ -135,6 +135,22 @@ extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                               int width, int height,
+                               int lumStride, int chromStride, int srcStride,
+                               int32_t *rgb2yuv);
++extern void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                             int width, int height,
++                             int lumStride, int chromStride, int srcStride,
++                             int32_t *rgb2yuv);
++extern void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                             int width, int height,
++                             int lumStride, int chromStride, int srcStride,
++                             int32_t *rgb2yuv);
++extern void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                             int width, int height,
++                             int lumStride, int chromStride, int srcStride,
++                             int32_t *rgb2yuv);
++extern void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                             int width, int height,
++                             int lumStride, int chromStride, int srcStride,
++                             int32_t *rgb2yuv);
+ extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
+                         int srcStride, int dstStride);
+ 
+diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
+index e2437826dd..703de90690 100644
+--- a/libswscale/rgb2rgb_template.c
++++ b/libswscale/rgb2rgb_template.c
+@@ -708,30 +708,125 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+     }
+ }
+ 
++static const uint8_t x_rgb[9] = {
++    RY_IDX, GY_IDX, BY_IDX,
++    RU_IDX, GU_IDX, BU_IDX,
++    RV_IDX, GV_IDX, BV_IDX,
++};
++
++static const uint8_t x_bgr[9] = {
++     BY_IDX, GY_IDX, RY_IDX,
++     BU_IDX, GU_IDX, RU_IDX,
++     BV_IDX, GV_IDX, RV_IDX,
++};
++
+ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                    uint8_t *vdst, int width, int height, int lumStride,
+                    int chromStride, int srcStride, int32_t *rgb2yuv)
+ {
+-    static const uint8_t x[9] = {
+-        RY_IDX, GY_IDX, BY_IDX,
+-        RU_IDX, GU_IDX, BU_IDX,
+-        RV_IDX, GV_IDX, BV_IDX,
+-    };
+-    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x);
++    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
+ }
+ 
+ void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                    uint8_t *vdst, int width, int height, int lumStride,
+                    int chromStride, int srcStride, int32_t *rgb2yuv)
+ {
+-    static const uint8_t x[9] = {
+-         BY_IDX, GY_IDX, RY_IDX,
+-         BU_IDX, GU_IDX, RU_IDX,
+-         BV_IDX, GV_IDX, RV_IDX,
+-    };
+-    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x);
++    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
+ }
+ 
++static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv,
++                   const uint8_t x[9])
++{
++    int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]];
++    int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]];
++    int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]];
++    int y;
++    const int chromWidth = width >> 1;
++
++    for (y = 0; y < height; y += 2) {
++        int i;
++        for (i = 0; i < chromWidth; i++) {
++            unsigned int b = src[8 * i + 2];
++            unsigned int g = src[8 * i + 1];
++            unsigned int r = src[8 * i + 0];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) +  16;
++            unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
++            unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
++
++            udst[i]     = U;
++            vdst[i]     = V;
++            ydst[2 * i] = Y;
++
++            b = src[8 * i + 6];
++            g = src[8 * i + 5];
++            r = src[8 * i + 4];
++
++            Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++            ydst[2 * i + 1] = Y;
++        }
++        ydst += lumStride;
++        src  += srcStride;
++
++        if (y+1 == height)
++            break;
++
++        for (i = 0; i < chromWidth; i++) {
++            unsigned int b = src[8 * i + 2];
++            unsigned int g = src[8 * i + 1];
++            unsigned int r = src[8 * i + 0];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++
++            ydst[2 * i] = Y;
++
++            b = src[8 * i + 6];
++            g = src[8 * i + 5];
++            r = src[8 * i + 4];
++
++            Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++            ydst[2 * i + 1] = Y;
++        }
++        udst += chromStride;
++        vdst += chromStride;
++        ydst += lumStride;
++        src  += srcStride;
++    }
++}
++
++static void ff_rgbxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
++}
++
++static void ff_bgrxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
++}
++
++// As the general code does no SIMD-like ops simply adding 1 to the src address
++// will fix the ignored alpha position
++static void ff_xrgbtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
++}
++
++static void ff_xbgrtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
++}
++
++
+ static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
+                               uint8_t *dest, int width, int height,
+                               int src1Stride, int src2Stride, int dstStride)
+@@ -1006,6 +1101,10 @@ static av_cold void rgb2rgb_init_c(void)
+     planar2x           = planar2x_c;
+     ff_rgb24toyv12     = ff_rgb24toyv12_c;
+     ff_bgr24toyv12     = ff_bgr24toyv12_c;
++    ff_rgbxtoyv12      = ff_rgbxtoyv12_c;
++    ff_bgrxtoyv12      = ff_bgrxtoyv12_c;
++    ff_xrgbtoyv12      = ff_xrgbtoyv12_c;
++    ff_xbgrtoyv12      = ff_xbgrtoyv12_c;
+     interleaveBytes    = interleaveBytes_c;
+     deinterleaveBytes  = deinterleaveBytes_c;
+     vu9_to_vu12        = vu9_to_vu12_c;
+diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
+index 9047030ae4..053c06adf5 100644
+--- a/libswscale/swscale_unscaled.c
++++ b/libswscale/swscale_unscaled.c
+@@ -1671,6 +1671,74 @@ static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+     return srcSliceH;
+ }
+ 
++static int bgrxToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                             int srcStride[], int srcSliceY, int srcSliceH,
++                             uint8_t *dst[], int dstStride[])
++{
++    ff_bgrxtoyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
++static int rgbxToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                             int srcStride[], int srcSliceY, int srcSliceH,
++                             uint8_t *dst[], int dstStride[])
++{
++    ff_rgbxtoyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
++static int xbgrToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                             int srcStride[], int srcSliceY, int srcSliceH,
++                             uint8_t *dst[], int dstStride[])
++{
++    ff_xbgrtoyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
++static int xrgbToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                             int srcStride[], int srcSliceY, int srcSliceH,
++                             uint8_t *dst[], int dstStride[])
++{
++    ff_xrgbtoyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
+ static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+                              int srcStride[], int srcSliceY, int srcSliceH,
+                              uint8_t *dst[], int dstStride[])
+@@ -2060,6 +2128,27 @@ void ff_get_unscaled_swscale(SwsContext *c)
+         !(flags & SWS_ACCURATE_RND) && !(dstW&1))
+         c->convert_unscaled = rgb24ToYv12Wrapper;
+ 
++    /* bgrxtoYV12 */
++    if (((srcFormat == AV_PIX_FMT_BGRA && dstFormat == AV_PIX_FMT_YUV420P) ||
++         (srcFormat == AV_PIX_FMT_BGR0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++        !(flags & SWS_ACCURATE_RND))
++        c->convert_unscaled = bgrxToYv12Wrapper;
++    /* rgbx24toYV12 */
++    if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) ||
++         (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
++        c->convert_unscaled = rgbxToYv12Wrapper;
++    /* xbgrtoYV12 */
++    if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) ||
++         (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
++        c->convert_unscaled = xbgrToYv12Wrapper;
++    /* xrgb24toYV12 */
++    if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) ||
++         (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
++        c->convert_unscaled = xrgbToYv12Wrapper;
++
+     /* RGB/BGR -> RGB/BGR (no dither needed forms) */
+     if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c)
+         && (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT))))
+
+From b5672a2d361ec4f064ae116a3452282996cc87a0 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Thu, 20 Apr 2023 11:35:44 +0000
+Subject: [PATCH 125/135] swscale: Add aarch64 unscaled RGB24->YUV420P
+
+(cherry picked from commit 0cf416312095ce5bea3d2f7e9b14736d4b3ed160)
+---
+ libswscale/aarch64/rgb2rgb.c      |  40 +++++++
+ libswscale/aarch64/rgb2rgb_neon.S | 181 ++++++++++++++++++++++++++++++
+ 2 files changed, 221 insertions(+)
+
+diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
+index a9bf6ff9e0..6d3e0000dc 100644
+--- a/libswscale/aarch64/rgb2rgb.c
++++ b/libswscale/aarch64/rgb2rgb.c
+@@ -30,6 +30,44 @@
+ void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
+                               uint8_t *dest, int width, int height,
+                               int src1Stride, int src2Stride, int dstStride);
++void ff_bgr24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv);
++void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv);
++
++// RGB to YUV asm fns process 16 pixels at once so ensure that the output
++// will fit into the stride. ARM64 should cope with unaligned SIMD r/w so
++// don't test for that
++// Fall back to C if we cannot use asm
++
++static inline int chkw(const int width, const int lumStride, const int chromStride)
++{
++    const int aw = FFALIGN(width, 16);
++    return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2;
++}
++
++static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    if (chkw(width, lumStride, chromStride))
++        ff_rgb24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv);
++    else
++        ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv);
++}
++
++static void bgr24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *bgr2yuv)
++{
++    if (chkw(width, lumStride, chromStride))
++        ff_bgr24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv);
++    else
++        ff_bgr24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv);
++}
++
+ 
+ av_cold void rgb2rgb_init_aarch64(void)
+ {
+@@ -37,5 +75,7 @@ av_cold void rgb2rgb_init_aarch64(void)
+ 
+     if (have_neon(cpu_flags)) {
+         interleaveBytes = ff_interleave_bytes_neon;
++        ff_rgb24toyv12 = rgb24toyv12_check;
++        ff_bgr24toyv12 = bgr24toyv12_check;
+     }
+ }
+diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
+index d81110ec57..8cf40b65f5 100644
+--- a/libswscale/aarch64/rgb2rgb_neon.S
++++ b/libswscale/aarch64/rgb2rgb_neon.S
+@@ -77,3 +77,184 @@ function ff_interleave_bytes_neon, export=1
+ 0:
+         ret
+ endfunc
++
++// void ff_rgb24toyv12_aarch64(
++//              const uint8_t *src,             // x0
++//              uint8_t *ydst,                  // x1
++//              uint8_t *udst,                  // x2
++//              uint8_t *vdst,                  // x3
++//              int width,                      // w4
++//              int height,                     // w5
++//              int lumStride,                  // w6
++//              int chromStride,                // w7
++//              int srcStr,                     // [sp, #0]
++//              int32_t *rgb2yuv);              // [sp, #8]
++
++function ff_rgb24toyv12_aarch64, export=1
++        ldr             x15, [sp, #8]
++        ld1             {v3.s}[2], [x15], #4
++        ld1             {v3.s}[1], [x15], #4
++        ld1             {v3.s}[0], [x15], #4
++        ld1             {v4.s}[2], [x15], #4
++        ld1             {v4.s}[1], [x15], #4
++        ld1             {v4.s}[0], [x15], #4
++        ld1             {v5.s}[2], [x15], #4
++        ld1             {v5.s}[1], [x15], #4
++        ld1             {v5.s}[0], [x15]
++        b               99f
++endfunc
++
++// void ff_bgr24toyv12_aarch64(
++//              const uint8_t *src,             // x0
++//              uint8_t *ydst,                  // x1
++//              uint8_t *udst,                  // x2
++//              uint8_t *vdst,                  // x3
++//              int width,                      // w4
++//              int height,                     // w5
++//              int lumStride,                  // w6
++//              int chromStride,                // w7
++//              int srcStr,                     // [sp, #0]
++//              int32_t *rgb2yuv);              // [sp, #8]
++
++function ff_bgr24toyv12_aarch64, export=1
++        ldr             x15, [sp, #8]
++        ld3             {v3.s, v4.s, v5.s}[0], [x15], #12
++        ld3             {v3.s, v4.s, v5.s}[1], [x15], #12
++        ld3             {v3.s, v4.s, v5.s}[2], [x15]
++99:
++        ldr             w14, [sp, #0]
++        movi            v18.8b, #128
++        uxtl            v17.8h, v18.8b
++
++        // Even line - YUV
++1:
++        mov             x10, x0
++        mov             x11, x1
++        mov             x12, x2
++        mov             x13, x3
++        mov             w9,  w4
++
++0:
++        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++
++        uxtl2           v20.8h, v0.16b
++        uxtl2           v21.8h, v1.16b
++        uxtl2           v22.8h, v2.16b
++
++        uxtl            v0.8h, v0.8b
++        uxtl            v1.8h, v1.8b
++        uxtl            v2.8h, v2.8b
++        // Y0
++        smull           v6.4s, v0.4h, v3.h[0]
++        smull2          v7.4s, v0.8h, v3.h[0]
++        smlal           v6.4s, v1.4h, v4.h[0]
++        smlal2          v7.4s, v1.8h, v4.h[0]
++        smlal           v6.4s, v2.4h, v5.h[0]
++        smlal2          v7.4s, v2.8h, v5.h[0]
++        shrn            v6.4h, v6.4s, #12
++        shrn2           v6.8h, v7.4s, #12
++        add             v6.8h, v6.8h, v17.8h     // +128 (>> 3 = 16)
++        uqrshrn         v16.8b, v6.8h, #3
++        // Y1
++        smull           v6.4s, v20.4h, v3.h[0]
++        smull2          v7.4s, v20.8h, v3.h[0]
++        smlal           v6.4s, v21.4h, v4.h[0]
++        smlal2          v7.4s, v21.8h, v4.h[0]
++        smlal           v6.4s, v22.4h, v5.h[0]
++        smlal2          v7.4s, v22.8h, v5.h[0]
++        shrn            v6.4h, v6.4s, #12
++        shrn2           v6.8h, v7.4s, #12
++        add             v6.8h, v6.8h, v17.8h
++        uqrshrn2        v16.16b, v6.8h, #3
++        // Y0/Y1
++        st1             {v16.16b}, [x11], #16
++
++        uzp1            v0.8h, v0.8h, v20.8h
++        uzp1            v1.8h, v1.8h, v21.8h
++        uzp1            v2.8h, v2.8h, v22.8h
++
++        // U
++        // Vector subscript *2 as we loaded into S but are only using H
++        smull           v6.4s, v0.4h, v3.h[2]
++        smull2          v7.4s, v0.8h, v3.h[2]
++        smlal           v6.4s, v1.4h, v4.h[2]
++        smlal2          v7.4s, v1.8h, v4.h[2]
++        smlal           v6.4s, v2.4h, v5.h[2]
++        smlal2          v7.4s, v2.8h, v5.h[2]
++        shrn            v6.4h, v6.4s, #14
++        shrn2           v6.8h, v7.4s, #14
++        sqrshrn         v6.8b, v6.8h, #1
++        add             v6.8b, v6.8b, v18.8b     // +128
++        st1             {v6.8b}, [x12], #8
++
++        // V
++        smull           v6.4s, v0.4h, v3.h[4]
++        smull2          v7.4s, v0.8h, v3.h[4]
++        smlal           v6.4s, v1.4h, v4.h[4]
++        smlal2          v7.4s, v1.8h, v4.h[4]
++        smlal           v6.4s, v2.4h, v5.h[4]
++        smlal2          v7.4s, v2.8h, v5.h[4]
++        shrn            v6.4h, v6.4s, #14
++        shrn2           v6.8h, v7.4s, #14
++        sqrshrn         v6.8b, v6.8h, #1
++        add             v6.8b, v6.8b, v18.8b     // +128
++        st1             {v6.8b}, [x13], #8
++
++        subs            w9, w9, #16
++        b.gt            0b
++
++        // Odd line - Y only
++
++        add             x0, x0, w14, SXTX
++        add             x1, x1, w6, SXTX
++        mov             x10, x0
++        mov             x11, x1
++        mov             w9,  w4
++
++0:
++        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++
++        uxtl2           v20.8h, v0.16b
++        uxtl2           v21.8h, v1.16b
++        uxtl2           v22.8h, v2.16b
++
++        uxtl            v0.8h, v0.8b
++        uxtl            v1.8h, v1.8b
++        uxtl            v2.8h, v2.8b
++        // Y0
++        smull           v6.4s, v0.4h, v3.h[0]
++        smull2          v7.4s, v0.8h, v3.h[0]
++        smlal           v6.4s, v1.4h, v4.h[0]
++        smlal2          v7.4s, v1.8h, v4.h[0]
++        smlal           v6.4s, v2.4h, v5.h[0]
++        smlal2          v7.4s, v2.8h, v5.h[0]
++        shrn            v6.4h, v6.4s, #12
++        shrn2           v6.8h, v7.4s, #12
++        add             v6.8h, v6.8h, v17.8h
++        uqrshrn         v16.8b, v6.8h, #3
++        // Y1
++        smull           v6.4s, v20.4h, v3.h[0]
++        smull2          v7.4s, v20.8h, v3.h[0]
++        smlal           v6.4s, v21.4h, v4.h[0]
++        smlal2          v7.4s, v21.8h, v4.h[0]
++        smlal           v6.4s, v22.4h, v5.h[0]
++        smlal2          v7.4s, v22.8h, v5.h[0]
++        shrn            v6.4h, v6.4s, #12
++        shrn2           v6.8h, v7.4s, #12
++        add             v6.8h, v6.8h, v17.8h
++        uqrshrn2        v16.16b, v6.8h, #3
++        // Y0/Y1
++        st1             {v16.16b}, [x11], #16
++
++        subs            w9, w9, #16
++        b.gt            0b
++
++        add             x0, x0, w14, SXTX
++        add             x1, x1, w6, SXTX
++        add             x2, x2, w7, SXTX
++        add             x3, x3, w7, SXTX
++        subs            w5, w5, #2
++        b.gt            1b
++
++        ret
++endfunc
+
+From f62603136ee2eaf781519bd70e445b03f80960da Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Thu, 27 Apr 2023 13:03:52 +0000
+Subject: [PATCH 126/135] rgb2rgb: Fix rgb24->yuv420p with arbitrary wxh
+
+(cherry picked from commit 58771fdf0218dc670d8a343824f540e2f6e8785d)
+---
+ libswscale/aarch64/rgb2rgb.c      |   5 +-
+ libswscale/aarch64/rgb2rgb_neon.S | 440 ++++++++++++++++++++++++------
+ 2 files changed, 355 insertions(+), 90 deletions(-)
+
+diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
+index 6d3e0000dc..f10c4ef2de 100644
+--- a/libswscale/aarch64/rgb2rgb.c
++++ b/libswscale/aarch64/rgb2rgb.c
+@@ -44,8 +44,9 @@ void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ 
+ static inline int chkw(const int width, const int lumStride, const int chromStride)
+ {
+-    const int aw = FFALIGN(width, 16);
+-    return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2;
++//    const int aw = FFALIGN(width, 16);
++//    return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2;
++    return 1;
+ }
+ 
+ static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
+index 8cf40b65f5..978ab443ea 100644
+--- a/libswscale/aarch64/rgb2rgb_neon.S
++++ b/libswscale/aarch64/rgb2rgb_neon.S
+@@ -116,6 +116,25 @@ endfunc
+ //              int srcStr,                     // [sp, #0]
+ //              int32_t *rgb2yuv);              // [sp, #8]
+ 
++// regs
++// v0-2         Src bytes - reused as chroma src
++// v3-5         Coeffs (packed very inefficiently - could be squashed)
++// v6           128b
++// v7           128h
++// v8-15        Reserved
++// v16-18       Lo Src expanded as H
++// v19          -
++// v20-22       Hi Src expanded as H
++// v23          -
++// v24          U out
++// v25          U tmp
++// v26          Y out
++// v27-29       Y tmp
++// v30          V out
++// v31          V tmp
++
++// Assumes Little Endian in tail stores & conversion matrix
++
+ function ff_bgr24toyv12_aarch64, export=1
+         ldr             x15, [sp, #8]
+         ld3             {v3.s, v4.s, v5.s}[0], [x15], #12
+@@ -123,138 +142,383 @@ function ff_bgr24toyv12_aarch64, export=1
+         ld3             {v3.s, v4.s, v5.s}[2], [x15]
+ 99:
+         ldr             w14, [sp, #0]
+-        movi            v18.8b, #128
+-        uxtl            v17.8h, v18.8b
+-
+-        // Even line - YUV
++        movi            v7.8b, #128
++        uxtl            v6.8h, v7.8b
++        // Ensure if nothing to do then we do nothing
++        cmp             w4, #0
++        b.le            90f
++        cmp             w5, #0
++        b.le            90f
++        // If w % 16 != 0 then -16 so we do main loop 1 fewer times with
++        // the remainder done in the tail
++        tst             w4, #15
++        b.eq            1f
++        sub             w4, w4, #16
+ 1:
++
++// -------------------- Even line body - YUV
++11:
++        subs            w9,  w4, #0
+         mov             x10, x0
+         mov             x11, x1
+         mov             x12, x2
+         mov             x13, x3
+-        mov             w9,  w4
++        b.lt            12f
+ 
+-0:
+         ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++        subs            w9, w9, #16
++        b.le            13f
++
++10:
++        uxtl            v16.8h, v0.8b
++        uxtl            v17.8h, v1.8b
++        uxtl            v18.8h, v2.8b
+ 
+         uxtl2           v20.8h, v0.16b
+         uxtl2           v21.8h, v1.16b
+         uxtl2           v22.8h, v2.16b
+ 
+-        uxtl            v0.8h, v0.8b
+-        uxtl            v1.8h, v1.8b
+-        uxtl            v2.8h, v2.8b
++        bic             v0.8h, #0xff, LSL #8
++        bic             v1.8h, #0xff, LSL #8
++        bic             v2.8h, #0xff, LSL #8
++
++        // Testing shows it is faster to stack the smull/smlal ops together
++        // rather than interleave them between channels and indeed even the
++        // shift/add sections seem happier not interleaved
++
+         // Y0
+-        smull           v6.4s, v0.4h, v3.h[0]
+-        smull2          v7.4s, v0.8h, v3.h[0]
+-        smlal           v6.4s, v1.4h, v4.h[0]
+-        smlal2          v7.4s, v1.8h, v4.h[0]
+-        smlal           v6.4s, v2.4h, v5.h[0]
+-        smlal2          v7.4s, v2.8h, v5.h[0]
+-        shrn            v6.4h, v6.4s, #12
+-        shrn2           v6.8h, v7.4s, #12
+-        add             v6.8h, v6.8h, v17.8h     // +128 (>> 3 = 16)
+-        uqrshrn         v16.8b, v6.8h, #3
++        smull           v26.4s, v16.4h, v3.h[0]
++        smlal           v26.4s, v17.4h, v4.h[0]
++        smlal           v26.4s, v18.4h, v5.h[0]
++        smull2          v27.4s, v16.8h, v3.h[0]
++        smlal2          v27.4s, v17.8h, v4.h[0]
++        smlal2          v27.4s, v18.8h, v5.h[0]
+         // Y1
+-        smull           v6.4s, v20.4h, v3.h[0]
+-        smull2          v7.4s, v20.8h, v3.h[0]
+-        smlal           v6.4s, v21.4h, v4.h[0]
+-        smlal2          v7.4s, v21.8h, v4.h[0]
+-        smlal           v6.4s, v22.4h, v5.h[0]
+-        smlal2          v7.4s, v22.8h, v5.h[0]
+-        shrn            v6.4h, v6.4s, #12
+-        shrn2           v6.8h, v7.4s, #12
+-        add             v6.8h, v6.8h, v17.8h
+-        uqrshrn2        v16.16b, v6.8h, #3
++        smull           v28.4s, v20.4h, v3.h[0]
++        smlal           v28.4s, v21.4h, v4.h[0]
++        smlal           v28.4s, v22.4h, v5.h[0]
++        smull2          v29.4s, v20.8h, v3.h[0]
++        smlal2          v29.4s, v21.8h, v4.h[0]
++        smlal2          v29.4s, v22.8h, v5.h[0]
++        shrn            v26.4h, v26.4s, #12
++        shrn2           v26.8h, v27.4s, #12
++        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
++        uqrshrn         v26.8b, v26.8h, #3
++        shrn            v28.4h, v28.4s, #12
++        shrn2           v28.8h, v29.4s, #12
++        add             v28.8h, v28.8h, v6.8h
++        uqrshrn2        v26.16b, v28.8h, #3
+         // Y0/Y1
+-        st1             {v16.16b}, [x11], #16
+-
+-        uzp1            v0.8h, v0.8h, v20.8h
+-        uzp1            v1.8h, v1.8h, v21.8h
+-        uzp1            v2.8h, v2.8h, v22.8h
+ 
+         // U
+         // Vector subscript *2 as we loaded into S but are only using H
+-        smull           v6.4s, v0.4h, v3.h[2]
+-        smull2          v7.4s, v0.8h, v3.h[2]
+-        smlal           v6.4s, v1.4h, v4.h[2]
+-        smlal2          v7.4s, v1.8h, v4.h[2]
+-        smlal           v6.4s, v2.4h, v5.h[2]
+-        smlal2          v7.4s, v2.8h, v5.h[2]
+-        shrn            v6.4h, v6.4s, #14
+-        shrn2           v6.8h, v7.4s, #14
+-        sqrshrn         v6.8b, v6.8h, #1
+-        add             v6.8b, v6.8b, v18.8b     // +128
+-        st1             {v6.8b}, [x12], #8
++        smull           v24.4s, v0.4h, v3.h[2]
++        smlal           v24.4s, v1.4h, v4.h[2]
++        smlal           v24.4s, v2.4h, v5.h[2]
++        smull2          v25.4s, v0.8h, v3.h[2]
++        smlal2          v25.4s, v1.8h, v4.h[2]
++        smlal2          v25.4s, v2.8h, v5.h[2]
+ 
+         // V
+-        smull           v6.4s, v0.4h, v3.h[4]
+-        smull2          v7.4s, v0.8h, v3.h[4]
+-        smlal           v6.4s, v1.4h, v4.h[4]
+-        smlal2          v7.4s, v1.8h, v4.h[4]
+-        smlal           v6.4s, v2.4h, v5.h[4]
+-        smlal2          v7.4s, v2.8h, v5.h[4]
+-        shrn            v6.4h, v6.4s, #14
+-        shrn2           v6.8h, v7.4s, #14
+-        sqrshrn         v6.8b, v6.8h, #1
+-        add             v6.8b, v6.8b, v18.8b     // +128
+-        st1             {v6.8b}, [x13], #8
++        smull           v30.4s, v0.4h, v3.h[4]
++        smlal           v30.4s, v1.4h, v4.h[4]
++        smlal           v30.4s, v2.4h, v5.h[4]
++        smull2          v31.4s, v0.8h, v3.h[4]
++        smlal2          v31.4s, v1.8h, v4.h[4]
++        smlal2          v31.4s, v2.8h, v5.h[4]
++
++        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++
++        shrn            v24.4h, v24.4s, #14
++        shrn2           v24.8h, v25.4s, #14
++        sqrshrn         v24.8b, v24.8h, #1
++        add             v24.8b, v24.8b, v7.8b     // +128
++        shrn            v30.4h, v30.4s, #14
++        shrn2           v30.8h, v31.4s, #14
++        sqrshrn         v30.8b, v30.8h, #1
++        add             v30.8b, v30.8b, v7.8b     // +128
+ 
+         subs            w9, w9, #16
+-        b.gt            0b
+ 
+-        // Odd line - Y only
++        st1             {v26.16b}, [x11], #16
++        st1             {v24.8b}, [x12], #8
++        st1             {v30.8b}, [x13], #8
++
++        b.gt            10b
++
++// -------------------- Even line tail - YUV
++// If width % 16 == 0 then simply runs once with preloaded RGB
++// If other then deals with preload & then does remaining tail
++
++13:
++        // Body is simple copy of main loop body minus preload
++
++        uxtl            v16.8h, v0.8b
++        uxtl            v17.8h, v1.8b
++        uxtl            v18.8h, v2.8b
++
++        uxtl2           v20.8h, v0.16b
++        uxtl2           v21.8h, v1.16b
++        uxtl2           v22.8h, v2.16b
++
++        bic             v0.8h, #0xff, LSL #8
++        bic             v1.8h, #0xff, LSL #8
++        bic             v2.8h, #0xff, LSL #8
++
++        // Y0
++        smull           v26.4s, v16.4h, v3.h[0]
++        smlal           v26.4s, v17.4h, v4.h[0]
++        smlal           v26.4s, v18.4h, v5.h[0]
++        smull2          v27.4s, v16.8h, v3.h[0]
++        smlal2          v27.4s, v17.8h, v4.h[0]
++        smlal2          v27.4s, v18.8h, v5.h[0]
++        // Y1
++        smull           v28.4s, v20.4h, v3.h[0]
++        smlal           v28.4s, v21.4h, v4.h[0]
++        smlal           v28.4s, v22.4h, v5.h[0]
++        smull2          v29.4s, v20.8h, v3.h[0]
++        smlal2          v29.4s, v21.8h, v4.h[0]
++        smlal2          v29.4s, v22.8h, v5.h[0]
++        shrn            v26.4h, v26.4s, #12
++        shrn2           v26.8h, v27.4s, #12
++        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
++        uqrshrn         v26.8b, v26.8h, #3
++        shrn            v28.4h, v28.4s, #12
++        shrn2           v28.8h, v29.4s, #12
++        add             v28.8h, v28.8h, v6.8h
++        uqrshrn2        v26.16b, v28.8h, #3
++        // Y0/Y1
++
++        // U
++        // Vector subscript *2 as we loaded into S but are only using H
++        smull           v24.4s, v0.4h, v3.h[2]
++        smlal           v24.4s, v1.4h, v4.h[2]
++        smlal           v24.4s, v2.4h, v5.h[2]
++        smull2          v25.4s, v0.8h, v3.h[2]
++        smlal2          v25.4s, v1.8h, v4.h[2]
++        smlal2          v25.4s, v2.8h, v5.h[2]
+ 
++        // V
++        smull           v30.4s, v0.4h, v3.h[4]
++        smlal           v30.4s, v1.4h, v4.h[4]
++        smlal           v30.4s, v2.4h, v5.h[4]
++        smull2          v31.4s, v0.8h, v3.h[4]
++        smlal2          v31.4s, v1.8h, v4.h[4]
++        smlal2          v31.4s, v2.8h, v5.h[4]
++
++        cmp             w9, #-16
++
++        shrn            v24.4h, v24.4s, #14
++        shrn2           v24.8h, v25.4s, #14
++        sqrshrn         v24.8b, v24.8h, #1
++        add             v24.8b, v24.8b, v7.8b     // +128
++        shrn            v30.4h, v30.4s, #14
++        shrn2           v30.8h, v31.4s, #14
++        sqrshrn         v30.8b, v30.8h, #1
++        add             v30.8b, v30.8b, v7.8b     // +128
++
++        // Here:
++        // w9 == 0      width % 16 == 0, tail done
++        // w9 > -16     1st tail done (16 pels), remainder still to go
++        // w9 == -16    shouldn't happen
++        // w9 > -32     2nd tail done
++        // w9 <= -32    shouldn't happen
++
++        b.lt            2f
++        st1             {v26.16b}, [x11], #16
++        st1             {v24.8b}, [x12], #8
++        st1             {v30.8b}, [x13], #8
++        cbz             w9, 3f
++
++12:
++        sub             w9, w9, #16
++
++        tbz             w9, #3, 1f
++        ld3             {v0.8b, v1.8b, v2.8b},  [x10], #24
++1:      tbz             w9, #2, 1f
++        ld3             {v0.b, v1.b, v2.b}[8],  [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[9],  [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[10], [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[11], [x10], #3
++1:      tbz             w9, #1, 1f
++        ld3             {v0.b, v1.b, v2.b}[12], [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[13], [x10], #3
++1:      tbz             w9, #0, 13b
++        ld3             {v0.b, v1.b, v2.b}[14], [x10], #3
++        b               13b
++
++2:
++        tbz             w9, #3, 1f
++        st1             {v26.8b},    [x11], #8
++        st1             {v24.s}[0],  [x12], #4
++        st1             {v30.s}[0],  [x13], #4
++1:      tbz             w9, #2, 1f
++        st1             {v26.s}[2],  [x11], #4
++        st1             {v24.h}[2],  [x12], #2
++        st1             {v30.h}[2],  [x13], #2
++1:      tbz             w9, #1, 1f
++        st1             {v26.h}[6],  [x11], #2
++        st1             {v24.b}[6],  [x12], #1
++        st1             {v30.b}[6],  [x13], #1
++1:      tbz             w9, #0, 1f
++        st1             {v26.b}[14], [x11]
++        st1             {v24.b}[7],  [x12]
++        st1             {v30.b}[7],  [x13]
++1:
++3:
++
++// -------------------- Odd line body - Y only
++
++        subs            w5, w5, #1
++        b.eq            90f
++
++        subs            w9,  w4, #0
+         add             x0, x0, w14, SXTX
+         add             x1, x1, w6, SXTX
+         mov             x10, x0
+         mov             x11, x1
+-        mov             w9,  w4
++        b.lt            12f
+ 
+-0:
+         ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++        subs            w9, w9, #16
++        b.le            13f
++
++10:
++        uxtl            v16.8h, v0.8b
++        uxtl            v17.8h, v1.8b
++        uxtl            v18.8h, v2.8b
+ 
+         uxtl2           v20.8h, v0.16b
+         uxtl2           v21.8h, v1.16b
+         uxtl2           v22.8h, v2.16b
+ 
+-        uxtl            v0.8h, v0.8b
+-        uxtl            v1.8h, v1.8b
+-        uxtl            v2.8h, v2.8b
++        // Testing shows it is faster to stack the smull/smlal ops together
++        // rather than interleave them between channels and indeed even the
++        // shift/add sections seem happier not interleaved
++
+         // Y0
+-        smull           v6.4s, v0.4h, v3.h[0]
+-        smull2          v7.4s, v0.8h, v3.h[0]
+-        smlal           v6.4s, v1.4h, v4.h[0]
+-        smlal2          v7.4s, v1.8h, v4.h[0]
+-        smlal           v6.4s, v2.4h, v5.h[0]
+-        smlal2          v7.4s, v2.8h, v5.h[0]
+-        shrn            v6.4h, v6.4s, #12
+-        shrn2           v6.8h, v7.4s, #12
+-        add             v6.8h, v6.8h, v17.8h
+-        uqrshrn         v16.8b, v6.8h, #3
++        smull           v26.4s, v16.4h, v3.h[0]
++        smlal           v26.4s, v17.4h, v4.h[0]
++        smlal           v26.4s, v18.4h, v5.h[0]
++        smull2          v27.4s, v16.8h, v3.h[0]
++        smlal2          v27.4s, v17.8h, v4.h[0]
++        smlal2          v27.4s, v18.8h, v5.h[0]
+         // Y1
+-        smull           v6.4s, v20.4h, v3.h[0]
+-        smull2          v7.4s, v20.8h, v3.h[0]
+-        smlal           v6.4s, v21.4h, v4.h[0]
+-        smlal2          v7.4s, v21.8h, v4.h[0]
+-        smlal           v6.4s, v22.4h, v5.h[0]
+-        smlal2          v7.4s, v22.8h, v5.h[0]
+-        shrn            v6.4h, v6.4s, #12
+-        shrn2           v6.8h, v7.4s, #12
+-        add             v6.8h, v6.8h, v17.8h
+-        uqrshrn2        v16.16b, v6.8h, #3
++        smull           v28.4s, v20.4h, v3.h[0]
++        smlal           v28.4s, v21.4h, v4.h[0]
++        smlal           v28.4s, v22.4h, v5.h[0]
++        smull2          v29.4s, v20.8h, v3.h[0]
++        smlal2          v29.4s, v21.8h, v4.h[0]
++        smlal2          v29.4s, v22.8h, v5.h[0]
++
++        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++
++        shrn            v26.4h, v26.4s, #12
++        shrn2           v26.8h, v27.4s, #12
++        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
++        uqrshrn         v26.8b, v26.8h, #3
++        shrn            v28.4h, v28.4s, #12
++        shrn2           v28.8h, v29.4s, #12
++        add             v28.8h, v28.8h, v6.8h
++        uqrshrn2        v26.16b, v28.8h, #3
+         // Y0/Y1
+-        st1             {v16.16b}, [x11], #16
+ 
+         subs            w9, w9, #16
+-        b.gt            0b
++
++        st1             {v26.16b}, [x11], #16
++
++        b.gt            10b
++
++// -------------------- Odd line tail - Y
++// If width % 16 == 0 then simply runs once with preloaded RGB
++// If other then deals with preload & then does remaining tail
++
++13:
++        // Body is simple copy of main loop body minus preload
++
++        uxtl            v16.8h, v0.8b
++        uxtl            v17.8h, v1.8b
++        uxtl            v18.8h, v2.8b
++
++        uxtl2           v20.8h, v0.16b
++        uxtl2           v21.8h, v1.16b
++        uxtl2           v22.8h, v2.16b
++
++        // Y0
++        smull           v26.4s, v16.4h, v3.h[0]
++        smlal           v26.4s, v17.4h, v4.h[0]
++        smlal           v26.4s, v18.4h, v5.h[0]
++        smull2          v27.4s, v16.8h, v3.h[0]
++        smlal2          v27.4s, v17.8h, v4.h[0]
++        smlal2          v27.4s, v18.8h, v5.h[0]
++        // Y1
++        smull           v28.4s, v20.4h, v3.h[0]
++        smlal           v28.4s, v21.4h, v4.h[0]
++        smlal           v28.4s, v22.4h, v5.h[0]
++        smull2          v29.4s, v20.8h, v3.h[0]
++        smlal2          v29.4s, v21.8h, v4.h[0]
++        smlal2          v29.4s, v22.8h, v5.h[0]
++
++        cmp             w9, #-16
++
++        shrn            v26.4h, v26.4s, #12
++        shrn2           v26.8h, v27.4s, #12
++        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
++        uqrshrn         v26.8b, v26.8h, #3
++        shrn            v28.4h, v28.4s, #12
++        shrn2           v28.8h, v29.4s, #12
++        add             v28.8h, v28.8h, v6.8h
++        uqrshrn2        v26.16b, v28.8h, #3
++        // Y0/Y1
++
++        // Here:
++        // w9 == 0      width % 16 == 0, tail done
++        // w9 > -16     1st tail done (16 pels), remainder still to go
++        // w9 == -16    shouldn't happen
++        // w9 > -32     2nd tail done
++        // w9 <= -32    shouldn't happen
++
++        b.lt            2f
++        st1             {v26.16b}, [x11], #16
++        cbz             w9, 3f
++
++12:
++        sub             w9, w9, #16
++
++        tbz             w9, #3, 1f
++        ld3             {v0.8b, v1.8b, v2.8b},  [x10], #24
++1:      tbz             w9, #2, 1f
++        ld3             {v0.b, v1.b, v2.b}[8],  [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[9],  [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[10], [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[11], [x10], #3
++1:      tbz             w9, #1, 1f
++        ld3             {v0.b, v1.b, v2.b}[12], [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[13], [x10], #3
++1:      tbz             w9, #0, 13b
++        ld3             {v0.b, v1.b, v2.b}[14], [x10], #3
++        b               13b
++
++2:
++        tbz             w9, #3, 1f
++        st1             {v26.8b},    [x11], #8
++1:      tbz             w9, #2, 1f
++        st1             {v26.s}[2],  [x11], #4
++1:      tbz             w9, #1, 1f
++        st1             {v26.h}[6],  [x11], #2
++1:      tbz             w9, #0, 1f
++        st1             {v26.b}[14], [x11]
++1:
++3:
++
++// ------------------- Loop to start
+ 
+         add             x0, x0, w14, SXTX
+         add             x1, x1, w6, SXTX
+         add             x2, x2, w7, SXTX
+         add             x3, x3, w7, SXTX
+-        subs            w5, w5, #2
+-        b.gt            1b
+-
++        subs            w5, w5, #1
++        b.gt            11b
++90:
+         ret
+ endfunc
+
+From cf020c89ac47620c4a5390d0333e9ea70fbfa7b8 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Wed, 26 Apr 2023 15:36:07 +0000
+Subject: [PATCH 127/135] rgb2rgb: Use asm unconditionally
+
+(cherry picked from commit 7c216c0804836b31c0ea093bb1dde5ab387724b1)
+---
+ libswscale/aarch64/rgb2rgb.c | 37 ++----------------------------------
+ 1 file changed, 2 insertions(+), 35 deletions(-)
+
+diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
+index f10c4ef2de..6a0e2dcc09 100644
+--- a/libswscale/aarch64/rgb2rgb.c
++++ b/libswscale/aarch64/rgb2rgb.c
+@@ -37,46 +37,13 @@ void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                    uint8_t *vdst, int width, int height, int lumStride,
+                    int chromStride, int srcStride, int32_t *rgb2yuv);
+ 
+-// RGB to YUV asm fns process 16 pixels at once so ensure that the output
+-// will fit into the stride. ARM64 should cope with unaligned SIMD r/w so
+-// don't test for that
+-// Fall back to C if we cannot use asm
+-
+-static inline int chkw(const int width, const int lumStride, const int chromStride)
+-{
+-//    const int aw = FFALIGN(width, 16);
+-//    return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2;
+-    return 1;
+-}
+-
+-static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+-                   uint8_t *vdst, int width, int height, int lumStride,
+-                   int chromStride, int srcStride, int32_t *rgb2yuv)
+-{
+-    if (chkw(width, lumStride, chromStride))
+-        ff_rgb24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv);
+-    else
+-        ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv);
+-}
+-
+-static void bgr24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+-                   uint8_t *vdst, int width, int height, int lumStride,
+-                   int chromStride, int srcStride, int32_t *bgr2yuv)
+-{
+-    if (chkw(width, lumStride, chromStride))
+-        ff_bgr24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv);
+-    else
+-        ff_bgr24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv);
+-}
+-
+-
+ av_cold void rgb2rgb_init_aarch64(void)
+ {
+     int cpu_flags = av_get_cpu_flags();
+ 
+     if (have_neon(cpu_flags)) {
+         interleaveBytes = ff_interleave_bytes_neon;
+-        ff_rgb24toyv12 = rgb24toyv12_check;
+-        ff_bgr24toyv12 = bgr24toyv12_check;
++        ff_rgb24toyv12 = ff_rgb24toyv12_aarch64;
++        ff_bgr24toyv12 = ff_bgr24toyv12_aarch64;
+     }
+ }
+
+From 1895fdcaf403f403736ab52d1cb69dce7c964b66 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Thu, 27 Apr 2023 13:01:43 +0000
+Subject: [PATCH 128/135] tests/swscale: Add options for width and height on
+ the command line
+
+(cherry picked from commit eb8a09779688fc05bf204fdfcd063b04cda07271)
+---
+ libswscale/tests/swscale.c | 84 ++++++++++++++++++++++++++------------
+ 1 file changed, 59 insertions(+), 25 deletions(-)
+
+diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c
+index 6c38041ddb..4cf41d9f64 100644
+--- a/libswscale/tests/swscale.c
++++ b/libswscale/tests/swscale.c
+@@ -355,56 +355,71 @@ static int fileTest(const uint8_t * const ref[4], int refStride[4],
+     return 0;
+ }
+ 
+-#define W 96
+-#define H 96
+-
+ int main(int argc, char **argv)
+ {
++    unsigned int W = 96;
++    unsigned int H = 96;
++    unsigned int W2;
++    unsigned int H2;
++    unsigned int S;
+     enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE;
+     enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE;
+-    uint8_t *rgb_data   = av_malloc(W * H * 4);
+-    const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL };
+-    int rgb_stride[4]   = { 4 * W, 0, 0, 0 };
+-    uint8_t *data       = av_malloc(4 * W * H);
+-    const uint8_t * const src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 };
+-    int stride[4]       = { W, W, W, W };
+     int x, y;
+     struct SwsContext *sws;
+     AVLFG rand;
+     int res = -1;
+     int i;
+     FILE *fp = NULL;
+-
+-    if (!rgb_data || !data)
+-        return -1;
++    uint8_t *rgb_data;
++    uint8_t * rgb_src[4] = { NULL };
++    int rgb_stride[4]   = { 0 };
++    uint8_t *data;
++    uint8_t * src[4] = { NULL };
++    int stride[4]       = { 0 };
+ 
+     for (i = 1; i < argc; i += 2) {
++        const char * const arg2 = argv[i+1];
++
+         if (argv[i][0] != '-' || i + 1 == argc)
+             goto bad_option;
+         if (!strcmp(argv[i], "-ref")) {
+-            fp = fopen(argv[i + 1], "r");
++            fp = fopen(arg2, "r");
+             if (!fp) {
+-                fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
++                fprintf(stderr, "could not open '%s'\n", arg2);
+                 goto error;
+             }
+         } else if (!strcmp(argv[i], "-cpuflags")) {
+             unsigned flags = av_get_cpu_flags();
+-            int ret = av_parse_cpu_caps(&flags, argv[i + 1]);
++            int ret = av_parse_cpu_caps(&flags, arg2);
+             if (ret < 0) {
+-                fprintf(stderr, "invalid cpu flags %s\n", argv[i + 1]);
++                fprintf(stderr, "invalid cpu flags %s\n", arg2);
+                 return ret;
+             }
+             av_force_cpu_flags(flags);
+         } else if (!strcmp(argv[i], "-src")) {
+-            srcFormat = av_get_pix_fmt(argv[i + 1]);
++            srcFormat = av_get_pix_fmt(arg2);
+             if (srcFormat == AV_PIX_FMT_NONE) {
+-                fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
++                fprintf(stderr, "invalid pixel format %s\n", arg2);
+                 return -1;
+             }
+         } else if (!strcmp(argv[i], "-dst")) {
+-            dstFormat = av_get_pix_fmt(argv[i + 1]);
++            dstFormat = av_get_pix_fmt(arg2);
+             if (dstFormat == AV_PIX_FMT_NONE) {
+-                fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
++                fprintf(stderr, "invalid pixel format %s\n", arg2);
++                return -1;
++            }
++        } else if (!strcmp(argv[i], "-w")) {
++            char * p = NULL;
++            W = strtoul(arg2, &p, 0);
++            if (!W || *p) {
++                fprintf(stderr, "bad width %s\n", arg2);
++                return -1;
++            }
++        } else if (!strcmp(argv[i], "-h")) {
++            char * p = NULL;
++            H = strtoul(arg2, &p, 0);
++            if (!H || *p) {
++                fprintf(stderr, "bad height '%s' (H=%d, *p=%d)\n", arg2, H, *p);
+                 return -1;
+             }
+         } else {
+@@ -414,15 +429,34 @@ bad_option:
+         }
+     }
+ 
+-    sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
++    S = (W + 15) & ~15;
++    rgb_data   = av_mallocz(S * H * 4);
++    rgb_src[0] = rgb_data;
++    rgb_stride[0]   = 4 * S;
++    data       = av_mallocz(4 * S * H);
++    src[0] = data;
++    src[1] = data + S * H;
++    src[2] = data + S * H * 2;
++    src[3] = data + S * H * 3;
++    stride[0] = S;
++    stride[1] = S;
++    stride[2] = S;
++    stride[3] = S;
++    H2 = H < 96 ? 8 : H / 12;
++    W2 = W < 96 ? 8 : W / 12;
++
++    if (!rgb_data || !data)
++        return -1;
++
++    sws = sws_getContext(W2, H2, AV_PIX_FMT_RGB32, W, H,
+                          AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
+ 
+     av_lfg_init(&rand, 1);
+ 
+     for (y = 0; y < H; y++)
+         for (x = 0; x < W * 4; x++)
+-            rgb_data[ x + y * 4 * W] = av_lfg_get(&rand);
+-    res = sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, (uint8_t * const *) src, stride);
++            rgb_data[ x + y * 4 * S] = av_lfg_get(&rand);
++    res = sws_scale(sws, (const uint8_t * const *)rgb_src, rgb_stride, 0, H2, (uint8_t * const *) src, stride);
+     if (res < 0 || res != H) {
+         res = -1;
+         goto error;
+@@ -431,10 +465,10 @@ bad_option:
+     av_free(rgb_data);
+ 
+     if(fp) {
+-        res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
++        res = fileTest((const uint8_t * const *)src, stride, W, H, fp, srcFormat, dstFormat);
+         fclose(fp);
+     } else {
+-        selfTest(src, stride, W, H, srcFormat, dstFormat);
++        selfTest((const uint8_t * const *)src, stride, W, H, srcFormat, dstFormat);
+         res = 0;
+     }
+ error:
+
+From 94e48653a6bd1b8438887b486927e87b56651455 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Wed, 26 Apr 2023 16:31:23 +0000
+Subject: [PATCH 129/135] tests/swscale: Add a timing option
+
+-t <n>   Where n is the number of times to loop the scale op.
+         Often useful to do it 10 times or so for better resolution
+
+(cherry picked from commit 50cd60a23a66254f911376602d07b30fcafbde96)
+---
+ libswscale/tests/swscale.c | 32 ++++++++++++++++++++++++++++++--
+ 1 file changed, 30 insertions(+), 2 deletions(-)
+
+diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c
+index 4cf41d9f64..12776ffec7 100644
+--- a/libswscale/tests/swscale.c
++++ b/libswscale/tests/swscale.c
+@@ -23,6 +23,7 @@
+ #include <string.h>
+ #include <inttypes.h>
+ #include <stdarg.h>
++#include <time.h>
+ 
+ #undef HAVE_AV_CONFIG_H
+ #include "libavutil/cpu.h"
+@@ -78,6 +79,15 @@ struct Results {
+     uint32_t crc;
+ };
+ 
++static int time_rep = 0;
++
++static uint64_t utime(void)
++{
++    struct timespec ts;
++    clock_gettime(CLOCK_MONOTONIC, &ts);
++    return ts.tv_nsec / 1000 + (uint64_t)ts.tv_sec * 1000000;
++}
++
+ // test by ref -> src -> dst -> out & compare out against ref
+ // ref & out are YV12
+ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
+@@ -174,7 +184,7 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
+         goto end;
+     }
+ 
+-    printf(" %s %dx%d -> %s %3dx%3d flags=%2d",
++    printf(" %s %4dx%4d -> %s %4dx%4d flags=%2d",
+            desc_src->name, srcW, srcH,
+            desc_dst->name, dstW, dstH,
+            flags);
+@@ -182,6 +192,17 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
+ 
+     sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
+ 
++    if (time_rep != 0)
++    {
++        const uint64_t now = utime();
++        uint64_t done;
++        for (i = 1; i != time_rep; ++i) {
++            sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
++        }
++        done = utime();
++        printf(" T=%7"PRId64"us ", done-now);
++    }
++
+     for (i = 0; i < 4 && dstStride[i]; i++)
+         crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i],
+                      dstStride[i] * dstH);
+@@ -419,7 +440,14 @@ int main(int argc, char **argv)
+             char * p = NULL;
+             H = strtoul(arg2, &p, 0);
+             if (!H || *p) {
+-                fprintf(stderr, "bad height '%s' (H=%d, *p=%d)\n", arg2, H, *p);
++                fprintf(stderr, "bad height '%s'\n", arg2);
++                return -1;
++            }
++        } else if (!strcmp(argv[i], "-t")) {
++            char * p = NULL;
++            time_rep = (int)strtol(arg2, &p, 0);
++            if (*p) {
++                fprintf(stderr, "bad time repetitions '%s'\n", arg2);
+                 return -1;
+             }
+         } else {
+
+From 406806d0b9d9cb113deb0d083a28cbccabab6825 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Thu, 20 Apr 2023 13:40:36 +0000
+Subject: [PATCH 130/135] swscale: RGB->YUV420 fix C template to allow odd
+ widths
+
+(cherry picked from commit 08b2023e7b5292df0adc6593e4d20087f9cef5c8)
+---
+ libswscale/rgb2rgb_template.c | 44 +++++++++++++++++++++++++++++++++++
+ libswscale/swscale_unscaled.c | 11 ++++-----
+ 2 files changed, 49 insertions(+), 6 deletions(-)
+
+diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
+index 703de90690..e711589e1e 100644
+--- a/libswscale/rgb2rgb_template.c
++++ b/libswscale/rgb2rgb_template.c
+@@ -679,6 +679,19 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+             Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+             ydst[2 * i + 1] = Y;
+         }
++        if ((width & 1) != 0) {
++            unsigned int b = src[6 * i + 0];
++            unsigned int g = src[6 * i + 1];
++            unsigned int r = src[6 * i + 2];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) +  16;
++            unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
++            unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
++
++            udst[i]     = U;
++            vdst[i]     = V;
++            ydst[2 * i] = Y;
++        }
+         ydst += lumStride;
+         src  += srcStride;
+ 
+@@ -701,6 +714,15 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+             Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+             ydst[2 * i + 1] = Y;
+         }
++        if ((width & 1) != 0) {
++            unsigned int b = src[6 * i + 0];
++            unsigned int g = src[6 * i + 1];
++            unsigned int r = src[6 * i + 2];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++
++            ydst[2 * i] = Y;
++        }
+         udst += chromStride;
+         vdst += chromStride;
+         ydst += lumStride;
+@@ -767,6 +789,19 @@ static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+             Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+             ydst[2 * i + 1] = Y;
+         }
++        if ((width & 1) != 0) {
++            unsigned int b = src[8 * i + 2];
++            unsigned int g = src[8 * i + 1];
++            unsigned int r = src[8 * i + 0];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) +  16;
++            unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
++            unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
++
++            udst[i]     = U;
++            vdst[i]     = V;
++            ydst[2 * i] = Y;
++        }
+         ydst += lumStride;
+         src  += srcStride;
+ 
+@@ -789,6 +824,15 @@ static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+             Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+             ydst[2 * i + 1] = Y;
+         }
++        if ((width & 1) != 0) {
++            unsigned int b = src[8 * i + 2];
++            unsigned int g = src[8 * i + 1];
++            unsigned int r = src[8 * i + 0];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++
++            ydst[2 * i] = Y;
++        }
+         udst += chromStride;
+         vdst += chromStride;
+         ydst += lumStride;
+diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
+index 053c06adf5..52469b2e4a 100644
+--- a/libswscale/swscale_unscaled.c
++++ b/libswscale/swscale_unscaled.c
+@@ -2062,7 +2062,6 @@ void ff_get_unscaled_swscale(SwsContext *c)
+     const enum AVPixelFormat dstFormat = c->dstFormat;
+     const int flags = c->flags;
+     const int dstH = c->dstH;
+-    const int dstW = c->dstW;
+     int needsDither;
+ 
+     needsDither = isAnyRGB(dstFormat) &&
+@@ -2120,12 +2119,12 @@ void ff_get_unscaled_swscale(SwsContext *c)
+     /* bgr24toYV12 */
+     if (srcFormat == AV_PIX_FMT_BGR24 &&
+         (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
+-        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
++        !(flags & SWS_ACCURATE_RND))
+         c->convert_unscaled = bgr24ToYv12Wrapper;
+     /* rgb24toYV12 */
+     if (srcFormat == AV_PIX_FMT_RGB24 &&
+         (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
+-        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
++        !(flags & SWS_ACCURATE_RND))
+         c->convert_unscaled = rgb24ToYv12Wrapper;
+ 
+     /* bgrxtoYV12 */
+@@ -2136,17 +2135,17 @@ void ff_get_unscaled_swscale(SwsContext *c)
+     /* rgbx24toYV12 */
+     if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) ||
+          (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
+-        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
++        !(flags & SWS_ACCURATE_RND))
+         c->convert_unscaled = rgbxToYv12Wrapper;
+     /* xbgrtoYV12 */
+     if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) ||
+          (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
+-        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
++        !(flags & SWS_ACCURATE_RND))
+         c->convert_unscaled = xbgrToYv12Wrapper;
+     /* xrgb24toYV12 */
+     if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) ||
+          (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
+-        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
++        !(flags & SWS_ACCURATE_RND))
+         c->convert_unscaled = xrgbToYv12Wrapper;
+ 
+     /* RGB/BGR -> RGB/BGR (no dither needed forms) */
+
+From 68c6482d9473ce774e87cac2455a8c7b3e2d99b4 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Thu, 4 May 2023 14:26:14 +0000
+Subject: [PATCH 131/135] rtpenc: Add code to send H264 new extradata in
+ sidedata
+
+Fixes issue with pi V4L2 H264 encode which cannot create extradata
+at init time.
+
+(cherry picked from commit 4f852b4b093f841b64b4934a6f1720e98e4e0f2c)
+---
+ libavformat/rtpenc.c | 18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c
+index a8d296a154..f67dc2a15a 100644
+--- a/libavformat/rtpenc.c
++++ b/libavformat/rtpenc.c
+@@ -19,6 +19,7 @@
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
++#include "avc.h"
+ #include "avformat.h"
+ #include "mpegts.h"
+ #include "internal.h"
+@@ -585,8 +586,25 @@ static int rtp_write_packet(AVFormatContext *s1, AVPacket *pkt)
+         ff_rtp_send_vc2hq(s1, pkt->data, size, st->codecpar->field_order != AV_FIELD_PROGRESSIVE ? 1 : 0);
+         break;
+     case AV_CODEC_ID_H264:
++    {
++        uint8_t *side_data;
++        int side_data_size = 0;
++
++        side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA,
++                                            &side_data_size);
++
++        if (side_data_size != 0) {
++            int ps_size = side_data_size;
++            uint8_t * ps_buf = NULL;
++
++            ff_avc_write_annexb_extradata(side_data, &ps_buf, &ps_size);
++            av_log(s1, AV_LOG_TRACE, "H264: write side data=%d\n", ps_size);
++            ff_rtp_send_h264_hevc(s1, ps_buf ? ps_buf : side_data, ps_size);
++            av_free(ps_buf);
++        }
+         ff_rtp_send_h264_hevc(s1, pkt->data, size);
+         break;
++    }
+     case AV_CODEC_ID_H261:
+         ff_rtp_send_h261(s1, pkt->data, size);
+         break;
+
+From 5240cc7fc3abed8af5f178c5461ca9fe11a7d5e4 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Mon, 5 Jun 2023 08:34:38 +0000
+Subject: [PATCH 132/135] rgb2rgb: Fix luma narrow+saturation instruction
+
+(cherry picked from commit 9cdac1c08ad5c0aea28907d1d3fd0bdda387955a)
+---
+ libswscale/aarch64/rgb2rgb_neon.S | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
+index 978ab443ea..476ca723a0 100644
+--- a/libswscale/aarch64/rgb2rgb_neon.S
++++ b/libswscale/aarch64/rgb2rgb_neon.S
+@@ -203,11 +203,11 @@ function ff_bgr24toyv12_aarch64, export=1
+         shrn            v26.4h, v26.4s, #12
+         shrn2           v26.8h, v27.4s, #12
+         add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
+-        uqrshrn         v26.8b, v26.8h, #3
++        sqrshrun        v26.8b, v26.8h, #3
+         shrn            v28.4h, v28.4s, #12
+         shrn2           v28.8h, v29.4s, #12
+         add             v28.8h, v28.8h, v6.8h
+-        uqrshrn2        v26.16b, v28.8h, #3
++        sqrshrun2       v26.16b, v28.8h, #3
+         // Y0/Y1
+ 
+         // U
+@@ -282,11 +282,11 @@ function ff_bgr24toyv12_aarch64, export=1
+         shrn            v26.4h, v26.4s, #12
+         shrn2           v26.8h, v27.4s, #12
+         add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
+-        uqrshrn         v26.8b, v26.8h, #3
++        sqrshrun        v26.8b, v26.8h, #3
+         shrn            v28.4h, v28.4s, #12
+         shrn2           v28.8h, v29.4s, #12
+         add             v28.8h, v28.8h, v6.8h
+-        uqrshrn2        v26.16b, v28.8h, #3
++        sqrshrun2       v26.16b, v28.8h, #3
+         // Y0/Y1
+ 
+         // U
+@@ -416,11 +416,11 @@ function ff_bgr24toyv12_aarch64, export=1
+         shrn            v26.4h, v26.4s, #12
+         shrn2           v26.8h, v27.4s, #12
+         add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
+-        uqrshrn         v26.8b, v26.8h, #3
++        sqrshrun        v26.8b, v26.8h, #3
+         shrn            v28.4h, v28.4s, #12
+         shrn2           v28.8h, v29.4s, #12
+         add             v28.8h, v28.8h, v6.8h
+-        uqrshrn2        v26.16b, v28.8h, #3
++        sqrshrun2       v26.16b, v28.8h, #3
+         // Y0/Y1
+ 
+         subs            w9, w9, #16
+@@ -464,11 +464,11 @@ function ff_bgr24toyv12_aarch64, export=1
+         shrn            v26.4h, v26.4s, #12
+         shrn2           v26.8h, v27.4s, #12
+         add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
+-        uqrshrn         v26.8b, v26.8h, #3
++        sqrshrun        v26.8b, v26.8h, #3
+         shrn            v28.4h, v28.4s, #12
+         shrn2           v28.8h, v29.4s, #12
+         add             v28.8h, v28.8h, v6.8h
+-        uqrshrn2        v26.16b, v28.8h, #3
++        sqrshrun2       v26.16b, v28.8h, #3
+         // Y0/Y1
+ 
+         // Here:
+
+From 9474d9d227f2af488d5d2bd614c5c707479ca3c3 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Sun, 4 Jun 2023 13:37:59 +0000
+Subject: [PATCH 133/135] v4l2_m2m_dec: Tweak pending count to use dts &
+ reorder size
+
+(cherry picked from commit ca438b382c90f9a5f58f4708205e6ac25395db2a)
+---
+ libavcodec/v4l2_m2m.h     |  1 +
+ libavcodec/v4l2_m2m_dec.c | 53 +++++++++++++++++++++++++++++++--------
+ 2 files changed, 43 insertions(+), 11 deletions(-)
+
+diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
+index ded1478a49..a506e69d67 100644
+--- a/libavcodec/v4l2_m2m.h
++++ b/libavcodec/v4l2_m2m.h
+@@ -115,6 +115,7 @@ typedef struct V4L2m2mContext {
+ 
+     /* req pkt */
+     int req_pkt;
++    int reorder_size;
+ 
+     /* Ext data sent */
+     int extdata_sent;
+diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
+index d124c7b1fc..13af62e819 100644
+--- a/libavcodec/v4l2_m2m_dec.c
++++ b/libavcodec/v4l2_m2m_dec.c
+@@ -121,13 +121,18 @@ log_dump(void * logctx, int lvl, const void * const data, const size_t len)
+ }
+ #endif
+ 
+-static int64_t pts_stats_guess(const pts_stats_t * const stats)
++static unsigned int pts_stats_interval(const pts_stats_t * const stats)
++{
++    return stats->last_interval;
++}
++
++static int64_t pts_stats_guess(const pts_stats_t * const stats, const int fail_bad_guess)
+ {
+     if (stats->last_count <= 1)
+         return stats->last_pts;
+     if (stats->last_pts == AV_NOPTS_VALUE ||
+-            stats->last_interval == 0 ||
+-            stats->last_count >= STATS_LAST_COUNT_MAX)
++            fail_bad_guess && (stats->last_interval == 0 ||
++                               stats->last_count >= STATS_LAST_COUNT_MAX))
+         return AV_NOPTS_VALUE;
+     return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval;
+ }
+@@ -345,7 +350,7 @@ set_best_effort_pts(AVCodecContext *const avctx,
+ {
+     pts_stats_add(ps, frame->pts);
+ 
+-    frame->best_effort_timestamp = pts_stats_guess(ps);
++    frame->best_effort_timestamp = pts_stats_guess(ps, 1);
+     // If we can't guess from just PTS - try DTS
+     if (frame->best_effort_timestamp == AV_NOPTS_VALUE)
+         frame->best_effort_timestamp = frame->pkt_dts;
+@@ -380,15 +385,25 @@ xlat_init(xlat_track_t * const x)
+ }
+ 
+ static int
+-xlat_pending(const xlat_track_t * const x)
++xlat_pending(const V4L2m2mContext * const s)
+ {
++    const xlat_track_t *const x = &s->xlat;
+     unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;
+     int i;
+-    const int64_t now = x->last_pts;
++    const int64_t now = pts_stats_guess(&s->pts_stat, 0);
++    int64_t first_dts = AV_NOPTS_VALUE;
++    int no_dts_count = 0;
++    unsigned int interval = pts_stats_interval(&s->pts_stat);
+ 
+     for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) {
+         const V4L2m2mTrackEl * const t = x->track_els + n;
+ 
++        if (first_dts == AV_NOPTS_VALUE)
++            if (t->dts == AV_NOPTS_VALUE)
++                ++no_dts_count;
++            else
++                first_dts = t->dts;
++
+         // Discard only set on never-set or flushed entries
+         // So if we get here we've never successfully decoded a frame so allow
+         // more frames into the buffer before stalling
+@@ -408,6 +423,18 @@ xlat_pending(const xlat_track_t * const x)
+             break;
+     }
+ 
++    if (first_dts != AV_NOPTS_VALUE && now != AV_NOPTS_VALUE && interval != 0 && s->reorder_size != 0) {
++        const int iframes = (first_dts - now) / (int)interval;
++        const int t = iframes - s->reorder_size + no_dts_count;
++
++//        av_log(s->avctx, AV_LOG_DEBUG, "Last:%"PRId64", Now:%"PRId64", First:%"PRId64", delta=%"PRId64", frames=%d, nodts=%d\n",
++//               x->last_dts, now, first_dts, first_dts - now, iframes, no_dts_count);
++
++        if (iframes > 0 && iframes < 64 && t < i) {
++            return t;
++        }
++    }
++
+     return i;
+ }
+ 
+@@ -585,12 +612,12 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
+ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
+ {
+     V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+-    int src_rv = NQ_OK;
++    int src_rv = -1;
+     int dst_rv = 1;  // Non-zero (done), non-negative (error) number
+     unsigned int i = 0;
+ 
+     do {
+-        const int pending = xlat_pending(&s->xlat);
++        const int pending = xlat_pending(s);
+         const int prefer_dq = (pending > 4);
+         const int last_src_rv = src_rv;
+ 
+@@ -966,8 +993,10 @@ static uint32_t max_coded_size(const AVCodecContext * const avctx)
+ }
+ 
+ static void
+-parse_extradata(AVCodecContext *avctx)
++parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s)
+ {
++    s->reorder_size = 0;
++
+     if (!avctx->extradata || !avctx->extradata_size)
+         return;
+ 
+@@ -996,6 +1025,7 @@ parse_extradata(AVCodecContext *avctx)
+                     avctx->profile = ff_h264_get_profile(sps);
+                     avctx->level = sps->level_idc;
+                 }
++                s->reorder_size = sps->num_reorder_frames;
+             }
+             ff_h264_ps_uninit(&ps);
+             break;
+@@ -1025,6 +1055,7 @@ parse_extradata(AVCodecContext *avctx)
+                 if (sps) {
+                     avctx->profile = sps->ptl.general_ptl.profile_idc;
+                     avctx->level   = sps->ptl.general_ptl.level_idc;
++                    s->reorder_size = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering;
+                 }
+             }
+             ff_hevc_ps_uninit(&ps);
+@@ -1057,12 +1088,12 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
+         avctx->ticks_per_frame = 2;
+     }
+ 
+-    parse_extradata(avctx);
+-
+     ret = ff_v4l2_m2m_create_context(priv, &s);
+     if (ret < 0)
+         return ret;
+ 
++    parse_extradata(avctx, s);
++
+     xlat_init(&s->xlat);
+     pts_stats_init(&s->pts_stat, avctx, "decoder");
+ 
+
+From 2145b9c9177f0fe9569ce39e2d4eb629caf8bd47 Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Wed, 7 Jun 2023 11:14:52 +0000
+Subject: [PATCH 134/135] v4l2_m2m: Add encode size check
+
+Previously an out of bounds size would fail whilst trying to copy the
+buffer with an unhelpful message. This produces a better error at init
+time.
+
+(cherry picked from commit 0b61c4617e26f043d28d44c8767f7b9fd4882f97)
+---
+ libavcodec/v4l2_m2m.c | 43 +++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 43 insertions(+)
+
+diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
+index f802687b1b..28d9ed4988 100644
+--- a/libavcodec/v4l2_m2m.c
++++ b/libavcodec/v4l2_m2m.c
+@@ -109,6 +109,44 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
+     return AVERROR(EINVAL);
+ }
+ 
++/* Probe (VIDIOC_TRY_FMT, output queue) whether the driver accepts frames of the
++ * coded size; returns 0 if it does, AVERROR(errno)/AVERROR(EINVAL) otherwise. */
++static int check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++    struct v4l2_format fmt = {.type = s->output.type};
++    int rv;
++    uint32_t pixfmt = ff_v4l2_format_avfmt_to_v4l2(avctx->pix_fmt);
++    unsigned int w, h;
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
++        fmt.fmt.pix_mp.pixelformat = pixfmt;
++        fmt.fmt.pix_mp.width = avctx->width;
++        fmt.fmt.pix_mp.height = avctx->height;
++    } else {
++        fmt.fmt.pix.pixelformat = pixfmt;
++        fmt.fmt.pix.width = avctx->width;
++        fmt.fmt.pix.height = avctx->height;
++    }
++
++    rv = ioctl(s->fd, VIDIOC_TRY_FMT, &fmt);
++    if (rv != 0) {
++        rv = AVERROR(errno);
++        av_log(avctx, AV_LOG_ERROR, "%s: Tryfmt failed: %s\n", __func__, av_err2str(rv));
++        return rv;
++    }
++
++    // TRY_FMT may adjust the size; read back what the driver would really use
++    w = ff_v4l2_get_format_width(&fmt);
++    h = ff_v4l2_get_format_height(&fmt);
++
++    if (w < avctx->width || h < avctx->height) {
++        av_log(avctx, AV_LOG_WARNING, "%s: Size check failed: asked for %dx%d, got: %ux%u\n", __func__, avctx->width, avctx->height, w, h);
++        return AVERROR(EINVAL);
++    }
++
++    return 0;
++}
++
+ static int v4l2_probe_driver(V4L2m2mContext *s)
+ {
+     void *log_ctx = s->avctx;
+@@ -128,6 +166,11 @@ static int v4l2_probe_driver(V4L2m2mContext *s)
+         goto done;
+     }
+ 
++    // If being given frames (encode) check that V4L2 can cope with the size
++    if (s->output.av_codec_id == AV_CODEC_ID_RAWVIDEO &&
++        (ret = check_size(s->avctx, s)) != 0)
++        goto done;
++
+     ret = ff_v4l2_context_get_format(&s->capture, 1);
+     if (ret) {
+         av_log(log_ctx, AV_LOG_DEBUG, "v4l2 capture format not supported\n");
+
+From 805985ea191c98885a74dbf994b1ca11551cd81e Mon Sep 17 00:00:00 2001
+From: John Cox <jc@kynesim.co.uk>
+Date: Fri, 9 Jun 2023 10:28:12 +0000
+Subject: [PATCH 135/135] vf_bwdif: Add attributes to ask for vectorization
+
+(cherry picked from commit 281250290ba5c2dcd8676e9a261050e65c10bcb7)
+---
+ libavfilter/vf_bwdif.c | 29 +++++++++++++++--------------
+ 1 file changed, 15 insertions(+), 14 deletions(-)
+
+diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
+index 65c617ebb3..09e68523bb 100644
+--- a/libavfilter/vf_bwdif.c
++++ b/libavfilter/vf_bwdif.c
+@@ -74,10 +74,10 @@ typedef struct ThreadData {
+         int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e)) >> 1; \
+         int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e)) >> 1; \
+         int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \
+- \
++ {/*\
+         if (!diff) { \
+             dst[0] = d; \
+-        } else {
++        } else {*/
+ 
+ #define SPAT_CHECK() \
+             int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c; \
+@@ -89,15 +89,16 @@ typedef struct ThreadData {
+             diff = FFMAX3(diff, min, -max);
+ 
+ #define FILTER_LINE() \
++            int i1, i2; \
+             SPAT_CHECK() \
+-            if (FFABS(c - e) > temporal_diff0) { \
+-                interpol = (((coef_hf[0] * (prev2[0] + next2[0]) \
++            /*if (FFABS(c - e) > temporal_diff0)*/ { \
++                i1 = (((coef_hf[0] * (prev2[0] + next2[0]) \
+                     - coef_hf[1] * (prev2[mrefs2] + next2[mrefs2] + prev2[prefs2] + next2[prefs2]) \
+                     + coef_hf[2] * (prev2[mrefs4] + next2[mrefs4] + prev2[prefs4] + next2[prefs4])) >> 2) \
+                     + coef_lf[0] * (c + e) - coef_lf[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
+-            } else { \
+-                interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
+-            }
++            } /*else*/ { \
++                i2 = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
++            }interpol = FFABS(c - e) > temporal_diff0 ? i1:i2;\
+ 
+ #define FILTER_EDGE() \
+             if (spat) { \
+@@ -111,7 +112,7 @@ typedef struct ThreadData {
+             else if (interpol < d - diff) \
+                 interpol = d - diff; \
+  \
+-            dst[0] = av_clip(interpol, 0, clip_max); \
++            dst[0] = !diff ? d : av_clip(interpol, 0, clip_max); \
+         } \
+  \
+         dst++; \
+@@ -122,7 +123,7 @@ typedef struct ThreadData {
+         next2++; \
+     }
+ 
+-static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
++static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs,
+                          int prefs3, int mrefs3, int parity, int clip_max)
+ {
+     uint8_t *dst = dst1;
+@@ -132,7 +133,7 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
+     FILTER_INTRA()
+ }
+ 
+-static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
++static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
+                           int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                           int prefs3, int mrefs3, int prefs4, int mrefs4,
+                           int parity, int clip_max)
+@@ -150,7 +151,7 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+     FILTER2()
+ }
+ 
+-static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
++static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
+                         int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                         int parity, int clip_max, int spat)
+ {
+@@ -167,7 +168,7 @@ static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
+     FILTER2()
+ }
+ 
+-static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mrefs,
++static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs,
+                                int prefs3, int mrefs3, int parity, int clip_max)
+ {
+     uint16_t *dst = dst1;
+@@ -177,7 +178,7 @@ static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mre
+     FILTER_INTRA()
+ }
+ 
+-static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1,
++static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
+                                 int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                                 int prefs3, int mrefs3, int prefs4, int mrefs4,
+                                 int parity, int clip_max)
+@@ -195,7 +196,7 @@ static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1
+     FILTER2()
+ }
+ 
+-static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1,
++static void __attribute__((optimize("tree-vectorize"))) filter_edge_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
+                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                               int parity, int clip_max, int spat)
+ {