From 25301226bd1e5cada80e03f0deb3f8cb0d23d9c0 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Sun, 24 Jul 2022 10:30:47 +0200 Subject: [PATCH] ffmpeg: update rpi patch Patch created using revisions dc91b91..a4f3ca9 from branch dev/4.4/rpi_import_1 of https://github.com/jc-kynesim/rpi-ffmpeg --- .../ffmpeg/patches/rpi/ffmpeg-001-rpi.patch | 4120 +++++++++++++---- 1 file changed, 3229 insertions(+), 891 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch index 6809d21783..ca1830666f 100644 --- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch +++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch @@ -375,7 +375,7 @@ index 807e783422..456d4f349b 100644 "write program-readable progress information", "url" }, { "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction }, diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 33a280cf69..ef22d26dc1 100644 +index 33a280cf69..a403dc41d6 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -19,6 +19,7 @@ HEADERS = ac3_parser.h \ @@ -428,7 +428,7 @@ index 33a280cf69..ef22d26dc1 100644 +OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL) += rpivid_hevc.o +OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL) += rpivid_hevc.o +OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ -+ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o ++ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o v4l2_req_hevc_v4.o OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o @@ -465,19 +465,10 @@ index 33a280cf69..ef22d26dc1 100644 +$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h +endif diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile -index 954461f81d..7078dc6089 100644 +index 954461f81d..c8935f205e 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile -@@ -35,6 +35,8 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o - - # subsystems - NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o -+NEON-OBJS-$(CONFIG_BLOCKDSP) += aarch64/blockdsp_init_aarch64.o \ -+ aarch64/blockdsp_neon.o - NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o - NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o - NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o -@@ -44,10 +46,12 @@ NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o +@@ -44,10 +44,12 @@ NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \ aarch64/hpeldsp_neon.o NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o @@ -491,103 +482,6 @@ index 954461f81d..7078dc6089 100644 NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o # decoders/encoders -diff --git a/libavcodec/aarch64/blockdsp_init_aarch64.c b/libavcodec/aarch64/blockdsp_init_aarch64.c -new file mode 100644 -index 0000000000..9f3280f007 ---- /dev/null -+++ b/libavcodec/aarch64/blockdsp_init_aarch64.c -@@ -0,0 +1,42 @@ -+/* -+ * AArch64 NEON optimised block operations -+ * -+ * Copyright (c) 2022 Ben Avison -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. 
-+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include -+ -+#include "libavutil/attributes.h" -+#include "libavutil/cpu.h" -+#include "libavutil/arm/cpu.h" -+#include "libavcodec/avcodec.h" -+#include "libavcodec/blockdsp.h" -+ -+void ff_clear_block_neon(int16_t *block); -+void ff_clear_blocks_neon(int16_t *blocks); -+ -+av_cold void ff_blockdsp_init_aarch64(BlockDSPContext *c) -+{ -+ int cpu_flags = av_get_cpu_flags(); -+ -+ if (have_neon(cpu_flags)) { -+ c->clear_block = ff_clear_block_neon; -+ c->clear_blocks = ff_clear_blocks_neon; -+ } -+} -diff --git a/libavcodec/aarch64/blockdsp_neon.S b/libavcodec/aarch64/blockdsp_neon.S -new file mode 100644 -index 0000000000..e4a4959ccc ---- /dev/null -+++ b/libavcodec/aarch64/blockdsp_neon.S -@@ -0,0 +1,43 @@ -+/* -+ * AArch64 NEON optimised block operations -+ * -+ * Copyright (c) 2022 Ben Avison -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/aarch64/asm.S" -+ -+function ff_clear_block_neon, export=1 -+ movi v0.16b, #0 -+ movi v1.16b, #0 -+ st1 {v0.16b, v1.16b}, [x0], #32 -+ st1 {v0.16b, v1.16b}, [x0], #32 -+ st1 {v0.16b, v1.16b}, [x0], #32 -+ st1 {v0.16b, v1.16b}, [x0] -+ ret -+endfunc -+ -+function ff_clear_blocks_neon, export=1 -+ movi v0.16b, #0 -+ movi v1.16b, #0 -+ .rept 23 -+ st1 {v0.16b, v1.16b}, [x0], #32 -+ .endr -+ st1 {v0.16b, v1.16b}, [x0] -+ ret -+endfunc diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c index 742a3372e3..eec21aa5a2 100644 --- a/libavcodec/aarch64/idctdsp_init_aarch64.c @@ -767,7 +661,7 @@ index 0000000000..7f47611206 + ret +endfunc diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c -index 13dfd74940..161d5a972b 100644 +index 13dfd74940..a7976fd596 100644 --- a/libavcodec/aarch64/vc1dsp_init_aarch64.c +++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c @@ -21,10 +21,28 @@ @@ -789,12 +683,12 @@ index 13dfd74940..161d5a972b 100644 +void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + -+void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq); -+void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq); -+void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq); -+void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq); -+void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq); -+void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); ++void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); + void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y); @@ -892,10 +786,10 @@ index 13dfd74940..161d5a972b 100644 } diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S new file mode 100644 -index 0000000000..529c21d285 +index 0000000000..9a96c2523c --- /dev/null +++ b/libavcodec/aarch64/vc1dsp_neon.S -@@ -0,0 +1,1552 @@ +@@ -0,0 +1,1546 @@ +/* + * VC1 AArch64 NEON optimisations + * @@ -1605,11 +1499,10 @@ index 0000000000..529c21d285 +// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks +// On entry: +// x0 -> top-left pel of lower block -+// w1 = row stride, bytes ++// x1 = row stride, bytes +// w2 = PQUANT bitstream parameter +function ff_vc1_v_loop_filter4_neon, export=1 + sub x3, x0, w1, sxtw #2 -+ sxtw x1, w1 // technically, stride is signed int + ldr d0, .Lcoeffs + ld1 {v1.s}[0], [x0], x1 // P5 + ld1 {v2.s}[0], [x3], x1 // P1 @@ -1678,11 +1571,10 @@ index 0000000000..529c21d285 +// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks +// On entry: +// x0 -> top-left pel of right block -+// w1 = row stride, bytes ++// x1 = row stride, bytes +// w2 = 
PQUANT bitstream parameter +function ff_vc1_h_loop_filter4_neon, export=1 + sub x3, x0, #4 // where to start reading -+ sxtw x1, w1 // technically, stride is signed int + ldr d0, .Lcoeffs + ld1 {v1.8b}, [x3], x1 + sub x0, x0, #1 // where to start writing @@ -1752,11 +1644,10 @@ index 0000000000..529c21d285 +// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks +// On entry: +// x0 -> top-left pel of lower block -+// w1 = row stride, bytes ++// x1 = row stride, bytes +// w2 = PQUANT bitstream parameter +function ff_vc1_v_loop_filter8_neon, export=1 + sub x3, x0, w1, sxtw #2 -+ sxtw x1, w1 // technically, stride is signed int + ldr d0, .Lcoeffs + ld1 {v1.8b}, [x0], x1 // P5 + movi v2.2d, #0x0000ffff00000000 @@ -1830,11 +1721,10 @@ index 0000000000..529c21d285 +// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks +// On entry: +// x0 -> top-left pel of right block -+// w1 = row stride, bytes ++// x1 = row stride, bytes +// w2 = PQUANT bitstream parameter +function ff_vc1_h_loop_filter8_neon, export=1 + sub x3, x0, #4 // where to start reading -+ sxtw x1, w1 // technically, stride is signed int + ldr d0, .Lcoeffs + ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... + sub x0, x0, #1 // where to start writing @@ -1939,11 +1829,10 @@ index 0000000000..529c21d285 +// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks +// On entry: +// x0 -> top-left pel of lower block -+// w1 = row stride, bytes ++// x1 = row stride, bytes +// w2 = PQUANT bitstream parameter +function ff_vc1_v_loop_filter16_neon, export=1 + sub x3, x0, w1, sxtw #2 -+ sxtw x1, w1 // technically, stride is signed int + ldr d0, .Lcoeffs + ld1 {v1.16b}, [x0], x1 // P5 + movi v2.2d, #0x0000ffff00000000 @@ -2071,11 +1960,10 @@ index 0000000000..529c21d285 +// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks +// On entry: +// x0 -> top-left pel of right block -+// w1 = row stride, bytes ++// x1 = row stride, bytes +// w2 = PQUANT bitstream parameter +function ff_vc1_h_loop_filter16_neon, export=1 + sub x3, x0, #4 // where to start reading -+ sxtw x1, w1 // technically, stride is signed int + ldr d0, .Lcoeffs + ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... 
+ sub x0, x0, #1 // where to start writing @@ -17404,7 +17292,7 @@ index 2cca784f5a..48cb816b70 100644 + dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; } diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S -index 93f043bf08..8e97bc5e58 100644 +index 93f043bf08..96014fbebc 100644 --- a/libavcodec/arm/vc1dsp_neon.S +++ b/libavcodec/arm/vc1dsp_neon.S @@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1 @@ -17560,17 +17448,17 @@ index 93f043bf08..8e97bc5e58 100644 +function ff_vc1_v_loop_filter8_neon, export=1 + sub r3, r0, r1, lsl #2 + vldr d0, .Lcoeffs -+ vld1.32 {d1}, [r0], r1 @ P5 -+ vld1.32 {d2}, [r3], r1 @ P1 -+ vld1.32 {d3}, [r3], r1 @ P2 -+ vld1.32 {d4}, [r0], r1 @ P6 -+ vld1.32 {d5}, [r3], r1 @ P3 -+ vld1.32 {d6}, [r0], r1 @ P7 ++ vld1.32 {d1}, [r0 :64], r1 @ P5 ++ vld1.32 {d2}, [r3 :64], r1 @ P1 ++ vld1.32 {d3}, [r3 :64], r1 @ P2 ++ vld1.32 {d4}, [r0 :64], r1 @ P6 ++ vld1.32 {d5}, [r3 :64], r1 @ P3 ++ vld1.32 {d6}, [r0 :64], r1 @ P7 + vshll.u8 q8, d1, #1 @ 2*P5 + vshll.u8 q9, d2, #1 @ 2*P1 -+ vld1.32 {d7}, [r3] @ P4 ++ vld1.32 {d7}, [r3 :64] @ P4 + vmovl.u8 q1, d3 @ P2 -+ vld1.32 {d20}, [r0] @ P8 ++ vld1.32 {d20}, [r0 :64] @ P8 + vmovl.u8 q11, d4 @ P6 + vdup.16 q12, r2 @ pq + vmovl.u8 q13, d5 @ P3 @@ -17625,8 +17513,8 @@ index 93f043bf08..8e97bc5e58 100644 + vmla.i16 q1, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 + vqmovun.s16 d0, q3 + vqmovun.s16 d1, q1 -+ vst1.32 {d0}, [r3], r1 -+ vst1.32 {d1}, [r3] ++ vst1.32 {d0}, [r3 :64], r1 ++ vst1.32 {d1}, [r3 :64] +1: bx lr +endfunc + @@ -17741,17 +17629,17 @@ index 93f043bf08..8e97bc5e58 100644 + vpush {d8-d15} + sub r3, r0, r1, lsl #2 + vldr d0, .Lcoeffs -+ vld1.64 {q1}, [r0], r1 @ P5 -+ vld1.64 {q2}, [r3], r1 @ P1 -+ vld1.64 {q3}, [r3], r1 @ P2 -+ vld1.64 {q4}, [r0], r1 @ P6 -+ vld1.64 {q5}, [r3], r1 @ P3 -+ vld1.64 {q6}, [r0], r1 @ P7 ++ vld1.64 {q1}, [r0 :128], r1 @ P5 ++ vld1.64 {q2}, [r3 :128], r1 @ P1 ++ vld1.64 {q3}, [r3 :128], r1 @ P2 ++ vld1.64 {q4}, [r0 :128], r1 @ P6 ++ vld1.64 {q5}, [r3 :128], r1 @ P3 ++ vld1.64 {q6}, [r0 :128], r1 @ P7 + vshll.u8 q7, d2, #1 @ 2*P5[0..7] + vshll.u8 q8, d4, #1 @ 2*P1[0..7] -+ vld1.64 {q9}, [r3] @ P4 ++ vld1.64 {q9}, [r3 :128] @ P4 + vmovl.u8 q10, d6 @ P2[0..7] -+ vld1.64 {q11}, [r0] @ P8 ++ vld1.64 {q11}, [r0 :128] @ P8 + vmovl.u8 q12, d8 @ P6[0..7] + vdup.16 q13, r2 @ pq + vshll.u8 q2, d5, #1 @ 2*P1[8..15] @@ -17861,8 +17749,8 @@ index 93f043bf08..8e97bc5e58 100644 + vqmovun.s16 d0, q6 + vqmovun.s16 d5, q9 + vqmovun.s16 d1, q1 -+ vst1.64 {q2}, [r3], r1 -+ vst1.64 {q0}, [r3] ++ vst1.64 {q2}, [r3 :128], r1 ++ vst1.64 {q0}, [r3 :128] +1: vpop {d8-d15} + bx lr +endfunc @@ -18194,31 +18082,6 @@ index 8a71c04230..53644506e5 100644 } AVHWAccel; /** -diff --git a/libavcodec/blockdsp.c b/libavcodec/blockdsp.c -index c7efe7e77b..46766244b8 100644 ---- a/libavcodec/blockdsp.c -+++ b/libavcodec/blockdsp.c -@@ -65,6 +65,8 @@ av_cold void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx) - c->fill_block_tab[0] = fill_block16_c; - c->fill_block_tab[1] = fill_block8_c; - -+ if (ARCH_AARCH64) -+ ff_blockdsp_init_aarch64(c); - if (ARCH_ALPHA) - ff_blockdsp_init_alpha(c); - if (ARCH_ARM) -diff --git a/libavcodec/blockdsp.h b/libavcodec/blockdsp.h -index 26fc2ea13b..fe539491da 100644 ---- a/libavcodec/blockdsp.h -+++ b/libavcodec/blockdsp.h -@@ -41,6 +41,7 @@ typedef struct BlockDSPContext { - - void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx); - -+void 
ff_blockdsp_init_aarch64(BlockDSPContext *c); - void ff_blockdsp_init_alpha(BlockDSPContext *c); - void ff_blockdsp_init_arm(BlockDSPContext *c); - void ff_blockdsp_init_ppc(BlockDSPContext *c); diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h index 38d06b2842..bbf5d70560 100644 --- a/libavcodec/cabac.h @@ -19015,6 +18878,527 @@ index 0000000000..4e35bd583d +#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) + +#endif +diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h +new file mode 100644 +index 0000000000..7e05f6e7c3 +--- /dev/null ++++ b/libavcodec/hevc-ctrls-v4.h +@@ -0,0 +1,515 @@ ++/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */ ++/* ++ * Video for Linux Two controls header file ++ * ++ * Copyright (C) 1999-2012 the contributors ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * Alternatively you can redistribute this file under the terms of the ++ * BSD license as stated below: ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * 3. The names of its contributors may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED ++ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * The contents of this header was split off from videodev2.h. All control ++ * definitions should be added to this header, which is included by ++ * videodev2.h. 
++ */ ++ ++#ifndef AVCODEC_HEVC_CTRLS_V4_H ++#define AVCODEC_HEVC_CTRLS_V4_H ++ ++#include ++#include ++ ++#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) ++#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401) ++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402) ++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403) ++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404) ++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405) ++#define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_STATELESS_BASE + 406) ++#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407) ++ ++enum v4l2_stateless_hevc_decode_mode { ++ V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_stateless_hevc_start_code { ++ V4L2_STATELESS_HEVC_START_CODE_NONE, ++ V4L2_STATELESS_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) ++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/** ++ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. 
H.265: Sequence parameter set ++ * ++ * @video_parameter_set_id: specifies the value of the ++ * vps_video_parameter_set_id of the active VPS ++ * @seq_parameter_set_id: provides an identifier for the SPS for ++ * reference by other syntax elements ++ * @pic_width_in_luma_samples: specifies the width of each decoded picture ++ * in units of luma samples ++ * @pic_height_in_luma_samples: specifies the height of each decoded picture ++ * in units of luma samples ++ * @bit_depth_luma_minus8: this value plus 8specifies the bit depth of the ++ * samples of the luma array ++ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the ++ * samples of the chroma arrays ++ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of ++ * the variable MaxPicOrderCntLsb ++ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum ++ * required size of the decoded picture ++ * buffer for the codec video sequence ++ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures ++ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the ++ * value of SpsMaxLatencyPictures array ++ * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum ++ * luma coding block size ++ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between ++ * the maximum and minimum luma ++ * coding block size ++ * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma ++ * transform block size ++ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between ++ * the maximum and minimum luma ++ * transform block size ++ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy ++ * depth for transform units of ++ * coding units coded in inter ++ * prediction mode ++ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy ++ * depth for transform units of ++ * coding units coded in intra ++ * prediction mode ++ * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of ++ * bits used to represent each of PCM sample ++ * values of the luma component ++ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number ++ * of bits used to represent each of PCM ++ * sample values of the chroma components ++ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the ++ * minimum size of coding blocks ++ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between ++ * the maximum and minimum size of ++ * coding blocks ++ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set() ++ * syntax structures included in the SPS ++ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term ++ * reference pictures that are specified in the SPS ++ * @chroma_format_idc: specifies the chroma sampling ++ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number ++ * of temporal sub-layers ++ * @reserved: padding field. Should be zeroed by applications. 
++ * @flags: see V4L2_HEVC_SPS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_sps { ++ __u8 video_parameter_set_id; ++ __u8 seq_parameter_set_id; ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ __u8 sps_max_sub_layers_minus1; ++ ++ __u8 reserved[6]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) ++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) ++ ++/** ++ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set ++ * ++ * @pic_parameter_set_id: identifies the PPS for reference by other ++ * syntax elements ++ * @num_extra_slice_header_bits: specifies the number of extra slice header ++ * bits that are present in the slice header RBSP ++ * for coded pictures referring to the PPS. 
++ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the ++ * inferred value of num_ref_idx_l0_active_minus1 ++ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the ++ * inferred value of num_ref_idx_l1_active_minus1 ++ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for ++ * each slice referring to the PPS ++ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding ++ * tree block size and the minimum luma coding block ++ * size of coding units that convey cu_qp_delta_abs ++ * and cu_qp_delta_sign_flag ++ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb ++ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr ++ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns ++ * partitioning the picture ++ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning ++ * the picture ++ * @column_width_minus1: this value plus 1 specifies the width of the each tile column in ++ * units of coding tree blocks ++ * @row_height_minus1: this value plus 1 specifies the height of the each tile row in ++ * units of coding tree blocks ++ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for ++ * beta divided by 2 ++ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC ++ * divided by 2 ++ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of ++ * the variable Log2ParMrgLevel ++ * @reserved: padding field. Should be zeroed by applications. ++ * @flags: see V4L2_HEVC_PPS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_pps { ++ __u8 pic_parameter_set_id; ++ __u8 num_extra_slice_header_bits; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ __u8 reserved; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 ++ ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6 ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7 ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++/** ++ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry ++ * ++ * @timestamp: timestamp of the V4L2 capture buffer to use as reference. ++ * @flags: long term flag for the reference frame ++ * @field_pic: whether the reference is a field picture or a frame. ++ * @reserved: padding field. Should be zeroed by applications. ++ * @pic_order_cnt_val: the picture order count of the current picture. 
++ */ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 flags; ++ __u8 field_pic; ++ __u16 reserved; ++ __s32 pic_order_cnt_val; ++}; ++ ++/** ++ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters ++ * ++ * @delta_luma_weight_l0: the difference of the weighting factor applied ++ * to the luma prediction value for list 0 ++ * @luma_offset_l0: the additive offset applied to the luma prediction value ++ * for list 0 ++ * @delta_chroma_weight_l0: the difference of the weighting factor applied ++ * to the chroma prediction values for list 0 ++ * @chroma_offset_l0: the difference of the additive offset applied to ++ * the chroma prediction values for list 0 ++ * @delta_luma_weight_l1: the difference of the weighting factor applied ++ * to the luma prediction value for list 1 ++ * @luma_offset_l1: the additive offset applied to the luma prediction value ++ * for list 1 ++ * @delta_chroma_weight_l1: the difference of the weighting factor applied ++ * to the chroma prediction values for list 1 ++ * @chroma_offset_l1: the difference of the additive offset applied to ++ * the chroma prediction values for list 1 ++ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for ++ * all luma weighting factors ++ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm ++ * of the denominator for all chroma ++ * weighting factors ++ */ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++/** ++ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters ++ * ++ * This control is a dynamically sized 1-dimensional array, ++ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it. ++ * ++ * @bit_size: size (in bits) of the current slice data ++ * @data_byte_offset: offset (in bytes) to the video data in the current slice data ++ * @num_entry_point_offsets: specifies the number of entry point offset syntax ++ * elements in the slice header. 
++ * @nal_unit_type: specifies the coding type of the slice (B, P or I) ++ * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit ++ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{} ++ * @colour_plane_id: specifies the colour plane associated with the current slice ++ * @slice_pic_order_cnt: specifies the picture order count ++ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum ++ * reference index for reference picture list 0 ++ * that may be used to decode the slice ++ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum ++ * reference index for reference picture list 1 ++ * that may be used to decode the slice ++ * @collocated_ref_idx: specifies the reference index of the collocated picture used ++ * for temporal motion vector prediction ++ * @five_minus_max_num_merge_cand: specifies the maximum number of merging ++ * motion vector prediction candidates supported in ++ * the slice subtracted from 5 ++ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding ++ * blocks in the slice ++ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset ++ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset ++ * @slice_act_y_qp_offset: screen content extension parameters ++ * @slice_act_cb_qp_offset: screen content extension parameters ++ * @slice_act_cr_qp_offset: screen content extension parameters ++ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2 ++ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2 ++ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or ++ * more fields ++ * @reserved0: padding field. Should be zeroed by applications. ++ * @slice_segment_addr: specifies the address of the first coding tree block in ++ * the slice segment ++ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB ++ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB ++ * @short_term_ref_pic_set_size: specifies the size of short-term reference ++ * pictures set included in the SPS ++ * @long_term_ref_pic_set_size: specifies the size of long-term reference ++ * pictures set include in the SPS ++ * @pred_weight_table: the prediction weight coefficients for inter-picture ++ * prediction ++ * @reserved1: padding field. Should be zeroed by applications. ++ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_byte_offset; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __s32 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ __u8 reserved0[3]; ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u16 short_term_ref_pic_set_size; ++ __u16 long_term_ref_pic_set_size; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u8 reserved1[2]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 ++ ++/** ++ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters ++ * ++ * @pic_order_cnt_val: picture order count ++ * @short_term_ref_pic_set_size: specifies the size of short-term reference ++ * pictures set included in the SPS of the first slice ++ * @long_term_ref_pic_set_size: specifies the size of long-term reference ++ * pictures set include in the SPS of the first slice ++ * @num_active_dpb_entries: the number of entries in dpb ++ * @num_poc_st_curr_before: the number of reference pictures in the short-term ++ * set that come before the current frame ++ * @num_poc_st_curr_after: the number of reference pictures in the short-term ++ * set that come after the current frame ++ * @num_poc_lt_curr: the number of reference pictures in the long-term set ++ * @poc_st_curr_before: provides the index of the short term before references ++ * in DPB array ++ * @poc_st_curr_after: provides the index of the short term after references ++ * in DPB array ++ * @poc_lt_curr: provides the index of the long term references in DPB array ++ * @reserved: padding field. Should be zeroed by applications. ++ * @dpb: the decoded picture buffer, for meta-data about reference frames ++ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_decode_params { ++ __s32 pic_order_cnt_val; ++ __u16 short_term_ref_pic_set_size; ++ __u16 long_term_ref_pic_set_size; ++ __u8 num_active_dpb_entries; ++ __u8 num_poc_st_curr_before; ++ __u8 num_poc_st_curr_after; ++ __u8 num_poc_lt_curr; ++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 reserved[4]; ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u64 flags; ++}; ++ ++/** ++ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters ++ * ++ * @scaling_list_4x4: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_8x8: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_16x16: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_32x32: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process ++ * for transform coefficients. The values on each ++ * scaling list are expected in raster scan order. ++ * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process ++ * for transform coefficients. The values on each ++ * scaling list are expected in raster scan order. 
++ */ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++#endif diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c index 463d352055..7feff43c28 100644 --- a/libavcodec/hevc_parser.c @@ -19039,8 +19423,67 @@ index 463d352055..7feff43c28 100644 if (ps->vps->vps_timing_info_present_flag) { num = ps->vps->vps_num_units_in_tick; den = ps->vps->vps_time_scale; +diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c +index 4f6d985ae6..eefae71275 100644 +--- a/libavcodec/hevc_refs.c ++++ b/libavcodec/hevc_refs.c +@@ -96,18 +96,22 @@ static HEVCFrame *alloc_frame(HEVCContext *s) + if (!frame->rpl_buf) + goto fail; + +- frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); +- if (!frame->tab_mvf_buf) +- goto fail; +- frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; ++ if (s->tab_mvf_pool) { ++ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); ++ if (!frame->tab_mvf_buf) ++ goto fail; ++ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; ++ } + +- frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); +- if (!frame->rpl_tab_buf) +- goto fail; +- frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; +- frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; +- for (j = 0; j < frame->ctb_count; j++) +- frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; ++ if (s->rpl_tab_pool) { ++ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); ++ if (!frame->rpl_tab_buf) ++ goto fail; ++ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; ++ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; ++ for (j = 0; j < frame->ctb_count; j++) ++ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; ++ } + + frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; + frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); +@@ -276,14 +280,17 @@ static int init_slice_rpl(HEVCContext *s) + int ctb_count = frame->ctb_count; + int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; + int i; ++ RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; + + if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab)) + return AVERROR_INVALIDDATA; + +- for (i = ctb_addr_ts; i < ctb_count; i++) +- frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; ++ if (frame->rpl_tab) { ++ for (i = ctb_addr_ts; i < ctb_count; i++) ++ frame->rpl_tab[i] = tab; ++ } + +- frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts]; ++ frame->refPicList = tab->refPicList; + + return 0; + } diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c -index 2231aed259..6d2d66dfdf 100644 +index 2231aed259..7b05b41441 100644 --- a/libavcodec/hevcdec.c +++ b/libavcodec/hevcdec.c @@ -333,6 +333,19 @@ static void export_stream_params(HEVCContext *s, const HEVCSPS *sps) @@ -19110,7 +19553,43 @@ index 2231aed259..6d2d66dfdf 100644 #endif break; case AV_PIX_FMT_YUV444P: -@@ -3327,7 +3355,14 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output, +@@ -485,6 +513,16 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, + if (!sps) + return 0; + ++ // If hwaccel then we don't need all 
the s/w decode helper arrays ++ if (s->avctx->hwaccel) { ++ export_stream_params(s, sps); ++ ++ s->avctx->pix_fmt = pix_fmt; ++ s->ps.sps = sps; ++ s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; ++ return 0; ++ } ++ + ret = pic_arrays_init(s, sps); + if (ret < 0) + goto fail; +@@ -2901,11 +2939,13 @@ static int hevc_frame_start(HEVCContext *s) + ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1); + int ret; + +- memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); +- memset(s->vertical_bs, 0, s->bs_width * s->bs_height); +- memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); +- memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); +- memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); ++ if (s->horizontal_bs) { ++ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); ++ memset(s->vertical_bs, 0, s->bs_width * s->bs_height); ++ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); ++ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); ++ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); ++ } + + s->is_decoded = 0; + s->first_nal_type = s->nal_unit_type; +@@ -3327,7 +3367,14 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output, s->ref = NULL; ret = decode_nal_units(s, avpkt->data, avpkt->size); if (ret < 0) @@ -19125,7 +19604,35 @@ index 2231aed259..6d2d66dfdf 100644 if (avctx->hwaccel) { if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) { -@@ -3697,6 +3732,15 @@ AVCodec ff_hevc_decoder = { +@@ -3370,15 +3417,19 @@ static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src) + if (ret < 0) + return ret; + +- dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); +- if (!dst->tab_mvf_buf) +- goto fail; +- dst->tab_mvf = src->tab_mvf; ++ if (src->tab_mvf_buf) { ++ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); ++ if (!dst->tab_mvf_buf) ++ goto fail; ++ dst->tab_mvf = src->tab_mvf; ++ } + +- dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); +- if (!dst->rpl_tab_buf) +- goto fail; +- dst->rpl_tab = src->rpl_tab; ++ if (src->rpl_tab_buf) { ++ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); ++ if (!dst->rpl_tab_buf) ++ goto fail; ++ dst->rpl_tab = src->rpl_tab; ++ } + + dst->rpl_buf = av_buffer_ref(src->rpl_buf); + if (!dst->rpl_buf) +@@ -3697,6 +3748,15 @@ AVCodec ff_hevc_decoder = { #if CONFIG_HEVC_NVDEC_HWACCEL HWACCEL_NVDEC(hevc), #endif @@ -49497,7 +50004,7 @@ index 0000000000..85c5b46d75 +}; + diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 4b2679eb38..6ca83cc21b 100644 +index 4b2679eb38..9ef2f40e39 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -21,6 +21,7 @@ @@ -49508,9 +50015,11 @@ index 4b2679eb38..6ca83cc21b 100644 #include #include #include -@@ -30,56 +31,68 @@ +@@ -29,57 +30,82 @@ + #include #include "libavcodec/avcodec.h" #include "libavcodec/internal.h" ++#include "libavutil/avassert.h" #include "libavutil/pixdesc.h" +#include "libavutil/hwcontext.h" #include "v4l2_context.h" @@ -49552,21 +50061,32 @@ index 4b2679eb38..6ca83cc21b 100644 } -static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts) -+static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts) ++static inline struct timeval tv_from_int(const int64_t t) { - int64_t v4l2_pts; -- ++ return (struct timeval){ ++ .tv_usec = t % USEC_PER_SEC, ++ .tv_sec = t / USEC_PER_SEC ++ }; ++} + - if (pts 
== AV_NOPTS_VALUE) - pts = 0; -- ++static inline int64_t int_from_tv(const struct timeval t) ++{ ++ return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec; ++} + ++static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts) ++{ /* convert pts to v4l2 timebase */ - v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); +- out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; +- out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; + const int64_t v4l2_pts = -+ out->context->no_pts_rescale ? pts : + pts == AV_NOPTS_VALUE ? 0 : + av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); - out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; - out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; ++ out->buf.timestamp = tv_from_int(v4l2_pts); } -static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf) @@ -49574,18 +50094,20 @@ index 4b2679eb38..6ca83cc21b 100644 { - int64_t v4l2_pts; - ++ const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp); ++ return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE; ++#if 0 /* convert pts back to encoder timebase */ - v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + -+ const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + - avbuf->buf.timestamp.tv_usec; - -- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); +- avbuf->buf.timestamp.tv_usec; + return + avbuf->context->no_pts_rescale ? v4l2_pts : + v4l2_pts == 0 ? AV_NOPTS_VALUE : + av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); ++#endif +} -+ + +- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); +static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) +{ + if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { @@ -49598,7 +50120,7 @@ index 4b2679eb38..6ca83cc21b 100644 } static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) -@@ -116,6 +129,105 @@ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) +@@ -116,6 +142,105 @@ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) return AVCOL_PRI_UNSPECIFIED; } @@ -49704,7 +50226,7 @@ index 4b2679eb38..6ca83cc21b 100644 static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) { enum v4l2_quantization qt; -@@ -134,6 +246,20 @@ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) +@@ -134,6 +259,20 @@ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) return AVCOL_RANGE_UNSPECIFIED; } @@ -49725,29 +50247,23 @@ index 4b2679eb38..6ca83cc21b 100644 static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf) { enum v4l2_ycbcr_encoding ycbcr; -@@ -210,73 +336,165 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) +@@ -210,73 +349,165 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) return AVCOL_TRC_UNSPECIFIED; } -static void v4l2_free_buffer(void *opaque, uint8_t *unused) +static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf) -+{ -+ return V4L2_FIELD_IS_INTERLACED(buf->buf.field); -+} -+ -+static int v4l2_buf_is_top_first(const V4L2Buffer * const buf) { - V4L2Buffer* avbuf = opaque; - V4L2m2mContext *s = buf_to_m2mctx(avbuf); -+ return buf->buf.field == V4L2_FIELD_INTERLACED_TB; ++ return V4L2_FIELD_IS_INTERLACED(buf->buf.field); +} - if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) { - atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel); -+static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff) 
++static int v4l2_buf_is_top_first(const V4L2Buffer * const buf) +{ -+ buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE : -+ is_tff ? V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT; ++ return buf->buf.field == V4L2_FIELD_INTERLACED_TB; +} - if (s->reinit) { @@ -49761,11 +50277,18 @@ index 4b2679eb38..6ca83cc21b 100644 - else if (avbuf->context->streamon) - ff_v4l2_buffer_enqueue(avbuf); - } ++static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff) ++{ ++ buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE : ++ is_tff ? V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT; ++} ++ +static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) +{ + AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; + AVDRMLayerDescriptor *layer; -+ + +- av_buffer_unref(&avbuf->context_ref); + /* fill the DRM frame descriptor */ + drm_desc->nb_objects = avbuf->num_planes; + drm_desc->nb_layers = 1; @@ -49777,7 +50300,7 @@ index 4b2679eb38..6ca83cc21b 100644 + layer->planes[i].object_index = i; + layer->planes[i].offset = 0; + layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; -+ } + } + + switch (avbuf->context->av_pix_fmt) { + case AV_PIX_FMT_YUYV422: @@ -49805,8 +50328,7 @@ index 4b2679eb38..6ca83cc21b 100644 + break; + + case AV_PIX_FMT_YUV420P: - -- av_buffer_unref(&avbuf->context_ref); ++ + layer->format = DRM_FORMAT_YUV420; + + if (avbuf->num_planes > 1) @@ -49829,7 +50351,7 @@ index 4b2679eb38..6ca83cc21b 100644 + default: + drm_desc->nb_layers = 0; + break; - } ++ } + + return (uint8_t *) drm_desc; } @@ -49858,7 +50380,7 @@ index 4b2679eb38..6ca83cc21b 100644 - in->status = V4L2BUF_RET_USER; - atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed); -+ avbuf->status = V4L2BUF_AVAILABLE; ++ ff_v4l2_buffer_set_avail(avbuf); - return 0; + if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) { @@ -49934,7 +50456,7 @@ index 4b2679eb38..6ca83cc21b 100644 if (plane >= out->num_planes) return AVERROR(EINVAL); -@@ -284,32 +502,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i +@@ -284,32 +515,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i length = out->plane_info[plane].length; bytesused = FFMIN(size+offset, length); @@ -49989,7 +50511,7 @@ index 4b2679eb38..6ca83cc21b 100644 + frame->buf[0] = wrap_avbuf(avbuf); + if (frame->buf[0] == NULL) + return AVERROR(ENOMEM); - ++ + if (buf_to_m2mctx(avbuf)->output_drm) { + /* 1. get references to the actual data */ + frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); @@ -49997,7 +50519,7 @@ index 4b2679eb38..6ca83cc21b 100644 + frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref); + return 0; + } -+ + + + /* 1. 
get references to the actual data */ + for (i = 0; i < avbuf->num_planes; i++) { @@ -50007,7 +50529,7 @@ index 4b2679eb38..6ca83cc21b 100644 } /* fixup special cases */ -@@ -318,17 +561,17 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) +@@ -318,17 +574,17 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) case AV_PIX_FMT_NV21: if (avbuf->num_planes > 1) break; @@ -50031,7 +50553,7 @@ index 4b2679eb38..6ca83cc21b 100644 break; default: -@@ -338,68 +581,95 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) +@@ -338,68 +594,127 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) return 0; } @@ -50053,6 +50575,38 @@ index 4b2679eb38..6ca83cc21b 100644 +{ + return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); +} ++ ++static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out) ++{ ++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; ++ ++ if (frame->format != AV_PIX_FMT_DRM_PRIME || !src) ++ return AVERROR(EINVAL); ++ ++ av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { ++ // Only currently cope with single buffer types ++ if (out->buf.length != 1) ++ return AVERROR_PATCHWELCOME; ++ if (src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ out->planes[0].m.fd = src->objects[0].fd; ++ } ++ else { ++ if (src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ out->buf.m.fd = src->objects[0].fd; ++ } ++ ++ // No need to copy src AVDescriptor and if we did then we may confuse ++ // fd close on free ++ out->ref_buf = av_buffer_ref(frame->buf[0]); ++ ++ return 0; ++} + static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) { @@ -50181,10 +50735,16 @@ index 4b2679eb38..6ca83cc21b 100644 return 0; } -@@ -411,7 +681,16 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) +@@ -409,16 +724,31 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + * + ******************************************************************************/ - int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) +-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) ++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts) { +- v4l2_set_pts(out, frame->pts); +- +- return v4l2_buffer_swframe_to_buf(frame, out); + out->buf.flags = frame->key_frame ? + (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : + (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); @@ -50193,12 +50753,17 @@ index 4b2679eb38..6ca83cc21b 100644 + v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); + v4l2_set_color_range(out, frame->color_range); + // PTS & interlace are buffer vars - v4l2_set_pts(out, frame->pts); ++ if (track_ts) ++ out->buf.timestamp = tv_from_int(track_ts); ++ else ++ v4l2_set_pts(out, frame->pts); + v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); - - return v4l2_buffer_swframe_to_buf(frame, out); ++ ++ return frame->format == AV_PIX_FMT_DRM_PRIME ? 
++ v4l2_buffer_primeframe_to_buf(frame, out) : ++ v4l2_buffer_swframe_to_buf(frame, out); } -@@ -419,6 +698,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) { int ret; @@ -50206,7 +50771,7 @@ index 4b2679eb38..6ca83cc21b 100644 av_frame_unref(frame); -@@ -429,17 +709,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) +@@ -429,17 +759,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) /* 2. get frame information */ frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME); @@ -50242,7 +50807,7 @@ index 4b2679eb38..6ca83cc21b 100644 /* 3. report errors upstream */ if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) { -@@ -452,15 +747,14 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) +@@ -452,15 +797,15 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) { @@ -50260,16 +50825,18 @@ index 4b2679eb38..6ca83cc21b 100644 pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; - pkt->data = pkt->buf->data; + pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset; ++ pkt->flags = 0; if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) pkt->flags |= AV_PKT_FLAG_KEY; -@@ -475,31 +769,85 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) +@@ -475,31 +820,91 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) return 0; } -int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) -+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, -+ const void *extdata, size_t extlen) ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, ++ const void *extdata, size_t extlen, ++ const int64_t timestamp) { int ret; @@ -50285,7 +50852,11 @@ index 4b2679eb38..6ca83cc21b 100644 + if (ret && ret != AVERROR(ENOMEM)) return ret; - v4l2_set_pts(out, pkt->pts); +- v4l2_set_pts(out, pkt->pts); ++ if (timestamp) ++ out->buf.timestamp = tv_from_int(timestamp); ++ else ++ v4l2_set_pts(out, pkt->pts); - if (pkt->flags & AV_PKT_FLAG_KEY) - out->flags = V4L2_BUF_FLAG_KEYFRAME; @@ -50295,14 +50866,14 @@ index 4b2679eb38..6ca83cc21b 100644 - return 0; + return ret; ++} ++ ++int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) ++{ ++ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); } -int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) -+int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) -+{ -+ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0); -+} -+ + +static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) +{ @@ -50320,13 +50891,15 @@ index 4b2679eb38..6ca83cc21b 100644 + close(avbuf->drm_frame.objects[i].fd); + } + ++ av_buffer_unref(&avbuf->ref_buf); ++ + ff_weak_link_unref(&avbuf->context_wl); + + av_free(avbuf); +} + + -+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx) ++int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem) { - V4L2Context *ctx = avbuf->context; int ret, i; @@ -50343,8 +50916,9 @@ index 4b2679eb38..6ca83cc21b 100644 + return AVERROR(ENOMEM); + } +- avbuf->buf.memory = V4L2_MEMORY_MMAP; + avbuf->context = ctx; - avbuf->buf.memory = V4L2_MEMORY_MMAP; ++ avbuf->buf.memory = mem; avbuf->buf.type = ctx->type; avbuf->buf.index = 
index; @@ -50357,7 +50931,7 @@ index 4b2679eb38..6ca83cc21b 100644 if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->buf.length = VIDEO_MAX_PLANES; avbuf->buf.m.planes = avbuf->planes; -@@ -507,7 +855,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) +@@ -507,7 +912,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf); if (ret < 0) @@ -50366,7 +50940,16 @@ index 4b2679eb38..6ca83cc21b 100644 if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->num_planes = 0; -@@ -527,25 +875,33 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) +@@ -520,6 +925,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + avbuf->num_planes = 1; + + for (i = 0; i < avbuf->num_planes; i++) { ++ const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP && ++ (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm); + + avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? + ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline : +@@ -527,25 +934,29 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; @@ -50374,24 +50957,20 @@ index 4b2679eb38..6ca83cc21b 100644 - PROT_READ | PROT_WRITE, MAP_SHARED, - buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); + -+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -+ !buf_to_m2mctx(avbuf)->output_drm) { ++ if (want_mmap) + avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, + PROT_READ | PROT_WRITE, MAP_SHARED, + buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); -+ } } else { avbuf->plane_info[i].length = avbuf->buf.length; - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, - PROT_READ | PROT_WRITE, MAP_SHARED, - buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); + -+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -+ !buf_to_m2mctx(avbuf)->output_drm) { ++ if (want_mmap) + avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, + PROT_READ | PROT_WRITE, MAP_SHARED, + buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); -+ } } - if (avbuf->plane_info[i].mm_addr == MAP_FAILED) @@ -50411,7 +50990,7 @@ index 4b2679eb38..6ca83cc21b 100644 if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->buf.m.planes = avbuf->planes; avbuf->buf.length = avbuf->num_planes; -@@ -555,20 +911,51 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) +@@ -555,20 +966,51 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) avbuf->buf.length = avbuf->planes[0].length; } @@ -50468,10 +51047,10 @@ index 4b2679eb38..6ca83cc21b 100644 return 0; } diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h -index 8dbc7fc104..7d5fadcd3d 100644 +index 8dbc7fc104..e64441ec9b 100644 --- a/libavcodec/v4l2_buffers.h +++ b/libavcodec/v4l2_buffers.h -@@ -27,25 +27,34 @@ +@@ -27,25 +27,38 @@ #include #include @@ -50508,10 +51087,14 @@ index 8dbc7fc104..7d5fadcd3d 100644 - atomic_uint context_refcount; + /* DRM descriptor */ + AVDRMFrameDescriptor drm_frame; ++ /* For DRM_PRIME encode - need to keep a ref to the source buffer till we ++ * are done ++ */ ++ AVBufferRef * ref_buf; /* keep track of the mmap address and mmap length */ struct V4L2Plane_info { -@@ -60,7 +69,6 @@ typedef struct V4L2Buffer { +@@ -60,7 +73,6 @@ typedef struct V4L2Buffer { struct v4l2_buffer buf; struct v4l2_plane planes[VIDEO_MAX_PLANES]; @@ -50519,27 
+51102,50 @@ index 8dbc7fc104..7d5fadcd3d 100644 enum V4L2Buffer_status status; } V4L2Buffer; -@@ -98,6 +106,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); +@@ -98,6 +110,10 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); */ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); -+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, -+ const void *extdata, size_t extlen); ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, ++ const void *extdata, size_t extlen, ++ const int64_t timestamp); + /** * Extracts the data from an AVFrame to a V4L2Buffer * -@@ -116,7 +127,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); +@@ -106,7 +122,7 @@ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); + * + * @returns 0 in case of success, a negative AVERROR code otherwise + */ +-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); ++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts); + + /** + * Initializes a V4L2Buffer +@@ -116,7 +132,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); * * @returns 0 in case of success, a negative AVERROR code otherwise */ -int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); -+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx); ++int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem); /** * Enqueues a V4L2Buffer +@@ -127,5 +143,12 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); + */ + int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf); + ++static inline void ++ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf) ++{ ++ avbuf->status = V4L2BUF_AVAILABLE; ++ av_buffer_unref(&avbuf->ref_buf); ++} ++ + + #endif // AVCODEC_V4L2_BUFFERS_H diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index ff1ea8e57b..c0d257e5d3 100644 +index ff1ea8e57b..0225f6ba64 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -27,11 +27,13 @@ @@ -50556,49 +51162,191 @@ index ff1ea8e57b..c0d257e5d3 100644 struct v4l2_format_update { uint32_t v4l2_fmt; -@@ -41,28 +43,18 @@ struct v4l2_format_update { +@@ -41,26 +43,168 @@ struct v4l2_format_update { int update_avfmt; }; -static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx) -+static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) ++ ++static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) { - return V4L2_TYPE_IS_OUTPUT(ctx->type) ? - container_of(ctx, V4L2m2mContext, output) : - container_of(ctx, V4L2m2mContext, capture); +- return V4L2_TYPE_IS_OUTPUT(ctx->type) ? +- container_of(ctx, V4L2m2mContext, output) : +- container_of(ctx, V4L2m2mContext, capture); ++ return (int64_t)n; } -static inline AVCodecContext *logger(V4L2Context *ctx) -+static inline AVCodecContext *logger(const V4L2Context *ctx) ++static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) { - return ctx_to_m2mctx(ctx)->avctx; +- return ctx_to_m2mctx(ctx)->avctx; ++ return (unsigned int)pts; ++} ++ ++// FFmpeg requires us to propagate a number of vars from the coded pkt into ++// the decoded frame. The only thing that tracks like that in V4L2 stateful ++// is timestamp. PTS maps to timestamp for this decode. 
FFmpeg makes no ++// guarantees about PTS being unique or specified for every frame so replace ++// the supplied PTS with a simple incrementing number and keep a circular ++// buffer of all the things we want preserved (including the original PTS) ++// indexed by the tracking no. ++static int64_t ++xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt) ++{ ++ int64_t track_pts; ++ ++ // Avoid 0 ++ if (++x->track_no == 0) ++ x->track_no = 1; ++ ++ track_pts = track_to_pts(avctx, x->track_no); ++ ++ av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); ++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ ++ .discard = 0, ++ .pending = 1, ++ .pkt_size = avpkt->size, ++ .pts = avpkt->pts, ++ .dts = avpkt->dts, ++ .reordered_opaque = avctx->reordered_opaque, ++ .pkt_pos = avpkt->pos, ++ .pkt_duration = avpkt->duration, ++ .track_pts = track_pts ++ }; ++ return track_pts; ++} ++ ++static int64_t ++xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame) ++{ ++ int64_t track_pts; ++ ++ // Avoid 0 ++ if (++x->track_no == 0) ++ x->track_no = 1; ++ ++ track_pts = track_to_pts(avctx, x->track_no); ++ ++ av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no); ++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ ++ .discard = 0, ++ .pending = 1, ++ .pkt_size = 0, ++ .pts = frame->pts, ++ .dts = AV_NOPTS_VALUE, ++ .reordered_opaque = frame->reordered_opaque, ++ .pkt_pos = frame->pkt_pos, ++ .pkt_duration = frame->pkt_duration, ++ .track_pts = track_pts ++ }; ++ return track_pts; ++} ++ ++ ++// Returns -1 if we should discard the frame ++static int ++xlat_pts_frame_out(AVCodecContext *const avctx, ++ xlat_track_t * const x, ++ AVFrame *const frame) ++{ ++ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; ++ V4L2m2mTrackEl *const t = x->track_els + n; ++ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) ++ { ++ av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, ++ "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ frame->pts = AV_NOPTS_VALUE; ++ frame->pkt_dts = AV_NOPTS_VALUE; ++ frame->reordered_opaque = x->last_opaque; ++ frame->pkt_pos = -1; ++ frame->pkt_duration = 0; ++ frame->pkt_size = -1; ++ } ++ else if (!t->discard) ++ { ++ frame->pts = t->pending ? 
t->pts : AV_NOPTS_VALUE; ++ frame->pkt_dts = t->dts; ++ frame->reordered_opaque = t->reordered_opaque; ++ frame->pkt_pos = t->pkt_pos; ++ frame->pkt_duration = t->pkt_duration; ++ frame->pkt_size = t->pkt_size; ++ ++ x->last_opaque = x->track_els[n].reordered_opaque; ++ if (frame->pts != AV_NOPTS_VALUE) ++ x->last_pts = frame->pts; ++ t->pending = 0; ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ return -1; ++ } ++ ++ av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", ++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); ++ return 0; ++} ++ ++// Returns -1 if we should discard the frame ++static int ++xlat_pts_pkt_out(AVCodecContext *const avctx, ++ xlat_track_t * const x, ++ AVPacket *const pkt) ++{ ++ unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE; ++ V4L2m2mTrackEl *const t = x->track_els + n; ++ if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts) ++ { ++ av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, ++ "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); ++ pkt->pts = AV_NOPTS_VALUE; ++ } ++ else if (!t->discard) ++ { ++ pkt->pts = t->pending ? t->pts : AV_NOPTS_VALUE; ++ ++ x->last_opaque = x->track_els[n].reordered_opaque; ++ if (pkt->pts != AV_NOPTS_VALUE) ++ x->last_pts = pkt->pts; ++ t->pending = 0; ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); ++ return -1; ++ } ++ ++ // * Would like something much better than this...xlat(offset + out_count)? ++ pkt->dts = pkt->pts; ++ av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n", ++ pkt->pts, t->track_pts, n); ++ return 0; } -static inline unsigned int v4l2_get_width(struct v4l2_format *fmt) --{ -- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; --} -- --static inline unsigned int v4l2_get_height(struct v4l2_format *fmt) --{ -- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; --} -- - static AVRational v4l2_get_sar(V4L2Context *ctx) ++ ++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) { - struct AVRational sar = { 0, 1 }; -@@ -81,21 +73,29 @@ static AVRational v4l2_get_sar(V4L2Context *ctx) +- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; ++ return V4L2_TYPE_IS_OUTPUT(ctx->type) ? ++ container_of(ctx, V4L2m2mContext, output) : ++ container_of(ctx, V4L2m2mContext, capture); + } + +-static inline unsigned int v4l2_get_height(struct v4l2_format *fmt) ++static inline AVCodecContext *logger(const V4L2Context *ctx) + { +- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; ++ return ctx_to_m2mctx(ctx)->avctx; + } + + static AVRational v4l2_get_sar(V4L2Context *ctx) +@@ -81,21 +225,29 @@ static AVRational v4l2_get_sar(V4L2Context *ctx) return sar; } -static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2) +static inline int ctx_buffers_alloced(const V4L2Context * const ctx) -+{ -+ return ctx->bufrefs != NULL; -+} -+ -+// Width/Height changed or we don't have an alloc in the first place? 
-+static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) { - struct v4l2_format *fmt1 = &ctx->format; - int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? @@ -50607,6 +51355,12 @@ index ff1ea8e57b..c0d257e5d3 100644 - : - fmt1->fmt.pix.width != fmt2->fmt.pix.width || - fmt1->fmt.pix.height != fmt2->fmt.pix.height; ++ return ctx->bufrefs != NULL; ++} ++ ++// Width/Height changed or we don't have an alloc in the first place? ++static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) ++{ + const struct v4l2_format *fmt1 = &ctx->format; + int ret = !ctx_buffers_alloced(ctx) || + (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? @@ -50628,7 +51382,7 @@ index ff1ea8e57b..c0d257e5d3 100644 return ret; } -@@ -153,90 +153,110 @@ static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_upd +@@ -153,90 +305,110 @@ static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_upd } } @@ -50753,16 +51507,16 @@ index ff1ea8e57b..c0d257e5d3 100644 if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n"); + av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); - return AVERROR(EINVAL); - } ++ return AVERROR(EINVAL); ++ } + + if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) || + s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) { + av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n", + s->capture.width, s->capture.height, + ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format)); -+ return AVERROR(EINVAL); -+ } + return AVERROR(EINVAL); + } + + // Update pixel format - should only actually do something on initial change + s->capture.av_pix_fmt = @@ -50793,7 +51547,7 @@ index ff1ea8e57b..c0d257e5d3 100644 return 1; } -@@ -280,171 +300,275 @@ static int v4l2_stop_encode(V4L2Context *ctx) +@@ -280,171 +452,277 @@ static int v4l2_stop_encode(V4L2Context *ctx) return 0; } @@ -50879,16 +51633,18 @@ index ff1ea8e57b..c0d257e5d3 100644 } - ctx->done = 1; - return NULL; -+ } + } + atomic_fetch_sub(&ctx->q_count, 1); + + avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; -+ avbuf->status = V4L2BUF_AVAILABLE; ++ ff_v4l2_buffer_set_avail(avbuf); + avbuf->buf = buf; + if (is_mp) { + memcpy(avbuf->planes, planes, sizeof(planes)); + avbuf->buf.m.planes = avbuf->planes; - } ++ } ++ // Done with any attached buffer ++ av_buffer_unref(&avbuf->ref_buf); -start: - if (V4L2_TYPE_IS_OUTPUT(ctx->type)) @@ -51191,7 +51947,7 @@ index ff1ea8e57b..c0d257e5d3 100644 } return NULL; -@@ -452,25 +576,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) +@@ -452,25 +730,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) static int v4l2_release_buffers(V4L2Context* ctx) { @@ -51221,16 +51977,16 @@ index ff1ea8e57b..c0d257e5d3 100644 + .type = ctx->type, + .count = 0, /* 0 -> unmap all buffers from the driver */ + }; -+ -+ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { -+ if (errno == EINTR) -+ continue; - for (j = 0; j < buffer->num_planes; j++) { - struct V4L2Plane_info *p = &buffer->plane_info[j]; - if (p->mm_addr && p->length) - if (munmap(p->mm_addr, p->length) < 0) - av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); ++ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { ++ if (errno == EINTR) ++ continue; ++ + ret = AVERROR(errno); + + av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", @@ -51251,7 +52007,7 @@ 
index ff1ea8e57b..c0d257e5d3 100644 } static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) -@@ -499,6 +643,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm +@@ -499,6 +797,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) { @@ -51260,7 +52016,7 @@ index ff1ea8e57b..c0d257e5d3 100644 enum AVPixelFormat pixfmt = ctx->av_pix_fmt; struct v4l2_fmtdesc fdesc; int ret; -@@ -517,6 +663,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) +@@ -517,6 +817,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) if (ret) return AVERROR(EINVAL); @@ -51274,7 +52030,7 @@ index ff1ea8e57b..c0d257e5d3 100644 pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); ret = v4l2_try_raw_format(ctx, pixfmt); if (ret){ -@@ -569,18 +722,83 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) +@@ -569,30 +876,99 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) * *****************************************************************************/ @@ -51289,7 +52045,7 @@ index ff1ea8e57b..c0d257e5d3 100644 + for (i = 0; i < ctx->num_buffers; ++i) { + struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; + if (buf->status == V4L2BUF_IN_DRIVER) -+ buf->status = V4L2BUF_AVAILABLE; ++ ff_v4l2_buffer_set_avail(buf); + } + atomic_store(&ctx->q_count, 0); +} @@ -51349,6 +52105,8 @@ index ff1ea8e57b..c0d257e5d3 100644 + { + if (cmd == VIDIOC_STREAMOFF) + flush_all_buffers_status(ctx); ++ else ++ ctx->first_buf = 1; + + ctx->streamon = (cmd == VIDIOC_STREAMON); + av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, @@ -51364,7 +52122,33 @@ index ff1ea8e57b..c0d257e5d3 100644 } int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) -@@ -608,7 +826,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) + { +- V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ V4L2m2mContext *const s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; ++ int64_t track_ts; + V4L2Buffer* avbuf; + int ret; + + if (!frame) { + ret = v4l2_stop_encode(ctx); + if (ret) +- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name); ++ av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name); + s->draining= 1; + return 0; + } +@@ -601,23 +977,29 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) + if (!avbuf) + return AVERROR(EAGAIN); + +- ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf); ++ track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame); ++ ++ ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts); + if (ret) + return ret; + return ff_v4l2_buffer_enqueue(avbuf); } @@ -51373,25 +52157,29 @@ index ff1ea8e57b..c0d257e5d3 100644 + const void * extdata, size_t extlen) { V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; V4L2Buffer* avbuf; -@@ -616,8 +835,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) + int ret; ++ int64_t track_ts; if (!pkt->size) { ret = v4l2_stop_decode(ctx); + // Log but otherwise ignore stop failure if (ret) - av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name); -+ av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); ++ av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); s->draining = 1; return 0; } -@@ 
-626,8 +846,11 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -626,8 +1008,13 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) if (!avbuf) return AVERROR(EAGAIN); - ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf); - if (ret) -+ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen); ++ track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt); ++ ++ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts); + if (ret == AVERROR(ENOMEM)) + av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", + __func__, pkt->size, avbuf->planes[0].length); @@ -51399,9 +52187,12 @@ index ff1ea8e57b..c0d257e5d3 100644 return ret; return ff_v4l2_buffer_enqueue(avbuf); -@@ -636,19 +859,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -635,42 +1022,36 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) + int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) { ++ V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; V4L2Buffer *avbuf; + int rv; @@ -51414,17 +52205,24 @@ index ff1ea8e57b..c0d257e5d3 100644 - if (!avbuf) { - if (ctx->done) - return AVERROR_EOF; -- ++ do { ++ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) ++ return rv; ++ if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0) ++ return rv; ++ } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0); + - return AVERROR(EAGAIN); - } -+ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) -+ return rv; - - return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); +- +- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); ++ return 0; } -@@ -656,19 +870,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) + int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) { ++ V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; V4L2Buffer *avbuf; + int rv; @@ -51440,12 +52238,19 @@ index ff1ea8e57b..c0d257e5d3 100644 - - return AVERROR(EAGAIN); - } -+ if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) -+ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC ++ do { ++ if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) ++ return rv == AVERROR(ENOSPC) ? 
AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC ++ if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0) ++ return rv; ++ } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0); - return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); +- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); ++ return 0; } -@@ -702,78 +907,160 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) + + int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) +@@ -702,78 +1083,160 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) int ff_v4l2_context_set_format(V4L2Context* ctx) { @@ -51497,20 +52302,20 @@ index ff1ea8e57b..c0d257e5d3 100644 -int ff_v4l2_context_init(V4L2Context* ctx) + -+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers) ++static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem) { - V4L2m2mContext *s = ctx_to_m2mctx(ctx); + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); struct v4l2_requestbuffers req; - int ret, i; -- ++ int ret; ++ int i; + - if (!v4l2_type_supported(ctx)) { - av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); - return AVERROR_PATCHWELCOME; - } -+ int ret; -+ int i; - +- - ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); - if (ret) - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name); @@ -51518,8 +52323,9 @@ index ff1ea8e57b..c0d257e5d3 100644 memset(&req, 0, sizeof(req)); - req.count = ctx->num_buffers; +- req.memory = V4L2_MEMORY_MMAP; + req.count = req_buffers; - req.memory = V4L2_MEMORY_MMAP; ++ req.memory = mem; req.type = ctx->type; - ret = ioctl(s->fd, VIDIOC_REQBUFS, &req); - if (ret < 0) { @@ -51554,7 +52360,7 @@ index ff1ea8e57b..c0d257e5d3 100644 + } + + for (i = 0; i < ctx->num_buffers; i++) { -+ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx); ++ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem); + if (ret) { av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret)); - goto error; @@ -51580,7 +52386,8 @@ index ff1ea8e57b..c0d257e5d3 100644 + av_freep(&ctx->bufrefs); + return ret; +} -+ + +- av_freep(&ctx->buffers); +int ff_v4l2_context_init(V4L2Context* ctx) +{ + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); @@ -51616,8 +52423,7 @@ index ff1ea8e57b..c0d257e5d3 100644 + if (ret < 0) + goto fail_unref_hwframes; + } - -- av_freep(&ctx->buffers); ++ + ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); + if (ret) { + ret = AVERROR(errno); @@ -51625,7 +52431,7 @@ index ff1ea8e57b..c0d257e5d3 100644 + goto fail_unref_hwframes; + } + -+ ret = create_buffers(ctx, ctx->num_buffers); ++ ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem); + if (ret < 0) + goto fail_unref_hwframes; + @@ -51638,7 +52444,7 @@ index ff1ea8e57b..c0d257e5d3 100644 return ret; } diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 22a9532444..a56216e990 100644 +index 22a9532444..267a629925 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -31,6 +31,7 @@ @@ -51649,7 +52455,7 @@ index 22a9532444..a56216e990 100644 #include "v4l2_buffers.h" typedef struct V4L2Context { -@@ -70,11 +71,18 @@ typedef struct V4L2Context { +@@ -70,28 +71,57 @@ typedef struct V4L2Context { */ int width, height; AVRational sample_aspect_ratio; @@ -51670,18 +52476,35 @@ index 22a9532444..a56216e990 100644 /** * Readonly after init. 
-@@ -92,6 +100,21 @@ typedef struct V4L2Context { + */ + int num_buffers; + ++ /** ++ * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF ++ */ ++ enum v4l2_memory buf_mem; ++ + /** + * Whether the stream has been started (VIDIOC_STREAMON has been sent). + */ + int streamon; + ++ /* 1st buffer after stream on */ ++ int first_buf; ++ + /** + * Either no more buffers available or an unrecoverable error was notified + * by the V4L2 kernel driver: once set the context has to be exited. */ int done; + int flag_last; + + /** -+ * PTS rescale not wanted -+ * If the PTS is just a dummy frame count then rescale is -+ * actively harmful ++ * If NZ then when Qing frame/pkt use this rather than the ++ * "real" PTS + */ -+ int no_pts_rescale; ++ uint64_t track_ts; + + AVBufferRef *frames_ref; + atomic_int q_count; @@ -51692,7 +52515,7 @@ index 22a9532444..a56216e990 100644 } V4L2Context; /** -@@ -156,7 +179,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); +@@ -156,7 +186,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); * @param[in] ctx The V4L2Context to dequeue from. * @param[inout] f The AVFrame to dequeue to. * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) @@ -51703,7 +52526,7 @@ index 22a9532444..a56216e990 100644 */ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); -@@ -170,7 +196,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); +@@ -170,7 +203,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); * @param[in] pkt A pointer to an AVPacket. * @return 0 in case of success, a negative error otherwise. */ @@ -51713,10 +52536,35 @@ index 22a9532444..a56216e990 100644 /** * Enqueues a buffer to a V4L2Context from an AVFrame diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index cdfd579810..f14ed0b708 100644 +index cdfd579810..77fe5fc4e3 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c -@@ -215,13 +215,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) +@@ -36,6 +36,14 @@ + #include "v4l2_fmt.h" + #include "v4l2_m2m.h" + ++static void ++xlat_init(xlat_track_t * const x) ++{ ++ memset(x, 0, sizeof(*x)); ++ x->last_pts = AV_NOPTS_VALUE; ++} ++ ++ + static inline int v4l2_splane_video(struct v4l2_capability *cap) + { + if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) && +@@ -68,7 +76,9 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) + + s->capture.done = s->output.done = 0; + s->capture.name = "capture"; ++ s->capture.buf_mem = V4L2_MEMORY_MMAP; + s->output.name = "output"; ++ s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; + atomic_init(&s->refcount, 0); + sem_init(&s->refsync, 0, 0); + +@@ -215,13 +225,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n"); /* 2. unmap the capture buffers (v4l2 and ffmpeg): @@ -51730,7 +52578,7 @@ index cdfd579810..f14ed0b708 100644 ff_v4l2_context_release(&s->capture); /* 3. get the new capture format */ -@@ -240,7 +234,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) +@@ -240,7 +244,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) /* 5. 
complete reinit */ s->draining = 0; @@ -51738,7 +52586,7 @@ index cdfd579810..f14ed0b708 100644 return 0; } -@@ -274,7 +267,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *s) +@@ -274,7 +277,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *s) /* start again now that we know the stream dimensions */ s->draining = 0; @@ -51746,7 +52594,7 @@ index cdfd579810..f14ed0b708 100644 ret = ff_v4l2_context_get_format(&s->output, 0); if (ret) { -@@ -328,10 +320,14 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context) +@@ -328,10 +330,14 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context) ff_v4l2_context_release(&s->capture); sem_destroy(&s->refsync); @@ -51762,7 +52610,7 @@ index cdfd579810..f14ed0b708 100644 av_free(s); } -@@ -344,6 +340,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) +@@ -344,6 +350,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) if (!s) return 0; @@ -51774,7 +52622,7 @@ index cdfd579810..f14ed0b708 100644 if (s->fd >= 0) { ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); if (ret) -@@ -356,7 +357,14 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) +@@ -356,7 +367,14 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) ff_v4l2_context_release(&s->output); @@ -51789,8 +52637,59 @@ index cdfd579810..f14ed0b708 100644 av_buffer_unref(&priv->context_ref); return 0; +@@ -400,35 +418,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv) + return v4l2_configure_contexts(s); + } + +-int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s) ++int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps) + { +- *s = av_mallocz(sizeof(V4L2m2mContext)); +- if (!*s) ++ V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext)); ++ ++ *pps = NULL; ++ if (!s) + return AVERROR(ENOMEM); + +- priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext), ++ priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s), + &v4l2_m2m_destroy_context, NULL, 0); + if (!priv->context_ref) { +- av_freep(s); ++ av_free(s); + return AVERROR(ENOMEM); + } + + /* assign the context */ +- priv->context = *s; +- (*s)->priv = priv; ++ priv->context = s; ++ s->priv = priv; + + /* populate it */ +- priv->context->capture.num_buffers = priv->num_capture_buffers; +- priv->context->output.num_buffers = priv->num_output_buffers; +- priv->context->self_ref = priv->context_ref; +- priv->context->fd = -1; ++ s->capture.num_buffers = priv->num_capture_buffers; ++ s->output.num_buffers = priv->num_output_buffers; ++ s->self_ref = priv->context_ref; ++ s->fd = -1; ++ xlat_init(&s->xlat); + + priv->context->frame = av_frame_alloc(); + if (!priv->context->frame) { + av_buffer_unref(&priv->context_ref); +- *s = NULL; /* freed when unreferencing context_ref */ + return AVERROR(ENOMEM); + } + ++ *pps = s; + return 0; + } diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index b67b216331..19d618698d 100644 +index b67b216331..ee72beb052 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -30,6 +30,7 @@ @@ -51801,7 +52700,7 @@ index b67b216331..19d618698d 100644 #include "v4l2_context.h" #define container_of(ptr, type, member) ({ \ -@@ -38,7 +39,38 @@ +@@ -38,7 +39,37 @@ #define V4L_M2M_DEFAULT_OPTS \ { "num_output_buffers", "Number of buffers in the output context",\ @@ -51834,14 +52733,13 @@ index b67b216331..19d618698d 100644 +typedef struct xlat_track_s { + unsigned int track_no; + int64_t last_pts; -+ int64_t last_pkt_dts; + int64_t last_opaque; + V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; +} 
xlat_track_t; typedef struct V4L2m2mContext { char devname[PATH_MAX]; -@@ -52,7 +84,6 @@ typedef struct V4L2m2mContext { +@@ -52,7 +83,6 @@ typedef struct V4L2m2mContext { AVCodecContext *avctx; sem_t refsync; atomic_uint refcount; @@ -51849,7 +52747,7 @@ index b67b216331..19d618698d 100644 /* null frame/packet received */ int draining; -@@ -66,6 +97,33 @@ typedef struct V4L2m2mContext { +@@ -66,6 +96,36 @@ typedef struct V4L2m2mContext { /* reference back to V4L2m2mPriv */ void *priv; @@ -51859,6 +52757,9 @@ index b67b216331..19d618698d 100644 + /* generate DRM frames */ + int output_drm; + ++ /* input frames are drmprime */ ++ int input_drm; ++ + /* Frame tracking */ + xlat_track_t xlat; + int pending_hw; @@ -51883,7 +52784,7 @@ index b67b216331..19d618698d 100644 } V4L2m2mContext; typedef struct V4L2m2mPriv { -@@ -76,6 +134,7 @@ typedef struct V4L2m2mPriv { +@@ -76,6 +136,7 @@ typedef struct V4L2m2mPriv { int num_output_buffers; int num_capture_buffers; @@ -51891,7 +52792,7 @@ index b67b216331..19d618698d 100644 } V4L2m2mPriv; /** -@@ -129,4 +188,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); +@@ -129,4 +190,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); */ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); @@ -51919,7 +52820,7 @@ index b67b216331..19d618698d 100644 + #endif /* AVCODEC_V4L2_M2M_H */ diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index ab07c0a24a..dd383f31e5 100644 +index ab07c0a24a..545651e560 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -23,6 +23,10 @@ @@ -52092,94 +52993,16 @@ index ab07c0a24a..dd383f31e5 100644 return 0; } -@@ -133,58 +169,637 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) +@@ -133,58 +169,552 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) return 0; } -static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) -+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) -+{ -+ return (int64_t)n; -+} -+ -+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) -+{ -+ return (unsigned int)pts; -+} -+ -+// FFmpeg requires us to propagate a number of vars from the coded pkt into -+// the decoded frame. The only thing that tracks like that in V4L2 stateful -+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no -+// guarantees about PTS being unique or specified for every frame so replace -+// the supplied PTS with a simple incrementing number and keep a circular -+// buffer of all the things we want preserved (including the original PTS) -+// indexed by the tracking no. 
+static void -+xlat_pts_in(AVCodecContext *const avctx, xlat_track_t *const x, AVPacket *const avpkt) -+{ -+ int64_t track_pts; -+ -+ // Avoid 0 -+ if (++x->track_no == 0) -+ x->track_no = 1; -+ -+ track_pts = track_to_pts(avctx, x->track_no); -+ -+ av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); -+ x->last_pkt_dts = avpkt->dts; -+ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ -+ .discard = 0, -+ .pending = 1, -+ .pkt_size = avpkt->size, -+ .pts = avpkt->pts, -+ .dts = avpkt->dts, -+ .reordered_opaque = avctx->reordered_opaque, -+ .pkt_pos = avpkt->pos, -+ .pkt_duration = avpkt->duration, -+ .track_pts = track_pts -+ }; -+ avpkt->pts = track_pts; -+} -+ -+// Returns -1 if we should discard the frame -+static int -+xlat_pts_out(AVCodecContext *const avctx, -+ xlat_track_t * const x, ++set_best_effort_pts(AVCodecContext *const avctx, + pts_stats_t * const ps, + AVFrame *const frame) +{ -+ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; -+ V4L2m2mTrackEl *const t = x->track_els + n; -+ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) -+ { -+ av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -+ frame->pts = AV_NOPTS_VALUE; -+ frame->pkt_dts = x->last_pkt_dts; -+ frame->reordered_opaque = x->last_opaque; -+ frame->pkt_pos = -1; -+ frame->pkt_duration = 0; -+ frame->pkt_size = -1; -+ } -+ else if (!t->discard) -+ { -+ frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE; -+ frame->pkt_dts = x->last_pkt_dts; -+ frame->reordered_opaque = t->reordered_opaque; -+ frame->pkt_pos = t->pkt_pos; -+ frame->pkt_duration = t->pkt_duration; -+ frame->pkt_size = t->pkt_size; -+ -+ x->last_opaque = x->track_els[n].reordered_opaque; -+ if (frame->pts != AV_NOPTS_VALUE) -+ x->last_pts = frame->pts; -+ t->pending = 0; -+ } -+ else -+ { -+ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -+ return -1; -+ } -+ + pts_stats_add(ps, frame->pts); + +#if FF_API_PKT_PTS @@ -52188,10 +53011,15 @@ index ab07c0a24a..dd383f31e5 100644 +FF_ENABLE_DEPRECATION_WARNINGS +#endif + frame->best_effort_timestamp = pts_stats_guess(ps); -+ frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? -+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", -+ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); -+ return 0; ++ // If we can't guess from just PTS - try DTS ++ if (frame->best_effort_timestamp == AV_NOPTS_VALUE) ++ frame->best_effort_timestamp = frame->pkt_dts; ++ ++ // We can't emulate what s/w does in a useful manner and using the ++ // "correct" answer seems to just confuse things. 
++ frame->pkt_dts = frame->pts; ++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", ++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts); +} + +static void @@ -52205,13 +53033,6 @@ index ab07c0a24a..dd383f31e5 100644 + x->last_pts = AV_NOPTS_VALUE; +} + -+static void -+xlat_init(xlat_track_t * const x) -+{ -+ memset(x, 0, sizeof(*x)); -+ x->last_pts = AV_NOPTS_VALUE; -+} -+ +static int +xlat_pending(const xlat_track_t * const x) +{ @@ -52358,38 +53179,36 @@ index ab07c0a24a..dd383f31e5 100644 + av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); return ret; + } -+ -+ xlat_pts_in(avctx, &s->xlat, &s->buf_pkt); -+ } -+ + } + +- if (s->draining) +- goto dequeue; + if (s->draining) { + if (s->buf_pkt.size) { + av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n"); + av_packet_unref(&s->buf_pkt); + } + return NQ_DRAINING; - } - -- if (s->draining) -- goto dequeue; ++ } ++ + if (!s->buf_pkt.size) + return NQ_NONE; -+ -+ if ((ret = check_output_streamon(avctx, s)) != 0) -+ return ret; - ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt); - if (ret < 0 && ret != AVERROR(EAGAIN)) - goto fail; ++ if ((ret = check_output_streamon(avctx, s)) != 0) ++ return ret; + +- /* if EAGAIN don't unref packet and try to enqueue in the next iteration */ +- if (ret != AVERROR(EAGAIN)) + if (s->extdata_sent) + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); + else if (s->extdata_data) + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); + else + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size); - -- /* if EAGAIN don't unref packet and try to enqueue in the next iteration */ -- if (ret != AVERROR(EAGAIN)) ++ + if (ret == AVERROR(EAGAIN)) { + // Out of input buffers - keep packet + ret = NQ_Q_FULL; @@ -52498,49 +53317,47 @@ index ab07c0a24a..dd383f31e5 100644 + prefer_dq ? 5 : + src_rv == NQ_Q_FULL ? -1 : 0; + -+ do { -+ // Dequeue frame will unref any previous contents of frame -+ // if it returns success so we don't need an explicit unref -+ // when discarding -+ // This returns AVERROR(EAGAIN) on timeout or if -+ // there is room in the input Q and timeout == -1 -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); ++ // Dequeue frame will unref any previous contents of frame ++ // if it returns success so we don't need an explicit unref ++ // when discarding ++ // This returns AVERROR(EAGAIN) on timeout or if ++ // there is room in the input Q and timeout == -1 ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + -+ // Failure due to no buffer in Q? -+ if (dst_rv == AVERROR(ENOSPC)) { -+ // Wait & retry -+ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); -+ } ++ // Failure due to no buffer in Q? 
++ if (dst_rv == AVERROR(ENOSPC)) { ++ // Wait & retry ++ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + } ++ } + -+ // Adjust dynamic pending threshold -+ if (dst_rv == 0) { -+ if (--s->pending_hw < PENDING_HW_MIN) -+ s->pending_hw = PENDING_HW_MIN; ++ // Adjust dynamic pending threshold ++ if (dst_rv == 0) { ++ if (--s->pending_hw < PENDING_HW_MIN) ++ s->pending_hw = PENDING_HW_MIN; ++ s->pending_n = 0; ++ ++ set_best_effort_pts(avctx, &s->pts_stat, frame); ++ } ++ else if (dst_rv == AVERROR(EAGAIN)) { ++ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { ++ s->pending_hw = pending * 16 + PENDING_HW_OFFSET; + s->pending_n = 0; + } -+ else if (dst_rv == AVERROR(EAGAIN)) { -+ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { -+ s->pending_hw = pending * 16 + PENDING_HW_OFFSET; -+ s->pending_n = 0; -+ } -+ } ++ } + -+ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { -+ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); -+ dst_rv = AVERROR_EOF; -+ s->capture.done = 1; -+ } -+ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) -+ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", -+ s->draining, s->capture.done); -+ else if (dst_rv && dst_rv != AVERROR(EAGAIN)) -+ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", -+ s->draining, s->capture.done, dst_rv); -+ -+ // Go again if we got a frame that we need to discard -+ } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame)); ++ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { ++ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); ++ dst_rv = AVERROR_EOF; ++ s->capture.done = 1; ++ } ++ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) ++ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", ++ s->draining, s->capture.done); ++ else if (dst_rv && dst_rv != AVERROR(EAGAIN)) ++ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", ++ s->draining, s->capture.done, dst_rv); + } + + ++i; @@ -52747,14 +53564,13 @@ index ab07c0a24a..dd383f31e5 100644 if (ret < 0) return ret; -+ xlat_init(&s->xlat); + pts_stats_init(&s->pts_stat, avctx, "decoder"); + s->pending_hw = PENDING_HW_MIN; + capture = &s->capture; output = &s->output; -@@ -192,14 +807,53 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -192,14 +722,51 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) * by the v4l2 driver; this event will trigger a full pipeline reconfig and * the proper values will be retrieved from the kernel driver. 
*/ @@ -52768,12 +53584,10 @@ index ab07c0a24a..dd383f31e5 100644 output->av_codec_id = avctx->codec_id; output->av_pix_fmt = AV_PIX_FMT_NONE; + output->min_buf_size = max_coded_size(avctx); -+ output->no_pts_rescale = 1; capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; capture->av_pix_fmt = avctx->pix_fmt; + capture->min_buf_size = 0; -+ capture->no_pts_rescale = 1; + + /* the client requests the codec to generate DRM frames: + * - data[0] will therefore point to the returned AVDRMFrameDescriptor @@ -52810,7 +53624,7 @@ index ab07c0a24a..dd383f31e5 100644 s->avctx = avctx; ret = ff_v4l2_m2m_codec_init(priv); -@@ -208,12 +862,74 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -208,12 +775,74 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) return ret; } @@ -52887,7 +53701,7 @@ index ab07c0a24a..dd383f31e5 100644 } #define OFFSET(x) offsetof(V4L2m2mPriv, x) -@@ -222,10 +938,16 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) +@@ -222,10 +851,16 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) static const AVOption options[] = { V4L_M2M_DEFAULT_OPTS, { "num_capture_buffers", "Number of buffers in the capture context", @@ -52905,7 +53719,7 @@ index ab07c0a24a..dd383f31e5 100644 #define M2MDEC_CLASS(NAME) \ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ .class_name = #NAME "_v4l2m2m_decoder", \ -@@ -246,9 +968,15 @@ static const AVOption options[] = { +@@ -246,9 +881,15 @@ static const AVOption options[] = { .init = v4l2_decode_init, \ .receive_frame = v4l2_receive_frame, \ .close = v4l2_decode_close, \ @@ -52921,6 +53735,367 @@ index ab07c0a24a..dd383f31e5 100644 .wrapper_name = "v4l2m2m", \ } +diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c +index f644b50133..3195ec729b 100644 +--- a/libavcodec/v4l2_m2m_enc.c ++++ b/libavcodec/v4l2_m2m_enc.c +@@ -24,6 +24,8 @@ + #include + #include + #include ++#include ++ + #include "encode.h" + #include "libavcodec/avcodec.h" + #include "libavcodec/internal.h" +@@ -38,6 +40,34 @@ + #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x + #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x + ++// P030 should be defined in drm_fourcc.h and hopefully will be sometime ++// in the future but until then... ++#ifndef DRM_FORMAT_P030 ++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') ++#endif ++ ++#ifndef DRM_FORMAT_NV15 ++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') ++#endif ++ ++#ifndef DRM_FORMAT_NV20 ++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') ++#endif ++ ++#ifndef V4L2_CID_CODEC_BASE ++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE ++#endif ++ ++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined ++// in videodev2.h hopefully will be sometime in the future but until then... 
++#ifndef V4L2_PIX_FMT_NV12_10_COL128 ++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') ++#endif ++ ++#ifndef V4L2_PIX_FMT_NV12_COL128 ++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ ++#endif ++ + static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den) + { + struct v4l2_streamparm parm = { 0 }; +@@ -148,15 +178,14 @@ static inline int v4l2_mpeg4_profile_from_ff(int p) + static int v4l2_check_b_frame_support(V4L2m2mContext *s) + { + if (s->avctx->max_b_frames) +- av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n"); ++ av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames); + +- v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0); ++ v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1); + v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0); + if (s->avctx->max_b_frames == 0) + return 0; + + avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding"); +- + return AVERROR_PATCHWELCOME; + } + +@@ -271,13 +300,184 @@ static int v4l2_prepare_encoder(V4L2m2mContext *s) + return 0; + } + ++static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame) ++{ ++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; ++ ++ const uint32_t drm_fmt = src->layers[0].format; ++ // Treat INVALID as LINEAR ++ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ? ++ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier; ++ uint32_t pix_fmt = 0; ++ uint32_t w = 0; ++ uint32_t h = 0; ++ uint32_t bpl = src->layers[0].planes[0].pitch; ++ ++ // We really don't expect multiple layers ++ // All formats that we currently cope with are single object ++ ++ if (src->nb_layers != 1 || src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ switch (drm_fmt) { ++ case DRM_FORMAT_YUV420: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 3) ++ break; ++ pix_fmt = V4L2_PIX_FMT_YUV420; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++ break; ++ ++ case DRM_FORMAT_NV12: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_COL128; ++ w = bpl; ++ h = src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++ break; ++ ++ case DRM_FORMAT_P030: ++ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128; ++ w = bpl / 2; // Matching lie to how we construct this ++ h = src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++ break; ++ ++ default: ++ break; ++ } ++ ++ if (!pix_fmt) ++ return AVERROR(EINVAL); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { ++ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp; ++ ++ pix->width = w; ++ pix->height = h; ++ pix->pixelformat = pix_fmt; ++ pix->plane_fmt[0].bytesperline = bpl; ++ pix->num_planes = 1; ++ } ++ else { ++ struct v4l2_pix_format *const pix = &format->fmt.pix; ++ ++ pix->width = w; ++ pix->height = 
h; ++ pix->pixelformat = pix_fmt; ++ pix->bytesperline = bpl; ++ } ++ ++ return 0; ++} ++ ++// Do we have similar enough formats to be usable? ++static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b) ++{ ++ if (a->type != b->type) ++ return 0; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) { ++ const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp; ++ const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp; ++ unsigned int i; ++ if (pa->pixelformat != pb->pixelformat || ++ pa->num_planes != pb->num_planes) ++ return 0; ++ for (i = 0; i != pa->num_planes; ++i) { ++ if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline) ++ return 0; ++ } ++ } ++ else { ++ const struct v4l2_pix_format *const pa = &a->fmt.pix; ++ const struct v4l2_pix_format *const pb = &b->fmt.pix; ++ if (pa->pixelformat != pb->pixelformat || ++ pa->bytesperline != pb->bytesperline) ++ return 0; ++ } ++ return 1; ++} ++ ++ + static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) + { + V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; + V4L2Context *const output = &s->output; + ++ // Signal EOF if needed ++ if (!frame) { ++ return ff_v4l2_context_enqueue_frame(output, frame); ++ } ++ ++ if (s->input_drm && !output->streamon) { ++ int rv; ++ struct v4l2_format req_format = {.type = output->format.type}; ++ ++ // Set format when we first get a buffer ++ if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n"); ++ return rv; ++ } ++ ++ ff_v4l2_context_release(output); ++ ++ output->format = req_format; ++ ++ if ((rv = ff_v4l2_context_set_format(output)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n"); ++ return rv; ++ } ++ ++ if (!fmt_eq(&req_format, &output->format)) { ++ av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ output->selection.top = frame->crop_top; ++ output->selection.left = frame->crop_left; ++ output->selection.width = av_frame_cropped_width(frame); ++ output->selection.height = av_frame_cropped_height(frame); ++ ++ if ((rv = ff_v4l2_context_init(output)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n"); ++ return rv; ++ } ++ ++ { ++ struct v4l2_selection selection = { ++ .type = V4L2_BUF_TYPE_VIDEO_OUTPUT, ++ .target = V4L2_SEL_TGT_CROP, ++ .r = output->selection ++ }; ++ if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n", ++ selection.r.width, selection.r.height, selection.r.left, selection.r.top, ++ av_err2str(AVERROR(errno))); ++ } ++ av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n", ++ selection.r.width, selection.r.height, selection.r.left, selection.r.top); ++ } ++ } ++ + #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME +- if (frame && frame->pict_type == AV_PICTURE_TYPE_I) ++ if (frame->pict_type == AV_PICTURE_TYPE_I) + v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1); + #endif + +@@ -328,7 +528,70 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) + } + + dequeue: +- return ff_v4l2_context_dequeue_packet(capture, avpkt); ++ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) ++ return ret; ++ ++ if (capture->first_buf == 1) { ++ uint8_t * data; ++ const int len = avpkt->size; ++ ++ // 1st buffer after streamon should be SPS/PPS ++ capture->first_buf = 2; ++ ++ // Clear both possible 
stores so there is no chance of confusion ++ av_freep(&s->extdata_data); ++ s->extdata_size = 0; ++ av_freep(&avctx->extradata); ++ avctx->extradata_size = 0; ++ ++ if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL) ++ memcpy(data, avpkt->data, len); ++ ++ av_packet_unref(avpkt); ++ ++ if (data == NULL) ++ return AVERROR(ENOMEM); ++ ++ // We need to copy the header, but keep local if not global ++ if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) { ++ avctx->extradata = data; ++ avctx->extradata_size = len; ++ } ++ else { ++ s->extdata_data = data; ++ s->extdata_size = len; ++ } ++ ++ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) ++ return ret; ++ } ++ ++ // First frame must be key so mark as such even if encoder forgot ++ if (capture->first_buf == 2) ++ avpkt->flags |= AV_PKT_FLAG_KEY; ++ ++ // Add SPS/PPS to the start of every key frame if non-global headers ++ if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) { ++ const size_t newlen = s->extdata_size + avpkt->size; ++ AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE); ++ ++ if (buf == NULL) { ++ av_packet_unref(avpkt); ++ return AVERROR(ENOMEM); ++ } ++ ++ memcpy(buf->data, s->extdata_data, s->extdata_size); ++ memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size); ++ ++ av_buffer_unref(&avpkt->buf); ++ avpkt->buf = buf; ++ avpkt->data = buf->data; ++ avpkt->size = newlen; ++ } ++ ++// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret); ++ capture->first_buf = 0; ++ return 0; + } + + static av_cold int v4l2_encode_init(AVCodecContext *avctx) +@@ -340,6 +603,8 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) + uint32_t v4l2_fmt_output; + int ret; + ++ av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt); ++ + ret = ff_v4l2_m2m_create_context(priv, &s); + if (ret < 0) + return ret; +@@ -347,13 +612,17 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) + capture = &s->capture; + output = &s->output; + ++ s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME); ++ + /* common settings output/capture */ + output->height = capture->height = avctx->height; + output->width = capture->width = avctx->width; + + /* output context */ + output->av_codec_id = AV_CODEC_ID_RAWVIDEO; +- output->av_pix_fmt = avctx->pix_fmt; ++ output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt : ++ avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? 
avctx->sw_pix_fmt : ++ AV_PIX_FMT_YUV420P; + + /* capture context */ + capture->av_codec_id = avctx->codec_id; +@@ -372,7 +641,7 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) + v4l2_fmt_output = output->format.fmt.pix.pixelformat; + + pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO); +- if (pix_fmt_output != avctx->pix_fmt) { ++ if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) { + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output); + av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name); + return AVERROR(EINVAL); diff --git a/libavcodec/v4l2_req_decode_q.c b/libavcodec/v4l2_req_decode_q.c new file mode 100644 index 0000000000..5b3fb958fa @@ -53867,20 +55042,27 @@ index 0000000000..dcc8d95632 +#define HEVC_CTRLS_VERSION 3 +#include "v4l2_req_hevc_vx.c" + +diff --git a/libavcodec/v4l2_req_hevc_v4.c b/libavcodec/v4l2_req_hevc_v4.c +new file mode 100644 +index 0000000000..c35579d8e0 +--- /dev/null ++++ b/libavcodec/v4l2_req_hevc_v4.c +@@ -0,0 +1,3 @@ ++#define HEVC_CTRLS_VERSION 4 ++#include "v4l2_req_hevc_vx.c" ++ diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c new file mode 100644 -index 0000000000..55c41ae679 +index 0000000000..9ff5592e61 --- /dev/null +++ b/libavcodec/v4l2_req_hevc_vx.c -@@ -0,0 +1,1228 @@ +@@ -0,0 +1,1365 @@ +// File included by v4l2_req_hevc_v* - not compiled on its own + +#include "decode.h" +#include "hevcdec.h" +#include "hwconfig.h" + -+#include "v4l2_request_hevc.h" -+ +#if HEVC_CTRLS_VERSION == 1 +#include "hevc-ctrls-v1.h" + @@ -53891,10 +55073,37 @@ index 0000000000..55c41ae679 +#include "hevc-ctrls-v2.h" +#elif HEVC_CTRLS_VERSION == 3 +#include "hevc-ctrls-v3.h" ++#elif HEVC_CTRLS_VERSION == 4 ++#include ++#if !defined(V4L2_CID_STATELESS_HEVC_SPS) ++#include "hevc-ctrls-v4.h" ++#endif +#else +#error Unknown HEVC_CTRLS_VERSION +#endif + ++#ifndef V4L2_CID_STATELESS_HEVC_SPS ++#define V4L2_CID_STATELESS_HEVC_SPS V4L2_CID_MPEG_VIDEO_HEVC_SPS ++#define V4L2_CID_STATELESS_HEVC_PPS V4L2_CID_MPEG_VIDEO_HEVC_PPS ++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS ++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX ++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS ++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE ++#define V4L2_CID_STATELESS_HEVC_START_CODE V4L2_CID_MPEG_VIDEO_HEVC_START_CODE ++ ++#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED ++#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED ++#define V4L2_STATELESS_HEVC_START_CODE_NONE V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE ++#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B ++#endif ++ ++// Should be in videodev2 but we might not have a good enough one ++#ifndef V4L2_PIX_FMT_HEVC_SLICE ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++#endif ++ ++#include "v4l2_request_hevc.h" ++ +#include "libavutil/hwcontext_drm.h" + +#include @@ -53930,11 +55139,16 @@ index 0000000000..55c41ae679 + struct v4l2_ctrl_hevc_slice_params * slice_params; + struct slice_info * slices; + ++ size_t num_offsets; ++ size_t alloced_offsets; ++ uint32_t *offsets; ++ +} V4L2MediaReqDescriptor; + +struct slice_info { + const uint8_t * ptr; + size_t len; // bytes ++ size_t n_offsets; +}; + +// Handy container 
for accumulating controls before setting @@ -54093,7 +55307,7 @@ index 0000000000..55c41ae679 + if (rd->num_slices >= rd->alloced_slices) { + struct v4l2_ctrl_hevc_slice_params * p2; + struct slice_info * s2; -+ size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2; ++ size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2; + + p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2)); + if (p2 == NULL) @@ -54111,6 +55325,23 @@ index 0000000000..55c41ae679 + return 0; +} + ++static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets) ++{ ++ if (rd->num_offsets + n > rd->alloced_offsets) { ++ size_t n2 = rd->alloced_slices == 0 ? 128 : rd->alloced_slices * 2; ++ void * p2; ++ while (rd->num_offsets + n > n2) ++ n2 *= 2; ++ if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL) ++ return AVERROR(ENOMEM); ++ rd->offsets = p2; ++ rd->alloced_offsets = n2; ++ } ++ for (size_t i = 0; i != n; ++i) ++ rd->offsets[rd->num_offsets++] = offsets[i] - 1; ++ return 0; ++} ++ +static unsigned int +fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries) +{ @@ -54132,9 +55363,13 @@ index 0000000000..55c41ae679 +#endif + entry->field_pic = frame->frame->interlaced_frame; + ++#if HEVC_CTRLS_VERSION <= 3 + /* TODO: Interleaved: Get the POC for each field. */ + entry->pic_order_cnt[0] = frame->poc; + entry->pic_order_cnt[1] = frame->poc; ++#else ++ entry->pic_order_cnt_val = frame->poc; ++#endif + } + } + return n; @@ -54160,8 +55395,11 @@ index 0000000000..55c41ae679 + + *slice_params = (struct v4l2_ctrl_hevc_slice_params) { + .bit_size = bit_size, ++#if HEVC_CTRLS_VERSION <= 3 + .data_bit_offset = bit_offset, -+ ++#else ++ .data_byte_offset = bit_offset / 8 + 1, ++#endif + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ + .slice_segment_addr = sh->slice_segment_addr, + @@ -54244,6 +55482,7 @@ index 0000000000..55c41ae679 + fill_pred_table(h, &slice_params->pred_weight_table); + + slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; ++#if HEVC_CTRLS_VERSION <= 3 + if (slice_params->num_entry_point_offsets > 256) { + slice_params->num_entry_point_offsets = 256; + av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); @@ -54251,6 +55490,7 @@ index 0000000000..55c41ae679 + + for (i = 0; i < slice_params->num_entry_point_offsets; i++) + slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; ++#endif +} + +#if HEVC_CTRLS_VERSION >= 2 @@ -54626,51 +55866,66 @@ index 0000000000..55c41ae679 +#if HEVC_CTRLS_VERSION >= 2 + struct v4l2_ctrl_hevc_decode_params * const dec, +#endif -+ struct v4l2_ctrl_hevc_slice_params * const slices, -+ const unsigned int slice_no, -+ const unsigned int slice_count) ++ struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count, ++ void * const offsets, const size_t offset_count) +{ + int rv; ++#if HEVC_CTRLS_VERSION >= 2 ++ unsigned int n = 3; ++#else ++ unsigned int n = 2; ++#endif + -+ struct v4l2_ext_control control[] = { ++ struct v4l2_ext_control control[6] = { + { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, ++ .id = V4L2_CID_STATELESS_HEVC_SPS, + .ptr = &controls->sps, + .size = sizeof(controls->sps), + }, + { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, ++ .id = V4L2_CID_STATELESS_HEVC_PPS, + .ptr = &controls->pps, + .size = sizeof(controls->pps), + }, +#if HEVC_CTRLS_VERSION >= 2 + { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS, ++ .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS, + .ptr = dec, + .size = sizeof(*dec), + }, +#endif -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, -+ .ptr = slices + slice_no, -+ .size = sizeof(*slices) * slice_count, -+ }, -+ // Optional -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, -+ .ptr = &controls->scaling_matrix, -+ .size = sizeof(controls->scaling_matrix), -+ }, + }; + -+ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, -+ controls->has_scaling ? 
-+ FF_ARRAY_ELEMS(control) : -+ FF_ARRAY_ELEMS(control) - 1); ++ if (slices) ++ control[n++] = (struct v4l2_ext_control) { ++ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, ++ .ptr = slices, ++ .size = sizeof(*slices) * slice_count, ++ }; ++ ++ if (controls->has_scaling) ++ control[n++] = (struct v4l2_ext_control) { ++ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX, ++ .ptr = &controls->scaling_matrix, ++ .size = sizeof(controls->scaling_matrix), ++ }; ++ ++#if HEVC_CTRLS_VERSION >= 4 ++ if (offsets) ++ control[n++] = (struct v4l2_ext_control) { ++ .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, ++ .ptr = offsets, ++ .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count, ++ }; ++#endif ++ ++ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n); + + return rv; +} + ++// This only works because we started out from a single coded frame buffer ++// that will remain intact until after end_frame +static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) +{ + const HEVCContext * const h = avctx->priv_data; @@ -54679,18 +55934,45 @@ index 0000000000..55c41ae679 + int bcount = get_bits_count(&h->HEVClc->gb); + uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; + ++ const unsigned int n = rd->num_slices; ++ const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices; ++ + int rv; + struct slice_info * si; + ++ // This looks dodgy but we know that FFmpeg has parsed this from a buffer ++ // that contains the entire frame including the start code ++ if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { ++ buffer -= 3; ++ size += 3; ++ boff += 24; ++ if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) { ++ av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n", ++ buffer[0], buffer[1], buffer[2]); ++ } ++ } ++ ++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { ++ if (rd->slices == NULL) { ++ if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL) ++ return AVERROR(ENOMEM); ++ rd->slices->ptr = buffer; ++ rd->num_slices = 1; ++ } ++ rd->slices->len = buffer - rd->slices->ptr + size; ++ return 0; ++ } ++ + if ((rv = slice_add(rd)) != 0) + return rv; + -+ si = rd->slices + rd->num_slices - 1; ++ si = rd->slices + n; + si->ptr = buffer; + si->len = size; ++ si->n_offsets = rd->num_offsets; + -+ if (ctx->multi_slice && rd->num_slices > 1) { -+ struct slice_info *const si0 = rd->slices; ++ if (n != block_start) { ++ struct slice_info *const si0 = rd->slices + block_start; + const size_t offset = (buffer - si0->ptr); + boff += offset * 8; + size += offset; @@ -54698,12 +55980,15 @@ index 0000000000..55c41ae679 + } + +#if HEVC_CTRLS_VERSION >= 2 -+ if (rd->num_slices == 1) ++ if (n == 0) + fill_decode_params(h, &rd->dec); -+ fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff); ++ fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff); +#else -+ fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff); ++ fill_slice_params(h, rd->slice_params + n, size * 8, boff); +#endif ++ if (ctx->max_offsets != 0 && ++ (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0) ++ return rv; + + return 0; +} @@ -54729,10 +56014,13 @@ index 0000000000..55c41ae679 +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + ++ const int is_last = (j == rd->num_slices); + struct slice_info *const si = rd->slices + i; + 
struct media_request * req = NULL; + struct qent_src * src = NULL; + MediaBufsStatus stat; ++ void * offsets = rd->offsets + rd->slices[i].n_offsets; ++ size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets; + + if ((req = media_request_get(ctx->mpool)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__); @@ -54744,8 +56032,8 @@ index 0000000000..55c41ae679 +#if HEVC_CTRLS_VERSION >= 2 + &rd->dec, +#endif -+ rd->slice_params, -+ i, j - i)) { ++ rd->slice_params + i, j - i, ++ offsets, n_offsets)) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__); + goto fail1; + } @@ -54765,13 +56053,9 @@ index 0000000000..55c41ae679 + goto fail2; + } + -+#warning ANNEX_B start code -+// if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { -+// } -+ + stat = mediabufs_start_request(ctx->mbufs, &req, &src, + i == 0 ? rd->qe_dst : NULL, -+ j == rd->num_slices); ++ is_last); + + if (stat != MEDIABUFS_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__); @@ -54836,18 +56120,11 @@ index 0000000000..55c41ae679 + } + + // Send as slices -+ if (ctx->multi_slice) -+ { -+ if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0) ++ for (i = 0; i < rd->num_slices; i += ctx->max_slices) { ++ const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices); ++ if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0) + goto fail; + } -+ else -+ { -+ for (i = 0; i != rd->num_slices; ++i) { -+ if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0) -+ goto fail; -+ } -+ } + + // Set the drm_prime desriptor + drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs)); @@ -54862,6 +56139,12 @@ index 0000000000..55c41ae679 + return rv; +} + ++static inline int ++ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v) ++{ ++ return v >= c->minimum && v <= c->maximum; ++} ++ +// Initial check & init +static int +probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) @@ -54873,17 +56156,19 @@ index 0000000000..55c41ae679 + + // Check for var slice array + struct v4l2_query_ext_ctrl qc[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX }, ++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_SPS }, ++ { .id = V4L2_CID_STATELESS_HEVC_PPS }, ++ { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX }, +#if HEVC_CTRLS_VERSION >= 2 -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS }, +#endif + }; + // Order & size must match! 
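The probe() sequence here leans on the VIDIOC_QUERY_EXT_CTRL ioctl (wrapped by mediabufs_ctl_query_ext_ctrls) both to discover whether the driver exposes a given stateless HEVC control and to confirm that the driver's element size matches the struct this code is about to send. A minimal standalone sketch of that query-and-check pattern follows; the device node "/dev/video10", the helper name query_ctrl_size and the use of V4L2_CID_MPEG_VIDEO_HEVC_SPS are illustrative assumptions, and the HEVC-specific part is guarded because that control only exists with the stateless HEVC uAPI headers.

#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

/* Query one extended control and check that the driver's element size
 * matches the size of the payload we intend to send. */
static int query_ctrl_size(int fd, uint32_t id, size_t expected_size)
{
    struct v4l2_query_ext_ctrl qc = { .id = id };

    if (ioctl(fd, VIDIOC_QUERY_EXT_CTRL, &qc) != 0)
        return -errno;                 /* control not supported at all */
    if (qc.elem_size != expected_size)
        return -EINVAL;                /* kernel/header size mismatch  */
    return 0;
}

int main(void)
{
    int fd = open("/dev/video10", O_RDWR);   /* example device node only */
    if (fd < 0)
        return 1;

#ifdef V4L2_CID_MPEG_VIDEO_HEVC_SPS
    /* Mirrors the elem_size check that probe() performs against its ctrl_sizes[] table. */
    if (query_ctrl_size(fd, V4L2_CID_MPEG_VIDEO_HEVC_SPS,
                        sizeof(struct v4l2_ctrl_hevc_sps)) == 0)
        printf("driver exposes a compatible HEVC SPS control\n");
#endif

    close(fd);
    return 0;
}

The same struct v4l2_query_ext_ctrl also reports minimum/maximum, which is what the ctrl_valid() helper above tests before a decode mode or start code is chosen.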
+ static const size_t ctrl_sizes[] = { + sizeof(struct v4l2_ctrl_hevc_slice_params), ++ sizeof(int32_t), + sizeof(struct v4l2_ctrl_hevc_sps), + sizeof(struct v4l2_ctrl_hevc_pps), + sizeof(struct v4l2_ctrl_hevc_scaling_matrix), @@ -54901,11 +56186,22 @@ index 0000000000..55c41ae679 + return AVERROR(EINVAL); +#endif + -+ if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) { -+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION); ++ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls); ++ i = 0; ++#if HEVC_CTRLS_VERSION >= 4 ++ // Skip slice check if no slice mode ++ if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ++ i = 1; ++#else ++ // Fail frame mode silently for anything prior to V4 ++ if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) + return AVERROR(EINVAL); -+ } -+ for (i = 0; i != noof_ctrls; ++i) { ++#endif ++ for (; i != noof_ctrls; ++i) { ++ if (qc[i].type == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id); ++ return AVERROR(EINVAL); ++ } + if (ctrl_sizes[i] != (size_t)qc[i].elem_size) { + av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n", + HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size); @@ -54915,12 +56211,11 @@ index 0000000000..55c41ae679 + + fill_sps(&ctrl_sps, sps); + -+ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { ++ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { + av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n"); + return AVERROR(EINVAL); + } + -+ ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0; + return 0; +} + @@ -54931,38 +56226,63 @@ index 0000000000..55c41ae679 + int ret; + + struct v4l2_query_ext_ctrl querys[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, }, ++#if HEVC_CTRLS_VERSION >= 4 ++ { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, }, ++#endif + }; + + struct v4l2_ext_control ctrls[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, + }; + + mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); + -+ ctx->decode_mode = querys[0].default_value; ++ ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) || ++ querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ? ++ 1 : querys[2].dims[0]; ++ av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices); + -+ if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && -+ ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); ++#if HEVC_CTRLS_VERSION >= 4 ++ ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ? 
++ 0 : querys[3].dims[0]; ++ av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); ++#else ++ ctx->max_offsets = 0; ++#endif ++ ++ if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED || ++ querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) ++ ctx->decode_mode = querys[0].default_value; ++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)) ++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED; ++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED; ++ else { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__); + return AVERROR(EINVAL); + } + -+ ctx->start_code = querys[1].default_value; -+ if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && -+ ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); ++ if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE || ++ querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) ++ ctx->start_code = querys[1].default_value; ++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; ++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; ++ else { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); + return AVERROR(EINVAL); + } + -+ ctx->max_slices = querys[2].elems; -+ if (ctx->max_slices > MAX_SLICES) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices); -+ return AVERROR(EINVAL); -+ } ++ // If we are in slice mode & START_CODE_NONE supported then pick that ++ // as it doesn't require the slightly dodgy look backwards in our raw buffer ++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && ++ ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; + + ctrls[0].value = ctx->decode_mode; + ctrls[1].value = ctx->start_code; @@ -54986,6 +56306,7 @@ index 0000000000..55c41ae679 + + av_freep(&rd->slices); + av_freep(&rd->slice_params); ++ av_freep(&rd->offsets); + + av_free(rd); +} @@ -57289,10 +58610,10 @@ index 0000000000..cb4bd164b4 + diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c new file mode 100644 -index 0000000000..0ae14db90b +index 0000000000..27b1b8dd6d --- /dev/null +++ b/libavcodec/v4l2_request_hevc.c -@@ -0,0 +1,311 @@ +@@ -0,0 +1,315 @@ +/* + * This file is part of FFmpeg. 
+ * @@ -57504,7 +58825,11 @@ index 0000000000..0ae14db90b + goto fail4; + } + -+ if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { ++ if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); ++ ctx->fns = &V2(ff_v4l2_req_hevc, 4); ++ } ++ else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 3); + } @@ -57606,10 +58931,10 @@ index 0000000000..0ae14db90b +}; diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h new file mode 100644 -index 0000000000..b2cb8c8584 +index 0000000000..830444bf92 --- /dev/null +++ b/libavcodec/v4l2_request_hevc.h -@@ -0,0 +1,102 @@ +@@ -0,0 +1,101 @@ +#ifndef AVCODEC_V4L2_REQUEST_HEVC_H +#define AVCODEC_V4L2_REQUEST_HEVC_H + @@ -57657,8 +58982,6 @@ index 0000000000..b2cb8c8584 +#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800 +#endif + -+#define MAX_SLICES 128 -+ +#define VCAT(name, version) name##_v##version +#define V2(n,v) VCAT(n, v) +#define V(n) V2(n, HEVC_CTRLS_VERSION) @@ -57675,10 +58998,10 @@ index 0000000000..b2cb8c8584 + + unsigned int timestamp; // ?? maybe uint64_t + -+ int multi_slice; + int decode_mode; + int start_code; -+ int max_slices; ++ unsigned int max_slices; // 0 => not wanted (frame mode) ++ unsigned int max_offsets; // 0 => not wanted + + req_decode_q decode_q; + @@ -57710,6 +59033,7 @@ index 0000000000..b2cb8c8584 +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3); ++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4); + +#endif diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c @@ -61841,10 +63165,10 @@ index 5613813ba8..ab8bcfcf34 100644 + diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S new file mode 100644 -index 0000000000..cdcf71ee67 +index 0000000000..2f07d9674c --- /dev/null +++ b/libavutil/aarch64/rpi_sand_neon.S -@@ -0,0 +1,676 @@ +@@ -0,0 +1,781 @@ +/* +Copyright (c) 2021 Michael Eiler + @@ -62095,228 +63419,6 @@ index 0000000000..cdcf71ee67 + ret +endfunc + -+//void ff_rpi_sand30_lines_to_planar_y16( -+// uint8_t * dest, // [x0] -+// unsigned int dst_stride, // [w1] -> assumed to be equal to _w -+// const uint8_t * src, // [x2] -+// unsigned int src_stride1, // [w3] -> 128 -+// unsigned int src_stride2, // [w4] -+// unsigned int _x, // [w5] -+// unsigned int y, // [w6] -+// unsigned int _w, // [w7] -+// unsigned int h); // [sp, #0] -+ -+function ff_rpi_sand30_lines_to_planar_y16, export=1 -+ stp x19, x20, [sp, #-48]! 
-+ stp x21, x22, [sp, #16] -+ stp x23, x24, [sp, #32] -+ -+ // w6 = argument h -+ ldr w6, [sp, #48] -+ -+ // slice_inc = ((stride2 - 1) * stride1) -+ mov w5, w4 -+ sub w5, w5, #1 -+ lsl w5, w5, #7 -+ -+ // total number of bytes per row = (width / 3) * 4 -+ mov w8, w7 -+ mov w9, #3 -+ udiv w8, w8, w9 -+ lsl w8, w8, #2 -+ -+ // number of full 128 byte blocks to be processed -+ mov w9, #96 -+ udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96 -+ -+ // w10 = number of full integers to process (4 bytes) -+ // w11 = remaning zero to two 10bit values still to copy over -+ mov w12, #96 -+ mul w12, w9, w12 -+ sub w12, w7, w12 // width - blocks*96 = remaining points per row -+ mov w11, #3 -+ udiv w10, w12, w11 // full integers to process = w12 / 3 -+ mul w11, w10, w11 // #integers *3 -+ sub w11, w12, w11 // remaining 0-2 points = remaining points - integers*3 -+ -+ // increase w9 by one if w10+w11 is not zero, and decrease the row count by one -+ // this is to efficiently copy incomplete blocks at the end of the rows -+ // the last row is handled explicitly to avoid writing out of bounds -+ add w22, w10, w11 -+ cmp w22, #0 -+ cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise -+ add w9, w9, w22 -+ sub w6, w6, #1 -+ -+ // store the number of bytes in w20 which we copy too much for every row -+ // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values) -+ mov w20, #96*2 -+ mul w20, w20, w9 -+ sub w20, w1, w20 -+ -+ mov w23, #0 // flag to check whether the last line had already been processed -+ -+ // bitmask to clear the uppper 6bits of the result values -+ mov x19, #0x03ff03ff03ff03ff -+ dup v22.2d, x19 -+ -+ // row counter = 0 -+ eor w12, w12, w12 -+row_loop_y16: -+ cmp w12, w6 // jump to row_loop_y16_fin if we processed all rows -+ bge row_loop_y16_fin -+ -+ mov x13, x2 // row src -+ eor w14, w14, w14 // full block counter -+block_loop_y16: -+ cmp w14, w9 -+ bge block_loop_y16_fin -+ -+ // load 64 bytes -+ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 -+ -+ // process v0 and v1 -+ xtn v16.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v17.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v18.4h, v0.4s -+ -+ xtn2 v16.8h, v1.4s -+ and v16.16b, v16.16b, v22.16b -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v17.8h, v1.4s -+ and v17.16b, v17.16b, v22.16b -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v18.8h, v1.4s -+ and v18.16b, v18.16b, v22.16b -+ -+ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 -+ -+ // process v2 and v3 -+ xtn v23.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v24.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v25.4h, v2.4s -+ -+ xtn2 v23.8h, v3.4s -+ and v23.16b, v23.16b, v22.16b -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v24.8h, v3.4s -+ and v24.16b, v24.16b, v22.16b -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v25.8h, v3.4s -+ and v25.16b, v25.16b, v22.16b -+ -+ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 -+ -+ // load the second half of the block -> 64 bytes into registers v4-v7 -+ ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13], #64 -+ -+ // process v4 and v5 -+ xtn v16.4h, v4.4s -+ ushr v4.4s, v4.4s, #10 -+ xtn v17.4h, v4.4s -+ ushr v4.4s, v4.4s, #10 -+ xtn v18.4h, v4.4s -+ -+ xtn2 v16.8h, v5.4s -+ and v16.16b, v16.16b, v22.16b -+ ushr v5.4s, v5.4s, #10 -+ xtn2 v17.8h, v5.4s -+ and v17.16b, v17.16b, v22.16b -+ ushr v5.4s, v5.4s, #10 -+ xtn2 v18.8h, v5.4s -+ and v18.16b, v18.16b, v22.16b -+ -+ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 -+ -+ // v6 and v7 -+ xtn v23.4h, v6.4s -+ ushr v6.4s, v6.4s, #10 -+ xtn v24.4h, v6.4s -+ ushr v6.4s, v6.4s, #10 -+ xtn v25.4h, v6.4s -+ -+ xtn2 v23.8h, v7.4s -+ and v23.16b, v23.16b, 
v22.16b -+ ushr v7.4s, v7.4s, #10 -+ xtn2 v24.8h, v7.4s -+ and v24.16b, v24.16b, v22.16b -+ ushr v7.4s, v7.4s, #10 -+ xtn2 v25.8h, v7.4s -+ and v25.16b, v25.16b, v22.16b -+ -+ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 -+ -+ add x13, x13, x5 // row src += slice_inc -+ add w14, w14, #1 -+ b block_loop_y16 -+block_loop_y16_fin: -+ -+ -+ -+ -+ add x2, x2, #128 // src += stride1 (start of the next row) -+ add x0, x0, w20, sxtw // subtract the bytes we copied too much from dst -+ add w12, w12, #1 -+ b row_loop_y16 -+row_loop_y16_fin: -+ -+ // check whether we have incomplete blocks at the end of every row -+ // in that case decrease row block count by one -+ // change height back to it's original value (meaning increase it by 1) -+ // and jump back to another iteration of row_loop_y16 -+ -+ cmp w23, #1 -+ beq row_loop_y16_fin2 // don't continue here if we already processed the last row -+ add w6, w6, #1 // increase height to the original value -+ sub w9, w9, w22 // block count - 1 or 0, depending on the remaining bytes count -+ mov w23, #1 -+ b row_loop_y16 -+row_loop_y16_fin2: -+ -+ sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr to far ahead, therefore readd the diference -+ -+ // now we've got to handle the last block in the last row -+ eor w12, w12, w12 // w12 = 0 = counter -+integer_loop_y16: -+ cmp w12, w10 -+ bge integer_loop_y16_fin -+ ldr w14, [x13], #4 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ lsr w14, w14, #10 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ lsr w14, w14, #10 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ add w12, w12, #1 -+ b integer_loop_y16 -+integer_loop_y16_fin: -+ -+final_values_y16: -+ // remaining point count = w11 -+ ldr w14, [x13], #4 -+ cmp w11, #0 -+ beq final_values_y16_fin -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+ cmp w11, #1 -+ beq final_values_y16_fin -+ lsr w14, w14, #10 -+ and w15, w14, #0x3ff -+ strh w15, [x0], #2 -+final_values_y16_fin: -+ -+ ldp x23, x24, [sp, #32] -+ ldp x21, x22, [sp, #16] -+ ldp x19, x20, [sp], #48 -+ ret -+endfunc -+ +//void ff_rpi_sand30_lines_to_planar_c16( +// uint8_t * dst_u, // [x0] +// unsigned int dst_stride_u, // [w1] == _w*2 @@ -62521,12 +63623,339 @@ index 0000000000..cdcf71ee67 +// unsigned int _w, +// unsigned int h); + ++// void ff_rpi_sand30_lines_to_planar_y8( ++// uint8_t * dest, : x0 ++// unsigned int dst_stride, : w1 ++// const uint8_t * src, : x2 ++// unsigned int src_stride1, : w3, always 128 ++// unsigned int src_stride2, : w4 ++// unsigned int _x, : w5 ++// unsigned int y, : w6 ++// unsigned int _w, : w7 ++// unsigned int h); : [sp, #0] ++// ++// Assumes that we are starting on a stripe boundary and that overreading ++// within the stripe is OK. 
However it does respect the dest size for wri ++ ++function ff_rpi_sand30_lines_to_planar_y16, export=1 ++ lsl w4, w4, #7 ++ sub w4, w4, #64 ++ sub w1, w1, w7, lsl #1 ++ uxtw x6, w6 ++ add x8, x2, x6, lsl #7 ++ ldr w6, [sp, #0] ++ ++10: ++ mov x2, x8 ++ mov w5, w7 ++1: ++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 ++ ++ subs w5, w5, #96 ++ ++ // v0, v1 ++ ++ shrn v18.4h, v0.4s, #14 ++ xtn v16.4h, v0.4s ++ shrn v17.4h, v0.4s, #10 ++ ++ shrn2 v18.8h, v1.4s, #14 ++ xtn2 v16.8h, v1.4s ++ shrn2 v17.8h, v1.4s, #10 ++ ++ ushr v18.8h, v18.8h, #6 ++ bic v16.8h, #0xfc, lsl #8 ++ bic v17.8h, #0xfc, lsl #8 ++ ++ // v2, v3 ++ ++ shrn v21.4h, v2.4s, #14 ++ xtn v19.4h, v2.4s ++ shrn v20.4h, v2.4s, #10 ++ ++ shrn2 v21.8h, v3.4s, #14 ++ xtn2 v19.8h, v3.4s ++ shrn2 v20.8h, v3.4s, #10 ++ ++ ushr v21.8h, v21.8h, #6 ++ bic v19.8h, #0xfc, lsl #8 ++ bic v20.8h, #0xfc, lsl #8 ++ ++ // v4, v5 ++ ++ shrn v24.4h, v4.4s, #14 ++ xtn v22.4h, v4.4s ++ shrn v23.4h, v4.4s, #10 ++ ++ shrn2 v24.8h, v5.4s, #14 ++ xtn2 v22.8h, v5.4s ++ shrn2 v23.8h, v5.4s, #10 ++ ++ ushr v24.8h, v24.8h, #6 ++ bic v22.8h, #0xfc, lsl #8 ++ bic v23.8h, #0xfc, lsl #8 ++ ++ // v6, v7 ++ ++ shrn v27.4h, v6.4s, #14 ++ xtn v25.4h, v6.4s ++ shrn v26.4h, v6.4s, #10 ++ ++ shrn2 v27.8h, v7.4s, #14 ++ xtn2 v25.8h, v7.4s ++ shrn2 v26.8h, v7.4s, #10 ++ ++ ushr v27.8h, v27.8h, #6 ++ bic v25.8h, #0xfc, lsl #8 ++ bic v26.8h, #0xfc, lsl #8 ++ ++ blt 2f ++ ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 ++ st3 {v22.8h, v23.8h, v24.8h}, [x0], #48 ++ st3 {v25.8h, v26.8h, v27.8h}, [x0], #48 ++ ++ bne 1b ++ ++11: ++ subs w6, w6, #1 ++ add x0, x0, w1, uxtw ++ add x8, x8, #128 ++ bne 10b ++ ++ ret ++ ++// Partial final write ++2: ++ cmp w5, #48-96 ++ blt 1f ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 ++ beq 11b ++ mov v16.16b, v22.16b ++ mov v17.16b, v23.16b ++ sub w5, w5, #48 ++ mov v18.16b, v24.16b ++ mov v19.16b, v25.16b ++ mov v20.16b, v26.16b ++ mov v21.16b, v27.16b ++1: ++ cmp w5, #24-96 ++ blt 1f ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ beq 11b ++ mov v16.16b, v19.16b ++ mov v17.16b, v20.16b ++ sub w5, w5, #24 ++ mov v18.16b, v21.16b ++1: ++ cmp w5, #12-96 ++ blt 1f ++ st3 {v16.4h, v17.4h, v18.4h}, [x0], #24 ++ beq 11b ++ mov v16.2d[0], v16.2d[1] ++ sub w5, w5, #12 ++ mov v17.2d[0], v17.2d[1] ++ mov v18.2d[0], v18.2d[1] ++1: ++ cmp w5, #6-96 ++ blt 1f ++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 ++ st3 {v16.h, v17.h, v18.h}[1], [x0], #6 ++ beq 11b ++ mov v16.2s[0], v16.2s[1] ++ sub w5, w5, #6 ++ mov v17.2s[0], v17.2s[1] ++ mov v18.2s[0], v18.2s[1] ++1: ++ cmp w5, #3-96 ++ blt 1f ++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 ++ beq 11b ++ mov v16.4h[0], v16.4h[1] ++ sub w5, w5, #3 ++ mov v17.4h[0], v17.4h[1] ++1: ++ cmp w5, #2-96 ++ blt 1f ++ st2 {v16.h, v17.h}[0], [x0], #4 ++ b 11b ++1: ++ st1 {v16.h}[0], [x0], #2 ++ b 11b ++ ++endfunc ++ ++// void ff_rpi_sand30_lines_to_planar_y8( ++// uint8_t * dest, : x0 ++// unsigned int dst_stride, : w1 ++// const uint8_t * src, : x2 ++// unsigned int src_stride1, : w3, always 128 ++// unsigned int src_stride2, : w4 ++// unsigned int _x, : w5 ++// unsigned int y, : w6 ++// unsigned int _w, : w7 ++// unsigned int h); : [sp, #0] ++// ++// Assumes that we are starting on a stripe boundary and that overreading ++// within the stripe is OK. 
However it does respect the dest size for wri ++ ++function ff_rpi_sand30_lines_to_planar_y8, export=1 ++ lsl w4, w4, #7 ++ sub w4, w4, #64 ++ sub w1, w1, w7 ++ uxtw x6, w6 ++ add x8, x2, x6, lsl #7 ++ ldr w6, [sp, #0] ++ ++10: ++ mov x2, x8 ++ mov w5, w7 ++1: ++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 ++ ++ subs w5, w5, #96 ++ ++ // v0, v1 ++ ++ shrn v18.4h, v0.4s, #16 ++ xtn v16.4h, v0.4s ++ shrn v17.4h, v0.4s, #12 ++ ++ shrn2 v18.8h, v1.4s, #16 ++ xtn2 v16.8h, v1.4s ++ shrn2 v17.8h, v1.4s, #12 ++ ++ shrn v18.8b, v18.8h, #6 ++ shrn v16.8b, v16.8h, #2 ++ xtn v17.8b, v17.8h ++ ++ // v2, v3 ++ ++ shrn v21.4h, v2.4s, #16 ++ xtn v19.4h, v2.4s ++ shrn v20.4h, v2.4s, #12 ++ ++ shrn2 v21.8h, v3.4s, #16 ++ xtn2 v19.8h, v3.4s ++ shrn2 v20.8h, v3.4s, #12 ++ ++ shrn2 v18.16b, v21.8h, #6 ++ shrn2 v16.16b, v19.8h, #2 ++ xtn2 v17.16b, v20.8h ++ ++ // v4, v5 ++ ++ shrn v24.4h, v4.4s, #16 ++ xtn v22.4h, v4.4s ++ shrn v23.4h, v4.4s, #12 ++ ++ shrn2 v24.8h, v5.4s, #16 ++ xtn2 v22.8h, v5.4s ++ shrn2 v23.8h, v5.4s, #12 ++ ++ shrn v21.8b, v24.8h, #6 ++ shrn v19.8b, v22.8h, #2 ++ xtn v20.8b, v23.8h ++ ++ // v6, v7 ++ ++ shrn v27.4h, v6.4s, #16 ++ xtn v25.4h, v6.4s ++ shrn v26.4h, v6.4s, #12 ++ ++ shrn2 v27.8h, v7.4s, #16 ++ xtn2 v25.8h, v7.4s ++ shrn2 v26.8h, v7.4s, #12 ++ ++ shrn2 v21.16b, v27.8h, #6 ++ shrn2 v19.16b, v25.8h, #2 ++ xtn2 v20.16b, v26.8h ++ ++ blt 2f ++ ++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 ++ st3 {v19.16b, v20.16b, v21.16b}, [x0], #48 ++ ++ bne 1b ++ ++11: ++ subs w6, w6, #1 ++ add x0, x0, w1, uxtw ++ add x8, x8, #128 ++ bne 10b ++ ++ ret ++ ++// Partial final write ++2: ++ cmp w5, #48-96 ++ blt 1f ++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 ++ beq 11b ++ mov v16.16b, v22.16b ++ mov v17.16b, v23.16b ++ sub w5, w5, #48 ++ mov v18.16b, v24.16b ++1: ++ cmp w5, #24-96 ++ blt 1f ++ st3 {v16.8b, v17.8b, v18.8b}, [x0], #24 ++ beq 11b ++ mov v16.2d[0], v16.2d[1] ++ sub w5, w5, #24 ++ mov v17.2d[0], v17.2d[1] ++ mov v18.2d[0], v18.2d[1] ++1: ++ cmp w5, #12-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[2], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[3], [x0], #3 ++ beq 11b ++ mov v16.2s[0], v16.2s[1] ++ sub w5, w5, #12 ++ mov v17.2s[0], v17.2s[1] ++ mov v18.2s[0], v18.2s[1] ++1: ++ cmp w5, #6-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 ++ beq 11b ++ mov v16.4h[0], v16.4h[1] ++ sub w5, w5, #6 ++ mov v17.4h[0], v17.4h[1] ++ mov v18.4h[0], v18.4h[1] ++1: ++ cmp w5, #3-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ beq 11b ++ mov v16.8b[0], v16.8b[1] ++ sub w5, w5, #3 ++ mov v17.8b[0], v17.8b[1] ++1: ++ cmp w5, #2-96 ++ blt 1f ++ st2 {v16.b, v17.b}[0], [x0], #2 ++ b 11b ++1: ++ st1 {v16.b}[0], [x0], #1 ++ b 11b ++ ++endfunc ++ diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h new file mode 100644 -index 0000000000..b3aa481ea4 +index 0000000000..2a56135bc3 --- /dev/null +++ b/libavutil/aarch64/rpi_sand_neon.h -@@ -0,0 +1,55 @@ +@@ -0,0 +1,59 @@ +/* +Copyright (c) 2021 Michael Eiler + @@ -62578,6 +64007,10 @@ index 0000000000..b3aa481ea4 + uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1, + unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); + ++void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, ++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, ++ unsigned 
int _x, unsigned int y, unsigned int _w, unsigned int h); ++ +#ifdef __cplusplus +} +#endif @@ -62593,10 +64026,10 @@ index 5da44b0542..b74b7c4e2f 100644 + arm/rpi_sand_neon.o \ diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S new file mode 100644 -index 0000000000..80890fe985 +index 0000000000..60e697f681 --- /dev/null +++ b/libavutil/arm/rpi_sand_neon.S -@@ -0,0 +1,768 @@ +@@ -0,0 +1,925 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. @@ -62959,7 +64392,6 @@ index 0000000000..80890fe985 + ldr r6, [sp, #36] + ldr r7, [sp, #32] @ y + mov r12, #48 -+ vmov.u16 q15, #0x3ff + sub r3, #1 + lsl r3, #7 + sub r1, r1, r6, lsl #1 @@ -62975,37 +64407,33 @@ index 0000000000..80890fe985 + vldm r2!, {q10-q13} + add lr, #64 + -+ vshr.u32 q14, q10, #20 @ Cannot vshrn.u32 #20! ++ vshrn.u32 d4 , q10, #14 @ Cannot vshrn.u32 #20! + ands lr, #127 + vshrn.u32 d2, q10, #10 + vmovn.u32 d0, q10 -+ vmovn.u32 d4, q14 + -+ vshr.u32 q14, q11, #20 ++ vshrn.u32 d5, q11, #14 + it eq + addeq r2, r3 + vshrn.u32 d3, q11, #10 + vmovn.u32 d1, q11 -+ vmovn.u32 d5, q14 + + subs r5, #48 -+ vand q0, q15 -+ vand q1, q15 -+ vand q2, q15 ++ vshr.u16 q2, #6 ++ vbic.u16 q0, #0xfc00 ++ vbic.u16 q1, #0xfc00 + -+ vshr.u32 q14, q12, #20 ++ vshrn.u32 d20, q12, #14 + vshrn.u32 d18, q12, #10 + vmovn.u32 d16, q12 -+ vmovn.u32 d20, q14 + -+ vshr.u32 q14, q13, #20 ++ vshrn.u32 d21, q13, #14 + vshrn.u32 d19, q13, #10 + vmovn.u32 d17, q13 -+ vmovn.u32 d21, q14 + -+ vand q8, q15 -+ vand q9, q15 -+ vand q10, q15 ++ vshr.u16 q10, #6 ++ vbic.u16 q8, #0xfc00 ++ vbic.u16 q9 , #0xfc00 + blt 2f + + vst3.16 {d0, d2, d4}, [r0], r12 @@ -63098,7 +64526,6 @@ index 0000000000..80890fe985 + ldr r7, [sp, #48] + ldr r9, [sp, #52] + mov r12, #48 -+ vmov.u16 q15, #0x3ff + sub r8, #1 + lsl r8, #7 + add r5, r5, r7, lsl #7 @@ -63114,48 +64541,44 @@ index 0000000000..80890fe985 + add lr, #64 + + @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2 -+ vshr.u32 q14, q0, #20 -+ vshrn.u32 d16, q0, #10 ++ vshrn.u32 d20, q0, #14 + vmovn.u32 d18, q0 ++ vshrn.u32 d0, q0, #10 + ands lr, #127 -+ vmovn.u32 d20, q14 + -+ vshr.u32 q14, q1, #20 -+ vshrn.u32 d17, q1, #10 ++ vshrn.u32 d21, q1, #14 + vmovn.u32 d19, q1 -+ vmovn.u32 d21, q14 ++ vshrn.u32 d1, q1, #10 + -+ vshr.u32 q14, q2, #20 + vshrn.u32 d22, q2, #10 -+ vmovn.u32 d24, q2 -+ vmovn.u32 d26, q14 ++ vmovn.u32 d2, q2 ++ vshrn.u32 d4, q2, #14 + -+ vshr.u32 q14, q3, #20 -+ vshrn.u32 d23, q3, #10 -+ vmovn.u32 d25, q3 + add r10, r0, #24 -+ vmovn.u32 d27, q14 ++ vshrn.u32 d23, q3, #10 ++ vmovn.u32 d3, q3 ++ vshrn.u32 d5, q3, #14 + + it eq + addeq r4, r8 -+ vuzp.16 q8, q11 -+ vuzp.16 q9, q12 -+ vuzp.16 q10, q13 ++ vuzp.16 q0, q11 ++ vuzp.16 q9, q1 ++ vuzp.16 q10, q2 + -+ @ q8 V0, V3,.. -> q0 ++ @ q0 V0, V3,.. + @ q9 U0, U3... + @ q10 U1, U4... + @ q11 U2, U5,.. -+ @ q12 V1, V4,.. -> q1 -+ @ q13 V2, V5,.. -> q2 ++ @ q1 V1, V4, ++ @ q2 V2, V5,.. 
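The register map in the comment above falls out of the SAND30 packing: each little-endian 32-bit word holds three 10-bit samples in bits 0-9, 10-19 and 20-29, and in the chroma plane the samples alternate U,V,U,V... as in NV12, so successive words begin on alternating components. The following plain-C reference model shows the same unpacking; it is a simplified sketch that ignores the 128-byte stripe addressing and partial-width handling the NEON routine also deals with, and the function name is illustrative only.

#include <stdint.h>
#include <stddef.h>

/* Scalar model of the SAND30 chroma unpack: every 32-bit little-endian
 * word holds three 10-bit samples (bits 0-9, 10-19, 20-29) and the
 * chroma samples alternate U,V,U,V... exactly as in NV12.
 * 'src' is assumed to point at one row inside a single column stripe,
 * and 'pairs' is the number of U/V pairs wanted from it. */
static void sand30_row_to_planar_c16(uint16_t *du, uint16_t *dv,
                                     const uint32_t *src, size_t pairs)
{
    unsigned int phase = 0;            /* 0 => next sample is U, 1 => V */
    size_t done = 0;

    while (done < pairs) {
        uint32_t w = *src++;
        for (int i = 0; i < 3 && done < pairs; i++, w >>= 10) {
            uint16_t s = w & 0x3ff;    /* keep the low 10 bits */
            if (phase == 0) {
                *du++ = s;
            } else {
                *dv++ = s;
                done++;                /* a pair completes on the V sample */
            }
            phase ^= 1;
        }
    }
}

The phase flag plays the role of the vuzp de-interleave in the assembly above: it routes alternating samples to the U and V outputs.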
+ + subs r6, #24 -+ vand q11, q15 -+ vand q9, q15 -+ vand q10, q15 -+ vand q0, q8, q15 -+ vand q1, q12, q15 -+ vand q2, q13, q15 ++ vbic.u16 q11, #0xfc00 ++ vbic.u16 q9, #0xfc00 ++ vshr.u16 q10, #6 ++ vshr.u16 q2, #6 ++ vbic.u16 q0, #0xfc00 ++ vbic.u16 q1, #0xfc00 + + blt 2f + @@ -63364,13 +64787,180 @@ index 0000000000..80890fe985 +endfunc + + ++@ void ff_rpi_sand30_lines_to_planar_y8( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. However it does respect the dest size for wri ++ ++function ff_rpi_sand30_lines_to_planar_y8, export=1 ++ push {r4-r8, lr} @ +24 ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ mov r12, #48 ++ lsl r3, #7 ++ sub r1, r1, r6 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++1: ++ vldm r2, {q8-q15} ++ ++ subs r5, #96 ++ ++ vmovn.u32 d0, q8 ++ vshrn.u32 d2, q8, #12 ++ vshrn.u32 d4, q8, #16 @ Cannot vshrn.u32 #20! ++ ++ add r2, r3 ++ ++ vmovn.u32 d1, q9 ++ vshrn.u32 d3, q9, #12 ++ vshrn.u32 d5, q9, #16 ++ ++ pld [r2, #0] ++ ++ vshrn.u16 d0, q0, #2 ++ vmovn.u16 d1, q1 ++ vshrn.u16 d2, q2, #6 ++ ++ vmovn.u32 d16, q10 ++ vshrn.u32 d18, q10, #12 ++ vshrn.u32 d20, q10, #16 ++ ++ vmovn.u32 d17, q11 ++ vshrn.u32 d19, q11, #12 ++ vshrn.u32 d21, q11, #16 ++ ++ pld [r2, #64] ++ ++ vshrn.u16 d4, q8, #2 ++ vmovn.u16 d5, q9 ++ vshrn.u16 d6, q10, #6 ++ ++ vmovn.u32 d16, q12 ++ vshrn.u32 d18, q12, #12 ++ vshrn.u32 d20, q12, #16 ++ ++ vmovn.u32 d17, q13 ++ vshrn.u32 d19, q13, #12 ++ vshrn.u32 d21, q13, #16 ++ ++ vshrn.u16 d16, q8, #2 ++ vmovn.u16 d17, q9 ++ vshrn.u16 d18, q10, #6 ++ ++ vmovn.u32 d20, q14 ++ vshrn.u32 d22, q14, #12 ++ vshrn.u32 d24, q14, #16 ++ ++ vmovn.u32 d21, q15 ++ vshrn.u32 d23, q15, #12 ++ vshrn.u32 d25, q15, #16 ++ ++ vshrn.u16 d20, q10, #2 ++ vmovn.u16 d21, q11 ++ vshrn.u16 d22, q12, #6 ++ ++ blt 2f ++ ++ vst3.8 {d0, d1, d2}, [r0], r12 ++ vst3.8 {d4, d5, d6}, [r4], r12 ++ vst3.8 {d16, d17, d18}, [r0], r12 ++ vst3.8 {d20, d21, d22}, [r4], r12 ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #48-96 ++ blt 1f ++ vst3.8 {d0, d1, d2}, [r0], r12 ++ vst3.8 {d4, d5, d6}, [r4], r12 ++ beq 11b ++ vmov q0, q8 ++ vmov q2, q10 ++ sub r5, #48 ++ vmov d2, d18 ++ vmov d6, d22 ++1: ++ cmp r5, #24-96 ++ blt 1f ++ vst3.8 {d0, d1, d2}, [r0]! ++ beq 11b ++ vmov q0, q2 ++ sub r5, #24 ++ vmov d2, d6 ++1: ++ cmp r5, #12-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! ++ vst3.8 {d0[2], d1[2], d2[2]}, [r0]! ++ vst3.8 {d0[3], d1[3], d2[3]}, [r0]! ++ beq 11b ++ vmov s0, s1 ++ sub r5, #12 ++ vmov s2, s3 ++ vmov s4, s5 ++1: ++ cmp r5, #6-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! ++ add r0, #12 ++ beq 11b ++ vshr.u32 d0, #16 ++ sub r5, #6 ++ vshr.u32 d1, #16 ++ vshr.u32 d2, #16 ++1: ++ cmp r5, #3-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ beq 11b ++ sub r5, #3 ++ vshr.u32 d0, #8 ++ vshr.u32 d1, #8 ++1: ++ cmp r5, #2-96 ++ blt 1f ++ vst2.8 {d0[0], d1[0]}, [r0]! 
++ b 11b ++1: ++ vst1.8 {d0[0]}, [r0]! ++ b 11b ++ ++endfunc ++ + diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h new file mode 100644 -index 0000000000..447f367bea +index 0000000000..d457c10870 --- /dev/null +++ b/libavutil/arm/rpi_sand_neon.h -@@ -0,0 +1,99 @@ +@@ -0,0 +1,110 @@ +/* +Copyright (c) 2020 Raspberry Pi (Trading) Ltd. +All rights reserved. @@ -63468,6 +65058,17 @@ index 0000000000..447f367bea + unsigned int _w, // [sp, #12] -> r6 (cur r5) + unsigned int h); // [sp, #16] -> r7 + ++void ff_rpi_sand30_lines_to_planar_y8( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, #8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ +#endif // AVUTIL_ARM_SAND_NEON_H + diff --git a/libavutil/frame.c b/libavutil/frame.c @@ -63528,7 +65129,7 @@ index 7d1f8e2935..a4e7dc915d 100644 * @} */ diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c -index 7a9fdbd263..2a498f9b50 100644 +index 7a9fdbd263..2f825b7e16 100644 --- a/libavutil/hwcontext_drm.c +++ b/libavutil/hwcontext_drm.c @@ -21,6 +21,7 @@ @@ -63603,13 +65204,23 @@ index 7a9fdbd263..2a498f9b50 100644 err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, &drm_unmap_frame, map); -@@ -212,7 +240,15 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, - if (!pix_fmts) +@@ -206,16 +234,29 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, + enum AVHWFrameTransferDirection dir, + enum AVPixelFormat **formats) + { +- enum AVPixelFormat *pix_fmts; ++ enum AVPixelFormat *p; + +- pix_fmts = av_malloc_array(2, sizeof(*pix_fmts)); +- if (!pix_fmts) ++ p = *formats = av_malloc_array(3, sizeof(*p)); ++ if (!p) return AVERROR(ENOMEM); - pix_fmts[0] = ctx->sw_format; +- pix_fmts[1] = AV_PIX_FMT_NONE; + // **** Offer native sand too ???? -+ pix_fmts[0] = ++ *p++ = +#if CONFIG_SAND + ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? 
+ AV_PIX_FMT_YUV420P : @@ -63617,10 +65228,19 @@ index 7a9fdbd263..2a498f9b50 100644 + AV_PIX_FMT_YUV420P10LE : +#endif + ctx->sw_format; - pix_fmts[1] = AV_PIX_FMT_NONE; ++ ++#if CONFIG_SAND ++ if (ctx->sw_format == AV_PIX_FMT_RPI4_10 || ++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128) ++ *p++ = AV_PIX_FMT_NV12; ++#endif - *formats = pix_fmts; -@@ -231,18 +267,80 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, +- *formats = pix_fmts; ++ *p = AV_PIX_FMT_NONE; + return 0; + } + +@@ -231,18 +272,63 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, map = av_frame_alloc(); if (!map) return AVERROR(ENOMEM); @@ -63655,29 +65275,12 @@ index 7a9fdbd263..2a498f9b50 100644 + const unsigned int w = FFMIN(dst->width, map->width); + const unsigned int h = FFMIN(dst->height, map->height); + -+ if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) { -+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], -+ map->data[0], -+ 128, stride2, -+ 0, 0, w, h); -+ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ map->data[1], -+ 128, stride2, -+ 0, 0, w / 2, h / 2); -+ } -+ else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) { -+ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], -+ map->data[0], -+ 128, stride2, -+ 0, 0, w, h); -+ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ map->data[1], -+ 128, stride2, -+ 0, 0, w / 2, h / 2); -+ } -+ else ++ map->crop_top = 0; ++ map->crop_bottom = 0; ++ map->crop_left = 0; ++ map->crop_right = 0; ++ ++ if (av_rpi_sand_to_planar_frame(dst, map) != 0) + { + av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); + err = AVERROR(EINVAL); @@ -63705,7 +65308,7 @@ index 7a9fdbd263..2a498f9b50 100644 err = 0; fail: -@@ -257,7 +355,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc, +@@ -257,7 +343,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc, int err; if (src->width > hwfc->width || src->height > hwfc->height) @@ -64011,10 +65614,10 @@ index 0000000000..0d5d203dc3 + diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c new file mode 100644 -index 0000000000..1f543e9357 +index 0000000000..b6071e2928 --- /dev/null +++ b/libavutil/rpi_sand_fns.c -@@ -0,0 +1,356 @@ +@@ -0,0 +1,445 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. 
@@ -64246,6 +65849,75 @@ index 0000000000..1f543e9357 + } +} + ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// single lose bottom 2 bits truncation ++// _x & _w in pixels, strides in bytes ++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word ++ const unsigned int xskip0 = _x - (x0 >> 2) * 3; ++ const unsigned int x1 = ((_x + _w) / 3) * 4; ++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; ++ const unsigned int mask = stride1 - 1; ++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; ++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words ++ ++#if HAVE_SAND_ASM ++ if (_x == 0) { ++ ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if (x0 == x1) { ++ // ******************* ++ // Partial single word xfer ++ return; ++ } ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) ++ { ++ unsigned int x = x0; ++ const uint32_t * p = (const uint32_t *)p0; ++ uint8_t * d = dst; ++ ++ if (xskip0 != 0) { ++ const uint32_t p3 = *p++; ++ ++ if (xskip0 == 1) ++ *d++ = (p3 >> 12) & 0xff; ++ *d++ = (p3 >> 22) & 0xff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ while (x != x1) { ++ const uint32_t p3 = *p++; ++ *d++ = (p3 >> 2) & 0xff; ++ *d++ = (p3 >> 12) & 0xff; ++ *d++ = (p3 >> 22) & 0xff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ if (xrem1 != 0) { ++ const uint32_t p3 = *p; ++ ++ *d++ = (p3 >> 2) & 0xff; ++ if (xrem1 == 2) ++ *d++ = (p3 >> 12) & 0xff; ++ } ++ } ++} ++ ++ + +// w/h in pixels +void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, @@ -64327,6 +65999,16 @@ index 0000000000..1f543e9357 + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x/2, y/2, w/2, h/2); + break; ++ case AV_PIX_FMT_NV12: ++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w, h/2); ++ break; + default: + return -1; + } @@ -64361,6 +66043,16 @@ index 0000000000..1f543e9357 + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x/2, y/2, w/2, h/2); + break; ++ case AV_PIX_FMT_NV12: ++ av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w, h/2); ++ break; + default: + return -1; + } @@ -64373,10 +66065,10 @@ index 0000000000..1f543e9357 +} diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h new file mode 100644 -index 0000000000..634b55e800 +index 0000000000..462ccb8abd --- /dev/null +++ b/libavutil/rpi_sand_fns.h -@@ -0,0 +1,183 @@ +@@ -0,0 +1,188 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. 
@@ -64464,6 +66156,11 @@ index 0000000000..634b55e800 + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + ++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); + +// w/h in pixels +void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, @@ -66194,3 +67891,644 @@ index 0000000000..5935a11ca5 + + do_logparse(args.logfile) + +diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile +index 1827a4e134..08da4166ef 100644 +--- a/tests/checkasm/Makefile ++++ b/tests/checkasm/Makefile +@@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP) += g722dsp.o + AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o + AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o + AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o ++AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o + AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o + AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o ++AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o + AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o + AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o + +diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c +index 8338e8ff58..81ef182f04 100644 +--- a/tests/checkasm/checkasm.c ++++ b/tests/checkasm/checkasm.c +@@ -131,6 +131,9 @@ static const struct { + #if CONFIG_HUFFYUV_DECODER + { "huffyuvdsp", checkasm_check_huffyuvdsp }, + #endif ++ #if CONFIG_IDCTDSP ++ { "idctdsp", checkasm_check_idctdsp }, ++ #endif + #if CONFIG_JPEG2000_DECODER + { "jpeg2000dsp", checkasm_check_jpeg2000dsp }, + #endif +@@ -155,6 +158,9 @@ static const struct { + #if CONFIG_V210_ENCODER + { "v210enc", checkasm_check_v210enc }, + #endif ++ #if CONFIG_VC1DSP ++ { "vc1dsp", checkasm_check_vc1dsp }, ++ #endif + #if CONFIG_VP8DSP + { "vp8dsp", checkasm_check_vp8dsp }, + #endif +diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h +index ef6645e3a2..1a1e17d835 100644 +--- a/tests/checkasm/checkasm.h ++++ b/tests/checkasm/checkasm.h +@@ -70,6 +70,7 @@ void checkasm_check_hevc_epel_bi(void); + void checkasm_check_hevc_epel_bi_w(void); + void checkasm_check_hevc_sao(void); + void checkasm_check_huffyuvdsp(void); ++void checkasm_check_idctdsp(void); + void checkasm_check_jpeg2000dsp(void); + void checkasm_check_llviddsp(void); + void checkasm_check_llviddspenc(void); +@@ -83,6 +84,7 @@ void checkasm_check_sw_scale(void); + void checkasm_check_utvideodsp(void); + void checkasm_check_v210dec(void); + void checkasm_check_v210enc(void); ++void checkasm_check_vc1dsp(void); + void checkasm_check_vf_eq(void); + void checkasm_check_vf_gblur(void); + void checkasm_check_vf_hflip(void); +diff --git a/tests/checkasm/idctdsp.c b/tests/checkasm/idctdsp.c +new file mode 100644 +index 0000000000..02724536a7 +--- /dev/null ++++ b/tests/checkasm/idctdsp.c +@@ -0,0 +1,98 @@ ++/* ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#include ++ ++#include "checkasm.h" ++ ++#include "libavcodec/idctdsp.h" ++ ++#include "libavutil/common.h" ++#include "libavutil/internal.h" ++#include "libavutil/intreadwrite.h" ++#include "libavutil/mem_internal.h" ++ ++#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) }, ++ ++typedef struct { ++ const char *name; ++ size_t offset; ++} test; ++ ++#define RANDOMIZE_BUFFER16(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint16_t r = rnd() % 0x201 - 0x100; \ ++ AV_WN16A(name##0 + i, r); \ ++ AV_WN16A(name##1 + i, r); \ ++ } \ ++ } while (0) ++ ++#define RANDOMIZE_BUFFER8(name, size) \ ++ do { \ ++ int i; \ ++ for (i = 0; i < size; ++i) { \ ++ uint8_t r = rnd(); \ ++ name##0[i] = r; \ ++ name##1[i] = r; \ ++ } \ ++ } while (0) ++ ++static void check_add_put_clamped(void) ++{ ++ /* Source buffers are only as big as needed, since any over-read won't affect results */ ++ LOCAL_ALIGNED_16(int16_t, src0, [64]); ++ LOCAL_ALIGNED_16(int16_t, src1, [64]); ++ /* Destination buffers have borders of one row above/below and 8 columns left/right to catch overflows */ ++ LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]); ++ LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]); ++ ++ AVCodecContext avctx = { 0 }; ++ IDCTDSPContext h; ++ ++ const test tests[] = { ++ IDCTDSP_TEST(add_pixels_clamped) ++ IDCTDSP_TEST(put_pixels_clamped) ++ IDCTDSP_TEST(put_signed_pixels_clamped) ++ }; ++ ++ ff_idctdsp_init(&h, &avctx); ++ ++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { ++ void (*func)(const int16_t *, uint8_t * ptrdiff_t) = *(void **)((intptr_t) &h + tests[t].offset); ++ if (check_func(func, "idctdsp.%s", tests[t].name)) { ++ declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *, uint8_t *, ptrdiff_t); ++ RANDOMIZE_BUFFER16(src, 64); ++ RANDOMIZE_BUFFER8(dst, 10 * 24); ++ call_ref(src0, dst0 + 24 + 8, 24); ++ call_new(src1, dst1 + 24 + 8, 24); ++ if (memcmp(dst0, dst1, 10 * 24)) ++ fail(); ++ bench_new(src1, dst1 + 24 + 8, 24); ++ } ++ } ++} ++ ++void checkasm_check_idctdsp(void) ++{ ++ check_add_put_clamped(); ++ report("idctdsp"); ++} +diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c +new file mode 100644 +index 0000000000..52628d15e4 +--- /dev/null ++++ b/tests/checkasm/vc1dsp.c +@@ -0,0 +1,452 @@ ++/* ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
++ */
++
++#include <string.h>
++
++#include "checkasm.h"
++
++#include "libavcodec/vc1dsp.h"
++
++#include "libavutil/common.h"
++#include "libavutil/internal.h"
++#include "libavutil/intreadwrite.h"
++#include "libavutil/mem_internal.h"
++
++#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
++#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },
++
++typedef struct {
++    const char *name;
++    size_t offset;
++    int width;
++    int height;
++} test;
++
++typedef struct matrix {
++    size_t width;
++    size_t height;
++    float d[];
++} matrix;
++
++static const matrix T8 = { 8, 8, {
++    12, 12, 12, 12, 12, 12, 12, 12,
++    16, 15, 9, 4, -4, -9, -15, -16,
++    16, 6, -6, -16, -16, -6, 6, 16,
++    15, -4, -16, -9, 9, 16, 4, -15,
++    12, -12, -12, 12, 12, -12, -12, 12,
++    9, -16, 4, 15, -15, -4, 16, -9,
++    6, -16, 16, -6, -6, 16, -16, 6,
++    4, -9, 15, -16, 16, -15, 9, -4
++} };
++
++static const matrix T4 = { 4, 4, {
++    17, 17, 17, 17,
++    22, 10, -10, -22,
++    17, -17, -17, 17,
++    10, -22, 22, -10
++} };
++
++static const matrix T8t = { 8, 8, {
++    12, 16, 16, 15, 12, 9, 6, 4,
++    12, 15, 6, -4, -12, -16, -16, -9,
++    12, 9, -6, -16, -12, 4, 16, 15,
++    12, 4, -16, -9, 12, 15, -6, -16,
++    12, -4, -16, 9, 12, -15, -6, 16,
++    12, -9, -6, 16, -12, -4, 16, -15,
++    12, -15, 6, 4, -12, 16, -16, 9,
++    12, -16, 16, -15, 12, -9, 6, -4
++} };
++
++static const matrix T4t = { 4, 4, {
++    17, 22, 17, 10,
++    17, 10, -17, -22,
++    17, -10, -17, 22,
++    17, -22, 17, -10
++} };
++
++static matrix *new_matrix(size_t width, size_t height)
++{
++    matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float));
++    if (out == NULL) {
++        fprintf(stderr, "Memory allocation failure\n");
++        exit(EXIT_FAILURE);
++    }
++    out->width = width;
++    out->height = height;
++    return out;
++}
++
++static matrix *multiply(const matrix *a, const matrix *b)
++{
++    matrix *out;
++    if (a->width != b->height) {
++        fprintf(stderr, "Incompatible multiplication\n");
++        exit(EXIT_FAILURE);
++    }
++    out = new_matrix(b->width, a->height);
++    for (int j = 0; j < out->height; ++j)
++        for (int i = 0; i < out->width; ++i) {
++            float sum = 0;
++            for (int k = 0; k < a->width; ++k)
++                sum += a->d[j * a->width + k] * b->d[k * b->width + i];
++            out->d[j * out->width + i] = sum;
++        }
++    return out;
++}
++
++static void normalise(matrix *a)
++{
++    for (int j = 0; j < a->height; ++j)
++        for (int i = 0; i < a->width; ++i) {
++            float *p = a->d + j * a->width + i;
++            *p *= 64;
++            if (a->height == 4)
++                *p /= (const unsigned[]) { 289, 292, 289, 292 } [j];
++            else
++                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j];
++            if (a->width == 4)
++                *p /= (const unsigned[]) { 289, 292, 289, 292 } [i];
++            else
++                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i];
++        }
++}
++
++static void divide_and_round_nearest(matrix *a, float by)
++{
++    for (int j = 0; j < a->height; ++j)
++        for (int i = 0; i < a->width; ++i) {
++            float *p = a->d + j * a->width + i;
++            *p = rintf(*p / by);
++        }
++}
++
++static void tweak(matrix *a)
++{
++    for (int j = 4; j < a->height; ++j)
++        for (int i = 0; i < a->width; ++i) {
++            float *p = a->d + j * a->width + i;
++            *p += 1;
++        }
++}
++
++/* The VC-1 spec places restrictions on the values permitted at three
++ * different stages:
++ * - D: the input coefficients in frequency domain
++ * - E: the intermediate coefficients, inverse-transformed only horizontally
++ * - R: the fully inverse-transformed coefficients
++ *
++ * To fully cater for the ranges specified requires various intermediate
++ * values to be held to 17-bit precision; yet these conditions do not appear
++ * to be utilised in real-world streams. At least some assembly
++ * implementations have chosen to restrict these values to 16-bit precision,
++ * to accelerate the decoding of real-world streams at the cost of strict
++ * adherence to the spec. To avoid our test marking these as failures,
++ * reduce our random inputs.
++ */
++#define ATTENUATION 4
++
++static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height)
++{
++    matrix *raw, *tmp, *D, *E, *R;
++    raw = new_matrix(width, height);
++    for (int i = 0; i < width * height; ++i)
++        raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION;
++    tmp = multiply(height == 8 ? &T8 : &T4, raw);
++    D = multiply(tmp, width == 8 ? &T8t : &T4t);
++    normalise(D);
++    divide_and_round_nearest(D, 1);
++    for (int i = 0; i < width * height; ++i) {
++        if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) {
++            /* Rare, so simply try again */
++            av_free(raw);
++            av_free(tmp);
++            av_free(D);
++            return generate_inverse_quantized_transform_coefficients(width, height);
++        }
++    }
++    E = multiply(D, width == 8 ? &T8 : &T4);
++    divide_and_round_nearest(E, 8);
++    for (int i = 0; i < width * height; ++i)
++        if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) {
++            /* Rare, so simply try again */
++            av_free(raw);
++            av_free(tmp);
++            av_free(D);
++            av_free(E);
++            return generate_inverse_quantized_transform_coefficients(width, height);
++        }
++    R = multiply(height == 8 ? &T8t : &T4t, E);
++    tweak(R);
++    divide_and_round_nearest(R, 128);
++    for (int i = 0; i < width * height; ++i)
++        if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) {
++            /* Rare, so simply try again */
++            av_free(raw);
++            av_free(tmp);
++            av_free(D);
++            av_free(E);
++            av_free(R);
++            return generate_inverse_quantized_transform_coefficients(width, height);
++        }
++    av_free(raw);
++    av_free(tmp);
++    av_free(E);
++    av_free(R);
++    return D;
++}
++
++#define RANDOMIZE_BUFFER16(name, size) \
++    do { \
++        int i; \
++        for (i = 0; i < size; ++i) { \
++            uint16_t r = rnd(); \
++            AV_WN16A(name##0 + i, r); \
++            AV_WN16A(name##1 + i, r); \
++        } \
++    } while (0)
++
++#define RANDOMIZE_BUFFER8(name, size) \
++    do { \
++        int i; \
++        for (i = 0; i < size; ++i) { \
++            uint8_t r = rnd(); \
++            name##0[i] = r; \
++            name##1[i] = r; \
++        } \
++    } while (0)
++
++#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size) \
++    do { \
++        uint8_t *p##0 = name##0, *p##1 = name##1; \
++        int i = (size); \
++        while (i-- > 0) { \
++            int x = 0x80 | (rnd() & 0x7F); \
++            x >>= rnd() % 9; \
++            if (rnd() & 1) \
++                x = -x; \
++            *p##1++ = *p##0++ = 0x80 + x; \
++        } \
++    } while (0)
++
++static void check_inv_trans_inplace(void)
++{
++    /* Inverse transform input coefficients are stored in a 16-bit buffer
++     * with row stride of 8 coefficients irrespective of transform size.
++     * vc1_inv_trans_8x8 differs from the others in two ways: coefficients
++     * are stored in column-major order, and the outputs are written back
++     * to the input buffer, so we oversize it slightly to catch overruns. */
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]);
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]);
++
++    VC1DSPContext h;
++
++    ff_vc1dsp_init(&h);
++
++    if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) {
++        matrix *coeffs;
++        declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *);
++        RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8);
++        coeffs = generate_inverse_quantized_transform_coefficients(8, 8);
++        for (int j = 0; j < 8; ++j)
++            for (int i = 0; i < 8; ++i) {
++                int idx = 8 + i * 8 + j;
++                inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i];
++            }
++        call_ref(inv_trans_in0 + 8);
++        call_new(inv_trans_in1 + 8);
++        if (memcmp(inv_trans_in0, inv_trans_in1, 10 * 8 * sizeof (int16_t)))
++            fail();
++        bench_new(inv_trans_in1 + 8);
++        av_free(coeffs);
++    }
++}
++
++static void check_inv_trans_adding(void)
++{
++    /* Inverse transform input coefficients are stored in a 16-bit buffer
++     * with row stride of 8 coefficients irrespective of transform size. */
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]);
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]);
++
++    /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
++     * added with saturation to an array of unsigned 8-bit values. Oversize
++     * this by 8 samples left and right and one row above and below. */
++    LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]);
++    LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]);
++
++    VC1DSPContext h;
++
++    const test tests[] = {
++        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4)
++    };
++
++    ff_vc1dsp_init(&h);
++
++    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++        void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset);
++        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
++            matrix *coeffs;
++            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *);
++            RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
++            RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
++            coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height);
++            for (int j = 0; j < tests[t].height; ++j)
++                for (int i = 0; i < tests[t].width; ++i) {
++                    int idx = j * 8 + i;
++                    inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i];
++                }
++            call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0);
++            call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
++            if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24))
++                fail();
++            bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1 + 8);
++            av_free(coeffs);
++        }
++    }
++}
++
++static void check_loop_filter(void)
++{
++    /* Deblocking filter buffers are big enough to hold a 16x16 block,
++     * plus 16 columns left and 4 rows above to hold filter inputs
++     * (depending on whether v or h neighbouring block edge, oversized
++     * horizontally to maintain 16-byte alignment) plus 16 columns and
++     * 4 rows below to catch write overflows */
++    LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]);
++    LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]);
++
++    VC1DSPContext h;
++
++    const test tests[] = {
++        VC1DSP_TEST(vc1_v_loop_filter4)
++        VC1DSP_TEST(vc1_h_loop_filter4)
++        VC1DSP_TEST(vc1_v_loop_filter8)
++        VC1DSP_TEST(vc1_h_loop_filter8)
++        VC1DSP_TEST(vc1_v_loop_filter16)
++        VC1DSP_TEST(vc1_h_loop_filter16)
++    };
++
++    ff_vc1dsp_init(&h);
++
++    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++        void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset);
++        declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int);
++        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
++            for (int count = 1000; count > 0; --count) {
++                int pq = rnd() % 31 + 1;
++                RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48);
++                call_ref(filter_buf0 + 4 * 48 + 16, 48, pq);
++                call_new(filter_buf1 + 4 * 48 + 16, 48, pq);
++                if (memcmp(filter_buf0, filter_buf1, 24 * 48))
++                    fail();
++            }
++        }
++        for (int j = 0; j < 24; ++j)
++            for (int i = 0; i < 48; ++i)
++                filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4);
++        if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name))
++            bench_new(filter_buf1 + 4 * 48 + 16, 48, 1);
++        if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name))
++            bench_new(filter_buf1 + 4 * 48 + 16, 48, 31);
++    }
++}
++
++#define TEST_UNESCAPE \
++    do { \
++        for (int count = 100; count > 0; --count) { \
++            escaped_offset = rnd() & 7; \
++            unescaped_offset = rnd() & 7; \
++            escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7); \
++            RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE); \
++            len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
++            len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
++            if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE)) \
++                fail(); \
++        } \
++    } while (0)
++
++static void check_unescape(void)
++{
++    /* This appears to be a typical length of buffer in use */
++#define LOG2_UNESCAPE_BUF_SIZE 17
++#define UNESCAPE_BUF_SIZE (1u<