diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
index 0810fee4dc..0bb3199ffe 100644
--- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
+++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
@@ -47543,7 +47543,7 @@ index 22a9532444..5588e4a460 100644
  /**
   * Enqueues a buffer to a V4L2Context from an AVFrame
 diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
-index e48b3a8ccf..b994e39ad6 100644
+index e48b3a8ccf..092b750dc4 100644
 --- a/libavcodec/v4l2_m2m.c
 +++ b/libavcodec/v4l2_m2m.c
 @@ -328,7 +328,10 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context)
@@ -47577,7 +47577,7 @@ index e48b3a8ccf..b994e39ad6 100644
 +        av_packet_unref(&s->buf_pkt);
 +
 +    if (s->fd >= 0) {
-+            ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
++        ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
 +        if (ret)
 +            av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name);
 +
@@ -47664,7 +47664,7 @@ index 456281f48c..b08a5b38ac 100644
  
  /**
 diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 3e17e0fcac..e774532e36 100644
+index 3e17e0fcac..a02012bf44 100644
 --- a/libavcodec/v4l2_m2m_dec.c
 +++ b/libavcodec/v4l2_m2m_dec.c
 @@ -23,6 +23,10 @@
@@ -47958,10 +47958,7 @@ index 3e17e0fcac..e774532e36 100644
 +            return ret;
 +        }
 +    }
- 
--            /* cant recover */
--            if (ret == AVERROR(ENOMEM))
--                return ret;
++
 +    // Start if we haven't
 +    {
 +        const int ret2 = v4l2_try_start(avctx);
@@ -47971,7 +47968,9 @@ index 3e17e0fcac..e774532e36 100644
 +        }
 +    }
  
--            return 0;
+-            /* cant recover */
+-            if (ret == AVERROR(ENOMEM))
+-                return ret;
 +    return ret;
 +}
 +
@@ -48018,7 +48017,8 @@ index 3e17e0fcac..e774532e36 100644
 +                // Go again if we got a frame that we need to discard
 +            } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame));
 +        }
-+
+ 
+-            return 0;
 +        // Continue trying to enqueue packets if either
 +        // (a) we succeeded last time OR
 +        // (b) enqueue failed due to input Q full AND there is now room
@@ -48052,8 +48052,8 @@ index 3e17e0fcac..e774532e36 100644
 +        src_rv < 0 ? src_rv :
 +        dst_rv < 0 ? dst_rv :
 +            AVERROR(EAGAIN);
-+}
-+
+ }
+ 
 +#if 0
 +#include <time.h>
 +static int64_t us_time(void)
@@ -48061,8 +48061,8 @@ index 3e17e0fcac..e774532e36 100644
 +    struct timespec ts;
 +    clock_gettime(CLOCK_MONOTONIC, &ts);
 +    return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
- }
- 
++}
++
 +static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
 +{
 +    int ret;
@@ -48125,10 +48125,14 @@ index 3e17e0fcac..e774532e36 100644
          return ret;
      }
  
-@@ -223,10 +519,59 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
+@@ -223,10 +519,58 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
  
  static av_cold int v4l2_decode_close(AVCodecContext *avctx)
  {
+-    V4L2m2mPriv *priv = avctx->priv_data;
+-    V4L2m2mContext *s = priv->context;
+-    av_packet_unref(&s->buf_pkt);
+-    return ff_v4l2_m2m_codec_end(priv);
 +    int rv;
 +    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
 +    rv = ff_v4l2_m2m_codec_end(avctx->priv_data);
@@ -48146,19 +48150,17 @@ index 3e17e0fcac..e774532e36 100644
 +    // possibly limited h/w resources and fails on a Pi for this reason unless
 +    // more GPU mem is allocated than is the default.
 +
-     V4L2m2mPriv *priv = avctx->priv_data;
--    V4L2m2mContext *s = priv->context;
--    av_packet_unref(&s->buf_pkt);
--    return ff_v4l2_m2m_codec_end(priv);
-+    V4L2m2mContext* s = priv->context;
-+    V4L2Context* output = &s->output;
-+    V4L2Context* capture = &s->capture;
++    V4L2m2mPriv * const priv = avctx->priv_data;
++    V4L2m2mContext * const s = priv->context;
++    V4L2Context * const output = &s->output;
++    V4L2Context * const capture = &s->capture;
 +    int ret, i;
 +
 +    av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
 +
-+    if (!output->streamon)
-+        goto done;
++    // Reflushing everything is benign, quick and avoids having to worry about
++    // states like EOS processing so don't try to optimize out (having got it
++    // wrong once)
 +
 +    ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
 +    if (ret < 0)
@@ -48182,13 +48184,11 @@ index 3e17e0fcac..e774532e36 100644
 +    capture->done = 0;
 +
 +    // Stream on will occur when we actually submit a new frame
-+
-+done:
 +    av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__);
  }
  
  #define OFFSET(x) offsetof(V4L2m2mPriv, x)
-@@ -235,10 +580,16 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx)
+@@ -235,10 +579,16 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx)
  static const AVOption options[] = {
      V4L_M2M_DEFAULT_OPTS,
      { "num_capture_buffers", "Number of buffers in the capture context",
@@ -48206,7 +48206,7 @@ index 3e17e0fcac..e774532e36 100644
  #define M2MDEC_CLASS(NAME) \
      static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \
          .class_name = #NAME "_v4l2m2m_decoder", \
-@@ -259,9 +610,14 @@ static const AVOption options[] = {
+@@ -259,9 +609,14 @@ static const AVOption options[] = {
          .init           = v4l2_decode_init, \
          .receive_frame  = v4l2_receive_frame, \
          .close          = v4l2_decode_close, \
@@ -54715,10 +54715,10 @@ index 5613813ba8..ab8bcfcf34 100644
 +
 diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
 new file mode 100644
-index 0000000000..641242dd8f
+index 0000000000..1981e7d46f
 --- /dev/null
 +++ b/libavutil/aarch64/rpi_sand_neon.S
-@@ -0,0 +1,239 @@
+@@ -0,0 +1,498 @@
 +/*
 +Copyright (c) 2021 Michael Eiler
 +
@@ -54838,7 +54838,7 @@ index 0000000000..641242dd8f
 +    add w11, w11, #128
 +    // increment the row counter
 +    add w12, w12, #1
-+
++    
 +    // process the next row if we haven't finished yet
 +    cmp w15, w12
 +    bgt row_loop
@@ -54957,13 +54957,272 @@ index 0000000000..641242dd8f
 +    ret
 +endfunc
 +
++//void ff_rpi_sand30_lines_to_planar_y16(
++//  uint8_t * dest,             // [x0]
++//  unsigned int dst_stride,    // [w1] -> not read, rows are written packed (2 * _w bytes each)
++//  const uint8_t * src,        // [x2]
++//  unsigned int src_stride1,   // [w3] -> not read, hard-coded as 128
++//  unsigned int src_stride2,   // [w4]
++//  unsigned int _x,            // [w5] -> not read (register reused for slice_inc)
++//  unsigned int y,             // [w6] -> not read (register reused for h)
++//  unsigned int _w,            // [w7]
++//  unsigned int h);            // [sp, #0]
++
++function ff_rpi_sand30_lines_to_planar_y16, export=1
++    // Allocate a 16-byte aligned frame before saving callee-saved registers:
++    // stores below sp may be overwritten asynchronously (e.g. by a signal handler).
++    stp x19, x20, [sp, #-48]!
++    stp x21, x22, [sp, #16]
++    str x23, [sp, #32]
++
++    // w6 = argument h (first stack argument, now 48 bytes above sp)
++    ldr w6, [sp, #48]
++
++    // slice_inc = ((stride2 - 1) * stride1)
++    mov w5, w4
++    sub w5, w5, #1
++    lsl w5, w5, #7
++
++    // total number of bytes per row = (width / 3) * 4
++    mov w8, w7
++    mov w9, #3
++    udiv w8, w8, w9
++    lsl w8, w8, #2
++
++    // number of full 128 byte blocks to be processed
++    mov w9, #96
++    udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96
++
++    // w10 = number of full integers to process (4 bytes)
++    // w11 = remaining zero to two 10bit values still to copy over
++    mov w12, #96
++    mul w12, w9, w12
++    sub w12, w7, w12  // width - blocks*96 = remaining points per row
++    mov w11, #3
++    udiv w10, w12, w11 // full integers to process = w12 / 3
++    mul w11, w10, w11  // #integers *3
++    sub w11, w12, w11  // remaining 0-2 points = remaining points - integers*3
++
++    // increase w9 by one if w10+w11 is not zero, and decrease the row count by one
++    // this is to efficiently copy incomplete blocks at the end of the rows
++    // the last row is handled explicitly to avoid writing out of bounds
++    add w22, w10, w11
++    cmp w22, #0
++    cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise
++    add w9, w9, w22
++    sub w6, w6, #1
++
++    // store the number of bytes in w20 which we copy too much for every row
++    // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values)
++    mov w20, #3
++    mul w21, w10, w20
++    mov w20, #96
++    sub w20, w20, w21 // w20 = 96 - #integers*3
++    sub w20, w20, w11 // w20 = 96 - #integers*3 - rem. points
++    cmp w20, #96
++    cset w21, eq
++    mov w23, #96
++    mul w23, w23, w21 // 0 or 1 * 96
++    sub w20, w20, w23 // = w20 mod 96
++    lsl w20, w20, #1  // convert to bytes (*2 since we store 16bits per value)
++
++    mov w23, #0 // flag to check whether the last line had already been processed
++
++    // bitmask to clear the upper 6bits of the result values
++    mov x19, #0x03ff03ff03ff03ff
++    dup v22.2d, x19
++
++    // row counter = 0
++    eor w12, w12, w12
++row_loop_y16:
++    cmp w12, w6               // jump to row_loop_y16_fin if we processed all rows
++    bge row_loop_y16_fin
++
++    mov x13, x2               // row src
++    eor w14, w14, w14         // full block counter
++block_loop_y16:
++    cmp w14, w9
++    bge block_loop_y16_fin
++
++    // load 64 bytes
++    ld1 { v0.4s,  v1.4s, v2.4s, v3.4s }, [x13], #64
++
++    // process v0 and v1
++    xtn v16.4h, v0.4s
++    ushr v0.4s, v0.4s, #10
++    xtn v17.4h, v0.4s
++    ushr v0.4s, v0.4s, #10
++    xtn v18.4h, v0.4s
++
++    xtn2 v16.8h, v1.4s
++    and v16.16b, v16.16b, v22.16b
++    ushr v1.4s, v1.4s, #10
++    xtn2 v17.8h, v1.4s
++    and v17.16b, v17.16b, v22.16b
++    ushr v1.4s, v1.4s, #10
++    xtn2 v18.8h, v1.4s
++    and v18.16b, v18.16b, v22.16b
++
++    st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
++
++    // process v2 and v3
++    xtn v23.4h, v2.4s
++    ushr v2.4s, v2.4s, #10
++    xtn v24.4h, v2.4s
++    ushr v2.4s, v2.4s, #10
++    xtn v25.4h, v2.4s
++
++    xtn2 v23.8h, v3.4s
++    and v23.16b, v23.16b, v22.16b
++    ushr v3.4s, v3.4s, #10
++    xtn2 v24.8h, v3.4s
++    and v24.16b, v24.16b, v22.16b
++    ushr v3.4s, v3.4s, #10
++    xtn2 v25.8h, v3.4s
++    and v25.16b, v25.16b, v22.16b
++
++    st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
++
++    // load the second half of the block -> 64 bytes into registers v4-v7
++    ld1 { v4.4s,  v5.4s,  v6.4s,  v7.4s }, [x13], #64
++
++    // process v4 and v5
++    xtn v16.4h, v4.4s
++    ushr v4.4s, v4.4s, #10
++    xtn v17.4h, v4.4s
++    ushr v4.4s, v4.4s, #10
++    xtn v18.4h, v4.4s
++
++    xtn2 v16.8h, v5.4s
++    and v16.16b, v16.16b, v22.16b
++    ushr v5.4s, v5.4s, #10
++    xtn2 v17.8h, v5.4s
++    and v17.16b, v17.16b, v22.16b
++    ushr v5.4s, v5.4s, #10
++    xtn2 v18.8h, v5.4s
++    and v18.16b, v18.16b, v22.16b
++
++    st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
++
++    // v6 and v7
++    xtn v23.4h, v6.4s
++    ushr v6.4s, v6.4s, #10
++    xtn v24.4h, v6.4s
++    ushr v6.4s, v6.4s, #10
++    xtn v25.4h, v6.4s
++
++    xtn2 v23.8h, v7.4s
++    and v23.16b, v23.16b, v22.16b
++    ushr v7.4s, v7.4s, #10
++    xtn2 v24.8h, v7.4s
++    and v24.16b, v24.16b, v22.16b
++    ushr v7.4s, v7.4s, #10
++    xtn2 v25.8h, v7.4s
++    and v25.16b, v25.16b, v22.16b
++
++    st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
++
++    add x13, x13, x5          // row src += slice_inc
++    add w14, w14, #1
++    b block_loop_y16
++block_loop_y16_fin:
++
++
++
++
++    add x2, x2, #128          // src += stride1 (start of the next row)
++    sub x0, x0, x20           // subtract the bytes we copied too much from dst
++    add w12, w12, #1
++    b row_loop_y16
++row_loop_y16_fin:
++
++    // check whether we have incomplete blocks at the end of every row
++    // in that case decrease row block count by one
++    // change height back to its original value (meaning increase it by 1)
++    // and jump back to another iteration of row_loop_y16
++
++    cmp w23, #1
++    beq row_loop_y16_fin2 // don't continue here if we already processed the last row
++    add w6, w6, #1    // increase height to the original value
++    sub w9, w9, w22   // block count - 1 or 0, depending on the remaining bytes count
++    mov w23, #1
++    b row_loop_y16
++row_loop_y16_fin2:
++
++    add x0, x0, x20 // with the last row we didn't actually move the dst ptr too far ahead, therefore re-add the difference
++
++    // handle the last block in the last row; read via x13 (cursor inside that row), x2 already points at the next row
++    eor w12, w12, w12 // w12 = 0 = counter
++integer_loop_y16:
++    cmp w12, w10
++    bge integer_loop_y16_fin
++    ldr w14, [x13], #4
++    and w15, w14, #0x3ff
++    strh w15, [x0], #2
++    lsr w14, w14, #10
++    and w15, w14, #0x3ff
++    strh w15, [x0], #2
++    lsr w14, w14, #10
++    and w15, w14, #0x3ff
++    strh w15, [x0], #2
++    add w12, w12, #1
++    b integer_loop_y16
++integer_loop_y16_fin:
++
++final_values_y16:
++    // remaining point count = w11; only load a source word if there is something left to copy
++    cmp w11, #0
++    beq final_values_y16_fin
++    ldr w14, [x13], #4
++    and w15, w14, #0x3ff
++    strh w15, [x0], #2
++    cmp w11, #1
++    beq final_values_y16_fin
++    lsr w14, w14, #10
++    and w15, w14, #0x3ff
++    strh w15, [x0], #2
++final_values_y16_fin:
++
++    // restore the callee-saved registers and release the 48 byte frame
++    // allocated in the prologue
++    ldr x23, [sp, #32]
++    ldp x21, x22, [sp, #16]
++    ldp x19, x20, [sp], #48
++
++    ret
++endfunc
++
++//void ff_rpi_sand30_lines_to_planar_c16(
++//  uint8_t * dst_u,
++//  unsigned int dst_stride_u,
++//  uint8_t * dst_v,
++//  unsigned int dst_stride_v,
++//  const uint8_t * src,
++//  unsigned int stride1,
++//  unsigned int stride2,
++//  unsigned int _x,
++//  unsigned int y,
++//  unsigned int _w,
++//  unsigned int h);
++
++//void ff_rpi_sand30_lines_to_planar_p010(
++//  uint8_t * dest,
++//  unsigned int dst_stride,
++//  const uint8_t * src,
++//  unsigned int src_stride1,
++//  unsigned int src_stride2,
++//  unsigned int _x,
++//  unsigned int y,
++//  unsigned int _w,
++//  unsigned int h);
++
 +
 diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h
 new file mode 100644
-index 0000000000..2894ce5aa3
+index 0000000000..d820057624
 --- /dev/null
 +++ b/libavutil/aarch64/rpi_sand_neon.h
-@@ -0,0 +1,47 @@
+@@ -0,0 +1,51 @@
 +/*
 +Copyright (c) 2021 Michael Eiler
 +
@@ -55007,6 +55266,10 @@ index 0000000000..2894ce5aa3
 +  unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y,
 +  unsigned int _w, unsigned int h);
 +
++void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride,
++  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
 +#ifdef __cplusplus
 +}
 +#endif
@@ -56495,7 +56758,7 @@ index 0000000000..0324f6826d
 +
 diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
 new file mode 100644
-index 0000000000..038c306877
+index 0000000000..64c34ced56
 --- /dev/null
 +++ b/libavutil/rpi_sand_fns.c
 @@ -0,0 +1,357 @@
@@ -56598,7 +56861,7 @@ index 0000000000..038c306877
 +    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
 +    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
 +
-+#if HAVE_SAND_ASM
++#if HAVE_SAND_ASM || HAVE_SAND_ASM64
 +    if (_x == 0) {
 +        ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
 +        return;