diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch index 659c5d7d13..974d7e0d45 100644 --- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch +++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch @@ -51212,7 +51212,7 @@ index 8dbc7fc104..0bda4dd06b 100644 #endif // AVCODEC_V4L2_BUFFERS_H diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index ff1ea8e57b..fcd5fdf359 100644 +index ff1ea8e57b..65b2648557 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -27,11 +27,13 @@ @@ -51414,6 +51414,12 @@ index ff1ea8e57b..fcd5fdf359 100644 -static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2) +static inline int ctx_buffers_alloced(const V4L2Context * const ctx) ++{ ++ return ctx->bufrefs != NULL; ++} ++ ++// Width/Height changed or we don't have an alloc in the first place? ++static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) { - struct v4l2_format *fmt1 = &ctx->format; - int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? @@ -51422,12 +51428,6 @@ index ff1ea8e57b..fcd5fdf359 100644 - : - fmt1->fmt.pix.width != fmt2->fmt.pix.width || - fmt1->fmt.pix.height != fmt2->fmt.pix.height; -+ return ctx->bufrefs != NULL; -+} -+ -+// Width/Height changed or we don't have an alloc in the first place? -+static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) -+{ + const struct v4l2_format *fmt1 = &ctx->format; + int ret = !ctx_buffers_alloced(ctx) || + (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? @@ -51520,12 +51520,12 @@ index ff1ea8e57b..fcd5fdf359 100644 - s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); - } + get_default_selection(&s->capture, &s->capture.selection); - -- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); ++ + reinit = ctx_resolution_changed(&s->capture, &cap_fmt); + if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0) + reinit = 1; -+ + +- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); + s->capture.format = cap_fmt; if (reinit) { - s->capture.height = v4l2_get_height(&cap_fmt); @@ -51574,16 +51574,16 @@ index ff1ea8e57b..fcd5fdf359 100644 if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n"); + av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); -+ return AVERROR(EINVAL); -+ } + return AVERROR(EINVAL); + } + + if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) || + s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) { + av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n", + s->capture.width, s->capture.height, + ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format)); - return AVERROR(EINVAL); - } ++ return AVERROR(EINVAL); ++ } + + // Update pixel format - should only actually do something on initial change + s->capture.av_pix_fmt = @@ -51928,17 +51928,17 @@ index ff1ea8e57b..fcd5fdf359 100644 } - return NULL; + return AVERROR(EAGAIN); -+ } -+ -+ if ((pfd.revents & POLLERR) != 0) { -+ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); -+ return AVERROR_UNKNOWN; } - if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) { - int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? 
- buf.m.planes[0].bytesused : buf.bytesused; - if (bytesused == 0) { ++ if ((pfd.revents & POLLERR) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); ++ return AVERROR_UNKNOWN; ++ } ++ + if ((pfd.revents & poll_event) != 0) { + ret = get_event(m); + if (ret < 0) { @@ -51951,13 +51951,6 @@ index ff1ea8e57b..fcd5fdf359 100644 - ctx->done = 1; -#endif + continue; -+ } -+ -+ if ((pfd.revents & poll_cap) != 0) { -+ ret = dq_buf(ctx, ppavbuf); -+ if (ret == AVERROR(EPIPE)) -+ continue; -+ return ret; } - avbuf = &ctx->buffers[buf.index]; @@ -51966,12 +51959,19 @@ index ff1ea8e57b..fcd5fdf359 100644 - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - memcpy(avbuf->planes, planes, sizeof(planes)); - avbuf->buf.m.planes = avbuf->planes; ++ if ((pfd.revents & poll_cap) != 0) { ++ ret = dq_buf(ctx, ppavbuf); ++ if (ret == AVERROR(EPIPE)) ++ continue; ++ return ret; + } +- return avbuf; ++ + if ((pfd.revents & poll_out) != 0) { + if (is_cap) + return AVERROR(EAGAIN); + return dq_buf(ctx, ppavbuf); - } -- return avbuf; ++ } + + av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents); + return AVERROR_UNKNOWN; @@ -52062,18 +52062,18 @@ index ff1ea8e57b..fcd5fdf359 100644 + .type = ctx->type, + .count = 0, /* 0 -> unmap all buffers from the driver */ + }; -+ -+ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { -+ if (errno == EINTR) -+ continue; -+ -+ ret = AVERROR(errno); - for (j = 0; j < buffer->num_planes; j++) { - struct V4L2Plane_info *p = &buffer->plane_info[j]; - if (p->mm_addr && p->length) - if (munmap(p->mm_addr, p->length) < 0) - av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); ++ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { ++ if (errno == EINTR) ++ continue; ++ ++ ret = AVERROR(errno); ++ + av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", + ctx->name, av_err2str(AVERROR(errno))); + @@ -52289,7 +52289,7 @@ index ff1ea8e57b..fcd5fdf359 100644 return ret; return ff_v4l2_buffer_enqueue(avbuf); -@@ -635,42 +1032,36 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -635,42 +1032,77 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) { @@ -52307,9 +52307,6 @@ index ff1ea8e57b..fcd5fdf359 100644 - if (!avbuf) { - if (ctx->done) - return AVERROR_EOF; -- -- return AVERROR(EAGAIN); -- } + do { + if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) + return rv; @@ -52317,6 +52314,9 @@ index ff1ea8e57b..fcd5fdf359 100644 + return rv; + } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0); +- return AVERROR(EAGAIN); +- } +- - return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); + return 0; } @@ -52346,14 +52346,53 @@ index ff1ea8e57b..fcd5fdf359 100644 + } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0); - return AVERROR(EAGAIN); -- } -- -- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); + return 0; ++} ++ ++// Return 0 terminated list of drm fourcc video formats for this context ++// NULL if none found or error ++// Returned list is malloced so must be freed ++uint32_t * ff_v4l2_context_enum_drm_formats(V4L2Context *ctx, unsigned int *pN) ++{ ++ unsigned int i; ++ unsigned int n = 0; ++ unsigned int size = 0; ++ uint32_t * e = NULL; ++ *pN = 0; ++ ++ for (i = 0; i < 1024; ++i) { ++ struct v4l2_fmtdesc fdesc = { ++ .index = i, ++ .type = ctx->type ++ }; ++ ++ if (ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc)) 
++ return e; ++ ++ if (n + 1 >= size) { ++ unsigned int newsize = (size == 0) ? 16 : size * 2; ++ uint32_t * t = av_realloc(e, newsize * sizeof(*t)); ++ if (!t) ++ return e; ++ e = t; ++ size = newsize; ++ } ++ ++ e[n] = fdesc.pixelformat; ++ e[++n] = 0; ++ if (pN) ++ *pN = n; + } + +- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); ++ // If we've looped 1024 times we are clearly confused ++ *pN = 0; ++ av_free(e); ++ return NULL; } int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) -@@ -702,78 +1093,179 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) +@@ -702,78 +1134,179 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) int ff_v4l2_context_set_format(V4L2Context* ctx) { @@ -52566,7 +52605,7 @@ index ff1ea8e57b..fcd5fdf359 100644 return ret; } diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 22a9532444..108fc05a6f 100644 +index 22a9532444..0c8c020be1 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -31,6 +31,7 @@ @@ -52637,7 +52676,27 @@ index 22a9532444..108fc05a6f 100644 } V4L2Context; /** -@@ -147,7 +177,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd); +@@ -119,6 +149,19 @@ int ff_v4l2_context_set_format(V4L2Context* ctx); + */ + int ff_v4l2_context_get_format(V4L2Context* ctx, int probe); + ++/** ++ * Get the list of drm fourcc pixel formats for this context ++ * ++ * @param[in] ctx A pointer to a V4L2Context. See V4L2Context ++ * description for required variables. ++ * @param[in] pN A pointer to receive the number of formats ++ * found. May be NULL if not wanted. ++ * @return Pointer to malloced list of zero terminated formats, ++ * NULL if none or error. As list is malloced it must be ++ * freed. ++ */ ++uint32_t * ff_v4l2_context_enum_drm_formats(V4L2Context *ctx, unsigned int *pN); ++ + /** + * Releases a V4L2Context. + * +@@ -147,7 +190,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd); * @param[inout] pkt The AVPacket to dequeue to. * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. */ @@ -52646,7 +52705,7 @@ index 22a9532444..108fc05a6f 100644 /** * Dequeues a buffer from a V4L2Context to an AVFrame. -@@ -156,7 +186,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); +@@ -156,7 +199,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); * @param[in] ctx The V4L2Context to dequeue from. * @param[inout] f The AVFrame to dequeue to. * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) @@ -52657,7 +52716,7 @@ index 22a9532444..108fc05a6f 100644 */ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); -@@ -170,7 +203,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); +@@ -170,7 +216,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); * @param[in] pkt A pointer to an AVPacket. * @return 0 in case of success, a negative error otherwise. 
*/ @@ -52666,7 +52725,7 @@ index 22a9532444..108fc05a6f 100644 /** * Enqueues a buffer to a V4L2Context from an AVFrame -@@ -183,4 +216,28 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); +@@ -183,4 +229,28 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); */ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f); @@ -52696,7 +52755,7 @@ index 22a9532444..108fc05a6f 100644 + #endif // AVCODEC_V4L2_CONTEXT_H diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index cdfd579810..025cf24769 100644 +index cdfd579810..143656e792 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c @@ -35,6 +35,15 @@ @@ -52854,14 +52913,19 @@ index cdfd579810..025cf24769 100644 if (s->fd >= 0) { ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); if (ret) -@@ -356,7 +413,15 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) +@@ -355,8 +412,20 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) + } ff_v4l2_context_release(&s->output); - -+ dmabufs_ctl_unref(&s->db_ctl); -+ close(s->fd); -+ s->fd = -1; ++ av_buffer_unref(&s->device_ref); + ++ dmabufs_ctl_unref(&s->db_ctl); ++ ++ if (s->fd != -1) { ++ close(s->fd); ++ s->fd = -1; ++ } + s->self_ref = NULL; + // This is only called on avctx close so after this point we don't have that + // Crash sooner if we find we are using it (can still log with avctx = NULL) @@ -52870,7 +52934,7 @@ index cdfd579810..025cf24769 100644 av_buffer_unref(&priv->context_ref); return 0; -@@ -400,35 +465,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv) +@@ -400,35 +469,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv) return v4l2_configure_contexts(s); } @@ -53060,7 +53124,7 @@ index b67b216331..a506e69d67 100644 + #endif /* AVCODEC_V4L2_M2M_H */ diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index ab07c0a24a..cec98cc16a 100644 +index ab07c0a24a..e7fd8980e5 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -21,8 +21,14 @@ @@ -53406,7 +53470,7 @@ index ab07c0a24a..cec98cc16a 100644 return 0; } -@@ -133,58 +343,768 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) +@@ -133,46 +343,822 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) return 0; } @@ -53625,27 +53689,27 @@ index ab07c0a24a..cec98cc16a 100644 + av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); return ret; + } - } - -- if (s->draining) -- goto dequeue; ++ } ++ + if (s->draining) { + if (s->buf_pkt.size) { + av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n"); + av_packet_unref(&s->buf_pkt); + } + return NQ_DRAINING; -+ } -+ + } + +- if (s->draining) +- goto dequeue; + if (!s->buf_pkt.size) + return NQ_NONE; -+ -+ if ((ret = check_output_streamon(avctx, s)) != 0) -+ return ret; - ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt); - if (ret < 0 && ret != AVERROR(EAGAIN)) - goto fail; ++ if ((ret = check_output_streamon(avctx, s)) != 0) ++ return ret; ++ + if (s->extdata_sent) + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); + else @@ -53972,10 +54036,9 @@ index ab07c0a24a..cec98cc16a 100644 +}; + +static int -+check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) ++check_size(AVCodecContext * const avctx, V4L2m2mContext * const s, const uint32_t fcc) +{ + unsigned int i; -+ const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format); + const uint32_t w = avctx->coded_width; + const uint32_t h = avctx->coded_height; + @@ -54103,11 +54166,13 @@ index ab07c0a24a..cec98cc16a 100644 +#if 
CONFIG_H264_DECODER + case AV_CODEC_ID_H264: + { -+ H264ParamSets ps = {{NULL}}; ++ H264ParamSets ps; + int is_avc = 0; + int nal_length_size = 0; + int ret; + ++ memset(&ps, 0, sizeof(ps)); ++ + ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size, + &ps, &is_avc, &nal_length_size, + avctx->err_recognition, avctx); @@ -54133,12 +54198,15 @@ index ab07c0a24a..cec98cc16a 100644 +#if CONFIG_HEVC_DECODER + case AV_CODEC_ID_HEVC: + { -+ HEVCParamSets ps = {{NULL}}; -+ HEVCSEI sei = {{{{0}}}}; ++ HEVCParamSets ps; ++ HEVCSEI sei; + int is_nalff = 0; + int nal_length_size = 0; + int ret; + ++ memset(&ps, 0, sizeof(ps)); ++ memset(&sei, 0, sizeof(sei)); ++ + ret = ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size, + &ps, &sei, &is_nalff, &nal_length_size, + avctx->err_recognition, 0, avctx); @@ -54165,14 +54233,92 @@ index ab07c0a24a..cec98cc16a 100644 + default: + break; + } ++} ++ ++static int ++choose_capture_format(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ const V4L2m2mPriv * const priv = avctx->priv_data; ++ unsigned int fmts_n; ++ uint32_t *fmts = ff_v4l2_context_enum_drm_formats(&s->capture, &fmts_n); ++ enum AVPixelFormat *fmts2 = NULL; ++ enum AVPixelFormat t; ++ enum AVPixelFormat gf_pix_fmt; ++ unsigned int i; ++ unsigned int n = 0; ++ unsigned int pref_n = 1; ++ int rv = AVERROR(ENOENT); ++ ++ if (!fmts) ++ return AVERROR(ENOENT); ++ ++ if ((fmts2 = av_malloc(sizeof(*fmts2) * (fmts_n + 2))) == NULL) { ++ rv = AVERROR(ENOMEM); ++ goto error; ++ } ++ ++ // Filter for formats that are supported by ffmpeg and ++ // can accomodate the stream size ++ fmts2[n++] = AV_PIX_FMT_DRM_PRIME; ++ for (i = 0; i != fmts_n; ++i) { ++ const enum AVPixelFormat f = ff_v4l2_format_v4l2_to_avfmt(fmts[i], AV_CODEC_ID_RAWVIDEO); ++ if (f == AV_PIX_FMT_NONE) ++ continue; ++ ++ if (check_size(avctx, s, fmts[i]) != 0) ++ continue; ++ ++ if (f == priv->pix_fmt) ++ pref_n = n; ++ fmts2[n++] = f; ++ } ++ fmts2[n] = AV_PIX_FMT_NONE; ++ ++ if (n < 2) { ++ av_log(avctx, AV_LOG_DEBUG, "%s: No usable formats found\n", __func__); ++ goto error; ++ } ++ ++ // Put preferred s/w format at the end - ff_get_format will put it in sw_pix_fmt ++ t = fmts2[n - 1]; ++ fmts2[n - 1] = fmts2[pref_n]; ++ fmts2[pref_n] = t; ++ ++ gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); ++ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", ++ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), ++ avctx->coded_width, avctx->coded_height, ++ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); ++ ++ if (gf_pix_fmt == AV_PIX_FMT_NONE) ++ goto error; ++ ++ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { ++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; ++ s->capture.av_pix_fmt = avctx->sw_pix_fmt; ++ s->output_drm = 1; ++ } ++ else { ++ avctx->pix_fmt = gf_pix_fmt; ++ s->capture.av_pix_fmt = gf_pix_fmt; ++ s->output_drm = 0; ++ } ++ ++ // Get format converts capture.av_pix_fmt back into a V4L2 format in the context ++ if ((rv = ff_v4l2_context_get_format(&s->capture, 0)) != 0) ++ goto error; ++ rv = ff_v4l2_context_set_format(&s->capture); ++ ++error: ++ av_free(fmts2); ++ av_free(fmts); ++ return rv; +} static av_cold int v4l2_decode_init(AVCodecContext *avctx) { - V4L2Context *capture, *output; - V4L2m2mContext *s; +@@ -181,10 +1167,27 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) V4L2m2mPriv *priv = avctx->priv_data; -+ int gf_pix_fmt; int ret; + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); @@ 
-54199,7 +54345,7 @@ index ab07c0a24a..cec98cc16a 100644 capture = &s->capture; output = &s->output; -@@ -192,14 +1112,65 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -192,14 +1195,45 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) * by the v4l2 driver; this event will trigger a full pipeline reconfig and * the proper values will be retrieved from the kernel driver. */ @@ -54218,28 +54364,8 @@ index ab07c0a24a..cec98cc16a 100644 capture->av_pix_fmt = avctx->pix_fmt; + capture->min_buf_size = 0; + -+ /* the client requests the codec to generate DRM frames: -+ * - data[0] will therefore point to the returned AVDRMFrameDescriptor -+ * check the ff_v4l2_buffer_to_avframe conversion function. -+ * - the DRM frame format is passed in the DRM frame descriptor layer. -+ * check the v4l2_get_drm_frame function. -+ */ -+ -+ avctx->sw_pix_fmt = avctx->pix_fmt; -+ gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); -+ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", -+ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), -+ avctx->coded_width, avctx->coded_height, -+ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); -+ -+ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { -+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; -+ s->output_drm = 1; -+ } -+ else { -+ capture->av_pix_fmt = gf_pix_fmt; -+ s->output_drm = 0; -+ } ++ capture->av_pix_fmt = AV_PIX_FMT_NONE; ++ s->output_drm = 0; + + s->db_ctl = NULL; + if (priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) { @@ -54267,7 +54393,7 @@ index ab07c0a24a..cec98cc16a 100644 s->avctx = avctx; ret = ff_v4l2_m2m_codec_init(priv); -@@ -208,12 +1179,88 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -208,12 +1242,90 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) return ret; } @@ -54281,19 +54407,21 @@ index ab07c0a24a..cec98cc16a 100644 + return ret; + } + -+ if ((ret = v4l2_prepare_decoder(s)) < 0) -+ return ret; -+ + if ((ret = get_quirks(avctx, s)) != 0) + return ret; + -+ if ((ret = check_size(avctx, s)) != 0) -+ return ret; -+ + if ((ret = check_profile(avctx, s)) != 0) { + av_log(avctx, AV_LOG_WARNING, "Profile %d not supported by decode\n", avctx->profile); + return ret; + } ++ ++ // Size check done as part of format filtering ++ if ((ret = choose_capture_format(avctx, s)) != 0) ++ return ret; ++ ++ if ((ret = v4l2_prepare_decoder(s)) < 0) ++ return ret; ++ + return 0; } @@ -54358,7 +54486,7 @@ index ab07c0a24a..cec98cc16a 100644 } #define OFFSET(x) offsetof(V4L2m2mPriv, x) -@@ -222,10 +1269,17 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) +@@ -222,10 +1334,17 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) static const AVOption options[] = { V4L_M2M_DEFAULT_OPTS, { "num_capture_buffers", "Number of buffers in the capture context", @@ -54377,7 +54505,7 @@ index ab07c0a24a..cec98cc16a 100644 #define M2MDEC_CLASS(NAME) \ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ .class_name = #NAME "_v4l2m2m_decoder", \ -@@ -246,9 +1300,15 @@ static const AVOption options[] = { +@@ -246,9 +1365,15 @@ static const AVOption options[] = { .init = v4l2_decode_init, \ .receive_frame = v4l2_receive_frame, \ .close = v4l2_decode_close, \ @@ -54972,10 +55100,10 @@ index 0000000000..af7bbe1de4 + diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c new file mode 100644 -index 0000000000..cfa94d55c4 +index 0000000000..ee8527ba1f --- /dev/null +++ 
b/libavcodec/v4l2_req_devscan.c -@@ -0,0 +1,449 @@ +@@ -0,0 +1,451 @@ +#include +#include +#include @@ -55415,12 +55543,14 @@ index 0000000000..cfa94d55c4 + } + + udev_enumerate_unref(enumerate); ++ udev_unref(udev); + + *pscan = scan; + return 0; + +fail: -+ udev_unref(udev); ++ if (udev) ++ udev_unref(udev); + devscan_delete(&scan); + return ret; +} @@ -60261,10 +60391,10 @@ index 75db62b1b4..e192b431be 100644 void ff_vc1dsp_init(VC1DSPContext* c); diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c new file mode 100644 -index 0000000000..f234a985b9 +index 0000000000..5a79e89ed7 --- /dev/null +++ b/libavcodec/weak_link.c -@@ -0,0 +1,102 @@ +@@ -0,0 +1,103 @@ +#include +#include +#include @@ -60286,6 +60416,7 @@ index 0000000000..f234a985b9 + struct ff_weak_link_master * w = malloc(sizeof(*w)); + if (!w) + return NULL; ++ atomic_init(&w->ref_count, 0); + w->ptr = p; + if (pthread_rwlock_init(&w->lock, NULL)) { + free(w); @@ -62464,6 +62595,941 @@ index b2c254ea67..144fbda652 100644 OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \ opencl/unsharp.o +diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile +index b58daa3a3f..b68209bc94 100644 +--- a/libavfilter/aarch64/Makefile ++++ b/libavfilter/aarch64/Makefile +@@ -1,3 +1,5 @@ ++OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_init_aarch64.o + OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o + ++NEON-OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_neon.o + NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o +diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +new file mode 100644 +index 0000000000..f52bc4b9b4 +--- /dev/null ++++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +@@ -0,0 +1,125 @@ ++/* ++ * bwdif aarch64 NEON optimisations ++ * ++ * Copyright (c) 2023 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/common.h" ++#include "libavfilter/bwdif.h" ++#include "libavutil/aarch64/cpu.h" ++ ++void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat); ++ ++void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max); ++ ++void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max); ++ ++void ff_bwdif_filter_line3_neon(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max); ++ ++ ++static void filter_line3_helper(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max) ++{ ++ // Asm works on 16 byte chunks ++ // If w is a multiple of 16 then all is good - if not then if width rounded ++ // up to nearest 16 will fit in both src & dst strides then allow the asm ++ // to write over the padding bytes as that is almost certainly faster than ++ // having to invoke the C version to clean up the tail. ++ const int w1 = FFALIGN(w, 16); ++ const int w0 = clip_max != 255 ? 0 : ++ d_stride <= w1 && s_stride <= w1 ? w : w & ~15; ++ ++ ff_bwdif_filter_line3_neon(dst1, d_stride, ++ prev1, cur1, next1, s_stride, ++ w0, parity, clip_max); ++ ++ if (w0 < w) ++ ff_bwdif_filter_line3_c((char *)dst1 + w0, d_stride, ++ (const char *)prev1 + w0, (const char *)cur1 + w0, (const char *)next1 + w0, s_stride, ++ w - w0, parity, clip_max); ++} ++ ++static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max) ++{ ++ const int w0 = clip_max != 255 ? 0 : w & ~15; ++ ++ ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1, ++ w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); ++ ++ if (w0 < w) ++ ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, ++ w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); ++} ++ ++static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat) ++{ ++ const int w0 = clip_max != 255 ? 0 : w & ~15; ++ ++ ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2, ++ parity, clip_max, spat); ++ ++ if (w0 < w) ++ ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, ++ w - w0, prefs, mrefs, prefs2, mrefs2, ++ parity, clip_max, spat); ++} ++ ++static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max) ++{ ++ const int w0 = clip_max != 255 ? 
0 : w & ~15; ++ ++ ff_bwdif_filter_intra_neon(dst1, cur1, w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max); ++ ++ if (w0 < w) ++ ff_bwdif_filter_intra_c((char *)dst1 + w0, (char *)cur1 + w0, ++ w - w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max); ++} ++ ++void ++ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) ++{ ++ const int cpu_flags = av_get_cpu_flags(); ++ ++ if (bit_depth != 8) ++ return; ++ ++ if (!have_neon(cpu_flags)) ++ return; ++ ++ s->filter_intra = filter_intra_helper; ++ s->filter_line = filter_line_helper; ++ s->filter_edge = filter_edge_helper; ++ s->filter_line3 = filter_line3_helper; ++} ++ +diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S +new file mode 100644 +index 0000000000..ae9aab20cd +--- /dev/null ++++ b/libavfilter/aarch64/vf_bwdif_neon.S +@@ -0,0 +1,788 @@ ++/* ++ * bwdif aarch64 NEON optimisations ++ * ++ * Copyright (c) 2023 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++#include "libavutil/aarch64/asm.S" ++ ++// Space taken on the stack by an int (32-bit) ++#ifdef __APPLE__ ++.set SP_INT, 4 ++#else ++.set SP_INT, 8 ++#endif ++ ++.macro SQSHRUNN b, s0, s1, s2, s3, n ++ sqshrun \s0\().4h, \s0\().4s, #\n - 8 ++ sqshrun2 \s0\().8h, \s1\().4s, #\n - 8 ++ sqshrun \s1\().4h, \s2\().4s, #\n - 8 ++ sqshrun2 \s1\().8h, \s3\().4s, #\n - 8 ++ uzp2 \b\().16b, \s0\().16b, \s1\().16b ++.endm ++ ++.macro SMULL4K a0, a1, a2, a3, s0, s1, k ++ smull \a0\().4s, \s0\().4h, \k ++ smull2 \a1\().4s, \s0\().8h, \k ++ smull \a2\().4s, \s1\().4h, \k ++ smull2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++.macro UMULL4K a0, a1, a2, a3, s0, s1, k ++ umull \a0\().4s, \s0\().4h, \k ++ umull2 \a1\().4s, \s0\().8h, \k ++ umull \a2\().4s, \s1\().4h, \k ++ umull2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++.macro UMLAL4K a0, a1, a2, a3, s0, s1, k ++ umlal \a0\().4s, \s0\().4h, \k ++ umlal2 \a1\().4s, \s0\().8h, \k ++ umlal \a2\().4s, \s1\().4h, \k ++ umlal2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++.macro UMLSL4K a0, a1, a2, a3, s0, s1, k ++ umlsl \a0\().4s, \s0\().4h, \k ++ umlsl2 \a1\().4s, \s0\().8h, \k ++ umlsl \a2\().4s, \s1\().4h, \k ++ umlsl2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++// int b = m2s1 - m1; ++// int f = p2s1 - p1; ++// int dc = c0s1 - m1; ++// int de = c0s1 - p1; ++// int sp_max = FFMIN(p1 - c0s1, m1 - c0s1); ++// sp_max = FFMIN(sp_max, FFMAX(-b,-f)); ++// int sp_min = FFMIN(c0s1 - p1, c0s1 - m1); ++// sp_min = FFMIN(sp_min, FFMAX(b,f)); ++// diff = diff == 0 ? 
0 : FFMAX3(diff, sp_min, sp_max); ++.macro SPAT_CHECK diff, m2s1, m1, c0s1, p1, p2s1, t0, t1, t2, t3 ++ uqsub \t0\().16b, \p1\().16b, \c0s1\().16b ++ uqsub \t2\().16b, \m1\().16b, \c0s1\().16b ++ umin \t2\().16b, \t0\().16b, \t2\().16b ++ ++ uqsub \t1\().16b, \m1\().16b, \m2s1\().16b ++ uqsub \t3\().16b, \p1\().16b, \p2s1\().16b ++ umax \t3\().16b, \t3\().16b, \t1\().16b ++ umin \t3\().16b, \t3\().16b, \t2\().16b ++ ++ uqsub \t0\().16b, \c0s1\().16b, \p1\().16b ++ uqsub \t2\().16b, \c0s1\().16b, \m1\().16b ++ umin \t2\().16b, \t0\().16b, \t2\().16b ++ ++ uqsub \t1\().16b, \m2s1\().16b, \m1\().16b ++ uqsub \t0\().16b, \p2s1\().16b, \p1\().16b ++ umax \t0\().16b, \t0\().16b, \t1\().16b ++ umin \t2\().16b, \t2\().16b, \t0\().16b ++ ++ cmeq \t1\().16b, \diff\().16b, #0 ++ umax \diff\().16b, \diff\().16b, \t3\().16b ++ umax \diff\().16b, \diff\().16b, \t2\().16b ++ bic \diff\().16b, \diff\().16b, \t1\().16b ++.endm ++ ++// i0 = s0; ++// if (i0 > d0 + diff0) ++// i0 = d0 + diff0; ++// else if (i0 < d0 - diff0) ++// i0 = d0 - diff0; ++// ++// i0 = s0 is safe ++.macro DIFF_CLIP i0, s0, d0, diff, t0, t1 ++ uqadd \t0\().16b, \d0\().16b, \diff\().16b ++ uqsub \t1\().16b, \d0\().16b, \diff\().16b ++ umin \i0\().16b, \s0\().16b, \t0\().16b ++ umax \i0\().16b, \i0\().16b, \t1\().16b ++.endm ++ ++// i0 = FFABS(m1 - p1) > td0 ? i1 : i2; ++// DIFF_CLIP ++// ++// i0 = i1 is safe ++.macro INTERPOL i0, i1, i2, m1, d0, p1, td0, diff, t0, t1, t2 ++ uabd \t0\().16b, \m1\().16b, \p1\().16b ++ cmhi \t0\().16b, \t0\().16b, \td0\().16b ++ bsl \t0\().16b, \i1\().16b, \i2\().16b ++ DIFF_CLIP \i0, \t0, \d0, \diff, \t1, \t2 ++.endm ++ ++.macro PUSH_VREGS ++ stp d8, d9, [sp, #-64]! ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++.endm ++ ++.macro POP_VREGS ++ ldp d14, d15, [sp, #48] ++ ldp d12, d13, [sp, #32] ++ ldp d10, d11, [sp, #16] ++ ldp d8, d9, [sp], #64 ++.endm ++ ++.macro LDR_COEFFS d, t0 ++ movrel \t0, coeffs, 0 ++ ld1 {\d\().8h}, [\t0] ++.endm ++ ++// static const uint16_t coef_lf[2] = { 4309, 213 }; ++// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 }; ++// static const uint16_t coef_sp[2] = { 5077, 981 }; ++ ++const coeffs, align=4 // align 4 means align on 2^4 boundry ++ .hword 4309 * 4, 213 * 4 // lf[0]*4 = v0.h[0] ++ .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] ++ .hword 5077, 981 // sp[0] = v0.h[6] ++endconst ++ ++// =========================================================================== ++// ++// void ff_bwdif_filter_line3_neon( ++// void * dst1, // x0 ++// int d_stride, // w1 ++// const void * prev1, // x2 ++// const void * cur1, // x3 ++// const void * next1, // x4 ++// int s_stride, // w5 ++// int w, // w6 ++// int parity, // w7 ++// int clip_max); // [sp, #0] (Ignored) ++ ++function ff_bwdif_filter_line3_neon, export=1 ++ // Sanity check w ++ cmp w6, #0 ++ ble 99f ++ ++ LDR_COEFFS v0, x17 ++ ++// #define prev2 cur ++// const uint8_t * restrict next2 = parity ? 
prev : next; ++ cmp w7, #0 ++ csel x17, x2, x4, ne ++ ++ // We want all the V registers - save all the ones we must ++ PUSH_VREGS ++ ++ // Some rearrangement of initial values for nice layout of refs in regs ++ mov w10, w6 // w10 = loop count ++ neg w9, w5 // w9 = mref ++ lsl w8, w9, #1 // w8 = mref2 ++ add w7, w9, w9, LSL #1 // w7 = mref3 ++ lsl w6, w9, #2 // w6 = mref4 ++ mov w11, w5 // w11 = pref ++ lsl w12, w5, #1 // w12 = pref2 ++ add w13, w5, w5, LSL #1 // w13 = pref3 ++ lsl w14, w5, #2 // w14 = pref4 ++ add w15, w5, w5, LSL #2 // w15 = pref5 ++ add w16, w14, w12 // w16 = pref6 ++ ++ lsl w5, w1, #1 // w5 = d_stride * 2 ++ ++// for (x = 0; x < w; x++) { ++// int diff0, diff2; ++// int d0, d2; ++// int temporal_diff0, temporal_diff2; ++// ++// int i1, i2; ++// int j1, j2; ++// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4; ++ ++10: ++// c0 = prev2[0] + next2[0]; // c0 = v20, v21 ++// d0 = c0 >> 1; // d0 = v10 ++// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11 ++ ldr q31, [x3] ++ ldr q21, [x17] ++ uhadd v10.16b, v31.16b, v21.16b ++ uabd v11.16b, v31.16b, v21.16b ++ uaddl v20.8h, v21.8b, v31.8b ++ uaddl2 v21.8h, v21.16b, v31.16b ++ ++ ldr q31, [x3, w6, sxtw] ++ ldr q23, [x17, w6, sxtw] ++ ++// i1 = coef_hf[0] * c0; // i1 = v2-v5 ++ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2] ++ ++ ldr q30, [x3, w14, sxtw] ++ ldr q25, [x17, w14, sxtw] ++ ++// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23 ++ uaddl v22.8h, v23.8b, v31.8b ++ uaddl2 v23.8h, v23.16b, v31.16b ++ ++// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12 ++ uhadd v12.16b, v25.16b, v30.16b ++ uaddl v24.8h, v25.8b, v30.8b ++ uaddl2 v25.8h, v25.16b, v30.16b ++ ++// j1 = -coef_hf[1] * (c0 + p4); // j1 = v6-v9 (-c0:v20,v21) ++ add v20.8h, v20.8h, v24.8h ++ add v21.8h, v21.8h, v25.8h ++ SMULL4K v6, v7, v8, v9, v20, v21, v0.h[5] ++ ++// m3 = cur[mrefs3]; // m3 = v20 ++ ldr q20, [x3, w7, sxtw] ++ ++// p3 = cur[prefs3]; // p3 = v21 ++ ldr q21, [x3, w13, sxtw] ++ ++// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25) ++ add v22.8h, v22.8h, v24.8h ++ add v23.8h, v23.8h, v25.8h ++ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4] ++ ++ ldr q29, [x3, w8, sxtw] ++ ldr q23, [x17, w8, sxtw] ++ ++// i1 -= coef_lf[1] * 4 * (m3 + p3); // - ++ uaddl v30.8h, v20.8b, v21.8b ++ uaddl2 v31.8h, v20.16b, v21.16b ++ ++ ldr q28, [x3, w16, sxtw] ++ ldr q25, [x17, w16, sxtw] ++ ++ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1] ++ ++// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13 ++ uhadd v13.16b, v23.16b, v29.16b ++ uaddl v22.8h, v23.8b, v29.8b ++ uaddl2 v23.8h, v23.16b, v29.16b ++ ++ ldr q31, [x3, w12, sxtw] ++ ldr q27, [x17, w12, sxtw] ++ ++// p6 = prev2[prefs6] + next2[prefs6]; // p6 = v24,v25 ++ uaddl v24.8h, v25.8b, v28.8b ++ uaddl2 v25.8h, v25.16b, v28.16b ++ ++// j1 += coef_hf[2] * (m2 + p6); // (-p6:v24,v25) ++ add v24.8h, v24.8h, v22.8h ++ add v25.8h, v25.8h, v23.8h ++ UMLAL4K v6, v7, v8, v9, v24, v25, v0.h[4] ++ ++// m1 = cur[mrefs]; // m1 = v24 ++ ldr q24, [x3, w9, sxtw] ++ ++// p5 = cur[prefs5]; // p5 = v25 ++ ldr q25, [x3, w15, sxtw] ++ ++// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27 ++// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14 ++// d2 = p2 >> 1; // d2 = v15 ++ uabd v14.16b, v31.16b, v27.16b ++ uhadd v15.16b, v31.16b, v27.16b ++ uaddl v26.8h, v27.8b, v31.8b ++ uaddl2 v27.8h, v27.16b, v31.16b ++ ++// j1 += coef_hf[0] * p2; // - ++ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[2] ++ ++// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*) ++ add 
v22.8h, v22.8h, v26.8h ++ add v23.8h, v23.8h, v27.8h ++ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3] ++ ++// p1 = cur[prefs]; // p1 = v22 ++ ldr q22, [x3, w11, sxtw] ++ ++// j1 -= coef_lf[1] * 4 * (m1 + p5); // - ++ uaddl v26.8h, v24.8b, v25.8b ++ uaddl2 v27.8h, v24.16b, v25.16b ++ UMLSL4K v6, v7, v8, v9, v26, v27, v0.h[1] ++ ++// j2 = (coef_sp[0] * (p1 + p3) - coef_sp[1] * (m1 + p5)) >> 13; // (-p5:v25*) j2=v16 ++ uaddl v18.8h, v22.8b, v21.8b ++ uaddl2 v19.8h, v22.16b, v21.16b ++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] ++ ++ uaddl v18.8h, v24.8b, v25.8b ++ uaddl2 v19.8h, v24.16b, v25.16b ++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] ++ ++ SQSHRUNN v16, v28, v29, v30, v31, 13 ++ ++// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17 ++ uaddl v18.8h, v22.8b, v24.8b ++ uaddl2 v19.8h, v22.16b, v24.16b ++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] ++ ++ uaddl v18.8h, v20.8b, v21.8b ++ uaddl2 v19.8h, v20.16b, v21.16b ++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] ++ ++ SQSHRUNN v17, v28, v29, v30, v31, 13 ++ ++// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24 ++ uaddl v26.8h, v24.8b, v22.8b ++ uaddl2 v27.8h, v24.16b, v22.16b ++ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0] ++ ++ ldr q31, [x2, w9, sxtw] ++ ldr q29, [x4, w9, sxtw] ++ ++// j1 += coef_lf[0] * 4 * (p1 + p3); // p1 = v22, p3 = v21 ++ uaddl v26.8h, v21.8b, v22.8b ++ uaddl2 v27.8h, v21.16b, v22.16b ++ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[0] ++ ++ ldr q30, [x2, w11, sxtw] ++ ldr q28, [x4, w11, sxtw] ++ ++// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5* ++ SQSHRUNN v2, v2, v3, v4, v5, 15 ++ ++// j1 >>= 15; // j1 = v3, -v6*, -v7*, -v8*, -v9* ++ SQSHRUNN v3, v6, v7, v8, v9, 15 ++ ++// { ++// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; ++// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; ++ uabd v30.16b, v22.16b, v30.16b ++ uabd v31.16b, v24.16b, v31.16b ++ uabd v28.16b, v22.16b, v28.16b ++ uabd v29.16b, v24.16b, v29.16b ++ uhadd v31.16b, v31.16b, v30.16b ++ uhadd v29.16b, v29.16b, v28.16b ++ ++ ldr q27, [x2, w13, sxtw] ++ ldr q26, [x4, w13, sxtw] ++ ++// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18 ++ ushr v18.16b, v11.16b, #1 ++ umax v18.16b, v18.16b, v31.16b ++ umax v18.16b, v18.16b, v29.16b ++// } // v28, v30 preserved for next block ++// { // tdiff2 = v14 ++// int t1 =(FFABS(prev[prefs] - p1) + FFABS(prev[prefs3] - p3)) >> 1; ++// int t2 =(FFABS(next[prefs] - p1) + FFABS(next[prefs3] - p3)) >> 1; ++ uabd v31.16b, v21.16b, v27.16b ++ uabd v29.16b, v21.16b, v26.16b ++ uhadd v31.16b, v31.16b, v30.16b ++ uhadd v29.16b, v29.16b, v28.16b ++ ++// diff2 = FFMAX3(temporal_diff2 >> 1, t1, t2); // diff2=v19 ++ ushr v19.16b, v14.16b, #1 ++ umax v19.16b, v19.16b, v31.16b ++ umax v19.16b, v19.16b, v29.16b ++// } ++ ++ // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15 ++ SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28 ++ ++ // diff2 = v19, d0 = v10, p1 = v22, d2 = v15, p3 = v21, (p4 >> 1) = v12 ++ SPAT_CHECK v19, v10, v22, v15, v21, v12, v31, v30, v29, v28 ++ ++ // j1 = v3, j2 = v16, p1 = v22, d2 = v15, p3 = v21, td2 = v14, diff2 = v19 ++ INTERPOL v3, v3, v16, v22, v15, v21, v14, v19, v31, v30, v29 ++ ++// dst[d_stride * 2] = av_clip_uint8(interpol); ++ str q3, [x0, w5, sxtw] ++ ++// dst[d_stride] = p1; ++ str q22, [x0, w1, sxtw] ++ ++ // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18 ++ INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29 ++ ++// dst[0] = av_clip_uint8(interpol); ++ str 
q2, [x0], #16 ++// } ++// ++// dst++; ++// cur++; ++// prev++; ++// prev2++; ++// next++; ++// } ++ subs w10, w10, #16 ++ add x2, x2, #16 ++ add x3, x3, #16 ++ add x4, x4, #16 ++ add x17, x17, #16 ++ bgt 10b ++ ++ POP_VREGS ++99: ++ ret ++endfunc ++ ++// =========================================================================== ++// ++// void filter_line( ++// void *dst1, // x0 ++// void *prev1, // x1 ++// void *cur1, // x2 ++// void *next1, // x3 ++// int w, // w4 ++// int prefs, // w5 ++// int mrefs, // w6 ++// int prefs2, // w7 ++// int mrefs2, // [sp, #0] ++// int prefs3, // [sp, #SP_INT] ++// int mrefs3, // [sp, #SP_INT*2] ++// int prefs4, // [sp, #SP_INT*3] ++// int mrefs4, // [sp, #SP_INT*4] ++// int parity, // [sp, #SP_INT*5] ++// int clip_max) // [sp, #SP_INT*6] ++ ++function ff_bwdif_filter_line_neon, export=1 ++ // Sanity check w ++ cmp w4, #0 ++ ble 99f ++ ++ // Rearrange regs to be the same as line3 for ease of debug! ++ mov w10, w4 // w10 = loop count ++ mov w9, w6 // w9 = mref ++ mov w12, w7 // w12 = pref2 ++ mov w11, w5 // w11 = pref ++ ldr w8, [sp, #0] // w8 = mref2 ++ ldr w7, [sp, #SP_INT*2] // w7 = mref3 ++ ldr w6, [sp, #SP_INT*4] // w6 = mref4 ++ ldr w13, [sp, #SP_INT] // w13 = pref3 ++ ldr w14, [sp, #SP_INT*3] // w14 = pref4 ++ ++ mov x4, x3 ++ mov x3, x2 ++ mov x2, x1 ++ ++ LDR_COEFFS v0, x17 ++ ++// #define prev2 cur ++// const uint8_t * restrict next2 = parity ? prev : next; ++ ldr w17, [sp, #SP_INT*5] // parity ++ cmp w17, #0 ++ csel x17, x2, x4, ne ++ ++ PUSH_VREGS ++ ++// for (x = 0; x < w; x++) { ++// int diff0, diff2; ++// int d0, d2; ++// int temporal_diff0, temporal_diff2; ++// ++// int i1, i2; ++// int j1, j2; ++// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4; ++ ++10: ++// c0 = prev2[0] + next2[0]; // c0 = v20, v21 ++// d0 = c0 >> 1; // d0 = v10 ++// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11 ++ ldr q31, [x3] ++ ldr q21, [x17] ++ uhadd v10.16b, v31.16b, v21.16b ++ uabd v11.16b, v31.16b, v21.16b ++ uaddl v20.8h, v21.8b, v31.8b ++ uaddl2 v21.8h, v21.16b, v31.16b ++ ++ ldr q31, [x3, w6, sxtw] ++ ldr q23, [x17, w6, sxtw] ++ ++// i1 = coef_hf[0] * c0; // i1 = v2-v5 ++ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2] ++ ++ ldr q30, [x3, w14, sxtw] ++ ldr q25, [x17, w14, sxtw] ++ ++// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23 ++ uaddl v22.8h, v23.8b, v31.8b ++ uaddl2 v23.8h, v23.16b, v31.16b ++ ++// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12 ++ uhadd v12.16b, v25.16b, v30.16b ++ uaddl v24.8h, v25.8b, v30.8b ++ uaddl2 v25.8h, v25.16b, v30.16b ++ ++// m3 = cur[mrefs3]; // m3 = v20 ++ ldr q20, [x3, w7, sxtw] ++ ++// p3 = cur[prefs3]; // p3 = v21 ++ ldr q21, [x3, w13, sxtw] ++ ++// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25) ++ add v22.8h, v22.8h, v24.8h ++ add v23.8h, v23.8h, v25.8h ++ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4] ++ ++ ldr q29, [x3, w8, sxtw] ++ ldr q23, [x17, w8, sxtw] ++ ++// i1 -= coef_lf[1] * 4 * (m3 + p3); // - ++ uaddl v30.8h, v20.8b, v21.8b ++ uaddl2 v31.8h, v20.16b, v21.16b ++ ++ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1] ++ ++ ldr q31, [x3, w12, sxtw] ++ ldr q27, [x17, w12, sxtw] ++ ++// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13 ++ uhadd v13.16b, v23.16b, v29.16b ++ uaddl v22.8h, v23.8b, v29.8b ++ uaddl2 v23.8h, v23.16b, v29.16b ++ ++// m1 = cur[mrefs]; // m1 = v24 ++ ldr q24, [x3, w9, sxtw] ++ ++// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27 ++// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14 ++// d2 = p2 >> 1; // d2 = v15 ++ 
uabd v14.16b, v31.16b, v27.16b ++ uhadd v15.16b, v31.16b, v27.16b ++ uaddl v26.8h, v27.8b, v31.8b ++ uaddl2 v27.8h, v27.16b, v31.16b ++ ++// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*) ++ add v22.8h, v22.8h, v26.8h ++ add v23.8h, v23.8h, v27.8h ++ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3] ++ ++// p1 = cur[prefs]; // p1 = v22 ++ ldr q22, [x3, w11, sxtw] ++ ++// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17 ++ uaddl v18.8h, v22.8b, v24.8b ++ uaddl2 v19.8h, v22.16b, v24.16b ++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] ++ ++ uaddl v18.8h, v20.8b, v21.8b ++ uaddl2 v19.8h, v20.16b, v21.16b ++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] ++ ++ SQSHRUNN v17, v28, v29, v30, v31, 13 ++ ++// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24 ++ uaddl v26.8h, v24.8b, v22.8b ++ uaddl2 v27.8h, v24.16b, v22.16b ++ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0] ++ ++ ldr q31, [x2, w9, sxtw] ++ ldr q29, [x4, w9, sxtw] ++ ++ ldr q30, [x2, w11, sxtw] ++ ldr q28, [x4, w11, sxtw] ++ ++// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5* ++ SQSHRUNN v2, v2, v3, v4, v5, 15 ++ ++// { ++// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; ++// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; ++ uabd v30.16b, v22.16b, v30.16b ++ uabd v31.16b, v24.16b, v31.16b ++ uabd v28.16b, v22.16b, v28.16b ++ uabd v29.16b, v24.16b, v29.16b ++ uhadd v31.16b, v31.16b, v30.16b ++ uhadd v29.16b, v29.16b, v28.16b ++ ++// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18 ++ ushr v18.16b, v11.16b, #1 ++ umax v18.16b, v18.16b, v31.16b ++ umax v18.16b, v18.16b, v29.16b ++ ++ // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15 ++ SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28 ++ ++ // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18 ++ INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29 ++ ++// dst[0] = av_clip_uint8(interpol); ++ str q2, [x0], #16 ++// } ++// ++// dst++; ++// cur++; ++// prev++; ++// prev2++; ++// next++; ++// } ++ ++ subs w10, w10, #16 ++ add x2, x2, #16 ++ add x3, x3, #16 ++ add x4, x4, #16 ++ add x17, x17, #16 ++ bgt 10b ++ ++ POP_VREGS ++99: ++ ret ++endfunc ++ ++// ============================================================================ ++// ++// void ff_bwdif_filter_edge_neon( ++// void *dst1, // x0 ++// void *prev1, // x1 ++// void *cur1, // x2 ++// void *next1, // x3 ++// int w, // w4 ++// int prefs, // w5 ++// int mrefs, // w6 ++// int prefs2, // w7 ++// int mrefs2, // [sp, #0] ++// int parity, // [sp, #SP_INT] ++// int clip_max, // [sp, #SP_INT*2] unused ++// int spat); // [sp, #SP_INT*3] ++ ++function ff_bwdif_filter_edge_neon, export=1 ++ // Sanity check w ++ cmp w4, #0 ++ ble 99f ++ ++// #define prev2 cur ++// const uint8_t * restrict next2 = parity ? 
prev : next; ++ ++ ldr w8, [sp, #0] // mrefs2 ++ ++ ldr w17, [sp, #SP_INT] // parity ++ ldr w16, [sp, #SP_INT*3] // spat ++ cmp w17, #0 ++ csel x17, x1, x3, ne ++ ++// for (x = 0; x < w; x++) { ++ ++10: ++// int m1 = cur[mrefs]; ++// int d = (prev2[0] + next2[0]) >> 1; ++// int p1 = cur[prefs]; ++// int temporal_diff0 = FFABS(prev2[0] - next2[0]); ++// int temporal_diff1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; ++// int temporal_diff2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; ++// int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); ++ ldr q31, [x2] ++ ldr q21, [x17] ++ uhadd v16.16b, v31.16b, v21.16b // d0 = v16 ++ uabd v17.16b, v31.16b, v21.16b // td0 = v17 ++ ldr q24, [x2, w6, sxtw] // m1 = v24 ++ ldr q22, [x2, w5, sxtw] // p1 = v22 ++ ++ ldr q0, [x1, w6, sxtw] // prev[mrefs] ++ ldr q2, [x1, w5, sxtw] // prev[prefs] ++ ldr q1, [x3, w6, sxtw] // next[mrefs] ++ ldr q3, [x3, w5, sxtw] // next[prefs] ++ ++ ushr v29.16b, v17.16b, #1 ++ ++ uabd v31.16b, v0.16b, v24.16b ++ uabd v30.16b, v2.16b, v22.16b ++ uhadd v0.16b, v31.16b, v30.16b // td1 = q0 ++ ++ uabd v31.16b, v1.16b, v24.16b ++ uabd v30.16b, v3.16b, v22.16b ++ uhadd v1.16b, v31.16b, v30.16b // td2 = q1 ++ ++ umax v0.16b, v0.16b, v29.16b ++ umax v0.16b, v0.16b, v1.16b // diff = v0 ++ ++// if (spat) { ++// SPAT_CHECK() ++// } ++// i0 = (m1 + p1) >> 1; ++ cbz w16, 1f ++ ++ ldr q31, [x2, w8, sxtw] ++ ldr q18, [x17, w8, sxtw] ++ ldr q30, [x2, w7, sxtw] ++ ldr q19, [x17, w7, sxtw] ++ uhadd v18.16b, v18.16b, v31.16b ++ uhadd v19.16b, v19.16b, v30.16b ++ ++ SPAT_CHECK v0, v18, v24, v16, v22, v19, v31, v30, v29, v28 ++ ++1: ++ uhadd v2.16b, v22.16b, v24.16b ++ ++ // i0 = v2, s0 = v2, d0 = v16, diff = v0, t0 = v31, t1 = v30 ++ DIFF_CLIP v2, v2, v16, v0, v31, v30 ++ ++// dst[0] = av_clip(interpol, 0, clip_max); ++ str q2, [x0], #16 ++ ++// dst++; ++// cur++; ++// } ++ subs w4, w4, #16 ++ add x1, x1, #16 ++ add x2, x2, #16 ++ add x3, x3, #16 ++ add x17, x17, #16 ++ bgt 10b ++ ++99: ++ ret ++endfunc ++ ++// ============================================================================ ++// ++// void ff_bwdif_filter_intra_neon( ++// void *dst1, // x0 ++// void *cur1, // x1 ++// int w, // w2 ++// int prefs, // w3 ++// int mrefs, // w4 ++// int prefs3, // w5 ++// int mrefs3, // w6 ++// int parity, // w7 unused ++// int clip_max) // [sp, #0] unused ++ ++function ff_bwdif_filter_intra_neon, export=1 ++ cmp w2, #0 ++ ble 99f ++ ++ LDR_COEFFS v0, x17 ++ ++// for (x = 0; x < w; x++) { ++10: ++ ++// interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; ++ ldr q31, [x1, w4, sxtw] ++ ldr q30, [x1, w3, sxtw] ++ ldr q29, [x1, w6, sxtw] ++ ldr q28, [x1, w5, sxtw] ++ ++ uaddl v20.8h, v31.8b, v30.8b ++ uaddl2 v21.8h, v31.16b, v30.16b ++ ++ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[6] ++ ++ uaddl v20.8h, v29.8b, v28.8b ++ uaddl2 v21.8h, v29.16b, v28.16b ++ ++ UMLSL4K v2, v3, v4, v5, v20, v21, v0.h[7] ++ ++// dst[0] = av_clip(interpol, 0, clip_max); ++ SQSHRUNN v2, v2, v3, v4, v5, 13 ++ str q2, [x0], #16 ++ ++// dst++; ++// cur++; ++// } ++ ++ subs w2, w2, #16 ++ add x1, x1, #16 ++ bgt 10b ++ ++99: ++ ret ++endfunc diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 0872c6e0f2..1dd05e4d75 100644 --- a/libavfilter/allfilters.c @@ -62696,108 +63762,194 @@ index da1cf9941e..c588ed23cb 100644 frame->format, frame->pts); break; case AVMEDIA_TYPE_AUDIO: +diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h +index 889ff772ed..496cec72ef 100644 +--- 
a/libavfilter/bwdif.h ++++ b/libavfilter/bwdif.h +@@ -35,8 +35,29 @@ typedef struct BWDIFContext { + void (*filter_edge)(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat); ++ void (*filter_line3)(void *dst, int dstride, ++ const void *prev, const void *cur, const void *next, int prefs, ++ int w, int parity, int clip_max); + } BWDIFContext; + +-void ff_bwdif_init_x86(BWDIFContext *bwdif); ++void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth); ++void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth); ++void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth); ++ ++void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat); ++ ++void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max); ++ ++void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max); ++ ++void ff_bwdif_filter_line3_c(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max); + + #endif /* AVFILTER_BWDIF_H */ diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c -index b6aed7a450..92e26d54bc 100644 +index b6aed7a450..b268113271 100644 --- a/libavfilter/vf_bwdif.c +++ b/libavfilter/vf_bwdif.c -@@ -75,10 +75,10 @@ typedef struct ThreadData { - int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e)) >> 1; \ - int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e)) >> 1; \ - int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \ -- \ -+ {/*\ - if (!diff) { \ - dst[0] = d; \ -- } else { -+ } else {*/ - - #define SPAT_CHECK() \ - int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c; \ -@@ -90,15 +90,16 @@ typedef struct ThreadData { - diff = FFMAX3(diff, min, -max); - - #define FILTER_LINE() \ -+ int i1, i2; \ - SPAT_CHECK() \ -- if (FFABS(c - e) > temporal_diff0) { \ -- interpol = (((coef_hf[0] * (prev2[0] + next2[0]) \ -+ /*if (FFABS(c - e) > temporal_diff0)*/ { \ -+ i1 = (((coef_hf[0] * (prev2[0] + next2[0]) \ - - coef_hf[1] * (prev2[mrefs2] + next2[mrefs2] + prev2[prefs2] + next2[prefs2]) \ - + coef_hf[2] * (prev2[mrefs4] + next2[mrefs4] + prev2[prefs4] + next2[prefs4])) >> 2) \ - + coef_lf[0] * (c + e) - coef_lf[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ -- } else { \ -- interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ -- } -+ } /*else*/ { \ -+ i2 = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ -+ }interpol = FFABS(c - e) > temporal_diff0 ? i1:i2;\ - - #define FILTER_EDGE() \ - if (spat) { \ -@@ -112,7 +113,7 @@ typedef struct ThreadData { - else if (interpol < d - diff) \ - interpol = d - diff; \ - \ -- dst[0] = av_clip(interpol, 0, clip_max); \ -+ dst[0] = !diff ? 
d : av_clip(interpol, 0, clip_max); \ - } \ - \ - dst++; \ -@@ -123,7 +124,7 @@ typedef struct ThreadData { +@@ -123,8 +123,8 @@ typedef struct ThreadData { next2++; \ } -static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, -+static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, - int prefs3, int mrefs3, int parity, int clip_max) +- int prefs3, int mrefs3, int parity, int clip_max) ++void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max) { uint8_t *dst = dst1; -@@ -133,7 +134,7 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, + uint8_t *cur = cur1; +@@ -133,10 +133,10 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, FILTER_INTRA() } -static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, -+static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int prefs3, int mrefs3, int prefs4, int mrefs4, - int parity, int clip_max) -@@ -151,7 +152,7 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, +- int w, int prefs, int mrefs, int prefs2, int mrefs2, +- int prefs3, int mrefs3, int prefs4, int mrefs4, +- int parity, int clip_max) ++void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max) + { + uint8_t *dst = dst1; + uint8_t *prev = prev1; +@@ -151,9 +151,34 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, FILTER2() } -static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, -+static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int parity, int clip_max, int spat) +- int w, int prefs, int mrefs, int prefs2, int mrefs2, +- int parity, int clip_max, int spat) ++#define NEXT_LINE()\ ++ dst += d_stride; \ ++ prev += prefs; \ ++ cur += prefs; \ ++ next += prefs; ++ ++void ff_bwdif_filter_line3_c(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max) ++{ ++ const int prefs = s_stride; ++ uint8_t * dst = dst1; ++ const uint8_t * prev = prev1; ++ const uint8_t * cur = cur1; ++ const uint8_t * next = next1; ++ ++ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w, ++ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max); ++ NEXT_LINE(); ++ memcpy(dst, cur, w); ++ NEXT_LINE(); ++ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w, ++ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max); ++} ++ ++void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat) { -@@ -168,7 +169,7 @@ static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, + uint8_t *dst = dst1; + uint8_t *prev = prev1; +@@ -213,6 +238,13 @@ static void filter_edge_16bit(void *dst1, void *prev1, void 
*cur1, void *next1, FILTER2() } --static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mrefs, -+static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, - int prefs3, int mrefs3, int parity, int clip_max) ++// Round job start line down to multiple of 4 so that if filter_line3 exists ++// and the frame is a multiple of 4 high then filter_line will never be called ++static inline int job_start(const int jobnr, const int nb_jobs, const int h) ++{ ++ return jobnr >= nb_jobs ? h : ((h * jobnr) / nb_jobs) & ~3; ++} ++ + static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { - uint16_t *dst = dst1; -@@ -178,7 +179,7 @@ static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mre - FILTER_INTRA() + BWDIFContext *s = ctx->priv; +@@ -222,8 +254,8 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) + int clip_max = (1 << (yadif->csp->comp[td->plane].depth)) - 1; + int df = (yadif->csp->comp[td->plane].depth + 7) / 8; + int refs = linesize / df; +- int slice_start = (td->h * jobnr ) / nb_jobs; +- int slice_end = (td->h * (jobnr+1)) / nb_jobs; ++ int slice_start = job_start(jobnr, nb_jobs, td->h); ++ int slice_end = job_start(jobnr + 1, nb_jobs, td->h); + int y; + + for (y = slice_start; y < slice_end; y++) { +@@ -245,6 +277,11 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) + refs << 1, -(refs << 1), + td->parity ^ td->tff, clip_max, + (y < 2) || ((y + 3) > td->h) ? 0 : 1); ++ } else if (s->filter_line3 && y + 2 < slice_end && y + 6 < td->h) { ++ s->filter_line3(dst, td->frame->linesize[td->plane], ++ prev, cur, next, linesize, td->w, ++ td->parity ^ td->tff, clip_max); ++ y += 2; + } else { + s->filter_line(dst, prev, cur, next, td->w, + refs, -refs, refs << 1, -(refs << 1), +@@ -280,7 +317,8 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic, + td.h = h; + td.plane = i; + +- ctx->internal->execute(ctx, filter_slice, &td, NULL, FFMIN(h, ff_filter_get_nb_threads(ctx))); ++ ctx->internal->execute(ctx, filter_slice, &td, NULL, ++ FFMIN((h+3)/4, ff_filter_get_nb_threads(ctx))); + } + if (yadif->current_field == YADIF_FIELD_END) { + yadif->current_field = YADIF_FIELD_NORMAL; +@@ -350,20 +388,29 @@ static int config_props(AVFilterLink *link) + + yadif->csp = av_pix_fmt_desc_get(link->format); + yadif->filter = filter; +- if (yadif->csp->comp[0].depth > 8) { ++ ff_bwdif_init_filter_line(s, yadif->csp->comp[0].depth); ++ ++ return 0; ++} ++ ++av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) ++{ ++ s->filter_line3 = 0; ++ if (bit_depth > 8) { + s->filter_intra = filter_intra_16bit; + s->filter_line = filter_line_c_16bit; + s->filter_edge = filter_edge_16bit; + } else { +- s->filter_intra = filter_intra; +- s->filter_line = filter_line_c; +- s->filter_edge = filter_edge; ++ s->filter_intra = ff_bwdif_filter_intra_c; ++ s->filter_line = ff_bwdif_filter_line_c; ++ s->filter_edge = ff_bwdif_filter_edge_c; + } + +- if (ARCH_X86) +- ff_bwdif_init_x86(s); +- +- return 0; ++#if ARCH_X86 ++ ff_bwdif_init_x86(s, bit_depth); ++#elif ARCH_AARCH64 ++ ff_bwdif_init_aarch64(s, bit_depth); ++#endif } --static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1, -+static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, - int w, int prefs, int 
mrefs, int prefs2, int mrefs2, - int prefs3, int mrefs3, int prefs4, int mrefs4, - int parity, int clip_max) -@@ -196,7 +197,7 @@ static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1 - FILTER2() - } --static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1, -+static void __attribute__((optimize("tree-vectorize"))) filter_edge_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int parity, int clip_max, int spat) - { diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c new file mode 100644 index 0000000000..d4c11cfc51 @@ -65154,6 +66306,23 @@ index 0000000000..61c03a385c + .outputs = avfilter_vf_unsand_outputs, +}; + +diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c +index b1e70b3bc6..b9e3a25921 100644 +--- a/libavfilter/x86/vf_bwdif_init.c ++++ b/libavfilter/x86/vf_bwdif_init.c +@@ -51,11 +51,9 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); + +-av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif) ++av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth) + { +- YADIFContext *yadif = &bwdif->yadif; + int cpu_flags = av_get_cpu_flags(); +- int bit_depth = (!yadif->csp) ? 8 : yadif->csp->comp[0].depth; + + if (bit_depth <= 8) { + #if ARCH_X86_32 diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c index b4284a8778..692265593c 100644 --- a/libavformat/matroskaenc.c @@ -65422,10 +66591,10 @@ index 5613813ba8..ab8bcfcf34 100644 + diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S new file mode 100644 -index 0000000000..2f07d9674c +index 0000000000..11658de0c8 --- /dev/null +++ b/libavutil/aarch64/rpi_sand_neon.S -@@ -0,0 +1,781 @@ +@@ -0,0 +1,672 @@ +/* +Copyright (c) 2021 Michael Eiler + @@ -65676,199 +66845,191 @@ index 0000000000..2f07d9674c + ret +endfunc + -+//void ff_rpi_sand30_lines_to_planar_c16( -+// uint8_t * dst_u, // [x0] -+// unsigned int dst_stride_u, // [w1] == _w*2 -+// uint8_t * dst_v, // [x2] -+// unsigned int dst_stride_v, // [w3] == _w*2 -+// const uint8_t * src, // [x4] -+// unsigned int stride1, // [w5] == 128 -+// unsigned int stride2, // [w6] -+// unsigned int _x, // [w7] == 0 -+// unsigned int y, // [sp, #0] == 0 -+// unsigned int _w, // [sp, #8] -> w3 -+// unsigned int h); // [sp, #16] -> w7 ++// Unzip chroma ++// ++// On entry: ++// a0 = V0, U2, ... ++// a1 = U0, V1, ... ++// a2 = U1, V2, ... ++// b0 = V8, U10, ... ++// b1 = U8, V9, ... ++// b2 = U9, V10, ... ++// ++// On exit: ++// d0 = U0, U3, ... ++// ... ++// a0 = V0, V3, .. ++// ... ++// ++// Reg order for USAND is a1, a0, a2 (i.e. 
swap natural order of 1st 2 dest regs) + -+.macro rpi_sand30_lines_to_planar_c16_block_half -+ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 ++.macro UZPH_C d0, d1, d2, a0, a1, a2, b0, b1, b2 ++ uzp1 \d0\().8h, \a1\().8h, \b1\().8h ++ uzp1 \d1\().8h, \a2\().8h, \b2\().8h ++ uzp2 \d2\().8h, \a0\().8h, \b0\().8h + -+ xtn v4.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v5.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v6.4h, v0.4s -+ xtn2 v4.8h, v1.4s -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v5.8h, v1.4s -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v6.8h, v1.4s -+ and v4.16b, v4.16b, v16.16b -+ and v5.16b, v5.16b, v16.16b -+ and v6.16b, v6.16b, v16.16b -+ st3 { v4.8h, v5.8h, v6.8h }, [sp], #48 -+ -+ xtn v4.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v5.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v6.4h, v2.4s -+ xtn2 v4.8h, v3.4s -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v5.8h, v3.4s -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v6.8h, v3.4s -+ and v4.16b, v4.16b, v16.16b -+ and v5.16b, v5.16b, v16.16b -+ and v6.16b, v6.16b, v16.16b -+ st3 { v4.8h, v5.8h, v6.8h }, [sp] -+ sub sp, sp, #48 ++ uzp1 \a0\().8h, \a0\().8h, \b0\().8h ++ uzp2 \a1\().8h, \a1\().8h, \b1\().8h ++ uzp2 \a2\().8h, \a2\().8h, \b2\().8h +.endm + ++// SAND30 -> 10bit ++.macro USAND10 d0, d1, d2, a0, a1 ++ shrn \d2\().4h, \a0\().4s, #14 ++ shrn \d1\().4h, \a0\().4s, #10 ++ ++ shrn2 \d2\().8h, \a1\().4s, #14 ++ shrn2 \d1\().8h, \a1\().4s, #10 ++ uzp1 \d0\().8h, \a0\().8h, \a1\().8h ++ ++ ushr \d2\().8h, \d2\().8h, #6 ++ bic \d0\().8h, #0xfc, lsl #8 ++ bic \d1\().8h, #0xfc, lsl #8 ++.endm ++ ++// SAND30 -> 8bit ++.macro USAND8 d0, d1, d2, a0, a1, a2, a3, t0, t1, t2 ++ shrn \d1\().4h, \a0\().4s, #12 ++ shrn2 \d1\().8h, \a1\().4s, #12 ++ uzp1 \d0\().8h, \a0\().8h, \a1\().8h ++ uzp2 \d2\().8h, \a0\().8h, \a1\().8h ++ ++ shrn \t1\().4h, \a2\().4s, #12 ++ shrn2 \t1\().8h, \a3\().4s, #12 ++ uzp1 \t0\().8h, \a2\().8h, \a3\().8h ++ uzp2 \t2\().8h, \a2\().8h, \a3\().8h ++ ++ shrn \d0\().8b, \d0\().8h, #2 ++ shrn2 \d0\().16b, \t0\().8h, #2 ++ shrn \d2\().8b, \d2\().8h, #6 ++ shrn2 \d2\().16b, \t2\().8h, #6 ++ uzp1 \d1\().16b, \d1\().16b, \t1\().16b ++.endm ++ ++ ++// void ff_rpi_sand30_lines_to_planar_c16( ++// uint8_t * dst_u, // [x0] ++// unsigned int dst_stride_u, // [w1] ++// uint8_t * dst_v, // [x2] ++// unsigned int dst_stride_v, // [w3] ++// const uint8_t * src, // [x4] ++// unsigned int stride1, // [w5] 128 ++// unsigned int stride2, // [w6] ++// unsigned int _x, // [w7] 0 ++// unsigned int y, // [sp, #0] ++// unsigned int _w, // [sp, #8] w9 ++// unsigned int h); // [sp, #16] w10 ++ +function ff_rpi_sand30_lines_to_planar_c16, export=1 -+ stp x19, x20, [sp, #-48]! 
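The USAND10/USAND8 macros above unpack the SAND30 packing, in which each little-endian 32-bit word carries three 10-bit samples in bits 0-9, 10-19 and 20-29 (the pack30() helper added to checkasm later in this patch builds test words the same way). As a hedged scalar reference of what one word unpacks to, not code from the tree:

#include <stdint.h>

/* One SAND30 word -> three 10-bit samples, lowest-addressed sample first. */
static inline void sand30_unpack10(uint32_t w, uint16_t out[3])
{
    out[0] =  w        & 0x3ff;
    out[1] = (w >> 10) & 0x3ff;
    out[2] = (w >> 20) & 0x3ff;
}

/* 8-bit variant: USAND8 simply drops the two low bits of each sample. */
static inline void sand30_unpack8(uint32_t w, uint8_t out[3])
{
    out[0] = (uint8_t)(( w        & 0x3ff) >> 2);
    out[1] = (uint8_t)(((w >> 10) & 0x3ff) >> 2);
    out[2] = (uint8_t)(((w >> 20) & 0x3ff) >> 2);
}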
-+ stp x21, x22, [sp, #16] -+ stp x23, x24, [sp, #32] ++ ldr w7, [sp, #0] // y ++ ldr w8, [sp, #8] // _w ++ ldr w10, [sp, #16] // h ++ lsl w6, w6, #7 // Fixup stride2 ++ sub w6, w6, #64 ++ uxtw x6, w6 ++ sub w1, w1, w8, LSL #1 // Fixup chroma strides ++ sub w3, w3, w8, LSL #1 ++ lsl w7, w7, #7 // Add y to src ++ add x4, x4, w7, UXTW ++10: ++ mov w13, #0 ++ mov x5, x4 ++ mov w9, w8 ++1: ++ ld1 {v0.4s-v3.4s}, [x5], #64 ++ ld1 {v4.4s-v7.4s}, [x5], x6 ++ subs w9, w9, #48 + -+ ldr w3, [sp, #48+8] // w3 = width -+ ldr w7, [sp, #48+16] // w7 = height ++ USAND10 v17, v16, v18, v0, v1 ++ USAND10 v20, v19, v21, v2, v3 ++ UZPH_C v0, v1, v2, v16, v17, v18, v19, v20, v21 ++ USAND10 v23, v22, v24, v4, v5 ++ USAND10 v26, v25, v27, v6, v7 ++ UZPH_C v4, v5, v6, v22, v23, v24, v25, v26, v27 + -+ // reserve space on the stack for intermediate results -+ sub sp, sp, #256 ++ blt 2f + -+ // number of 128byte blocks per row, w8 = width / 48 -+ mov w9, #48 -+ udiv w8, w3, w9 ++ st3 {v0.8h-v2.8h}, [x0], #48 ++ st3 {v4.8h-v6.8h}, [x0], #48 ++ st3 {v16.8h-v18.8h}, [x2], #48 ++ st3 {v22.8h-v24.8h}, [x2], #48 + -+ // remaining pixels (rem_pix) per row, w9 = width - w8 * 48 -+ mul w9, w8, w9 -+ sub w9, w3, w9 ++ bne 1b ++11: ++ subs w10, w10, #1 ++ add x4, x4, #128 ++ add x0, x0, w1, UXTW ++ add x2, x2, w3, UXTW ++ bne 10b ++99: ++ ret + -+ // row offset, the beginning of the next row to process -+ eor w10, w10, w10 -+ -+ // offset to the beginning of the next block, w11 = stride2 * 128 - 128 -+ lsl w11, w6, #7 -+ sub w11, w11, #128 -+ -+ // decrease the height by one and in case of remaining pixels increase the block count by one -+ sub w7, w7, #1 -+ cmp w9, #0 -+ cset w19, ne // w19 == 1 iff reamining pixels != 0 -+ add w8, w8, w19 -+ -+ // bytes we have to move dst back by at the end of every row -+ mov w21, #48*2 -+ mul w21, w21, w8 -+ sub w21, w1, w21 -+ -+ mov w20, #0 // w20 = flag, last row processed -+ -+ mov x12, #0x03ff03ff03ff03ff -+ dup v16.2d, x12 -+ -+ // iterate through rows, row counter = w12 = 0 -+ eor w12, w12, w12 -+row_loop_c16: -+ cmp w12, w7 -+ bge row_loop_c16_fin -+ -+ // address of row data = src + row_offset -+ mov x13, x4 -+ add x13, x13, x10 -+ -+ eor w14, w14, w14 -+block_loop_c16: -+ cmp w14, w8 -+ bge block_loop_c16_fin -+ -+ rpi_sand30_lines_to_planar_c16_block_half -+ -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp] -+ sub sp, sp, #64 -+ -+ st1 { v0.8h }, [x0], #16 -+ st1 { v2.8h }, [x0], #16 -+ st1 { v4.8h }, [x0], #16 -+ st1 { v1.8h }, [x2], #16 -+ st1 { v3.8h }, [x2], #16 -+ st1 { v5.8h }, [x2], #16 -+ -+ rpi_sand30_lines_to_planar_c16_block_half -+ -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp] -+ sub sp, sp, #64 -+ -+ st1 { v0.8h }, [x0], #16 -+ st1 { v2.8h }, [x0], #16 -+ st1 { v4.8h }, [x0], #16 -+ st1 { v1.8h }, [x2], #16 -+ st1 { v3.8h }, [x2], #16 -+ st1 { v5.8h }, [x2], #16 -+ -+ add x13, x13, x11 // offset to next block -+ add w14, w14, #1 -+ b block_loop_c16 -+block_loop_c16_fin: -+ -+ add w10, w10, #128 -+ add w12, w12, #1 -+ add x0, x0, w21, sxtw // move dst pointers back by x21 -+ add x2, x2, w21, sxtw -+ b row_loop_c16 -+row_loop_c16_fin: -+ -+ cmp w20, #1 -+ beq row_loop_c16_fin2 -+ mov w20, #1 -+ sub w8, w8, w19 // decrease block count by w19 -+ add w7, w7, #1 // increase height -+ b row_loop_c16 -+ -+row_loop_c16_fin2: -+ sub x0, x0, w21, sxtw // readd x21 in case of the last row -+ sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels -+ -+ // 
last incomplete block to be finished -+ // read operations are fine, stride2 is more than large enough even if rem_pix is 0 -+ rpi_sand30_lines_to_planar_c16_block_half -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp], #32 -+ rpi_sand30_lines_to_planar_c16_block_half -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp] -+ sub sp, sp, #160 -+ -+ mov x4, sp -+ eor w20, w20, w20 -+rem_pix_c16_loop: -+ cmp w20, w9 -+ bge rem_pix_c16_fin -+ -+ ldr w22, [x4], #4 -+ str w22, [x0], #2 -+ lsr w22, w22, #16 -+ str w22, [x2], #2 -+ -+ add w20, w20, #1 -+ b rem_pix_c16_loop -+rem_pix_c16_fin: -+ -+ add sp, sp, #256 -+ -+ ldp x23, x24, [sp, #32] -+ ldp x21, x22, [sp, #16] -+ ldp x19, x20, [sp], #48 -+ ret ++// Partial final write ++2: ++ cmp w9, #24-48 ++ blt 1f ++ st3 {v0.8h - v2.8h}, [x0], #48 ++ st3 {v16.8h - v18.8h}, [x2], #48 ++ beq 11b ++ mov v0.16b, v4.16b ++ mov v1.16b, v5.16b ++ sub w9, w9, #24 ++ mov v2.16b, v6.16b ++ mov v16.16b, v22.16b ++ mov v17.16b, v23.16b ++ mov v18.16b, v24.16b ++1: ++ cmp w9, #12-48 ++ blt 1f ++ st3 {v0.4h - v2.4h}, [x0], #24 ++ st3 {v16.4h - v18.4h}, [x2], #24 ++ beq 11b ++ mov v0.2d[0], v0.2d[1] ++ sub w9, w9, #12 ++ mov v1.2d[0], v1.2d[1] ++ mov v2.2d[0], v2.2d[1] ++ mov v16.2d[0], v16.2d[1] ++ mov v17.2d[0], v17.2d[1] ++ mov v18.2d[0], v18.2d[1] ++1: ++ cmp w9, #6-48 ++ blt 1f ++ st3 {v0.h - v2.h}[0], [x0], #6 ++ st3 {v0.h - v2.h}[1], [x0], #6 ++ st3 {v16.h - v18.h}[0], [x2], #6 ++ st3 {v16.h - v18.h}[1], [x2], #6 ++ beq 11b ++ mov v0.s[0], v0.s[1] ++ sub w9, w9, #6 ++ mov v1.s[0], v1.s[1] ++ mov v2.s[0], v2.s[1] ++ mov v16.s[0], v16.s[1] ++ mov v17.s[0], v17.s[1] ++ mov v18.s[0], v18.s[1] ++1: ++ cmp w9, #3-48 ++ blt 1f ++ st3 {v0.h - v2.h}[0], [x0], #6 ++ st3 {v16.h - v18.h}[0], [x2], #6 ++ beq 11b ++ mov v0.h[0], v0.h[1] ++ sub w9, w9, #3 ++ mov v1.h[0], v1.h[1] ++ mov v16.h[0], v16.h[1] ++ mov v17.h[0], v17.h[1] ++1: ++ cmp w9, #2-48 ++ blt 1f ++ st2 {v0.h - v1.h}[0], [x0], #4 ++ st2 {v16.h - v17.h}[0], [x2], #4 ++ b 11b ++1: ++ st1 {v0.h}[0], [x0], #2 ++ st1 {v16.h}[0], [x2], #2 ++ b 11b +endfunc + + -+ +//void ff_rpi_sand30_lines_to_planar_p010( +// uint8_t * dest, +// unsigned int dst_stride, @@ -65897,6 +67058,7 @@ index 0000000000..2f07d9674c +function ff_rpi_sand30_lines_to_planar_y16, export=1 + lsl w4, w4, #7 + sub w4, w4, #64 ++ uxtw x4, w4 + sub w1, w1, w7, lsl #1 + uxtw x6, w6 + add x8, x2, x6, lsl #7 @@ -65911,61 +67073,10 @@ index 0000000000..2f07d9674c + + subs w5, w5, #96 + -+ // v0, v1 -+ -+ shrn v18.4h, v0.4s, #14 -+ xtn v16.4h, v0.4s -+ shrn v17.4h, v0.4s, #10 -+ -+ shrn2 v18.8h, v1.4s, #14 -+ xtn2 v16.8h, v1.4s -+ shrn2 v17.8h, v1.4s, #10 -+ -+ ushr v18.8h, v18.8h, #6 -+ bic v16.8h, #0xfc, lsl #8 -+ bic v17.8h, #0xfc, lsl #8 -+ -+ // v2, v3 -+ -+ shrn v21.4h, v2.4s, #14 -+ xtn v19.4h, v2.4s -+ shrn v20.4h, v2.4s, #10 -+ -+ shrn2 v21.8h, v3.4s, #14 -+ xtn2 v19.8h, v3.4s -+ shrn2 v20.8h, v3.4s, #10 -+ -+ ushr v21.8h, v21.8h, #6 -+ bic v19.8h, #0xfc, lsl #8 -+ bic v20.8h, #0xfc, lsl #8 -+ -+ // v4, v5 -+ -+ shrn v24.4h, v4.4s, #14 -+ xtn v22.4h, v4.4s -+ shrn v23.4h, v4.4s, #10 -+ -+ shrn2 v24.8h, v5.4s, #14 -+ xtn2 v22.8h, v5.4s -+ shrn2 v23.8h, v5.4s, #10 -+ -+ ushr v24.8h, v24.8h, #6 -+ bic v22.8h, #0xfc, lsl #8 -+ bic v23.8h, #0xfc, lsl #8 -+ -+ // v6, v7 -+ -+ shrn v27.4h, v6.4s, #14 -+ xtn v25.4h, v6.4s -+ shrn v26.4h, v6.4s, #10 -+ -+ shrn2 v27.8h, v7.4s, #14 -+ xtn2 v25.8h, v7.4s -+ shrn2 v26.8h, v7.4s, #10 -+ -+ ushr v27.8h, v27.8h, #6 -+ 
bic v25.8h, #0xfc, lsl #8 -+ bic v26.8h, #0xfc, lsl #8 ++ USAND10 v16, v17, v18, v0, v1 ++ USAND10 v19, v20, v21, v2, v3 ++ USAND10 v22, v23, v24, v4, v5 ++ USAND10 v25, v26, v27, v6, v7 + + blt 2f + @@ -66062,6 +67173,7 @@ index 0000000000..2f07d9674c +function ff_rpi_sand30_lines_to_planar_y8, export=1 + lsl w4, w4, #7 + sub w4, w4, #64 ++ uxtw x4, w4 + sub w1, w1, w7 + uxtw x6, w6 + add x8, x2, x6, lsl #7 @@ -66077,60 +67189,8 @@ index 0000000000..2f07d9674c + subs w5, w5, #96 + + // v0, v1 -+ -+ shrn v18.4h, v0.4s, #16 -+ xtn v16.4h, v0.4s -+ shrn v17.4h, v0.4s, #12 -+ -+ shrn2 v18.8h, v1.4s, #16 -+ xtn2 v16.8h, v1.4s -+ shrn2 v17.8h, v1.4s, #12 -+ -+ shrn v18.8b, v18.8h, #6 -+ shrn v16.8b, v16.8h, #2 -+ xtn v17.8b, v17.8h -+ -+ // v2, v3 -+ -+ shrn v21.4h, v2.4s, #16 -+ xtn v19.4h, v2.4s -+ shrn v20.4h, v2.4s, #12 -+ -+ shrn2 v21.8h, v3.4s, #16 -+ xtn2 v19.8h, v3.4s -+ shrn2 v20.8h, v3.4s, #12 -+ -+ shrn2 v18.16b, v21.8h, #6 -+ shrn2 v16.16b, v19.8h, #2 -+ xtn2 v17.16b, v20.8h -+ -+ // v4, v5 -+ -+ shrn v24.4h, v4.4s, #16 -+ xtn v22.4h, v4.4s -+ shrn v23.4h, v4.4s, #12 -+ -+ shrn2 v24.8h, v5.4s, #16 -+ xtn2 v22.8h, v5.4s -+ shrn2 v23.8h, v5.4s, #12 -+ -+ shrn v21.8b, v24.8h, #6 -+ shrn v19.8b, v22.8h, #2 -+ xtn v20.8b, v23.8h -+ -+ // v6, v7 -+ -+ shrn v27.4h, v6.4s, #16 -+ xtn v25.4h, v6.4s -+ shrn v26.4h, v6.4s, #12 -+ -+ shrn2 v27.8h, v7.4s, #16 -+ xtn2 v25.8h, v7.4s -+ shrn2 v26.8h, v7.4s, #12 -+ -+ shrn2 v21.16b, v27.8h, #6 -+ shrn2 v19.16b, v25.8h, #2 -+ xtn2 v20.16b, v26.8h ++ USAND8 v16, v17, v18, v0, v1, v2, v3, v22, v23, v24 ++ USAND8 v19, v20, v21, v4, v5, v6, v7, v22, v23, v24 + + blt 2f + @@ -67885,10 +68945,10 @@ index 0000000000..0d5d203dc3 + diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c new file mode 100644 -index 0000000000..b6071e2928 +index 0000000000..0626bb06cb --- /dev/null +++ b/libavutil/rpi_sand_fns.c -@@ -0,0 +1,445 @@ +@@ -0,0 +1,447 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. 
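Both the NEON converters above and the C fallbacks in rpi_sand_fns.c walk the frame one 128-byte-wide vertical stripe at a time; slice_inc there is the word distance from the right-hand edge of one stripe back to the left-hand edge of the next. A hedged sketch of the addressing this implies, assuming the usual SAND layout of stripes stride1 bytes wide and stride2 rows tall stored back to back (the function name is illustrative, not from the tree):

#include <stddef.h>

/* Byte offset of pixel (x, y) in a SAND plane, x in bytes, y in lines.
 * stride1 is the stripe width in bytes (128 here), stride2 the stripe
 * height in lines. Consistent with the patch's
 *   slice_inc = ((stride2 - 1) * stride1) >> 2
 * step from the end of a row segment to the start of the next stripe. */
static inline size_t sand_offset(unsigned int x, unsigned int y,
                                 unsigned int stride1, unsigned int stride2)
{
    const unsigned int stripe = x / stride1;
    return (size_t)stripe * stride1 * stride2
           + (size_t)y * stride1
           + (x % stride1);
}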
@@ -67926,10 +68986,12 @@ index 0000000000..b6071e2928 +#include "frame.h" + +#if ARCH_ARM && HAVE_NEON -+#include "arm/rpi_sand_neon.h" ++#include "libavutil/arm/cpu.h" ++#include "libavutil/arm/rpi_sand_neon.h" +#define HAVE_SAND_ASM 1 +#elif ARCH_AARCH64 && HAVE_NEON -+#include "aarch64/rpi_sand_neon.h" ++#include "libavutil/aarch64/cpu.h" ++#include "libavutil/aarch64/rpi_sand_neon.h" +#define HAVE_SAND_ASM 1 +#else +#define HAVE_SAND_ASM 0 @@ -67988,7 +69050,7 @@ index 0000000000..b6071e2928 + const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words + +#if HAVE_SAND_ASM -+ if (_x == 0) { ++ if (_x == 0 && have_neon(av_get_cpu_flags())) { + ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); + return; + } @@ -68054,7 +69116,7 @@ index 0000000000..b6071e2928 + const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words + +#if HAVE_SAND_ASM -+ if (_x == 0) { ++ if (_x == 0 && have_neon(av_get_cpu_flags())) { + ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v, + src, stride1, stride2, _x, y, _w, h); + return; @@ -71406,7 +72468,7 @@ index 0000000000..5935a11ca5 + do_logparse(args.logfile) + diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile -index 1827a4e134..08da4166ef 100644 +index 1827a4e134..3c765a5eb1 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP) += g722dsp.o @@ -71420,8 +72482,27 @@ index 1827a4e134..08da4166ef 100644 AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o +@@ -35,6 +37,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) + # libavfilter tests + AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o + AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o ++AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o + AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o + AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o + AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o +@@ -52,8 +55,9 @@ CHECKASMOBJS-$(CONFIG_SWSCALE) += $(SWSCALEOBJS) + # libavutil tests + AVUTILOBJS += fixed_dsp.o + AVUTILOBJS += float_dsp.o ++AVUTILOBJS-$(CONFIG_SAND) += rpi_sand.o + +-CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) ++CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) $(AVUTILOBJS-yes) + + CHECKASMOBJS-$(ARCH_AARCH64) += aarch64/checkasm.o + CHECKASMOBJS-$(HAVE_ARMV5TE_EXTERNAL) += arm/checkasm.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c -index 8338e8ff58..81ef182f04 100644 +index 8338e8ff58..c1ee09c72e 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -131,6 +131,9 @@ static const struct { @@ -71444,11 +72525,31 @@ index 8338e8ff58..81ef182f04 100644 #if CONFIG_VP8DSP { "vp8dsp", checkasm_check_vp8dsp }, #endif +@@ -172,6 +178,9 @@ static const struct { + #if CONFIG_BLEND_FILTER + { "vf_blend", checkasm_check_blend }, + #endif ++ #if CONFIG_BWDIF_FILTER ++ { "vf_bwdif", checkasm_check_vf_bwdif }, ++ #endif + #if CONFIG_COLORSPACE_FILTER + { "vf_colorspace", checkasm_check_colorspace }, + #endif +@@ -198,6 +207,9 @@ static const struct { + #if CONFIG_AVUTIL + { "fixed_dsp", checkasm_check_fixed_dsp }, + { "float_dsp", checkasm_check_float_dsp }, ++ #if CONFIG_SAND ++ { "rpi_sand", checkasm_check_rpi_sand }, ++ #endif + #endif + { NULL } + }; diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h -index ef6645e3a2..1a1e17d835 100644 +index ef6645e3a2..02d3642836 100644 --- 
a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h -@@ -70,6 +70,7 @@ void checkasm_check_hevc_epel_bi(void); +@@ -70,12 +70,14 @@ void checkasm_check_hevc_epel_bi(void); void checkasm_check_hevc_epel_bi_w(void); void checkasm_check_hevc_sao(void); void checkasm_check_huffyuvdsp(void); @@ -71456,11 +72557,19 @@ index ef6645e3a2..1a1e17d835 100644 void checkasm_check_jpeg2000dsp(void); void checkasm_check_llviddsp(void); void checkasm_check_llviddspenc(void); -@@ -83,6 +84,7 @@ void checkasm_check_sw_scale(void); + void checkasm_check_nlmeans(void); + void checkasm_check_opusdsp(void); + void checkasm_check_pixblockdsp(void); ++void checkasm_check_rpi_sand(void); + void checkasm_check_sbrdsp(void); + void checkasm_check_synth_filter(void); + void checkasm_check_sw_rgb(void); +@@ -83,6 +85,8 @@ void checkasm_check_sw_scale(void); void checkasm_check_utvideodsp(void); void checkasm_check_v210dec(void); void checkasm_check_v210enc(void); +void checkasm_check_vc1dsp(void); ++void checkasm_check_vf_bwdif(void); void checkasm_check_vf_eq(void); void checkasm_check_vf_gblur(void); void checkasm_check_vf_hflip(void); @@ -71568,6 +72677,130 @@ index 0000000000..02724536a7 + check_add_put_clamped(); + report("idctdsp"); +} +diff --git a/tests/checkasm/rpi_sand.c b/tests/checkasm/rpi_sand.c +new file mode 100644 +index 0000000000..0888714c4c +--- /dev/null ++++ b/tests/checkasm/rpi_sand.c +@@ -0,0 +1,118 @@ ++/* ++ * Copyright (c) 2023 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
++ */ ++ ++#include ++#include "checkasm.h" ++#include "libavutil/common.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#if ARCH_ARM ++#include "libavutil/arm/cpu.h" ++#include "libavutil/arm/rpi_sand_neon.h" ++#elif ARCH_AARCH64 ++#include "libavutil/aarch64/cpu.h" ++#include "libavutil/aarch64/rpi_sand_neon.h" ++#endif ++ ++static inline uint32_t pack30(unsigned int a, unsigned int b, unsigned int c) ++{ ++ return (a & 0x3ff) | ((b & 0x3ff) << 10) | ((c & 0x3ff) << 20); ++} ++ ++void checkasm_check_rpi_sand(void) ++{ ++ const unsigned int w = 1280; ++ const unsigned int h = 66; ++ const unsigned int stride1 = 128; ++ const unsigned int stride2 = h*3/2; ++ const unsigned int ssize = ((w+95)/96)*128*h*3/2; ++ const unsigned int ysize = ((w + 32) * (h + 32) * 2); ++ ++ uint8_t * sbuf0 = malloc(ssize); ++ uint8_t * sbuf1 = malloc(ssize); ++ uint8_t * ybuf0 = malloc(ysize); ++ uint8_t * ybuf1 = malloc(ysize); ++ uint8_t * vbuf0 = malloc(ysize); ++ uint8_t * vbuf1 = malloc(ysize); ++ uint8_t * yframe0 = (w + 32) * 16 + ybuf0; ++ uint8_t * yframe1 = (w + 32) * 16 + ybuf1; ++ uint8_t * vframe0 = (w + 32) * 16 + vbuf0; ++ uint8_t * vframe1 = (w + 32) * 16 + vbuf1; ++ unsigned int i; ++ ++ for (i = 0; i != ssize; i += 4) ++ *(uint32_t*)(sbuf0 + i) = rnd(); ++ memcpy(sbuf1, sbuf0, ssize); ++ ++ if (check_func(have_neon(av_get_cpu_flags()) ? ff_rpi_sand30_lines_to_planar_y16 : av_rpi_sand30_to_planar_y16, "rpi_sand30_to_planar_y16")) { ++ declare_func(void, uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++ memset(ybuf0, 0xbb, ysize); ++ memset(ybuf1, 0xbb, ysize); ++ ++ call_ref(yframe0, (w + 32) * 2, sbuf0, stride1, stride2, 0, 0, w, h); ++ call_new(yframe1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h); ++ ++ if (memcmp(sbuf0, sbuf1, ssize) ++ || memcmp(ybuf0, ybuf1, ysize)) ++ fail(); ++ ++ bench_new(ybuf1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h); ++ } ++ ++ if (check_func(have_neon(av_get_cpu_flags()) ? 
ff_rpi_sand30_lines_to_planar_c16 : av_rpi_sand30_to_planar_c16, "rpi_sand30_to_planar_c16")) { ++ declare_func(void, uint8_t * u_dst, const unsigned int u_stride, ++ uint8_t * v_dst, const unsigned int v_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++ memset(ybuf0, 0xbb, ysize); ++ memset(ybuf1, 0xbb, ysize); ++ memset(vbuf0, 0xbb, ysize); ++ memset(vbuf1, 0xbb, ysize); ++ ++ call_ref(yframe0, (w + 32), vframe0, (w + 32), sbuf0, stride1, stride2, 0, 0, w/2, h/2); ++ call_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2); ++ ++ if (memcmp(sbuf0, sbuf1, ssize) ++ || memcmp(ybuf0, ybuf1, ysize) ++ || memcmp(vbuf0, vbuf1, ysize)) ++ fail(); ++ ++ bench_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2); ++ } ++ ++ ++ report("sand30"); ++ ++ free(sbuf0); ++ free(sbuf1); ++ free(ybuf0); ++ free(ybuf1); ++ free(vbuf0); ++ free(vbuf1); ++} ++ diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c new file mode 100644 index 0000000000..52628d15e4 @@ -72026,11 +73259,273 @@ index 0000000000..52628d15e4 + check_unescape(); + report("unescape_buffer"); +} +diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c +new file mode 100644 +index 0000000000..3399cacdf7 +--- /dev/null ++++ b/tests/checkasm/vf_bwdif.c +@@ -0,0 +1,256 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#include ++#include "checkasm.h" ++#include "libavcodec/internal.h" ++#include "libavfilter/bwdif.h" ++#include "libavutil/mem_internal.h" ++ ++#define WIDTH 256 ++ ++#define randomize_buffers(buf0, buf1, mask, count) \ ++ for (size_t i = 0; i < count; i++) \ ++ buf0[i] = buf1[i] = rnd() & mask ++ ++#define randomize_overflow_check(buf0, buf1, mask, count) \ ++ for (size_t i = 0; i < count; i++) \ ++ buf0[i] = buf1[i] = (rnd() & 1) != 0 ? mask : 0; ++ ++#define BODY(type, depth) \ ++ do { \ ++ type prev0[9*WIDTH], prev1[9*WIDTH]; \ ++ type next0[9*WIDTH], next1[9*WIDTH]; \ ++ type cur0[9*WIDTH], cur1[9*WIDTH]; \ ++ type dst0[WIDTH], dst1[WIDTH]; \ ++ const int stride = WIDTH; \ ++ const int mask = (1<