diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch index 659c5d7d13..974d7e0d45 100644 --- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch +++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch @@ -51212,7 +51212,7 @@ index 8dbc7fc104..0bda4dd06b 100644 #endif // AVCODEC_V4L2_BUFFERS_H diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index ff1ea8e57b..fcd5fdf359 100644 +index ff1ea8e57b..65b2648557 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -27,11 +27,13 @@ @@ -51414,6 +51414,12 @@ index ff1ea8e57b..fcd5fdf359 100644 -static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2) +static inline int ctx_buffers_alloced(const V4L2Context * const ctx) ++{ ++ return ctx->bufrefs != NULL; ++} ++ ++// Width/Height changed or we don't have an alloc in the first place? ++static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) { - struct v4l2_format *fmt1 = &ctx->format; - int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? @@ -51422,12 +51428,6 @@ index ff1ea8e57b..fcd5fdf359 100644 - : - fmt1->fmt.pix.width != fmt2->fmt.pix.width || - fmt1->fmt.pix.height != fmt2->fmt.pix.height; -+ return ctx->bufrefs != NULL; -+} -+ -+// Width/Height changed or we don't have an alloc in the first place? -+static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) -+{ + const struct v4l2_format *fmt1 = &ctx->format; + int ret = !ctx_buffers_alloced(ctx) || + (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? @@ -51520,12 +51520,12 @@ index ff1ea8e57b..fcd5fdf359 100644 - s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); - } + get_default_selection(&s->capture, &s->capture.selection); - -- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); ++ + reinit = ctx_resolution_changed(&s->capture, &cap_fmt); + if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0) + reinit = 1; -+ + +- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); + s->capture.format = cap_fmt; if (reinit) { - s->capture.height = v4l2_get_height(&cap_fmt); @@ -51574,16 +51574,16 @@ index ff1ea8e57b..fcd5fdf359 100644 if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n"); + av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); -+ return AVERROR(EINVAL); -+ } + return AVERROR(EINVAL); + } + + if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) || + s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) { + av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n", + s->capture.width, s->capture.height, + ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format)); - return AVERROR(EINVAL); - } ++ return AVERROR(EINVAL); ++ } + + // Update pixel format - should only actually do something on initial change + s->capture.av_pix_fmt = @@ -51928,17 +51928,17 @@ index ff1ea8e57b..fcd5fdf359 100644 } - return NULL; + return AVERROR(EAGAIN); -+ } -+ -+ if ((pfd.revents & POLLERR) != 0) { -+ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); -+ return AVERROR_UNKNOWN; } - if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) { - int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? 
- buf.m.planes[0].bytesused : buf.bytesused; - if (bytesused == 0) { ++ if ((pfd.revents & POLLERR) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); ++ return AVERROR_UNKNOWN; ++ } ++ + if ((pfd.revents & poll_event) != 0) { + ret = get_event(m); + if (ret < 0) { @@ -51951,13 +51951,6 @@ index ff1ea8e57b..fcd5fdf359 100644 - ctx->done = 1; -#endif + continue; -+ } -+ -+ if ((pfd.revents & poll_cap) != 0) { -+ ret = dq_buf(ctx, ppavbuf); -+ if (ret == AVERROR(EPIPE)) -+ continue; -+ return ret; } - avbuf = &ctx->buffers[buf.index]; @@ -51966,12 +51959,19 @@ index ff1ea8e57b..fcd5fdf359 100644 - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - memcpy(avbuf->planes, planes, sizeof(planes)); - avbuf->buf.m.planes = avbuf->planes; ++ if ((pfd.revents & poll_cap) != 0) { ++ ret = dq_buf(ctx, ppavbuf); ++ if (ret == AVERROR(EPIPE)) ++ continue; ++ return ret; + } +- return avbuf; ++ + if ((pfd.revents & poll_out) != 0) { + if (is_cap) + return AVERROR(EAGAIN); + return dq_buf(ctx, ppavbuf); - } -- return avbuf; ++ } + + av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents); + return AVERROR_UNKNOWN; @@ -52062,18 +52062,18 @@ index ff1ea8e57b..fcd5fdf359 100644 + .type = ctx->type, + .count = 0, /* 0 -> unmap all buffers from the driver */ + }; -+ -+ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { -+ if (errno == EINTR) -+ continue; -+ -+ ret = AVERROR(errno); - for (j = 0; j < buffer->num_planes; j++) { - struct V4L2Plane_info *p = &buffer->plane_info[j]; - if (p->mm_addr && p->length) - if (munmap(p->mm_addr, p->length) < 0) - av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); ++ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { ++ if (errno == EINTR) ++ continue; ++ ++ ret = AVERROR(errno); ++ + av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", + ctx->name, av_err2str(AVERROR(errno))); + @@ -52289,7 +52289,7 @@ index ff1ea8e57b..fcd5fdf359 100644 return ret; return ff_v4l2_buffer_enqueue(avbuf); -@@ -635,42 +1032,36 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -635,42 +1032,77 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) { @@ -52307,9 +52307,6 @@ index ff1ea8e57b..fcd5fdf359 100644 - if (!avbuf) { - if (ctx->done) - return AVERROR_EOF; -- -- return AVERROR(EAGAIN); -- } + do { + if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) + return rv; @@ -52317,6 +52314,9 @@ index ff1ea8e57b..fcd5fdf359 100644 + return rv; + } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0); +- return AVERROR(EAGAIN); +- } +- - return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); + return 0; } @@ -52346,14 +52346,53 @@ index ff1ea8e57b..fcd5fdf359 100644 + } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0); - return AVERROR(EAGAIN); -- } -- -- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); + return 0; ++} ++ ++// Return 0 terminated list of drm fourcc video formats for this context ++// NULL if none found or error ++// Returned list is malloced so must be freed ++uint32_t * ff_v4l2_context_enum_drm_formats(V4L2Context *ctx, unsigned int *pN) ++{ ++ unsigned int i; ++ unsigned int n = 0; ++ unsigned int size = 0; ++ uint32_t * e = NULL; ++ *pN = 0; ++ ++ for (i = 0; i < 1024; ++i) { ++ struct v4l2_fmtdesc fdesc = { ++ .index = i, ++ .type = ctx->type ++ }; ++ ++ if (ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc)) 
++ return e; ++ ++ if (n + 1 >= size) { ++ unsigned int newsize = (size == 0) ? 16 : size * 2; ++ uint32_t * t = av_realloc(e, newsize * sizeof(*t)); ++ if (!t) ++ return e; ++ e = t; ++ size = newsize; ++ } ++ ++ e[n] = fdesc.pixelformat; ++ e[++n] = 0; ++ if (pN) ++ *pN = n; + } + +- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); ++ // If we've looped 1024 times we are clearly confused ++ *pN = 0; ++ av_free(e); ++ return NULL; } int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) -@@ -702,78 +1093,179 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) +@@ -702,78 +1134,179 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) int ff_v4l2_context_set_format(V4L2Context* ctx) { @@ -52566,7 +52605,7 @@ index ff1ea8e57b..fcd5fdf359 100644 return ret; } diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 22a9532444..108fc05a6f 100644 +index 22a9532444..0c8c020be1 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -31,6 +31,7 @@ @@ -52637,7 +52676,27 @@ index 22a9532444..108fc05a6f 100644 } V4L2Context; /** -@@ -147,7 +177,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd); +@@ -119,6 +149,19 @@ int ff_v4l2_context_set_format(V4L2Context* ctx); + */ + int ff_v4l2_context_get_format(V4L2Context* ctx, int probe); + ++/** ++ * Get the list of drm fourcc pixel formats for this context ++ * ++ * @param[in] ctx A pointer to a V4L2Context. See V4L2Context ++ * description for required variables. ++ * @param[in] pN A pointer to receive the number of formats ++ * found. May be NULL if not wanted. ++ * @return Pointer to malloced list of zero terminated formats, ++ * NULL if none or error. As list is malloced it must be ++ * freed. ++ */ ++uint32_t * ff_v4l2_context_enum_drm_formats(V4L2Context *ctx, unsigned int *pN); ++ + /** + * Releases a V4L2Context. + * +@@ -147,7 +190,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd); * @param[inout] pkt The AVPacket to dequeue to. * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. */ @@ -52646,7 +52705,7 @@ index 22a9532444..108fc05a6f 100644 /** * Dequeues a buffer from a V4L2Context to an AVFrame. -@@ -156,7 +186,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); +@@ -156,7 +199,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); * @param[in] ctx The V4L2Context to dequeue from. * @param[inout] f The AVFrame to dequeue to. * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) @@ -52657,7 +52716,7 @@ index 22a9532444..108fc05a6f 100644 */ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); -@@ -170,7 +203,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); +@@ -170,7 +216,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); * @param[in] pkt A pointer to an AVPacket. * @return 0 in case of success, a negative error otherwise. 
*/ @@ -52666,7 +52725,7 @@ index 22a9532444..108fc05a6f 100644 /** * Enqueues a buffer to a V4L2Context from an AVFrame -@@ -183,4 +216,28 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); +@@ -183,4 +229,28 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); */ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f); @@ -52696,7 +52755,7 @@ index 22a9532444..108fc05a6f 100644 + #endif // AVCODEC_V4L2_CONTEXT_H diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index cdfd579810..025cf24769 100644 +index cdfd579810..143656e792 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c @@ -35,6 +35,15 @@ @@ -52854,14 +52913,19 @@ index cdfd579810..025cf24769 100644 if (s->fd >= 0) { ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); if (ret) -@@ -356,7 +413,15 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) +@@ -355,8 +412,20 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) + } ff_v4l2_context_release(&s->output); - -+ dmabufs_ctl_unref(&s->db_ctl); -+ close(s->fd); -+ s->fd = -1; ++ av_buffer_unref(&s->device_ref); + ++ dmabufs_ctl_unref(&s->db_ctl); ++ ++ if (s->fd != -1) { ++ close(s->fd); ++ s->fd = -1; ++ } + s->self_ref = NULL; + // This is only called on avctx close so after this point we don't have that + // Crash sooner if we find we are using it (can still log with avctx = NULL) @@ -52870,7 +52934,7 @@ index cdfd579810..025cf24769 100644 av_buffer_unref(&priv->context_ref); return 0; -@@ -400,35 +465,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv) +@@ -400,35 +469,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv) return v4l2_configure_contexts(s); } @@ -53060,7 +53124,7 @@ index b67b216331..a506e69d67 100644 + #endif /* AVCODEC_V4L2_M2M_H */ diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index ab07c0a24a..cec98cc16a 100644 +index ab07c0a24a..e7fd8980e5 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -21,8 +21,14 @@ @@ -53406,7 +53470,7 @@ index ab07c0a24a..cec98cc16a 100644 return 0; } -@@ -133,58 +343,768 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) +@@ -133,46 +343,822 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) return 0; } @@ -53625,27 +53689,27 @@ index ab07c0a24a..cec98cc16a 100644 + av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); return ret; + } - } - -- if (s->draining) -- goto dequeue; ++ } ++ + if (s->draining) { + if (s->buf_pkt.size) { + av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n"); + av_packet_unref(&s->buf_pkt); + } + return NQ_DRAINING; -+ } -+ + } + +- if (s->draining) +- goto dequeue; + if (!s->buf_pkt.size) + return NQ_NONE; -+ -+ if ((ret = check_output_streamon(avctx, s)) != 0) -+ return ret; - ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt); - if (ret < 0 && ret != AVERROR(EAGAIN)) - goto fail; ++ if ((ret = check_output_streamon(avctx, s)) != 0) ++ return ret; ++ + if (s->extdata_sent) + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); + else @@ -53972,10 +54036,9 @@ index ab07c0a24a..cec98cc16a 100644 +}; + +static int -+check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) ++check_size(AVCodecContext * const avctx, V4L2m2mContext * const s, const uint32_t fcc) +{ + unsigned int i; -+ const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format); + const uint32_t w = avctx->coded_width; + const uint32_t h = avctx->coded_height; + @@ -54103,11 +54166,13 @@ index ab07c0a24a..cec98cc16a 100644 +#if 
CONFIG_H264_DECODER + case AV_CODEC_ID_H264: + { -+ H264ParamSets ps = {{NULL}}; ++ H264ParamSets ps; + int is_avc = 0; + int nal_length_size = 0; + int ret; + ++ memset(&ps, 0, sizeof(ps)); ++ + ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size, + &ps, &is_avc, &nal_length_size, + avctx->err_recognition, avctx); @@ -54133,12 +54198,15 @@ index ab07c0a24a..cec98cc16a 100644 +#if CONFIG_HEVC_DECODER + case AV_CODEC_ID_HEVC: + { -+ HEVCParamSets ps = {{NULL}}; -+ HEVCSEI sei = {{{{0}}}}; ++ HEVCParamSets ps; ++ HEVCSEI sei; + int is_nalff = 0; + int nal_length_size = 0; + int ret; + ++ memset(&ps, 0, sizeof(ps)); ++ memset(&sei, 0, sizeof(sei)); ++ + ret = ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size, + &ps, &sei, &is_nalff, &nal_length_size, + avctx->err_recognition, 0, avctx); @@ -54165,14 +54233,92 @@ index ab07c0a24a..cec98cc16a 100644 + default: + break; + } ++} ++ ++static int ++choose_capture_format(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ const V4L2m2mPriv * const priv = avctx->priv_data; ++ unsigned int fmts_n; ++ uint32_t *fmts = ff_v4l2_context_enum_drm_formats(&s->capture, &fmts_n); ++ enum AVPixelFormat *fmts2 = NULL; ++ enum AVPixelFormat t; ++ enum AVPixelFormat gf_pix_fmt; ++ unsigned int i; ++ unsigned int n = 0; ++ unsigned int pref_n = 1; ++ int rv = AVERROR(ENOENT); ++ ++ if (!fmts) ++ return AVERROR(ENOENT); ++ ++ if ((fmts2 = av_malloc(sizeof(*fmts2) * (fmts_n + 2))) == NULL) { ++ rv = AVERROR(ENOMEM); ++ goto error; ++ } ++ ++ // Filter for formats that are supported by ffmpeg and ++ // can accomodate the stream size ++ fmts2[n++] = AV_PIX_FMT_DRM_PRIME; ++ for (i = 0; i != fmts_n; ++i) { ++ const enum AVPixelFormat f = ff_v4l2_format_v4l2_to_avfmt(fmts[i], AV_CODEC_ID_RAWVIDEO); ++ if (f == AV_PIX_FMT_NONE) ++ continue; ++ ++ if (check_size(avctx, s, fmts[i]) != 0) ++ continue; ++ ++ if (f == priv->pix_fmt) ++ pref_n = n; ++ fmts2[n++] = f; ++ } ++ fmts2[n] = AV_PIX_FMT_NONE; ++ ++ if (n < 2) { ++ av_log(avctx, AV_LOG_DEBUG, "%s: No usable formats found\n", __func__); ++ goto error; ++ } ++ ++ // Put preferred s/w format at the end - ff_get_format will put it in sw_pix_fmt ++ t = fmts2[n - 1]; ++ fmts2[n - 1] = fmts2[pref_n]; ++ fmts2[pref_n] = t; ++ ++ gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); ++ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", ++ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), ++ avctx->coded_width, avctx->coded_height, ++ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); ++ ++ if (gf_pix_fmt == AV_PIX_FMT_NONE) ++ goto error; ++ ++ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { ++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; ++ s->capture.av_pix_fmt = avctx->sw_pix_fmt; ++ s->output_drm = 1; ++ } ++ else { ++ avctx->pix_fmt = gf_pix_fmt; ++ s->capture.av_pix_fmt = gf_pix_fmt; ++ s->output_drm = 0; ++ } ++ ++ // Get format converts capture.av_pix_fmt back into a V4L2 format in the context ++ if ((rv = ff_v4l2_context_get_format(&s->capture, 0)) != 0) ++ goto error; ++ rv = ff_v4l2_context_set_format(&s->capture); ++ ++error: ++ av_free(fmts2); ++ av_free(fmts); ++ return rv; +} static av_cold int v4l2_decode_init(AVCodecContext *avctx) { - V4L2Context *capture, *output; - V4L2m2mContext *s; +@@ -181,10 +1167,27 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) V4L2m2mPriv *priv = avctx->priv_data; -+ int gf_pix_fmt; int ret; + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); @@ 
-54199,7 +54345,7 @@ index ab07c0a24a..cec98cc16a 100644 capture = &s->capture; output = &s->output; -@@ -192,14 +1112,65 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -192,14 +1195,45 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) * by the v4l2 driver; this event will trigger a full pipeline reconfig and * the proper values will be retrieved from the kernel driver. */ @@ -54218,28 +54364,8 @@ index ab07c0a24a..cec98cc16a 100644 capture->av_pix_fmt = avctx->pix_fmt; + capture->min_buf_size = 0; + -+ /* the client requests the codec to generate DRM frames: -+ * - data[0] will therefore point to the returned AVDRMFrameDescriptor -+ * check the ff_v4l2_buffer_to_avframe conversion function. -+ * - the DRM frame format is passed in the DRM frame descriptor layer. -+ * check the v4l2_get_drm_frame function. -+ */ -+ -+ avctx->sw_pix_fmt = avctx->pix_fmt; -+ gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); -+ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", -+ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), -+ avctx->coded_width, avctx->coded_height, -+ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); -+ -+ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { -+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; -+ s->output_drm = 1; -+ } -+ else { -+ capture->av_pix_fmt = gf_pix_fmt; -+ s->output_drm = 0; -+ } ++ capture->av_pix_fmt = AV_PIX_FMT_NONE; ++ s->output_drm = 0; + + s->db_ctl = NULL; + if (priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) { @@ -54267,7 +54393,7 @@ index ab07c0a24a..cec98cc16a 100644 s->avctx = avctx; ret = ff_v4l2_m2m_codec_init(priv); -@@ -208,12 +1179,88 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -208,12 +1242,90 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) return ret; } @@ -54281,19 +54407,21 @@ index ab07c0a24a..cec98cc16a 100644 + return ret; + } + -+ if ((ret = v4l2_prepare_decoder(s)) < 0) -+ return ret; -+ + if ((ret = get_quirks(avctx, s)) != 0) + return ret; + -+ if ((ret = check_size(avctx, s)) != 0) -+ return ret; -+ + if ((ret = check_profile(avctx, s)) != 0) { + av_log(avctx, AV_LOG_WARNING, "Profile %d not supported by decode\n", avctx->profile); + return ret; + } ++ ++ // Size check done as part of format filtering ++ if ((ret = choose_capture_format(avctx, s)) != 0) ++ return ret; ++ ++ if ((ret = v4l2_prepare_decoder(s)) < 0) ++ return ret; ++ + return 0; } @@ -54358,7 +54486,7 @@ index ab07c0a24a..cec98cc16a 100644 } #define OFFSET(x) offsetof(V4L2m2mPriv, x) -@@ -222,10 +1269,17 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) +@@ -222,10 +1334,17 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) static const AVOption options[] = { V4L_M2M_DEFAULT_OPTS, { "num_capture_buffers", "Number of buffers in the capture context", @@ -54377,7 +54505,7 @@ index ab07c0a24a..cec98cc16a 100644 #define M2MDEC_CLASS(NAME) \ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ .class_name = #NAME "_v4l2m2m_decoder", \ -@@ -246,9 +1300,15 @@ static const AVOption options[] = { +@@ -246,9 +1365,15 @@ static const AVOption options[] = { .init = v4l2_decode_init, \ .receive_frame = v4l2_receive_frame, \ .close = v4l2_decode_close, \ @@ -54972,10 +55100,10 @@ index 0000000000..af7bbe1de4 + diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c new file mode 100644 -index 0000000000..cfa94d55c4 +index 0000000000..ee8527ba1f --- /dev/null +++ 
b/libavcodec/v4l2_req_devscan.c -@@ -0,0 +1,449 @@ +@@ -0,0 +1,451 @@ +#include +#include +#include @@ -55415,12 +55543,14 @@ index 0000000000..cfa94d55c4 + } + + udev_enumerate_unref(enumerate); ++ udev_unref(udev); + + *pscan = scan; + return 0; + +fail: -+ udev_unref(udev); ++ if (udev) ++ udev_unref(udev); + devscan_delete(&scan); + return ret; +} @@ -60261,10 +60391,10 @@ index 75db62b1b4..e192b431be 100644 void ff_vc1dsp_init(VC1DSPContext* c); diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c new file mode 100644 -index 0000000000..f234a985b9 +index 0000000000..5a79e89ed7 --- /dev/null +++ b/libavcodec/weak_link.c -@@ -0,0 +1,102 @@ +@@ -0,0 +1,103 @@ +#include +#include +#include @@ -60286,6 +60416,7 @@ index 0000000000..f234a985b9 + struct ff_weak_link_master * w = malloc(sizeof(*w)); + if (!w) + return NULL; ++ atomic_init(&w->ref_count, 0); + w->ptr = p; + if (pthread_rwlock_init(&w->lock, NULL)) { + free(w); @@ -62464,6 +62595,941 @@ index b2c254ea67..144fbda652 100644 OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \ opencl/unsharp.o +diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile +index b58daa3a3f..b68209bc94 100644 +--- a/libavfilter/aarch64/Makefile ++++ b/libavfilter/aarch64/Makefile +@@ -1,3 +1,5 @@ ++OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_init_aarch64.o + OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o + ++NEON-OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_neon.o + NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o +diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +new file mode 100644 +index 0000000000..f52bc4b9b4 +--- /dev/null ++++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c +@@ -0,0 +1,125 @@ ++/* ++ * bwdif aarch64 NEON optimisations ++ * ++ * Copyright (c) 2023 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/common.h" ++#include "libavfilter/bwdif.h" ++#include "libavutil/aarch64/cpu.h" ++ ++void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat); ++ ++void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max); ++ ++void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max); ++ ++void ff_bwdif_filter_line3_neon(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max); ++ ++ ++static void filter_line3_helper(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max) ++{ ++ // Asm works on 16 byte chunks ++ // If w is a multiple of 16 then all is good - if not then if width rounded ++ // up to nearest 16 will fit in both src & dst strides then allow the asm ++ // to write over the padding bytes as that is almost certainly faster than ++ // having to invoke the C version to clean up the tail. ++ const int w1 = FFALIGN(w, 16); ++ const int w0 = clip_max != 255 ? 0 : ++ d_stride <= w1 && s_stride <= w1 ? w : w & ~15; ++ ++ ff_bwdif_filter_line3_neon(dst1, d_stride, ++ prev1, cur1, next1, s_stride, ++ w0, parity, clip_max); ++ ++ if (w0 < w) ++ ff_bwdif_filter_line3_c((char *)dst1 + w0, d_stride, ++ (const char *)prev1 + w0, (const char *)cur1 + w0, (const char *)next1 + w0, s_stride, ++ w - w0, parity, clip_max); ++} ++ ++static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max) ++{ ++ const int w0 = clip_max != 255 ? 0 : w & ~15; ++ ++ ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1, ++ w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); ++ ++ if (w0 < w) ++ ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, ++ w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); ++} ++ ++static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat) ++{ ++ const int w0 = clip_max != 255 ? 0 : w & ~15; ++ ++ ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2, ++ parity, clip_max, spat); ++ ++ if (w0 < w) ++ ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, ++ w - w0, prefs, mrefs, prefs2, mrefs2, ++ parity, clip_max, spat); ++} ++ ++static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max) ++{ ++ const int w0 = clip_max != 255 ? 
0 : w & ~15; ++ ++ ff_bwdif_filter_intra_neon(dst1, cur1, w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max); ++ ++ if (w0 < w) ++ ff_bwdif_filter_intra_c((char *)dst1 + w0, (char *)cur1 + w0, ++ w - w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max); ++} ++ ++void ++ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) ++{ ++ const int cpu_flags = av_get_cpu_flags(); ++ ++ if (bit_depth != 8) ++ return; ++ ++ if (!have_neon(cpu_flags)) ++ return; ++ ++ s->filter_intra = filter_intra_helper; ++ s->filter_line = filter_line_helper; ++ s->filter_edge = filter_edge_helper; ++ s->filter_line3 = filter_line3_helper; ++} ++ +diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S +new file mode 100644 +index 0000000000..ae9aab20cd +--- /dev/null ++++ b/libavfilter/aarch64/vf_bwdif_neon.S +@@ -0,0 +1,788 @@ ++/* ++ * bwdif aarch64 NEON optimisations ++ * ++ * Copyright (c) 2023 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++#include "libavutil/aarch64/asm.S" ++ ++// Space taken on the stack by an int (32-bit) ++#ifdef __APPLE__ ++.set SP_INT, 4 ++#else ++.set SP_INT, 8 ++#endif ++ ++.macro SQSHRUNN b, s0, s1, s2, s3, n ++ sqshrun \s0\().4h, \s0\().4s, #\n - 8 ++ sqshrun2 \s0\().8h, \s1\().4s, #\n - 8 ++ sqshrun \s1\().4h, \s2\().4s, #\n - 8 ++ sqshrun2 \s1\().8h, \s3\().4s, #\n - 8 ++ uzp2 \b\().16b, \s0\().16b, \s1\().16b ++.endm ++ ++.macro SMULL4K a0, a1, a2, a3, s0, s1, k ++ smull \a0\().4s, \s0\().4h, \k ++ smull2 \a1\().4s, \s0\().8h, \k ++ smull \a2\().4s, \s1\().4h, \k ++ smull2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++.macro UMULL4K a0, a1, a2, a3, s0, s1, k ++ umull \a0\().4s, \s0\().4h, \k ++ umull2 \a1\().4s, \s0\().8h, \k ++ umull \a2\().4s, \s1\().4h, \k ++ umull2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++.macro UMLAL4K a0, a1, a2, a3, s0, s1, k ++ umlal \a0\().4s, \s0\().4h, \k ++ umlal2 \a1\().4s, \s0\().8h, \k ++ umlal \a2\().4s, \s1\().4h, \k ++ umlal2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++.macro UMLSL4K a0, a1, a2, a3, s0, s1, k ++ umlsl \a0\().4s, \s0\().4h, \k ++ umlsl2 \a1\().4s, \s0\().8h, \k ++ umlsl \a2\().4s, \s1\().4h, \k ++ umlsl2 \a3\().4s, \s1\().8h, \k ++.endm ++ ++// int b = m2s1 - m1; ++// int f = p2s1 - p1; ++// int dc = c0s1 - m1; ++// int de = c0s1 - p1; ++// int sp_max = FFMIN(p1 - c0s1, m1 - c0s1); ++// sp_max = FFMIN(sp_max, FFMAX(-b,-f)); ++// int sp_min = FFMIN(c0s1 - p1, c0s1 - m1); ++// sp_min = FFMIN(sp_min, FFMAX(b,f)); ++// diff = diff == 0 ? 
0 : FFMAX3(diff, sp_min, sp_max); ++.macro SPAT_CHECK diff, m2s1, m1, c0s1, p1, p2s1, t0, t1, t2, t3 ++ uqsub \t0\().16b, \p1\().16b, \c0s1\().16b ++ uqsub \t2\().16b, \m1\().16b, \c0s1\().16b ++ umin \t2\().16b, \t0\().16b, \t2\().16b ++ ++ uqsub \t1\().16b, \m1\().16b, \m2s1\().16b ++ uqsub \t3\().16b, \p1\().16b, \p2s1\().16b ++ umax \t3\().16b, \t3\().16b, \t1\().16b ++ umin \t3\().16b, \t3\().16b, \t2\().16b ++ ++ uqsub \t0\().16b, \c0s1\().16b, \p1\().16b ++ uqsub \t2\().16b, \c0s1\().16b, \m1\().16b ++ umin \t2\().16b, \t0\().16b, \t2\().16b ++ ++ uqsub \t1\().16b, \m2s1\().16b, \m1\().16b ++ uqsub \t0\().16b, \p2s1\().16b, \p1\().16b ++ umax \t0\().16b, \t0\().16b, \t1\().16b ++ umin \t2\().16b, \t2\().16b, \t0\().16b ++ ++ cmeq \t1\().16b, \diff\().16b, #0 ++ umax \diff\().16b, \diff\().16b, \t3\().16b ++ umax \diff\().16b, \diff\().16b, \t2\().16b ++ bic \diff\().16b, \diff\().16b, \t1\().16b ++.endm ++ ++// i0 = s0; ++// if (i0 > d0 + diff0) ++// i0 = d0 + diff0; ++// else if (i0 < d0 - diff0) ++// i0 = d0 - diff0; ++// ++// i0 = s0 is safe ++.macro DIFF_CLIP i0, s0, d0, diff, t0, t1 ++ uqadd \t0\().16b, \d0\().16b, \diff\().16b ++ uqsub \t1\().16b, \d0\().16b, \diff\().16b ++ umin \i0\().16b, \s0\().16b, \t0\().16b ++ umax \i0\().16b, \i0\().16b, \t1\().16b ++.endm ++ ++// i0 = FFABS(m1 - p1) > td0 ? i1 : i2; ++// DIFF_CLIP ++// ++// i0 = i1 is safe ++.macro INTERPOL i0, i1, i2, m1, d0, p1, td0, diff, t0, t1, t2 ++ uabd \t0\().16b, \m1\().16b, \p1\().16b ++ cmhi \t0\().16b, \t0\().16b, \td0\().16b ++ bsl \t0\().16b, \i1\().16b, \i2\().16b ++ DIFF_CLIP \i0, \t0, \d0, \diff, \t1, \t2 ++.endm ++ ++.macro PUSH_VREGS ++ stp d8, d9, [sp, #-64]! ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++.endm ++ ++.macro POP_VREGS ++ ldp d14, d15, [sp, #48] ++ ldp d12, d13, [sp, #32] ++ ldp d10, d11, [sp, #16] ++ ldp d8, d9, [sp], #64 ++.endm ++ ++.macro LDR_COEFFS d, t0 ++ movrel \t0, coeffs, 0 ++ ld1 {\d\().8h}, [\t0] ++.endm ++ ++// static const uint16_t coef_lf[2] = { 4309, 213 }; ++// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 }; ++// static const uint16_t coef_sp[2] = { 5077, 981 }; ++ ++const coeffs, align=4 // align 4 means align on 2^4 boundry ++ .hword 4309 * 4, 213 * 4 // lf[0]*4 = v0.h[0] ++ .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] ++ .hword 5077, 981 // sp[0] = v0.h[6] ++endconst ++ ++// =========================================================================== ++// ++// void ff_bwdif_filter_line3_neon( ++// void * dst1, // x0 ++// int d_stride, // w1 ++// const void * prev1, // x2 ++// const void * cur1, // x3 ++// const void * next1, // x4 ++// int s_stride, // w5 ++// int w, // w6 ++// int parity, // w7 ++// int clip_max); // [sp, #0] (Ignored) ++ ++function ff_bwdif_filter_line3_neon, export=1 ++ // Sanity check w ++ cmp w6, #0 ++ ble 99f ++ ++ LDR_COEFFS v0, x17 ++ ++// #define prev2 cur ++// const uint8_t * restrict next2 = parity ? 
prev : next; ++ cmp w7, #0 ++ csel x17, x2, x4, ne ++ ++ // We want all the V registers - save all the ones we must ++ PUSH_VREGS ++ ++ // Some rearrangement of initial values for nice layout of refs in regs ++ mov w10, w6 // w10 = loop count ++ neg w9, w5 // w9 = mref ++ lsl w8, w9, #1 // w8 = mref2 ++ add w7, w9, w9, LSL #1 // w7 = mref3 ++ lsl w6, w9, #2 // w6 = mref4 ++ mov w11, w5 // w11 = pref ++ lsl w12, w5, #1 // w12 = pref2 ++ add w13, w5, w5, LSL #1 // w13 = pref3 ++ lsl w14, w5, #2 // w14 = pref4 ++ add w15, w5, w5, LSL #2 // w15 = pref5 ++ add w16, w14, w12 // w16 = pref6 ++ ++ lsl w5, w1, #1 // w5 = d_stride * 2 ++ ++// for (x = 0; x < w; x++) { ++// int diff0, diff2; ++// int d0, d2; ++// int temporal_diff0, temporal_diff2; ++// ++// int i1, i2; ++// int j1, j2; ++// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4; ++ ++10: ++// c0 = prev2[0] + next2[0]; // c0 = v20, v21 ++// d0 = c0 >> 1; // d0 = v10 ++// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11 ++ ldr q31, [x3] ++ ldr q21, [x17] ++ uhadd v10.16b, v31.16b, v21.16b ++ uabd v11.16b, v31.16b, v21.16b ++ uaddl v20.8h, v21.8b, v31.8b ++ uaddl2 v21.8h, v21.16b, v31.16b ++ ++ ldr q31, [x3, w6, sxtw] ++ ldr q23, [x17, w6, sxtw] ++ ++// i1 = coef_hf[0] * c0; // i1 = v2-v5 ++ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2] ++ ++ ldr q30, [x3, w14, sxtw] ++ ldr q25, [x17, w14, sxtw] ++ ++// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23 ++ uaddl v22.8h, v23.8b, v31.8b ++ uaddl2 v23.8h, v23.16b, v31.16b ++ ++// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12 ++ uhadd v12.16b, v25.16b, v30.16b ++ uaddl v24.8h, v25.8b, v30.8b ++ uaddl2 v25.8h, v25.16b, v30.16b ++ ++// j1 = -coef_hf[1] * (c0 + p4); // j1 = v6-v9 (-c0:v20,v21) ++ add v20.8h, v20.8h, v24.8h ++ add v21.8h, v21.8h, v25.8h ++ SMULL4K v6, v7, v8, v9, v20, v21, v0.h[5] ++ ++// m3 = cur[mrefs3]; // m3 = v20 ++ ldr q20, [x3, w7, sxtw] ++ ++// p3 = cur[prefs3]; // p3 = v21 ++ ldr q21, [x3, w13, sxtw] ++ ++// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25) ++ add v22.8h, v22.8h, v24.8h ++ add v23.8h, v23.8h, v25.8h ++ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4] ++ ++ ldr q29, [x3, w8, sxtw] ++ ldr q23, [x17, w8, sxtw] ++ ++// i1 -= coef_lf[1] * 4 * (m3 + p3); // - ++ uaddl v30.8h, v20.8b, v21.8b ++ uaddl2 v31.8h, v20.16b, v21.16b ++ ++ ldr q28, [x3, w16, sxtw] ++ ldr q25, [x17, w16, sxtw] ++ ++ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1] ++ ++// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13 ++ uhadd v13.16b, v23.16b, v29.16b ++ uaddl v22.8h, v23.8b, v29.8b ++ uaddl2 v23.8h, v23.16b, v29.16b ++ ++ ldr q31, [x3, w12, sxtw] ++ ldr q27, [x17, w12, sxtw] ++ ++// p6 = prev2[prefs6] + next2[prefs6]; // p6 = v24,v25 ++ uaddl v24.8h, v25.8b, v28.8b ++ uaddl2 v25.8h, v25.16b, v28.16b ++ ++// j1 += coef_hf[2] * (m2 + p6); // (-p6:v24,v25) ++ add v24.8h, v24.8h, v22.8h ++ add v25.8h, v25.8h, v23.8h ++ UMLAL4K v6, v7, v8, v9, v24, v25, v0.h[4] ++ ++// m1 = cur[mrefs]; // m1 = v24 ++ ldr q24, [x3, w9, sxtw] ++ ++// p5 = cur[prefs5]; // p5 = v25 ++ ldr q25, [x3, w15, sxtw] ++ ++// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27 ++// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14 ++// d2 = p2 >> 1; // d2 = v15 ++ uabd v14.16b, v31.16b, v27.16b ++ uhadd v15.16b, v31.16b, v27.16b ++ uaddl v26.8h, v27.8b, v31.8b ++ uaddl2 v27.8h, v27.16b, v31.16b ++ ++// j1 += coef_hf[0] * p2; // - ++ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[2] ++ ++// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*) ++ add 
v22.8h, v22.8h, v26.8h ++ add v23.8h, v23.8h, v27.8h ++ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3] ++ ++// p1 = cur[prefs]; // p1 = v22 ++ ldr q22, [x3, w11, sxtw] ++ ++// j1 -= coef_lf[1] * 4 * (m1 + p5); // - ++ uaddl v26.8h, v24.8b, v25.8b ++ uaddl2 v27.8h, v24.16b, v25.16b ++ UMLSL4K v6, v7, v8, v9, v26, v27, v0.h[1] ++ ++// j2 = (coef_sp[0] * (p1 + p3) - coef_sp[1] * (m1 + p5)) >> 13; // (-p5:v25*) j2=v16 ++ uaddl v18.8h, v22.8b, v21.8b ++ uaddl2 v19.8h, v22.16b, v21.16b ++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] ++ ++ uaddl v18.8h, v24.8b, v25.8b ++ uaddl2 v19.8h, v24.16b, v25.16b ++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] ++ ++ SQSHRUNN v16, v28, v29, v30, v31, 13 ++ ++// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17 ++ uaddl v18.8h, v22.8b, v24.8b ++ uaddl2 v19.8h, v22.16b, v24.16b ++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] ++ ++ uaddl v18.8h, v20.8b, v21.8b ++ uaddl2 v19.8h, v20.16b, v21.16b ++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] ++ ++ SQSHRUNN v17, v28, v29, v30, v31, 13 ++ ++// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24 ++ uaddl v26.8h, v24.8b, v22.8b ++ uaddl2 v27.8h, v24.16b, v22.16b ++ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0] ++ ++ ldr q31, [x2, w9, sxtw] ++ ldr q29, [x4, w9, sxtw] ++ ++// j1 += coef_lf[0] * 4 * (p1 + p3); // p1 = v22, p3 = v21 ++ uaddl v26.8h, v21.8b, v22.8b ++ uaddl2 v27.8h, v21.16b, v22.16b ++ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[0] ++ ++ ldr q30, [x2, w11, sxtw] ++ ldr q28, [x4, w11, sxtw] ++ ++// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5* ++ SQSHRUNN v2, v2, v3, v4, v5, 15 ++ ++// j1 >>= 15; // j1 = v3, -v6*, -v7*, -v8*, -v9* ++ SQSHRUNN v3, v6, v7, v8, v9, 15 ++ ++// { ++// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; ++// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; ++ uabd v30.16b, v22.16b, v30.16b ++ uabd v31.16b, v24.16b, v31.16b ++ uabd v28.16b, v22.16b, v28.16b ++ uabd v29.16b, v24.16b, v29.16b ++ uhadd v31.16b, v31.16b, v30.16b ++ uhadd v29.16b, v29.16b, v28.16b ++ ++ ldr q27, [x2, w13, sxtw] ++ ldr q26, [x4, w13, sxtw] ++ ++// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18 ++ ushr v18.16b, v11.16b, #1 ++ umax v18.16b, v18.16b, v31.16b ++ umax v18.16b, v18.16b, v29.16b ++// } // v28, v30 preserved for next block ++// { // tdiff2 = v14 ++// int t1 =(FFABS(prev[prefs] - p1) + FFABS(prev[prefs3] - p3)) >> 1; ++// int t2 =(FFABS(next[prefs] - p1) + FFABS(next[prefs3] - p3)) >> 1; ++ uabd v31.16b, v21.16b, v27.16b ++ uabd v29.16b, v21.16b, v26.16b ++ uhadd v31.16b, v31.16b, v30.16b ++ uhadd v29.16b, v29.16b, v28.16b ++ ++// diff2 = FFMAX3(temporal_diff2 >> 1, t1, t2); // diff2=v19 ++ ushr v19.16b, v14.16b, #1 ++ umax v19.16b, v19.16b, v31.16b ++ umax v19.16b, v19.16b, v29.16b ++// } ++ ++ // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15 ++ SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28 ++ ++ // diff2 = v19, d0 = v10, p1 = v22, d2 = v15, p3 = v21, (p4 >> 1) = v12 ++ SPAT_CHECK v19, v10, v22, v15, v21, v12, v31, v30, v29, v28 ++ ++ // j1 = v3, j2 = v16, p1 = v22, d2 = v15, p3 = v21, td2 = v14, diff2 = v19 ++ INTERPOL v3, v3, v16, v22, v15, v21, v14, v19, v31, v30, v29 ++ ++// dst[d_stride * 2] = av_clip_uint8(interpol); ++ str q3, [x0, w5, sxtw] ++ ++// dst[d_stride] = p1; ++ str q22, [x0, w1, sxtw] ++ ++ // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18 ++ INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29 ++ ++// dst[0] = av_clip_uint8(interpol); ++ str 
q2, [x0], #16 ++// } ++// ++// dst++; ++// cur++; ++// prev++; ++// prev2++; ++// next++; ++// } ++ subs w10, w10, #16 ++ add x2, x2, #16 ++ add x3, x3, #16 ++ add x4, x4, #16 ++ add x17, x17, #16 ++ bgt 10b ++ ++ POP_VREGS ++99: ++ ret ++endfunc ++ ++// =========================================================================== ++// ++// void filter_line( ++// void *dst1, // x0 ++// void *prev1, // x1 ++// void *cur1, // x2 ++// void *next1, // x3 ++// int w, // w4 ++// int prefs, // w5 ++// int mrefs, // w6 ++// int prefs2, // w7 ++// int mrefs2, // [sp, #0] ++// int prefs3, // [sp, #SP_INT] ++// int mrefs3, // [sp, #SP_INT*2] ++// int prefs4, // [sp, #SP_INT*3] ++// int mrefs4, // [sp, #SP_INT*4] ++// int parity, // [sp, #SP_INT*5] ++// int clip_max) // [sp, #SP_INT*6] ++ ++function ff_bwdif_filter_line_neon, export=1 ++ // Sanity check w ++ cmp w4, #0 ++ ble 99f ++ ++ // Rearrange regs to be the same as line3 for ease of debug! ++ mov w10, w4 // w10 = loop count ++ mov w9, w6 // w9 = mref ++ mov w12, w7 // w12 = pref2 ++ mov w11, w5 // w11 = pref ++ ldr w8, [sp, #0] // w8 = mref2 ++ ldr w7, [sp, #SP_INT*2] // w7 = mref3 ++ ldr w6, [sp, #SP_INT*4] // w6 = mref4 ++ ldr w13, [sp, #SP_INT] // w13 = pref3 ++ ldr w14, [sp, #SP_INT*3] // w14 = pref4 ++ ++ mov x4, x3 ++ mov x3, x2 ++ mov x2, x1 ++ ++ LDR_COEFFS v0, x17 ++ ++// #define prev2 cur ++// const uint8_t * restrict next2 = parity ? prev : next; ++ ldr w17, [sp, #SP_INT*5] // parity ++ cmp w17, #0 ++ csel x17, x2, x4, ne ++ ++ PUSH_VREGS ++ ++// for (x = 0; x < w; x++) { ++// int diff0, diff2; ++// int d0, d2; ++// int temporal_diff0, temporal_diff2; ++// ++// int i1, i2; ++// int j1, j2; ++// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4; ++ ++10: ++// c0 = prev2[0] + next2[0]; // c0 = v20, v21 ++// d0 = c0 >> 1; // d0 = v10 ++// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11 ++ ldr q31, [x3] ++ ldr q21, [x17] ++ uhadd v10.16b, v31.16b, v21.16b ++ uabd v11.16b, v31.16b, v21.16b ++ uaddl v20.8h, v21.8b, v31.8b ++ uaddl2 v21.8h, v21.16b, v31.16b ++ ++ ldr q31, [x3, w6, sxtw] ++ ldr q23, [x17, w6, sxtw] ++ ++// i1 = coef_hf[0] * c0; // i1 = v2-v5 ++ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2] ++ ++ ldr q30, [x3, w14, sxtw] ++ ldr q25, [x17, w14, sxtw] ++ ++// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23 ++ uaddl v22.8h, v23.8b, v31.8b ++ uaddl2 v23.8h, v23.16b, v31.16b ++ ++// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12 ++ uhadd v12.16b, v25.16b, v30.16b ++ uaddl v24.8h, v25.8b, v30.8b ++ uaddl2 v25.8h, v25.16b, v30.16b ++ ++// m3 = cur[mrefs3]; // m3 = v20 ++ ldr q20, [x3, w7, sxtw] ++ ++// p3 = cur[prefs3]; // p3 = v21 ++ ldr q21, [x3, w13, sxtw] ++ ++// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25) ++ add v22.8h, v22.8h, v24.8h ++ add v23.8h, v23.8h, v25.8h ++ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4] ++ ++ ldr q29, [x3, w8, sxtw] ++ ldr q23, [x17, w8, sxtw] ++ ++// i1 -= coef_lf[1] * 4 * (m3 + p3); // - ++ uaddl v30.8h, v20.8b, v21.8b ++ uaddl2 v31.8h, v20.16b, v21.16b ++ ++ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1] ++ ++ ldr q31, [x3, w12, sxtw] ++ ldr q27, [x17, w12, sxtw] ++ ++// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13 ++ uhadd v13.16b, v23.16b, v29.16b ++ uaddl v22.8h, v23.8b, v29.8b ++ uaddl2 v23.8h, v23.16b, v29.16b ++ ++// m1 = cur[mrefs]; // m1 = v24 ++ ldr q24, [x3, w9, sxtw] ++ ++// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27 ++// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14 ++// d2 = p2 >> 1; // d2 = v15 ++ 
uabd v14.16b, v31.16b, v27.16b ++ uhadd v15.16b, v31.16b, v27.16b ++ uaddl v26.8h, v27.8b, v31.8b ++ uaddl2 v27.8h, v27.16b, v31.16b ++ ++// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*) ++ add v22.8h, v22.8h, v26.8h ++ add v23.8h, v23.8h, v27.8h ++ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3] ++ ++// p1 = cur[prefs]; // p1 = v22 ++ ldr q22, [x3, w11, sxtw] ++ ++// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17 ++ uaddl v18.8h, v22.8b, v24.8b ++ uaddl2 v19.8h, v22.16b, v24.16b ++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] ++ ++ uaddl v18.8h, v20.8b, v21.8b ++ uaddl2 v19.8h, v20.16b, v21.16b ++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] ++ ++ SQSHRUNN v17, v28, v29, v30, v31, 13 ++ ++// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24 ++ uaddl v26.8h, v24.8b, v22.8b ++ uaddl2 v27.8h, v24.16b, v22.16b ++ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0] ++ ++ ldr q31, [x2, w9, sxtw] ++ ldr q29, [x4, w9, sxtw] ++ ++ ldr q30, [x2, w11, sxtw] ++ ldr q28, [x4, w11, sxtw] ++ ++// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5* ++ SQSHRUNN v2, v2, v3, v4, v5, 15 ++ ++// { ++// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; ++// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; ++ uabd v30.16b, v22.16b, v30.16b ++ uabd v31.16b, v24.16b, v31.16b ++ uabd v28.16b, v22.16b, v28.16b ++ uabd v29.16b, v24.16b, v29.16b ++ uhadd v31.16b, v31.16b, v30.16b ++ uhadd v29.16b, v29.16b, v28.16b ++ ++// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18 ++ ushr v18.16b, v11.16b, #1 ++ umax v18.16b, v18.16b, v31.16b ++ umax v18.16b, v18.16b, v29.16b ++ ++ // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15 ++ SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28 ++ ++ // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18 ++ INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29 ++ ++// dst[0] = av_clip_uint8(interpol); ++ str q2, [x0], #16 ++// } ++// ++// dst++; ++// cur++; ++// prev++; ++// prev2++; ++// next++; ++// } ++ ++ subs w10, w10, #16 ++ add x2, x2, #16 ++ add x3, x3, #16 ++ add x4, x4, #16 ++ add x17, x17, #16 ++ bgt 10b ++ ++ POP_VREGS ++99: ++ ret ++endfunc ++ ++// ============================================================================ ++// ++// void ff_bwdif_filter_edge_neon( ++// void *dst1, // x0 ++// void *prev1, // x1 ++// void *cur1, // x2 ++// void *next1, // x3 ++// int w, // w4 ++// int prefs, // w5 ++// int mrefs, // w6 ++// int prefs2, // w7 ++// int mrefs2, // [sp, #0] ++// int parity, // [sp, #SP_INT] ++// int clip_max, // [sp, #SP_INT*2] unused ++// int spat); // [sp, #SP_INT*3] ++ ++function ff_bwdif_filter_edge_neon, export=1 ++ // Sanity check w ++ cmp w4, #0 ++ ble 99f ++ ++// #define prev2 cur ++// const uint8_t * restrict next2 = parity ? 
prev : next; ++ ++ ldr w8, [sp, #0] // mrefs2 ++ ++ ldr w17, [sp, #SP_INT] // parity ++ ldr w16, [sp, #SP_INT*3] // spat ++ cmp w17, #0 ++ csel x17, x1, x3, ne ++ ++// for (x = 0; x < w; x++) { ++ ++10: ++// int m1 = cur[mrefs]; ++// int d = (prev2[0] + next2[0]) >> 1; ++// int p1 = cur[prefs]; ++// int temporal_diff0 = FFABS(prev2[0] - next2[0]); ++// int temporal_diff1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; ++// int temporal_diff2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; ++// int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); ++ ldr q31, [x2] ++ ldr q21, [x17] ++ uhadd v16.16b, v31.16b, v21.16b // d0 = v16 ++ uabd v17.16b, v31.16b, v21.16b // td0 = v17 ++ ldr q24, [x2, w6, sxtw] // m1 = v24 ++ ldr q22, [x2, w5, sxtw] // p1 = v22 ++ ++ ldr q0, [x1, w6, sxtw] // prev[mrefs] ++ ldr q2, [x1, w5, sxtw] // prev[prefs] ++ ldr q1, [x3, w6, sxtw] // next[mrefs] ++ ldr q3, [x3, w5, sxtw] // next[prefs] ++ ++ ushr v29.16b, v17.16b, #1 ++ ++ uabd v31.16b, v0.16b, v24.16b ++ uabd v30.16b, v2.16b, v22.16b ++ uhadd v0.16b, v31.16b, v30.16b // td1 = q0 ++ ++ uabd v31.16b, v1.16b, v24.16b ++ uabd v30.16b, v3.16b, v22.16b ++ uhadd v1.16b, v31.16b, v30.16b // td2 = q1 ++ ++ umax v0.16b, v0.16b, v29.16b ++ umax v0.16b, v0.16b, v1.16b // diff = v0 ++ ++// if (spat) { ++// SPAT_CHECK() ++// } ++// i0 = (m1 + p1) >> 1; ++ cbz w16, 1f ++ ++ ldr q31, [x2, w8, sxtw] ++ ldr q18, [x17, w8, sxtw] ++ ldr q30, [x2, w7, sxtw] ++ ldr q19, [x17, w7, sxtw] ++ uhadd v18.16b, v18.16b, v31.16b ++ uhadd v19.16b, v19.16b, v30.16b ++ ++ SPAT_CHECK v0, v18, v24, v16, v22, v19, v31, v30, v29, v28 ++ ++1: ++ uhadd v2.16b, v22.16b, v24.16b ++ ++ // i0 = v2, s0 = v2, d0 = v16, diff = v0, t0 = v31, t1 = v30 ++ DIFF_CLIP v2, v2, v16, v0, v31, v30 ++ ++// dst[0] = av_clip(interpol, 0, clip_max); ++ str q2, [x0], #16 ++ ++// dst++; ++// cur++; ++// } ++ subs w4, w4, #16 ++ add x1, x1, #16 ++ add x2, x2, #16 ++ add x3, x3, #16 ++ add x17, x17, #16 ++ bgt 10b ++ ++99: ++ ret ++endfunc ++ ++// ============================================================================ ++// ++// void ff_bwdif_filter_intra_neon( ++// void *dst1, // x0 ++// void *cur1, // x1 ++// int w, // w2 ++// int prefs, // w3 ++// int mrefs, // w4 ++// int prefs3, // w5 ++// int mrefs3, // w6 ++// int parity, // w7 unused ++// int clip_max) // [sp, #0] unused ++ ++function ff_bwdif_filter_intra_neon, export=1 ++ cmp w2, #0 ++ ble 99f ++ ++ LDR_COEFFS v0, x17 ++ ++// for (x = 0; x < w; x++) { ++10: ++ ++// interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; ++ ldr q31, [x1, w4, sxtw] ++ ldr q30, [x1, w3, sxtw] ++ ldr q29, [x1, w6, sxtw] ++ ldr q28, [x1, w5, sxtw] ++ ++ uaddl v20.8h, v31.8b, v30.8b ++ uaddl2 v21.8h, v31.16b, v30.16b ++ ++ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[6] ++ ++ uaddl v20.8h, v29.8b, v28.8b ++ uaddl2 v21.8h, v29.16b, v28.16b ++ ++ UMLSL4K v2, v3, v4, v5, v20, v21, v0.h[7] ++ ++// dst[0] = av_clip(interpol, 0, clip_max); ++ SQSHRUNN v2, v2, v3, v4, v5, 13 ++ str q2, [x0], #16 ++ ++// dst++; ++// cur++; ++// } ++ ++ subs w2, w2, #16 ++ add x1, x1, #16 ++ bgt 10b ++ ++99: ++ ret ++endfunc diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 0872c6e0f2..1dd05e4d75 100644 --- a/libavfilter/allfilters.c @@ -62696,108 +63762,194 @@ index da1cf9941e..c588ed23cb 100644 frame->format, frame->pts); break; case AVMEDIA_TYPE_AUDIO: +diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h +index 889ff772ed..496cec72ef 100644 +--- 
a/libavfilter/bwdif.h ++++ b/libavfilter/bwdif.h +@@ -35,8 +35,29 @@ typedef struct BWDIFContext { + void (*filter_edge)(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat); ++ void (*filter_line3)(void *dst, int dstride, ++ const void *prev, const void *cur, const void *next, int prefs, ++ int w, int parity, int clip_max); + } BWDIFContext; + +-void ff_bwdif_init_x86(BWDIFContext *bwdif); ++void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth); ++void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth); ++void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth); ++ ++void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat); ++ ++void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max); ++ ++void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max); ++ ++void ff_bwdif_filter_line3_c(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max); + + #endif /* AVFILTER_BWDIF_H */ diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c -index b6aed7a450..92e26d54bc 100644 +index b6aed7a450..b268113271 100644 --- a/libavfilter/vf_bwdif.c +++ b/libavfilter/vf_bwdif.c -@@ -75,10 +75,10 @@ typedef struct ThreadData { - int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e)) >> 1; \ - int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e)) >> 1; \ - int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \ -- \ -+ {/*\ - if (!diff) { \ - dst[0] = d; \ -- } else { -+ } else {*/ - - #define SPAT_CHECK() \ - int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c; \ -@@ -90,15 +90,16 @@ typedef struct ThreadData { - diff = FFMAX3(diff, min, -max); - - #define FILTER_LINE() \ -+ int i1, i2; \ - SPAT_CHECK() \ -- if (FFABS(c - e) > temporal_diff0) { \ -- interpol = (((coef_hf[0] * (prev2[0] + next2[0]) \ -+ /*if (FFABS(c - e) > temporal_diff0)*/ { \ -+ i1 = (((coef_hf[0] * (prev2[0] + next2[0]) \ - - coef_hf[1] * (prev2[mrefs2] + next2[mrefs2] + prev2[prefs2] + next2[prefs2]) \ - + coef_hf[2] * (prev2[mrefs4] + next2[mrefs4] + prev2[prefs4] + next2[prefs4])) >> 2) \ - + coef_lf[0] * (c + e) - coef_lf[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ -- } else { \ -- interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ -- } -+ } /*else*/ { \ -+ i2 = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ -+ }interpol = FFABS(c - e) > temporal_diff0 ? i1:i2;\ - - #define FILTER_EDGE() \ - if (spat) { \ -@@ -112,7 +113,7 @@ typedef struct ThreadData { - else if (interpol < d - diff) \ - interpol = d - diff; \ - \ -- dst[0] = av_clip(interpol, 0, clip_max); \ -+ dst[0] = !diff ? 
d : av_clip(interpol, 0, clip_max); \ - } \ - \ - dst++; \ -@@ -123,7 +124,7 @@ typedef struct ThreadData { +@@ -123,8 +123,8 @@ typedef struct ThreadData { next2++; \ } -static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, -+static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, - int prefs3, int mrefs3, int parity, int clip_max) +- int prefs3, int mrefs3, int parity, int clip_max) ++void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, ++ int prefs3, int mrefs3, int parity, int clip_max) { uint8_t *dst = dst1; -@@ -133,7 +134,7 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, + uint8_t *cur = cur1; +@@ -133,10 +133,10 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, FILTER_INTRA() } -static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, -+static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int prefs3, int mrefs3, int prefs4, int mrefs4, - int parity, int clip_max) -@@ -151,7 +152,7 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, +- int w, int prefs, int mrefs, int prefs2, int mrefs2, +- int prefs3, int mrefs3, int prefs4, int mrefs4, +- int parity, int clip_max) ++void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int prefs3, int mrefs3, int prefs4, int mrefs4, ++ int parity, int clip_max) + { + uint8_t *dst = dst1; + uint8_t *prev = prev1; +@@ -151,9 +151,34 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, FILTER2() } -static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, -+static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int parity, int clip_max, int spat) +- int w, int prefs, int mrefs, int prefs2, int mrefs2, +- int parity, int clip_max, int spat) ++#define NEXT_LINE()\ ++ dst += d_stride; \ ++ prev += prefs; \ ++ cur += prefs; \ ++ next += prefs; ++ ++void ff_bwdif_filter_line3_c(void * dst1, int d_stride, ++ const void * prev1, const void * cur1, const void * next1, int s_stride, ++ int w, int parity, int clip_max) ++{ ++ const int prefs = s_stride; ++ uint8_t * dst = dst1; ++ const uint8_t * prev = prev1; ++ const uint8_t * cur = cur1; ++ const uint8_t * next = next1; ++ ++ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w, ++ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max); ++ NEXT_LINE(); ++ memcpy(dst, cur, w); ++ NEXT_LINE(); ++ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w, ++ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max); ++} ++ ++void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1, ++ int w, int prefs, int mrefs, int prefs2, int mrefs2, ++ int parity, int clip_max, int spat) { -@@ -168,7 +169,7 @@ static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, + uint8_t *dst = dst1; + uint8_t *prev = prev1; +@@ -213,6 +238,13 @@ static void filter_edge_16bit(void *dst1, void *prev1, void 
*cur1, void *next1, FILTER2() } --static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mrefs, -+static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, - int prefs3, int mrefs3, int parity, int clip_max) ++// Round job start line down to multiple of 4 so that if filter_line3 exists ++// and the frame is a multiple of 4 high then filter_line will never be called ++static inline int job_start(const int jobnr, const int nb_jobs, const int h) ++{ ++ return jobnr >= nb_jobs ? h : ((h * jobnr) / nb_jobs) & ~3; ++} ++ + static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) { - uint16_t *dst = dst1; -@@ -178,7 +179,7 @@ static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mre - FILTER_INTRA() + BWDIFContext *s = ctx->priv; +@@ -222,8 +254,8 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) + int clip_max = (1 << (yadif->csp->comp[td->plane].depth)) - 1; + int df = (yadif->csp->comp[td->plane].depth + 7) / 8; + int refs = linesize / df; +- int slice_start = (td->h * jobnr ) / nb_jobs; +- int slice_end = (td->h * (jobnr+1)) / nb_jobs; ++ int slice_start = job_start(jobnr, nb_jobs, td->h); ++ int slice_end = job_start(jobnr + 1, nb_jobs, td->h); + int y; + + for (y = slice_start; y < slice_end; y++) { +@@ -245,6 +277,11 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) + refs << 1, -(refs << 1), + td->parity ^ td->tff, clip_max, + (y < 2) || ((y + 3) > td->h) ? 0 : 1); ++ } else if (s->filter_line3 && y + 2 < slice_end && y + 6 < td->h) { ++ s->filter_line3(dst, td->frame->linesize[td->plane], ++ prev, cur, next, linesize, td->w, ++ td->parity ^ td->tff, clip_max); ++ y += 2; + } else { + s->filter_line(dst, prev, cur, next, td->w, + refs, -refs, refs << 1, -(refs << 1), +@@ -280,7 +317,8 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic, + td.h = h; + td.plane = i; + +- ctx->internal->execute(ctx, filter_slice, &td, NULL, FFMIN(h, ff_filter_get_nb_threads(ctx))); ++ ctx->internal->execute(ctx, filter_slice, &td, NULL, ++ FFMIN((h+3)/4, ff_filter_get_nb_threads(ctx))); + } + if (yadif->current_field == YADIF_FIELD_END) { + yadif->current_field = YADIF_FIELD_NORMAL; +@@ -350,20 +388,29 @@ static int config_props(AVFilterLink *link) + + yadif->csp = av_pix_fmt_desc_get(link->format); + yadif->filter = filter; +- if (yadif->csp->comp[0].depth > 8) { ++ ff_bwdif_init_filter_line(s, yadif->csp->comp[0].depth); ++ ++ return 0; ++} ++ ++av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth) ++{ ++ s->filter_line3 = 0; ++ if (bit_depth > 8) { + s->filter_intra = filter_intra_16bit; + s->filter_line = filter_line_c_16bit; + s->filter_edge = filter_edge_16bit; + } else { +- s->filter_intra = filter_intra; +- s->filter_line = filter_line_c; +- s->filter_edge = filter_edge; ++ s->filter_intra = ff_bwdif_filter_intra_c; ++ s->filter_line = ff_bwdif_filter_line_c; ++ s->filter_edge = ff_bwdif_filter_edge_c; + } + +- if (ARCH_X86) +- ff_bwdif_init_x86(s); +- +- return 0; ++#if ARCH_X86 ++ ff_bwdif_init_x86(s, bit_depth); ++#elif ARCH_AARCH64 ++ ff_bwdif_init_aarch64(s, bit_depth); ++#endif } --static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1, -+static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, - int w, int prefs, int 
mrefs, int prefs2, int mrefs2, - int prefs3, int mrefs3, int prefs4, int mrefs4, - int parity, int clip_max) -@@ -196,7 +197,7 @@ static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1 - FILTER2() - } --static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1, -+static void __attribute__((optimize("tree-vectorize"))) filter_edge_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, - int w, int prefs, int mrefs, int prefs2, int mrefs2, - int parity, int clip_max, int spat) - { diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c new file mode 100644 index 0000000000..d4c11cfc51 @@ -65154,6 +66306,23 @@ index 0000000000..61c03a385c + .outputs = avfilter_vf_unsand_outputs, +}; + +diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c +index b1e70b3bc6..b9e3a25921 100644 +--- a/libavfilter/x86/vf_bwdif_init.c ++++ b/libavfilter/x86/vf_bwdif_init.c +@@ -51,11 +51,9 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne + int mrefs2, int prefs3, int mrefs3, int prefs4, + int mrefs4, int parity, int clip_max); + +-av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif) ++av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth) + { +- YADIFContext *yadif = &bwdif->yadif; + int cpu_flags = av_get_cpu_flags(); +- int bit_depth = (!yadif->csp) ? 8 : yadif->csp->comp[0].depth; + + if (bit_depth <= 8) { + #if ARCH_X86_32 diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c index b4284a8778..692265593c 100644 --- a/libavformat/matroskaenc.c @@ -65422,10 +66591,10 @@ index 5613813ba8..ab8bcfcf34 100644 + diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S new file mode 100644 -index 0000000000..2f07d9674c +index 0000000000..11658de0c8 --- /dev/null +++ b/libavutil/aarch64/rpi_sand_neon.S -@@ -0,0 +1,781 @@ +@@ -0,0 +1,672 @@ +/* +Copyright (c) 2021 Michael Eiler + @@ -65676,199 +66845,191 @@ index 0000000000..2f07d9674c + ret +endfunc + -+//void ff_rpi_sand30_lines_to_planar_c16( -+// uint8_t * dst_u, // [x0] -+// unsigned int dst_stride_u, // [w1] == _w*2 -+// uint8_t * dst_v, // [x2] -+// unsigned int dst_stride_v, // [w3] == _w*2 -+// const uint8_t * src, // [x4] -+// unsigned int stride1, // [w5] == 128 -+// unsigned int stride2, // [w6] -+// unsigned int _x, // [w7] == 0 -+// unsigned int y, // [sp, #0] == 0 -+// unsigned int _w, // [sp, #8] -> w3 -+// unsigned int h); // [sp, #16] -> w7 ++// Unzip chroma ++// ++// On entry: ++// a0 = V0, U2, ... ++// a1 = U0, V1, ... ++// a2 = U1, V2, ... ++// b0 = V8, U10, ... ++// b1 = U8, V9, ... ++// b2 = U9, V10, ... ++// ++// On exit: ++// d0 = U0, U3, ... ++// ... ++// a0 = V0, V3, .. ++// ... ++// ++// Reg order for USAND is a1, a0, a2 (i.e. 
swap natural order of 1st 2 dest regs) + -+.macro rpi_sand30_lines_to_planar_c16_block_half -+ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 ++.macro UZPH_C d0, d1, d2, a0, a1, a2, b0, b1, b2 ++ uzp1 \d0\().8h, \a1\().8h, \b1\().8h ++ uzp1 \d1\().8h, \a2\().8h, \b2\().8h ++ uzp2 \d2\().8h, \a0\().8h, \b0\().8h + -+ xtn v4.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v5.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v6.4h, v0.4s -+ xtn2 v4.8h, v1.4s -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v5.8h, v1.4s -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v6.8h, v1.4s -+ and v4.16b, v4.16b, v16.16b -+ and v5.16b, v5.16b, v16.16b -+ and v6.16b, v6.16b, v16.16b -+ st3 { v4.8h, v5.8h, v6.8h }, [sp], #48 -+ -+ xtn v4.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v5.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v6.4h, v2.4s -+ xtn2 v4.8h, v3.4s -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v5.8h, v3.4s -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v6.8h, v3.4s -+ and v4.16b, v4.16b, v16.16b -+ and v5.16b, v5.16b, v16.16b -+ and v6.16b, v6.16b, v16.16b -+ st3 { v4.8h, v5.8h, v6.8h }, [sp] -+ sub sp, sp, #48 ++ uzp1 \a0\().8h, \a0\().8h, \b0\().8h ++ uzp2 \a1\().8h, \a1\().8h, \b1\().8h ++ uzp2 \a2\().8h, \a2\().8h, \b2\().8h +.endm + ++// SAND30 -> 10bit ++.macro USAND10 d0, d1, d2, a0, a1 ++ shrn \d2\().4h, \a0\().4s, #14 ++ shrn \d1\().4h, \a0\().4s, #10 ++ ++ shrn2 \d2\().8h, \a1\().4s, #14 ++ shrn2 \d1\().8h, \a1\().4s, #10 ++ uzp1 \d0\().8h, \a0\().8h, \a1\().8h ++ ++ ushr \d2\().8h, \d2\().8h, #6 ++ bic \d0\().8h, #0xfc, lsl #8 ++ bic \d1\().8h, #0xfc, lsl #8 ++.endm ++ ++// SAND30 -> 8bit ++.macro USAND8 d0, d1, d2, a0, a1, a2, a3, t0, t1, t2 ++ shrn \d1\().4h, \a0\().4s, #12 ++ shrn2 \d1\().8h, \a1\().4s, #12 ++ uzp1 \d0\().8h, \a0\().8h, \a1\().8h ++ uzp2 \d2\().8h, \a0\().8h, \a1\().8h ++ ++ shrn \t1\().4h, \a2\().4s, #12 ++ shrn2 \t1\().8h, \a3\().4s, #12 ++ uzp1 \t0\().8h, \a2\().8h, \a3\().8h ++ uzp2 \t2\().8h, \a2\().8h, \a3\().8h ++ ++ shrn \d0\().8b, \d0\().8h, #2 ++ shrn2 \d0\().16b, \t0\().8h, #2 ++ shrn \d2\().8b, \d2\().8h, #6 ++ shrn2 \d2\().16b, \t2\().8h, #6 ++ uzp1 \d1\().16b, \d1\().16b, \t1\().16b ++.endm ++ ++ ++// void ff_rpi_sand30_lines_to_planar_c16( ++// uint8_t * dst_u, // [x0] ++// unsigned int dst_stride_u, // [w1] ++// uint8_t * dst_v, // [x2] ++// unsigned int dst_stride_v, // [w3] ++// const uint8_t * src, // [x4] ++// unsigned int stride1, // [w5] 128 ++// unsigned int stride2, // [w6] ++// unsigned int _x, // [w7] 0 ++// unsigned int y, // [sp, #0] ++// unsigned int _w, // [sp, #8] w9 ++// unsigned int h); // [sp, #16] w10 ++ +function ff_rpi_sand30_lines_to_planar_c16, export=1 -+ stp x19, x20, [sp, #-48]! 
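The USAND10/USAND8 macros above unpack the SAND30 packing, in which each little-endian 32-bit word carries three 10-bit samples in bits 0-9, 10-19 and 20-29 (the pack30() helper added to checkasm later in this patch builds test words the same way). As a hedged scalar reference of what one word unpacks to, not code from the tree:

#include <stdint.h>

/* One SAND30 word -> three 10-bit samples, lowest-addressed sample first. */
static inline void sand30_unpack10(uint32_t w, uint16_t out[3])
{
    out[0] =  w        & 0x3ff;
    out[1] = (w >> 10) & 0x3ff;
    out[2] = (w >> 20) & 0x3ff;
}

/* 8-bit variant: USAND8 simply drops the two low bits of each sample. */
static inline void sand30_unpack8(uint32_t w, uint8_t out[3])
{
    out[0] = (uint8_t)(( w        & 0x3ff) >> 2);
    out[1] = (uint8_t)(((w >> 10) & 0x3ff) >> 2);
    out[2] = (uint8_t)(((w >> 20) & 0x3ff) >> 2);
}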
-+ stp x21, x22, [sp, #16] -+ stp x23, x24, [sp, #32] ++ ldr w7, [sp, #0] // y ++ ldr w8, [sp, #8] // _w ++ ldr w10, [sp, #16] // h ++ lsl w6, w6, #7 // Fixup stride2 ++ sub w6, w6, #64 ++ uxtw x6, w6 ++ sub w1, w1, w8, LSL #1 // Fixup chroma strides ++ sub w3, w3, w8, LSL #1 ++ lsl w7, w7, #7 // Add y to src ++ add x4, x4, w7, UXTW ++10: ++ mov w13, #0 ++ mov x5, x4 ++ mov w9, w8 ++1: ++ ld1 {v0.4s-v3.4s}, [x5], #64 ++ ld1 {v4.4s-v7.4s}, [x5], x6 ++ subs w9, w9, #48 + -+ ldr w3, [sp, #48+8] // w3 = width -+ ldr w7, [sp, #48+16] // w7 = height ++ USAND10 v17, v16, v18, v0, v1 ++ USAND10 v20, v19, v21, v2, v3 ++ UZPH_C v0, v1, v2, v16, v17, v18, v19, v20, v21 ++ USAND10 v23, v22, v24, v4, v5 ++ USAND10 v26, v25, v27, v6, v7 ++ UZPH_C v4, v5, v6, v22, v23, v24, v25, v26, v27 + -+ // reserve space on the stack for intermediate results -+ sub sp, sp, #256 ++ blt 2f + -+ // number of 128byte blocks per row, w8 = width / 48 -+ mov w9, #48 -+ udiv w8, w3, w9 ++ st3 {v0.8h-v2.8h}, [x0], #48 ++ st3 {v4.8h-v6.8h}, [x0], #48 ++ st3 {v16.8h-v18.8h}, [x2], #48 ++ st3 {v22.8h-v24.8h}, [x2], #48 + -+ // remaining pixels (rem_pix) per row, w9 = width - w8 * 48 -+ mul w9, w8, w9 -+ sub w9, w3, w9 ++ bne 1b ++11: ++ subs w10, w10, #1 ++ add x4, x4, #128 ++ add x0, x0, w1, UXTW ++ add x2, x2, w3, UXTW ++ bne 10b ++99: ++ ret + -+ // row offset, the beginning of the next row to process -+ eor w10, w10, w10 -+ -+ // offset to the beginning of the next block, w11 = stride2 * 128 - 128 -+ lsl w11, w6, #7 -+ sub w11, w11, #128 -+ -+ // decrease the height by one and in case of remaining pixels increase the block count by one -+ sub w7, w7, #1 -+ cmp w9, #0 -+ cset w19, ne // w19 == 1 iff reamining pixels != 0 -+ add w8, w8, w19 -+ -+ // bytes we have to move dst back by at the end of every row -+ mov w21, #48*2 -+ mul w21, w21, w8 -+ sub w21, w1, w21 -+ -+ mov w20, #0 // w20 = flag, last row processed -+ -+ mov x12, #0x03ff03ff03ff03ff -+ dup v16.2d, x12 -+ -+ // iterate through rows, row counter = w12 = 0 -+ eor w12, w12, w12 -+row_loop_c16: -+ cmp w12, w7 -+ bge row_loop_c16_fin -+ -+ // address of row data = src + row_offset -+ mov x13, x4 -+ add x13, x13, x10 -+ -+ eor w14, w14, w14 -+block_loop_c16: -+ cmp w14, w8 -+ bge block_loop_c16_fin -+ -+ rpi_sand30_lines_to_planar_c16_block_half -+ -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp] -+ sub sp, sp, #64 -+ -+ st1 { v0.8h }, [x0], #16 -+ st1 { v2.8h }, [x0], #16 -+ st1 { v4.8h }, [x0], #16 -+ st1 { v1.8h }, [x2], #16 -+ st1 { v3.8h }, [x2], #16 -+ st1 { v5.8h }, [x2], #16 -+ -+ rpi_sand30_lines_to_planar_c16_block_half -+ -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp] -+ sub sp, sp, #64 -+ -+ st1 { v0.8h }, [x0], #16 -+ st1 { v2.8h }, [x0], #16 -+ st1 { v4.8h }, [x0], #16 -+ st1 { v1.8h }, [x2], #16 -+ st1 { v3.8h }, [x2], #16 -+ st1 { v5.8h }, [x2], #16 -+ -+ add x13, x13, x11 // offset to next block -+ add w14, w14, #1 -+ b block_loop_c16 -+block_loop_c16_fin: -+ -+ add w10, w10, #128 -+ add w12, w12, #1 -+ add x0, x0, w21, sxtw // move dst pointers back by x21 -+ add x2, x2, w21, sxtw -+ b row_loop_c16 -+row_loop_c16_fin: -+ -+ cmp w20, #1 -+ beq row_loop_c16_fin2 -+ mov w20, #1 -+ sub w8, w8, w19 // decrease block count by w19 -+ add w7, w7, #1 // increase height -+ b row_loop_c16 -+ -+row_loop_c16_fin2: -+ sub x0, x0, w21, sxtw // readd x21 in case of the last row -+ sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels -+ -+ // 
last incomplete block to be finished -+ // read operations are fine, stride2 is more than large enough even if rem_pix is 0 -+ rpi_sand30_lines_to_planar_c16_block_half -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp], #32 -+ rpi_sand30_lines_to_planar_c16_block_half -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp] -+ sub sp, sp, #160 -+ -+ mov x4, sp -+ eor w20, w20, w20 -+rem_pix_c16_loop: -+ cmp w20, w9 -+ bge rem_pix_c16_fin -+ -+ ldr w22, [x4], #4 -+ str w22, [x0], #2 -+ lsr w22, w22, #16 -+ str w22, [x2], #2 -+ -+ add w20, w20, #1 -+ b rem_pix_c16_loop -+rem_pix_c16_fin: -+ -+ add sp, sp, #256 -+ -+ ldp x23, x24, [sp, #32] -+ ldp x21, x22, [sp, #16] -+ ldp x19, x20, [sp], #48 -+ ret ++// Partial final write ++2: ++ cmp w9, #24-48 ++ blt 1f ++ st3 {v0.8h - v2.8h}, [x0], #48 ++ st3 {v16.8h - v18.8h}, [x2], #48 ++ beq 11b ++ mov v0.16b, v4.16b ++ mov v1.16b, v5.16b ++ sub w9, w9, #24 ++ mov v2.16b, v6.16b ++ mov v16.16b, v22.16b ++ mov v17.16b, v23.16b ++ mov v18.16b, v24.16b ++1: ++ cmp w9, #12-48 ++ blt 1f ++ st3 {v0.4h - v2.4h}, [x0], #24 ++ st3 {v16.4h - v18.4h}, [x2], #24 ++ beq 11b ++ mov v0.2d[0], v0.2d[1] ++ sub w9, w9, #12 ++ mov v1.2d[0], v1.2d[1] ++ mov v2.2d[0], v2.2d[1] ++ mov v16.2d[0], v16.2d[1] ++ mov v17.2d[0], v17.2d[1] ++ mov v18.2d[0], v18.2d[1] ++1: ++ cmp w9, #6-48 ++ blt 1f ++ st3 {v0.h - v2.h}[0], [x0], #6 ++ st3 {v0.h - v2.h}[1], [x0], #6 ++ st3 {v16.h - v18.h}[0], [x2], #6 ++ st3 {v16.h - v18.h}[1], [x2], #6 ++ beq 11b ++ mov v0.s[0], v0.s[1] ++ sub w9, w9, #6 ++ mov v1.s[0], v1.s[1] ++ mov v2.s[0], v2.s[1] ++ mov v16.s[0], v16.s[1] ++ mov v17.s[0], v17.s[1] ++ mov v18.s[0], v18.s[1] ++1: ++ cmp w9, #3-48 ++ blt 1f ++ st3 {v0.h - v2.h}[0], [x0], #6 ++ st3 {v16.h - v18.h}[0], [x2], #6 ++ beq 11b ++ mov v0.h[0], v0.h[1] ++ sub w9, w9, #3 ++ mov v1.h[0], v1.h[1] ++ mov v16.h[0], v16.h[1] ++ mov v17.h[0], v17.h[1] ++1: ++ cmp w9, #2-48 ++ blt 1f ++ st2 {v0.h - v1.h}[0], [x0], #4 ++ st2 {v16.h - v17.h}[0], [x2], #4 ++ b 11b ++1: ++ st1 {v0.h}[0], [x0], #2 ++ st1 {v16.h}[0], [x2], #2 ++ b 11b +endfunc + + -+ +//void ff_rpi_sand30_lines_to_planar_p010( +// uint8_t * dest, +// unsigned int dst_stride, @@ -65897,6 +67058,7 @@ index 0000000000..2f07d9674c +function ff_rpi_sand30_lines_to_planar_y16, export=1 + lsl w4, w4, #7 + sub w4, w4, #64 ++ uxtw x4, w4 + sub w1, w1, w7, lsl #1 + uxtw x6, w6 + add x8, x2, x6, lsl #7 @@ -65911,61 +67073,10 @@ index 0000000000..2f07d9674c + + subs w5, w5, #96 + -+ // v0, v1 -+ -+ shrn v18.4h, v0.4s, #14 -+ xtn v16.4h, v0.4s -+ shrn v17.4h, v0.4s, #10 -+ -+ shrn2 v18.8h, v1.4s, #14 -+ xtn2 v16.8h, v1.4s -+ shrn2 v17.8h, v1.4s, #10 -+ -+ ushr v18.8h, v18.8h, #6 -+ bic v16.8h, #0xfc, lsl #8 -+ bic v17.8h, #0xfc, lsl #8 -+ -+ // v2, v3 -+ -+ shrn v21.4h, v2.4s, #14 -+ xtn v19.4h, v2.4s -+ shrn v20.4h, v2.4s, #10 -+ -+ shrn2 v21.8h, v3.4s, #14 -+ xtn2 v19.8h, v3.4s -+ shrn2 v20.8h, v3.4s, #10 -+ -+ ushr v21.8h, v21.8h, #6 -+ bic v19.8h, #0xfc, lsl #8 -+ bic v20.8h, #0xfc, lsl #8 -+ -+ // v4, v5 -+ -+ shrn v24.4h, v4.4s, #14 -+ xtn v22.4h, v4.4s -+ shrn v23.4h, v4.4s, #10 -+ -+ shrn2 v24.8h, v5.4s, #14 -+ xtn2 v22.8h, v5.4s -+ shrn2 v23.8h, v5.4s, #10 -+ -+ ushr v24.8h, v24.8h, #6 -+ bic v22.8h, #0xfc, lsl #8 -+ bic v23.8h, #0xfc, lsl #8 -+ -+ // v6, v7 -+ -+ shrn v27.4h, v6.4s, #14 -+ xtn v25.4h, v6.4s -+ shrn v26.4h, v6.4s, #10 -+ -+ shrn2 v27.8h, v7.4s, #14 -+ xtn2 v25.8h, v7.4s -+ shrn2 v26.8h, v7.4s, #10 -+ -+ ushr v27.8h, v27.8h, #6 -+ 
bic v25.8h, #0xfc, lsl #8 -+ bic v26.8h, #0xfc, lsl #8 ++ USAND10 v16, v17, v18, v0, v1 ++ USAND10 v19, v20, v21, v2, v3 ++ USAND10 v22, v23, v24, v4, v5 ++ USAND10 v25, v26, v27, v6, v7 + + blt 2f + @@ -66062,6 +67173,7 @@ index 0000000000..2f07d9674c +function ff_rpi_sand30_lines_to_planar_y8, export=1 + lsl w4, w4, #7 + sub w4, w4, #64 ++ uxtw x4, w4 + sub w1, w1, w7 + uxtw x6, w6 + add x8, x2, x6, lsl #7 @@ -66077,60 +67189,8 @@ index 0000000000..2f07d9674c + subs w5, w5, #96 + + // v0, v1 -+ -+ shrn v18.4h, v0.4s, #16 -+ xtn v16.4h, v0.4s -+ shrn v17.4h, v0.4s, #12 -+ -+ shrn2 v18.8h, v1.4s, #16 -+ xtn2 v16.8h, v1.4s -+ shrn2 v17.8h, v1.4s, #12 -+ -+ shrn v18.8b, v18.8h, #6 -+ shrn v16.8b, v16.8h, #2 -+ xtn v17.8b, v17.8h -+ -+ // v2, v3 -+ -+ shrn v21.4h, v2.4s, #16 -+ xtn v19.4h, v2.4s -+ shrn v20.4h, v2.4s, #12 -+ -+ shrn2 v21.8h, v3.4s, #16 -+ xtn2 v19.8h, v3.4s -+ shrn2 v20.8h, v3.4s, #12 -+ -+ shrn2 v18.16b, v21.8h, #6 -+ shrn2 v16.16b, v19.8h, #2 -+ xtn2 v17.16b, v20.8h -+ -+ // v4, v5 -+ -+ shrn v24.4h, v4.4s, #16 -+ xtn v22.4h, v4.4s -+ shrn v23.4h, v4.4s, #12 -+ -+ shrn2 v24.8h, v5.4s, #16 -+ xtn2 v22.8h, v5.4s -+ shrn2 v23.8h, v5.4s, #12 -+ -+ shrn v21.8b, v24.8h, #6 -+ shrn v19.8b, v22.8h, #2 -+ xtn v20.8b, v23.8h -+ -+ // v6, v7 -+ -+ shrn v27.4h, v6.4s, #16 -+ xtn v25.4h, v6.4s -+ shrn v26.4h, v6.4s, #12 -+ -+ shrn2 v27.8h, v7.4s, #16 -+ xtn2 v25.8h, v7.4s -+ shrn2 v26.8h, v7.4s, #12 -+ -+ shrn2 v21.16b, v27.8h, #6 -+ shrn2 v19.16b, v25.8h, #2 -+ xtn2 v20.16b, v26.8h ++ USAND8 v16, v17, v18, v0, v1, v2, v3, v22, v23, v24 ++ USAND8 v19, v20, v21, v4, v5, v6, v7, v22, v23, v24 + + blt 2f + @@ -67885,10 +68945,10 @@ index 0000000000..0d5d203dc3 + diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c new file mode 100644 -index 0000000000..b6071e2928 +index 0000000000..0626bb06cb --- /dev/null +++ b/libavutil/rpi_sand_fns.c -@@ -0,0 +1,445 @@ +@@ -0,0 +1,447 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. 
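Both the NEON converters above and the C fallbacks in rpi_sand_fns.c walk the frame one 128-byte-wide vertical stripe at a time; slice_inc there is the word distance from the right-hand edge of one stripe back to the left-hand edge of the next. A hedged sketch of the addressing this implies, assuming the usual SAND layout of stripes stride1 bytes wide and stride2 rows tall stored back to back (the function name is illustrative, not from the tree):

#include <stddef.h>

/* Byte offset of pixel (x, y) in a SAND plane, x in bytes, y in lines.
 * stride1 is the stripe width in bytes (128 here), stride2 the stripe
 * height in lines. Consistent with the patch's
 *   slice_inc = ((stride2 - 1) * stride1) >> 2
 * step from the end of a row segment to the start of the next stripe. */
static inline size_t sand_offset(unsigned int x, unsigned int y,
                                 unsigned int stride1, unsigned int stride2)
{
    const unsigned int stripe = x / stride1;
    return (size_t)stripe * stride1 * stride2
           + (size_t)y * stride1
           + (x % stride1);
}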
@@ -67926,10 +68986,12 @@ index 0000000000..b6071e2928 +#include "frame.h" + +#if ARCH_ARM && HAVE_NEON -+#include "arm/rpi_sand_neon.h" ++#include "libavutil/arm/cpu.h" ++#include "libavutil/arm/rpi_sand_neon.h" +#define HAVE_SAND_ASM 1 +#elif ARCH_AARCH64 && HAVE_NEON -+#include "aarch64/rpi_sand_neon.h" ++#include "libavutil/aarch64/cpu.h" ++#include "libavutil/aarch64/rpi_sand_neon.h" +#define HAVE_SAND_ASM 1 +#else +#define HAVE_SAND_ASM 0 @@ -67988,7 +69050,7 @@ index 0000000000..b6071e2928 + const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words + +#if HAVE_SAND_ASM -+ if (_x == 0) { ++ if (_x == 0 && have_neon(av_get_cpu_flags())) { + ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); + return; + } @@ -68054,7 +69116,7 @@ index 0000000000..b6071e2928 + const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words + +#if HAVE_SAND_ASM -+ if (_x == 0) { ++ if (_x == 0 && have_neon(av_get_cpu_flags())) { + ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v, + src, stride1, stride2, _x, y, _w, h); + return; @@ -71406,7 +72468,7 @@ index 0000000000..5935a11ca5 + do_logparse(args.logfile) + diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile -index 1827a4e134..08da4166ef 100644 +index 1827a4e134..3c765a5eb1 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP) += g722dsp.o @@ -71420,8 +72482,27 @@ index 1827a4e134..08da4166ef 100644 AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o +@@ -35,6 +37,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) + # libavfilter tests + AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o + AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o ++AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o + AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o + AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o + AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o +@@ -52,8 +55,9 @@ CHECKASMOBJS-$(CONFIG_SWSCALE) += $(SWSCALEOBJS) + # libavutil tests + AVUTILOBJS += fixed_dsp.o + AVUTILOBJS += float_dsp.o ++AVUTILOBJS-$(CONFIG_SAND) += rpi_sand.o + +-CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) ++CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) $(AVUTILOBJS-yes) + + CHECKASMOBJS-$(ARCH_AARCH64) += aarch64/checkasm.o + CHECKASMOBJS-$(HAVE_ARMV5TE_EXTERNAL) += arm/checkasm.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c -index 8338e8ff58..81ef182f04 100644 +index 8338e8ff58..c1ee09c72e 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -131,6 +131,9 @@ static const struct { @@ -71444,11 +72525,31 @@ index 8338e8ff58..81ef182f04 100644 #if CONFIG_VP8DSP { "vp8dsp", checkasm_check_vp8dsp }, #endif +@@ -172,6 +178,9 @@ static const struct { + #if CONFIG_BLEND_FILTER + { "vf_blend", checkasm_check_blend }, + #endif ++ #if CONFIG_BWDIF_FILTER ++ { "vf_bwdif", checkasm_check_vf_bwdif }, ++ #endif + #if CONFIG_COLORSPACE_FILTER + { "vf_colorspace", checkasm_check_colorspace }, + #endif +@@ -198,6 +207,9 @@ static const struct { + #if CONFIG_AVUTIL + { "fixed_dsp", checkasm_check_fixed_dsp }, + { "float_dsp", checkasm_check_float_dsp }, ++ #if CONFIG_SAND ++ { "rpi_sand", checkasm_check_rpi_sand }, ++ #endif + #endif + { NULL } + }; diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h -index ef6645e3a2..1a1e17d835 100644 +index ef6645e3a2..02d3642836 100644 --- 
a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h -@@ -70,6 +70,7 @@ void checkasm_check_hevc_epel_bi(void); +@@ -70,12 +70,14 @@ void checkasm_check_hevc_epel_bi(void); void checkasm_check_hevc_epel_bi_w(void); void checkasm_check_hevc_sao(void); void checkasm_check_huffyuvdsp(void); @@ -71456,11 +72557,19 @@ index ef6645e3a2..1a1e17d835 100644 void checkasm_check_jpeg2000dsp(void); void checkasm_check_llviddsp(void); void checkasm_check_llviddspenc(void); -@@ -83,6 +84,7 @@ void checkasm_check_sw_scale(void); + void checkasm_check_nlmeans(void); + void checkasm_check_opusdsp(void); + void checkasm_check_pixblockdsp(void); ++void checkasm_check_rpi_sand(void); + void checkasm_check_sbrdsp(void); + void checkasm_check_synth_filter(void); + void checkasm_check_sw_rgb(void); +@@ -83,6 +85,8 @@ void checkasm_check_sw_scale(void); void checkasm_check_utvideodsp(void); void checkasm_check_v210dec(void); void checkasm_check_v210enc(void); +void checkasm_check_vc1dsp(void); ++void checkasm_check_vf_bwdif(void); void checkasm_check_vf_eq(void); void checkasm_check_vf_gblur(void); void checkasm_check_vf_hflip(void); @@ -71568,6 +72677,130 @@ index 0000000000..02724536a7 + check_add_put_clamped(); + report("idctdsp"); +} +diff --git a/tests/checkasm/rpi_sand.c b/tests/checkasm/rpi_sand.c +new file mode 100644 +index 0000000000..0888714c4c +--- /dev/null ++++ b/tests/checkasm/rpi_sand.c +@@ -0,0 +1,118 @@ ++/* ++ * Copyright (c) 2023 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
++ */ ++ ++#include ++#include "checkasm.h" ++#include "libavutil/common.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#if ARCH_ARM ++#include "libavutil/arm/cpu.h" ++#include "libavutil/arm/rpi_sand_neon.h" ++#elif ARCH_AARCH64 ++#include "libavutil/aarch64/cpu.h" ++#include "libavutil/aarch64/rpi_sand_neon.h" ++#endif ++ ++static inline uint32_t pack30(unsigned int a, unsigned int b, unsigned int c) ++{ ++ return (a & 0x3ff) | ((b & 0x3ff) << 10) | ((c & 0x3ff) << 20); ++} ++ ++void checkasm_check_rpi_sand(void) ++{ ++ const unsigned int w = 1280; ++ const unsigned int h = 66; ++ const unsigned int stride1 = 128; ++ const unsigned int stride2 = h*3/2; ++ const unsigned int ssize = ((w+95)/96)*128*h*3/2; ++ const unsigned int ysize = ((w + 32) * (h + 32) * 2); ++ ++ uint8_t * sbuf0 = malloc(ssize); ++ uint8_t * sbuf1 = malloc(ssize); ++ uint8_t * ybuf0 = malloc(ysize); ++ uint8_t * ybuf1 = malloc(ysize); ++ uint8_t * vbuf0 = malloc(ysize); ++ uint8_t * vbuf1 = malloc(ysize); ++ uint8_t * yframe0 = (w + 32) * 16 + ybuf0; ++ uint8_t * yframe1 = (w + 32) * 16 + ybuf1; ++ uint8_t * vframe0 = (w + 32) * 16 + vbuf0; ++ uint8_t * vframe1 = (w + 32) * 16 + vbuf1; ++ unsigned int i; ++ ++ for (i = 0; i != ssize; i += 4) ++ *(uint32_t*)(sbuf0 + i) = rnd(); ++ memcpy(sbuf1, sbuf0, ssize); ++ ++ if (check_func(have_neon(av_get_cpu_flags()) ? ff_rpi_sand30_lines_to_planar_y16 : av_rpi_sand30_to_planar_y16, "rpi_sand30_to_planar_y16")) { ++ declare_func(void, uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++ memset(ybuf0, 0xbb, ysize); ++ memset(ybuf1, 0xbb, ysize); ++ ++ call_ref(yframe0, (w + 32) * 2, sbuf0, stride1, stride2, 0, 0, w, h); ++ call_new(yframe1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h); ++ ++ if (memcmp(sbuf0, sbuf1, ssize) ++ || memcmp(ybuf0, ybuf1, ysize)) ++ fail(); ++ ++ bench_new(ybuf1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h); ++ } ++ ++ if (check_func(have_neon(av_get_cpu_flags()) ? 
ff_rpi_sand30_lines_to_planar_c16 : av_rpi_sand30_to_planar_c16, "rpi_sand30_to_planar_c16")) { ++ declare_func(void, uint8_t * u_dst, const unsigned int u_stride, ++ uint8_t * v_dst, const unsigned int v_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++ memset(ybuf0, 0xbb, ysize); ++ memset(ybuf1, 0xbb, ysize); ++ memset(vbuf0, 0xbb, ysize); ++ memset(vbuf1, 0xbb, ysize); ++ ++ call_ref(yframe0, (w + 32), vframe0, (w + 32), sbuf0, stride1, stride2, 0, 0, w/2, h/2); ++ call_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2); ++ ++ if (memcmp(sbuf0, sbuf1, ssize) ++ || memcmp(ybuf0, ybuf1, ysize) ++ || memcmp(vbuf0, vbuf1, ysize)) ++ fail(); ++ ++ bench_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2); ++ } ++ ++ ++ report("sand30"); ++ ++ free(sbuf0); ++ free(sbuf1); ++ free(ybuf0); ++ free(ybuf1); ++ free(vbuf0); ++ free(vbuf1); ++} ++ diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c new file mode 100644 index 0000000000..52628d15e4 @@ -72026,11 +73259,273 @@ index 0000000000..52628d15e4 + check_unescape(); + report("unescape_buffer"); +} +diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c +new file mode 100644 +index 0000000000..3399cacdf7 +--- /dev/null ++++ b/tests/checkasm/vf_bwdif.c +@@ -0,0 +1,256 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with FFmpeg; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#include ++#include "checkasm.h" ++#include "libavcodec/internal.h" ++#include "libavfilter/bwdif.h" ++#include "libavutil/mem_internal.h" ++ ++#define WIDTH 256 ++ ++#define randomize_buffers(buf0, buf1, mask, count) \ ++ for (size_t i = 0; i < count; i++) \ ++ buf0[i] = buf1[i] = rnd() & mask ++ ++#define randomize_overflow_check(buf0, buf1, mask, count) \ ++ for (size_t i = 0; i < count; i++) \ ++ buf0[i] = buf1[i] = (rnd() & 1) != 0 ? mask : 0; ++ ++#define BODY(type, depth) \ ++ do { \ ++ type prev0[9*WIDTH], prev1[9*WIDTH]; \ ++ type next0[9*WIDTH], next1[9*WIDTH]; \ ++ type cur0[9*WIDTH], cur1[9*WIDTH]; \ ++ type dst0[WIDTH], dst1[WIDTH]; \ ++ const int stride = WIDTH; \ ++ const int mask = (1<