From 251b6ee13d596867423d7e69dd3836972999e4a4 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Sun, 11 Jun 2023 22:50:30 +0200 Subject: [PATCH] ffmpeg: update rpi patch Patch created using revisions 7e0d640..e3d9763 from branch test/4.4.1/main of https://github.com/jc-kynesim/rpi-ffmpeg --- .../ffmpeg/patches/rpi/ffmpeg-001-rpi.patch | 1507 ++++++++++++++++- 1 file changed, 1444 insertions(+), 63 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch index 998d86ff6c..c835a0de3a 100644 --- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch +++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch @@ -52696,7 +52696,7 @@ index 22a9532444..108fc05a6f 100644 + #endif // AVCODEC_V4L2_CONTEXT_H diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index cdfd579810..a919bdc030 100644 +index cdfd579810..025cf24769 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c @@ -35,6 +35,15 @@ @@ -52725,7 +52725,7 @@ index cdfd579810..a919bdc030 100644 atomic_init(&s->refcount, 0); sem_init(&s->refsync, 0, 0); -@@ -85,12 +96,14 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) +@@ -85,18 +96,58 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) if (v4l2_mplane_video(&cap)) { s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; @@ -52740,7 +52740,63 @@ index cdfd579810..a919bdc030 100644 return 0; } -@@ -215,13 +228,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) + return AVERROR(EINVAL); + } + ++static int check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ struct v4l2_format fmt = {.type = s->output.type}; ++ int rv; ++ uint32_t pixfmt = ff_v4l2_format_avfmt_to_v4l2(avctx->pix_fmt); ++ unsigned int w; ++ unsigned int h; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { ++ fmt.fmt.pix_mp.pixelformat = pixfmt; ++ fmt.fmt.pix_mp.width = avctx->width; ++ fmt.fmt.pix_mp.height = avctx->height; ++ } ++ else { ++ fmt.fmt.pix.pixelformat = pixfmt; ++ fmt.fmt.pix.width = avctx->width; ++ fmt.fmt.pix.height = avctx->height; ++ } ++ ++ rv = ioctl(s->fd, VIDIOC_TRY_FMT, &fmt); ++ ++ if (rv != 0) { ++ rv = AVERROR(errno); ++ av_log(avctx, AV_LOG_ERROR, "%s: Tryfmt failed: %s\n", __func__, av_err2str(rv)); ++ return rv; ++ } ++ ++ w = ff_v4l2_get_format_width(&fmt); ++ h = ff_v4l2_get_format_height(&fmt); ++ ++ if (w < avctx->width || h < avctx->height) { ++ av_log(avctx, AV_LOG_WARNING, "%s: Size check failed: asked for %dx%d, got: %dx%d\n", __func__, avctx->width, avctx->height, w, h); ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ + static int v4l2_probe_driver(V4L2m2mContext *s) + { + void *log_ctx = s->avctx; +@@ -116,6 +167,11 @@ static int v4l2_probe_driver(V4L2m2mContext *s) + goto done; + } + ++ // If being given frames (encode) check that V4L2 can cope with the size ++ if (s->output.av_codec_id == AV_CODEC_ID_RAWVIDEO && ++ (ret = check_size(s->avctx, s)) != 0) ++ goto done; ++ + ret = ff_v4l2_context_get_format(&s->capture, 1); + if (ret) { + av_log(log_ctx, AV_LOG_DEBUG, "v4l2 capture format not supported\n"); +@@ -215,13 +271,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n"); /* 2. unmap the capture buffers (v4l2 and ffmpeg): @@ -52754,7 +52810,7 @@ index cdfd579810..a919bdc030 100644 ff_v4l2_context_release(&s->capture); /* 3. get the new capture format */ -@@ -240,7 +247,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) +@@ -240,7 +290,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) /* 5. complete reinit */ s->draining = 0; @@ -52762,7 +52818,7 @@ index cdfd579810..a919bdc030 100644 return 0; } -@@ -274,7 +280,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *s) +@@ -274,7 +323,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *s) /* start again now that we know the stream dimensions */ s->draining = 0; @@ -52770,7 +52826,7 @@ index cdfd579810..a919bdc030 100644 ret = ff_v4l2_context_get_format(&s->output, 0); if (ret) { -@@ -328,10 +333,14 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context) +@@ -328,10 +376,14 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context) ff_v4l2_context_release(&s->capture); sem_destroy(&s->refsync); @@ -52786,7 +52842,7 @@ index cdfd579810..a919bdc030 100644 av_free(s); } -@@ -344,6 +353,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) +@@ -344,6 +396,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) if (!s) return 0; @@ -52798,7 +52854,7 @@ index cdfd579810..a919bdc030 100644 if (s->fd >= 0) { ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); if (ret) -@@ -356,7 +370,15 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) +@@ -356,7 +413,15 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) ff_v4l2_context_release(&s->output); @@ -52814,7 +52870,7 @@ index cdfd579810..a919bdc030 100644 av_buffer_unref(&priv->context_ref); return 0; -@@ -400,35 +422,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv) +@@ -400,35 +465,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv) return v4l2_configure_contexts(s); } @@ -52866,7 +52922,7 @@ index cdfd579810..a919bdc030 100644 return 0; } diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index b67b216331..ded1478a49 100644 +index b67b216331..a506e69d67 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -30,6 +30,7 @@ @@ -52930,7 +52986,7 @@ index b67b216331..ded1478a49 100644 AVPacket buf_pkt; /* Reference to a frame. Only used during encoding */ -@@ -66,6 +99,35 @@ typedef struct V4L2m2mContext { +@@ -66,6 +99,36 @@ typedef struct V4L2m2mContext { /* reference back to V4L2m2mPriv */ void *priv; @@ -52950,6 +53006,7 @@ index b67b216331..ded1478a49 100644 + + /* req pkt */ + int req_pkt; ++ int reorder_size; + + /* Ext data sent */ + int extdata_sent; @@ -52966,7 +53023,7 @@ index b67b216331..ded1478a49 100644 } V4L2m2mContext; typedef struct V4L2m2mPriv { -@@ -76,6 +138,8 @@ typedef struct V4L2m2mPriv { +@@ -76,6 +139,8 @@ typedef struct V4L2m2mPriv { int num_output_buffers; int num_capture_buffers; @@ -52975,7 +53032,7 @@ index b67b216331..ded1478a49 100644 } V4L2m2mPriv; /** -@@ -129,4 +193,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); +@@ -129,4 +194,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); */ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); @@ -53003,7 +53060,7 @@ index b67b216331..ded1478a49 100644 + #endif /* AVCODEC_V4L2_M2M_H */ diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index ab07c0a24a..4c5ad55547 100644 +index ab07c0a24a..80d131eae4 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -21,8 +21,14 @@ @@ -53021,7 +53078,7 @@ index ab07c0a24a..4c5ad55547 100644 #include "libavutil/pixfmt.h" #include "libavutil/pixdesc.h" #include "libavutil/opt.h" -@@ -30,75 +36,274 @@ +@@ -30,75 +36,279 @@ #include "libavcodec/decode.h" #include "libavcodec/internal.h" @@ -53131,13 +53188,18 @@ index ab07c0a24a..4c5ad55547 100644 - if (ret) { - av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n"); - return ret; -+static int64_t pts_stats_guess(const pts_stats_t * const stats) ++static unsigned int pts_stats_interval(const pts_stats_t * const stats) ++{ ++ return stats->last_interval; ++} ++ ++static int64_t pts_stats_guess(const pts_stats_t * const stats, const int fail_bad_guess) +{ + if (stats->last_count <= 1) + return stats->last_pts; + if (stats->last_pts == AV_NOPTS_VALUE || -+ stats->last_interval == 0 || -+ stats->last_count >= STATS_LAST_COUNT_MAX) ++ fail_bad_guess && (stats->last_interval == 0 || ++ stats->last_count >= STATS_LAST_COUNT_MAX)) + return AV_NOPTS_VALUE; + return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval; +} @@ -53344,7 +53406,7 @@ index ab07c0a24a..4c5ad55547 100644 return 0; } -@@ -133,58 +338,742 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) +@@ -133,58 +343,768 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) return 0; } @@ -53361,7 +53423,7 @@ index ab07c0a24a..4c5ad55547 100644 + frame->pkt_pts = frame->pts; +FF_ENABLE_DEPRECATION_WARNINGS +#endif -+ frame->best_effort_timestamp = pts_stats_guess(ps); ++ frame->best_effort_timestamp = pts_stats_guess(ps, 1); + // If we can't guess from just PTS - try DTS + if (frame->best_effort_timestamp == AV_NOPTS_VALUE) + frame->best_effort_timestamp = frame->pkt_dts; @@ -53396,15 +53458,25 @@ index ab07c0a24a..4c5ad55547 100644 +} + +static int -+xlat_pending(const xlat_track_t * const x) ++xlat_pending(const V4L2m2mContext * const s) +{ ++ const xlat_track_t *const x = &s->xlat; + unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; + int i; -+ const int64_t now = x->last_pts; ++ const int64_t now = pts_stats_guess(&s->pts_stat, 0); ++ int64_t first_dts = AV_NOPTS_VALUE; ++ int no_dts_count = 0; ++ unsigned int interval = pts_stats_interval(&s->pts_stat); + + for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) { + const V4L2m2mTrackEl * const t = x->track_els + n; + ++ if (first_dts == AV_NOPTS_VALUE) ++ if (t->dts == AV_NOPTS_VALUE) ++ ++no_dts_count; ++ else ++ first_dts = t->dts; ++ + // Discard only set on never-set or flushed entries + // So if we get here we've never successfully decoded a frame so allow + // more frames into the buffer before stalling @@ -53424,6 +53496,18 @@ index ab07c0a24a..4c5ad55547 100644 + break; + } + ++ if (first_dts != AV_NOPTS_VALUE && now != AV_NOPTS_VALUE && interval != 0 && s->reorder_size != 0) { ++ const int iframes = (first_dts - now) / (int)interval; ++ const int t = iframes - s->reorder_size + no_dts_count; ++ ++// av_log(s->avctx, AV_LOG_DEBUG, "Last:%"PRId64", Now:%"PRId64", First:%"PRId64", delta=%"PRId64", frames=%d, nodts=%d\n", ++// x->last_dts, now, first_dts, first_dts - now, iframes, no_dts_count); ++ ++ if (iframes > 0 && iframes < 64 && t < i) { ++ return t; ++ } ++ } ++ + return i; +} + @@ -53627,12 +53711,12 @@ index ab07c0a24a..4c5ad55547 100644 +static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) +{ + V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; -+ int src_rv = NQ_OK; ++ int src_rv = -1; + int dst_rv = 1; // Non-zero (done), non-negative (error) number + unsigned int i = 0; + + do { -+ const int pending = xlat_pending(&s->xlat); ++ const int pending = xlat_pending(s); + const int prefer_dq = (pending > 4); + const int last_src_rv = src_rv; + @@ -53834,7 +53918,7 @@ index ab07c0a24a..4c5ad55547 100644 + + // An unset profile is almost certainly zero or -99 - do not reject + if (avctx->profile <= 0) { -+ av_log(avctx, AV_LOG_VERBOSE, "Profile <= 0 - check skipped\n"); ++ av_log(avctx, AV_LOG_VERBOSE, "Profile %d <= 0 - check skipped\n", avctx->profile); + return 0; + } + @@ -54008,8 +54092,10 @@ index ab07c0a24a..4c5ad55547 100644 +} + +static void -+parse_extradata(AVCodecContext *avctx) ++parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s) +{ ++ s->reorder_size = 0; ++ + if (!avctx->extradata || !avctx->extradata_size) + return; + @@ -54038,6 +54124,7 @@ index ab07c0a24a..4c5ad55547 100644 + avctx->profile = ff_h264_get_profile(sps); + avctx->level = sps->level_idc; + } ++ s->reorder_size = sps->num_reorder_frames; + } + ff_h264_ps_uninit(&ps); + break; @@ -54067,6 +54154,7 @@ index ab07c0a24a..4c5ad55547 100644 + if (sps) { + avctx->profile = sps->ptl.general_ptl.profile_idc; + avctx->level = sps->ptl.general_ptl.level_idc; ++ s->reorder_size = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering; + } + } + ff_hevc_ps_uninit(&ps); @@ -54098,20 +54186,20 @@ index ab07c0a24a..4c5ad55547 100644 + } + avctx->ticks_per_frame = 2; + } -+ -+ parse_extradata(avctx); + ret = ff_v4l2_m2m_create_context(priv, &s); if (ret < 0) return ret; ++ parse_extradata(avctx, s); ++ + xlat_init(&s->xlat); + pts_stats_init(&s->pts_stat, avctx, "decoder"); + capture = &s->capture; output = &s->output; -@@ -192,14 +1081,65 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -192,14 +1112,65 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) * by the v4l2 driver; this event will trigger a full pipeline reconfig and * the proper values will be retrieved from the kernel driver. */ @@ -54179,7 +54267,7 @@ index ab07c0a24a..4c5ad55547 100644 s->avctx = avctx; ret = ff_v4l2_m2m_codec_init(priv); -@@ -208,12 +1148,88 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -208,12 +1179,88 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) return ret; } @@ -54270,7 +54358,7 @@ index ab07c0a24a..4c5ad55547 100644 } #define OFFSET(x) offsetof(V4L2m2mPriv, x) -@@ -222,10 +1238,17 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) +@@ -222,10 +1269,17 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) static const AVOption options[] = { V4L_M2M_DEFAULT_OPTS, { "num_capture_buffers", "Number of buffers in the capture context", @@ -54289,7 +54377,7 @@ index ab07c0a24a..4c5ad55547 100644 #define M2MDEC_CLASS(NAME) \ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ .class_name = #NAME "_v4l2m2m_decoder", \ -@@ -246,9 +1269,15 @@ static const AVOption options[] = { +@@ -246,9 +1300,15 @@ static const AVOption options[] = { .init = v4l2_decode_init, \ .receive_frame = v4l2_receive_frame, \ .close = v4l2_decode_close, \ @@ -62608,6 +62696,108 @@ index da1cf9941e..c588ed23cb 100644 frame->format, frame->pts); break; case AVMEDIA_TYPE_AUDIO: +diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c +index b6aed7a450..92e26d54bc 100644 +--- a/libavfilter/vf_bwdif.c ++++ b/libavfilter/vf_bwdif.c +@@ -75,10 +75,10 @@ typedef struct ThreadData { + int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e)) >> 1; \ + int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e)) >> 1; \ + int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \ +- \ ++ {/*\ + if (!diff) { \ + dst[0] = d; \ +- } else { ++ } else {*/ + + #define SPAT_CHECK() \ + int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c; \ +@@ -90,15 +90,16 @@ typedef struct ThreadData { + diff = FFMAX3(diff, min, -max); + + #define FILTER_LINE() \ ++ int i1, i2; \ + SPAT_CHECK() \ +- if (FFABS(c - e) > temporal_diff0) { \ +- interpol = (((coef_hf[0] * (prev2[0] + next2[0]) \ ++ /*if (FFABS(c - e) > temporal_diff0)*/ { \ ++ i1 = (((coef_hf[0] * (prev2[0] + next2[0]) \ + - coef_hf[1] * (prev2[mrefs2] + next2[mrefs2] + prev2[prefs2] + next2[prefs2]) \ + + coef_hf[2] * (prev2[mrefs4] + next2[mrefs4] + prev2[prefs4] + next2[prefs4])) >> 2) \ + + coef_lf[0] * (c + e) - coef_lf[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ +- } else { \ +- interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ +- } ++ } /*else*/ { \ ++ i2 = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \ ++ }interpol = FFABS(c - e) > temporal_diff0 ? i1:i2;\ + + #define FILTER_EDGE() \ + if (spat) { \ +@@ -112,7 +113,7 @@ typedef struct ThreadData { + else if (interpol < d - diff) \ + interpol = d - diff; \ + \ +- dst[0] = av_clip(interpol, 0, clip_max); \ ++ dst[0] = !diff ? d : av_clip(interpol, 0, clip_max); \ + } \ + \ + dst++; \ +@@ -123,7 +124,7 @@ typedef struct ThreadData { + next2++; \ + } + +-static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, ++static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max) + { + uint8_t *dst = dst1; +@@ -133,7 +134,7 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs, + FILTER_INTRA() + } + +-static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, ++static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max) +@@ -151,7 +152,7 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, + FILTER2() + } + +-static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, ++static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat) + { +@@ -168,7 +169,7 @@ static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1, + FILTER2() + } + +-static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mrefs, ++static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs, + int prefs3, int mrefs3, int parity, int clip_max) + { + uint16_t *dst = dst1; +@@ -178,7 +179,7 @@ static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mre + FILTER_INTRA() + } + +-static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1, ++static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max) +@@ -196,7 +197,7 @@ static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1 + FILTER2() + } + +-static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1, ++static void __attribute__((optimize("tree-vectorize"))) filter_edge_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat) + { diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c new file mode 100644 index 0000000000..d4c11cfc51 @@ -65055,6 +65245,44 @@ index 2cd5773dc5..0cbbc094de 100644 trk->par->codec_id == AV_CODEC_ID_FLAC) { buffer_size_t side_size; uint8_t *side = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); +diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c +index 38e4c65c4e..5e04c1df08 100644 +--- a/libavformat/rtpenc.c ++++ b/libavformat/rtpenc.c +@@ -19,6 +19,7 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include "avc.h" + #include "avformat.h" + #include "mpegts.h" + #include "internal.h" +@@ -582,8 +583,25 @@ static int rtp_write_packet(AVFormatContext *s1, AVPacket *pkt) + ff_rtp_send_vc2hq(s1, pkt->data, size, st->codecpar->field_order != AV_FIELD_PROGRESSIVE ? 1 : 0); + break; + case AV_CODEC_ID_H264: ++ { ++ uint8_t *side_data; ++ int side_data_size = 0; ++ ++ side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, ++ &side_data_size); ++ ++ if (side_data_size != 0) { ++ int ps_size = side_data_size; ++ uint8_t * ps_buf = NULL; ++ ++ ff_avc_write_annexb_extradata(side_data, &ps_buf, &ps_size); ++ av_log(s1, AV_LOG_TRACE, "H264: write side data=%d\n", ps_size); ++ ff_rtp_send_h264_hevc(s1, ps_buf ? ps_buf : side_data, ps_size); ++ av_free(ps_buf); ++ } + ff_rtp_send_h264_hevc(s1, pkt->data, size); + break; ++ } + case AV_CODEC_ID_H261: + ff_rtp_send_h261(s1, pkt->data, size); + break; diff --git a/libavformat/utils.c b/libavformat/utils.c index 75e5350a27..e10b493dae 100644 --- a/libavformat/utils.c @@ -68300,12 +68528,1121 @@ index 0000000000..462ccb8abd + +#endif + +diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c +index a9bf6ff9e0..6a0e2dcc09 100644 +--- a/libswscale/aarch64/rgb2rgb.c ++++ b/libswscale/aarch64/rgb2rgb.c +@@ -30,6 +30,12 @@ + void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int width, int height, + int src1Stride, int src2Stride, int dstStride); ++void ff_bgr24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv); ++void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv); + + av_cold void rgb2rgb_init_aarch64(void) + { +@@ -37,5 +43,7 @@ av_cold void rgb2rgb_init_aarch64(void) + + if (have_neon(cpu_flags)) { + interleaveBytes = ff_interleave_bytes_neon; ++ ff_rgb24toyv12 = ff_rgb24toyv12_aarch64; ++ ff_bgr24toyv12 = ff_bgr24toyv12_aarch64; + } + } +diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S +index d81110ec57..476ca723a0 100644 +--- a/libswscale/aarch64/rgb2rgb_neon.S ++++ b/libswscale/aarch64/rgb2rgb_neon.S +@@ -77,3 +77,448 @@ function ff_interleave_bytes_neon, export=1 + 0: + ret + endfunc ++ ++// void ff_rgb24toyv12_aarch64( ++// const uint8_t *src, // x0 ++// uint8_t *ydst, // x1 ++// uint8_t *udst, // x2 ++// uint8_t *vdst, // x3 ++// int width, // w4 ++// int height, // w5 ++// int lumStride, // w6 ++// int chromStride, // w7 ++// int srcStr, // [sp, #0] ++// int32_t *rgb2yuv); // [sp, #8] ++ ++function ff_rgb24toyv12_aarch64, export=1 ++ ldr x15, [sp, #8] ++ ld1 {v3.s}[2], [x15], #4 ++ ld1 {v3.s}[1], [x15], #4 ++ ld1 {v3.s}[0], [x15], #4 ++ ld1 {v4.s}[2], [x15], #4 ++ ld1 {v4.s}[1], [x15], #4 ++ ld1 {v4.s}[0], [x15], #4 ++ ld1 {v5.s}[2], [x15], #4 ++ ld1 {v5.s}[1], [x15], #4 ++ ld1 {v5.s}[0], [x15] ++ b 99f ++endfunc ++ ++// void ff_bgr24toyv12_aarch64( ++// const uint8_t *src, // x0 ++// uint8_t *ydst, // x1 ++// uint8_t *udst, // x2 ++// uint8_t *vdst, // x3 ++// int width, // w4 ++// int height, // w5 ++// int lumStride, // w6 ++// int chromStride, // w7 ++// int srcStr, // [sp, #0] ++// int32_t *rgb2yuv); // [sp, #8] ++ ++// regs ++// v0-2 Src bytes - reused as chroma src ++// v3-5 Coeffs (packed very inefficiently - could be squashed) ++// v6 128b ++// v7 128h ++// v8-15 Reserved ++// v16-18 Lo Src expanded as H ++// v19 - ++// v20-22 Hi Src expanded as H ++// v23 - ++// v24 U out ++// v25 U tmp ++// v26 Y out ++// v27-29 Y tmp ++// v30 V out ++// v31 V tmp ++ ++// Assumes Little Endian in tail stores & conversion matrix ++ ++function ff_bgr24toyv12_aarch64, export=1 ++ ldr x15, [sp, #8] ++ ld3 {v3.s, v4.s, v5.s}[0], [x15], #12 ++ ld3 {v3.s, v4.s, v5.s}[1], [x15], #12 ++ ld3 {v3.s, v4.s, v5.s}[2], [x15] ++99: ++ ldr w14, [sp, #0] ++ movi v7.8b, #128 ++ uxtl v6.8h, v7.8b ++ // Ensure if nothing to do then we do nothing ++ cmp w4, #0 ++ b.le 90f ++ cmp w5, #0 ++ b.le 90f ++ // If w % 16 != 0 then -16 so we do main loop 1 fewer times with ++ // the remainder done in the tail ++ tst w4, #15 ++ b.eq 1f ++ sub w4, w4, #16 ++1: ++ ++// -------------------- Even line body - YUV ++11: ++ subs w9, w4, #0 ++ mov x10, x0 ++ mov x11, x1 ++ mov x12, x2 ++ mov x13, x3 ++ b.lt 12f ++ ++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 ++ subs w9, w9, #16 ++ b.le 13f ++ ++10: ++ uxtl v16.8h, v0.8b ++ uxtl v17.8h, v1.8b ++ uxtl v18.8h, v2.8b ++ ++ uxtl2 v20.8h, v0.16b ++ uxtl2 v21.8h, v1.16b ++ uxtl2 v22.8h, v2.16b ++ ++ bic v0.8h, #0xff, LSL #8 ++ bic v1.8h, #0xff, LSL #8 ++ bic v2.8h, #0xff, LSL #8 ++ ++ // Testing shows it is faster to stack the smull/smlal ops together ++ // rather than interleave them between channels and indeed even the ++ // shift/add sections seem happier not interleaved ++ ++ // Y0 ++ smull v26.4s, v16.4h, v3.h[0] ++ smlal v26.4s, v17.4h, v4.h[0] ++ smlal v26.4s, v18.4h, v5.h[0] ++ smull2 v27.4s, v16.8h, v3.h[0] ++ smlal2 v27.4s, v17.8h, v4.h[0] ++ smlal2 v27.4s, v18.8h, v5.h[0] ++ // Y1 ++ smull v28.4s, v20.4h, v3.h[0] ++ smlal v28.4s, v21.4h, v4.h[0] ++ smlal v28.4s, v22.4h, v5.h[0] ++ smull2 v29.4s, v20.8h, v3.h[0] ++ smlal2 v29.4s, v21.8h, v4.h[0] ++ smlal2 v29.4s, v22.8h, v5.h[0] ++ shrn v26.4h, v26.4s, #12 ++ shrn2 v26.8h, v27.4s, #12 ++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) ++ sqrshrun v26.8b, v26.8h, #3 ++ shrn v28.4h, v28.4s, #12 ++ shrn2 v28.8h, v29.4s, #12 ++ add v28.8h, v28.8h, v6.8h ++ sqrshrun2 v26.16b, v28.8h, #3 ++ // Y0/Y1 ++ ++ // U ++ // Vector subscript *2 as we loaded into S but are only using H ++ smull v24.4s, v0.4h, v3.h[2] ++ smlal v24.4s, v1.4h, v4.h[2] ++ smlal v24.4s, v2.4h, v5.h[2] ++ smull2 v25.4s, v0.8h, v3.h[2] ++ smlal2 v25.4s, v1.8h, v4.h[2] ++ smlal2 v25.4s, v2.8h, v5.h[2] ++ ++ // V ++ smull v30.4s, v0.4h, v3.h[4] ++ smlal v30.4s, v1.4h, v4.h[4] ++ smlal v30.4s, v2.4h, v5.h[4] ++ smull2 v31.4s, v0.8h, v3.h[4] ++ smlal2 v31.4s, v1.8h, v4.h[4] ++ smlal2 v31.4s, v2.8h, v5.h[4] ++ ++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 ++ ++ shrn v24.4h, v24.4s, #14 ++ shrn2 v24.8h, v25.4s, #14 ++ sqrshrn v24.8b, v24.8h, #1 ++ add v24.8b, v24.8b, v7.8b // +128 ++ shrn v30.4h, v30.4s, #14 ++ shrn2 v30.8h, v31.4s, #14 ++ sqrshrn v30.8b, v30.8h, #1 ++ add v30.8b, v30.8b, v7.8b // +128 ++ ++ subs w9, w9, #16 ++ ++ st1 {v26.16b}, [x11], #16 ++ st1 {v24.8b}, [x12], #8 ++ st1 {v30.8b}, [x13], #8 ++ ++ b.gt 10b ++ ++// -------------------- Even line tail - YUV ++// If width % 16 == 0 then simply runs once with preloaded RGB ++// If other then deals with preload & then does remaining tail ++ ++13: ++ // Body is simple copy of main loop body minus preload ++ ++ uxtl v16.8h, v0.8b ++ uxtl v17.8h, v1.8b ++ uxtl v18.8h, v2.8b ++ ++ uxtl2 v20.8h, v0.16b ++ uxtl2 v21.8h, v1.16b ++ uxtl2 v22.8h, v2.16b ++ ++ bic v0.8h, #0xff, LSL #8 ++ bic v1.8h, #0xff, LSL #8 ++ bic v2.8h, #0xff, LSL #8 ++ ++ // Y0 ++ smull v26.4s, v16.4h, v3.h[0] ++ smlal v26.4s, v17.4h, v4.h[0] ++ smlal v26.4s, v18.4h, v5.h[0] ++ smull2 v27.4s, v16.8h, v3.h[0] ++ smlal2 v27.4s, v17.8h, v4.h[0] ++ smlal2 v27.4s, v18.8h, v5.h[0] ++ // Y1 ++ smull v28.4s, v20.4h, v3.h[0] ++ smlal v28.4s, v21.4h, v4.h[0] ++ smlal v28.4s, v22.4h, v5.h[0] ++ smull2 v29.4s, v20.8h, v3.h[0] ++ smlal2 v29.4s, v21.8h, v4.h[0] ++ smlal2 v29.4s, v22.8h, v5.h[0] ++ shrn v26.4h, v26.4s, #12 ++ shrn2 v26.8h, v27.4s, #12 ++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) ++ sqrshrun v26.8b, v26.8h, #3 ++ shrn v28.4h, v28.4s, #12 ++ shrn2 v28.8h, v29.4s, #12 ++ add v28.8h, v28.8h, v6.8h ++ sqrshrun2 v26.16b, v28.8h, #3 ++ // Y0/Y1 ++ ++ // U ++ // Vector subscript *2 as we loaded into S but are only using H ++ smull v24.4s, v0.4h, v3.h[2] ++ smlal v24.4s, v1.4h, v4.h[2] ++ smlal v24.4s, v2.4h, v5.h[2] ++ smull2 v25.4s, v0.8h, v3.h[2] ++ smlal2 v25.4s, v1.8h, v4.h[2] ++ smlal2 v25.4s, v2.8h, v5.h[2] ++ ++ // V ++ smull v30.4s, v0.4h, v3.h[4] ++ smlal v30.4s, v1.4h, v4.h[4] ++ smlal v30.4s, v2.4h, v5.h[4] ++ smull2 v31.4s, v0.8h, v3.h[4] ++ smlal2 v31.4s, v1.8h, v4.h[4] ++ smlal2 v31.4s, v2.8h, v5.h[4] ++ ++ cmp w9, #-16 ++ ++ shrn v24.4h, v24.4s, #14 ++ shrn2 v24.8h, v25.4s, #14 ++ sqrshrn v24.8b, v24.8h, #1 ++ add v24.8b, v24.8b, v7.8b // +128 ++ shrn v30.4h, v30.4s, #14 ++ shrn2 v30.8h, v31.4s, #14 ++ sqrshrn v30.8b, v30.8h, #1 ++ add v30.8b, v30.8b, v7.8b // +128 ++ ++ // Here: ++ // w9 == 0 width % 16 == 0, tail done ++ // w9 > -16 1st tail done (16 pels), remainder still to go ++ // w9 == -16 shouldn't happen ++ // w9 > -32 2nd tail done ++ // w9 <= -32 shouldn't happen ++ ++ b.lt 2f ++ st1 {v26.16b}, [x11], #16 ++ st1 {v24.8b}, [x12], #8 ++ st1 {v30.8b}, [x13], #8 ++ cbz w9, 3f ++ ++12: ++ sub w9, w9, #16 ++ ++ tbz w9, #3, 1f ++ ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24 ++1: tbz w9, #2, 1f ++ ld3 {v0.b, v1.b, v2.b}[8], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[9], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[10], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[11], [x10], #3 ++1: tbz w9, #1, 1f ++ ld3 {v0.b, v1.b, v2.b}[12], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[13], [x10], #3 ++1: tbz w9, #0, 13b ++ ld3 {v0.b, v1.b, v2.b}[14], [x10], #3 ++ b 13b ++ ++2: ++ tbz w9, #3, 1f ++ st1 {v26.8b}, [x11], #8 ++ st1 {v24.s}[0], [x12], #4 ++ st1 {v30.s}[0], [x13], #4 ++1: tbz w9, #2, 1f ++ st1 {v26.s}[2], [x11], #4 ++ st1 {v24.h}[2], [x12], #2 ++ st1 {v30.h}[2], [x13], #2 ++1: tbz w9, #1, 1f ++ st1 {v26.h}[6], [x11], #2 ++ st1 {v24.b}[6], [x12], #1 ++ st1 {v30.b}[6], [x13], #1 ++1: tbz w9, #0, 1f ++ st1 {v26.b}[14], [x11] ++ st1 {v24.b}[7], [x12] ++ st1 {v30.b}[7], [x13] ++1: ++3: ++ ++// -------------------- Odd line body - Y only ++ ++ subs w5, w5, #1 ++ b.eq 90f ++ ++ subs w9, w4, #0 ++ add x0, x0, w14, SXTX ++ add x1, x1, w6, SXTX ++ mov x10, x0 ++ mov x11, x1 ++ b.lt 12f ++ ++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 ++ subs w9, w9, #16 ++ b.le 13f ++ ++10: ++ uxtl v16.8h, v0.8b ++ uxtl v17.8h, v1.8b ++ uxtl v18.8h, v2.8b ++ ++ uxtl2 v20.8h, v0.16b ++ uxtl2 v21.8h, v1.16b ++ uxtl2 v22.8h, v2.16b ++ ++ // Testing shows it is faster to stack the smull/smlal ops together ++ // rather than interleave them between channels and indeed even the ++ // shift/add sections seem happier not interleaved ++ ++ // Y0 ++ smull v26.4s, v16.4h, v3.h[0] ++ smlal v26.4s, v17.4h, v4.h[0] ++ smlal v26.4s, v18.4h, v5.h[0] ++ smull2 v27.4s, v16.8h, v3.h[0] ++ smlal2 v27.4s, v17.8h, v4.h[0] ++ smlal2 v27.4s, v18.8h, v5.h[0] ++ // Y1 ++ smull v28.4s, v20.4h, v3.h[0] ++ smlal v28.4s, v21.4h, v4.h[0] ++ smlal v28.4s, v22.4h, v5.h[0] ++ smull2 v29.4s, v20.8h, v3.h[0] ++ smlal2 v29.4s, v21.8h, v4.h[0] ++ smlal2 v29.4s, v22.8h, v5.h[0] ++ ++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 ++ ++ shrn v26.4h, v26.4s, #12 ++ shrn2 v26.8h, v27.4s, #12 ++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) ++ sqrshrun v26.8b, v26.8h, #3 ++ shrn v28.4h, v28.4s, #12 ++ shrn2 v28.8h, v29.4s, #12 ++ add v28.8h, v28.8h, v6.8h ++ sqrshrun2 v26.16b, v28.8h, #3 ++ // Y0/Y1 ++ ++ subs w9, w9, #16 ++ ++ st1 {v26.16b}, [x11], #16 ++ ++ b.gt 10b ++ ++// -------------------- Odd line tail - Y ++// If width % 16 == 0 then simply runs once with preloaded RGB ++// If other then deals with preload & then does remaining tail ++ ++13: ++ // Body is simple copy of main loop body minus preload ++ ++ uxtl v16.8h, v0.8b ++ uxtl v17.8h, v1.8b ++ uxtl v18.8h, v2.8b ++ ++ uxtl2 v20.8h, v0.16b ++ uxtl2 v21.8h, v1.16b ++ uxtl2 v22.8h, v2.16b ++ ++ // Y0 ++ smull v26.4s, v16.4h, v3.h[0] ++ smlal v26.4s, v17.4h, v4.h[0] ++ smlal v26.4s, v18.4h, v5.h[0] ++ smull2 v27.4s, v16.8h, v3.h[0] ++ smlal2 v27.4s, v17.8h, v4.h[0] ++ smlal2 v27.4s, v18.8h, v5.h[0] ++ // Y1 ++ smull v28.4s, v20.4h, v3.h[0] ++ smlal v28.4s, v21.4h, v4.h[0] ++ smlal v28.4s, v22.4h, v5.h[0] ++ smull2 v29.4s, v20.8h, v3.h[0] ++ smlal2 v29.4s, v21.8h, v4.h[0] ++ smlal2 v29.4s, v22.8h, v5.h[0] ++ ++ cmp w9, #-16 ++ ++ shrn v26.4h, v26.4s, #12 ++ shrn2 v26.8h, v27.4s, #12 ++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16) ++ sqrshrun v26.8b, v26.8h, #3 ++ shrn v28.4h, v28.4s, #12 ++ shrn2 v28.8h, v29.4s, #12 ++ add v28.8h, v28.8h, v6.8h ++ sqrshrun2 v26.16b, v28.8h, #3 ++ // Y0/Y1 ++ ++ // Here: ++ // w9 == 0 width % 16 == 0, tail done ++ // w9 > -16 1st tail done (16 pels), remainder still to go ++ // w9 == -16 shouldn't happen ++ // w9 > -32 2nd tail done ++ // w9 <= -32 shouldn't happen ++ ++ b.lt 2f ++ st1 {v26.16b}, [x11], #16 ++ cbz w9, 3f ++ ++12: ++ sub w9, w9, #16 ++ ++ tbz w9, #3, 1f ++ ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24 ++1: tbz w9, #2, 1f ++ ld3 {v0.b, v1.b, v2.b}[8], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[9], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[10], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[11], [x10], #3 ++1: tbz w9, #1, 1f ++ ld3 {v0.b, v1.b, v2.b}[12], [x10], #3 ++ ld3 {v0.b, v1.b, v2.b}[13], [x10], #3 ++1: tbz w9, #0, 13b ++ ld3 {v0.b, v1.b, v2.b}[14], [x10], #3 ++ b 13b ++ ++2: ++ tbz w9, #3, 1f ++ st1 {v26.8b}, [x11], #8 ++1: tbz w9, #2, 1f ++ st1 {v26.s}[2], [x11], #4 ++1: tbz w9, #1, 1f ++ st1 {v26.h}[6], [x11], #2 ++1: tbz w9, #0, 1f ++ st1 {v26.b}[14], [x11] ++1: ++3: ++ ++// ------------------- Loop to start ++ ++ add x0, x0, w14, SXTX ++ add x1, x1, w6, SXTX ++ add x2, x2, w7, SXTX ++ add x3, x3, w7, SXTX ++ subs w5, w5, #1 ++ b.gt 11b ++90: ++ ret ++endfunc +diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c +index a7300f3ba4..ba1db155b0 100644 +--- a/libswscale/rgb2rgb.c ++++ b/libswscale/rgb2rgb.c +@@ -83,6 +83,31 @@ void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); ++void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, ++ uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); + void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, + int srcStride, int dstStride); + void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst, +diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h +index 48bba1586a..6329533f18 100644 +--- a/libswscale/rgb2rgb.h ++++ b/libswscale/rgb2rgb.h +@@ -82,6 +82,9 @@ void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size); + void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv); ++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv); + + /** + * Height should be a multiple of 2 and width should be a multiple of 16. +@@ -131,6 +134,26 @@ extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + int width, int height, + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv); ++extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++extern void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++extern void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++extern void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); ++extern void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ++ int width, int height, ++ int lumStride, int chromStride, int srcStride, ++ int32_t *rgb2yuv); + extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, + int srcStride, int dstStride); + +diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c +index 42c69801ba..e711589e1e 100644 +--- a/libswscale/rgb2rgb_template.c ++++ b/libswscale/rgb2rgb_template.c +@@ -646,13 +646,14 @@ static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst, + * others are ignored in the C version. + * FIXME: Write HQ version. + */ +-void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, +- int chromStride, int srcStride, int32_t *rgb2yuv) ++ int chromStride, int srcStride, int32_t *rgb2yuv, ++ const uint8_t x[9]) + { +- int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; +- int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; +- int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; ++ int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]]; ++ int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]]; ++ int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]]; + int y; + const int chromWidth = width >> 1; + +@@ -678,6 +679,19 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + ydst[2 * i + 1] = Y; + } ++ if ((width & 1) != 0) { ++ unsigned int b = src[6 * i + 0]; ++ unsigned int g = src[6 * i + 1]; ++ unsigned int r = src[6 * i + 2]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; ++ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; ++ ++ udst[i] = U; ++ vdst[i] = V; ++ ydst[2 * i] = Y; ++ } + ydst += lumStride; + src += srcStride; + +@@ -700,6 +714,125 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; + ydst[2 * i + 1] = Y; + } ++ if ((width & 1) != 0) { ++ unsigned int b = src[6 * i + 0]; ++ unsigned int g = src[6 * i + 1]; ++ unsigned int r = src[6 * i + 2]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ++ ydst[2 * i] = Y; ++ } ++ udst += chromStride; ++ vdst += chromStride; ++ ydst += lumStride; ++ src += srcStride; ++ } ++} ++ ++static const uint8_t x_rgb[9] = { ++ RY_IDX, GY_IDX, BY_IDX, ++ RU_IDX, GU_IDX, BU_IDX, ++ RV_IDX, GV_IDX, BV_IDX, ++}; ++ ++static const uint8_t x_bgr[9] = { ++ BY_IDX, GY_IDX, RY_IDX, ++ BU_IDX, GU_IDX, RU_IDX, ++ BV_IDX, GV_IDX, RV_IDX, ++}; ++ ++void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); ++} ++ ++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); ++} ++ ++static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv, ++ const uint8_t x[9]) ++{ ++ int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]]; ++ int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]]; ++ int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]]; ++ int y; ++ const int chromWidth = width >> 1; ++ ++ for (y = 0; y < height; y += 2) { ++ int i; ++ for (i = 0; i < chromWidth; i++) { ++ unsigned int b = src[8 * i + 2]; ++ unsigned int g = src[8 * i + 1]; ++ unsigned int r = src[8 * i + 0]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; ++ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; ++ ++ udst[i] = U; ++ vdst[i] = V; ++ ydst[2 * i] = Y; ++ ++ b = src[8 * i + 6]; ++ g = src[8 * i + 5]; ++ r = src[8 * i + 4]; ++ ++ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ydst[2 * i + 1] = Y; ++ } ++ if ((width & 1) != 0) { ++ unsigned int b = src[8 * i + 2]; ++ unsigned int g = src[8 * i + 1]; ++ unsigned int r = src[8 * i + 0]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128; ++ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128; ++ ++ udst[i] = U; ++ vdst[i] = V; ++ ydst[2 * i] = Y; ++ } ++ ydst += lumStride; ++ src += srcStride; ++ ++ if (y+1 == height) ++ break; ++ ++ for (i = 0; i < chromWidth; i++) { ++ unsigned int b = src[8 * i + 2]; ++ unsigned int g = src[8 * i + 1]; ++ unsigned int r = src[8 * i + 0]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ++ ydst[2 * i] = Y; ++ ++ b = src[8 * i + 6]; ++ g = src[8 * i + 5]; ++ r = src[8 * i + 4]; ++ ++ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ydst[2 * i + 1] = Y; ++ } ++ if ((width & 1) != 0) { ++ unsigned int b = src[8 * i + 2]; ++ unsigned int g = src[8 * i + 1]; ++ unsigned int r = src[8 * i + 0]; ++ ++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16; ++ ++ ydst[2 * i] = Y; ++ } + udst += chromStride; + vdst += chromStride; + ydst += lumStride; +@@ -707,6 +840,37 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + } + } + ++static void ff_rgbxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); ++} ++ ++static void ff_bgrxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); ++} ++ ++// As the general code does no SIMD-like ops simply adding 1 to the src address ++// will fix the ignored alpha position ++static void ff_xrgbtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb); ++} ++ ++static void ff_xbgrtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, ++ uint8_t *vdst, int width, int height, int lumStride, ++ int chromStride, int srcStride, int32_t *rgb2yuv) ++{ ++ rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr); ++} ++ ++ + static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2, + uint8_t *dest, int width, int height, + int src1Stride, int src2Stride, int dstStride) +@@ -980,6 +1144,11 @@ static av_cold void rgb2rgb_init_c(void) + yuy2toyv12 = yuy2toyv12_c; + planar2x = planar2x_c; + ff_rgb24toyv12 = ff_rgb24toyv12_c; ++ ff_bgr24toyv12 = ff_bgr24toyv12_c; ++ ff_rgbxtoyv12 = ff_rgbxtoyv12_c; ++ ff_bgrxtoyv12 = ff_bgrxtoyv12_c; ++ ff_xrgbtoyv12 = ff_xrgbtoyv12_c; ++ ff_xbgrtoyv12 = ff_xbgrtoyv12_c; + interleaveBytes = interleaveBytes_c; + deinterleaveBytes = deinterleaveBytes_c; + vu9_to_vu12 = vu9_to_vu12_c; +diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c +index c4dd8a4d83..da38d7f8ac 100644 +--- a/libswscale/swscale_unscaled.c ++++ b/libswscale/swscale_unscaled.c +@@ -1655,6 +1655,91 @@ static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[], + return srcSliceH; + } + ++static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_bgr24toyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ ++static int bgrxToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_bgrxtoyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ ++static int rgbxToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_rgbxtoyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ ++static int xbgrToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_xbgrtoyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ ++static int xrgbToYv12Wrapper(SwsContext *c, const uint8_t *src[], ++ int srcStride[], int srcSliceY, int srcSliceH, ++ uint8_t *dst[], int dstStride[]) ++{ ++ ff_xrgbtoyv12( ++ src[0], ++ dst[0] + srcSliceY * dstStride[0], ++ dst[1] + (srcSliceY >> 1) * dstStride[1], ++ dst[2] + (srcSliceY >> 1) * dstStride[2], ++ c->srcW, srcSliceH, ++ dstStride[0], dstStride[1], srcStride[0], ++ c->input_rgb2yuv_table); ++ if (dst[3]) ++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255); ++ return srcSliceH; ++} ++ + static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) +@@ -2035,6 +2120,32 @@ void ff_get_unscaled_swscale(SwsContext *c) + (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && + !(flags & SWS_ACCURATE_RND)) + c->swscale = bgr24ToYv12Wrapper; ++ /* rgb24toYV12 */ ++ if (srcFormat == AV_PIX_FMT_RGB24 && ++ (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = rgb24ToYv12Wrapper; ++ ++ /* bgrxtoYV12 */ ++ if (((srcFormat == AV_PIX_FMT_BGRA && dstFormat == AV_PIX_FMT_YUV420P) || ++ (srcFormat == AV_PIX_FMT_BGR0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = bgrxToYv12Wrapper; ++ /* rgbx24toYV12 */ ++ if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) || ++ (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = rgbxToYv12Wrapper; ++ /* xbgrtoYV12 */ ++ if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) || ++ (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = xbgrToYv12Wrapper; ++ /* xrgb24toYV12 */ ++ if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) || ++ (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) && ++ !(flags & SWS_ACCURATE_RND)) ++ c->swscale = xrgbToYv12Wrapper; + + /* RGB/BGR -> RGB/BGR (no dither needed forms) */ + if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c) +diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c +index 6c38041ddb..12776ffec7 100644 +--- a/libswscale/tests/swscale.c ++++ b/libswscale/tests/swscale.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + + #undef HAVE_AV_CONFIG_H + #include "libavutil/cpu.h" +@@ -78,6 +79,15 @@ struct Results { + uint32_t crc; + }; + ++static int time_rep = 0; ++ ++static uint64_t utime(void) ++{ ++ struct timespec ts; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return ts.tv_nsec / 1000 + (uint64_t)ts.tv_sec * 1000000; ++} ++ + // test by ref -> src -> dst -> out & compare out against ref + // ref & out are YV12 + static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, +@@ -174,7 +184,7 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, + goto end; + } + +- printf(" %s %dx%d -> %s %3dx%3d flags=%2d", ++ printf(" %s %4dx%4d -> %s %4dx%4d flags=%2d", + desc_src->name, srcW, srcH, + desc_dst->name, dstW, dstH, + flags); +@@ -182,6 +192,17 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h, + + sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride); + ++ if (time_rep != 0) ++ { ++ const uint64_t now = utime(); ++ uint64_t done; ++ for (i = 1; i != time_rep; ++i) { ++ sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride); ++ } ++ done = utime(); ++ printf(" T=%7"PRId64"us ", done-now); ++ } ++ + for (i = 0; i < 4 && dstStride[i]; i++) + crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i], + dstStride[i] * dstH); +@@ -355,56 +376,78 @@ static int fileTest(const uint8_t * const ref[4], int refStride[4], + return 0; + } + +-#define W 96 +-#define H 96 +- + int main(int argc, char **argv) + { ++ unsigned int W = 96; ++ unsigned int H = 96; ++ unsigned int W2; ++ unsigned int H2; ++ unsigned int S; + enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE; + enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE; +- uint8_t *rgb_data = av_malloc(W * H * 4); +- const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL }; +- int rgb_stride[4] = { 4 * W, 0, 0, 0 }; +- uint8_t *data = av_malloc(4 * W * H); +- const uint8_t * const src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 }; +- int stride[4] = { W, W, W, W }; + int x, y; + struct SwsContext *sws; + AVLFG rand; + int res = -1; + int i; + FILE *fp = NULL; +- +- if (!rgb_data || !data) +- return -1; ++ uint8_t *rgb_data; ++ uint8_t * rgb_src[4] = { NULL }; ++ int rgb_stride[4] = { 0 }; ++ uint8_t *data; ++ uint8_t * src[4] = { NULL }; ++ int stride[4] = { 0 }; + + for (i = 1; i < argc; i += 2) { ++ const char * const arg2 = argv[i+1]; ++ + if (argv[i][0] != '-' || i + 1 == argc) + goto bad_option; + if (!strcmp(argv[i], "-ref")) { +- fp = fopen(argv[i + 1], "r"); ++ fp = fopen(arg2, "r"); + if (!fp) { +- fprintf(stderr, "could not open '%s'\n", argv[i + 1]); ++ fprintf(stderr, "could not open '%s'\n", arg2); + goto error; + } + } else if (!strcmp(argv[i], "-cpuflags")) { + unsigned flags = av_get_cpu_flags(); +- int ret = av_parse_cpu_caps(&flags, argv[i + 1]); ++ int ret = av_parse_cpu_caps(&flags, arg2); + if (ret < 0) { +- fprintf(stderr, "invalid cpu flags %s\n", argv[i + 1]); ++ fprintf(stderr, "invalid cpu flags %s\n", arg2); + return ret; + } + av_force_cpu_flags(flags); + } else if (!strcmp(argv[i], "-src")) { +- srcFormat = av_get_pix_fmt(argv[i + 1]); ++ srcFormat = av_get_pix_fmt(arg2); + if (srcFormat == AV_PIX_FMT_NONE) { +- fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]); ++ fprintf(stderr, "invalid pixel format %s\n", arg2); + return -1; + } + } else if (!strcmp(argv[i], "-dst")) { +- dstFormat = av_get_pix_fmt(argv[i + 1]); ++ dstFormat = av_get_pix_fmt(arg2); + if (dstFormat == AV_PIX_FMT_NONE) { +- fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]); ++ fprintf(stderr, "invalid pixel format %s\n", arg2); ++ return -1; ++ } ++ } else if (!strcmp(argv[i], "-w")) { ++ char * p = NULL; ++ W = strtoul(arg2, &p, 0); ++ if (!W || *p) { ++ fprintf(stderr, "bad width %s\n", arg2); ++ return -1; ++ } ++ } else if (!strcmp(argv[i], "-h")) { ++ char * p = NULL; ++ H = strtoul(arg2, &p, 0); ++ if (!H || *p) { ++ fprintf(stderr, "bad height '%s'\n", arg2); ++ return -1; ++ } ++ } else if (!strcmp(argv[i], "-t")) { ++ char * p = NULL; ++ time_rep = (int)strtol(arg2, &p, 0); ++ if (*p) { ++ fprintf(stderr, "bad time repetitions '%s'\n", arg2); + return -1; + } + } else { +@@ -414,15 +457,34 @@ bad_option: + } + } + +- sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H, ++ S = (W + 15) & ~15; ++ rgb_data = av_mallocz(S * H * 4); ++ rgb_src[0] = rgb_data; ++ rgb_stride[0] = 4 * S; ++ data = av_mallocz(4 * S * H); ++ src[0] = data; ++ src[1] = data + S * H; ++ src[2] = data + S * H * 2; ++ src[3] = data + S * H * 3; ++ stride[0] = S; ++ stride[1] = S; ++ stride[2] = S; ++ stride[3] = S; ++ H2 = H < 96 ? 8 : H / 12; ++ W2 = W < 96 ? 8 : W / 12; ++ ++ if (!rgb_data || !data) ++ return -1; ++ ++ sws = sws_getContext(W2, H2, AV_PIX_FMT_RGB32, W, H, + AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL); + + av_lfg_init(&rand, 1); + + for (y = 0; y < H; y++) + for (x = 0; x < W * 4; x++) +- rgb_data[ x + y * 4 * W] = av_lfg_get(&rand); +- res = sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, (uint8_t * const *) src, stride); ++ rgb_data[ x + y * 4 * S] = av_lfg_get(&rand); ++ res = sws_scale(sws, (const uint8_t * const *)rgb_src, rgb_stride, 0, H2, (uint8_t * const *) src, stride); + if (res < 0 || res != H) { + res = -1; + goto error; +@@ -431,10 +493,10 @@ bad_option: + av_free(rgb_data); + + if(fp) { +- res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat); ++ res = fileTest((const uint8_t * const *)src, stride, W, H, fp, srcFormat, dstFormat); + fclose(fp); + } else { +- selfTest(src, stride, W, H, srcFormat, dstFormat); ++ selfTest((const uint8_t * const *)src, stride, W, H, srcFormat, dstFormat); + res = 0; + } + error: diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt new file mode 100644 -index 0000000000..b050971f63 +index 0000000000..2b62d660c0 --- /dev/null +++ b/pi-util/BUILD.txt -@@ -0,0 +1,59 @@ +@@ -0,0 +1,67 @@ +Building Pi FFmpeg +================== + @@ -68332,6 +69669,8 @@ index 0000000000..b050971f63 + paths being confused and therefore running the wrong code, Shared + is what is needed, in most cases, when building for use by other + programs. ++ --usr Set install dir to /usr (i.e. system default) rather than in ++ /install + +So for a static build +--------------------- @@ -68345,25 +69684,31 @@ index 0000000000..b050971f63 +For a shared build +------------------ + ++There are two choices here ++ +$ pi-util/conf_native.sh -+ -+You will normally want an install target if shared. Note that the script has -+set this up to be generated in out//install, you don't have to worry -+about overwriting your system libs. -+ +$ make -j8 -C out/ install + ++This sets the install prefix to /install and is probably what you ++want if you don't want to overwrite the system files. ++ +You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was -+built or install the image on the system - you have to be careful to get rid -+of all other ffmpeg libs or confusion may result. There is a little script -+that wipes all other versions - obviously use with care! ++built. You can copy the contents of /install to /usr and that mostly ++works. The only downside is that paths in pkgconfig end up being set to the ++install directory in your build directory which may be less than ideal when ++building other packages. + ++The alternative if you just want to replace the system libs is: ++ ++$ pi-util/conf_native.sh --usr ++$ make -j8 -C out/ +$ sudo pi-util/clean_usr_libs.sh ++$ sudo make -j8 -C out/ install + -+Then simply copying from the install to /usr works -+ -+$ sudo cp -r out//install/* /usr -+ ++The clean_usr_libs.sh step wipes any existing libs & includes (for all ++architectures) from the system which helps avoid confusion when running other ++progs as you can be sure you're not running old code which is unfortunately ++easy to do otherwise. + diff --git a/pi-util/NOTES.txt b/pi-util/NOTES.txt new file mode 100644 @@ -68530,11 +69875,27 @@ index 0000000000..92bc13a3df + diff --git a/pi-util/clean_usr_libs.sh b/pi-util/clean_usr_libs.sh new file mode 100755 -index 0000000000..b3b2d5509d +index 0000000000..01bd6a6a22 --- /dev/null +++ b/pi-util/clean_usr_libs.sh -@@ -0,0 +1,26 @@ +@@ -0,0 +1,42 @@ +set -e ++U=/usr/include/arm-linux-gnueabihf ++rm -rf $U/libavcodec ++rm -rf $U/libavdevice ++rm -rf $U/libavfilter ++rm -rf $U/libavformat ++rm -rf $U/libavutil ++rm -rf $U/libswresample ++rm -rf $U/libswscale ++U=/usr/include/aarch64-linux-gnu ++rm -rf $U/libavcodec ++rm -rf $U/libavdevice ++rm -rf $U/libavfilter ++rm -rf $U/libavformat ++rm -rf $U/libavutil ++rm -rf $U/libswresample ++rm -rf $U/libswscale +U=/usr/lib/arm-linux-gnueabihf +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* @@ -69117,10 +70478,10 @@ index 0000000000..fc14f2a3c2 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh new file mode 100755 -index 0000000000..a9e053801c +index 0000000000..5fb69ccee2 --- /dev/null +++ b/pi-util/conf_native.sh -@@ -0,0 +1,107 @@ +@@ -0,0 +1,127 @@ +echo "Configure for native build" + +FFSRC=`pwd` @@ -69132,6 +70493,7 @@ index 0000000000..a9e053801c + +NOSHARED= +MMAL= ++USR_PREFIX= + +while [ "$1" != "" ] ; do + case $1 in @@ -69141,8 +70503,14 @@ index 0000000000..a9e053801c + --mmal) + MMAL=1 + ;; ++ --usr) ++ USR_PREFIX=/usr ++ ;; + *) -+ echo "Usage $0: [--noshared] [--mmal]" ++ echo "Usage $0: [--noshared] [--mmal] [--usr]" ++ echo " noshared Build static libs and executable - good for testing" ++ echo " mmal Build mmal decoders" ++ echo " usr Set install prefix to /usr [default=/install]" + exit 1 + ;; + esac @@ -69156,18 +70524,28 @@ index 0000000000..a9e053801c +RPI_DEFINES= +RPI_EXTRALIBS= + -+if [ "$MC" == "arm64" ]; then -+ echo "M/C aarch64" -+ A=aarch64-linux-gnu -+ B=arm64 -+elif [ "$MC" == "armhf" ]; then -+ echo "M/C armv7" -+ A=arm-linux-gnueabihf -+ B=armv7 -+ MCOPTS="--arch=armv6t2 --cpu=cortex-a7" -+ RPI_DEFINES=-mfpu=neon-vfpv4 ++# uname -m gives kernel type which may not have the same ++# 32/64bitness as userspace :-( getconf shoudl provide the answer ++# but use uname to check we are on the right processor ++MC=`uname -m` ++LB=`getconf LONG_BIT` ++if [ "$MC" == "armv7l" ] || [ "$MC" == "aarch64" ]; then ++ if [ "$LB" == "32" ]; then ++ echo "M/C armv7" ++ A=arm-linux-gnueabihf ++ B=armv7 ++ MCOPTS="--arch=armv6t2 --cpu=cortex-a7" ++ RPI_DEFINES=-mfpu=neon-vfpv4 ++ elif [ "$LB" == "64" ]; then ++ echo "M/C aarch64" ++ A=aarch64-linux-gnu ++ B=arm64 ++ else ++ echo "Unknown LONG_BIT name: $LB" ++ exit 1 ++ fi +else -+ echo Unexpected architecture $MC ++ echo "Unknown machine name: $MC" + exit 1 +fi + @@ -69195,7 +70573,9 @@ index 0000000000..a9e053801c + OUT=$BUILDBASE/$B-$C-$V-shared-rel +fi + -+USR_PREFIX=$OUT/install ++if [ ! $USR_PREFIX ]; then ++ USR_PREFIX=$OUT/install ++fi +LIB_PREFIX=$USR_PREFIX/lib/$A +INC_PREFIX=$USR_PREFIX/include/$A + @@ -69225,6 +70605,7 @@ index 0000000000..a9e053801c + --extra-libs="$RPI_EXTRALIBS"\ + --extra-version="rpi" + ++echo "Configured into $OUT" + +# gcc option for getting asm listing +# -Wa,-ahls