From 4060ba2ab4b7070cf7bab555975021e2bf15f8af Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Tue, 8 Nov 2022 20:59:30 +0100 Subject: [PATCH 01/12] ffmpeg gen-patches: update to ffmpeg 5.1.2 Signed-off-by: Matthias Reichl --- tools/ffmpeg/gen-patches.sh | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/tools/ffmpeg/gen-patches.sh b/tools/ffmpeg/gen-patches.sh index 1d786f2bda..9b0c618a66 100755 --- a/tools/ffmpeg/gen-patches.sh +++ b/tools/ffmpeg/gen-patches.sh @@ -2,10 +2,10 @@ # base ffmpeg version FFMPEG_REPO="git://source.ffmpeg.org/ffmpeg.git" -FFMPEG_VERSION="n4.4.1" +FFMPEG_VERSION="n5.1.2" KODI_FFMPEG_REPO="https://github.com/xbmc/FFmpeg" -KODI_FFMPEG_VERSION="4.4.1-Nexus-Alpha1" +KODI_FFMPEG_VERSION="5.1.2-Nexus-Alpha3" ALL_FEATURE_SETS="v4l2-drmprime v4l2-request libreelec rpi kodi" @@ -28,26 +28,17 @@ create_patch() { PATCH_CREATE_DIFF="no" case "${FEATURE_SET}" in - v4l2-drmprime) + v4l2-drmprime|v4l2-request) REPO="https://github.com/jernejsk/FFmpeg" - REFSPEC="v4l2-drmprime-v6-4.4.1-Nexus-Alpha1" - BASE_REPO="${KODI_FFMPEG_REPO}" - BASE_VERSION="${KODI_FFMPEG_VERSION}" - ;; - v4l2-request) - REPO="https://github.com/jernejsk/FFmpeg" - REFSPEC="v4l2-request-hwaccel-4.4.1-Nexus-Alpha1" - BASE_REPO="${KODI_FFMPEG_REPO}" - BASE_VERSION="${KODI_FFMPEG_VERSION}" + REFSPEC="${FEATURE_SET}-${FFMPEG_VERSION}" ;; libreelec) REPO="https://github.com/LibreELEC/FFmpeg" - REFSPEC="4.4-libreelec-misc" + REFSPEC="5.1.2-libreelec-misc" ;; rpi) REPO="https://github.com/jc-kynesim/rpi-ffmpeg" - REFSPEC="dev/4.4/rpi_import_1" - PATCH_CREATE_DIFF="yes" + REFSPEC="dev/5.1.2/rpi_import_1" ;; kodi) REPO="${KODI_FFMPEG_REPO}" From 97a690410f509a701900a61c2ad25fbd87eedc36 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Tue, 8 Nov 2022 21:03:37 +0100 Subject: [PATCH 02/12] ffmpeg: update kodi patch Patch created using revisions eacfcba..ca8882f from tag 5.1.2-Nexus-Alpha3 of https://github.com/xbmc/FFmpeg --- 
.../ffmpeg/patches/kodi/ffmpeg-001-kodi.patch | 717 +----------------- 1 file changed, 26 insertions(+), 691 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/kodi/ffmpeg-001-kodi.patch b/packages/multimedia/ffmpeg/patches/kodi/ffmpeg-001-kodi.patch index aad6fac8cc..c1aefeac04 100644 --- a/packages/multimedia/ffmpeg/patches/kodi/ffmpeg-001-kodi.patch +++ b/packages/multimedia/ffmpeg/patches/kodi/ffmpeg-001-kodi.patch @@ -1,7 +1,7 @@ -From 5180cdc317139414eedcb49627d240519435b104 Mon Sep 17 00:00:00 2001 +From c4b5aa630053c59eac2c2fe52071cd26c570107a Mon Sep 17 00:00:00 2001 From: marc Date: Mon, 18 Feb 2013 17:18:18 +0000 -Subject: [PATCH 01/15] dxva-h264: Fix an AMD driver issue with playback of +Subject: [PATCH 1/6] dxva-h264: Fix an AMD driver issue with playback of streams that don't start with an I-Frame --- @@ -12,10 +12,10 @@ Subject: [PATCH 01/15] dxva-h264: Fix an AMD driver issue with playback of 4 files changed, 12 insertions(+) diff --git a/libavcodec/dxva2_h264.c b/libavcodec/dxva2_h264.c -index 5b23b28f12..c0a8d80f3b 100644 +index 6300b1418d..9e53355fae 100644 --- a/libavcodec/dxva2_h264.c +++ b/libavcodec/dxva2_h264.c -@@ -504,6 +504,14 @@ static int dxva2_h264_end_frame(AVCodecContext *avctx) +@@ -506,6 +506,14 @@ static int dxva2_h264_end_frame(AVCodecContext *avctx) if (ctx_pic->slice_count <= 0 || ctx_pic->bitstream_size <= 0) return -1; @@ -31,10 +31,10 @@ index 5b23b28f12..c0a8d80f3b 100644 &ctx_pic->pp, sizeof(ctx_pic->pp), &ctx_pic->qm, sizeof(ctx_pic->qm), diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c -index 7c69016338..0b415ada6f 100644 +index d56722a5c2..a94a5a1784 100644 --- a/libavcodec/h264_slice.c +++ b/libavcodec/h264_slice.c -@@ -942,6 +942,7 @@ static int h264_slice_header_init(H264Context *h) +@@ -971,6 +971,7 @@ static int h264_slice_header_init(H264Context *h) h->first_field = 0; h->prev_interlaced_frame = 1; @@ -43,10 +43,10 @@ index 7c69016338..0b415ada6f 100644 init_scan_tables(h); ret = 
ff_h264_alloc_tables(h); diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c -index 485f47d36e..1705046e29 100644 +index 2a5b53ea56..8689b462d1 100644 --- a/libavcodec/h264dec.c +++ b/libavcodec/h264dec.c -@@ -442,6 +442,7 @@ void ff_h264_flush_change(H264Context *h) +@@ -448,6 +448,7 @@ void ff_h264_flush_change(H264Context *h) h->next_outputed_poc = INT_MIN; h->prev_interlaced_frame = 1; @@ -55,10 +55,10 @@ index 485f47d36e..1705046e29 100644 h->poc.prev_frame_num = -1; diff --git a/libavcodec/h264dec.h b/libavcodec/h264dec.h -index b3677cdbb9..b7b19ba4f1 100644 +index 9a1ec1bace..eab495cdd0 100644 --- a/libavcodec/h264dec.h +++ b/libavcodec/h264dec.h -@@ -540,6 +540,8 @@ typedef struct H264Context { +@@ -532,6 +532,8 @@ typedef struct H264Context { * slices) anymore */ int setup_finished; @@ -68,20 +68,20 @@ index b3677cdbb9..b7b19ba4f1 100644 int cur_bit_depth_luma; int16_t slice_row[MAX_SLICES]; ///< to detect when MAX_SLICES is too low -From 02731d93b4c725f13bf3b3217b48db3be18e0bce Mon Sep 17 00:00:00 2001 +From df996a4c35b85b61c73fa7cabc587299ed6b3957 Mon Sep 17 00:00:00 2001 From: Rechi Date: Tue, 21 Nov 2017 08:16:53 +0100 -Subject: [PATCH 02/15] use Kodi as extra version +Subject: [PATCH 2/6] use Kodi as extra version --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile -index 7e9d8b08c3..882a3cb31b 100644 +index 61f79e27ae..7ab083fd70 100644 --- a/Makefile +++ b/Makefile -@@ -130,7 +130,7 @@ GIT_LOG = $(SRC_PATH)/.git/logs/HEAD +@@ -137,7 +137,7 @@ GIT_LOG = $(SRC_PATH)/.git/logs/HEAD .version: M=@ libavutil/ffversion.h .version: @@ -91,10 +91,10 @@ index 7e9d8b08c3..882a3cb31b 100644 # force version.sh to run whenever version might have changed -From 1ec811b6f330f60ec7a522cc60e98bd8ae30c766 Mon Sep 17 00:00:00 2001 +From e02f5681f78b18d886af512d0ad5d553faa8968d Mon Sep 17 00:00:00 2001 From: Rechi Date: Tue, 21 Nov 2017 08:16:53 +0100 -Subject: [PATCH 03/15] common.mak: never ignore an error if strip 
doesn't +Subject: [PATCH 3/6] common.mak: never ignore an error if strip doesn't succeed --- @@ -102,10 +102,10 @@ Subject: [PATCH 03/15] common.mak: never ignore an error if strip doesn't 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ffbuild/common.mak b/ffbuild/common.mak -index 164a43932d..2ab5dd0dfd 100644 +index f52473453e..85f1d43bb8 100644 --- a/ffbuild/common.mak +++ b/ffbuild/common.mak -@@ -87,7 +87,7 @@ COMPILE_MSA = $(call COMPILE,CC,MSAFLAGS) +@@ -101,7 +101,7 @@ COMPILE_LASX = $(call COMPILE,CC,LASXFLAGS) %.o: %.asm $(COMPILE_X86ASM) @@ -115,11 +115,10 @@ index 164a43932d..2ab5dd0dfd 100644 %.o: %.rc $(WINDRES) $(IFLAGS) $(foreach ARG,$(CC_DEPFLAGS),--preprocessor-arg "$(ARG)") -o $@ $< -From 29b9dec3de00f69339e6a5fed79b2d8ce2b3c105 Mon Sep 17 00:00:00 2001 +From 43ad570ce0dd168bdcc206244594302f922d95f2 Mon Sep 17 00:00:00 2001 From: wsnipex Date: Tue, 21 Nov 2017 08:16:53 +0100 -Subject: [PATCH 04/15] only check for a git rev if the src tree is in a git - repo +Subject: [PATCH 4/6] only check for a git rev if the src tree is in a git repo fixes the version string when building from the kodi depends src tree --- @@ -147,11 +146,11 @@ index edc4dd33c5..239a138ca7 100755 # no revision number found test "$revision" || revision=$(cd "$1" && cat RELEASE 2> /dev/null) -From d2e9030c8a0d55426e13d1007e163c48f2533819 Mon Sep 17 00:00:00 2001 +From fec4c2031296c414685477c75e6a4bdff105a3ac Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Fri, 11 Jan 2019 10:47:43 +0100 -Subject: [PATCH 05/15] after 153b36f there is a possibility to crash when - trying to get index of a surface which points to nirvana. +Subject: [PATCH 5/6] after 153b36f there is a possibility to crash when trying + to get index of a surface which points to nirvana. it may occurs when a stream starts with non i-frame. --- @@ -159,7 +158,7 @@ it may occurs when a stream starts with non i-frame. 
1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/libavcodec/dxva2.c b/libavcodec/dxva2.c -index b57ea21941..542bc2f18d 100644 +index 568d686f39..735127f83a 100644 --- a/libavcodec/dxva2.c +++ b/libavcodec/dxva2.c @@ -777,16 +777,18 @@ unsigned ff_dxva2_get_surface_index(const AVCodecContext *avctx, @@ -186,10 +185,10 @@ index b57ea21941..542bc2f18d 100644 #endif -From 916223c0a47091272c9d0b6035f187310908ff37 Mon Sep 17 00:00:00 2001 +From ca8882fcaf5da0192772733a7ba832765df7c391 Mon Sep 17 00:00:00 2001 From: Rainer Hochecker Date: Sat, 26 Jan 2019 19:48:35 +0100 -Subject: [PATCH 06/15] avcodec/vaapi_h264: skip decode if pic has no slices +Subject: [PATCH 6/6] avcodec/vaapi_h264: skip decode if pic has no slices This fixes / workarounds https://bugs.freedesktop.org/show_bug.cgi?id=105368. It was hit frequently when watching h264 channels received via DVB-X. @@ -214,667 +213,3 @@ index 9332aa6f31..d4494beebf 100644 ret = ff_vaapi_decode_issue(avctx, pic); if (ret < 0) goto finish; - -From b211c09d17ef86d7b38d1bfe9814a01e9040bf03 Mon Sep 17 00:00:00 2001 -From: Fei Wang -Date: Tue, 12 Oct 2021 16:23:56 +0800 -Subject: [PATCH 07/15] cbs_av1: fix incorrect data type - -Since order_hint_bits_minus_1 range is 0~7, cur_frame_hint can be -most 128. And similar return value for cbs_av1_get_relative_dist. -So if plus them and use int8_t for the result may lose its precision. 
- -Signed-off-by: Fei Wang -(cherry picked from commit e7ff5722b1abae4284e79da707e71ff82b409699) -(cherry picked from commit 8aab15a91d6e8ca726580e969ff71828ad63baaa) ---- - libavcodec/cbs_av1_syntax_template.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/libavcodec/cbs_av1_syntax_template.c b/libavcodec/cbs_av1_syntax_template.c -index 6fe6e9a4f3..d98d3d42de 100644 ---- a/libavcodec/cbs_av1_syntax_template.c -+++ b/libavcodec/cbs_av1_syntax_template.c -@@ -355,7 +355,7 @@ static int FUNC(set_frame_refs)(CodedBitstreamContext *ctx, RWContext *rw, - AV1_REF_FRAME_ALTREF2, AV1_REF_FRAME_ALTREF - }; - int8_t ref_frame_idx[AV1_REFS_PER_FRAME], used_frame[AV1_NUM_REF_FRAMES]; -- int8_t shifted_order_hints[AV1_NUM_REF_FRAMES]; -+ int16_t shifted_order_hints[AV1_NUM_REF_FRAMES]; - int cur_frame_hint, latest_order_hint, earliest_order_hint, ref; - int i, j; - - -From 2db5def80f1913a00410d6f16ae3730de567c3b8 Mon Sep 17 00:00:00 2001 -From: Fei Wang -Date: Tue, 12 Oct 2021 16:23:57 +0800 -Subject: [PATCH 08/15] avcodec/av1: extend some definitions in spec section 3 - -Signed-off-by: Fei Wang -(cherry picked from commit 75de7fe26218cb37fff9d5afa7b5b2b8bee4a9a8) -(cherry picked from commit 2f459697445df67cc61c9a6c2930fdf3f830e629) ---- - libavcodec/av1.h | 7 +++++++ - 1 file changed, 7 insertions(+) - -diff --git a/libavcodec/av1.h b/libavcodec/av1.h -index 0f99ae4829..951a18ecb2 100644 ---- a/libavcodec/av1.h -+++ b/libavcodec/av1.h -@@ -114,6 +114,13 @@ enum { - AV1_WARP_MODEL_TRANSLATION = 1, - AV1_WARP_MODEL_ROTZOOM = 2, - AV1_WARP_MODEL_AFFINE = 3, -+ AV1_WARP_PARAM_REDUCE_BITS = 6, -+ -+ AV1_DIV_LUT_BITS = 8, -+ AV1_DIV_LUT_PREC_BITS = 14, -+ AV1_DIV_LUT_NUM = 257, -+ -+ AV1_MAX_LOOP_FILTER = 63, - }; - - - -From ddc3058a3e7b7c44a3911fb356f932853565b3d0 Mon Sep 17 00:00:00 2001 -From: Fei Wang -Date: Tue, 12 Oct 2021 16:23:58 +0800 -Subject: [PATCH 09/15] avcodec/av1dec: support setup shear process - -Defined in spec 7.11.3.6/7.11.3.7. 
- -Signed-off-by: Fei Wang -(cherry picked from commit de7475b111679120b3b089fe543224f50882287c) -(cherry picked from commit 481d3930d90d52587ad76d277cbd2f9cb3109079) ---- - libavcodec/av1dec.c | 98 +++++++++++++++++++++++++++++++++++++++++++++ - libavcodec/av1dec.h | 1 + - 2 files changed, 99 insertions(+) - -diff --git a/libavcodec/av1dec.c b/libavcodec/av1dec.c -index a75d6744d3..a3301f454f 100644 ---- a/libavcodec/av1dec.c -+++ b/libavcodec/av1dec.c -@@ -28,6 +28,34 @@ - #include "internal.h" - #include "profiles.h" - -+/**< same with Div_Lut defined in spec 7.11.3.7 */ -+static const uint16_t div_lut[AV1_DIV_LUT_NUM] = { -+ 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, -+ 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142, -+ 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564, -+ 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028, -+ 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530, -+ 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066, -+ 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, -+ 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, -+ 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848, -+ 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, -+ 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, -+ 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838, -+ 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, -+ 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, -+ 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986, -+ 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732, -+ 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489, -+ 9468, 9447, 9425, 9404, 
9383, 9362, 9341, 9321, 9300, 9279, 9259, -+ 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039, -+ 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830, -+ 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630, -+ 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439, -+ 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257, -+ 8240, 8224, 8208, 8192 -+}; -+ - static uint32_t inverse_recenter(int r, uint32_t v) - { - if (v > 2 * r) -@@ -97,6 +125,70 @@ static void read_global_param(AV1DecContext *s, int type, int ref, int idx) - -mx, mx + 1, r) << prec_diff) + round; - } - -+static uint64_t round_two(uint64_t x, uint16_t n) -+{ -+ if (n == 0) -+ return x; -+ return ((x + ((uint64_t)1 << (n - 1))) >> n); -+} -+ -+static int64_t round_two_signed(int64_t x, uint16_t n) -+{ -+ return ((x<0) ? -((int64_t)round_two(-x, n)) : (int64_t)round_two(x, n)); -+} -+ -+/** -+ * Resolve divisor process. -+ * see spec 7.11.3.7 -+ */ -+static int16_t resolve_divisor(uint32_t d, uint16_t *shift) -+{ -+ int32_t e, f; -+ -+ *shift = av_log2(d); -+ e = d - (1 << (*shift)); -+ if (*shift > AV1_DIV_LUT_BITS) -+ f = round_two(e, *shift - AV1_DIV_LUT_BITS); -+ else -+ f = e << (AV1_DIV_LUT_BITS - (*shift)); -+ -+ *shift += AV1_DIV_LUT_PREC_BITS; -+ -+ return div_lut[f]; -+} -+ -+/** -+ * check if global motion params is valid. 
-+ * see spec 7.11.3.6 -+ */ -+static uint8_t get_shear_params_valid(AV1DecContext *s, int idx) -+{ -+ int16_t alpha, beta, gamma, delta, divf, divs; -+ int64_t v, w; -+ int32_t *param = &s->cur_frame.gm_params[idx][0]; -+ if (param[2] < 0) -+ return 0; -+ -+ alpha = av_clip_int16(param[2] - (1 << AV1_WARPEDMODEL_PREC_BITS)); -+ beta = av_clip_int16(param[3]); -+ divf = resolve_divisor(abs(param[2]), &divs); -+ v = (int64_t)param[4] * (1 << AV1_WARPEDMODEL_PREC_BITS); -+ w = (int64_t)param[3] * param[4]; -+ gamma = av_clip_int16((int)round_two_signed((v * divf), divs)); -+ delta = av_clip_int16(param[5] - (int)round_two_signed((w * divf), divs) - (1 << AV1_WARPEDMODEL_PREC_BITS)); -+ -+ alpha = round_two_signed(alpha, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS; -+ beta = round_two_signed(beta, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS; -+ gamma = round_two_signed(gamma, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS; -+ delta = round_two_signed(delta, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS; -+ -+ if ((4 * abs(alpha) + 7 * abs(beta)) >= (1 << AV1_WARPEDMODEL_PREC_BITS) || -+ (4 * abs(gamma) + 4 * abs(delta)) >= (1 << AV1_WARPEDMODEL_PREC_BITS)) -+ return 0; -+ -+ return 1; -+} -+ - /** - * update gm type/params, since cbs already implemented part of this funcation, - * so we don't need to full implement spec. 
-@@ -144,6 +236,9 @@ static void global_motion_params(AV1DecContext *s) - read_global_param(s, type, ref, 0); - read_global_param(s, type, ref, 1); - } -+ if (type <= AV1_WARP_MODEL_AFFINE) { -+ s->cur_frame.gm_invalid[ref] = !get_shear_params_valid(s, ref); -+ } - } - } - -@@ -509,6 +604,9 @@ static int av1_frame_ref(AVCodecContext *avctx, AV1Frame *dst, const AV1Frame *s - - dst->spatial_id = src->spatial_id; - dst->temporal_id = src->temporal_id; -+ memcpy(dst->gm_invalid, -+ src->gm_invalid, -+ AV1_NUM_REF_FRAMES * sizeof(uint8_t)); - memcpy(dst->gm_type, - src->gm_type, - AV1_NUM_REF_FRAMES * sizeof(uint8_t)); -diff --git a/libavcodec/av1dec.h b/libavcodec/av1dec.h -index 248a68750f..4e140588b9 100644 ---- a/libavcodec/av1dec.h -+++ b/libavcodec/av1dec.h -@@ -42,6 +42,7 @@ typedef struct AV1Frame { - int temporal_id; - int spatial_id; - -+ uint8_t gm_invalid[AV1_NUM_REF_FRAMES]; - uint8_t gm_type[AV1_NUM_REF_FRAMES]; - int32_t gm_params[AV1_NUM_REF_FRAMES][6]; - - -From ad58733c84c131216e04ceb39d0dff64bfac5a2c Mon Sep 17 00:00:00 2001 -From: Fei Wang -Date: Tue, 12 Oct 2021 16:23:59 +0800 -Subject: [PATCH 10/15] avcodec/av1_vaapi: add gm params valid check - -Signed-off-by: Fei Wang -(cherry picked from commit 0d0ea70e7bdd85def85d526480d728740a371744) -(cherry picked from commit 8b9a48b7aa3c14103f975035bb18601b13ed1707) ---- - libavcodec/vaapi_av1.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/libavcodec/vaapi_av1.c b/libavcodec/vaapi_av1.c -index 16b7e35747..f577447be4 100644 ---- a/libavcodec/vaapi_av1.c -+++ b/libavcodec/vaapi_av1.c -@@ -213,7 +213,8 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx, - frame_header->height_in_sbs_minus_1[i]; - } - for (int i = AV1_REF_FRAME_LAST; i <= AV1_REF_FRAME_ALTREF; i++) { -- pic_param.wm[i - 1].wmtype = s->cur_frame.gm_type[i]; -+ pic_param.wm[i - 1].invalid = s->cur_frame.gm_invalid[i]; -+ pic_param.wm[i - 1].wmtype = s->cur_frame.gm_type[i]; - for (int j = 0; j < 6; j++) - 
pic_param.wm[i - 1].wmmat[j] = s->cur_frame.gm_params[i][j]; - } - -From fd1acddbee3686c97c2c38cc4befea38794eb44d Mon Sep 17 00:00:00 2001 -From: Tong Wu -Date: Tue, 12 Oct 2021 16:24:00 +0800 -Subject: [PATCH 11/15] avcodec/dxva2_av1: fix global motion params - -Defined in spec 5.9.24/5.9.25. Since function void -global_motion_params(AV1DecContext *s) already updates -gm type/params, the wminvalid parameter only need to get -the value from cur_frame.gm_invalid. - -Signed-off-by: Tong Wu -(cherry picked from commit 4e7a7d75e3c21a6af03c4cd52ffc50270664e58a) -(cherry picked from commit 03f5a57b9364d7ce789589594450fdf714a23e70) ---- - libavcodec/dxva2_av1.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/libavcodec/dxva2_av1.c b/libavcodec/dxva2_av1.c -index c30b57799c..8a912bf6c1 100644 ---- a/libavcodec/dxva2_av1.c -+++ b/libavcodec/dxva2_av1.c -@@ -139,7 +139,7 @@ static int fill_picture_parameters(const AVCodecContext *avctx, AVDXVAContext *c - pp->frame_refs[i].Index = ref_frame->buf[0] ? ref_idx : 0xFF; - - /* Global Motion */ -- pp->frame_refs[i].wminvalid = (h->cur_frame.gm_type[AV1_REF_FRAME_LAST + i] == AV1_WARP_MODEL_IDENTITY); -+ pp->frame_refs[i].wminvalid = h->cur_frame.gm_invalid[AV1_REF_FRAME_LAST + i]; - pp->frame_refs[i].wmtype = h->cur_frame.gm_type[AV1_REF_FRAME_LAST + i]; - for (j = 0; j < 6; ++j) { - pp->frame_refs[i].wmmat[j] = h->cur_frame.gm_params[AV1_REF_FRAME_LAST + i][j]; - -From 06181d3fd98d040ad2e1cb297896b3ba9235a9b4 Mon Sep 17 00:00:00 2001 -From: Fei Wang -Date: Tue, 12 Oct 2021 16:24:01 +0800 -Subject: [PATCH 12/15] avcodec/vaapi: increase av1 decode pool size - -For film grain clip, vaapi_av1 decoder will cache additional 8 -surfaces that will be used to store frames which apply film grain. -So increase the pool size by plus 8 to avoid leak of surface. 
- -Signed-off-by: Fei Wang -(cherry picked from commit 53403158cc19b9e5baeff6af9317f14d1a20d0cb) -(cherry picked from commit 5774a0524c0851293a36acf3f3586e7c39a64b4a) ---- - libavcodec/vaapi_decode.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/libavcodec/vaapi_decode.c b/libavcodec/vaapi_decode.c -index 57a0eb4e6e..032e8531f2 100644 ---- a/libavcodec/vaapi_decode.c -+++ b/libavcodec/vaapi_decode.c -@@ -577,10 +577,10 @@ static int vaapi_decode_make_config(AVCodecContext *avctx, - switch (avctx->codec_id) { - case AV_CODEC_ID_H264: - case AV_CODEC_ID_HEVC: -+ case AV_CODEC_ID_AV1: - frames->initial_pool_size += 16; - break; - case AV_CODEC_ID_VP9: -- case AV_CODEC_ID_AV1: - frames->initial_pool_size += 8; - break; - case AV_CODEC_ID_VP8: - -From 1d82fef4ddc4e5512219c8c0e2e93fe351d1f65a Mon Sep 17 00:00:00 2001 -From: Fei Wang -Date: Tue, 12 Oct 2021 16:24:02 +0800 -Subject: [PATCH 13/15] avcodec/av1_vaapi: setting 2 output surface for film - grain - -VAAPI needs 2 output surface for film grain frame. One used for -reference and the other used for applying film grain and pushing -to downstream. 
- -Signed-off-by: Fei Wang -(cherry picked from commit 7871144cf801bc8b9e3b00319dd7c3c3d91dd3fa) -(cherry picked from commit 5962698d25f148d6b89dc4e526fffc5db2295f1e) ---- - libavcodec/vaapi_av1.c | 115 ++++++++++++++++++++++++++++++++++++++--- - 1 file changed, 108 insertions(+), 7 deletions(-) - -diff --git a/libavcodec/vaapi_av1.c b/libavcodec/vaapi_av1.c -index f577447be4..26476c7738 100644 ---- a/libavcodec/vaapi_av1.c -+++ b/libavcodec/vaapi_av1.c -@@ -21,8 +21,28 @@ - #include "libavutil/pixdesc.h" - #include "hwconfig.h" - #include "vaapi_decode.h" -+#include "internal.h" - #include "av1dec.h" - -+typedef struct VAAPIAV1FrameRef { -+ ThreadFrame frame; -+ int valid; -+} VAAPIAV1FrameRef; -+ -+typedef struct VAAPIAV1DecContext { -+ VAAPIDecodeContext base; -+ -+ /** -+ * For film grain case, VAAPI generate 2 output for each frame, -+ * current_frame will not apply film grain, and will be used for -+ * references for next frames. Maintain the reference list without -+ * applying film grain here. And current_display_picture will be -+ * used to apply film grain and push to downstream. -+ */ -+ VAAPIAV1FrameRef ref_tab[AV1_NUM_REF_FRAMES]; -+ ThreadFrame tmp_frame; -+} VAAPIAV1DecContext; -+ - static VASurfaceID vaapi_av1_surface_id(AV1Frame *vf) - { - if (vf) -@@ -49,6 +69,48 @@ static int8_t vaapi_av1_get_bit_depth_idx(AVCodecContext *avctx) - return bit_depth == 8 ? 0 : bit_depth == 10 ? 
1 : 2; - } - -+static int vaapi_av1_decode_init(AVCodecContext *avctx) -+{ -+ VAAPIAV1DecContext *ctx = avctx->internal->hwaccel_priv_data; -+ -+ ctx->tmp_frame.f = av_frame_alloc(); -+ if (!ctx->tmp_frame.f) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Failed to allocate frame.\n"); -+ return AVERROR(ENOMEM); -+ } -+ -+ for (int i = 0; i < FF_ARRAY_ELEMS(ctx->ref_tab); i++) { -+ ctx->ref_tab[i].frame.f = av_frame_alloc(); -+ if (!ctx->ref_tab[i].frame.f) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Failed to allocate reference table frame %d.\n", i); -+ return AVERROR(ENOMEM); -+ } -+ ctx->ref_tab[i].valid = 0; -+ } -+ -+ return ff_vaapi_decode_init(avctx); -+} -+ -+static int vaapi_av1_decode_uninit(AVCodecContext *avctx) -+{ -+ VAAPIAV1DecContext *ctx = avctx->internal->hwaccel_priv_data; -+ -+ if (ctx->tmp_frame.f->buf[0]) -+ ff_thread_release_buffer(avctx, &ctx->tmp_frame); -+ av_frame_free(&ctx->tmp_frame.f); -+ -+ for (int i = 0; i < FF_ARRAY_ELEMS(ctx->ref_tab); i++) { -+ if (ctx->ref_tab[i].frame.f->buf[0]) -+ ff_thread_release_buffer(avctx, &ctx->ref_tab[i].frame); -+ av_frame_free(&ctx->ref_tab[i].frame.f); -+ } -+ -+ return ff_vaapi_decode_uninit(avctx); -+} -+ -+ - static int vaapi_av1_start_frame(AVCodecContext *avctx, - av_unused const uint8_t *buffer, - av_unused uint32_t size) -@@ -58,18 +120,28 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx, - const AV1RawFrameHeader *frame_header = s->raw_frame_header; - const AV1RawFilmGrainParams *film_grain = &s->cur_frame.film_grain; - VAAPIDecodePicture *pic = s->cur_frame.hwaccel_picture_private; -+ VAAPIAV1DecContext *ctx = avctx->internal->hwaccel_priv_data; - VADecPictureParameterBufferAV1 pic_param; - int8_t bit_depth_idx; - int err = 0; - int apply_grain = !(avctx->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN) && film_grain->apply_grain; - uint8_t remap_lr_type[4] = {AV1_RESTORE_NONE, AV1_RESTORE_SWITCHABLE, AV1_RESTORE_WIENER, AV1_RESTORE_SGRPROJ}; - -- pic->output_surface = 
vaapi_av1_surface_id(&s->cur_frame); -- - bit_depth_idx = vaapi_av1_get_bit_depth_idx(avctx); - if (bit_depth_idx < 0) - goto fail; - -+ if (apply_grain) { -+ if (ctx->tmp_frame.f->buf[0]) -+ ff_thread_release_buffer(avctx, &ctx->tmp_frame); -+ err = ff_thread_get_buffer(avctx, &ctx->tmp_frame, AV_GET_BUFFER_FLAG_REF); -+ if (err < 0) -+ goto fail; -+ pic->output_surface = ff_vaapi_get_surface_id(ctx->tmp_frame.f); -+ } else { -+ pic->output_surface = vaapi_av1_surface_id(&s->cur_frame); -+ } -+ - memset(&pic_param, 0, sizeof(VADecPictureParameterBufferAV1)); - pic_param = (VADecPictureParameterBufferAV1) { - .profile = seq->seq_profile, -@@ -77,6 +149,7 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx, - .bit_depth_idx = bit_depth_idx, - .current_frame = pic->output_surface, - .current_display_picture = pic->output_surface, -+ .current_display_picture = vaapi_av1_surface_id(&s->cur_frame), - .frame_width_minus1 = frame_header->frame_width_minus_1, - .frame_height_minus1 = frame_header->frame_height_minus_1, - .primary_ref_frame = frame_header->primary_ref_frame, -@@ -185,7 +258,9 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx, - if (pic_param.pic_info_fields.bits.frame_type == AV1_FRAME_KEY) - pic_param.ref_frame_map[i] = VA_INVALID_ID; - else -- pic_param.ref_frame_map[i] = vaapi_av1_surface_id(&s->ref[i]); -+ pic_param.ref_frame_map[i] = ctx->ref_tab[i].valid ? 
-+ ff_vaapi_get_surface_id(ctx->ref_tab[i].frame.f) : -+ vaapi_av1_surface_id(&s->ref[i]); - } - for (int i = 0; i < AV1_REFS_PER_FRAME; i++) { - pic_param.ref_frame_idx[i] = frame_header->ref_frame_idx[i]; -@@ -264,8 +339,34 @@ fail: - static int vaapi_av1_end_frame(AVCodecContext *avctx) - { - const AV1DecContext *s = avctx->priv_data; -+ const AV1RawFrameHeader *header = s->raw_frame_header; -+ const AV1RawFilmGrainParams *film_grain = &s->cur_frame.film_grain; - VAAPIDecodePicture *pic = s->cur_frame.hwaccel_picture_private; -- return ff_vaapi_decode_issue(avctx, pic); -+ VAAPIAV1DecContext *ctx = avctx->internal->hwaccel_priv_data; -+ -+ int apply_grain = !(avctx->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN) && film_grain->apply_grain; -+ int ret; -+ ret = ff_vaapi_decode_issue(avctx, pic); -+ if (ret < 0) -+ return ret; -+ -+ for (int i = 0; i < AV1_NUM_REF_FRAMES; i++) { -+ if (header->refresh_frame_flags & (1 << i)) { -+ if (ctx->ref_tab[i].frame.f->buf[0]) -+ ff_thread_release_buffer(avctx, &ctx->ref_tab[i].frame); -+ -+ if (apply_grain) { -+ ret = ff_thread_ref_frame(&ctx->ref_tab[i].frame, &ctx->tmp_frame); -+ if (ret < 0) -+ return ret; -+ ctx->ref_tab[i].valid = 1; -+ } else { -+ ctx->ref_tab[i].valid = 0; -+ } -+ } -+ } -+ -+ return 0; - } - - static int vaapi_av1_decode_slice(AVCodecContext *avctx, -@@ -312,9 +413,9 @@ const AVHWAccel ff_av1_vaapi_hwaccel = { - .end_frame = vaapi_av1_end_frame, - .decode_slice = vaapi_av1_decode_slice, - .frame_priv_data_size = sizeof(VAAPIDecodePicture), -- .init = ff_vaapi_decode_init, -- .uninit = ff_vaapi_decode_uninit, -+ .init = vaapi_av1_decode_init, -+ .uninit = vaapi_av1_decode_uninit, - .frame_params = ff_vaapi_common_frame_params, -- .priv_data_size = sizeof(VAAPIDecodeContext), -+ .priv_data_size = sizeof(VAAPIAV1DecContext), - .caps_internal = HWACCEL_CAP_ASYNC_SAFE, - }; - -From be18092cec1867b5e30525e023f49c5e7547931b Mon Sep 17 00:00:00 2001 -From: Fei Wang -Date: Tue, 12 Oct 2021 16:24:03 
+0800 -Subject: [PATCH 14/15] avcodec/av1_vaapi: enable segmentation features - -Signed-off-by: Fei Wang -(cherry picked from commit dc94f2eaaf0ae623d7dc02e1273c829015c025a3) -(cherry picked from commit 582fb329a483774f0345cbfebc3a12f0ad8f5bba) ---- - libavcodec/vaapi_av1.c | 14 ++++++++++++++ - 1 file changed, 14 insertions(+) - -diff --git a/libavcodec/vaapi_av1.c b/libavcodec/vaapi_av1.c -index 26476c7738..c57d1b898a 100644 ---- a/libavcodec/vaapi_av1.c -+++ b/libavcodec/vaapi_av1.c -@@ -126,6 +126,9 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx, - int err = 0; - int apply_grain = !(avctx->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN) && film_grain->apply_grain; - uint8_t remap_lr_type[4] = {AV1_RESTORE_NONE, AV1_RESTORE_SWITCHABLE, AV1_RESTORE_WIENER, AV1_RESTORE_SGRPROJ}; -+ uint8_t segmentation_feature_signed[AV1_SEG_LVL_MAX] = {1, 1, 1, 1, 1, 0, 0, 0}; -+ uint8_t segmentation_feature_max[AV1_SEG_LVL_MAX] = {255, AV1_MAX_LOOP_FILTER, -+ AV1_MAX_LOOP_FILTER, AV1_MAX_LOOP_FILTER, AV1_MAX_LOOP_FILTER, 7 , 0 , 0 }; - - bit_depth_idx = vaapi_av1_get_bit_depth_idx(avctx); - if (bit_depth_idx < 0) -@@ -293,6 +296,17 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx, - for (int j = 0; j < 6; j++) - pic_param.wm[i - 1].wmmat[j] = s->cur_frame.gm_params[i][j]; - } -+ for (int i = 0; i < AV1_MAX_SEGMENTS; i++) { -+ for (int j = 0; j < AV1_SEG_LVL_MAX; j++) { -+ pic_param.seg_info.feature_mask[i] |= (frame_header->feature_enabled[i][j] << j); -+ if (segmentation_feature_signed[j]) -+ pic_param.seg_info.feature_data[i][j] = av_clip(frame_header->feature_value[i][j], -+ -segmentation_feature_max[j], segmentation_feature_max[j]); -+ else -+ pic_param.seg_info.feature_data[i][j] = av_clip(frame_header->feature_value[i][j], -+ 0, segmentation_feature_max[j]); -+ } -+ } - if (apply_grain) { - for (int i = 0; i < film_grain->num_y_points; i++) { - pic_param.film_grain_info.point_y_value[i] = - -From 293e067b0c0f592628ee0de71769ed2e9c3d07f2 Mon Sep 17 
00:00:00 2001 -From: Fei Wang -Date: Tue, 12 Oct 2021 16:24:04 +0800 -Subject: [PATCH 15/15] avcodec/av1_vaapi: improve decode quality - -- quantizer delta and matrix level specific. -- support loop filter delta. -- support use superres. - -Signed-off-by: Fei Wang -(cherry picked from commit 84c73102d933c9b7f64f504196c91edddad99618) -(cherry picked from commit 2c887141b8318b7d4b198461bbb8d94ac662a96c) ---- - libavcodec/vaapi_av1.c | 68 +++++++++++++++++++++++++----------------- - 1 file changed, 41 insertions(+), 27 deletions(-) - -diff --git a/libavcodec/vaapi_av1.c b/libavcodec/vaapi_av1.c -index c57d1b898a..5985493b8d 100644 ---- a/libavcodec/vaapi_av1.c -+++ b/libavcodec/vaapi_av1.c -@@ -147,27 +147,35 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx, - - memset(&pic_param, 0, sizeof(VADecPictureParameterBufferAV1)); - pic_param = (VADecPictureParameterBufferAV1) { -- .profile = seq->seq_profile, -- .order_hint_bits_minus_1 = seq->order_hint_bits_minus_1, -- .bit_depth_idx = bit_depth_idx, -- .current_frame = pic->output_surface, -- .current_display_picture = pic->output_surface, -- .current_display_picture = vaapi_av1_surface_id(&s->cur_frame), -- .frame_width_minus1 = frame_header->frame_width_minus_1, -- .frame_height_minus1 = frame_header->frame_height_minus_1, -- .primary_ref_frame = frame_header->primary_ref_frame, -- .order_hint = frame_header->order_hint, -- .tile_cols = frame_header->tile_cols, -- .tile_rows = frame_header->tile_rows, -- .context_update_tile_id = frame_header->context_update_tile_id, -- .interp_filter = frame_header->interpolation_filter, -- .filter_level[0] = frame_header->loop_filter_level[0], -- .filter_level[1] = frame_header->loop_filter_level[1], -- .filter_level_u = frame_header->loop_filter_level[2], -- .filter_level_v = frame_header->loop_filter_level[3], -- .base_qindex = frame_header->base_q_idx, -- .cdef_damping_minus_3 = frame_header->cdef_damping_minus_3, -- .cdef_bits = frame_header->cdef_bits, -+ .profile = 
seq->seq_profile, -+ .order_hint_bits_minus_1 = seq->order_hint_bits_minus_1, -+ .bit_depth_idx = bit_depth_idx, -+ .matrix_coefficients = seq->color_config.matrix_coefficients, -+ .current_frame = pic->output_surface, -+ .current_display_picture = vaapi_av1_surface_id(&s->cur_frame), -+ .frame_width_minus1 = frame_header->frame_width_minus_1, -+ .frame_height_minus1 = frame_header->frame_height_minus_1, -+ .primary_ref_frame = frame_header->primary_ref_frame, -+ .order_hint = frame_header->order_hint, -+ .tile_cols = frame_header->tile_cols, -+ .tile_rows = frame_header->tile_rows, -+ .context_update_tile_id = frame_header->context_update_tile_id, -+ .superres_scale_denominator = frame_header->use_superres ? -+ frame_header->coded_denom + AV1_SUPERRES_DENOM_MIN : -+ AV1_SUPERRES_NUM, -+ .interp_filter = frame_header->interpolation_filter, -+ .filter_level[0] = frame_header->loop_filter_level[0], -+ .filter_level[1] = frame_header->loop_filter_level[1], -+ .filter_level_u = frame_header->loop_filter_level[2], -+ .filter_level_v = frame_header->loop_filter_level[3], -+ .base_qindex = frame_header->base_q_idx, -+ .y_dc_delta_q = frame_header->delta_q_y_dc, -+ .u_dc_delta_q = frame_header->delta_q_u_dc, -+ .u_ac_delta_q = frame_header->delta_q_u_ac, -+ .v_dc_delta_q = frame_header->delta_q_v_dc, -+ .v_ac_delta_q = frame_header->delta_q_v_ac, -+ .cdef_damping_minus_3 = frame_header->cdef_damping_minus_3, -+ .cdef_bits = frame_header->cdef_bits, - .seq_info_fields.fields = { - .still_picture = seq->still_picture, - .use_128x128_superblock = seq->use_128x128_superblock, -@@ -238,12 +246,15 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx, - .mode_ref_delta_update = frame_header->loop_filter_delta_update, - }, - .mode_control_fields.bits = { -- .delta_q_present_flag = frame_header->delta_q_present, -- .log2_delta_q_res = frame_header->delta_q_res, -- .tx_mode = frame_header->tx_mode, -- .reference_select = frame_header->reference_select, -- .reduced_tx_set_used = 
frame_header->reduced_tx_set, -- .skip_mode_present = frame_header->skip_mode_present, -+ .delta_q_present_flag = frame_header->delta_q_present, -+ .log2_delta_q_res = frame_header->delta_q_res, -+ .delta_lf_present_flag = frame_header->delta_lf_present, -+ .log2_delta_lf_res = frame_header->delta_lf_res, -+ .delta_lf_multi = frame_header->delta_lf_multi, -+ .tx_mode = frame_header->tx_mode, -+ .reference_select = frame_header->reference_select, -+ .reduced_tx_set_used = frame_header->reduced_tx_set, -+ .skip_mode_present = frame_header->skip_mode_present, - }, - .loop_restoration_fields.bits = { - .yframe_restoration_type = remap_lr_type[frame_header->lr_type[0]], -@@ -254,6 +265,9 @@ static int vaapi_av1_start_frame(AVCodecContext *avctx, - }, - .qmatrix_fields.bits = { - .using_qmatrix = frame_header->using_qmatrix, -+ .qm_y = frame_header->qm_y, -+ .qm_u = frame_header->qm_u, -+ .qm_v = frame_header->qm_v, - } - }; - From f887a675184c8b275de52ade9b86ccdf180a6bd6 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Fri, 21 Oct 2022 16:27:14 +0200 Subject: [PATCH 03/12] ffmpeg: update libreelec patch Patch created using revisions eacfcba..3af25a2 from branch 5.1.2-libreelec-misc of https://github.com/LibreELEC/FFmpeg --- .../libreelec/ffmpeg-001-libreelec.patch | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/libreelec/ffmpeg-001-libreelec.patch b/packages/multimedia/ffmpeg/patches/libreelec/ffmpeg-001-libreelec.patch index cb1c3ec27d..52fa0f68bf 100644 --- a/packages/multimedia/ffmpeg/patches/libreelec/ffmpeg-001-libreelec.patch +++ b/packages/multimedia/ffmpeg/patches/libreelec/ffmpeg-001-libreelec.patch @@ -1,4 +1,4 @@ -From 8f7c8a0f9e28641880d72996b9452e0a9da1288c Mon Sep 17 00:00:00 2001 +From 3c53cf80b0de8af387694a464bdce294988d0fa5 Mon Sep 17 00:00:00 2001 From: Lukas Rusak Date: Wed, 10 Apr 2019 13:39:21 -0700 Subject: [PATCH 1/2] libavcodec/libdav1d: add libdav1d_get_format method to 
@@ -18,10 +18,10 @@ decoding is properly activated. 1 file changed, 11 insertions(+) diff --git a/libavcodec/libdav1d.c b/libavcodec/libdav1d.c -index 3c2a68b7e0..68996426cc 100644 +index 0a46cf2264..1f4e708bcf 100644 --- a/libavcodec/libdav1d.c +++ b/libavcodec/libdav1d.c -@@ -58,6 +58,16 @@ static const enum AVPixelFormat pix_fmt_rgb[3] = { +@@ -66,6 +66,16 @@ static const enum AVPixelFormat pix_fmt_rgb[3] = { AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, }; @@ -38,16 +38,16 @@ index 3c2a68b7e0..68996426cc 100644 static void libdav1d_log_callback(void *opaque, const char *fmt, va_list vl) { AVCodecContext *c = opaque; -@@ -264,6 +274,7 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame) - c->profile = p->seq_hdr->profile; - c->level = ((p->seq_hdr->operating_points[0].major_level - 2) << 2) - | p->seq_hdr->operating_points[0].minor_level; +@@ -390,6 +400,7 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame) + if (res < 0) + goto fail; + + frame->format = c->pix_fmt = libdav1d_get_format(c, p); frame->width = p->p.w; frame->height = p->p.h; if (c->width != p->p.w || c->height != p->p.h) { -From 635cf67be3d37159c96e75f00399b3e232372251 Mon Sep 17 00:00:00 2001 +From 3af25a2cae2ad3152e2969eefd9f13c9bb183969 Mon Sep 17 00:00:00 2001 From: chewitt Date: Sun, 11 Aug 2019 07:08:19 +0000 Subject: [PATCH 2/2] add long-term yuv2rgb logging patch @@ -57,7 +57,7 @@ Subject: [PATCH 2/2] add long-term yuv2rgb logging patch 1 file changed, 4 deletions(-) diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c -index 6a3956e8e2..d6f9aea166 100644 +index 6ee483d12a..c22161741b 100644 --- a/libswscale/yuv2rgb.c +++ b/libswscale/yuv2rgb.c @@ -688,10 +688,6 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c) From eb2bac080d958aa3c080f02a56707bf59501fda9 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Wed, 26 Oct 2022 23:05:01 +0200 Subject: [PATCH 04/12] ffmpeg: drop no longer used dav1d patchset Signed-off-by: Matthias Reichl --- 
.../dav1d/ffmpeg-support-dav1d-1-0-0.patch | 124 ------------------ 1 file changed, 124 deletions(-) delete mode 100644 packages/multimedia/ffmpeg/patches/dav1d/ffmpeg-support-dav1d-1-0-0.patch diff --git a/packages/multimedia/ffmpeg/patches/dav1d/ffmpeg-support-dav1d-1-0-0.patch b/packages/multimedia/ffmpeg/patches/dav1d/ffmpeg-support-dav1d-1-0-0.patch deleted file mode 100644 index 6f570375d2..0000000000 --- a/packages/multimedia/ffmpeg/patches/dav1d/ffmpeg-support-dav1d-1-0-0.patch +++ /dev/null @@ -1,124 +0,0 @@ -From 7ee17ec7e46afef0e0af20af196292ec75f50b62 Mon Sep 17 00:00:00 2001 -From: James Almer -Date: Sat, 26 Jun 2021 17:24:15 -0300 -Subject: [PATCH] avcodec/libdav1d: don't repeatedly parse the same sequence - header - -Look at the event flag that signals a new sequence header was found -in the bitstream on supported libdav1d versions for this purpose. - -Signed-off-by: James Almer ---- - libavcodec/libdav1d.c | 10 ++++++++++ - 1 file changed, 10 insertions(+) - -diff --git a/libavcodec/libdav1d.c b/libavcodec/libdav1d.c -index 6370ae1fbf02..c39df418d515 100644 ---- a/libavcodec/libdav1d.c -+++ b/libavcodec/libdav1d.c -@@ -33,6 +33,9 @@ - #include "decode.h" - #include "internal.h" - -+#define FF_DAV1D_VERSION_AT_LEAST(x,y) \ -+ (DAV1D_API_VERSION_MAJOR > (x) || DAV1D_API_VERSION_MAJOR == (x) && DAV1D_API_VERSION_MINOR >= (y)) -+ - typedef struct Libdav1dContext { - AVClass *class; - Dav1dContext *c; - - -From d873b5fffc8292242549c4c026023e370e15c05b Mon Sep 17 00:00:00 2001 -From: James Almer -Date: Mon, 20 Sep 2021 22:30:35 -0300 -Subject: [PATCH] avcodec/libdav1d: pass auto threads value to libdav1d - -libdav1d 1.0.0 will be the first version supporting Dav1dSettings.n_threads == 0. 
- -Signed-off-by: James Almer ---- - libavcodec/libdav1d.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/libavcodec/libdav1d.c b/libavcodec/libdav1d.c -index 4711337f39a7..e4fdaf722907 100644 ---- a/libavcodec/libdav1d.c -+++ b/libavcodec/libdav1d.c -@@ -207,7 +207,11 @@ static av_cold int libdav1d_init(AVCodecContext *c) - { - Libdav1dContext *dav1d = c->priv_data; - Dav1dSettings s; -+#if FF_DAV1D_VERSION_AT_LEAST(6,0) -+ int threads = c->thread_count; -+#else - int threads = (c->thread_count ? c->thread_count : av_cpu_count()) * 3 / 2; -+#endif - int res; - - av_log(c, AV_LOG_INFO, "libdav1d %s\n", dav1d_version()); - - -From e204846ec16c1ab34c7f3a681734cf5190433018 Mon Sep 17 00:00:00 2001 -From: James Almer -Date: Fri, 3 Sep 2021 13:50:32 -0300 -Subject: [PATCH] avcodec/libdav1d: fix compilation after recent libdav1d API - changes - -They were done in preparation for an upcoming 1.0 release. -Keep supporting previous releases for the time being. - -Reviewed-by: BBB -Signed-off-by: James Almer ---- - libavcodec/libdav1d.c | 21 +++++++++++++++++++-- - 1 file changed, 19 insertions(+), 2 deletions(-) - -diff --git a/libavcodec/libdav1d.c b/libavcodec/libdav1d.c -index 51e0980f6edb..4711337f39a7 100644 ---- a/libavcodec/libdav1d.c -+++ b/libavcodec/libdav1d.c -@@ -228,6 +228,15 @@ static av_cold int libdav1d_init(AVCodecContext *c) - if (dav1d->operating_point >= 0) - s.operating_point = dav1d->operating_point; - -+#if FF_DAV1D_VERSION_AT_LEAST(6,0) -+ if (dav1d->frame_threads || dav1d->tile_threads) -+ s.n_threads = FFMAX(dav1d->frame_threads, dav1d->tile_threads); -+ else -+ s.n_threads = FFMIN(threads, DAV1D_MAX_THREADS); -+ s.max_frame_delay = (c->flags & AV_CODEC_FLAG_LOW_DELAY) ? 1 : s.n_threads; -+ av_log(c, AV_LOG_DEBUG, "Using %d threads, %d max_frame_delay\n", -+ s.n_threads, s.max_frame_delay); -+#else - s.n_tile_threads = dav1d->tile_threads - ? 
dav1d->tile_threads - : FFMIN(floor(sqrt(threads)), DAV1D_MAX_TILE_THREADS); -@@ -236,6 +245,7 @@ static av_cold int libdav1d_init(AVCodecContext *c) - : FFMIN(ceil(threads / s.n_tile_threads), DAV1D_MAX_FRAME_THREADS); - av_log(c, AV_LOG_DEBUG, "Using %d frame threads, %d tile threads\n", - s.n_frame_threads, s.n_tile_threads); -+#endif - - res = libdav1d_parse_extradata(c); - if (res < 0) -@@ -519,11 +529,18 @@ static av_cold int libdav1d_close(AVCodecContext *c) - return 0; - } - -+#ifndef DAV1D_MAX_FRAME_THREADS -+#define DAV1D_MAX_FRAME_THREADS DAV1D_MAX_THREADS -+#endif -+#ifndef DAV1D_MAX_TILE_THREADS -+#define DAV1D_MAX_TILE_THREADS DAV1D_MAX_THREADS -+#endif -+ - #define OFFSET(x) offsetof(Libdav1dContext, x) - #define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM - static const AVOption libdav1d_options[] = { -- { "tilethreads", "Tile threads", OFFSET(tile_threads), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, DAV1D_MAX_TILE_THREADS, VD }, -- { "framethreads", "Frame threads", OFFSET(frame_threads), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, DAV1D_MAX_FRAME_THREADS, VD }, -+ { "tilethreads", "Tile threads", OFFSET(tile_threads), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, DAV1D_MAX_TILE_THREADS, VD | AV_OPT_FLAG_DEPRECATED }, -+ { "framethreads", "Frame threads", OFFSET(frame_threads), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, DAV1D_MAX_FRAME_THREADS, VD | AV_OPT_FLAG_DEPRECATED }, - { "filmgrain", "Apply Film Grain", OFFSET(apply_grain), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VD | AV_OPT_FLAG_DEPRECATED }, - { "oppoint", "Select an operating point of the scalable bitstream", OFFSET(operating_point), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 31, VD }, - { "alllayers", "Output all spatial layers", OFFSET(all_layers), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VD }, From b27d064b981aeaf8b96a2de95b01d4790f6df801 Mon Sep 17 00:00:00 2001 From: Alex Bee Date: Sat, 5 Nov 2022 18:18:45 +0100 Subject: [PATCH 05/12] ffmpeg: update v4l2-request patch Patch created using revisions eacfcba..1d0bcfa from 
branch v4l2-request-n5.1.2 of https://github.com/jernejsk/FFmpeg --- .../ffmpeg-001-v4l2-request.patch | 552 +++++++++++------- .../0003-v4l2_request-revert-changes.patch | 80 --- 2 files changed, 332 insertions(+), 300 deletions(-) delete mode 100644 projects/Allwinner/patches/ffmpeg/0003-v4l2_request-revert-changes.patch diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request/ffmpeg-001-v4l2-request.patch b/packages/multimedia/ffmpeg/patches/v4l2-request/ffmpeg-001-v4l2-request.patch index 6afdb2d738..4e2bff5f77 100644 --- a/packages/multimedia/ffmpeg/patches/v4l2-request/ffmpeg-001-v4l2-request.patch +++ b/packages/multimedia/ffmpeg/patches/v4l2-request/ffmpeg-001-v4l2-request.patch @@ -1,7 +1,7 @@ -From 46ce980905101822ca824243635d10d660172570 Mon Sep 17 00:00:00 2001 +From baea39c4a1898a8eb89548196722ae56b1e27515 Mon Sep 17 00:00:00 2001 From: Jonas Karlman Date: Mon, 3 Dec 2018 23:48:04 +0100 -Subject: [PATCH 01/12] avutil: add av_buffer_pool_flush() +Subject: [PATCH 01/13] avutil: add av_buffer_pool_flush() Used by V4L2 request API hwaccel @@ -12,10 +12,10 @@ Signed-off-by: Jonas Karlman 2 files changed, 18 insertions(+) diff --git a/libavutil/buffer.c b/libavutil/buffer.c -index 858633e8c7..41555d9982 100644 +index 54590be566..1af892b348 100644 --- a/libavutil/buffer.c +++ b/libavutil/buffer.c -@@ -305,6 +305,19 @@ static void buffer_pool_free(AVBufferPool *pool) +@@ -319,6 +319,19 @@ static void buffer_pool_free(AVBufferPool *pool) av_freep(&pool); } @@ -36,11 +36,11 @@ index 858633e8c7..41555d9982 100644 { AVBufferPool *pool; diff --git a/libavutil/buffer.h b/libavutil/buffer.h -index 241a80ed67..f41363faf1 100644 +index e1ef5b7f07..fde9bae4f6 100644 --- a/libavutil/buffer.h +++ b/libavutil/buffer.h -@@ -315,6 +315,11 @@ AVBufferPool *av_buffer_pool_init2(size_t size, void *opaque, - #endif +@@ -284,6 +284,11 @@ AVBufferPool *av_buffer_pool_init2(size_t size, void *opaque, + AVBufferRef* (*alloc)(void *opaque, size_t size), void (*pool_free)(void 
*opaque)); +/** @@ -52,10 +52,10 @@ index 241a80ed67..f41363faf1 100644 * Mark the pool as being available for freeing. It will actually be freed only * once all the allocated buffers associated with the pool are released. Thus it -From 6f3b6c4d442a9a3322305e5600ce7f84af5971cc Mon Sep 17 00:00:00 2001 +From 91c471c226611f23d0eb07a7c5975470a01be985 Mon Sep 17 00:00:00 2001 From: Jonas Karlman Date: Sat, 15 Dec 2018 22:32:16 +0100 -Subject: [PATCH 02/12] Add common V4L2 request API code +Subject: [PATCH 02/13] Add common V4L2 request API code Signed-off-by: Jonas Karlman Signed-off-by: Alex Bee @@ -63,17 +63,17 @@ Signed-off-by: Alex Bee configure | 12 + libavcodec/Makefile | 1 + libavcodec/hwconfig.h | 2 + - libavcodec/v4l2_request.c | 1027 +++++++++++++++++++++++++++++++++++++ + libavcodec/v4l2_request.c | 1023 +++++++++++++++++++++++++++++++++++++ libavcodec/v4l2_request.h | 77 +++ - 5 files changed, 1119 insertions(+) + 5 files changed, 1115 insertions(+) create mode 100644 libavcodec/v4l2_request.c create mode 100644 libavcodec/v4l2_request.h diff --git a/configure b/configure -index 4ba72bf84b..4a3a5ae9e0 100755 +index ba5793b2ff..0ccfd0ac36 100755 --- a/configure +++ b/configure -@@ -279,6 +279,7 @@ External library support: +@@ -281,6 +281,7 @@ External library support: if openssl, gnutls or mbedtls is not used [no] --enable-libtwolame enable MP2 encoding via libtwolame [no] --enable-libuavs3d enable AVS3 decoding via libuavs3d [no] @@ -81,7 +81,7 @@ index 4ba72bf84b..4a3a5ae9e0 100755 --enable-libv4l2 enable libv4l2/v4l-utils [no] --enable-libvidstab enable video stabilization using vid.stab [no] --enable-libvmaf enable vmaf filter via libvmaf [no] -@@ -346,6 +347,7 @@ External library support: +@@ -349,6 +350,7 @@ External library support: --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no] --enable-rkmpp enable Rockchip Media Process Platform code [no] --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect] @@ -89,7 +89,7 @@ index 
4ba72bf84b..4a3a5ae9e0 100755 --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect] --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] --disable-videotoolbox disable VideoToolbox code [autodetect] -@@ -1814,6 +1816,7 @@ EXTERNAL_LIBRARY_LIST=" +@@ -1869,6 +1871,7 @@ EXTERNAL_LIBRARY_LIST=" libtheora libtwolame libuavs3d @@ -97,31 +97,31 @@ index 4ba72bf84b..4a3a5ae9e0 100755 libv4l2 libvmaf libvorbis -@@ -1868,6 +1871,7 @@ HWACCEL_LIBRARY_LIST=" +@@ -1924,6 +1927,7 @@ HWACCEL_LIBRARY_LIST=" mmal omx opencl + v4l2_request - vulkan " -@@ -2920,6 +2924,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext" + DOCUMENT_LIST=" +@@ -3011,6 +3015,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext" dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" ffnvcodec_deps_any="libdl LoadLibrary" nvdec_deps="ffnvcodec" +v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev" - vaapi_x11_deps="xlib" + vaapi_x11_deps="xlib_x11" videotoolbox_hwaccel_deps="videotoolbox pthreads" videotoolbox_hwaccel_extralibs="-framework QuartzCore" -@@ -6439,6 +6444,7 @@ enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame +@@ -6634,6 +6639,7 @@ enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; } enabled libuavs3d && require_pkg_config libuavs3d "uavs3d >= 1.1.41" uavs3d.h uavs3d_decode +enabled libudev && require_pkg_config libudev libudev libudev.h udev_new enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit - enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 1.5.2" libvmaf.h compute_vmaf -@@ -6537,6 +6543,10 @@ enabled rkmpp 
&& { require_pkg_config rkmpp rockchip_mpp rockchip/r + enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 2.0.0" libvmaf.h vmaf_init +@@ -6735,6 +6741,10 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r { enabled libdrm || die "ERROR: rkmpp requires --enable-libdrm"; } } @@ -132,7 +132,7 @@ index 4ba72bf84b..4a3a5ae9e0 100755 enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init -@@ -6618,6 +6628,8 @@ if enabled v4l2_m2m; then +@@ -6817,6 +6827,8 @@ if enabled v4l2_m2m; then check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" fi @@ -142,10 +142,10 @@ index 4ba72bf84b..4a3a5ae9e0 100755 test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 33a280cf69..90dfffcb20 100644 +index 457ec58377..3cb7dede11 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -155,6 +155,7 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o +@@ -162,6 +162,7 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o OBJS-$(CONFIG_VP56DSP) += vp56dsp.o OBJS-$(CONFIG_VP8DSP) += vp8dsp.o OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o @@ -154,13 +154,13 @@ index 33a280cf69..90dfffcb20 100644 OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h -index f421dc909f..ee78d8ab8e 100644 +index 721424912c..00864efc27 100644 --- a/libavcodec/hwconfig.h +++ b/libavcodec/hwconfig.h -@@ -80,6 +80,8 @@ typedef struct AVCodecHWConfigInternal { +@@ -78,6 +78,8 @@ typedef struct AVCodecHWConfigInternal { + HW_CONFIG_HWACCEL(1, 1, 1, VIDEOTOOLBOX, VIDEOTOOLBOX, ff_ ## codec ## _videotoolbox_hwaccel) + #define HWACCEL_D3D11VA(codec) \ HW_CONFIG_HWACCEL(0, 0, 1, D3D11VA_VLD, NONE, ff_ ## codec ## _d3d11va_hwaccel) - #define HWACCEL_XVMC(codec) \ - HW_CONFIG_HWACCEL(0, 0, 1, XVMC, NONE, ff_ ## codec ## _xvmc_hwaccel) +#define 
HWACCEL_V4L2REQUEST(codec) \ + HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel) @@ -168,10 +168,10 @@ index f421dc909f..ee78d8ab8e 100644 &(const AVCodecHWConfigInternal) { \ diff --git a/libavcodec/v4l2_request.c b/libavcodec/v4l2_request.c new file mode 100644 -index 0000000000..b57bbf29bc +index 0000000000..e7faf100f0 --- /dev/null +++ b/libavcodec/v4l2_request.c -@@ -0,0 +1,1027 @@ +@@ -0,0 +1,1023 @@ +/* + * This file is part of FFmpeg. + * @@ -316,14 +316,12 @@ index 0000000000..b57bbf29bc + .type = buf->buffer.type, + .memory = buf->buffer.memory, + .index = buf->index, -+ .timestamp.tv_usec = ctx->timestamp, ++ .timestamp.tv_usec = buf->index + 1, + .bytesused = buf->used, + .request_fd = request_fd, + .flags = ((request_fd >= 0) ? V4L2_BUF_FLAG_REQUEST_FD : 0) | flags, + }; + -+ buf->buffer.timestamp = buffer.timestamp; -+ + if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) { + planes[0].bytesused = buf->used; + buffer.bytesused = 0; @@ -413,9 +411,6 @@ index 0000000000..b57bbf29bc + + av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p used=%u controls=%d index=%d fd=%d request_fd=%d first_slice=%d last_slice=%d\n", __func__, avctx, req->output.used, count, req->capture.index, req->capture.fd, req->request_fd, first_slice, last_slice); + -+ if (first_slice) -+ ctx->timestamp++; -+ + ret = v4l2_request_set_controls(ctx, req->request_fd, control, count); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "%s: set controls failed for request %d, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); @@ -910,7 +905,6 @@ index 0000000000..b57bbf29bc + + ctx->media_fd = -1; + ctx->video_fd = -1; -+ ctx->timestamp = 0; + + udev = udev_new(); + if (!udev) { @@ -1044,6 +1038,8 @@ index 0000000000..b57bbf29bc + return ret; + } + ++ buf->buffer.timestamp.tv_usec = buf->index + 1; ++ + if (V4L2_TYPE_IS_OUTPUT(type)) { + void *addr = mmap(NULL, buf->size, PROT_READ | PROT_WRITE, MAP_SHARED, ctx->video_fd, V4L2_TYPE_IS_MULTIPLANAR(type) ? 
buf->buffer.m.planes[0].m.mem_offset : buf->buffer.m.offset); + if (addr == MAP_FAILED) { @@ -1099,7 +1095,7 @@ index 0000000000..b57bbf29bc + av_free(data); +} + -+static AVBufferRef *v4l2_request_frame_alloc(void *opaque, int size) ++static AVBufferRef *v4l2_request_frame_alloc(void *opaque, size_t size) +{ + AVCodecContext *avctx = opaque; + V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; @@ -1283,50 +1279,10 @@ index 0000000000..58d2aa70af + +#endif /* AVCODEC_V4L2_REQUEST_H */ -From 3a8ac13e041cec840d3cd1e83e6294a1a47ac6df Mon Sep 17 00:00:00 2001 -From: Ezequiel Garcia -Date: Wed, 20 Feb 2019 11:18:00 -0300 -Subject: [PATCH 03/12] h264dec: add idr_pic_id to slice context - -Used by V4L2 request API h264 hwaccel - -Signed-off-by: Ezequiel Garcia -Signed-off-by: Jonas Karlman ---- - libavcodec/h264_slice.c | 2 +- - libavcodec/h264dec.h | 1 + - 2 files changed, 2 insertions(+), 1 deletion(-) - -diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c -index 0b415ada6f..b3e3ef6183 100644 ---- a/libavcodec/h264_slice.c -+++ b/libavcodec/h264_slice.c -@@ -1830,7 +1830,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, - } - - if (nal->type == H264_NAL_IDR_SLICE) -- get_ue_golomb_long(&sl->gb); /* idr_pic_id */ -+ sl->idr_pic_id = get_ue_golomb_long(&sl->gb); - - sl->poc_lsb = 0; - sl->delta_poc_bottom = 0; -diff --git a/libavcodec/h264dec.h b/libavcodec/h264dec.h -index b7b19ba4f1..0698ab95ba 100644 ---- a/libavcodec/h264dec.h -+++ b/libavcodec/h264dec.h -@@ -336,6 +336,7 @@ typedef struct H264SliceContext { - int delta_poc[2]; - int curr_pic_num; - int max_pic_num; -+ int idr_pic_id; - } H264SliceContext; - - /** - -From e7f515597ca5f0900f3bd08ef40bb517703433bc Mon Sep 17 00:00:00 2001 +From 1aa288b7f95876298e88eff71af50c350a51f30a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 22 May 2019 14:44:22 +0200 -Subject: [PATCH 04/12] h264dec: add ref_pic_marking and pic_order_cnt bit_size +Subject: [PATCH 
03/13] h264dec: add ref_pic_marking and pic_order_cnt bit_size to slice context Used by V4L2 request API h264 hwaccel @@ -1339,10 +1295,10 @@ Signed-off-by: Jonas Karlman 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c -index b3e3ef6183..bcb9f70c0e 100644 +index d56722a5c2..3e239ae6f0 100644 --- a/libavcodec/h264_slice.c +++ b/libavcodec/h264_slice.c -@@ -1748,7 +1748,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, +@@ -1838,7 +1838,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, unsigned int slice_type, tmp, i; int field_pic_flag, bottom_field_flag; int first_slice = sl == h->slice_ctx && !h->current_slice; @@ -1351,7 +1307,7 @@ index b3e3ef6183..bcb9f70c0e 100644 if (first_slice) av_assert0(!h->setup_finished); -@@ -1834,6 +1834,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, +@@ -1929,6 +1929,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, sl->poc_lsb = 0; sl->delta_poc_bottom = 0; @@ -1359,7 +1315,7 @@ index b3e3ef6183..bcb9f70c0e 100644 if (sps->poc_type == 0) { sl->poc_lsb = get_bits(&sl->gb, sps->log2_max_poc_lsb); -@@ -1848,6 +1849,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, +@@ -1943,6 +1944,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, if (pps->pic_order_present == 1 && picture_structure == PICT_FRAME) sl->delta_poc[1] = get_se_golomb(&sl->gb); } @@ -1367,7 +1323,7 @@ index b3e3ef6183..bcb9f70c0e 100644 sl->redundant_pic_count = 0; if (pps->redundant_pic_cnt_present) -@@ -1887,9 +1889,11 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, +@@ -1982,9 +1984,11 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, sl->explicit_ref_marking = 0; if (nal->ref_idc) { @@ -1380,30 +1336,30 @@ index b3e3ef6183..bcb9f70c0e 
100644 if (sl->slice_type_nos != AV_PICTURE_TYPE_I && pps->cabac) { diff --git a/libavcodec/h264dec.h b/libavcodec/h264dec.h -index 0698ab95ba..2b39e82c3b 100644 +index 9a1ec1bace..a87415f822 100644 --- a/libavcodec/h264dec.h +++ b/libavcodec/h264dec.h -@@ -329,6 +329,7 @@ typedef struct H264SliceContext { - MMCO mmco[MAX_MMCO_COUNT]; +@@ -314,6 +314,7 @@ typedef struct H264SliceContext { + MMCO mmco[H264_MAX_MMCO_COUNT]; int nb_mmco; int explicit_ref_marking; + int ref_pic_marking_bit_size; int frame_num; - int poc_lsb; -@@ -337,6 +338,7 @@ typedef struct H264SliceContext { + int idr_pic_id; +@@ -322,6 +323,7 @@ typedef struct H264SliceContext { + int delta_poc[2]; int curr_pic_num; int max_pic_num; - int idr_pic_id; + int pic_order_cnt_bit_size; } H264SliceContext; /** -From 9f455a7adb8cabb575049204375cc3b8d97b2c86 Mon Sep 17 00:00:00 2001 +From efdc653700b45efb64af73060330b95e26aa99c5 Mon Sep 17 00:00:00 2001 From: Jernej Skrabec Date: Sat, 15 Dec 2018 22:32:16 +0100 -Subject: [PATCH 05/12] Add V4L2 request API h264 hwaccel +Subject: [PATCH 04/13] Add V4L2 request API h264 hwaccel Signed-off-by: Jernej Skrabec Signed-off-by: Jonas Karlman @@ -1418,10 +1374,10 @@ Signed-off-by: Jonas Karlman create mode 100644 libavcodec/v4l2_request_h264.c diff --git a/configure b/configure -index 4a3a5ae9e0..efd12f1b52 100755 +index 0ccfd0ac36..4f98dec7b1 100755 --- a/configure +++ b/configure -@@ -2952,6 +2952,8 @@ h264_dxva2_hwaccel_deps="dxva2" +@@ -3045,6 +3045,8 @@ h264_dxva2_hwaccel_deps="dxva2" h264_dxva2_hwaccel_select="h264_decoder" h264_nvdec_hwaccel_deps="nvdec" h264_nvdec_hwaccel_select="h264_decoder" @@ -1430,7 +1386,7 @@ index 4a3a5ae9e0..efd12f1b52 100755 h264_vaapi_hwaccel_deps="vaapi" h264_vaapi_hwaccel_select="h264_decoder" h264_vdpau_hwaccel_deps="vdpau" -@@ -6629,6 +6631,7 @@ if enabled v4l2_m2m; then +@@ -6828,6 +6830,7 @@ if enabled v4l2_m2m; then fi check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns @@ -1439,10 +1395,10 @@ index 
4a3a5ae9e0..efd12f1b52 100755 check_headers sys/videoio.h test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 90dfffcb20..426c7528e9 100644 +index 3cb7dede11..54f8ed670b 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -935,6 +935,7 @@ OBJS-$(CONFIG_H264_D3D11VA_HWACCEL) += dxva2_h264.o +@@ -966,6 +966,7 @@ OBJS-$(CONFIG_H264_D3D11VA_HWACCEL) += dxva2_h264.o OBJS-$(CONFIG_H264_DXVA2_HWACCEL) += dxva2_h264.o OBJS-$(CONFIG_H264_NVDEC_HWACCEL) += nvdec_h264.o OBJS-$(CONFIG_H264_QSV_HWACCEL) += qsvdec.o @@ -1451,10 +1407,10 @@ index 90dfffcb20..426c7528e9 100644 OBJS-$(CONFIG_H264_VDPAU_HWACCEL) += vdpau_h264.o OBJS-$(CONFIG_H264_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c -index bcb9f70c0e..6b7f569da4 100644 +index 3e239ae6f0..335dc2cac1 100644 --- a/libavcodec/h264_slice.c +++ b/libavcodec/h264_slice.c -@@ -768,6 +768,7 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) +@@ -792,6 +792,7 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) #define HWACCEL_MAX (CONFIG_H264_DXVA2_HWACCEL + \ (CONFIG_H264_D3D11VA_HWACCEL * 2) + \ CONFIG_H264_NVDEC_HWACCEL + \ @@ -1462,10 +1418,10 @@ index bcb9f70c0e..6b7f569da4 100644 CONFIG_H264_VAAPI_HWACCEL + \ CONFIG_H264_VIDEOTOOLBOX_HWACCEL + \ CONFIG_H264_VDPAU_HWACCEL) -@@ -852,6 +853,9 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) +@@ -881,6 +882,9 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) #endif - #if CONFIG_H264_VIDEOTOOLBOX_HWACCEL - *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; + #if CONFIG_H264_VAAPI_HWACCEL + *fmt++ = AV_PIX_FMT_VAAPI; +#endif +#if CONFIG_H264_V4L2REQUEST_HWACCEL + *fmt++ = AV_PIX_FMT_DRM_PRIME; @@ -1473,10 +1429,10 @@ index bcb9f70c0e..6b7f569da4 100644 if 
(h->avctx->codec->pix_fmts) choices = h->avctx->codec->pix_fmts; diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c -index 1705046e29..55046031b2 100644 +index 2a5b53ea56..dcba237b4f 100644 --- a/libavcodec/h264dec.c +++ b/libavcodec/h264dec.c -@@ -1076,6 +1076,9 @@ AVCodec ff_h264_decoder = { +@@ -1099,6 +1099,9 @@ const FFCodec ff_h264_decoder = { #endif #if CONFIG_H264_VIDEOTOOLBOX_HWACCEL HWACCEL_VIDEOTOOLBOX(h264), @@ -1487,10 +1443,10 @@ index 1705046e29..55046031b2 100644 NULL }, diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h -index 8e54cf73f9..969a1da0f4 100644 +index aca55831f3..014b95f0c0 100644 --- a/libavcodec/hwaccels.h +++ b/libavcodec/hwaccels.h -@@ -32,6 +32,7 @@ extern const AVHWAccel ff_h264_d3d11va_hwaccel; +@@ -33,6 +33,7 @@ extern const AVHWAccel ff_h264_d3d11va_hwaccel; extern const AVHWAccel ff_h264_d3d11va2_hwaccel; extern const AVHWAccel ff_h264_dxva2_hwaccel; extern const AVHWAccel ff_h264_nvdec_hwaccel; @@ -1500,7 +1456,7 @@ index 8e54cf73f9..969a1da0f4 100644 extern const AVHWAccel ff_h264_videotoolbox_hwaccel; diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c new file mode 100644 -index 0000000000..394bae0550 +index 0000000000..c960c9c887 --- /dev/null +++ b/libavcodec/v4l2_request_h264.c @@ -0,0 +1,456 @@ @@ -1524,6 +1480,7 @@ index 0000000000..394bae0550 + +#include "h264dec.h" +#include "hwconfig.h" ++#include "internal.h" +#include "v4l2_request.h" + +typedef struct V4L2RequestControlsH264 { @@ -1758,7 +1715,7 @@ index 0000000000..394bae0550 + + fill_dpb(&controls->decode_params, h); + -+ controls->first_slice = !FIELD_PICTURE(h) || h->first_field; ++ controls->first_slice = 1; + controls->num_slices = 0; + + return ff_v4l2_request_reset_frame(avctx, h->cur_pic_ptr->f); @@ -1889,8 +1846,7 @@ index 0000000000..394bae0550 + +static int v4l2_request_h264_end_frame(AVCodecContext *avctx) +{ -+ const H264Context *h = avctx->priv_data; -+ return v4l2_request_h264_queue_decode(avctx, 
!FIELD_PICTURE(h) || !h->first_field); ++ return v4l2_request_h264_queue_decode(avctx, 1); +} + +static int v4l2_request_h264_set_controls(AVCodecContext *avctx) @@ -1961,10 +1917,10 @@ index 0000000000..394bae0550 + .caps_internal = HWACCEL_CAP_ASYNC_SAFE, +}; -From 1ffea498d7e1000acbaa456bb52e26757779622a Mon Sep 17 00:00:00 2001 +From a1e633c09c135ff68937f1d25bbffc18c8cb55d1 Mon Sep 17 00:00:00 2001 From: Jonas Karlman Date: Sat, 15 Dec 2018 22:32:16 +0100 -Subject: [PATCH 06/12] Add V4L2 request API mpeg2 hwaccel +Subject: [PATCH 05/13] Add V4L2 request API mpeg2 hwaccel Signed-off-by: Jonas Karlman --- @@ -1977,10 +1933,10 @@ Signed-off-by: Jonas Karlman create mode 100644 libavcodec/v4l2_request_mpeg2.c diff --git a/configure b/configure -index efd12f1b52..1b70ea65e4 100755 +index 4f98dec7b1..c173884edb 100755 --- a/configure +++ b/configure -@@ -2996,6 +2996,8 @@ mpeg2_dxva2_hwaccel_deps="dxva2" +@@ -3085,6 +3085,8 @@ mpeg2_dxva2_hwaccel_deps="dxva2" mpeg2_dxva2_hwaccel_select="mpeg2video_decoder" mpeg2_nvdec_hwaccel_deps="nvdec" mpeg2_nvdec_hwaccel_select="mpeg2video_decoder" @@ -1989,7 +1945,7 @@ index efd12f1b52..1b70ea65e4 100755 mpeg2_vaapi_hwaccel_deps="vaapi" mpeg2_vaapi_hwaccel_select="mpeg2video_decoder" mpeg2_vdpau_hwaccel_deps="vdpau" -@@ -6632,6 +6634,7 @@ fi +@@ -6831,6 +6833,7 @@ fi check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" @@ -1998,10 +1954,10 @@ index efd12f1b52..1b70ea65e4 100755 check_headers sys/videoio.h test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 426c7528e9..02c023a447 100644 +index 54f8ed670b..65ead12255 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -955,6 +955,7 @@ OBJS-$(CONFIG_MPEG2_D3D11VA_HWACCEL) += dxva2_mpeg2.o +@@ -985,6 +985,7 @@ 
OBJS-$(CONFIG_MPEG2_D3D11VA_HWACCEL) += dxva2_mpeg2.o OBJS-$(CONFIG_MPEG2_DXVA2_HWACCEL) += dxva2_mpeg2.o OBJS-$(CONFIG_MPEG2_NVDEC_HWACCEL) += nvdec_mpeg12.o OBJS-$(CONFIG_MPEG2_QSV_HWACCEL) += qsvdec.o @@ -2010,7 +1966,7 @@ index 426c7528e9..02c023a447 100644 OBJS-$(CONFIG_MPEG2_VDPAU_HWACCEL) += vdpau_mpeg12.o OBJS-$(CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h -index 969a1da0f4..a8ae1483d8 100644 +index 014b95f0c0..3b675dd9f8 100644 --- a/libavcodec/hwaccels.h +++ b/libavcodec/hwaccels.h @@ -53,6 +53,7 @@ extern const AVHWAccel ff_mpeg2_d3d11va_hwaccel; @@ -2022,10 +1978,10 @@ index 969a1da0f4..a8ae1483d8 100644 extern const AVHWAccel ff_mpeg2_vdpau_hwaccel; extern const AVHWAccel ff_mpeg2_videotoolbox_hwaccel; diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c -index 09bf01247d..bcdf26680a 100644 +index e9bde48f7a..8c007756b4 100644 --- a/libavcodec/mpeg12dec.c +++ b/libavcodec/mpeg12dec.c -@@ -1147,6 +1147,9 @@ static const enum AVPixelFormat mpeg2_hwaccel_pixfmt_list_420[] = { +@@ -1137,6 +1137,9 @@ static const enum AVPixelFormat mpeg2_hwaccel_pixfmt_list_420[] = { #endif #if CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL AV_PIX_FMT_VIDEOTOOLBOX, @@ -2035,10 +1991,10 @@ index 09bf01247d..bcdf26680a 100644 #endif AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE -@@ -2961,6 +2964,9 @@ AVCodec ff_mpeg2video_decoder = { +@@ -2935,6 +2938,9 @@ const FFCodec ff_mpeg2video_decoder = { #endif - #if CONFIG_MPEG2_XVMC_HWACCEL - HWACCEL_XVMC(mpeg2), + #if CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL + HWACCEL_VIDEOTOOLBOX(mpeg2), +#endif +#if CONFIG_MPEG2_V4L2REQUEST_HWACCEL + HWACCEL_V4L2REQUEST(mpeg2), @@ -2211,10 +2167,10 @@ index 0000000000..84d53209c7 + .caps_internal = HWACCEL_CAP_ASYNC_SAFE, +}; -From 2190df619ea9d9cedf3d3c7442de0dc863c8b62e Mon Sep 17 00:00:00 2001 +From 6bccc47f7ee6267b0c99318ab4cc47978161cef9 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 22 May 2019 14:46:58 +0200 -Subject: [PATCH 07/12] 
Add V4L2 request API vp8 hwaccel +Subject: [PATCH 06/13] Add V4L2 request API vp8 hwaccel Signed-off-by: Boris Brezillon Signed-off-by: Ezequiel Garcia @@ -2229,10 +2185,10 @@ Signed-off-by: Jonas Karlman create mode 100644 libavcodec/v4l2_request_vp8.c diff --git a/configure b/configure -index 1b70ea65e4..3f8f7b195a 100755 +index c173884edb..ec16d26391 100755 --- a/configure +++ b/configure -@@ -3028,6 +3028,8 @@ vc1_vdpau_hwaccel_deps="vdpau" +@@ -3117,6 +3117,8 @@ vc1_vdpau_hwaccel_deps="vdpau" vc1_vdpau_hwaccel_select="vc1_decoder" vp8_nvdec_hwaccel_deps="nvdec" vp8_nvdec_hwaccel_select="vp8_decoder" @@ -2241,7 +2197,7 @@ index 1b70ea65e4..3f8f7b195a 100755 vp8_vaapi_hwaccel_deps="vaapi" vp8_vaapi_hwaccel_select="vp8_decoder" vp9_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_VP9" -@@ -6635,6 +6637,7 @@ fi +@@ -6834,6 +6836,7 @@ fi check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;" @@ -2250,10 +2206,10 @@ index 1b70ea65e4..3f8f7b195a 100755 check_headers sys/videoio.h test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 02c023a447..c79d678eb3 100644 +index 65ead12255..aed145ecb6 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -971,6 +971,7 @@ OBJS-$(CONFIG_VC1_QSV_HWACCEL) += qsvdec.o +@@ -1000,6 +1000,7 @@ OBJS-$(CONFIG_VC1_QSV_HWACCEL) += qsvdec.o OBJS-$(CONFIG_VC1_VAAPI_HWACCEL) += vaapi_vc1.o OBJS-$(CONFIG_VC1_VDPAU_HWACCEL) += vdpau_vc1.o OBJS-$(CONFIG_VP8_NVDEC_HWACCEL) += nvdec_vp8.o @@ -2262,7 +2218,7 @@ index 02c023a447..c79d678eb3 100644 OBJS-$(CONFIG_VP9_D3D11VA_HWACCEL) += dxva2_vp9.o OBJS-$(CONFIG_VP9_DXVA2_HWACCEL) += dxva2_vp9.o diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h -index 
a8ae1483d8..9f8d41e367 100644 +index 3b675dd9f8..6f9f078001 100644 --- a/libavcodec/hwaccels.h +++ b/libavcodec/hwaccels.h @@ -69,6 +69,7 @@ extern const AVHWAccel ff_vc1_nvdec_hwaccel; @@ -2460,10 +2416,10 @@ index 0000000000..bc0fc40072 + .caps_internal = HWACCEL_CAP_ASYNC_SAFE, +}; diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c -index d16e7b6aa3..8ee768d875 100644 +index f521f2c9de..a4799fca09 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c -@@ -176,6 +176,9 @@ static enum AVPixelFormat get_pixel_format(VP8Context *s) +@@ -180,6 +180,9 @@ static enum AVPixelFormat get_pixel_format(VP8Context *s) #endif #if CONFIG_VP8_NVDEC_HWACCEL AV_PIX_FMT_CUDA, @@ -2473,7 +2429,7 @@ index d16e7b6aa3..8ee768d875 100644 #endif AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE, -@@ -2972,6 +2975,9 @@ AVCodec ff_vp8_decoder = { +@@ -2976,6 +2979,9 @@ const FFCodec ff_vp8_decoder = { #endif #if CONFIG_VP8_NVDEC_HWACCEL HWACCEL_NVDEC(vp8), @@ -2484,10 +2440,10 @@ index d16e7b6aa3..8ee768d875 100644 NULL }, -From 4886d1e8caeee49c6ca4d92bc1eaebcdb884924c Mon Sep 17 00:00:00 2001 +From 26e5e1407717e88c508abfb24ae329036bb1a99e Mon Sep 17 00:00:00 2001 From: Jernej Skrabec Date: Sat, 15 Dec 2018 22:32:16 +0100 -Subject: [PATCH 08/12] Add V4L2 request API hevc hwaccel +Subject: [PATCH 07/13] Add V4L2 request API hevc hwaccel Signed-off-by: Jernej Skrabec Signed-off-by: Jonas Karlman @@ -2498,15 +2454,15 @@ Signed-off-by: Alex Bee libavcodec/Makefile | 1 + libavcodec/hevcdec.c | 10 + libavcodec/hwaccels.h | 1 + - libavcodec/v4l2_request_hevc.c | 681 +++++++++++++++++++++++++++++++++ - 5 files changed, 696 insertions(+) + libavcodec/v4l2_request_hevc.c | 679 +++++++++++++++++++++++++++++++++ + 5 files changed, 694 insertions(+) create mode 100644 libavcodec/v4l2_request_hevc.c diff --git a/configure b/configure -index 3f8f7b195a..6192a6c144 100755 +index ec16d26391..d366652e51 100755 --- a/configure +++ b/configure -@@ -2968,6 +2968,8 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" +@@ 
-3061,6 +3061,8 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" hevc_dxva2_hwaccel_select="hevc_decoder" hevc_nvdec_hwaccel_deps="nvdec" hevc_nvdec_hwaccel_select="hevc_decoder" @@ -2515,7 +2471,7 @@ index 3f8f7b195a..6192a6c144 100755 hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" hevc_vaapi_hwaccel_select="hevc_decoder" hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" -@@ -6636,6 +6638,7 @@ fi +@@ -6835,6 +6837,7 @@ fi check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" @@ -2524,10 +2480,10 @@ index 3f8f7b195a..6192a6c144 100755 check_cc vp8_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_VP8_FRAME;" diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index c79d678eb3..0059074530 100644 +index aed145ecb6..33ce5fc359 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -943,6 +943,7 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o +@@ -974,6 +974,7 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o @@ -2536,10 +2492,10 @@ index c79d678eb3..0059074530 100644 OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c -index 2231aed259..7507966d71 100644 +index f8f981e838..8db5649b55 100644 --- a/libavcodec/hevcdec.c +++ b/libavcodec/hevcdec.c -@@ -392,6 +392,7 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) +@@ -402,6 +402,7 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ CONFIG_HEVC_NVDEC_HWACCEL + \ @@ -2547,7 +2503,7 @@ index 2231aed259..7507966d71 100644 CONFIG_HEVC_VAAPI_HWACCEL + \ 
CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ CONFIG_HEVC_VDPAU_HWACCEL) -@@ -418,6 +419,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) +@@ -428,6 +429,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) #endif #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; @@ -2557,7 +2513,7 @@ index 2231aed259..7507966d71 100644 #endif break; case AV_PIX_FMT_YUV420P10: -@@ -439,6 +443,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) +@@ -449,6 +453,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) #endif #if CONFIG_HEVC_NVDEC_HWACCEL *fmt++ = AV_PIX_FMT_CUDA; @@ -2567,7 +2523,7 @@ index 2231aed259..7507966d71 100644 #endif break; case AV_PIX_FMT_YUV444P: -@@ -3705,6 +3712,9 @@ AVCodec ff_hevc_decoder = { +@@ -3905,6 +3912,9 @@ const FFCodec ff_hevc_decoder = { #endif #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL HWACCEL_VIDEOTOOLBOX(hevc), @@ -2578,10 +2534,10 @@ index 2231aed259..7507966d71 100644 NULL }, diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h -index 9f8d41e367..ffb9fa5087 100644 +index 6f9f078001..e4e4abc060 100644 --- a/libavcodec/hwaccels.h +++ b/libavcodec/hwaccels.h -@@ -40,6 +40,7 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel; +@@ -41,6 +41,7 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel; extern const AVHWAccel ff_hevc_d3d11va2_hwaccel; extern const AVHWAccel ff_hevc_dxva2_hwaccel; extern const AVHWAccel ff_hevc_nvdec_hwaccel; @@ -2591,10 +2547,10 @@ index 9f8d41e367..ffb9fa5087 100644 extern const AVHWAccel ff_hevc_videotoolbox_hwaccel; diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c new file mode 100644 -index 0000000000..c12748ed03 +index 0000000000..3e2b9a575e --- /dev/null +++ b/libavcodec/v4l2_request_hevc.c -@@ -0,0 +1,681 @@ +@@ -0,0 +1,679 @@ +/* + * This file is part of FFmpeg. 
+ * @@ -2615,6 +2571,7 @@ index 0000000000..c12748ed03 + +#include "hevcdec.h" +#include "hwconfig.h" ++#include "internal.h" +#include "v4l2_request.h" + +#define MAX_SLICES 600 // as per HEVC spec ? @@ -3072,29 +3029,27 @@ index 0000000000..c12748ed03 + .size = sizeof(controls->dec_params), + }; + -+ if (ctx->supports_scaling_matrix) { ++ if (ctx->supports_scaling_matrix) + control[num_controls++] = (struct v4l2_ext_control) { + .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX, + .ptr = &controls->scaling_matrix, + .size = sizeof(controls->scaling_matrix), + }; -+ } + -+ if (ctx->supports_slices) { ++ if (ctx->supports_slices) + control[num_controls++] = (struct v4l2_ext_control) { + .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, + .ptr = &controls->slice_params, + .size = sizeof(*first_slice_params) * controls->num_slices, + }; -+ } ++ + //this assumes that decoders supporting entry_point_offsets submit a single slice per request -+ if (ctx->supports_entry_point_offsets && first_slice_params->num_entry_point_offsets > 0) { ++ if (ctx->supports_entry_point_offsets && first_slice_params->num_entry_point_offsets > 0) + control[num_controls++] = (struct v4l2_ext_control) { + .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, + .ptr = controls->entry_point_offsets, + .size = sizeof(*controls->entry_point_offsets) * first_slice_params->num_entry_point_offsets, + }; -+ } + + if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED) + return ff_v4l2_request_decode_slice(avctx, h->ref->frame, control, num_controls, controls->first_slice, last_slice); @@ -3217,11 +3172,10 @@ index 0000000000..c12748ed03 + } + + ret = ff_v4l2_request_query_control(avctx, &scaling_matrix); -+ if (ret) { ++ if (ret) + ctx->supports_scaling_matrix = 0; -+ } else { ++ else + ctx->supports_scaling_matrix = 1; -+ } + + av_log(avctx, AV_LOG_DEBUG, "%s: decoder is %s and supports slices %d, supports entry_point_offsets: %d supports scaling_matrix: %d max slices: %u\n", + __func__, @@ -3277,10 
+3231,10 @@ index 0000000000..c12748ed03 + .caps_internal = HWACCEL_CAP_ASYNC_SAFE, +}; -From 44f20a53f2ef6ad0ecfb413ebcc95f92fc17377f Mon Sep 17 00:00:00 2001 +From 24e1336751a0d7346ea149aa3cbe9445e12b885e Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 12 Dec 2019 16:13:55 +0100 -Subject: [PATCH 09/12] Add V4L2 request API VP9 hwaccel +Subject: [PATCH 08/13] Add V4L2 request API VP9 hwaccel Signed-off-by: Boris Brezillon Signed-off-by: Jernej Skrabec @@ -3296,10 +3250,10 @@ Signed-off-by: Jernej Skrabec create mode 100644 libavcodec/v4l2_request_vp9.c diff --git a/configure b/configure -index 6192a6c144..36a1271a6c 100755 +index d366652e51..78d0c5a420 100755 --- a/configure +++ b/configure -@@ -3042,6 +3042,8 @@ vp9_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_VP9" +@@ -3131,6 +3131,8 @@ vp9_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_VP9" vp9_dxva2_hwaccel_select="vp9_decoder" vp9_nvdec_hwaccel_deps="nvdec" vp9_nvdec_hwaccel_select="vp9_decoder" @@ -3308,7 +3262,7 @@ index 6192a6c144..36a1271a6c 100755 vp9_vaapi_hwaccel_deps="vaapi VADecPictureParameterBufferVP9_bit_depth" vp9_vaapi_hwaccel_select="vp9_decoder" vp9_vdpau_hwaccel_deps="vdpau VdpPictureInfoVP9" -@@ -6641,6 +6643,7 @@ check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" +@@ -6840,6 +6842,7 @@ check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;" check_cc vp8_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_VP8_FRAME;" @@ -3317,19 +3271,19 @@ index 6192a6c144..36a1271a6c 100755 check_headers sys/videoio.h test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 0059074530..38edf1cfe5 100644 +index 33ce5fc359..6561518e0e 100644 --- 
a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -977,6 +977,7 @@ OBJS-$(CONFIG_VP8_VAAPI_HWACCEL) += vaapi_vp8.o +@@ -1006,6 +1006,7 @@ OBJS-$(CONFIG_VP8_VAAPI_HWACCEL) += vaapi_vp8.o OBJS-$(CONFIG_VP9_D3D11VA_HWACCEL) += dxva2_vp9.o OBJS-$(CONFIG_VP9_DXVA2_HWACCEL) += dxva2_vp9.o OBJS-$(CONFIG_VP9_NVDEC_HWACCEL) += nvdec_vp9.o +OBJS-$(CONFIG_VP9_V4L2REQUEST_HWACCEL) += v4l2_request_vp9.o OBJS-$(CONFIG_VP9_VAAPI_HWACCEL) += vaapi_vp9.o OBJS-$(CONFIG_VP9_VDPAU_HWACCEL) += vdpau_vp9.o - OBJS-$(CONFIG_VP8_QSV_HWACCEL) += qsvdec.o + OBJS-$(CONFIG_VP9_VIDEOTOOLBOX_HWACCEL) += videotoolbox_vp9.o diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h -index ffb9fa5087..fc5d0b0479 100644 +index e4e4abc060..53f4f61fc5 100644 --- a/libavcodec/hwaccels.h +++ b/libavcodec/hwaccels.h @@ -76,6 +76,7 @@ extern const AVHWAccel ff_vp9_d3d11va_hwaccel; @@ -3339,7 +3293,7 @@ index ffb9fa5087..fc5d0b0479 100644 +extern const AVHWAccel ff_vp9_v4l2request_hwaccel; extern const AVHWAccel ff_vp9_vaapi_hwaccel; extern const AVHWAccel ff_vp9_vdpau_hwaccel; - extern const AVHWAccel ff_wmv3_d3d11va_hwaccel; + extern const AVHWAccel ff_vp9_videotoolbox_hwaccel; diff --git a/libavcodec/v4l2_request_vp9.c b/libavcodec/v4l2_request_vp9.c new file mode 100644 index 0000000000..ec0300f66d @@ -3629,38 +3583,38 @@ index 0000000000..ec0300f66d + .caps_internal = HWACCEL_CAP_ASYNC_SAFE, +}; diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c -index 4659f94ee8..1b2f1eeaf6 100644 +index fee79fb45b..bd7dc4f53a 100644 --- a/libavcodec/vp9.c +++ b/libavcodec/vp9.c -@@ -191,6 +191,7 @@ static int update_size(AVCodecContext *avctx, int w, int h) +@@ -184,6 +184,7 @@ static int update_size(AVCodecContext *avctx, int w, int h) #define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + \ CONFIG_VP9_D3D11VA_HWACCEL * 2 + \ CONFIG_VP9_NVDEC_HWACCEL + \ + CONFIG_VP9_V4L2REQUEST_HWACCEL + \ CONFIG_VP9_VAAPI_HWACCEL + \ - CONFIG_VP9_VDPAU_HWACCEL) - enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts; -@@ -223,6 
+224,9 @@ static int update_size(AVCodecContext *avctx, int w, int h) + CONFIG_VP9_VDPAU_HWACCEL + \ + CONFIG_VP9_VIDEOTOOLBOX_HWACCEL) +@@ -212,6 +213,9 @@ static int update_size(AVCodecContext *avctx, int w, int h) + #if CONFIG_VP9_NVDEC_HWACCEL + *fmtp++ = AV_PIX_FMT_CUDA; #endif - #if CONFIG_VP9_VDPAU_HWACCEL - *fmtp++ = AV_PIX_FMT_VDPAU; -+#endif +#if CONFIG_VP9_V4L2REQUEST_HWACCEL + *fmtp++ = AV_PIX_FMT_DRM_PRIME; - #endif - break; - case AV_PIX_FMT_YUV420P12: -@@ -234,6 +238,9 @@ static int update_size(AVCodecContext *avctx, int w, int h) - #endif - #if CONFIG_VP9_VDPAU_HWACCEL - *fmtp++ = AV_PIX_FMT_VDPAU; +#endif + #if CONFIG_VP9_VAAPI_HWACCEL + *fmtp++ = AV_PIX_FMT_VAAPI; + #endif +@@ -226,6 +230,9 @@ static int update_size(AVCodecContext *avctx, int w, int h) + #if CONFIG_VP9_NVDEC_HWACCEL + *fmtp++ = AV_PIX_FMT_CUDA; + #endif +#if CONFIG_VP9_V4L2REQUEST_HWACCEL + *fmtp++ = AV_PIX_FMT_DRM_PRIME; ++#endif + #if CONFIG_VP9_VAAPI_HWACCEL + *fmtp++ = AV_PIX_FMT_VAAPI; #endif - break; - } -@@ -382,7 +389,7 @@ static av_always_inline int inv_recenter_nonneg(int v, int m) +@@ -379,7 +386,7 @@ static av_always_inline int inv_recenter_nonneg(int v, int m) } // differential forward probability updates @@ -3669,7 +3623,7 @@ index 4659f94ee8..1b2f1eeaf6 100644 { static const uint8_t inv_map_table[255] = { 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176, -@@ -436,8 +443,13 @@ static int update_prob(VP56RangeCoder *c, int p) +@@ -433,8 +440,13 @@ static int update_prob(VP56RangeCoder *c, int p) av_assert2(d < FF_ARRAY_ELEMS(inv_map_table)); } @@ -3685,7 +3639,7 @@ index 4659f94ee8..1b2f1eeaf6 100644 } static int read_colorspace_details(AVCodecContext *avctx) -@@ -703,7 +715,8 @@ static int decode_frame_header(AVCodecContext *avctx, +@@ -700,7 +712,8 @@ static int decode_frame_header(AVCodecContext *avctx, get_bits(&s->gb, 8) : 255; } @@ -3695,7 +3649,7 @@ index 4659f94ee8..1b2f1eeaf6 100644 s->s.h.segmentation.absolute_vals = get_bits1(&s->gb); for (i = 0; 
i < 8; i++) { if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb))) -@@ -904,6 +917,8 @@ static int decode_frame_header(AVCodecContext *avctx, +@@ -900,6 +913,8 @@ static int decode_frame_header(AVCodecContext *avctx, * as explicit copies if the fw update is missing (and skip the copy upon * fw update)? */ s->prob.p = s->prob_ctx[c].p; @@ -3704,7 +3658,7 @@ index 4659f94ee8..1b2f1eeaf6 100644 // txfm updates if (s->s.h.lossless) { -@@ -915,18 +930,25 @@ static int decode_frame_header(AVCodecContext *avctx, +@@ -911,18 +926,25 @@ static int decode_frame_header(AVCodecContext *avctx, if (s->s.h.txfmmode == TX_SWITCHABLE) { for (i = 0; i < 2; i++) @@ -3738,7 +3692,7 @@ index 4659f94ee8..1b2f1eeaf6 100644 } } -@@ -938,15 +960,18 @@ static int decode_frame_header(AVCodecContext *avctx, +@@ -934,15 +956,18 @@ static int decode_frame_header(AVCodecContext *avctx, for (k = 0; k < 2; k++) for (l = 0; l < 6; l++) for (m = 0; m < 6; m++) { @@ -3760,7 +3714,7 @@ index 4659f94ee8..1b2f1eeaf6 100644 } memcpy(&p[3], ff_vp9_model_pareto8[p[2]], 8); } -@@ -961,7 +986,7 @@ static int decode_frame_header(AVCodecContext *avctx, +@@ -957,7 +982,7 @@ static int decode_frame_header(AVCodecContext *avctx, break; memcpy(p, r, 3); memcpy(&p[3], ff_vp9_model_pareto8[p[2]], 8); @@ -3769,7 +3723,7 @@ index 4659f94ee8..1b2f1eeaf6 100644 } if (s->s.h.txfmmode == i) break; -@@ -969,25 +994,37 @@ static int decode_frame_header(AVCodecContext *avctx, +@@ -965,25 +990,37 @@ static int decode_frame_header(AVCodecContext *avctx, // mode updates for (i = 0; i < 3; i++) @@ -3815,7 +3769,7 @@ index 4659f94ee8..1b2f1eeaf6 100644 if (s->s.h.allowcompinter) { s->s.h.comppredmode = vp8_rac_get(&s->c); -@@ -995,92 +1032,134 @@ static int decode_frame_header(AVCodecContext *avctx, +@@ -991,92 +1028,134 @@ static int decode_frame_header(AVCodecContext *avctx, s->s.h.comppredmode += vp8_rac_get(&s->c); if (s->s.h.comppredmode == PRED_SWITCHABLE) for (i = 0; i < 5; i++) @@ -3981,21 +3935,21 @@ index 
4659f94ee8..1b2f1eeaf6 100644 } } } -@@ -1912,6 +1991,9 @@ AVCodec ff_vp9_decoder = { - #endif +@@ -1902,6 +1981,9 @@ const FFCodec ff_vp9_decoder = { #if CONFIG_VP9_VDPAU_HWACCEL HWACCEL_VDPAU(vp9), -+#endif + #endif +#if CONFIG_VP9_V4L2REQUEST_HWACCEL + HWACCEL_V4L2REQUEST(vp9), ++#endif + #if CONFIG_VP9_VIDEOTOOLBOX_HWACCEL + HWACCEL_VIDEOTOOLBOX(vp9), #endif - NULL - }, diff --git a/libavcodec/vp9dec.h b/libavcodec/vp9dec.h -index d82b258a3d..8d2c341e0b 100644 +index 9cbd5839a8..74b854691e 100644 --- a/libavcodec/vp9dec.h +++ b/libavcodec/vp9dec.h -@@ -131,6 +131,10 @@ typedef struct VP9Context { +@@ -132,6 +132,10 @@ typedef struct VP9Context { ProbContext p; uint8_t coef[4][2][2][6][6][11]; } prob; @@ -4007,7 +3961,7 @@ index d82b258a3d..8d2c341e0b 100644 // contextual (above) cache uint8_t *above_partition_ctx; diff --git a/libavcodec/vp9shared.h b/libavcodec/vp9shared.h -index 54726df742..fee3568736 100644 +index ebaa11d2c1..3469a7312d 100644 --- a/libavcodec/vp9shared.h +++ b/libavcodec/vp9shared.h @@ -131,6 +131,7 @@ typedef struct VP9BitstreamHeader { @@ -4019,10 +3973,10 @@ index 54726df742..fee3568736 100644 uint8_t pred_prob[3]; struct { -From 0a7d6808383a5c88a690b7ba6c634970dcd33548 Mon Sep 17 00:00:00 2001 +From 3da495ee6fe1d8cf4bebde0a089bcd14e7a7785e Mon Sep 17 00:00:00 2001 From: Jonas Karlman Date: Mon, 29 Apr 2019 22:08:59 +0000 -Subject: [PATCH 10/12] HACK: hwcontext_drm: do not require drm device +Subject: [PATCH 09/13] HACK: hwcontext_drm: do not require drm device Signed-off-by: Jonas Karlman --- @@ -4046,10 +4000,10 @@ index 7a9fdbd263..6297d1f9b6 100644 if (hwctx->fd < 0) return AVERROR(errno); -From d82c704c262fea62250c5989d8e97cdd769d8359 Mon Sep 17 00:00:00 2001 +From 735160c215246efb0b99014978cac558fb775488 Mon Sep 17 00:00:00 2001 From: Jonas Karlman Date: Fri, 15 May 2020 16:54:05 +0000 -Subject: [PATCH 11/12] WIP: add NV15 and NV20 support +Subject: [PATCH 10/13] WIP: add NV15 and NV20 support Signed-off-by: Jonas Karlman --- @@ 
-4058,10 +4012,10 @@ Signed-off-by: Jonas Karlman 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c -index 6b7f569da4..ee4c76cf41 100644 +index 335dc2cac1..23807756c5 100644 --- a/libavcodec/h264_slice.c +++ b/libavcodec/h264_slice.c -@@ -794,10 +794,17 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) +@@ -822,10 +822,17 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) *fmt++ = AV_PIX_FMT_GBRP10; } else *fmt++ = AV_PIX_FMT_YUV444P10; @@ -4081,7 +4035,7 @@ index 6b7f569da4..ee4c76cf41 100644 break; case 12: if (CHROMA444(h)) { -@@ -836,6 +843,9 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) +@@ -868,6 +875,9 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) else *fmt++ = AV_PIX_FMT_YUV444P; } else if (CHROMA422(h)) { @@ -4092,10 +4046,10 @@ index 6b7f569da4..ee4c76cf41 100644 *fmt++ = AV_PIX_FMT_YUVJ422P; else diff --git a/libavcodec/v4l2_request.c b/libavcodec/v4l2_request.c -index b57bbf29bc..349ed67cb2 100644 +index e7faf100f0..c77d3a8cb1 100644 --- a/libavcodec/v4l2_request.c +++ b/libavcodec/v4l2_request.c -@@ -188,6 +188,13 @@ const uint32_t v4l2_request_capture_pixelformats[] = { +@@ -186,6 +186,13 @@ const uint32_t v4l2_request_capture_pixelformats[] = { #ifdef DRM_FORMAT_MOD_ALLWINNER_TILED V4L2_PIX_FMT_SUNXI_TILED_NV12, #endif @@ -4109,7 +4063,7 @@ index b57bbf29bc..349ed67cb2 100644 }; static int v4l2_request_set_drm_descriptor(V4L2RequestDescriptor *req, struct v4l2_format *format) -@@ -206,6 +213,22 @@ static int v4l2_request_set_drm_descriptor(V4L2RequestDescriptor *req, struct v4 +@@ -204,6 +211,22 @@ static int v4l2_request_set_drm_descriptor(V4L2RequestDescriptor *req, struct v4 layer->format = DRM_FORMAT_NV12; desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED; break; @@ -4133,17 +4087,17 @@ index b57bbf29bc..349ed67cb2 100644 default: 
return -1; -From 0b8eff40d910669ea03417f76468fcf2518d293e Mon Sep 17 00:00:00 2001 +From 94dd83848ec99174057dd9c9747dfeff0a416a3a Mon Sep 17 00:00:00 2001 From: Jonas Karlman Date: Mon, 27 Jul 2020 23:15:45 +0000 -Subject: [PATCH 12/12] HACK: define drm NV15 and NV20 format +Subject: [PATCH 11/13] HACK: define drm NV15 and NV20 format --- libavcodec/v4l2_request.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libavcodec/v4l2_request.c b/libavcodec/v4l2_request.c -index 349ed67cb2..824dcaa8e9 100644 +index c77d3a8cb1..19c41f2b3f 100644 --- a/libavcodec/v4l2_request.c +++ b/libavcodec/v4l2_request.c @@ -30,6 +30,14 @@ @@ -4161,3 +4115,161 @@ index 349ed67cb2..824dcaa8e9 100644 uint64_t ff_v4l2_request_get_capture_timestamp(AVFrame *frame) { V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; + +From bbedc169415db75791a60082ec80942e891ad5f3 Mon Sep 17 00:00:00 2001 +From: Alex Bee +Date: Sat, 22 Oct 2022 22:23:22 +0200 +Subject: [PATCH 12/13] HACK: Revert "lavc/pthread_frame: always transfer + stashed hwaccel state" + +This reverts commit 96c78e50a66a3b443eb2f237e2554ab84b8a12ce. 
+--- + libavcodec/pthread_frame.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c +index 43d6cc8ff4..80c15b35be 100644 +--- a/libavcodec/pthread_frame.c ++++ b/libavcodec/pthread_frame.c +@@ -458,13 +458,13 @@ static int submit_packet(PerThreadContext *p, AVCodecContext *user_avctx, + pthread_mutex_unlock(&p->mutex); + return err; + } +- } + +- /* transfer the stashed hwaccel state, if any */ +- av_assert0(!p->avctx->hwaccel); +- FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel); +- FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context); +- FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); ++ /* transfer hwaccel state stashed from previous thread, if any */ ++ av_assert0(!p->avctx->hwaccel); ++ FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel); ++ FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context); ++ FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); ++ } + + av_packet_unref(p->avpkt); + ret = av_packet_ref(p->avpkt, avpkt); + +From 1d0bcfaf182c8b51b001485bc7e7e6db0ceabc4e Mon Sep 17 00:00:00 2001 +From: Alex Bee +Date: Sat, 22 Oct 2022 22:24:07 +0200 +Subject: [PATCH 13/13] HACK: Revert "lavc/pthread_frame: avoid leaving stale + hwaccel state in worker threads" + +This reverts commit 35aa7e70e7ec350319e7634a30d8d8aa1e6ecdda. +--- + libavcodec/pthread_frame.c | 47 ++++++++++---------------------------- + 1 file changed, 12 insertions(+), 35 deletions(-) + +diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c +index 80c15b35be..8faea75a49 100644 +--- a/libavcodec/pthread_frame.c ++++ b/libavcodec/pthread_frame.c +@@ -147,12 +147,6 @@ typedef struct FrameThreadContext { + * Set for the first N packets, where N is the number of threads. + * While it is set, ff_thread_en/decode_frame won't return any results. 
+ */ +- +- /* hwaccel state is temporarily stored here in order to transfer its ownership +- * to the next decoding thread without the need for extra synchronization */ +- const AVHWAccel *stash_hwaccel; +- void *stash_hwaccel_context; +- void *stash_hwaccel_priv; + } FrameThreadContext; + + #if FF_API_THREAD_SAFE_CALLBACKS +@@ -233,17 +227,9 @@ FF_ENABLE_DEPRECATION_WARNINGS + ff_thread_finish_setup(avctx); + + if (p->hwaccel_serializing) { +- /* wipe hwaccel state to avoid stale pointers lying around; +- * the state was transferred to FrameThreadContext in +- * ff_thread_finish_setup(), so nothing is leaked */ +- avctx->hwaccel = NULL; +- avctx->hwaccel_context = NULL; +- avctx->internal->hwaccel_priv_data = NULL; +- + p->hwaccel_serializing = 0; + pthread_mutex_unlock(&p->parent->hwaccel_mutex); + } +- av_assert0(!avctx->hwaccel); + + if (p->async_serializing) { + p->async_serializing = 0; +@@ -307,6 +293,9 @@ static int update_context_from_thread(AVCodecContext *dst, AVCodecContext *src, + dst->color_range = src->color_range; + dst->chroma_sample_location = src->chroma_sample_location; + ++ dst->hwaccel = src->hwaccel; ++ dst->hwaccel_context = src->hwaccel_context; ++ + dst->sample_rate = src->sample_rate; + dst->sample_fmt = src->sample_fmt; + #if FF_API_OLD_CHANNEL_LAYOUT +@@ -319,6 +308,8 @@ FF_ENABLE_DEPRECATION_WARNINGS + if (err < 0) + return err; + ++ dst->internal->hwaccel_priv_data = src->internal->hwaccel_priv_data; ++ + if (!!dst->hw_frames_ctx != !!src->hw_frames_ctx || + (dst->hw_frames_ctx && dst->hw_frames_ctx->data != src->hw_frames_ctx->data)) { + av_buffer_unref(&dst->hw_frames_ctx); +@@ -458,12 +449,6 @@ static int submit_packet(PerThreadContext *p, AVCodecContext *user_avctx, + pthread_mutex_unlock(&p->mutex); + return err; + } +- +- /* transfer hwaccel state stashed from previous thread, if any */ +- av_assert0(!p->avctx->hwaccel); +- FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel); +- FFSWAP(void*, 
p->avctx->hwaccel_context, fctx->stash_hwaccel_context); +- FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); + } + + av_packet_unref(p->avpkt); +@@ -669,14 +654,6 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { + async_lock(p->parent); + } + +- /* save hwaccel state for passing to the next thread; +- * this is done here so that this worker thread can wipe its own hwaccel +- * state after decoding, without requiring synchronization */ +- av_assert0(!p->parent->stash_hwaccel); +- p->parent->stash_hwaccel = avctx->hwaccel; +- p->parent->stash_hwaccel_context = avctx->hwaccel_context; +- p->parent->stash_hwaccel_priv = avctx->internal->hwaccel_priv_data; +- + pthread_mutex_lock(&p->progress_mutex); + if(atomic_load(&p->state) == STATE_SETUP_FINISHED){ + av_log(avctx, AV_LOG_WARNING, "Multiple ff_thread_finish_setup() calls\n"); +@@ -730,6 +707,13 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count) + + park_frame_worker_threads(fctx, thread_count); + ++ if (fctx->prev_thread && avctx->internal->hwaccel_priv_data != ++ fctx->prev_thread->avctx->internal->hwaccel_priv_data) { ++ if (update_context_from_thread(avctx, fctx->prev_thread->avctx, 1) < 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to update user thread.\n"); ++ } ++ } ++ + for (i = 0; i < thread_count; i++) { + PerThreadContext *p = &fctx->threads[i]; + AVCodecContext *ctx = p->avctx; +@@ -776,13 +760,6 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count) + av_freep(&fctx->threads); + ff_pthread_free(fctx, thread_ctx_offsets); + +- /* if we have stashed hwaccel state, move it to the user-facing context, +- * so it will be freed in avcodec_close() */ +- av_assert0(!avctx->hwaccel); +- FFSWAP(const AVHWAccel*, avctx->hwaccel, fctx->stash_hwaccel); +- FFSWAP(void*, avctx->hwaccel_context, fctx->stash_hwaccel_context); +- FFSWAP(void*, avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); +- + av_freep(&avctx->internal->thread_ctx); + 
} + diff --git a/projects/Allwinner/patches/ffmpeg/0003-v4l2_request-revert-changes.patch b/projects/Allwinner/patches/ffmpeg/0003-v4l2_request-revert-changes.patch deleted file mode 100644 index 80917a2949..0000000000 --- a/projects/Allwinner/patches/ffmpeg/0003-v4l2_request-revert-changes.patch +++ /dev/null @@ -1,80 +0,0 @@ -From 0770ca82391fb60084080b2fd235cc039a3b8314 Mon Sep 17 00:00:00 2001 -From: Jernej Skrabec -Date: Sun, 25 Apr 2021 10:40:56 +0000 -Subject: [PATCH] v4l2_request: revert changes - ---- - libavcodec/v4l2_request.c | 10 +++------- - libavcodec/v4l2_request_h264.c | 8 ++++---- - 2 files changed, 7 insertions(+), 11 deletions(-) - -diff --git a/libavcodec/v4l2_request.c b/libavcodec/v4l2_request.c -index 5234b5049b0d..1e513ee5df8d 100644 ---- a/libavcodec/v4l2_request.c -+++ b/libavcodec/v4l2_request.c -@@ -142,14 +142,12 @@ static int v4l2_request_queue_buffer(V4L2RequestContext *ctx, int request_fd, V4 - .type = buf->buffer.type, - .memory = buf->buffer.memory, - .index = buf->index, -- .timestamp.tv_usec = ctx->timestamp, -+ .timestamp.tv_usec = buf->index + 1, - .bytesused = buf->used, - .request_fd = request_fd, - .flags = ((request_fd >= 0) ? 
V4L2_BUF_FLAG_REQUEST_FD : 0) | flags, - }; - -- buf->buffer.timestamp = buffer.timestamp; -- - if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) { - planes[0].bytesused = buf->used; - buffer.bytesused = 0; -@@ -239,9 +237,6 @@ static int v4l2_request_queue_decode(AVCodecContext *avctx, AVFrame *frame, stru - - av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p used=%u controls=%d index=%d fd=%d request_fd=%d first_slice=%d last_slice=%d\n", __func__, avctx, req->output.used, count, req->capture.index, req->capture.fd, req->request_fd, first_slice, last_slice); - -- if (first_slice) -- ctx->timestamp++; -- - ret = v4l2_request_set_controls(ctx, req->request_fd, control, count); - if (ret < 0) { - av_log(avctx, AV_LOG_ERROR, "%s: set controls failed for request %d, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); -@@ -693,7 +688,6 @@ int ff_v4l2_request_init(AVCodecContext *avctx, uint32_t pixelformat, uint32_t b - - ctx->media_fd = -1; - ctx->video_fd = -1; -- ctx->timestamp = 0; - - udev = udev_new(); - if (!udev) { -@@ -827,6 +821,8 @@ static int v4l2_request_buffer_alloc(AVCodecContext *avctx, V4L2RequestBuffer *b - return ret; - } - -+ buf->buffer.timestamp.tv_usec = buf->index + 1; -+ - if (V4L2_TYPE_IS_OUTPUT(type)) { - void *addr = mmap(NULL, buf->size, PROT_READ | PROT_WRITE, MAP_SHARED, ctx->video_fd, V4L2_TYPE_IS_MULTIPLANAR(type) ? 
buf->buffer.m.planes[0].m.mem_offset : buf->buffer.m.offset); - if (addr == MAP_FAILED) { -diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c -index 88da8f0a2db0..a14028336a39 100644 ---- a/libavcodec/v4l2_request_h264.c -+++ b/libavcodec/v4l2_request_h264.c -@@ -252,7 +252,7 @@ static int v4l2_request_h264_start_frame(AVCodecContext *avctx, - - fill_dpb(&controls->decode_params, h); - -- controls->first_slice = !FIELD_PICTURE(h) || h->first_field; -+ controls->first_slice = 1; - controls->num_slices = 0; - - return ff_v4l2_request_reset_frame(avctx, h->cur_pic_ptr->f); -@@ -383,8 +383,7 @@ static int v4l2_request_h264_decode_slice(AVCodecContext *avctx, const uint8_t * - - static int v4l2_request_h264_end_frame(AVCodecContext *avctx) - { -- const H264Context *h = avctx->priv_data; -- return v4l2_request_h264_queue_decode(avctx, !FIELD_PICTURE(h) || !h->first_field); -+ return v4l2_request_h264_queue_decode(avctx, 1); - } - - static int v4l2_request_h264_set_controls(AVCodecContext *avctx) From 4ed2e527fe688393bfbb180714bced35c6492a97 Mon Sep 17 00:00:00 2001 From: Alex Bee Date: Sat, 5 Nov 2022 18:19:41 +0100 Subject: [PATCH 06/12] ffmpeg: update v4l2-drmprime patch Patch created using revisions eacfcba..38c65aa from branch v4l2-drmprime-n5.1.2 of https://github.com/jernejsk/FFmpeg --- .../ffmpeg-001-v4l2-drmprime.patch | 142 +++++++++--------- 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/v4l2-drmprime/ffmpeg-001-v4l2-drmprime.patch b/packages/multimedia/ffmpeg/patches/v4l2-drmprime/ffmpeg-001-v4l2-drmprime.patch index 403b073b68..e3cc547282 100644 --- a/packages/multimedia/ffmpeg/patches/v4l2-drmprime/ffmpeg-001-v4l2-drmprime.patch +++ b/packages/multimedia/ffmpeg/patches/v4l2-drmprime/ffmpeg-001-v4l2-drmprime.patch @@ -1,4 +1,4 @@ -From 1e73397d52d69378ebac5a390da508cd16d7d97e Mon Sep 17 00:00:00 2001 +From c4be609d822229be09cd9dd6f64cad716b0a48ce Mon Sep 17 00:00:00 2001 From: Lukas 
Rusak Date: Tue, 24 Apr 2018 23:00:23 -0700 Subject: [PATCH 1/9] libavcodec: v4l2m2m: output AVDRMFrameDescriptor @@ -42,7 +42,7 @@ V5: 5 files changed, 213 insertions(+), 12 deletions(-) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 4b2679eb38..cbd3e5680d 100644 +index 3f5471067a..07662b5fc3 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -21,6 +21,7 @@ @@ -53,15 +53,15 @@ index 4b2679eb38..cbd3e5680d 100644 #include #include #include -@@ -30,6 +31,7 @@ +@@ -29,6 +30,7 @@ + #include #include "libavcodec/avcodec.h" - #include "libavcodec/internal.h" #include "libavutil/pixdesc.h" +#include "libavutil/hwcontext.h" #include "v4l2_context.h" #include "v4l2_buffers.h" #include "v4l2_m2m.h" -@@ -210,7 +212,79 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) +@@ -209,7 +211,79 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) return AVCOL_TRC_UNSPECIFIED; } @@ -142,7 +142,7 @@ index 4b2679eb38..cbd3e5680d 100644 { V4L2Buffer* avbuf = opaque; V4L2m2mContext *s = buf_to_m2mctx(avbuf); -@@ -234,6 +308,36 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused) +@@ -233,6 +307,36 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused) } } @@ -179,7 +179,7 @@ index 4b2679eb38..cbd3e5680d 100644 static int v4l2_buf_increase_ref(V4L2Buffer *in) { V4L2m2mContext *s = buf_to_m2mctx(in); -@@ -254,6 +358,24 @@ static int v4l2_buf_increase_ref(V4L2Buffer *in) +@@ -253,6 +357,24 @@ static int v4l2_buf_increase_ref(V4L2Buffer *in) return 0; } @@ -204,7 +204,7 @@ index 4b2679eb38..cbd3e5680d 100644 static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) { int ret; -@@ -303,13 +425,24 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) +@@ -302,13 +424,24 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) frame->format = avbuf->context->av_pix_fmt; @@ -233,7 +233,7 @@ index 
4b2679eb38..cbd3e5680d 100644 } /* fixup special cases */ -@@ -543,9 +676,6 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) +@@ -542,9 +675,6 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) avbuf->status = V4L2BUF_AVAILABLE; @@ -243,7 +243,7 @@ index 4b2679eb38..cbd3e5680d 100644 if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->buf.m.planes = avbuf->planes; avbuf->buf.length = avbuf->num_planes; -@@ -555,6 +685,15 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) +@@ -554,6 +684,15 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) avbuf->buf.length = avbuf->planes[0].length; } @@ -260,18 +260,18 @@ index 4b2679eb38..cbd3e5680d 100644 } diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h -index 8dbc7fc104..037e667997 100644 +index 3d2ff1b9a5..b82c990dcc 100644 --- a/libavcodec/v4l2_buffers.h +++ b/libavcodec/v4l2_buffers.h -@@ -27,6 +27,7 @@ - #include - #include +@@ -30,6 +30,7 @@ + #include "libavutil/buffer.h" + #include "libavutil/frame.h" +#include "libavutil/hwcontext_drm.h" - #include "avcodec.h" + #include "packet.h" enum V4L2Buffer_status { -@@ -42,6 +43,9 @@ typedef struct V4L2Buffer { +@@ -45,6 +46,9 @@ typedef struct V4L2Buffer { /* each buffer needs to have a reference to its context */ struct V4L2Context *context; @@ -282,10 +282,10 @@ index 8dbc7fc104..037e667997 100644 * of how many context-refs we are holding. 
*/ AVBufferRef *context_ref; diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index ff1ea8e57b..e9e8c27a54 100644 +index e891649f92..4de23e687c 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c -@@ -455,22 +455,54 @@ static int v4l2_release_buffers(V4L2Context* ctx) +@@ -441,22 +441,54 @@ static int v4l2_release_buffers(V4L2Context* ctx) struct v4l2_requestbuffers req = { .memory = V4L2_MEMORY_MMAP, .type = ctx->type, @@ -359,7 +359,7 @@ index b67b216331..0fbd19a013 100644 typedef struct V4L2m2mPriv { diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index ab07c0a24a..6bc7442702 100644 +index 8a51dec3fa..d916a3b726 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -23,6 +23,9 @@ @@ -373,8 +373,8 @@ index ab07c0a24a..6bc7442702 100644 #include "libavutil/pixdesc.h" #include "libavutil/opt.h" @@ -30,6 +33,9 @@ + #include "codec_internal.h" #include "libavcodec/decode.h" - #include "libavcodec/internal.h" +#include "libavcodec/hwaccels.h" +#include "libavcodec/internal.h" @@ -382,7 +382,7 @@ index ab07c0a24a..6bc7442702 100644 #include "v4l2_context.h" #include "v4l2_m2m.h" #include "v4l2_fmt.h" -@@ -201,6 +207,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -205,6 +211,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; capture->av_pix_fmt = avctx->pix_fmt; @@ -398,7 +398,7 @@ index ab07c0a24a..6bc7442702 100644 s->avctx = avctx; ret = ff_v4l2_m2m_codec_init(priv); if (ret) { -@@ -226,6 +241,11 @@ static const AVOption options[] = { +@@ -230,6 +245,11 @@ static const AVOption options[] = { { NULL}, }; @@ -410,18 +410,18 @@ index ab07c0a24a..6bc7442702 100644 #define M2MDEC_CLASS(NAME) \ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ .class_name = #NAME "_v4l2m2m_decoder", \ -@@ -249,6 +269,9 @@ static const AVOption options[] = { +@@ -253,6 +273,9 @@ static const AVOption options[] = { .bsfs = 
bsf_name, \ - .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ + .p.capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ -+ .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ ++ .p.pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ + AV_PIX_FMT_NONE}, \ + .hw_configs = v4l2_m2m_hw_configs, \ - .wrapper_name = "v4l2m2m", \ + .p.wrapper_name = "v4l2m2m", \ } -From deb0ba531401f069dc6e4dcf235dfc08bca6577c Mon Sep 17 00:00:00 2001 +From bb47b0e2ce4dda5ddeb4568d2b0260ec950d68cd Mon Sep 17 00:00:00 2001 From: Lukas Rusak Date: Thu, 16 Aug 2018 21:09:40 -0700 Subject: [PATCH 2/9] libavcodec: v4l2m2m: depends on libdrm @@ -432,10 +432,10 @@ Subject: [PATCH 2/9] libavcodec: v4l2m2m: depends on libdrm 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/configure b/configure -index 4ba72bf84b..efb065905c 100755 +index ba5793b2ff..1872046e0e 100755 --- a/configure +++ b/configure -@@ -3438,6 +3438,7 @@ sndio_indev_deps="sndio" +@@ -3539,6 +3539,7 @@ sndio_indev_deps="sndio" sndio_outdev_deps="sndio" v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h" v4l2_indev_suggest="libv4l2" @@ -444,7 +444,7 @@ index 4ba72bf84b..efb065905c 100755 v4l2_outdev_suggest="libv4l2" vfwcap_indev_deps="vfw32 vfwcap_defines" diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index cbd3e5680d..bebe2c1796 100644 +index 07662b5fc3..d41558527c 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -21,7 +21,7 @@ @@ -457,7 +457,7 @@ index cbd3e5680d..bebe2c1796 100644 #include #include -From f89fad11f53110cd6968c83e89bafb0c449f34ec Mon Sep 17 00:00:00 2001 +From a362c4ec155e5e38c7a1ed3d3c57054ba09d9945 Mon Sep 17 00:00:00 2001 From: Lukas Rusak Date: Thu, 16 Aug 2018 21:10:13 -0700 Subject: [PATCH 3/9] libavcodec: v4l2m2m: set format_modifier to @@ -468,10 +468,10 @@ Subject: 
[PATCH 3/9] libavcodec: v4l2m2m: set format_modifier to 1 file changed, 2 insertions(+) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index bebe2c1796..12037d5d66 100644 +index d41558527c..95c8a1e409 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c -@@ -328,10 +328,12 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) +@@ -327,10 +327,12 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) /* drm frame */ avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; avbuf->drm_frame.objects[i].fd = expbuf.fd; @@ -485,7 +485,7 @@ index bebe2c1796..12037d5d66 100644 } -From d5a37af1a8fe1ed70428e55286126d241986dd0c Mon Sep 17 00:00:00 2001 +From d4c8398a40baae35c79c1baf7e99306edda53e1a Mon Sep 17 00:00:00 2001 From: Lukas Rusak Date: Thu, 16 Aug 2018 21:10:53 -0700 Subject: [PATCH 4/9] libavcodec: v4l2m2m: only mmap the buffer when it is @@ -496,10 +496,10 @@ Subject: [PATCH 4/9] libavcodec: v4l2m2m: only mmap the buffer when it is 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 12037d5d66..1adf518ab9 100644 +index 95c8a1e409..0a65f32cb2 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c -@@ -662,14 +662,22 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) +@@ -661,14 +661,22 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; @@ -529,7 +529,7 @@ index 12037d5d66..1adf518ab9 100644 if (avbuf->plane_info[i].mm_addr == MAP_FAILED) -From d0be699166cdc413a5dc3e1c087433ac7cf142e7 Mon Sep 17 00:00:00 2001 +From 5c468fefad99503d2fe53e5a83ef29a409f0abaa Mon Sep 17 00:00:00 2001 From: Lukas Rusak Date: Thu, 16 Aug 2018 21:11:38 -0700 Subject: [PATCH 5/9] libavcodec: v4l2m2m: allow using software pixel formats @@ -539,10 +539,10 @@ Subject: [PATCH 5/9] libavcodec: v4l2m2m: allow using software pixel 
formats 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 6bc7442702..4b9baf833c 100644 +index d916a3b726..7787a2c185 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c -@@ -213,8 +213,16 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -217,8 +217,16 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) * - the DRM frame format is passed in the DRM frame descriptor layer. * check the v4l2_get_drm_frame function. */ @@ -560,16 +560,16 @@ index 6bc7442702..4b9baf833c 100644 s->avctx = avctx; ret = ff_v4l2_m2m_codec_init(priv); -@@ -270,6 +278,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { - .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ +@@ -274,6 +282,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { + .p.capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ - .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ + .p.pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ + AV_PIX_FMT_NV12, \ AV_PIX_FMT_NONE}, \ .hw_configs = v4l2_m2m_hw_configs, \ - .wrapper_name = "v4l2m2m", \ + .p.wrapper_name = "v4l2m2m", \ -From c4736742883eb2a1965ac65a5c75d5409e3c85a0 Mon Sep 17 00:00:00 2001 +From dd923e56e054df2f1bb275cef039b82e94fcbcb6 Mon Sep 17 00:00:00 2001 From: Lukas Rusak Date: Mon, 24 Sep 2018 13:39:31 -0700 Subject: [PATCH 6/9] libavcodec: v4l2m2m: implement hwcontext @@ -582,10 +582,10 @@ Subject: [PATCH 6/9] libavcodec: v4l2m2m: implement hwcontext 4 files changed, 37 insertions(+) diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 1adf518ab9..6e2a544394 100644 +index 0a65f32cb2..a040d418d9 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c -@@ -435,6 +435,7 @@ static int 
v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) +@@ -434,6 +434,7 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); frame->format = AV_PIX_FMT_DRM_PRIME; @@ -593,7 +593,7 @@ index 1adf518ab9..6e2a544394 100644 } else { /* 1. get references to the actual data */ for (i = 0; i < avbuf->num_planes; i++) { -@@ -635,6 +636,27 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) +@@ -634,6 +635,27 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) avbuf->buf.type = ctx->type; avbuf->buf.index = index; @@ -622,10 +622,10 @@ index 1adf518ab9..6e2a544394 100644 avbuf->buf.length = VIDEO_MAX_PLANES; avbuf->buf.m.planes = avbuf->planes; diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 22a9532444..e804e94131 100644 +index 6f7460c89a..02fbb6eec3 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h -@@ -92,6 +92,8 @@ typedef struct V4L2Context { +@@ -93,6 +93,8 @@ typedef struct V4L2Context { */ int done; @@ -648,7 +648,7 @@ index 0fbd19a013..adf5997bb5 100644 int output_drm; } V4L2m2mContext; diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 4b9baf833c..6c23693137 100644 +index 7787a2c185..9aca68e48c 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -35,6 +35,7 @@ @@ -659,7 +659,7 @@ index 4b9baf833c..6c23693137 100644 #include "v4l2_context.h" #include "v4l2_m2m.h" -@@ -224,6 +225,16 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -228,6 +229,16 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) break; } @@ -677,7 +677,7 @@ index 4b9baf833c..6c23693137 100644 ret = ff_v4l2_m2m_codec_init(priv); if (ret) { -From 0b15c77718900bf60c91217ed1492390022ad6db Mon Sep 17 00:00:00 2001 +From 32ca7163c62887aa4b4032bbd5858a03bf3e145e Mon Sep 17 00:00:00 2001 From: Lukas Rusak Date: Mon, 4 May 2020 13:01:29 -0700 Subject: [PATCH 7/9] libavcodec: v4l2m2m: 
allow lower minimum buffer values @@ -705,10 +705,10 @@ index adf5997bb5..1082b9dad2 100644 typedef struct V4L2m2mContext { char devname[PATH_MAX]; diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 6c23693137..e323c37052 100644 +index 9aca68e48c..7afc81180d 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c -@@ -256,7 +256,7 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) +@@ -260,7 +260,7 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) static const AVOption options[] = { V4L_M2M_DEFAULT_OPTS, { "num_capture_buffers", "Number of buffers in the capture context", @@ -718,7 +718,7 @@ index 6c23693137..e323c37052 100644 }; -From acc86933e5fe3a13aae44cf84c48bab6c717e49b Mon Sep 17 00:00:00 2001 +From 199d87bed11be1665913b88d7cae9417f48afa37 Mon Sep 17 00:00:00 2001 From: Lukas Rusak Date: Wed, 6 May 2020 11:12:58 -0700 Subject: [PATCH 8/9] libavcodec: v4l2m2m: add option to specify pixel format @@ -731,10 +731,10 @@ Subject: [PATCH 8/9] libavcodec: v4l2m2m: add option to specify pixel format 3 files changed, 12 insertions(+) diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index e9e8c27a54..a97b70e836 100644 +index 4de23e687c..49ef4a684f 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c -@@ -531,6 +531,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm +@@ -517,6 +517,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) { @@ -743,7 +743,7 @@ index e9e8c27a54..a97b70e836 100644 enum AVPixelFormat pixfmt = ctx->av_pix_fmt; struct v4l2_fmtdesc fdesc; int ret; -@@ -549,6 +551,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) +@@ -535,6 +537,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) if (ret) return AVERROR(EINVAL); @@ -778,10 +778,10 @@ index 
1082b9dad2..943a8923c4 100644 /** diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index e323c37052..363e998142 100644 +index 7afc81180d..de89d0ff18 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c -@@ -257,6 +257,7 @@ static const AVOption options[] = { +@@ -261,6 +261,7 @@ static const AVOption options[] = { V4L_M2M_DEFAULT_OPTS, { "num_capture_buffers", "Number of buffers in the capture context", OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS }, @@ -790,7 +790,7 @@ index e323c37052..363e998142 100644 }; -From 909ca6380d9112cc0111266da02a4a8e1e5abc1e Mon Sep 17 00:00:00 2001 +From 38c65aab679c04e4dd5a58f889844e68e34c80ab Mon Sep 17 00:00:00 2001 From: Lukas Rusak Date: Mon, 24 Sep 2018 13:39:56 -0700 Subject: [PATCH 9/9] libavcodec: v4l2m2m: implement flush @@ -800,10 +800,10 @@ Subject: [PATCH 9/9] libavcodec: v4l2m2m: implement flush 1 file changed, 36 insertions(+) diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 363e998142..52ec67cb59 100644 +index de89d0ff18..3278627553 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c -@@ -250,6 +250,41 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) +@@ -254,6 +254,41 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) return ff_v4l2_m2m_codec_end(avctx->priv_data); } @@ -815,6 +815,11 @@ index 363e998142..52ec67cb59 100644 + V4L2Context* capture = &s->capture; + int ret, i; + ++ struct v4l2_decoder_cmd cmd = { ++ .cmd = V4L2_DEC_CMD_START, ++ .flags = 0, ++ }; ++ + ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); + if (ret < 0) + av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); @@ -828,11 +833,6 @@ index 363e998142..52ec67cb59 100644 + output->buffers[i].status = V4L2BUF_AVAILABLE; + } + -+ struct v4l2_decoder_cmd cmd = { -+ .cmd = V4L2_DEC_CMD_START, -+ .flags = 0, -+ }; -+ + ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); + if (ret < 0) 
+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); @@ -845,11 +845,11 @@ index 363e998142..52ec67cb59 100644 #define OFFSET(x) offsetof(V4L2m2mPriv, x) #define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM -@@ -286,6 +321,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { +@@ -290,6 +325,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { .init = v4l2_decode_init, \ - .receive_frame = v4l2_receive_frame, \ + FF_CODEC_RECEIVE_FRAME_CB(v4l2_receive_frame), \ .close = v4l2_decode_close, \ + .flush = v4l2_decode_flush, \ .bsfs = bsf_name, \ - .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ + .p.capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ From 3f5102deda3765b1a16268879885d827faa9a63b Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Tue, 8 Nov 2022 21:11:31 +0100 Subject: [PATCH 07/12] ffmpeg: update to 5.1.2 Signed-off-by: Matthias Reichl --- packages/multimedia/ffmpeg/package.mk | 15 ++----- .../ffmpeg/patches/ffmpeg-openssl3.patch | 44 ------------------- 2 files changed, 4 insertions(+), 55 deletions(-) delete mode 100644 packages/multimedia/ffmpeg/patches/ffmpeg-openssl3.patch diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index 3b354f5b46..2acc1b5782 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv) PKG_NAME="ffmpeg" -PKG_VERSION="4.4.1" -PKG_SHA256="eadbad9e9ab30b25f5520fbfde99fae4a92a1ae3c0257a8d68569a4651e30e02" +PKG_VERSION="5.1.2" +PKG_SHA256="619e706d662c8420859832ddc259cd4d4096a48a2ce1eefd052db9e440eef3dc" PKG_LICENSE="GPL-3.0-only" PKG_SITE="https://ffmpeg.org" PKG_URL="http://ffmpeg.org/releases/ffmpeg-${PKG_VERSION}.tar.xz" @@ -18,10 +18,9 @@ case 
"${PROJECT}" in PKG_FFMPEG_BRANCH="dev/4.4/rpi_import_1" PKG_SHA256="3b42cbffd15d95d59e402475fcdb1aaac9ae6a8404a521b95d1fe79c6b2baad4" PKG_URL="https://github.com/jc-kynesim/rpi-ffmpeg/archive/${PKG_VERSION}.tar.gz" - PKG_PATCH_DIRS="libreelec dav1d" ;; RPi) - PKG_FFMPEG_RPI="--disable-mmal --disable-rpi --enable-sand" + PKG_FFMPEG_RPI="--disable-mmal --enable-sand" PKG_PATCH_DIRS+=" rpi" ;; *) @@ -48,14 +47,8 @@ if [ "${V4L2_SUPPORT}" = "yes" ]; then PKG_NEED_UNPACK+=" $(get_pkg_directory libdrm)" PKG_FFMPEG_V4L2="--enable-v4l2_m2m --enable-libdrm" - if [ "${PROJECT}" = "Allwinner" -o "${PROJECT}" = "Rockchip" -o "${DEVICE}" = "iMX8" ]; then + if [ "${PROJECT}" = "Allwinner" -o "${PROJECT}" = "Rockchip" -o "${DEVICE}" = "iMX8" -o "${DEVICE}" = "RPi4" ]; then PKG_V4L2_REQUEST="yes" - elif [ "${PROJECT}" = "RPi" -a "${DEVICE}" = "RPi4" ]; then - PKG_V4L2_REQUEST="yes" - PKG_FFMPEG_HWACCEL="--disable-hwaccel=h264_v4l2request \ - --disable-hwaccel=mpeg2_v4l2request \ - --disable-hwaccel=vp8_v4l2request \ - --disable-hwaccel=vp9_v4l2request" else PKG_V4L2_REQUEST="no" fi diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-openssl3.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-openssl3.patch deleted file mode 100644 index 59e78ab369..0000000000 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-openssl3.patch +++ /dev/null @@ -1,44 +0,0 @@ -From 1d23e125b6f76e74b754560c3b6931507cacddce Mon Sep 17 00:00:00 2001 -From: Timo Rothenpieler -Date: Tue, 7 Sep 2021 19:35:31 +0200 -Subject: [PATCH] configure: account for openssl3 license change - ---- - configure | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/configure b/configure -index c87a010387..ed7345b2c1 100755 ---- a/configure -+++ b/configure -@@ -1765,7 +1765,6 @@ EXTERNAL_LIBRARY_GPL_LIST=" - EXTERNAL_LIBRARY_NONFREE_LIST=" - decklink - libfdk_aac -- openssl - libtls - " - -@@ -1857,6 +1856,7 @@ EXTERNAL_LIBRARY_LIST=" - mediacodec - openal - opengl -+ openssl - pocketsphinx - 
vapoursynth - " -@@ -6572,7 +6572,10 @@ enabled omx_rpi && { test_code cc OMX_Core.h OMX_IndexConfigBrcmVideoR - die "ERROR: OpenMAX IL headers from raspberrypi/firmware not found"; } && - enable omx - enabled omx && require_headers OMX_Core.h --enabled openssl && { check_pkg_config openssl openssl openssl/ssl.h OPENSSL_init_ssl || -+enabled openssl && { { check_pkg_config openssl "openssl >= 3.0.0" openssl/ssl.h OPENSSL_init_ssl && -+ { enabled gplv3 || ! enabled gpl || enabled nonfree || die "ERROR: OpenSSL >=3.0.0 requires --enable-version3"; }; } || -+ { enabled gpl && ! enabled nonfree && die "ERROR: OpenSSL <3.0.0 is incompatible with the gpl"; } || -+ check_pkg_config openssl openssl openssl/ssl.h OPENSSL_init_ssl || - check_pkg_config openssl openssl openssl/ssl.h SSL_library_init || - check_lib openssl openssl/ssl.h OPENSSL_init_ssl -lssl -lcrypto || - check_lib openssl openssl/ssl.h SSL_library_init -lssl -lcrypto || --- -2.34.1 - From 52ab9a72eef5c668addacf561db136c745d040e0 Mon Sep 17 00:00:00 2001 From: Alex Bee Date: Sat, 5 Nov 2022 18:25:43 +0100 Subject: [PATCH 08/12] tools: ffmpeg: gen-patches: add feature-set vf-deinterlace-v4l2m2m It will be used by Allwinner and Rockchip projects --- packages/multimedia/ffmpeg/package.mk | 4 + .../ffmpeg/0001-WIP-deint-filter.patch | 924 ------------------ ...deinterlace-dequeue-both-destination.patch | 230 ----- ...m-increase-input-and-output-buffers.patch} | 0 .../ffmpeg/ffmpeg-0002-WIP-deint-filter.patch | 924 ------------------ ...deinterlace-dequeue-both-destination.patch | 230 ----- ...deinterlace-support-more-formats-aut.patch | 288 ------ tools/ffmpeg/gen-patches.sh | 4 +- 8 files changed, 6 insertions(+), 2598 deletions(-) delete mode 100644 projects/Allwinner/patches/ffmpeg/0001-WIP-deint-filter.patch delete mode 100644 projects/Allwinner/patches/ffmpeg/0002-libavfilter-v4l2deinterlace-dequeue-both-destination.patch rename 
projects/Rockchip/patches/ffmpeg/{ffmpeg-0006-deint_v4l2m2m-increase-input-and-output-buffers.patch => ffmpeg-0001-deint_v4l2m2m-increase-input-and-output-buffers.patch} (100%) delete mode 100644 projects/Rockchip/patches/ffmpeg/ffmpeg-0002-WIP-deint-filter.patch delete mode 100644 projects/Rockchip/patches/ffmpeg/ffmpeg-0003-libavfilter-v4l2deinterlace-dequeue-both-destination.patch delete mode 100644 projects/Rockchip/patches/ffmpeg/ffmpeg-0006-libavfilter-v4l2deinterlace-support-more-formats-aut.patch diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index 2acc1b5782..38bdf869f5 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -25,6 +25,10 @@ case "${PROJECT}" in ;; *) PKG_PATCH_DIRS+=" v4l2-request v4l2-drmprime" + case "${PROJECT}" in + Allwinner|Rockchip) + PKG_PATCH_DIRS+=" vf-deinterlace-v4l2m2m" + esac ;; esac diff --git a/projects/Allwinner/patches/ffmpeg/0001-WIP-deint-filter.patch b/projects/Allwinner/patches/ffmpeg/0001-WIP-deint-filter.patch deleted file mode 100644 index 34ccc5b6e0..0000000000 --- a/projects/Allwinner/patches/ffmpeg/0001-WIP-deint-filter.patch +++ /dev/null @@ -1,924 +0,0 @@ -From 39069d9cc03a42cd497dd6b9756116ff4b684a5d Mon Sep 17 00:00:00 2001 -From: Jernej Skrabec -Date: Tue, 3 Dec 2019 21:01:18 +0100 -Subject: [PATCH] WIP deint filter - ---- - libavfilter/Makefile | 1 + - libavfilter/allfilters.c | 1 + - libavfilter/vf_deinterlace_v4l2m2m.c | 879 +++++++++++++++++++++++++++ - 3 files changed, 881 insertions(+) - create mode 100644 libavfilter/vf_deinterlace_v4l2m2m.c - -diff --git a/libavfilter/Makefile b/libavfilter/Makefile -index 512354065305..625fd29f9313 100644 ---- a/libavfilter/Makefile -+++ b/libavfilter/Makefile -@@ -218,6 +218,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) += vf_neighbor.o - OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o - OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_deinterlace_qsv.o - 
OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o -+OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o - OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o - OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o - OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o -diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c -index 1183e4026751..fe5a2e8c02e8 100644 ---- a/libavfilter/allfilters.c -+++ b/libavfilter/allfilters.c -@@ -204,6 +204,7 @@ extern AVFilter ff_vf_dedot; - extern AVFilter ff_vf_deflate; - extern AVFilter ff_vf_deflicker; - extern AVFilter ff_vf_deinterlace_qsv; -+extern AVFilter ff_vf_deinterlace_v4l2m2m; - extern AVFilter ff_vf_deinterlace_vaapi; - extern AVFilter ff_vf_dejudder; - extern AVFilter ff_vf_delogo; -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -new file mode 100644 -index 000000000000..1029e5b620fd ---- /dev/null -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -0,0 +1,879 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/** -+ * @file -+ * deinterlace video filter - V4L2 M2M -+ */ -+ -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "libavutil/avassert.h" -+#include "libavutil/avstring.h" -+#include "libavutil/common.h" -+#include "libavutil/hwcontext.h" -+#include "libavutil/hwcontext_drm.h" -+#include "libavutil/internal.h" -+#include "libavutil/mathematics.h" -+#include "libavutil/opt.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/time.h" -+ -+#include "avfilter.h" -+#include "formats.h" -+#include "internal.h" -+#include "video.h" -+ -+typedef struct V4L2Queue V4L2Queue; -+typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; -+ -+typedef struct V4L2PlaneInfo { -+ int bytesperline; -+ size_t length; -+} V4L2PlaneInfo; -+ -+typedef struct V4L2Buffer { -+ int enqueued; -+ int reenqueue; -+ int fd; -+ struct v4l2_buffer buffer; -+ struct v4l2_plane planes[VIDEO_MAX_PLANES]; -+ int num_planes; -+ V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES]; -+ AVDRMFrameDescriptor drm_frame; -+ V4L2Queue *q; -+} V4L2Buffer; -+ -+typedef struct V4L2Queue { -+ struct v4l2_format format; -+ int num_buffers; -+ V4L2Buffer *buffers; -+ DeintV4L2M2MContextShared *ctx; -+} V4L2Queue; -+ -+typedef struct DeintV4L2M2MContextShared { -+ int fd; -+ int done; -+ int width; -+ int height; -+ int orig_width; -+ int orig_height; -+ atomic_uint refcount; -+ -+ AVBufferRef *hw_frames_ctx; -+ -+ int frame_count; -+ AVFrame *frames[2]; -+ -+ V4L2Queue output; -+ V4L2Queue capture; -+} DeintV4L2M2MContextShared; -+ -+typedef struct DeintV4L2M2MContext { -+ const AVClass *class; -+ -+ DeintV4L2M2MContextShared *shared; -+} DeintV4L2M2MContext; -+ -+static int 
deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) -+{ -+ struct v4l2_capability cap; -+ int ret; -+ -+ memset(&cap, 0, sizeof(cap)); -+ ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap); -+ if (ret < 0) -+ return ret; -+ -+ if (!(cap.capabilities & V4L2_CAP_STREAMING)) -+ return AVERROR(EINVAL); -+ -+ if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { -+ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -+ -+ return 0; -+ } -+ -+ if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { -+ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; -+ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; -+ -+ return 0; -+ } -+ -+ return AVERROR(EINVAL); -+} -+ -+static int deint_v4l2m2m_try_format(V4L2Queue *queue) -+{ -+ struct v4l2_format *fmt = &queue->format; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ int ret, field; -+ -+ ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt); -+ if (ret) -+ av_log(NULL, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret); -+ -+ if (V4L2_TYPE_IS_OUTPUT(fmt->type)) -+ field = V4L2_FIELD_INTERLACED_TB; -+ else -+ field = V4L2_FIELD_NONE; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_NV12; -+ fmt->fmt.pix_mp.field = field; -+ fmt->fmt.pix_mp.width = ctx->width; -+ fmt->fmt.pix_mp.height = ctx->height; -+ } else { -+ fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_NV12; -+ fmt->fmt.pix.field = field; -+ fmt->fmt.pix.width = ctx->width; -+ fmt->fmt.pix.height = ctx->height; -+ } -+ -+ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt); -+ if (ret) -+ return AVERROR(EINVAL); -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12 || -+ fmt->fmt.pix_mp.field != field) { -+ av_log(NULL, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); -+ -+ return AVERROR(EINVAL); -+ } -+ } else { -+ if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12 || -+ fmt->fmt.pix.field != field) { -+ av_log(NULL, 
AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); -+ -+ return AVERROR(EINVAL); -+ } -+ } -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height) -+{ -+ struct v4l2_format *fmt = &queue->format; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ int ret; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.field = field; -+ fmt->fmt.pix_mp.width = width; -+ fmt->fmt.pix_mp.height = height; -+ /* TODO: bytesperline and imagesize */ -+ } else { -+ fmt->fmt.pix.field = field; -+ fmt->fmt.pix.width = width; -+ fmt->fmt.pix.height = height; -+ fmt->fmt.pix.sizeimage = 0; -+ fmt->fmt.pix.bytesperline = 0; -+ } -+ -+ ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); -+ if (ret) -+ av_log(NULL, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); -+ -+ return ret; -+} -+ -+static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) -+{ -+ int ret; -+ -+ ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0); -+ if (ctx->fd < 0) -+ return AVERROR(errno); -+ -+ ret = deint_v4l2m2m_prepare_context(ctx); -+ if (ret) -+ goto fail; -+ -+ ret = deint_v4l2m2m_try_format(&ctx->capture); -+ if (ret) -+ goto fail; -+ -+ ret = deint_v4l2m2m_try_format(&ctx->output); -+ if (ret) -+ goto fail; -+ -+ return 0; -+ -+fail: -+ close(ctx->fd); -+ ctx->fd = -1; -+ -+ return ret; -+} -+ -+static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx) -+{ -+ int ret = AVERROR(EINVAL); -+ struct dirent *entry; -+ char node[PATH_MAX]; -+ DIR *dirp; -+ -+ dirp = opendir("/dev"); -+ if (!dirp) -+ return AVERROR(errno); -+ -+ for (entry = readdir(dirp); entry; entry = readdir(dirp)) { -+ -+ if (strncmp(entry->d_name, "video", 5)) -+ continue; -+ -+ snprintf(node, sizeof(node), "/dev/%s", entry->d_name); -+ av_log(NULL, AV_LOG_DEBUG, "probing device %s\n", node); -+ ret = deint_v4l2m2m_probe_device(ctx, node); -+ if (!ret) -+ break; -+ } -+ -+ closedir(dirp); -+ -+ if (ret) { -+ av_log(NULL, AV_LOG_ERROR, 
"Could not find a valid device\n"); -+ ctx->fd = -1; -+ -+ return ret; -+ } -+ -+ av_log(NULL, AV_LOG_INFO, "Using device %s\n", node); -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) -+{ -+ int ret; -+ -+ ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ buf->enqueued = 1; -+ -+ return 0; -+} -+ -+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) -+{ -+ struct v4l2_exportbuffer expbuf; -+ int i, ret; -+ -+ for (i = 0; i < avbuf->num_planes; i++) { -+ memset(&expbuf, 0, sizeof(expbuf)); -+ -+ expbuf.index = avbuf->buffer.index; -+ expbuf.type = avbuf->buffer.type; -+ expbuf.plane = i; -+ -+ ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ avbuf->fd = expbuf.fd; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) { -+ /* drm frame */ -+ avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; -+ avbuf->drm_frame.objects[i].fd = expbuf.fd; -+ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ } else { -+ /* drm frame */ -+ avbuf->drm_frame.objects[0].size = avbuf->buffer.length; -+ avbuf->drm_frame.objects[0].fd = expbuf.fd; -+ avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ } -+ } -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) -+{ -+ struct v4l2_format *fmt = &queue->format; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ struct v4l2_requestbuffers req; -+ int ret, i, j, multiplanar; -+ uint32_t memory; -+ -+ memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
-+ V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; -+ -+ multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type); -+ -+ memset(&req, 0, sizeof(req)); -+ req.count = queue->num_buffers; -+ req.memory = memory; -+ req.type = fmt->type; -+ -+ ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req); -+ if (ret < 0) { -+ av_log(NULL, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno)); -+ -+ return AVERROR(errno); -+ } -+ -+ queue->num_buffers = req.count; -+ queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer)); -+ if (!queue->buffers) { -+ av_log(NULL, AV_LOG_ERROR, "malloc enomem\n"); -+ -+ return AVERROR(ENOMEM); -+ } -+ -+ for (i = 0; i < queue->num_buffers; i++) { -+ V4L2Buffer *buf = &queue->buffers[i]; -+ -+ buf->enqueued = 0; -+ buf->fd = -1; -+ buf->q = queue; -+ -+ buf->buffer.type = fmt->type; -+ buf->buffer.memory = memory; -+ buf->buffer.index = i; -+ -+ if (multiplanar) { -+ buf->buffer.length = VIDEO_MAX_PLANES; -+ buf->buffer.m.planes = buf->planes; -+ } -+ -+ ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); -+ if (ret < 0) { -+ ret = AVERROR(errno); -+ -+ goto fail; -+ } -+ -+ if (multiplanar) -+ buf->num_planes = buf->buffer.length; -+ else -+ buf->num_planes = 1; -+ -+ for (j = 0; j < buf->num_planes; j++) { -+ V4L2PlaneInfo *info = &buf->plane_info[j]; -+ -+ if (multiplanar) { -+ info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline; -+ info->length = buf->buffer.m.planes[j].length; -+ } else { -+ info->bytesperline = fmt->fmt.pix.bytesperline; -+ info->length = buf->buffer.length; -+ } -+ } -+ -+ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { -+ ret = deint_v4l2m2m_enqueue_buffer(buf); -+ if (ret) -+ goto fail; -+ -+ ret = v4l2_buffer_export_drm(buf); -+ if (ret) -+ goto fail; -+ } -+ } -+ -+ return 0; -+ -+fail: -+ for (i = 0; i < queue->num_buffers; i++) -+ if (queue->buffers[i].fd >= 0) -+ close(queue->buffers[i].fd); -+ av_free(queue->buffers); -+ queue->buffers = NULL; -+ -+ return ret; -+} -+ -+static int deint_v4l2m2m_streamon(V4L2Queue 
*queue) -+{ -+ int type = queue->format.type; -+ int ret; -+ -+ ret = ioctl(queue->ctx->fd, VIDIOC_STREAMON, &type); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_streamoff(V4L2Queue *queue) -+{ -+ int type = queue->format.type; -+ int ret; -+ -+ ret = ioctl(queue->ctx->fd, VIDIOC_STREAMOFF, &type); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ return 0; -+} -+ -+static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout) -+{ -+ struct v4l2_plane planes[VIDEO_MAX_PLANES]; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ struct v4l2_buffer buf = { 0 }; -+ V4L2Buffer* avbuf = NULL; -+ struct pollfd pfd; -+ short events; -+ int ret; -+ -+ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) -+ events = POLLOUT | POLLWRNORM; -+ else -+ events = POLLIN | POLLRDNORM; -+ -+ pfd.events = events; -+ pfd.fd = ctx->fd; -+ -+ for (;;) { -+ ret = poll(&pfd, 1, timeout); -+ if (ret > 0) -+ break; -+ if (errno == EINTR) -+ continue; -+ return NULL; -+ } -+ -+ if (pfd.revents & POLLERR) -+ return NULL; -+ -+ if (pfd.revents & events) { -+ memset(&buf, 0, sizeof(buf)); -+ buf.memory = V4L2_MEMORY_MMAP; -+ buf.type = queue->format.type; -+ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { -+ memset(planes, 0, sizeof(planes)); -+ buf.length = VIDEO_MAX_PLANES; -+ buf.m.planes = planes; -+ } -+ -+ ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf); -+ if (ret) { -+ if (errno != EAGAIN) -+ av_log(NULL, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n", -+ av_err2str(AVERROR(errno))); -+ return NULL; -+ } -+ -+ avbuf = &queue->buffers[buf.index]; -+ avbuf->enqueued = 0; -+ avbuf->buffer = buf; -+ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { -+ memcpy(avbuf->planes, planes, sizeof(planes)); -+ avbuf->buffer.m.planes = avbuf->planes; -+ } -+ -+ return avbuf; -+ } -+ -+ return NULL; -+} -+ -+static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue) -+{ -+ int i; -+ -+ for (i = 0; i < queue->num_buffers; i++) -+ if 
(!queue->buffers[i].enqueued) -+ return &queue->buffers[i]; -+ -+ return NULL; -+} -+ -+static int deint_v4l2m2m_enqueue(V4L2Queue *queue, const AVFrame* frame) -+{ -+ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0]; -+ V4L2Buffer *buf; -+ int i; -+ -+ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) -+ while (deint_v4l2m2m_dequeue_buffer(queue, 0)); -+ -+ buf = deint_v4l2m2m_find_free_buf(queue); -+ if (!buf) -+ return AVERROR(ENOMEM); -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) -+ for (i = 0; i < drm_desc->nb_objects; i++) -+ buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd; -+ else -+ buf->buffer.m.fd = drm_desc->objects[0].fd; -+ -+ return deint_v4l2m2m_enqueue_buffer(buf); -+} -+ -+static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) -+{ -+ if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { -+ V4L2Queue *capture = &ctx->capture; -+ V4L2Queue *output = &ctx->output; -+ int i; -+ -+ av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); -+ -+ if (ctx->fd >= 0) { -+ deint_v4l2m2m_streamoff(capture); -+ deint_v4l2m2m_streamoff(output); -+ } -+ -+ if (capture->buffers) -+ for (i = 0; i < capture->num_buffers; i++) { -+ capture->buffers[i].q = NULL; -+ if (capture->buffers[i].fd >= 0) -+ close(capture->buffers[i].fd); -+ } -+ -+ for (i = 0; i < ctx->frame_count; i++) -+ av_frame_free(&ctx->frames[i]); -+ -+ av_buffer_unref(&ctx->hw_frames_ctx); -+ -+ if (capture->buffers) -+ av_free(capture->buffers); -+ -+ if (output->buffers) -+ av_free(output->buffers); -+ -+ if (ctx->fd >= 0) { -+ close(ctx->fd); -+ ctx->fd = -1; -+ } -+ -+ av_free(ctx); -+ } -+} -+ -+static void v4l2_free_buffer(void *opaque, uint8_t *unused) -+{ -+ V4L2Buffer *buf = opaque; -+ DeintV4L2M2MContextShared *ctx = buf->q->ctx; -+ -+ if (!ctx->done) -+ deint_v4l2m2m_enqueue_buffer(buf); -+ -+ deint_v4l2m2m_destroy_context(ctx); -+} -+ -+static uint8_t *v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) -+{ -+ AVDRMFrameDescriptor 
*drm_desc = &avbuf->drm_frame; -+ AVDRMLayerDescriptor *layer; -+ -+ /* fill the DRM frame descriptor */ -+ drm_desc->nb_objects = avbuf->num_planes; -+ drm_desc->nb_layers = 1; -+ -+ layer = &drm_desc->layers[0]; -+ layer->nb_planes = avbuf->num_planes; -+ -+ for (int i = 0; i < avbuf->num_planes; i++) { -+ layer->planes[i].object_index = i; -+ layer->planes[i].offset = 0; -+ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; -+ } -+ -+ layer->format = DRM_FORMAT_NV12; -+ -+ if (avbuf->num_planes == 1) { -+ layer->nb_planes = 2; -+ -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * height; -+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; -+ } -+ -+ return (uint8_t *)drm_desc; -+} -+ -+static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout) -+{ -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ V4L2Buffer* avbuf; -+ -+ avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); -+ if (!avbuf) { -+ av_log(NULL, AV_LOG_ERROR, "dequeueing failed\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame, -+ sizeof(avbuf->drm_frame), v4l2_free_buffer, -+ avbuf, AV_BUFFER_FLAG_READONLY); -+ if (!frame->buf[0]) -+ return AVERROR(ENOMEM); -+ -+ atomic_fetch_add(&ctx->refcount, 1); -+ -+ frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height); -+ frame->format = AV_PIX_FMT_DRM_PRIME; -+ frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); -+ frame->height = ctx->height; -+ frame->width = ctx->width; -+ -+ if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { -+ av_log(NULL, AV_LOG_ERROR, "driver decode error\n"); -+ frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM; -+ } -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_dequeue(AVFilterContext *avctx, AVFrame *input_frame, int field) -+{ -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ AVFilterLink 
*outlink = avctx->outputs[0]; -+ AVFrame *output_frame; -+ int err; -+ -+ output_frame = av_frame_alloc(); -+ -+ if (!output_frame) -+ return AVERROR(ENOMEM); -+ -+ err = deint_v4l2m2m_dequeue_frame(&ctx->capture, output_frame, 500); -+ if (err < 0) { -+ av_log(priv, AV_LOG_ERROR, "no frame (field %d)\n", field); -+ goto fail; -+ } -+ -+ err = av_frame_copy_props(output_frame, input_frame); -+ if (err < 0) -+ goto fail; -+ -+ output_frame->interlaced_frame = 0; -+ -+ if (field == 0) { -+ output_frame->pts *= 2; -+ } else { -+ int64_t cur_pts = ctx->frames[0]->pts; -+ int64_t next_pts = ctx->frames[1]->pts; -+ -+ if (next_pts != AV_NOPTS_VALUE && cur_pts != AV_NOPTS_VALUE) { -+ output_frame->pts = next_pts + cur_pts; -+ } else { -+ output_frame->pts = AV_NOPTS_VALUE; -+ } -+ } -+ av_log(priv, AV_LOG_DEBUG, "pts: %"PRId64" (field %d)\n", output_frame->pts, field); -+ -+ return ff_filter_frame(outlink, output_frame); -+ -+fail: -+ av_frame_free(&output_frame); -+ return err; -+} -+ -+static int deint_v4l2m2m_config_props(AVFilterLink *outlink) -+{ -+ AVFilterLink *inlink = outlink->src->inputs[0]; -+ AVFilterContext *avctx = outlink->src; -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ int ret; -+ -+ ctx->height = avctx->inputs[0]->h; -+ ctx->width = avctx->inputs[0]->w; -+ -+ outlink->frame_rate = av_mul_q(inlink->frame_rate, -+ (AVRational){ 2, 1 }); -+ outlink->time_base = av_mul_q(inlink->time_base, -+ (AVRational){ 1, 2 }); -+ -+ ret = deint_v4l2m2m_find_device(ctx); -+ if (ret) -+ return ret; -+ -+ if (!inlink->hw_frames_ctx) { -+ av_log(priv, AV_LOG_ERROR, "No hw context provided on input\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx); -+ if (!ctx->hw_frames_ctx) -+ return AVERROR(ENOMEM); -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_query_formats(AVFilterContext *avctx) -+{ -+ static const enum AVPixelFormat pixel_formats[] = { -+ AV_PIX_FMT_DRM_PRIME, 
-+ AV_PIX_FMT_NONE, -+ }; -+ -+ return ff_set_common_formats(avctx, ff_make_format_list(pixel_formats)); -+} -+ -+static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) -+{ -+ AVFilterContext *avctx = link->dst; -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ V4L2Queue *capture = &ctx->capture; -+ V4L2Queue *output = &ctx->output; -+ int ret; -+ -+ av_log(priv, AV_LOG_DEBUG, "input pts: %"PRId64"\n", in->pts); -+ if (!ctx->frame_count) { -+ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0]; -+ unsigned int field; -+ -+ ctx->orig_width = drm_desc->layers[0].planes[0].pitch; -+ ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; -+ -+ if (in->top_field_first) -+ field = V4L2_FIELD_INTERLACED_TB; -+ else -+ field = V4L2_FIELD_INTERLACED_BT; -+ -+ ret = deint_v4l2m2m_set_format(output, field, ctx->orig_width, ctx->orig_height); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->orig_width, ctx->orig_height); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_allocate_buffers(capture); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_streamon(capture); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_allocate_buffers(output); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_streamon(output); -+ if (ret) -+ return ret; -+ } -+ -+ if (ctx->frame_count < 2) { -+ ctx->frames[ctx->frame_count++] = in; -+ } else { -+ av_frame_free(&ctx->frames[0]); -+ ctx->frames[0] = ctx->frames[1]; -+ ctx->frames[1] = in; -+ } -+ -+ ret = deint_v4l2m2m_enqueue(output, in); -+ if (ret) -+ return ret; -+ -+ if (ctx->frame_count == 2) { -+ ret = deint_v4l2m2m_dequeue(avctx, ctx->frames[0], 0); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_dequeue(avctx, ctx->frames[0], 1); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) -+{ -+ DeintV4L2M2MContext *priv = 
avctx->priv; -+ DeintV4L2M2MContextShared *ctx; -+ -+ ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); -+ if (!ctx) -+ return AVERROR(ENOMEM); -+ -+ priv->shared = ctx; -+ ctx->fd = -1; -+ ctx->output.ctx = ctx; -+ ctx->output.num_buffers = 6; -+ ctx->capture.ctx = ctx; -+ ctx->capture.num_buffers = 6; -+ ctx->done = 0; -+ atomic_init(&ctx->refcount, 1); -+ -+ return 0; -+} -+ -+static void deint_v4l2m2m_uninit(AVFilterContext *avctx) -+{ -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ -+ ctx->done = 1; -+ deint_v4l2m2m_destroy_context(ctx); -+} -+ -+static const AVOption deinterlace_v4l2m2m_options[] = { -+ { NULL }, -+}; -+ -+AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); -+ -+static const AVFilterPad deint_v4l2m2m_inputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .filter_frame = deint_v4l2m2m_filter_frame, -+ }, -+ { NULL } -+}; -+ -+static const AVFilterPad deint_v4l2m2m_outputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .config_props = deint_v4l2m2m_config_props, -+ }, -+ { NULL } -+}; -+ -+AVFilter ff_vf_deinterlace_v4l2m2m = { -+ .name = "deinterlace_v4l2m2m", -+ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"), -+ .priv_size = sizeof(DeintV4L2M2MContext), -+ .init = &deint_v4l2m2m_init, -+ .uninit = &deint_v4l2m2m_uninit, -+ .query_formats = &deint_v4l2m2m_query_formats, -+ .inputs = deint_v4l2m2m_inputs, -+ .outputs = deint_v4l2m2m_outputs, -+ .priv_class = &deinterlace_v4l2m2m_class, -+}; --- -2.29.2 - diff --git a/projects/Allwinner/patches/ffmpeg/0002-libavfilter-v4l2deinterlace-dequeue-both-destination.patch b/projects/Allwinner/patches/ffmpeg/0002-libavfilter-v4l2deinterlace-dequeue-both-destination.patch deleted file mode 100644 index cefbd9c64d..0000000000 --- a/projects/Allwinner/patches/ffmpeg/0002-libavfilter-v4l2deinterlace-dequeue-both-destination.patch +++ /dev/null @@ -1,230 +0,0 @@ -From 6bea46839ba23bffaa093bb9ed805d571aaa66ea Mon 
Sep 17 00:00:00 2001 -From: Alex Bee -Date: Wed, 30 Sep 2020 21:11:34 +0200 -Subject: [PATCH] libavfilter: v4l2deinterlace: dequeue both destination - buffers on time - -Signed-off-by: Alex Bee ---- - libavfilter/vf_deinterlace_v4l2m2m.c | 140 +++++++++++++++++---------- - 1 file changed, 88 insertions(+), 52 deletions(-) - -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -index 1029e5b620fd..72d28333ffa7 100644 ---- a/libavfilter/vf_deinterlace_v4l2m2m.c -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -89,8 +89,14 @@ typedef struct DeintV4L2M2MContextShared { - - AVBufferRef *hw_frames_ctx; - -- int frame_count; -- AVFrame *frames[2]; -+ /* -+ * TODO: check if its really neccessary to hold this -+ * ref, it's only used for freeing av_frame on decoding -+ * end/abort -+ */ -+ AVFrame *cur_in_frame; -+ AVFrame *prev_in_frame; -+ unsigned int field_order; - - V4L2Queue output; - V4L2Queue capture; -@@ -557,8 +563,11 @@ static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) - close(capture->buffers[i].fd); - } - -- for (i = 0; i < ctx->frame_count; i++) -- av_frame_free(&ctx->frames[i]); -+ if (ctx->cur_in_frame) -+ av_frame_free(&ctx->cur_in_frame); -+ -+ if (ctx->prev_in_frame) -+ av_frame_free(&ctx->prev_in_frame); - - av_buffer_unref(&ctx->hw_frames_ctx); - -@@ -652,49 +661,79 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim - return 0; - } - --static int deint_v4l2m2m_dequeue(AVFilterContext *avctx, AVFrame *input_frame, int field) -+static int deint_v4l2m2m_dequeue(AVFilterContext *avctx, AVFrame *input_frame) - { - DeintV4L2M2MContext *priv = avctx->priv; - DeintV4L2M2MContextShared *ctx = priv->shared; - AVFilterLink *outlink = avctx->outputs[0]; -- AVFrame *output_frame; -+ AVFrame *output_frame_1, *output_frame_2; -+ int64_t first_pts = AV_NOPTS_VALUE; - int err; - -- output_frame = av_frame_alloc(); -+ av_log(priv, AV_LOG_DEBUG, "input pts: %"PRId64" (field 
%d)\n", -+ input_frame->pts, ctx->field_order); - -- if (!output_frame) -+ output_frame_1 = av_frame_alloc(); -+ if (!output_frame_1) - return AVERROR(ENOMEM); - -- err = deint_v4l2m2m_dequeue_frame(&ctx->capture, output_frame, 500); -+ err = deint_v4l2m2m_dequeue_frame(&ctx->capture, output_frame_1, 500); - if (err < 0) { -- av_log(priv, AV_LOG_ERROR, "no frame (field %d)\n", field); -- goto fail; -+ av_log(priv, AV_LOG_ERROR, "no 1st frame (field %d)\n", ctx->field_order); -+ goto fail_out1; - } - -- err = av_frame_copy_props(output_frame, input_frame); -+ err = av_frame_copy_props(output_frame_1, input_frame); - if (err < 0) -- goto fail; -+ goto fail_out1; - -- output_frame->interlaced_frame = 0; -+ output_frame_1->interlaced_frame = 0; - -- if (field == 0) { -- output_frame->pts *= 2; -- } else { -- int64_t cur_pts = ctx->frames[0]->pts; -- int64_t next_pts = ctx->frames[1]->pts; -+ output_frame_2 = av_frame_alloc(); -+ if (!output_frame_2) { -+ err = AVERROR(ENOMEM); -+ goto fail_out1; -+ } -+ -+ err = deint_v4l2m2m_dequeue_frame(&ctx->capture, output_frame_2, 500); -+ if (err < 0) { -+ av_log(priv, AV_LOG_ERROR, "no 2nd frame (field %d)\n", ctx->field_order); -+ goto fail_out2; -+ } -+ -+ err = av_frame_copy_props(output_frame_2, input_frame); -+ if (err < 0) -+ goto fail_out2; -+ -+ output_frame_2->interlaced_frame = 0; - -- if (next_pts != AV_NOPTS_VALUE && cur_pts != AV_NOPTS_VALUE) { -- output_frame->pts = next_pts + cur_pts; -- } else { -- output_frame->pts = AV_NOPTS_VALUE; -- } -+ if (ctx->prev_in_frame && ctx->prev_in_frame->pts != AV_NOPTS_VALUE -+ && input_frame->pts != AV_NOPTS_VALUE) { -+ first_pts = (ctx->prev_in_frame->pts + input_frame->pts) / 2; -+ av_log(priv, AV_LOG_DEBUG, "calculated first pts %"PRId64"\n", first_pts); - } -- av_log(priv, AV_LOG_DEBUG, "pts: %"PRId64" (field %d)\n", output_frame->pts, field); - -- return ff_filter_frame(outlink, output_frame); -+ output_frame_1->pts = first_pts; -+ -+ err = ff_filter_frame(outlink, 
output_frame_1); -+ if (err < 0) { -+ av_frame_free(&output_frame_2); -+ return err; -+ } -+ err = ff_filter_frame(outlink, output_frame_2); -+ -+ if (err < 0) -+ return err; -+ -+ av_log(priv, AV_LOG_DEBUG, "1st frame pts: %"PRId64" 2nd frame pts: %"PRId64" first pts: %"PRId64" (field %d)\n", -+ output_frame_1->pts, output_frame_2->pts, first_pts, ctx->field_order); -+ -+ return 0; - --fail: -- av_frame_free(&output_frame); -+fail_out2: -+ av_frame_free(&output_frame_2); -+fail_out1: -+ av_frame_free(&output_frame_1); - return err; - } - -@@ -749,20 +788,22 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) - V4L2Queue *output = &ctx->output; - int ret; - -- av_log(priv, AV_LOG_DEBUG, "input pts: %"PRId64"\n", in->pts); -- if (!ctx->frame_count) { -+ av_log(priv, AV_LOG_DEBUG, "input pts: %"PRId64" field :%d interlaced: %d\n", -+ in->pts, in->top_field_first, in->interlaced_frame); -+ -+ ctx->cur_in_frame = in; -+ -+ if (ctx->field_order == V4L2_FIELD_ANY) { - AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0]; -- unsigned int field; -- - ctx->orig_width = drm_desc->layers[0].planes[0].pitch; - ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; - -- if (in->top_field_first) -- field = V4L2_FIELD_INTERLACED_TB; -+ if (in->top_field_first) -+ ctx->field_order = V4L2_FIELD_INTERLACED_TB; - else -- field = V4L2_FIELD_INTERLACED_BT; -+ ctx->field_order = V4L2_FIELD_INTERLACED_BT; - -- ret = deint_v4l2m2m_set_format(output, field, ctx->orig_width, ctx->orig_height); -+ ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->orig_width, ctx->orig_height); - if (ret) - return ret; - -@@ -787,27 +828,19 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) - return ret; - } - -- if (ctx->frame_count < 2) { -- ctx->frames[ctx->frame_count++] = in; -- } else { -- av_frame_free(&ctx->frames[0]); -- ctx->frames[0] = ctx->frames[1]; -- ctx->frames[1] = in; -- } -- - ret = 
deint_v4l2m2m_enqueue(output, in); - if (ret) - return ret; - -- if (ctx->frame_count == 2) { -- ret = deint_v4l2m2m_dequeue(avctx, ctx->frames[0], 0); -- if (ret) -- return ret; -+ ret = deint_v4l2m2m_dequeue(avctx, in); -+ if (ret) -+ return ret; - -- ret = deint_v4l2m2m_dequeue(avctx, ctx->frames[0], 1); -- if (ret) -- return ret; -- } -+ if (ctx->prev_in_frame) -+ av_frame_free(&ctx->prev_in_frame); -+ -+ ctx->prev_in_frame = in; -+ ctx->cur_in_frame = NULL; - - return 0; - } -@@ -828,6 +861,9 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) - ctx->capture.ctx = ctx; - ctx->capture.num_buffers = 6; - ctx->done = 0; -+ ctx->field_order = V4L2_FIELD_ANY; -+ ctx->cur_in_frame = NULL; -+ ctx->prev_in_frame = NULL; - atomic_init(&ctx->refcount, 1); - - return 0; --- -2.29.2 - diff --git a/projects/Rockchip/patches/ffmpeg/ffmpeg-0006-deint_v4l2m2m-increase-input-and-output-buffers.patch b/projects/Rockchip/patches/ffmpeg/ffmpeg-0001-deint_v4l2m2m-increase-input-and-output-buffers.patch similarity index 100% rename from projects/Rockchip/patches/ffmpeg/ffmpeg-0006-deint_v4l2m2m-increase-input-and-output-buffers.patch rename to projects/Rockchip/patches/ffmpeg/ffmpeg-0001-deint_v4l2m2m-increase-input-and-output-buffers.patch diff --git a/projects/Rockchip/patches/ffmpeg/ffmpeg-0002-WIP-deint-filter.patch b/projects/Rockchip/patches/ffmpeg/ffmpeg-0002-WIP-deint-filter.patch deleted file mode 100644 index 34ccc5b6e0..0000000000 --- a/projects/Rockchip/patches/ffmpeg/ffmpeg-0002-WIP-deint-filter.patch +++ /dev/null @@ -1,924 +0,0 @@ -From 39069d9cc03a42cd497dd6b9756116ff4b684a5d Mon Sep 17 00:00:00 2001 -From: Jernej Skrabec -Date: Tue, 3 Dec 2019 21:01:18 +0100 -Subject: [PATCH] WIP deint filter - ---- - libavfilter/Makefile | 1 + - libavfilter/allfilters.c | 1 + - libavfilter/vf_deinterlace_v4l2m2m.c | 879 +++++++++++++++++++++++++++ - 3 files changed, 881 insertions(+) - create mode 100644 libavfilter/vf_deinterlace_v4l2m2m.c - -diff --git 
a/libavfilter/Makefile b/libavfilter/Makefile -index 512354065305..625fd29f9313 100644 ---- a/libavfilter/Makefile -+++ b/libavfilter/Makefile -@@ -218,6 +218,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) += vf_neighbor.o - OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o - OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_deinterlace_qsv.o - OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o -+OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o - OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o - OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o - OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o -diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c -index 1183e4026751..fe5a2e8c02e8 100644 ---- a/libavfilter/allfilters.c -+++ b/libavfilter/allfilters.c -@@ -204,6 +204,7 @@ extern AVFilter ff_vf_dedot; - extern AVFilter ff_vf_deflate; - extern AVFilter ff_vf_deflicker; - extern AVFilter ff_vf_deinterlace_qsv; -+extern AVFilter ff_vf_deinterlace_v4l2m2m; - extern AVFilter ff_vf_deinterlace_vaapi; - extern AVFilter ff_vf_dejudder; - extern AVFilter ff_vf_delogo; -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -new file mode 100644 -index 000000000000..1029e5b620fd ---- /dev/null -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -0,0 +1,879 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/** -+ * @file -+ * deinterlace video filter - V4L2 M2M -+ */ -+ -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "libavutil/avassert.h" -+#include "libavutil/avstring.h" -+#include "libavutil/common.h" -+#include "libavutil/hwcontext.h" -+#include "libavutil/hwcontext_drm.h" -+#include "libavutil/internal.h" -+#include "libavutil/mathematics.h" -+#include "libavutil/opt.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/time.h" -+ -+#include "avfilter.h" -+#include "formats.h" -+#include "internal.h" -+#include "video.h" -+ -+typedef struct V4L2Queue V4L2Queue; -+typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; -+ -+typedef struct V4L2PlaneInfo { -+ int bytesperline; -+ size_t length; -+} V4L2PlaneInfo; -+ -+typedef struct V4L2Buffer { -+ int enqueued; -+ int reenqueue; -+ int fd; -+ struct v4l2_buffer buffer; -+ struct v4l2_plane planes[VIDEO_MAX_PLANES]; -+ int num_planes; -+ V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES]; -+ AVDRMFrameDescriptor drm_frame; -+ V4L2Queue *q; -+} V4L2Buffer; -+ -+typedef struct V4L2Queue { -+ struct v4l2_format format; -+ int num_buffers; -+ V4L2Buffer *buffers; -+ DeintV4L2M2MContextShared *ctx; -+} V4L2Queue; -+ -+typedef struct DeintV4L2M2MContextShared { -+ int fd; -+ int done; -+ int width; -+ int height; -+ int orig_width; -+ int orig_height; -+ atomic_uint refcount; -+ -+ AVBufferRef *hw_frames_ctx; -+ -+ int frame_count; -+ AVFrame *frames[2]; -+ -+ V4L2Queue output; -+ V4L2Queue capture; -+} DeintV4L2M2MContextShared; -+ -+typedef struct DeintV4L2M2MContext { -+ const AVClass *class; -+ -+ DeintV4L2M2MContextShared *shared; -+} DeintV4L2M2MContext; -+ -+static int 
deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) -+{ -+ struct v4l2_capability cap; -+ int ret; -+ -+ memset(&cap, 0, sizeof(cap)); -+ ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap); -+ if (ret < 0) -+ return ret; -+ -+ if (!(cap.capabilities & V4L2_CAP_STREAMING)) -+ return AVERROR(EINVAL); -+ -+ if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { -+ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -+ -+ return 0; -+ } -+ -+ if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { -+ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; -+ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; -+ -+ return 0; -+ } -+ -+ return AVERROR(EINVAL); -+} -+ -+static int deint_v4l2m2m_try_format(V4L2Queue *queue) -+{ -+ struct v4l2_format *fmt = &queue->format; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ int ret, field; -+ -+ ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt); -+ if (ret) -+ av_log(NULL, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret); -+ -+ if (V4L2_TYPE_IS_OUTPUT(fmt->type)) -+ field = V4L2_FIELD_INTERLACED_TB; -+ else -+ field = V4L2_FIELD_NONE; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_NV12; -+ fmt->fmt.pix_mp.field = field; -+ fmt->fmt.pix_mp.width = ctx->width; -+ fmt->fmt.pix_mp.height = ctx->height; -+ } else { -+ fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_NV12; -+ fmt->fmt.pix.field = field; -+ fmt->fmt.pix.width = ctx->width; -+ fmt->fmt.pix.height = ctx->height; -+ } -+ -+ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt); -+ if (ret) -+ return AVERROR(EINVAL); -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12 || -+ fmt->fmt.pix_mp.field != field) { -+ av_log(NULL, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); -+ -+ return AVERROR(EINVAL); -+ } -+ } else { -+ if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12 || -+ fmt->fmt.pix.field != field) { -+ av_log(NULL, 
AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); -+ -+ return AVERROR(EINVAL); -+ } -+ } -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height) -+{ -+ struct v4l2_format *fmt = &queue->format; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ int ret; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.field = field; -+ fmt->fmt.pix_mp.width = width; -+ fmt->fmt.pix_mp.height = height; -+ /* TODO: bytesperline and imagesize */ -+ } else { -+ fmt->fmt.pix.field = field; -+ fmt->fmt.pix.width = width; -+ fmt->fmt.pix.height = height; -+ fmt->fmt.pix.sizeimage = 0; -+ fmt->fmt.pix.bytesperline = 0; -+ } -+ -+ ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); -+ if (ret) -+ av_log(NULL, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); -+ -+ return ret; -+} -+ -+static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) -+{ -+ int ret; -+ -+ ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0); -+ if (ctx->fd < 0) -+ return AVERROR(errno); -+ -+ ret = deint_v4l2m2m_prepare_context(ctx); -+ if (ret) -+ goto fail; -+ -+ ret = deint_v4l2m2m_try_format(&ctx->capture); -+ if (ret) -+ goto fail; -+ -+ ret = deint_v4l2m2m_try_format(&ctx->output); -+ if (ret) -+ goto fail; -+ -+ return 0; -+ -+fail: -+ close(ctx->fd); -+ ctx->fd = -1; -+ -+ return ret; -+} -+ -+static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx) -+{ -+ int ret = AVERROR(EINVAL); -+ struct dirent *entry; -+ char node[PATH_MAX]; -+ DIR *dirp; -+ -+ dirp = opendir("/dev"); -+ if (!dirp) -+ return AVERROR(errno); -+ -+ for (entry = readdir(dirp); entry; entry = readdir(dirp)) { -+ -+ if (strncmp(entry->d_name, "video", 5)) -+ continue; -+ -+ snprintf(node, sizeof(node), "/dev/%s", entry->d_name); -+ av_log(NULL, AV_LOG_DEBUG, "probing device %s\n", node); -+ ret = deint_v4l2m2m_probe_device(ctx, node); -+ if (!ret) -+ break; -+ } -+ -+ closedir(dirp); -+ -+ if (ret) { -+ av_log(NULL, AV_LOG_ERROR, 
"Could not find a valid device\n"); -+ ctx->fd = -1; -+ -+ return ret; -+ } -+ -+ av_log(NULL, AV_LOG_INFO, "Using device %s\n", node); -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) -+{ -+ int ret; -+ -+ ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ buf->enqueued = 1; -+ -+ return 0; -+} -+ -+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) -+{ -+ struct v4l2_exportbuffer expbuf; -+ int i, ret; -+ -+ for (i = 0; i < avbuf->num_planes; i++) { -+ memset(&expbuf, 0, sizeof(expbuf)); -+ -+ expbuf.index = avbuf->buffer.index; -+ expbuf.type = avbuf->buffer.type; -+ expbuf.plane = i; -+ -+ ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ avbuf->fd = expbuf.fd; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) { -+ /* drm frame */ -+ avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; -+ avbuf->drm_frame.objects[i].fd = expbuf.fd; -+ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ } else { -+ /* drm frame */ -+ avbuf->drm_frame.objects[0].size = avbuf->buffer.length; -+ avbuf->drm_frame.objects[0].fd = expbuf.fd; -+ avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ } -+ } -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) -+{ -+ struct v4l2_format *fmt = &queue->format; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ struct v4l2_requestbuffers req; -+ int ret, i, j, multiplanar; -+ uint32_t memory; -+ -+ memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
-+ V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; -+ -+ multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type); -+ -+ memset(&req, 0, sizeof(req)); -+ req.count = queue->num_buffers; -+ req.memory = memory; -+ req.type = fmt->type; -+ -+ ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req); -+ if (ret < 0) { -+ av_log(NULL, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno)); -+ -+ return AVERROR(errno); -+ } -+ -+ queue->num_buffers = req.count; -+ queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer)); -+ if (!queue->buffers) { -+ av_log(NULL, AV_LOG_ERROR, "malloc enomem\n"); -+ -+ return AVERROR(ENOMEM); -+ } -+ -+ for (i = 0; i < queue->num_buffers; i++) { -+ V4L2Buffer *buf = &queue->buffers[i]; -+ -+ buf->enqueued = 0; -+ buf->fd = -1; -+ buf->q = queue; -+ -+ buf->buffer.type = fmt->type; -+ buf->buffer.memory = memory; -+ buf->buffer.index = i; -+ -+ if (multiplanar) { -+ buf->buffer.length = VIDEO_MAX_PLANES; -+ buf->buffer.m.planes = buf->planes; -+ } -+ -+ ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); -+ if (ret < 0) { -+ ret = AVERROR(errno); -+ -+ goto fail; -+ } -+ -+ if (multiplanar) -+ buf->num_planes = buf->buffer.length; -+ else -+ buf->num_planes = 1; -+ -+ for (j = 0; j < buf->num_planes; j++) { -+ V4L2PlaneInfo *info = &buf->plane_info[j]; -+ -+ if (multiplanar) { -+ info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline; -+ info->length = buf->buffer.m.planes[j].length; -+ } else { -+ info->bytesperline = fmt->fmt.pix.bytesperline; -+ info->length = buf->buffer.length; -+ } -+ } -+ -+ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { -+ ret = deint_v4l2m2m_enqueue_buffer(buf); -+ if (ret) -+ goto fail; -+ -+ ret = v4l2_buffer_export_drm(buf); -+ if (ret) -+ goto fail; -+ } -+ } -+ -+ return 0; -+ -+fail: -+ for (i = 0; i < queue->num_buffers; i++) -+ if (queue->buffers[i].fd >= 0) -+ close(queue->buffers[i].fd); -+ av_free(queue->buffers); -+ queue->buffers = NULL; -+ -+ return ret; -+} -+ -+static int deint_v4l2m2m_streamon(V4L2Queue 
*queue) -+{ -+ int type = queue->format.type; -+ int ret; -+ -+ ret = ioctl(queue->ctx->fd, VIDIOC_STREAMON, &type); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_streamoff(V4L2Queue *queue) -+{ -+ int type = queue->format.type; -+ int ret; -+ -+ ret = ioctl(queue->ctx->fd, VIDIOC_STREAMOFF, &type); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ return 0; -+} -+ -+static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout) -+{ -+ struct v4l2_plane planes[VIDEO_MAX_PLANES]; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ struct v4l2_buffer buf = { 0 }; -+ V4L2Buffer* avbuf = NULL; -+ struct pollfd pfd; -+ short events; -+ int ret; -+ -+ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) -+ events = POLLOUT | POLLWRNORM; -+ else -+ events = POLLIN | POLLRDNORM; -+ -+ pfd.events = events; -+ pfd.fd = ctx->fd; -+ -+ for (;;) { -+ ret = poll(&pfd, 1, timeout); -+ if (ret > 0) -+ break; -+ if (errno == EINTR) -+ continue; -+ return NULL; -+ } -+ -+ if (pfd.revents & POLLERR) -+ return NULL; -+ -+ if (pfd.revents & events) { -+ memset(&buf, 0, sizeof(buf)); -+ buf.memory = V4L2_MEMORY_MMAP; -+ buf.type = queue->format.type; -+ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { -+ memset(planes, 0, sizeof(planes)); -+ buf.length = VIDEO_MAX_PLANES; -+ buf.m.planes = planes; -+ } -+ -+ ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf); -+ if (ret) { -+ if (errno != EAGAIN) -+ av_log(NULL, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n", -+ av_err2str(AVERROR(errno))); -+ return NULL; -+ } -+ -+ avbuf = &queue->buffers[buf.index]; -+ avbuf->enqueued = 0; -+ avbuf->buffer = buf; -+ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { -+ memcpy(avbuf->planes, planes, sizeof(planes)); -+ avbuf->buffer.m.planes = avbuf->planes; -+ } -+ -+ return avbuf; -+ } -+ -+ return NULL; -+} -+ -+static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue) -+{ -+ int i; -+ -+ for (i = 0; i < queue->num_buffers; i++) -+ if 
(!queue->buffers[i].enqueued) -+ return &queue->buffers[i]; -+ -+ return NULL; -+} -+ -+static int deint_v4l2m2m_enqueue(V4L2Queue *queue, const AVFrame* frame) -+{ -+ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0]; -+ V4L2Buffer *buf; -+ int i; -+ -+ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) -+ while (deint_v4l2m2m_dequeue_buffer(queue, 0)); -+ -+ buf = deint_v4l2m2m_find_free_buf(queue); -+ if (!buf) -+ return AVERROR(ENOMEM); -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) -+ for (i = 0; i < drm_desc->nb_objects; i++) -+ buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd; -+ else -+ buf->buffer.m.fd = drm_desc->objects[0].fd; -+ -+ return deint_v4l2m2m_enqueue_buffer(buf); -+} -+ -+static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) -+{ -+ if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { -+ V4L2Queue *capture = &ctx->capture; -+ V4L2Queue *output = &ctx->output; -+ int i; -+ -+ av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); -+ -+ if (ctx->fd >= 0) { -+ deint_v4l2m2m_streamoff(capture); -+ deint_v4l2m2m_streamoff(output); -+ } -+ -+ if (capture->buffers) -+ for (i = 0; i < capture->num_buffers; i++) { -+ capture->buffers[i].q = NULL; -+ if (capture->buffers[i].fd >= 0) -+ close(capture->buffers[i].fd); -+ } -+ -+ for (i = 0; i < ctx->frame_count; i++) -+ av_frame_free(&ctx->frames[i]); -+ -+ av_buffer_unref(&ctx->hw_frames_ctx); -+ -+ if (capture->buffers) -+ av_free(capture->buffers); -+ -+ if (output->buffers) -+ av_free(output->buffers); -+ -+ if (ctx->fd >= 0) { -+ close(ctx->fd); -+ ctx->fd = -1; -+ } -+ -+ av_free(ctx); -+ } -+} -+ -+static void v4l2_free_buffer(void *opaque, uint8_t *unused) -+{ -+ V4L2Buffer *buf = opaque; -+ DeintV4L2M2MContextShared *ctx = buf->q->ctx; -+ -+ if (!ctx->done) -+ deint_v4l2m2m_enqueue_buffer(buf); -+ -+ deint_v4l2m2m_destroy_context(ctx); -+} -+ -+static uint8_t *v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) -+{ -+ AVDRMFrameDescriptor 
*drm_desc = &avbuf->drm_frame; -+ AVDRMLayerDescriptor *layer; -+ -+ /* fill the DRM frame descriptor */ -+ drm_desc->nb_objects = avbuf->num_planes; -+ drm_desc->nb_layers = 1; -+ -+ layer = &drm_desc->layers[0]; -+ layer->nb_planes = avbuf->num_planes; -+ -+ for (int i = 0; i < avbuf->num_planes; i++) { -+ layer->planes[i].object_index = i; -+ layer->planes[i].offset = 0; -+ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; -+ } -+ -+ layer->format = DRM_FORMAT_NV12; -+ -+ if (avbuf->num_planes == 1) { -+ layer->nb_planes = 2; -+ -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * height; -+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; -+ } -+ -+ return (uint8_t *)drm_desc; -+} -+ -+static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout) -+{ -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ V4L2Buffer* avbuf; -+ -+ avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); -+ if (!avbuf) { -+ av_log(NULL, AV_LOG_ERROR, "dequeueing failed\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame, -+ sizeof(avbuf->drm_frame), v4l2_free_buffer, -+ avbuf, AV_BUFFER_FLAG_READONLY); -+ if (!frame->buf[0]) -+ return AVERROR(ENOMEM); -+ -+ atomic_fetch_add(&ctx->refcount, 1); -+ -+ frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height); -+ frame->format = AV_PIX_FMT_DRM_PRIME; -+ frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); -+ frame->height = ctx->height; -+ frame->width = ctx->width; -+ -+ if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { -+ av_log(NULL, AV_LOG_ERROR, "driver decode error\n"); -+ frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM; -+ } -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_dequeue(AVFilterContext *avctx, AVFrame *input_frame, int field) -+{ -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ AVFilterLink 
*outlink = avctx->outputs[0]; -+ AVFrame *output_frame; -+ int err; -+ -+ output_frame = av_frame_alloc(); -+ -+ if (!output_frame) -+ return AVERROR(ENOMEM); -+ -+ err = deint_v4l2m2m_dequeue_frame(&ctx->capture, output_frame, 500); -+ if (err < 0) { -+ av_log(priv, AV_LOG_ERROR, "no frame (field %d)\n", field); -+ goto fail; -+ } -+ -+ err = av_frame_copy_props(output_frame, input_frame); -+ if (err < 0) -+ goto fail; -+ -+ output_frame->interlaced_frame = 0; -+ -+ if (field == 0) { -+ output_frame->pts *= 2; -+ } else { -+ int64_t cur_pts = ctx->frames[0]->pts; -+ int64_t next_pts = ctx->frames[1]->pts; -+ -+ if (next_pts != AV_NOPTS_VALUE && cur_pts != AV_NOPTS_VALUE) { -+ output_frame->pts = next_pts + cur_pts; -+ } else { -+ output_frame->pts = AV_NOPTS_VALUE; -+ } -+ } -+ av_log(priv, AV_LOG_DEBUG, "pts: %"PRId64" (field %d)\n", output_frame->pts, field); -+ -+ return ff_filter_frame(outlink, output_frame); -+ -+fail: -+ av_frame_free(&output_frame); -+ return err; -+} -+ -+static int deint_v4l2m2m_config_props(AVFilterLink *outlink) -+{ -+ AVFilterLink *inlink = outlink->src->inputs[0]; -+ AVFilterContext *avctx = outlink->src; -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ int ret; -+ -+ ctx->height = avctx->inputs[0]->h; -+ ctx->width = avctx->inputs[0]->w; -+ -+ outlink->frame_rate = av_mul_q(inlink->frame_rate, -+ (AVRational){ 2, 1 }); -+ outlink->time_base = av_mul_q(inlink->time_base, -+ (AVRational){ 1, 2 }); -+ -+ ret = deint_v4l2m2m_find_device(ctx); -+ if (ret) -+ return ret; -+ -+ if (!inlink->hw_frames_ctx) { -+ av_log(priv, AV_LOG_ERROR, "No hw context provided on input\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx); -+ if (!ctx->hw_frames_ctx) -+ return AVERROR(ENOMEM); -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_query_formats(AVFilterContext *avctx) -+{ -+ static const enum AVPixelFormat pixel_formats[] = { -+ AV_PIX_FMT_DRM_PRIME, 
-+ AV_PIX_FMT_NONE, -+ }; -+ -+ return ff_set_common_formats(avctx, ff_make_format_list(pixel_formats)); -+} -+ -+static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) -+{ -+ AVFilterContext *avctx = link->dst; -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ V4L2Queue *capture = &ctx->capture; -+ V4L2Queue *output = &ctx->output; -+ int ret; -+ -+ av_log(priv, AV_LOG_DEBUG, "input pts: %"PRId64"\n", in->pts); -+ if (!ctx->frame_count) { -+ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0]; -+ unsigned int field; -+ -+ ctx->orig_width = drm_desc->layers[0].planes[0].pitch; -+ ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; -+ -+ if (in->top_field_first) -+ field = V4L2_FIELD_INTERLACED_TB; -+ else -+ field = V4L2_FIELD_INTERLACED_BT; -+ -+ ret = deint_v4l2m2m_set_format(output, field, ctx->orig_width, ctx->orig_height); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->orig_width, ctx->orig_height); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_allocate_buffers(capture); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_streamon(capture); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_allocate_buffers(output); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_streamon(output); -+ if (ret) -+ return ret; -+ } -+ -+ if (ctx->frame_count < 2) { -+ ctx->frames[ctx->frame_count++] = in; -+ } else { -+ av_frame_free(&ctx->frames[0]); -+ ctx->frames[0] = ctx->frames[1]; -+ ctx->frames[1] = in; -+ } -+ -+ ret = deint_v4l2m2m_enqueue(output, in); -+ if (ret) -+ return ret; -+ -+ if (ctx->frame_count == 2) { -+ ret = deint_v4l2m2m_dequeue(avctx, ctx->frames[0], 0); -+ if (ret) -+ return ret; -+ -+ ret = deint_v4l2m2m_dequeue(avctx, ctx->frames[0], 1); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) -+{ -+ DeintV4L2M2MContext *priv = 
avctx->priv; -+ DeintV4L2M2MContextShared *ctx; -+ -+ ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); -+ if (!ctx) -+ return AVERROR(ENOMEM); -+ -+ priv->shared = ctx; -+ ctx->fd = -1; -+ ctx->output.ctx = ctx; -+ ctx->output.num_buffers = 6; -+ ctx->capture.ctx = ctx; -+ ctx->capture.num_buffers = 6; -+ ctx->done = 0; -+ atomic_init(&ctx->refcount, 1); -+ -+ return 0; -+} -+ -+static void deint_v4l2m2m_uninit(AVFilterContext *avctx) -+{ -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ -+ ctx->done = 1; -+ deint_v4l2m2m_destroy_context(ctx); -+} -+ -+static const AVOption deinterlace_v4l2m2m_options[] = { -+ { NULL }, -+}; -+ -+AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); -+ -+static const AVFilterPad deint_v4l2m2m_inputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .filter_frame = deint_v4l2m2m_filter_frame, -+ }, -+ { NULL } -+}; -+ -+static const AVFilterPad deint_v4l2m2m_outputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .config_props = deint_v4l2m2m_config_props, -+ }, -+ { NULL } -+}; -+ -+AVFilter ff_vf_deinterlace_v4l2m2m = { -+ .name = "deinterlace_v4l2m2m", -+ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"), -+ .priv_size = sizeof(DeintV4L2M2MContext), -+ .init = &deint_v4l2m2m_init, -+ .uninit = &deint_v4l2m2m_uninit, -+ .query_formats = &deint_v4l2m2m_query_formats, -+ .inputs = deint_v4l2m2m_inputs, -+ .outputs = deint_v4l2m2m_outputs, -+ .priv_class = &deinterlace_v4l2m2m_class, -+}; --- -2.29.2 - diff --git a/projects/Rockchip/patches/ffmpeg/ffmpeg-0003-libavfilter-v4l2deinterlace-dequeue-both-destination.patch b/projects/Rockchip/patches/ffmpeg/ffmpeg-0003-libavfilter-v4l2deinterlace-dequeue-both-destination.patch deleted file mode 100644 index cefbd9c64d..0000000000 --- a/projects/Rockchip/patches/ffmpeg/ffmpeg-0003-libavfilter-v4l2deinterlace-dequeue-both-destination.patch +++ /dev/null @@ -1,230 +0,0 @@ -From 
6bea46839ba23bffaa093bb9ed805d571aaa66ea Mon Sep 17 00:00:00 2001 -From: Alex Bee -Date: Wed, 30 Sep 2020 21:11:34 +0200 -Subject: [PATCH] libavfilter: v4l2deinterlace: dequeue both destination - buffers on time - -Signed-off-by: Alex Bee ---- - libavfilter/vf_deinterlace_v4l2m2m.c | 140 +++++++++++++++++---------- - 1 file changed, 88 insertions(+), 52 deletions(-) - -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -index 1029e5b620fd..72d28333ffa7 100644 ---- a/libavfilter/vf_deinterlace_v4l2m2m.c -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -89,8 +89,14 @@ typedef struct DeintV4L2M2MContextShared { - - AVBufferRef *hw_frames_ctx; - -- int frame_count; -- AVFrame *frames[2]; -+ /* -+ * TODO: check if its really neccessary to hold this -+ * ref, it's only used for freeing av_frame on decoding -+ * end/abort -+ */ -+ AVFrame *cur_in_frame; -+ AVFrame *prev_in_frame; -+ unsigned int field_order; - - V4L2Queue output; - V4L2Queue capture; -@@ -557,8 +563,11 @@ static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) - close(capture->buffers[i].fd); - } - -- for (i = 0; i < ctx->frame_count; i++) -- av_frame_free(&ctx->frames[i]); -+ if (ctx->cur_in_frame) -+ av_frame_free(&ctx->cur_in_frame); -+ -+ if (ctx->prev_in_frame) -+ av_frame_free(&ctx->prev_in_frame); - - av_buffer_unref(&ctx->hw_frames_ctx); - -@@ -652,49 +661,79 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim - return 0; - } - --static int deint_v4l2m2m_dequeue(AVFilterContext *avctx, AVFrame *input_frame, int field) -+static int deint_v4l2m2m_dequeue(AVFilterContext *avctx, AVFrame *input_frame) - { - DeintV4L2M2MContext *priv = avctx->priv; - DeintV4L2M2MContextShared *ctx = priv->shared; - AVFilterLink *outlink = avctx->outputs[0]; -- AVFrame *output_frame; -+ AVFrame *output_frame_1, *output_frame_2; -+ int64_t first_pts = AV_NOPTS_VALUE; - int err; - -- output_frame = av_frame_alloc(); -+ av_log(priv, 
AV_LOG_DEBUG, "input pts: %"PRId64" (field %d)\n", -+ input_frame->pts, ctx->field_order); - -- if (!output_frame) -+ output_frame_1 = av_frame_alloc(); -+ if (!output_frame_1) - return AVERROR(ENOMEM); - -- err = deint_v4l2m2m_dequeue_frame(&ctx->capture, output_frame, 500); -+ err = deint_v4l2m2m_dequeue_frame(&ctx->capture, output_frame_1, 500); - if (err < 0) { -- av_log(priv, AV_LOG_ERROR, "no frame (field %d)\n", field); -- goto fail; -+ av_log(priv, AV_LOG_ERROR, "no 1st frame (field %d)\n", ctx->field_order); -+ goto fail_out1; - } - -- err = av_frame_copy_props(output_frame, input_frame); -+ err = av_frame_copy_props(output_frame_1, input_frame); - if (err < 0) -- goto fail; -+ goto fail_out1; - -- output_frame->interlaced_frame = 0; -+ output_frame_1->interlaced_frame = 0; - -- if (field == 0) { -- output_frame->pts *= 2; -- } else { -- int64_t cur_pts = ctx->frames[0]->pts; -- int64_t next_pts = ctx->frames[1]->pts; -+ output_frame_2 = av_frame_alloc(); -+ if (!output_frame_2) { -+ err = AVERROR(ENOMEM); -+ goto fail_out1; -+ } -+ -+ err = deint_v4l2m2m_dequeue_frame(&ctx->capture, output_frame_2, 500); -+ if (err < 0) { -+ av_log(priv, AV_LOG_ERROR, "no 2nd frame (field %d)\n", ctx->field_order); -+ goto fail_out2; -+ } -+ -+ err = av_frame_copy_props(output_frame_2, input_frame); -+ if (err < 0) -+ goto fail_out2; -+ -+ output_frame_2->interlaced_frame = 0; - -- if (next_pts != AV_NOPTS_VALUE && cur_pts != AV_NOPTS_VALUE) { -- output_frame->pts = next_pts + cur_pts; -- } else { -- output_frame->pts = AV_NOPTS_VALUE; -- } -+ if (ctx->prev_in_frame && ctx->prev_in_frame->pts != AV_NOPTS_VALUE -+ && input_frame->pts != AV_NOPTS_VALUE) { -+ first_pts = (ctx->prev_in_frame->pts + input_frame->pts) / 2; -+ av_log(priv, AV_LOG_DEBUG, "calculated first pts %"PRId64"\n", first_pts); - } -- av_log(priv, AV_LOG_DEBUG, "pts: %"PRId64" (field %d)\n", output_frame->pts, field); - -- return ff_filter_frame(outlink, output_frame); -+ output_frame_1->pts = first_pts; 
-+ -+ err = ff_filter_frame(outlink, output_frame_1); -+ if (err < 0) { -+ av_frame_free(&output_frame_2); -+ return err; -+ } -+ err = ff_filter_frame(outlink, output_frame_2); -+ -+ if (err < 0) -+ return err; -+ -+ av_log(priv, AV_LOG_DEBUG, "1st frame pts: %"PRId64" 2nd frame pts: %"PRId64" first pts: %"PRId64" (field %d)\n", -+ output_frame_1->pts, output_frame_2->pts, first_pts, ctx->field_order); -+ -+ return 0; - --fail: -- av_frame_free(&output_frame); -+fail_out2: -+ av_frame_free(&output_frame_2); -+fail_out1: -+ av_frame_free(&output_frame_1); - return err; - } - -@@ -749,20 +788,22 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) - V4L2Queue *output = &ctx->output; - int ret; - -- av_log(priv, AV_LOG_DEBUG, "input pts: %"PRId64"\n", in->pts); -- if (!ctx->frame_count) { -+ av_log(priv, AV_LOG_DEBUG, "input pts: %"PRId64" field :%d interlaced: %d\n", -+ in->pts, in->top_field_first, in->interlaced_frame); -+ -+ ctx->cur_in_frame = in; -+ -+ if (ctx->field_order == V4L2_FIELD_ANY) { - AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0]; -- unsigned int field; -- - ctx->orig_width = drm_desc->layers[0].planes[0].pitch; - ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; - -- if (in->top_field_first) -- field = V4L2_FIELD_INTERLACED_TB; -+ if (in->top_field_first) -+ ctx->field_order = V4L2_FIELD_INTERLACED_TB; - else -- field = V4L2_FIELD_INTERLACED_BT; -+ ctx->field_order = V4L2_FIELD_INTERLACED_BT; - -- ret = deint_v4l2m2m_set_format(output, field, ctx->orig_width, ctx->orig_height); -+ ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->orig_width, ctx->orig_height); - if (ret) - return ret; - -@@ -787,27 +828,19 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) - return ret; - } - -- if (ctx->frame_count < 2) { -- ctx->frames[ctx->frame_count++] = in; -- } else { -- av_frame_free(&ctx->frames[0]); -- ctx->frames[0] = ctx->frames[1]; -- ctx->frames[1] = 
in; -- } -- - ret = deint_v4l2m2m_enqueue(output, in); - if (ret) - return ret; - -- if (ctx->frame_count == 2) { -- ret = deint_v4l2m2m_dequeue(avctx, ctx->frames[0], 0); -- if (ret) -- return ret; -+ ret = deint_v4l2m2m_dequeue(avctx, in); -+ if (ret) -+ return ret; - -- ret = deint_v4l2m2m_dequeue(avctx, ctx->frames[0], 1); -- if (ret) -- return ret; -- } -+ if (ctx->prev_in_frame) -+ av_frame_free(&ctx->prev_in_frame); -+ -+ ctx->prev_in_frame = in; -+ ctx->cur_in_frame = NULL; - - return 0; - } -@@ -828,6 +861,9 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) - ctx->capture.ctx = ctx; - ctx->capture.num_buffers = 6; - ctx->done = 0; -+ ctx->field_order = V4L2_FIELD_ANY; -+ ctx->cur_in_frame = NULL; -+ ctx->prev_in_frame = NULL; - atomic_init(&ctx->refcount, 1); - - return 0; --- -2.29.2 - diff --git a/projects/Rockchip/patches/ffmpeg/ffmpeg-0006-libavfilter-v4l2deinterlace-support-more-formats-aut.patch b/projects/Rockchip/patches/ffmpeg/ffmpeg-0006-libavfilter-v4l2deinterlace-support-more-formats-aut.patch deleted file mode 100644 index a46cc8f83d..0000000000 --- a/projects/Rockchip/patches/ffmpeg/ffmpeg-0006-libavfilter-v4l2deinterlace-support-more-formats-aut.patch +++ /dev/null @@ -1,288 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Alex Bee -Date: Wed, 15 Sep 2021 00:37:15 +0200 -Subject: [PATCH] libavfilter: v4l2deinterlace: support more formats / - automatic output format selection - -Signed-off-by: Alex Bee ---- - libavfilter/vf_deinterlace_v4l2m2m.c | 120 +++++++++++++++++++++++---- - 1 file changed, 102 insertions(+), 18 deletions(-) - -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -index d7935d92f9..8161fd9e75 100644 ---- a/libavfilter/vf_deinterlace_v4l2m2m.c -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -85,6 +85,9 @@ typedef struct DeintV4L2M2MContextShared { - int height; - int orig_width; - int orig_height; -+ uint64_t drm_in_format; -+ 
uint64_t drm_out_format; -+ - atomic_uint refcount; - - AVBufferRef *hw_frames_ctx; -@@ -108,6 +111,65 @@ typedef struct DeintV4L2M2MContext { - DeintV4L2M2MContextShared *shared; - } DeintV4L2M2MContext; - -+typedef struct drm_v4l2_pix_fmt_mapping { -+ uint64_t drm_format; -+ uint32_t v4l2_pix_fmt; -+}; -+ -+static struct drm_v4l2_pix_fmt_mapping drm_v4l2_pix_fmt_map[] = { -+ { .drm_format = DRM_FORMAT_NV12, .v4l2_pix_fmt = V4L2_PIX_FMT_NV12 }, -+ { .drm_format = DRM_FORMAT_NV21, .v4l2_pix_fmt = V4L2_PIX_FMT_NV21 }, -+ { .drm_format = DRM_FORMAT_NV16, .v4l2_pix_fmt = V4L2_PIX_FMT_NV16 }, -+ { .drm_format = DRM_FORMAT_NV16, .v4l2_pix_fmt = V4L2_PIX_FMT_NV16 }, -+#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED -+ { .drm_format = DRM_FORMAT_MOD_ALLWINNER_TILED, .v4l2_pix_fmt = V4L2_PIX_FMT_SUNXI_TILED_NV12 }, -+#endif -+#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15) -+ { .drm_format = DRM_FORMAT_NV15, .v4l2_pix_fmt = V4L2_PIX_FMT_NV15 }, -+#endif -+#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20) -+ { .drm_format = DRM_FORMAT_NV20, .v4l2_pix_fmt = V4L2_PIX_FMT_NV20 }, -+#endif -+}; -+ -+static inline uint32_t v4l2_pix_fmt_from_drm_format(uint64_t drm_format) -+{ -+ unsigned int i; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(drm_v4l2_pix_fmt_map); i++) { -+ if (drm_v4l2_pix_fmt_map[i].drm_format == drm_format) -+ return drm_v4l2_pix_fmt_map[i].v4l2_pix_fmt; -+ } -+ -+ av_log(NULL, AV_LOG_WARNING, "%s unknown drm format 0x%llx using default v4l2_pix_fmt 0x%x\n", -+ __func__ , drm_format, drm_v4l2_pix_fmt_map[0].v4l2_pix_fmt); -+ return drm_v4l2_pix_fmt_map[0].v4l2_pix_fmt; -+} -+ -+static inline uint64_t drm_format_from_v4l2_pix_fmt(uint32_t v4l2_pix_fmt) -+{ -+ unsigned int i; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(drm_v4l2_pix_fmt_map); i++) { -+ if (drm_v4l2_pix_fmt_map[i].v4l2_pix_fmt == v4l2_pix_fmt) -+ return drm_v4l2_pix_fmt_map[i].drm_format; -+ } -+ -+ av_log(NULL, AV_LOG_WARNING, "%s unknown v4l2_pix_fmt format 0x%x using default drm_format 0x%llx\n", -+ 
__func__ , v4l2_pix_fmt, drm_v4l2_pix_fmt_map[0].drm_format); -+ return drm_v4l2_pix_fmt_map[0].drm_format; -+} -+ -+static inline uint64_t drm_format_modifier(uint64_t drm_format) -+{ -+#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED -+ if (drm_format == DRM_FORMAT_MOD_ALLWINNER_TILED) -+ return DRM_FORMAT_MOD_ALLWINNER_TILED; -+#endif -+ return DRM_FORMAT_MOD_LINEAR; -+ -+} -+ - static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) - { - struct v4l2_capability cap; -@@ -138,11 +200,12 @@ static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) - return AVERROR(EINVAL); - } - --static int deint_v4l2m2m_try_format(V4L2Queue *queue) -+static int deint_v4l2m2m_try_format(V4L2Queue *queue, uint64_t drm_format) - { - struct v4l2_format *fmt = &queue->format; - DeintV4L2M2MContextShared *ctx = queue->ctx; - int ret, field; -+ uint32_t v4l2_pix_fmt = v4l2_pix_fmt_from_drm_format(drm_format); - - ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt); - if (ret) -@@ -154,12 +217,12 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue) - field = V4L2_FIELD_NONE; - - if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -- fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_NV12; -+ fmt->fmt.pix_mp.pixelformat = v4l2_pix_fmt; - fmt->fmt.pix_mp.field = field; - fmt->fmt.pix_mp.width = ctx->width; - fmt->fmt.pix_mp.height = ctx->height; - } else { -- fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_NV12; -+ fmt->fmt.pix.pixelformat = v4l2_pix_fmt; - fmt->fmt.pix.field = field; - fmt->fmt.pix.width = ctx->width; - fmt->fmt.pix.height = ctx->height; -@@ -170,14 +233,14 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue) - return AVERROR(EINVAL); - - if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -- if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12 || -+ if (fmt->fmt.pix_mp.pixelformat != v4l2_pix_fmt || - fmt->fmt.pix_mp.field != field) { - av_log(NULL, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); - - return AVERROR(EINVAL); - } - } else { -- if (fmt->fmt.pix.pixelformat 
!= V4L2_PIX_FMT_NV12 || -+ if (fmt->fmt.pix.pixelformat != v4l2_pix_fmt || - fmt->fmt.pix.field != field) { - av_log(NULL, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); - -@@ -187,19 +250,21 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue) - - return 0; - } -- --static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height) -+static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, uint64_t drm_format) - { - struct v4l2_format *fmt = &queue->format; - DeintV4L2M2MContextShared *ctx = queue->ctx; - int ret; -+ uint32_t v4l2_pix_fmt = v4l2_pix_fmt_from_drm_format(drm_format); - - if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.pixelformat = v4l2_pix_fmt; - fmt->fmt.pix_mp.field = field; - fmt->fmt.pix_mp.width = width; - fmt->fmt.pix_mp.height = height; - /* TODO: bytesperline and imagesize */ - } else { -+ fmt->fmt.pix.pixelformat = v4l2_pix_fmt; - fmt->fmt.pix.field = field; - fmt->fmt.pix.width = width; - fmt->fmt.pix.height = height; -@@ -211,6 +276,18 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, - if (ret) - av_log(NULL, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); - -+ else if (!V4L2_TYPE_IS_OUTPUT(queue->format.type)) { -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type) && fmt->fmt.pix_mp.pixelformat != v4l2_pix_fmt) { -+ ctx->drm_out_format = drm_format_from_v4l2_pix_fmt(fmt->fmt.pix_mp.pixelformat); -+ av_log(NULL, AV_LOG_DEBUG, "%s driver updated v4l2_pixfmt from: %x to %x, so now using %llx as drm output format\n", -+ __func__, v4l2_pix_fmt, fmt->fmt.pix_mp.pixelformat, ctx->drm_out_format); -+ } else if (fmt->fmt.pix.pixelformat != v4l2_pix_fmt) { -+ ctx->drm_out_format = drm_format_from_v4l2_pix_fmt(fmt->fmt.pix.pixelformat); -+ av_log(NULL, AV_LOG_DEBUG, "%s driver updated v4l2_pixfmt from: %x to %x, so now using %llx as drm output format\n", -+ __func__, v4l2_pix_fmt, fmt->fmt.pix.pixelformat, 
ctx->drm_out_format); -+ } -+ } -+ - return ret; - } - -@@ -226,11 +303,11 @@ static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node - if (ret) - goto fail; - -- ret = deint_v4l2m2m_try_format(&ctx->capture); -+ ret = deint_v4l2m2m_try_format(&ctx->capture, ctx->drm_out_format); - if (ret) - goto fail; - -- ret = deint_v4l2m2m_try_format(&ctx->output); -+ ret = deint_v4l2m2m_try_format(&ctx->output, ctx->drm_in_format); - if (ret) - goto fail; - -@@ -293,7 +370,7 @@ static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) - return 0; - } - --static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) -+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, uint64_t drm_format) - { - struct v4l2_exportbuffer expbuf; - int i, ret; -@@ -315,12 +392,12 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) - /* drm frame */ - avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; - avbuf->drm_frame.objects[i].fd = expbuf.fd; -- avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ avbuf->drm_frame.objects[i].format_modifier = drm_format_modifier(drm_format); - } else { - /* drm frame */ - avbuf->drm_frame.objects[0].size = avbuf->buffer.length; - avbuf->drm_frame.objects[0].fd = expbuf.fd; -- avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ avbuf->drm_frame.objects[0].format_modifier = drm_format_modifier(drm_format); - } - } - -@@ -405,7 +482,7 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) - if (ret) - goto fail; - -- ret = v4l2_buffer_export_drm(buf); -+ ret = v4l2_buffer_export_drm(buf, ctx->drm_out_format); - if (ret) - goto fail; - } -@@ -597,7 +674,7 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused) - deint_v4l2m2m_destroy_context(ctx); - } - --static uint8_t *v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) -+static uint8_t *v4l2_get_drm_frame(V4L2Buffer *avbuf, int height, uint64_t drm_format) - { - AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; - 
AVDRMLayerDescriptor *layer; -@@ -615,7 +692,7 @@ static uint8_t *v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) - layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; - } - -- layer->format = DRM_FORMAT_NV12; -+ layer->format = drm_format; - - if (avbuf->num_planes == 1) { - layer->nb_planes = 2; -@@ -647,7 +724,7 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim - - atomic_fetch_add(&ctx->refcount, 1); - -- frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height); -+ frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height, ctx->drm_out_format); - frame->format = AV_PIX_FMT_DRM_PRIME; - frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); - frame->height = ctx->height; -@@ -797,17 +874,22 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) - AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0]; - ctx->orig_width = drm_desc->layers[0].planes[0].pitch; - ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; -+ ctx->drm_in_format = drm_desc->layers->format; -+ ctx->drm_out_format = drm_desc->layers->format; -+ - - if (in->top_field_first) - ctx->field_order = V4L2_FIELD_INTERLACED_TB; - else - ctx->field_order = V4L2_FIELD_INTERLACED_BT; - -- ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->orig_width, ctx->orig_height); -+ ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->orig_width, ctx->orig_height, -+ ctx->drm_in_format); - if (ret) - return ret; - -- ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->orig_width, ctx->orig_height); -+ ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->orig_width, ctx->orig_height, -+ ctx->drm_out_format); - if (ret) - return ret; - -@@ -864,6 +946,8 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) - ctx->field_order = V4L2_FIELD_ANY; - ctx->cur_in_frame = NULL; - ctx->prev_in_frame = NULL; -+ ctx->drm_in_format = 
drm_v4l2_pix_fmt_map[0].drm_format; -+ ctx->drm_out_format = drm_v4l2_pix_fmt_map[0].drm_format; - atomic_init(&ctx->refcount, 1); - - return 0; diff --git a/tools/ffmpeg/gen-patches.sh b/tools/ffmpeg/gen-patches.sh index 9b0c618a66..5ce4e33615 100755 --- a/tools/ffmpeg/gen-patches.sh +++ b/tools/ffmpeg/gen-patches.sh @@ -7,7 +7,7 @@ FFMPEG_VERSION="n5.1.2" KODI_FFMPEG_REPO="https://github.com/xbmc/FFmpeg" KODI_FFMPEG_VERSION="5.1.2-Nexus-Alpha3" -ALL_FEATURE_SETS="v4l2-drmprime v4l2-request libreelec rpi kodi" +ALL_FEATURE_SETS="v4l2-drmprime v4l2-request libreelec rpi kodi vf-deinterlace-v4l2m2m" if [ $# -eq 0 ]; then echo "usage: $0 all|featureset [githash]" @@ -28,7 +28,7 @@ create_patch() { PATCH_CREATE_DIFF="no" case "${FEATURE_SET}" in - v4l2-drmprime|v4l2-request) + v4l2-drmprime|v4l2-request|vf-deinterlace-v4l2m2m) REPO="https://github.com/jernejsk/FFmpeg" REFSPEC="${FEATURE_SET}-${FFMPEG_VERSION}" ;; From ebd2e232873cd66e54298e44ad1d624528497394 Mon Sep 17 00:00:00 2001 From: Alex Bee Date: Sat, 5 Nov 2022 18:27:23 +0100 Subject: [PATCH 09/12] ffmpeg: create vf-deinterlace-v4l2m2m patch Patch created using revisions eacfcba..d14859b from branch vf-deinterlace-v4l2m2m-n5.1.2 of https://github.com/jernejsk/FFmpeg --- .../ffmpeg-001-vf-deinterlace-v4l2m2m.patch | 1052 +++++++++++++++++ 1 file changed, 1052 insertions(+) create mode 100644 packages/multimedia/ffmpeg/patches/vf-deinterlace-v4l2m2m/ffmpeg-001-vf-deinterlace-v4l2m2m.patch diff --git a/packages/multimedia/ffmpeg/patches/vf-deinterlace-v4l2m2m/ffmpeg-001-vf-deinterlace-v4l2m2m.patch b/packages/multimedia/ffmpeg/patches/vf-deinterlace-v4l2m2m/ffmpeg-001-vf-deinterlace-v4l2m2m.patch new file mode 100644 index 0000000000..f7cf92b57b --- /dev/null +++ b/packages/multimedia/ffmpeg/patches/vf-deinterlace-v4l2m2m/ffmpeg-001-vf-deinterlace-v4l2m2m.patch @@ -0,0 +1,1052 @@ +From d14859b090dc3b2e1bd761698b947b0b55e4d831 Mon Sep 17 00:00:00 2001 +From: Jernej Skrabec +Date: Tue, 3 Dec 2019 21:01:18 +0100 
+Subject: [PATCH] Add V4L2 m2m deinterlace filter + +Signed-off-by: Alex Bee +--- + libavfilter/Makefile | 1 + + libavfilter/allfilters.c | 1 + + libavfilter/vf_deinterlace_v4l2m2m.c | 1009 ++++++++++++++++++++++++++ + 3 files changed, 1011 insertions(+) + create mode 100644 libavfilter/vf_deinterlace_v4l2m2m.c + +diff --git a/libavfilter/Makefile b/libavfilter/Makefile +index 30cc329fb6..2fe6ab223e 100644 +--- a/libavfilter/Makefile ++++ b/libavfilter/Makefile +@@ -254,6 +254,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) += vf_neighbor.o + OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o + OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_deinterlace_qsv.o + OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o ++OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o + OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o + OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o + OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o +diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c +index 5ebacfde27..4b74bac3c8 100644 +--- a/libavfilter/allfilters.c ++++ b/libavfilter/allfilters.c +@@ -234,6 +234,7 @@ extern const AVFilter ff_vf_dedot; + extern const AVFilter ff_vf_deflate; + extern const AVFilter ff_vf_deflicker; + extern const AVFilter ff_vf_deinterlace_qsv; ++extern const AVFilter ff_vf_deinterlace_v4l2m2m; + extern const AVFilter ff_vf_deinterlace_vaapi; + extern const AVFilter ff_vf_dejudder; + extern const AVFilter ff_vf_delogo; +diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c +new file mode 100644 +index 0000000000..ff5ed500a9 +--- /dev/null ++++ b/libavfilter/vf_deinterlace_v4l2m2m.c +@@ -0,0 +1,1009 @@ ++/* ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * deinterlace video filter - V4L2 M2M ++ */ ++ ++#include <drm_fourcc.h> ++ ++#include <linux/videodev2.h> ++ ++#include <dirent.h> ++#include <fcntl.h> ++#include <poll.h> ++#include <stdatomic.h> ++#include <stdio.h> ++#include <string.h> ++#include <sys/ioctl.h> ++#include <sys/mman.h> ++#include <unistd.h> ++ ++#include "libavutil/avassert.h" ++#include "libavutil/avstring.h" ++#include "libavutil/common.h" ++#include "libavutil/hwcontext.h" ++#include "libavutil/hwcontext_drm.h" ++#include "libavutil/internal.h" ++#include "libavutil/mathematics.h" ++#include "libavutil/opt.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/time.h" ++ ++#include "avfilter.h" ++#include "formats.h" ++#include "internal.h" ++#include "video.h" ++ ++typedef struct V4L2Queue V4L2Queue; ++typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; ++ ++typedef struct V4L2PlaneInfo { ++ int bytesperline; ++ size_t length; ++} V4L2PlaneInfo; ++ ++typedef struct V4L2Buffer { ++ int enqueued; ++ int reenqueue; ++ int fd; ++ struct v4l2_buffer buffer; ++ struct v4l2_plane planes[VIDEO_MAX_PLANES]; ++ int num_planes; ++ V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES]; ++ AVDRMFrameDescriptor drm_frame; ++ V4L2Queue *q; ++} V4L2Buffer; ++ ++typedef struct V4L2Queue { ++ struct v4l2_format format; ++ int num_buffers; ++ V4L2Buffer *buffers; ++
DeintV4L2M2MContextShared *ctx; ++} V4L2Queue; ++ ++typedef struct DeintV4L2M2MContextShared { ++ int fd; ++ int done; ++ int width; ++ int height; ++ int orig_width; ++ int orig_height; ++ uint64_t drm_in_format; ++ uint64_t drm_out_format; ++ ++ atomic_uint refcount; ++ ++ AVBufferRef *hw_frames_ctx; ++ ++ /* ++ * TODO: check if it's really necessary to hold this ++ * ref, it's only used for freeing av_frame on decoding ++ * end/abort ++ */ ++ AVFrame *cur_in_frame; ++ AVFrame *prev_in_frame; ++ unsigned int field_order; ++ ++ V4L2Queue output; ++ V4L2Queue capture; ++} DeintV4L2M2MContextShared; ++ ++typedef struct DeintV4L2M2MContext { ++ const AVClass *class; ++ ++ DeintV4L2M2MContextShared *shared; ++} DeintV4L2M2MContext; ++ ++static inline uint32_t v4l2_pix_fmt_from_drm_format(uint64_t drm_format) ++{ ++ switch(drm_format) { ++#if defined(V4L2_PIX_FMT_SUNXI_TILED_NV12) && defined(DRM_FORMAT_MOD_ALLWINNER_TILED) ++ case DRM_FORMAT_MOD_ALLWINNER_TILED: ++ return V4L2_PIX_FMT_SUNXI_TILED_NV12; ++#endif ++ case DRM_FORMAT_NV12: ++ return V4L2_PIX_FMT_NV12; ++#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15) ++ case DRM_FORMAT_NV15: ++ return V4L2_PIX_FMT_NV15; ++#endif ++ case DRM_FORMAT_NV16: ++ return V4L2_PIX_FMT_NV16; ++#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20) ++ case DRM_FORMAT_NV20: ++ return V4L2_PIX_FMT_NV20; ++#endif ++ case DRM_FORMAT_NV21: ++ return V4L2_PIX_FMT_NV21; ++ case DRM_FORMAT_NV61: ++ return V4L2_PIX_FMT_NV61; ++ default: /* unmapped format: warn and fall back to NV12 */ ++ av_log(NULL, AV_LOG_WARNING, "%s unknown drm format 0x%"PRIx64" using default v4l2_pix_fmt 0x%x\n", ++ __func__ , drm_format, V4L2_PIX_FMT_NV12); ++ return V4L2_PIX_FMT_NV12; ++ } ++} ++ ++static inline uint64_t drm_format_from_v4l2_pix_fmt(uint32_t v4l2_pix_fmt) ++{ ++ switch(v4l2_pix_fmt) { ++ case V4L2_PIX_FMT_NV12: ++ return DRM_FORMAT_NV12; ++#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15) ++ case V4L2_PIX_FMT_NV15: ++ return DRM_FORMAT_NV15; ++#endif ++ case V4L2_PIX_FMT_NV16: ++
return DRM_FORMAT_NV16; ++#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20) ++ case V4L2_PIX_FMT_NV20: ++ return DRM_FORMAT_NV20; ++#endif ++ case V4L2_PIX_FMT_NV21: ++ return DRM_FORMAT_NV21; ++ case V4L2_PIX_FMT_NV61: ++ return DRM_FORMAT_NV61; ++#if defined(V4L2_PIX_FMT_SUNXI_TILED_NV12) && defined(DRM_FORMAT_MOD_ALLWINNER_TILED) ++ case V4L2_PIX_FMT_SUNXI_TILED_NV12: ++ return DRM_FORMAT_MOD_ALLWINNER_TILED; ++#endif ++ default: ++ av_log(NULL, AV_LOG_WARNING, "%s unknown v4l2_pix_fmt format 0x%x using default drm_format 0x%x\n", ++ __func__ , v4l2_pix_fmt, DRM_FORMAT_NV12); ++ return DRM_FORMAT_NV12; ++ } ++} ++ ++static inline uint64_t drm_format_modifier(uint64_t drm_format) ++{ ++#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED ++ if (drm_format == DRM_FORMAT_MOD_ALLWINNER_TILED) ++ return DRM_FORMAT_MOD_ALLWINNER_TILED; ++#endif ++ return DRM_FORMAT_MOD_LINEAR; ++ ++} ++ ++static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) ++{ ++ struct v4l2_capability cap; ++ int ret; ++ ++ memset(&cap, 0, sizeof(cap)); ++ ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap); ++ if (ret < 0) ++ return AVERROR(errno); /* ioctl() only yields -1; report the real cause like the other wrappers */ ++ ++ if (!(cap.capabilities & V4L2_CAP_STREAMING)) ++ return AVERROR(EINVAL); ++ ++ if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { ++ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ ++ return 0; ++ } ++ ++ if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { ++ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; ++ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ ++ return 0; ++ } ++ ++ return AVERROR(EINVAL); ++} ++ ++static int deint_v4l2m2m_try_format(V4L2Queue *queue, uint64_t drm_format) ++{ ++ struct v4l2_format *fmt = &queue->format; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ int ret, field; ++ uint32_t v4l2_pix_fmt = v4l2_pix_fmt_from_drm_format(drm_format); ++ ++ ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt); ++ if (ret) ++ av_log(NULL, AV_LOG_ERROR,
"VIDIOC_G_FMT failed: %d\n", ret); ++ ++ if (V4L2_TYPE_IS_OUTPUT(fmt->type)) ++ field = V4L2_FIELD_INTERLACED_TB; ++ else ++ field = V4L2_FIELD_NONE; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.pixelformat = v4l2_pix_fmt; ++ fmt->fmt.pix_mp.field = field; ++ fmt->fmt.pix_mp.width = ctx->width; ++ fmt->fmt.pix_mp.height = ctx->height; ++ } else { ++ fmt->fmt.pix.pixelformat = v4l2_pix_fmt; ++ fmt->fmt.pix.field = field; ++ fmt->fmt.pix.width = ctx->width; ++ fmt->fmt.pix.height = ctx->height; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt); ++ if (ret) ++ return AVERROR(EINVAL); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ if (fmt->fmt.pix_mp.pixelformat != v4l2_pix_fmt || ++ fmt->fmt.pix_mp.field != field) { ++ av_log(NULL, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); ++ ++ return AVERROR(EINVAL); ++ } ++ } else { ++ if (fmt->fmt.pix.pixelformat != v4l2_pix_fmt || ++ fmt->fmt.pix.field != field) { ++ av_log(NULL, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); ++ ++ return AVERROR(EINVAL); ++ } ++ } ++ ++ return 0; ++} ++static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, uint64_t drm_format) ++{ ++ struct v4l2_format *fmt = &queue->format; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ int ret; ++ uint32_t v4l2_pix_fmt = v4l2_pix_fmt_from_drm_format(drm_format); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.pixelformat = v4l2_pix_fmt; ++ fmt->fmt.pix_mp.field = field; ++ fmt->fmt.pix_mp.width = width; ++ fmt->fmt.pix_mp.height = height; ++ /* TODO: bytesperline and imagesize */ ++ } else { ++ fmt->fmt.pix.pixelformat = v4l2_pix_fmt; ++ fmt->fmt.pix.field = field; ++ fmt->fmt.pix.width = width; ++ fmt->fmt.pix.height = height; ++ fmt->fmt.pix.sizeimage = 0; ++ fmt->fmt.pix.bytesperline = 0; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); ++ if (ret) ++ av_log(NULL, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); ++ ++ else 
if (!V4L2_TYPE_IS_OUTPUT(queue->format.type)) { ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type) && fmt->fmt.pix_mp.pixelformat != v4l2_pix_fmt) { ++ ctx->drm_out_format = drm_format_from_v4l2_pix_fmt(fmt->fmt.pix_mp.pixelformat); /* drm_out_format is uint64_t, hence PRIx64 below */ ++ av_log(NULL, AV_LOG_DEBUG, "%s driver updated v4l2_pixfmt from: %x to %x, so now using %"PRIx64" as drm output format\n", ++ __func__, v4l2_pix_fmt, fmt->fmt.pix_mp.pixelformat, ctx->drm_out_format); ++ } else if (fmt->fmt.pix.pixelformat != v4l2_pix_fmt) { ++ ctx->drm_out_format = drm_format_from_v4l2_pix_fmt(fmt->fmt.pix.pixelformat); ++ av_log(NULL, AV_LOG_DEBUG, "%s driver updated v4l2_pixfmt from: %x to %x, so now using %"PRIx64" as drm output format\n", ++ __func__, v4l2_pix_fmt, fmt->fmt.pix.pixelformat, ctx->drm_out_format); ++ } ++ } ++ ++ return ret; ++} ++ ++static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) ++{ ++ int ret; ++ ++ ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0); ++ if (ctx->fd < 0) ++ return AVERROR(errno); ++ ++ ret = deint_v4l2m2m_prepare_context(ctx); ++ if (ret) ++ goto fail; ++ ++ ret = deint_v4l2m2m_try_format(&ctx->capture, ctx->drm_out_format); ++ if (ret) ++ goto fail; ++ ++ ret = deint_v4l2m2m_try_format(&ctx->output, ctx->drm_in_format); ++ if (ret) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ close(ctx->fd); ++ ctx->fd = -1; ++ ++ return ret; ++} ++ ++static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx) ++{ ++ int ret = AVERROR(EINVAL); ++ struct dirent *entry; ++ char node[PATH_MAX]; ++ DIR *dirp; ++ ++ dirp = opendir("/dev"); ++ if (!dirp) ++ return AVERROR(errno); ++ ++ for (entry = readdir(dirp); entry; entry = readdir(dirp)) { ++ ++ if (strncmp(entry->d_name, "video", 5)) ++ continue; ++ ++ snprintf(node, sizeof(node), "/dev/%s", entry->d_name); ++ av_log(NULL, AV_LOG_DEBUG, "probing device %s\n", node); ++ ret = deint_v4l2m2m_probe_device(ctx, node); ++ if (!ret) ++ break; ++ } ++ ++ closedir(dirp); ++ ++ if (ret) { ++ av_log(NULL, AV_LOG_ERROR, "Could not find a
valid device\n"); ++ ctx->fd = -1; ++ ++ return ret; ++ } ++ ++ av_log(NULL, AV_LOG_INFO, "Using device %s\n", node); ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) ++{ ++ int ret; ++ ++ ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ buf->enqueued = 1; ++ ++ return 0; ++} ++ ++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, uint64_t drm_format) ++{ ++ struct v4l2_exportbuffer expbuf; ++ int i, ret; ++ ++ for (i = 0; i < avbuf->num_planes; i++) { ++ memset(&expbuf, 0, sizeof(expbuf)); ++ ++ expbuf.index = avbuf->buffer.index; ++ expbuf.type = avbuf->buffer.type; ++ expbuf.plane = i; ++ ++ ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ avbuf->fd = expbuf.fd; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) { ++ /* drm frame */ ++ avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; ++ avbuf->drm_frame.objects[i].fd = expbuf.fd; ++ avbuf->drm_frame.objects[i].format_modifier = drm_format_modifier(drm_format); ++ } else { ++ /* drm frame */ ++ avbuf->drm_frame.objects[0].size = avbuf->buffer.length; ++ avbuf->drm_frame.objects[0].fd = expbuf.fd; ++ avbuf->drm_frame.objects[0].format_modifier = drm_format_modifier(drm_format); ++ } ++ } ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) ++{ ++ struct v4l2_format *fmt = &queue->format; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ struct v4l2_requestbuffers req; ++ int ret, i, j, multiplanar; ++ uint32_t memory; ++ ++ memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
++ V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; ++ ++ multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type); ++ ++ memset(&req, 0, sizeof(req)); ++ req.count = queue->num_buffers; ++ req.memory = memory; ++ req.type = fmt->type; ++ ++ ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req); ++ if (ret < 0) { ++ av_log(NULL, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno)); ++ ++ return AVERROR(errno); ++ } ++ ++ queue->num_buffers = req.count; ++ queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer)); ++ if (!queue->buffers) { ++ av_log(NULL, AV_LOG_ERROR, "malloc enomem\n"); ++ ++ return AVERROR(ENOMEM); ++ } ++ ++ for (i = 0; i < queue->num_buffers; i++) { ++ V4L2Buffer *buf = &queue->buffers[i]; ++ ++ buf->enqueued = 0; ++ buf->fd = -1; ++ buf->q = queue; ++ ++ buf->buffer.type = fmt->type; ++ buf->buffer.memory = memory; ++ buf->buffer.index = i; ++ ++ if (multiplanar) { ++ buf->buffer.length = VIDEO_MAX_PLANES; ++ buf->buffer.m.planes = buf->planes; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); ++ if (ret < 0) { ++ ret = AVERROR(errno); ++ ++ goto fail; ++ } ++ ++ if (multiplanar) ++ buf->num_planes = buf->buffer.length; ++ else ++ buf->num_planes = 1; ++ ++ for (j = 0; j < buf->num_planes; j++) { ++ V4L2PlaneInfo *info = &buf->plane_info[j]; ++ ++ if (multiplanar) { ++ info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline; ++ info->length = buf->buffer.m.planes[j].length; ++ } else { ++ info->bytesperline = fmt->fmt.pix.bytesperline; ++ info->length = buf->buffer.length; ++ } ++ } ++ ++ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { ++ ret = deint_v4l2m2m_enqueue_buffer(buf); ++ if (ret) ++ goto fail; ++ ++ ret = v4l2_buffer_export_drm(buf, ctx->drm_out_format); ++ if (ret) ++ goto fail; ++ } ++ } ++ ++ return 0; ++ ++fail: ++ for (i = 0; i < queue->num_buffers; i++) ++ if (queue->buffers[i].fd >= 0) ++ close(queue->buffers[i].fd); ++ av_free(queue->buffers); ++ queue->buffers = NULL; ++ ++ return ret; ++} ++ ++static int 
deint_v4l2m2m_streamon(V4L2Queue *queue) ++{ ++ int type = queue->format.type; ++ int ret; ++ ++ ret = ioctl(queue->ctx->fd, VIDIOC_STREAMON, &type); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_streamoff(V4L2Queue *queue) ++{ ++ int type = queue->format.type; ++ int ret; ++ ++ ret = ioctl(queue->ctx->fd, VIDIOC_STREAMOFF, &type); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ return 0; ++} ++ ++static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout) ++{ ++ struct v4l2_plane planes[VIDEO_MAX_PLANES]; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ struct v4l2_buffer buf = { 0 }; ++ V4L2Buffer* avbuf = NULL; ++ struct pollfd pfd; ++ short events; ++ int ret; ++ ++ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) ++ events = POLLOUT | POLLWRNORM; ++ else ++ events = POLLIN | POLLRDNORM; ++ ++ pfd.events = events; ++ pfd.fd = ctx->fd; ++ ++ for (;;) { ++ ret = poll(&pfd, 1, timeout); ++ if (ret > 0) ++ break; ++ if (ret < 0 && errno == EINTR) /* errno is only meaningful when poll() failed; a timeout (0) must not retry on a stale EINTR */ ++ continue; ++ return NULL; ++ } ++ ++ if (pfd.revents & POLLERR) ++ return NULL; ++ ++ if (pfd.revents & events) { ++ memset(&buf, 0, sizeof(buf)); ++ buf.memory = V4L2_MEMORY_MMAP; ++ buf.type = queue->format.type; ++ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { ++ memset(planes, 0, sizeof(planes)); ++ buf.length = VIDEO_MAX_PLANES; ++ buf.m.planes = planes; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf); ++ if (ret) { ++ if (errno != EAGAIN) ++ av_log(NULL, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n", ++ av_err2str(AVERROR(errno))); ++ return NULL; ++ } ++ ++ avbuf = &queue->buffers[buf.index]; ++ avbuf->enqueued = 0; ++ avbuf->buffer = buf; ++ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { ++ memcpy(avbuf->planes, planes, sizeof(planes)); ++ avbuf->buffer.m.planes = avbuf->planes; ++ } ++ ++ return avbuf; ++ } ++ ++ return NULL; ++} ++ ++static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue) ++{ ++ int i; ++ ++ for (i = 0; i <
queue->num_buffers; i++) ++ if (!queue->buffers[i].enqueued) ++ return &queue->buffers[i]; ++ ++ return NULL; ++} ++ ++static int deint_v4l2m2m_enqueue(V4L2Queue *queue, const AVFrame* frame) ++{ ++ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0]; ++ V4L2Buffer *buf; ++ int i; ++ ++ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) ++ while (deint_v4l2m2m_dequeue_buffer(queue, 0)); ++ ++ buf = deint_v4l2m2m_find_free_buf(queue); ++ if (!buf) ++ return AVERROR(ENOMEM); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) ++ for (i = 0; i < drm_desc->nb_objects; i++) ++ buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd; ++ else ++ buf->buffer.m.fd = drm_desc->objects[0].fd; ++ ++ return deint_v4l2m2m_enqueue_buffer(buf); ++} ++ ++static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) ++{ ++ if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { ++ V4L2Queue *capture = &ctx->capture; ++ V4L2Queue *output = &ctx->output; ++ int i; ++ ++ av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); ++ ++ if (ctx->fd >= 0) { ++ deint_v4l2m2m_streamoff(capture); ++ deint_v4l2m2m_streamoff(output); ++ } ++ ++ if (capture->buffers) ++ for (i = 0; i < capture->num_buffers; i++) { ++ capture->buffers[i].q = NULL; ++ if (capture->buffers[i].fd >= 0) ++ close(capture->buffers[i].fd); ++ } ++ ++ if (ctx->cur_in_frame) ++ av_frame_free(&ctx->cur_in_frame); ++ ++ if (ctx->prev_in_frame) ++ av_frame_free(&ctx->prev_in_frame); ++ ++ av_buffer_unref(&ctx->hw_frames_ctx); ++ ++ if (capture->buffers) ++ av_free(capture->buffers); ++ ++ if (output->buffers) ++ av_free(output->buffers); ++ ++ if (ctx->fd >= 0) { ++ close(ctx->fd); ++ ctx->fd = -1; ++ } ++ ++ av_free(ctx); ++ } ++} ++ ++static void v4l2_free_buffer(void *opaque, uint8_t *unused) ++{ ++ V4L2Buffer *buf = opaque; ++ DeintV4L2M2MContextShared *ctx = buf->q->ctx; ++ ++ if (!ctx->done) ++ deint_v4l2m2m_enqueue_buffer(buf); ++ ++ deint_v4l2m2m_destroy_context(ctx); ++} ++ ++static 
uint8_t *v4l2_get_drm_frame(V4L2Buffer *avbuf, int height, uint64_t drm_format) ++{ ++ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; ++ AVDRMLayerDescriptor *layer; ++ ++ /* fill the DRM frame descriptor */ ++ drm_desc->nb_objects = avbuf->num_planes; ++ drm_desc->nb_layers = 1; ++ ++ layer = &drm_desc->layers[0]; ++ layer->nb_planes = avbuf->num_planes; ++ ++ for (int i = 0; i < avbuf->num_planes; i++) { ++ layer->planes[i].object_index = i; ++ layer->planes[i].offset = 0; ++ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; ++ } ++ ++ layer->format = drm_format; ++ ++ if (avbuf->num_planes == 1) { ++ layer->nb_planes = 2; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; ++ } ++ ++ return (uint8_t *)drm_desc; ++} ++ ++static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout) ++{ ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ V4L2Buffer* avbuf; ++ ++ avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); ++ if (!avbuf) { ++ av_log(NULL, AV_LOG_ERROR, "dequeueing failed\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame, ++ sizeof(avbuf->drm_frame), v4l2_free_buffer, ++ avbuf, AV_BUFFER_FLAG_READONLY); ++ if (!frame->buf[0]) ++ return AVERROR(ENOMEM); ++ ++ atomic_fetch_add(&ctx->refcount, 1); ++ ++ frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height, ctx->drm_out_format); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); ++ frame->height = ctx->height; ++ frame->width = ctx->width; ++ ++ if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { ++ av_log(NULL, AV_LOG_ERROR, "driver decode error\n"); ++ frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM; ++ } ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_dequeue(AVFilterContext *avctx, AVFrame *input_frame) ++{ ++ 
DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ AVFilterLink *outlink = avctx->outputs[0]; ++ AVFrame *output_frame_1, *output_frame_2; ++ int64_t first_pts = AV_NOPTS_VALUE; ++ int err; ++ ++ av_log(priv, AV_LOG_DEBUG, "input pts: %"PRId64" (field %d)\n", ++ input_frame->pts, ctx->field_order); ++ ++ output_frame_1 = av_frame_alloc(); ++ if (!output_frame_1) ++ return AVERROR(ENOMEM); ++ ++ err = deint_v4l2m2m_dequeue_frame(&ctx->capture, output_frame_1, 500); ++ if (err < 0) { ++ av_log(priv, AV_LOG_ERROR, "no 1st frame (field %d)\n", ctx->field_order); ++ goto fail_out1; ++ } ++ ++ err = av_frame_copy_props(output_frame_1, input_frame); ++ if (err < 0) ++ goto fail_out1; ++ ++ output_frame_1->interlaced_frame = 0; ++ ++ output_frame_2 = av_frame_alloc(); ++ if (!output_frame_2) { ++ err = AVERROR(ENOMEM); ++ goto fail_out1; ++ } ++ ++ err = deint_v4l2m2m_dequeue_frame(&ctx->capture, output_frame_2, 500); ++ if (err < 0) { ++ av_log(priv, AV_LOG_ERROR, "no 2nd frame (field %d)\n", ctx->field_order); ++ goto fail_out2; ++ } ++ ++ err = av_frame_copy_props(output_frame_2, input_frame); ++ if (err < 0) ++ goto fail_out2; ++ ++ output_frame_2->interlaced_frame = 0; ++ ++ if (ctx->prev_in_frame && ctx->prev_in_frame->pts != AV_NOPTS_VALUE ++ && input_frame->pts != AV_NOPTS_VALUE) { ++ first_pts = (ctx->prev_in_frame->pts + input_frame->pts) / 2; ++ av_log(priv, AV_LOG_DEBUG, "calculated first pts %"PRId64"\n", first_pts); ++ } ++ ++ output_frame_1->pts = first_pts; ++ ++ err = ff_filter_frame(outlink, output_frame_1); ++ if (err < 0) { ++ av_frame_free(&output_frame_2); ++ return err; ++ } ++ err = ff_filter_frame(outlink, output_frame_2); ++ ++ if (err < 0) ++ return err; ++ ++ av_log(priv, AV_LOG_DEBUG, "1st frame pts: %"PRId64" 2nd frame pts: %"PRId64" first pts: %"PRId64" (field %d)\n", ++ first_pts, input_frame->pts, first_pts, ctx->field_order); /* both output frames were handed to ff_filter_frame(): log the values they were sent with instead of dereferencing them */ ++ ++ return 0; ++ ++fail_out2: ++
av_frame_free(&output_frame_2); ++fail_out1: ++ av_frame_free(&output_frame_1); ++ return err; ++} ++ ++static int deint_v4l2m2m_config_props(AVFilterLink *outlink) ++{ ++ AVFilterLink *inlink = outlink->src->inputs[0]; ++ AVFilterContext *avctx = outlink->src; ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ int ret; ++ ++ ctx->height = avctx->inputs[0]->h; ++ ctx->width = avctx->inputs[0]->w; ++ ++ outlink->frame_rate = av_mul_q(inlink->frame_rate, ++ (AVRational){ 2, 1 }); ++ outlink->time_base = av_mul_q(inlink->time_base, ++ (AVRational){ 1, 2 }); ++ ++ ret = deint_v4l2m2m_find_device(ctx); ++ if (ret) ++ return ret; ++ ++ if (!inlink->hw_frames_ctx) { ++ av_log(priv, AV_LOG_ERROR, "No hw context provided on input\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx); ++ if (!ctx->hw_frames_ctx) ++ return AVERROR(ENOMEM); ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_query_formats(AVFilterContext *avctx) ++{ ++ static const enum AVPixelFormat pixel_formats[] = { ++ AV_PIX_FMT_DRM_PRIME, ++ AV_PIX_FMT_NONE, ++ }; ++ ++ return ff_set_common_formats(avctx, ff_make_format_list(pixel_formats)); ++} ++ ++static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) ++{ ++ AVFilterContext *avctx = link->dst; ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ V4L2Queue *capture = &ctx->capture; ++ V4L2Queue *output = &ctx->output; ++ int ret; ++ ++ av_log(priv, AV_LOG_DEBUG, "input pts: %"PRId64" field :%d interlaced: %d\n", ++ in->pts, in->top_field_first, in->interlaced_frame); ++ ++ ctx->cur_in_frame = in; ++ ++ if (ctx->field_order == V4L2_FIELD_ANY) { ++ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0]; ++ ctx->orig_width = drm_desc->layers[0].planes[0].pitch; ++ ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; ++ ctx->drm_in_format = drm_desc->layers->format; ++ 
ctx->drm_out_format = drm_desc->layers->format; ++ ++ if (in->top_field_first) ++ ctx->field_order = V4L2_FIELD_INTERLACED_TB; ++ else ++ ctx->field_order = V4L2_FIELD_INTERLACED_BT; ++ ++ ret = deint_v4l2m2m_set_format(output, ctx->field_order, ++ ctx->orig_width, ctx->orig_height, ++ ctx->drm_in_format); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ++ ctx->orig_width, ctx->orig_height, ++ ctx->drm_out_format); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_allocate_buffers(capture); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_streamon(capture); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_allocate_buffers(output); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_streamon(output); ++ if (ret) ++ return ret; ++ } ++ ++ ret = deint_v4l2m2m_enqueue(output, in); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_dequeue(avctx, in); ++ if (ret) ++ return ret; ++ ++ if (ctx->prev_in_frame) ++ av_frame_free(&ctx->prev_in_frame); ++ ++ ctx->prev_in_frame = in; ++ ctx->cur_in_frame = NULL; ++ ++ return 0; ++} ++ ++static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) ++{ ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx; ++ ++ ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); ++ if (!ctx) ++ return AVERROR(ENOMEM); ++ ++ priv->shared = ctx; ++ ctx->fd = -1; ++ ctx->output.ctx = ctx; ++ ctx->output.num_buffers = 6; ++ ctx->capture.ctx = ctx; ++ ctx->capture.num_buffers = 6; ++ ctx->done = 0; ++ ctx->field_order = V4L2_FIELD_ANY; ++ ctx->cur_in_frame = NULL; ++ ctx->prev_in_frame = NULL; ++ ctx->drm_in_format = DRM_FORMAT_NV12; ++ ctx->drm_out_format = DRM_FORMAT_NV12; ++ atomic_init(&ctx->refcount, 1); ++ ++ return 0; ++} ++ ++static void deint_v4l2m2m_uninit(AVFilterContext *avctx) ++{ ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ ++ ctx->done = 1; ++ deint_v4l2m2m_destroy_context(ctx); ++} ++ ++static const 
AVOption deinterlace_v4l2m2m_options[] = { ++ { NULL }, ++}; ++ ++AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); ++ ++static const AVFilterPad deint_v4l2m2m_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .filter_frame = deint_v4l2m2m_filter_frame, ++ }, ++}; ++ ++static const AVFilterPad deint_v4l2m2m_outputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .config_props = deint_v4l2m2m_config_props, ++ }, ++}; ++ ++const AVFilter ff_vf_deinterlace_v4l2m2m = { /* const: must match the extern declaration added to allfilters.c */ ++ .name = "deinterlace_v4l2m2m", ++ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"), ++ .priv_size = sizeof(DeintV4L2M2MContext), ++ .init = &deint_v4l2m2m_init, ++ .uninit = &deint_v4l2m2m_uninit, ++ FILTER_QUERY_FUNC(&deint_v4l2m2m_query_formats), ++ FILTER_INPUTS(deint_v4l2m2m_inputs), ++ FILTER_OUTPUTS(deint_v4l2m2m_outputs), ++ .priv_class = &deinterlace_v4l2m2m_class, ++}; From 8f8e40c3afdec60acd88a4d9de67cb90feece734 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Thu, 16 Feb 2023 10:00:50 +0100 Subject: [PATCH 10/12] ffmpeg: update rpi patch Patch created using revisions eacfcba..1beaf80 from branch dev/5.1.2/rpi_import_1 of https://github.com/jc-kynesim/rpi-ffmpeg --- .../ffmpeg/patches/rpi/ffmpeg-001-rpi.patch | 98965 +++++----------- 1 file changed, 30313 insertions(+), 68652 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch index 87a16f85db..89efaaac1a 100644 --- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch +++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch @@ -1,68035 +1,47 @@ -diff --git a/configure b/configure -index d7a3f507e8..83383b0317 100755 ---- a/configure -+++ b/configure -@@ -207,6 +207,7 @@ External library support: - --disable-bzlib disable bzlib [autodetect] - --disable-coreimage disable Apple CoreImage framework [autodetect] - --enable-chromaprint enable audio fingerprinting with chromaprint [no] -+ 
--disable-epoxy disable epoxy [autodetect] - --enable-frei0r enable frei0r video filtering [no] - --enable-gcrypt enable gcrypt, needed for rtmp(t)e support - if openssl, librtmp or gmp is not used [no] -@@ -279,6 +280,7 @@ External library support: - if openssl, gnutls or mbedtls is not used [no] - --enable-libtwolame enable MP2 encoding via libtwolame [no] - --enable-libuavs3d enable AVS3 decoding via libuavs3d [no] -+ --disable-libudev disable libudev [autodetect] - --enable-libv4l2 enable libv4l2/v4l-utils [no] - --enable-libvidstab enable video stabilization using vid.stab [no] - --enable-libvmaf enable vmaf filter via libvmaf [no] -@@ -340,12 +342,17 @@ External library support: - --enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no] - --enable-libnpp enable Nvidia Performance Primitives-based code [no] - --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] -+ --enable-rpi enable other rpi specific stuff [no] -+ --enable-sand enable sand video formats [rpi] -+ --enable-vout-drm enable the vout_drm module - for internal testing only [no] -+ --enable-vout-egl enable the vout_egl module - for internal testing only [no] - --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] - --disable-nvenc disable Nvidia video encoding code [autodetect] - --enable-omx enable OpenMAX IL code [no] - --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no] - --enable-rkmpp enable Rockchip Media Process Platform code [no] - --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect] -+ --enable-v4l2-request enable V4L2 request API code [no] - --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect] - --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] - --disable-videotoolbox disable VideoToolbox code [autodetect] -@@ -1703,7 +1710,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST=" - avfoundation - bzlib - coreimage -+ epoxy - 
iconv -+ libudev - libxcb - libxcb_shm - libxcb_shape -@@ -1868,7 +1877,10 @@ HWACCEL_LIBRARY_LIST=" - mmal - omx - opencl -+ v4l2_request - vulkan -+ rpi4_8 -+ rpi4_10 - " - - DOCUMENT_LIST=" -@@ -1884,12 +1896,17 @@ FEATURE_LIST=" - gray - hardcoded_tables - omx_rpi -+ rpi - runtime_cpudetect - safe_bitstream_reader -+ sand - shared - small - static - swscale_alpha -+ vout_drm -+ vout_egl -+ v4l2_req_hevc_vx - " - - # this list should be kept in linking order -@@ -1930,6 +1947,7 @@ SUBSYSTEM_LIST=" - pixelutils - network - rdft -+ rpi - " - - # COMPONENT_LIST needs to come last to ensure correct dependency checking -@@ -2416,9 +2434,11 @@ CONFIG_EXTRA=" - rangecoder - riffdec - riffenc -+ rpi - rtpdec - rtpenc_chain - rv34dsp -+ sand - scene_sad - sinewin - snappy -@@ -2750,6 +2770,8 @@ hap_decoder_select="snappy texturedsp" - hap_encoder_deps="libsnappy" - hap_encoder_select="texturedspenc" - hevc_decoder_select="atsc_a53 bswapdsp cabac golomb hevcparse videodsp" -+hevc_rpi_decoder_deps="rpi" -+hevc_rpi_decoder_select="hevc_decoder sand" - huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp" - huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp" - hymt_decoder_select="huffyuv_decoder" -@@ -2919,6 +2941,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext" - dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" - ffnvcodec_deps_any="libdl LoadLibrary" - nvdec_deps="ffnvcodec" -+v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev" - vaapi_x11_deps="xlib" - videotoolbox_hwaccel_deps="videotoolbox pthreads" - videotoolbox_hwaccel_extralibs="-framework QuartzCore" -@@ -2960,6 +2983,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" - hevc_dxva2_hwaccel_select="hevc_decoder" - hevc_nvdec_hwaccel_deps="nvdec" - hevc_nvdec_hwaccel_select="hevc_decoder" -+hevc_v4l2request_hwaccel_deps="v4l2_request" -+hevc_v4l2request_hwaccel_select="hevc_decoder" -+hevc_rpi4_10_hwaccel_deps="rpi" 
-+hevc_rpi4_10_hwaccel_select="hevc_decoder" -+hevc_rpi4_8_hwaccel_deps="rpi" -+hevc_rpi4_8_hwaccel_select="hevc_decoder" - hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" - hevc_vaapi_hwaccel_select="hevc_decoder" - hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" -@@ -3437,8 +3466,13 @@ sndio_indev_deps="sndio" - sndio_outdev_deps="sndio" - v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h" - v4l2_indev_suggest="libv4l2" -+v4l2_outdev_deps="libdrm" - v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" - v4l2_outdev_suggest="libv4l2" -+vout_drm_outdev_deps="libdrm" -+vout_egl_outdev_deps="xlib epoxy" -+vout_rpi_outdev_deps="rpi" -+vout_rpi_outdev_select="sand" - vfwcap_indev_deps="vfw32 vfwcap_defines" - xcbgrab_indev_deps="libxcb" - xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" -@@ -3657,6 +3691,7 @@ tonemap_vaapi_filter_deps="vaapi VAProcFilterParameterBufferHDRToneMapping" - tonemap_opencl_filter_deps="opencl const_nan" - transpose_opencl_filter_deps="opencl" - transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags" -+unsand_filter_select="sand" - unsharp_opencl_filter_deps="opencl" - uspp_filter_deps="gpl avcodec" - vaguedenoiser_filter_deps="gpl" -@@ -6154,6 +6189,12 @@ check_func_headers glob.h glob - enabled xlib && - check_lib xlib "X11/Xlib.h X11/extensions/Xvlib.h" XvGetPortAttribute -lXv -lX11 -lXext - -+enabled libudev && -+ check_pkg_config libudev libudev libudev.h udev_new -+ -+enabled epoxy && -+ check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version -+ - check_headers direct.h - check_headers dirent.h - check_headers dxgidebug.h -@@ -6491,11 +6532,12 @@ enabled mbedtls && { check_pkg_config mbedtls mbedtls mbedtls/x509_crt - check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto || - die "ERROR: mbedTLS not found"; } - enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; } --enabled mmal && { check_lib mmal interface/mmal/mmal.h 
mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || -+( enabled rpi || -+ enabled mmal ) && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || - { ! enabled cross_compile && - add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline && - add_ldflags -L/opt/vc/lib/ && -- check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } || -+ check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } || - die "ERROR: mmal not found" && - check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; } - enabled openal && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do -@@ -6536,8 +6578,16 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r - { enabled libdrm || - die "ERROR: rkmpp requires --enable-libdrm"; } - } -+enabled v4l2_request && { enabled libdrm || -+ die "ERROR: v4l2-request requires --enable-libdrm"; } && -+ { enabled libudev || -+ die "ERROR: v4l2-request requires libudev"; } - enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init - -+enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; } -+ -+enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } && -+ { enabled xlib || die "ERROR: vout_egl requires xlib"; } - - if enabled gcrypt; then - GCRYPT_CONFIG="${cross_prefix}libgcrypt-config" -@@ -6617,6 +6667,10 @@ if enabled v4l2_m2m; then - check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" - fi - -+check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns -+check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" -+disable 
v4l2_req_hevc_vx -+ - check_headers sys/videoio.h - test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete - -@@ -7104,6 +7158,9 @@ check_deps $CONFIG_LIST \ - enabled threads && ! enabled pthreads && ! enabled atomics_native && die "non pthread threading without atomics not supported, try adding --enable-pthreads or --cpu=i486 or higher if you are on x86" - enabled avresample && warn "Building with deprecated library libavresample" - -+# Sub-feature of hevc_v4l2request_hwaccel - can only be set once deps are done -+enabled hevc_v4l2request_hwaccel && disabled hevc_v4l2_request && enable v4l2_req_hevc_vx -+ - case $target_os in - haiku) - disable memalign -diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c -index 46bb014de8..0502ff71b8 100644 ---- a/fftools/ffmpeg.c -+++ b/fftools/ffmpeg.c -@@ -2186,8 +2186,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) - ifilter->channel_layout != frame->channel_layout; - break; - case AVMEDIA_TYPE_VIDEO: -- need_reinit |= ifilter->width != frame->width || -- ifilter->height != frame->height; -+ need_reinit |= ifilter->width != av_frame_cropped_width(frame) || -+ ifilter->height != av_frame_cropped_height(frame); - break; - } - -@@ -2198,6 +2198,9 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) - (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data)) - need_reinit = 1; - -+ if (no_cvt_hw && fg->graph) -+ need_reinit = 0; -+ - if (need_reinit) { - ret = ifilter_parameters_from_frame(ifilter, frame); - if (ret < 0) -@@ -2466,8 +2469,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ - decoded_frame->top_field_first = ist->top_field_first; - - ist->frames_decoded++; -- -- if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { -+ if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == 
ist->hwaccel_pix_fmt) { - err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame); - if (err < 0) - goto fail; -@@ -2671,7 +2673,12 @@ static int process_input_packet(InputStream *ist, const AVPacket *pkt, int no_eo - case AVMEDIA_TYPE_VIDEO: - ret = decode_video (ist, repeating ? NULL : avpkt, &got_output, &duration_pts, !pkt, - &decode_failed); -- if (!repeating || !pkt || got_output) { -+ // Pi: Do not inc dts if no_cvt_hw set -+ // V4L2 H264 decode has long latency and sometimes spits out a long -+ // stream of output without input. In this case incrementing DTS is wrong. -+ // There may be cases where the condition as written is correct so only -+ // "fix" in the cases which cause problems -+ if (!repeating || !pkt || (got_output && !no_cvt_hw)) { - if (pkt && pkt->duration) { - duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q); - } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) { -@@ -2895,6 +2902,16 @@ static enum AVPixelFormat get_format(AVCodecContext *s, const enum AVPixelFormat - } else { - const HWAccel *hwaccel = NULL; - int i; -+ -+ if (no_cvt_hw) { -+ config = avcodec_get_hw_config(s->codec, 0); -+ if (config->methods == AV_CODEC_HW_CONFIG_METHOD_INTERNAL) { -+ av_log(s, AV_LOG_DEBUG, "no_cvt_hw so accepting pix_fmt %d with codec internal hwaccel\n", *p); -+ ist->hwaccel_pix_fmt = *p; -+ break; -+ } -+ } -+ - for (i = 0; hwaccels[i].name; i++) { - if (hwaccels[i].pix_fmt == *p) { - hwaccel = &hwaccels[i]; -@@ -2990,6 +3007,15 @@ static int init_input_stream(int ist_index, char *error, int error_len) - return ret; - } - -+#if CONFIG_HEVC_RPI_DECODER -+ ret = -1; -+ if (strcmp(codec->name, "hevc_rpi") == 0 && -+ (ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) { -+ ist->dec = codec = avcodec_find_decoder_by_name("hevc"); -+ av_log(NULL, AV_LOG_INFO, "Failed to open hevc_rpi - trying hevc\n"); -+ } -+ if (ret < 0) -+#endif - if ((ret = avcodec_open2(ist->dec_ctx, codec, 
&ist->decoder_opts)) < 0) { - if (ret == AVERROR_EXPERIMENTAL) - abort_codec_experimental(codec, 0); -diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h -index 606f2afe0c..448cd2e009 100644 ---- a/fftools/ffmpeg.h -+++ b/fftools/ffmpeg.h -@@ -61,6 +61,7 @@ enum HWAccelID { - HWACCEL_GENERIC, - HWACCEL_VIDEOTOOLBOX, - HWACCEL_QSV, -+ HWACCEL_RPI, - }; - - typedef struct HWAccel { -@@ -611,6 +612,7 @@ extern int video_sync_method; - extern float frame_drop_threshold; - extern int do_benchmark; - extern int do_benchmark_all; -+extern int no_cvt_hw; - extern int do_deinterlace; - extern int do_hex_dump; - extern int do_pkt_dump; -diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c -index 4ab769c07b..5cdc3a7b6c 100644 ---- a/fftools/ffmpeg_filter.c -+++ b/fftools/ffmpeg_filter.c -@@ -1160,8 +1160,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame) - - ifilter->format = frame->format; - -- ifilter->width = frame->width; -- ifilter->height = frame->height; -+ ifilter->width = av_frame_cropped_width(frame); -+ ifilter->height = av_frame_cropped_height(frame); - ifilter->sample_aspect_ratio = frame->sample_aspect_ratio; - - ifilter->sample_rate = frame->sample_rate; -diff --git a/fftools/ffmpeg_hw.c b/fftools/ffmpeg_hw.c -index fc4a5d31d6..cc69dce40e 100644 ---- a/fftools/ffmpeg_hw.c -+++ b/fftools/ffmpeg_hw.c -@@ -75,6 +75,8 @@ static char *hw_device_default_name(enum AVHWDeviceType type) - char *name; - size_t index_pos; - int index, index_limit = 1000; -+ if (!type_name) -+ return NULL; - index_pos = strlen(type_name); - name = av_malloc(index_pos + 4); - if (!name) -diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c -index 807e783422..456d4f349b 100644 ---- a/fftools/ffmpeg_opt.c -+++ b/fftools/ffmpeg_opt.c -@@ -133,12 +133,22 @@ static const char *const opt_name_enc_time_bases[] = {"enc_time_base" - }\ - } - -+#if CONFIG_RPI -+static int rpi_init(AVCodecContext *avctx) { -+ return 0; -+} -+#endif -+ - const HWAccel 
hwaccels[] = { - #if CONFIG_VIDEOTOOLBOX - { "videotoolbox", videotoolbox_init, HWACCEL_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX }, - #endif - #if CONFIG_LIBMFX - { "qsv", qsv_init, HWACCEL_QSV, AV_PIX_FMT_QSV }, -+#endif -+#if CONFIG_RPI -+ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_8 }, -+ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_10 }, - #endif - { 0 }, - }; -@@ -158,6 +168,7 @@ float frame_drop_threshold = 0; - int do_deinterlace = 0; - int do_benchmark = 0; - int do_benchmark_all = 0; -+int no_cvt_hw = 0; - int do_hex_dump = 0; - int do_pkt_dump = 0; - int copy_ts = 0; -@@ -3499,6 +3510,8 @@ const OptionDef options[] = { - "add timings for benchmarking" }, - { "benchmark_all", OPT_BOOL | OPT_EXPERT, { &do_benchmark_all }, - "add timings for each task" }, -+ { "no_cvt_hw", OPT_BOOL | OPT_EXPERT, { &no_cvt_hw }, -+ "do not auto-convert hw frames to sw" }, - { "progress", HAS_ARG | OPT_EXPERT, { .func_arg = opt_progress }, - "write program-readable progress information", "url" }, - { "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction }, -diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 33a280cf69..e93c842047 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile -@@ -19,6 +19,7 @@ HEADERS = ac3_parser.h \ - mediacodec.h \ - packet.h \ - qsv.h \ -+ rpi_zc.h \ - vaapi.h \ - vdpau.h \ - version.h \ -@@ -140,6 +141,7 @@ OBJS-$(CONFIG_QSVDEC) += qsvdec.o - OBJS-$(CONFIG_QSVENC) += qsvenc.o - OBJS-$(CONFIG_RANGECODER) += rangecoder.o - OBJS-$(CONFIG_RDFT) += rdft.o -+OBJS-$(CONFIG_RPI) += rpi_qpu.o rpi_mailbox.o rpi_zc.o - OBJS-$(CONFIG_RV34DSP) += rv34dsp.o - OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o - OBJS-$(CONFIG_SINEWIN) += sinewin.o -@@ -154,7 +156,10 @@ OBJS-$(CONFIG_VIDEODSP) += videodsp.o - OBJS-$(CONFIG_VP3DSP) += vp3dsp.o - OBJS-$(CONFIG_VP56DSP) += vp56dsp.o - OBJS-$(CONFIG_VP8DSP) += vp8dsp.o --OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o -+OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o 
v4l2_context.o v4l2_buffers.o v4l2_fmt.o\ -+ weak_link.o v4l2_req_dmabufs.o -+OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\ -+ v4l2_req_devscan.o weak_link.o - OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o - OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o - -@@ -403,6 +408,14 @@ OBJS-$(CONFIG_HEVC_QSV_DECODER) += qsvdec.o - OBJS-$(CONFIG_HEVC_QSV_ENCODER) += qsvenc_hevc.o hevc_ps_enc.o \ - hevc_data.o - OBJS-$(CONFIG_HEVC_RKMPP_DECODER) += rkmppdec.o -+OBJS-$(CONFIG_RPI) += rpi_mem.o \ -+ rpi_mailbox.o rpi_zc.o -+OBJS-$(CONFIG_HEVC_RPI_DECODER) += rpi_hevcdec.o rpi_hevc_mvs.o \ -+ rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o \ -+ rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o \ -+ rpi_hevc_shader.o rpi_hevc_shader_template.o \ -+ rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \ -+ rpi_hevc_sei.o rpi_hevc_data.o rpi_qpu.o rpi_mem.o - OBJS-$(CONFIG_HEVC_VAAPI_ENCODER) += vaapi_encode_h265.o h265_profile_level.o - OBJS-$(CONFIG_HEVC_V4L2M2M_DECODER) += v4l2_m2m_dec.o - OBJS-$(CONFIG_HEVC_V4L2M2M_ENCODER) += v4l2_m2m_enc.o -@@ -941,6 +954,10 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o - OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o - OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o - OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o -+OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL) += rpivid_hevc.o -+OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL) += rpivid_hevc.o -+OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o v4l2_req_hevc_v4.o -+OBJS-$(CONFIG_V4L2_REQ_HEVC_VX) += v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o - OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o - OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o - OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o -@@ -1297,3 +1314,31 @@ $(SUBDIR)pcm.o: $(SUBDIR)pcm_tables.h - $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h - $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h - endif -+ -+ifdef CONFIG_HEVC_RPI_DECODER -+QASM_PY 
:= ../local/bin/qasm.py -+VASMVIDCORE := ../local/bin/vasmvidcore_std -+ -+ifneq ("$(wildcard $(QASM_PY))","") -+$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm -+ $(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@ -+ -+$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm -+ $(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@ -+endif -+ -+ifneq ("$(wildcard $(VASMVIDCORE))","") -+$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s -+ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@ -+$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s -+ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@ -+ -+$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin -+ python pi-util/make_array.py $< -+$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin -+ python pi-util/make_array.py $< -+endif -+ -+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h -+$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h -+endif -diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile -index 954461f81d..c8935f205e 100644 ---- a/libavcodec/aarch64/Makefile -+++ b/libavcodec/aarch64/Makefile -@@ -44,10 +44,12 @@ NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o - NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \ - aarch64/hpeldsp_neon.o - NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o --NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o -+NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \ -+ aarch64/simple_idct_neon.o - NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o - NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o - NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o -+NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o - NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o - - # decoders/encoders -diff --git 
a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c -index 742a3372e3..eec21aa5a2 100644 ---- a/libavcodec/aarch64/idctdsp_init_aarch64.c -+++ b/libavcodec/aarch64/idctdsp_init_aarch64.c -@@ -27,19 +27,29 @@ - #include "libavcodec/idctdsp.h" - #include "idct.h" - -+void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); -+void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); -+void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); -+ - av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx, - unsigned high_bit_depth) - { - int cpu_flags = av_get_cpu_flags(); - -- if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) { -- if (avctx->idct_algo == FF_IDCT_AUTO || -- avctx->idct_algo == FF_IDCT_SIMPLEAUTO || -- avctx->idct_algo == FF_IDCT_SIMPLENEON) { -- c->idct_put = ff_simple_idct_put_neon; -- c->idct_add = ff_simple_idct_add_neon; -- c->idct = ff_simple_idct_neon; -- c->perm_type = FF_IDCT_PERM_PARTTRANS; -+ if (have_neon(cpu_flags)) { -+ if (!avctx->lowres && !high_bit_depth) { -+ if (avctx->idct_algo == FF_IDCT_AUTO || -+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO || -+ avctx->idct_algo == FF_IDCT_SIMPLENEON) { -+ c->idct_put = ff_simple_idct_put_neon; -+ c->idct_add = ff_simple_idct_add_neon; -+ c->idct = ff_simple_idct_neon; -+ c->perm_type = FF_IDCT_PERM_PARTTRANS; -+ } - } -+ -+ c->add_pixels_clamped = ff_add_pixels_clamped_neon; -+ c->put_pixels_clamped = ff_put_pixels_clamped_neon; -+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; - } - } -diff --git a/libavcodec/aarch64/idctdsp_neon.S b/libavcodec/aarch64/idctdsp_neon.S -new file mode 100644 -index 0000000000..7f47611206 ---- /dev/null -+++ b/libavcodec/aarch64/idctdsp_neon.S -@@ -0,0 +1,130 @@ -+/* -+ * IDCT AArch64 NEON optimisations -+ * -+ * Copyright (c) 2022 Ben Avison -+ * -+ * This file is part of FFmpeg. 
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/aarch64/asm.S" -+ -+// Clamp 16-bit signed block coefficients to unsigned 8-bit -+// On entry: -+// x0 -> array of 64x 16-bit coefficients -+// x1 -> 8-bit results -+// x2 = row stride for results, bytes -+function ff_put_pixels_clamped_neon, export=1 -+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 -+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0] -+ sqxtun v0.8b, v0.8h -+ sqxtun v1.8b, v1.8h -+ sqxtun v2.8b, v2.8h -+ sqxtun v3.8b, v3.8h -+ sqxtun v4.8b, v4.8h -+ st1 {v0.8b}, [x1], x2 -+ sqxtun v0.8b, v5.8h -+ st1 {v1.8b}, [x1], x2 -+ sqxtun v1.8b, v6.8h -+ st1 {v2.8b}, [x1], x2 -+ sqxtun v2.8b, v7.8h -+ st1 {v3.8b}, [x1], x2 -+ st1 {v4.8b}, [x1], x2 -+ st1 {v0.8b}, [x1], x2 -+ st1 {v1.8b}, [x1], x2 -+ st1 {v2.8b}, [x1] -+ ret -+endfunc -+ -+// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128) -+// On entry: -+// x0 -> array of 64x 16-bit coefficients -+// x1 -> 8-bit results -+// x2 = row stride for results, bytes -+function ff_put_signed_pixels_clamped_neon, export=1 -+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 -+ movi v4.8b, #128 -+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] -+ sqxtn v0.8b, v0.8h -+ sqxtn v1.8b, v1.8h -+ sqxtn v2.8b, v2.8h -+ sqxtn v3.8b, v3.8h -+ 
sqxtn v5.8b, v16.8h -+ add v0.8b, v0.8b, v4.8b -+ sqxtn v6.8b, v17.8h -+ add v1.8b, v1.8b, v4.8b -+ sqxtn v7.8b, v18.8h -+ add v2.8b, v2.8b, v4.8b -+ sqxtn v16.8b, v19.8h -+ add v3.8b, v3.8b, v4.8b -+ st1 {v0.8b}, [x1], x2 -+ add v0.8b, v5.8b, v4.8b -+ st1 {v1.8b}, [x1], x2 -+ add v1.8b, v6.8b, v4.8b -+ st1 {v2.8b}, [x1], x2 -+ add v2.8b, v7.8b, v4.8b -+ st1 {v3.8b}, [x1], x2 -+ add v3.8b, v16.8b, v4.8b -+ st1 {v0.8b}, [x1], x2 -+ st1 {v1.8b}, [x1], x2 -+ st1 {v2.8b}, [x1], x2 -+ st1 {v3.8b}, [x1] -+ ret -+endfunc -+ -+// Add 16-bit signed block coefficients to unsigned 8-bit -+// On entry: -+// x0 -> array of 64x 16-bit coefficients -+// x1 -> 8-bit input and results -+// x2 = row stride for 8-bit input and results, bytes -+function ff_add_pixels_clamped_neon, export=1 -+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 -+ mov x3, x1 -+ ld1 {v4.8b}, [x1], x2 -+ ld1 {v5.8b}, [x1], x2 -+ ld1 {v6.8b}, [x1], x2 -+ ld1 {v7.8b}, [x1], x2 -+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] -+ uaddw v0.8h, v0.8h, v4.8b -+ uaddw v1.8h, v1.8h, v5.8b -+ uaddw v2.8h, v2.8h, v6.8b -+ ld1 {v4.8b}, [x1], x2 -+ uaddw v3.8h, v3.8h, v7.8b -+ ld1 {v5.8b}, [x1], x2 -+ sqxtun v0.8b, v0.8h -+ ld1 {v6.8b}, [x1], x2 -+ sqxtun v1.8b, v1.8h -+ ld1 {v7.8b}, [x1] -+ sqxtun v2.8b, v2.8h -+ sqxtun v3.8b, v3.8h -+ uaddw v4.8h, v16.8h, v4.8b -+ st1 {v0.8b}, [x3], x2 -+ uaddw v0.8h, v17.8h, v5.8b -+ st1 {v1.8b}, [x3], x2 -+ uaddw v1.8h, v18.8h, v6.8b -+ st1 {v2.8b}, [x3], x2 -+ uaddw v2.8h, v19.8h, v7.8b -+ sqxtun v4.8b, v4.8h -+ sqxtun v0.8b, v0.8h -+ st1 {v3.8b}, [x3], x2 -+ sqxtun v1.8b, v1.8h -+ sqxtun v2.8b, v2.8h -+ st1 {v4.8b}, [x3], x2 -+ st1 {v0.8b}, [x3], x2 -+ st1 {v1.8b}, [x3], x2 -+ st1 {v2.8b}, [x3] -+ ret -+endfunc -diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c -index 13dfd74940..a7976fd596 100644 ---- a/libavcodec/aarch64/vc1dsp_init_aarch64.c -+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c -@@ -21,10 +21,28 @@ - #include 
"libavutil/attributes.h" - #include "libavutil/cpu.h" - #include "libavutil/aarch64/cpu.h" -+#include "libavutil/intreadwrite.h" - #include "libavcodec/vc1dsp.h" - - #include "config.h" - -+void ff_vc1_inv_trans_8x8_neon(int16_t *block); -+void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); -+void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); -+void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); -+ -+void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); -+void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); -+void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); -+void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); -+ -+void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); -+void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); -+void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); -+void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); -+void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); -+void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); -+ - void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, - int h, int x, int y); - void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, -@@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, - void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, - int h, int x, int y); - -+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); -+ -+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) -+{ -+ /* Dealing with starting and stopping, and removing escape bytes, are -+ * comparatively less time-sensitive, so are more clearly expressed 
using -+ * a C wrapper around the assembly inner loop. Note that we assume a -+ * little-endian machine that supports unaligned loads. */ -+ int dsize = 0; -+ while (size >= 4) -+ { -+ int found = 0; -+ while (!found && (((uintptr_t) dst) & 7) && size >= 4) -+ { -+ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; -+ if (!found) -+ { -+ *dst++ = *src++; -+ --size; -+ ++dsize; -+ } -+ } -+ if (!found) -+ { -+ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); -+ dst += skip; -+ src += skip; -+ size -= skip; -+ dsize += skip; -+ while (!found && size >= 4) -+ { -+ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; -+ if (!found) -+ { -+ *dst++ = *src++; -+ --size; -+ ++dsize; -+ } -+ } -+ } -+ if (found) -+ { -+ *dst++ = *src++; -+ *dst++ = *src++; -+ ++src; -+ size -= 3; -+ dsize += 2; -+ } -+ } -+ while (size > 0) -+ { -+ *dst++ = *src++; -+ --size; -+ ++dsize; -+ } -+ return dsize; -+} -+ - av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp) - { - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { -+ dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon; -+ dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon; -+ dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon; -+ dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon; -+ dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon; -+ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; -+ dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon; -+ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; -+ -+ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; -+ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; -+ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; -+ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; -+ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; -+ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; -+ - dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon; - dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = 
ff_avg_vc1_chroma_mc8_neon; - dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; - dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; -+ -+ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; - } - } -diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S -new file mode 100644 -index 0000000000..9a96c2523c ---- /dev/null -+++ b/libavcodec/aarch64/vc1dsp_neon.S -@@ -0,0 +1,1546 @@ -+/* -+ * VC1 AArch64 NEON optimisations -+ * -+ * Copyright (c) 2022 Ben Avison -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/aarch64/asm.S" -+ -+// VC-1 8x8 inverse transform -+// On entry: -+// x0 -> array of 16-bit inverse transform coefficients, in column-major order -+// On exit: -+// array at x0 updated to hold transformed block; also now held in row-major order -+function ff_vc1_inv_trans_8x8_neon, export=1 -+ ld1 {v1.16b, v2.16b}, [x0], #32 -+ ld1 {v3.16b, v4.16b}, [x0], #32 -+ ld1 {v5.16b, v6.16b}, [x0], #32 -+ shl v1.8h, v1.8h, #2 // 8/2 * src[0] -+ sub x1, x0, #3*32 -+ ld1 {v16.16b, v17.16b}, [x0] -+ shl v7.8h, v2.8h, #4 // 16 * src[8] -+ shl v18.8h, v2.8h, #2 // 4 * src[8] -+ shl v19.8h, v4.8h, #4 // 16 * src[24] -+ ldr d0, .Lcoeffs_it8 -+ shl v5.8h, v5.8h, #2 // 8/2 * src[32] -+ shl v20.8h, v6.8h, #4 // 16 * src[40] -+ shl v21.8h, v6.8h, #2 // 4 * src[40] -+ shl v22.8h, v17.8h, #4 // 16 * src[56] -+ ssra v20.8h, v19.8h, #2 // 4 * src[24] + 16 * src[40] -+ mul v23.8h, v3.8h, v0.h[0] // 6/2 * src[16] -+ sub v19.8h, v19.8h, v21.8h // 16 * src[24] - 4 * src[40] -+ ssra v7.8h, v22.8h, #2 // 16 * src[8] + 4 * src[56] -+ sub v18.8h, v22.8h, v18.8h // - 4 * src[8] + 16 * src[56] -+ shl v3.8h, v3.8h, #3 // 16/2 * src[16] -+ mls v20.8h, v2.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] -+ ssra v1.8h, v1.8h, #1 // 12/2 * src[0] -+ ssra v5.8h, v5.8h, #1 // 12/2 * src[32] -+ mla v7.8h, v4.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] -+ shl v21.8h, v16.8h, #3 // 16/2 * src[48] -+ mls v19.8h, v2.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] -+ sub v2.8h, v23.8h, v21.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] -+ mla v18.8h, v4.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] -+ add v4.8h, v1.8h, v5.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] -+ sub v1.8h, v1.8h, v5.8h // t2/2 = 12/2 * src[0] - 12/2 * 
src[32] -+ mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] -+ mla v7.8h, v6.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] -+ add v5.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 -+ sub v16.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 -+ mla v20.8h, v17.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] -+ add v21.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 -+ add v22.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 -+ mls v19.8h, v17.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] -+ sub v17.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 -+ add v23.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 -+ mls v18.8h, v6.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] -+ sub v1.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 -+ sub v2.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 -+ neg v3.8h, v7.8h // -t1 -+ neg v4.8h, v20.8h // +t2 -+ neg v6.8h, v19.8h // +t3 -+ ssra v22.8h, v7.8h, #1 // (t5 + t1) >> 1 -+ ssra v1.8h, v19.8h, #1 // (t7 - t3) >> 1 -+ neg v7.8h, v18.8h // +t4 -+ ssra v5.8h, v4.8h, #1 // (t6 + t2) >> 1 -+ ssra v16.8h, v6.8h, #1 // (t7 + t3) >> 1 -+ ssra v2.8h, v18.8h, #1 // (t8 - t4) >> 1 -+ ssra v17.8h, v7.8h, #1 // (t8 + t4) >> 1 -+ ssra v21.8h, v20.8h, #1 // (t6 - t2) >> 1 -+ ssra v23.8h, v3.8h, #1 // (t5 - t1) >> 1 -+ srshr v3.8h, v22.8h, #2 // (t5 + t1 + 4) >> 3 -+ srshr v4.8h, v5.8h, #2 // (t6 + t2 + 4) >> 3 -+ srshr v5.8h, v16.8h, #2 // (t7 + t3 + 4) >> 3 -+ srshr v6.8h, v17.8h, #2 // (t8 + t4 + 4) >> 3 -+ srshr v2.8h, v2.8h, #2 // (t8 - t4 + 4) >> 3 -+ srshr v1.8h, v1.8h, #2 // (t7 - t3 + 4) >> 3 -+ srshr v7.8h, v21.8h, #2 // (t6 - t2 + 4) >> 3 -+ srshr v16.8h, v23.8h, #2 // (t5 - t1 + 4) >> 3 -+ trn2 v17.8h, v3.8h, v4.8h -+ trn2 v18.8h, v5.8h, v6.8h -+ trn2 v19.8h, v2.8h, v1.8h -+ trn2 v20.8h, v7.8h, v16.8h -+ trn1 v21.4s, v17.4s, v18.4s -+ trn2 v17.4s, v17.4s, v18.4s -+ trn1 v18.4s, v19.4s, v20.4s -+ trn2 v19.4s, v19.4s, v20.4s -+ trn1 v3.8h, v3.8h, v4.8h -+ trn2 v4.2d, 
v21.2d, v18.2d -+ trn1 v20.2d, v17.2d, v19.2d -+ trn1 v5.8h, v5.8h, v6.8h -+ trn1 v1.8h, v2.8h, v1.8h -+ trn1 v2.8h, v7.8h, v16.8h -+ trn1 v6.2d, v21.2d, v18.2d -+ trn2 v7.2d, v17.2d, v19.2d -+ shl v16.8h, v20.8h, #4 // 16 * src[24] -+ shl v17.8h, v4.8h, #4 // 16 * src[40] -+ trn1 v18.4s, v3.4s, v5.4s -+ trn1 v19.4s, v1.4s, v2.4s -+ shl v21.8h, v7.8h, #4 // 16 * src[56] -+ shl v22.8h, v6.8h, #2 // 4 * src[8] -+ shl v23.8h, v4.8h, #2 // 4 * src[40] -+ trn2 v3.4s, v3.4s, v5.4s -+ trn2 v1.4s, v1.4s, v2.4s -+ shl v2.8h, v6.8h, #4 // 16 * src[8] -+ sub v5.8h, v16.8h, v23.8h // 16 * src[24] - 4 * src[40] -+ ssra v17.8h, v16.8h, #2 // 4 * src[24] + 16 * src[40] -+ sub v16.8h, v21.8h, v22.8h // - 4 * src[8] + 16 * src[56] -+ trn1 v22.2d, v18.2d, v19.2d -+ trn2 v18.2d, v18.2d, v19.2d -+ trn1 v19.2d, v3.2d, v1.2d -+ ssra v2.8h, v21.8h, #2 // 16 * src[8] + 4 * src[56] -+ mls v17.8h, v6.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] -+ shl v21.8h, v22.8h, #2 // 8/2 * src[0] -+ shl v18.8h, v18.8h, #2 // 8/2 * src[32] -+ mls v5.8h, v6.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] -+ shl v6.8h, v19.8h, #3 // 16/2 * src[16] -+ trn2 v1.2d, v3.2d, v1.2d -+ mla v16.8h, v20.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] -+ ssra v21.8h, v21.8h, #1 // 12/2 * src[0] -+ ssra v18.8h, v18.8h, #1 // 12/2 * src[32] -+ mul v3.8h, v19.8h, v0.h[0] // 6/2 * src[16] -+ shl v19.8h, v1.8h, #3 // 16/2 * src[48] -+ mla v2.8h, v20.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] -+ add v20.8h, v21.8h, v18.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] -+ mla v6.8h, v1.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] -+ sub v1.8h, v21.8h, v18.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32] -+ sub v3.8h, v3.8h, v19.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] -+ mla v17.8h, v7.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] -+ mls v5.8h, v7.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] -+ add v7.8h, v1.8h, 
v3.8h // t6/2 = t2/2 + t4/2 -+ add v18.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 -+ mls v16.8h, v4.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] -+ sub v19.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 -+ neg v21.8h, v17.8h // +t2 -+ mla v2.8h, v4.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] -+ sub v0.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 -+ neg v4.8h, v5.8h // +t3 -+ sub v22.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 -+ sub v23.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 -+ neg v24.8h, v16.8h // +t4 -+ add v6.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 -+ add v1.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2 -+ ssra v7.8h, v21.8h, #1 // (t6 + t2) >> 1 -+ neg v3.8h, v2.8h // -t1 -+ ssra v18.8h, v2.8h, #1 // (t5 + t1) >> 1 -+ ssra v19.8h, v4.8h, #1 // (t7 + t3) >> 1 -+ ssra v0.8h, v24.8h, #1 // (t8 + t4) >> 1 -+ srsra v23.8h, v16.8h, #1 // (t8 - t4 + 1) >> 1 -+ srsra v22.8h, v5.8h, #1 // (t7 - t3 + 1) >> 1 -+ srsra v1.8h, v17.8h, #1 // (t6 - t2 + 1) >> 1 -+ srsra v6.8h, v3.8h, #1 // (t5 - t1 + 1) >> 1 -+ srshr v2.8h, v18.8h, #6 // (t5 + t1 + 64) >> 7 -+ srshr v3.8h, v7.8h, #6 // (t6 + t2 + 64) >> 7 -+ srshr v4.8h, v19.8h, #6 // (t7 + t3 + 64) >> 7 -+ srshr v5.8h, v0.8h, #6 // (t8 + t4 + 64) >> 7 -+ srshr v16.8h, v23.8h, #6 // (t8 - t4 + 65) >> 7 -+ srshr v17.8h, v22.8h, #6 // (t7 - t3 + 65) >> 7 -+ st1 {v2.16b, v3.16b}, [x1], #32 -+ srshr v0.8h, v1.8h, #6 // (t6 - t2 + 65) >> 7 -+ srshr v1.8h, v6.8h, #6 // (t5 - t1 + 65) >> 7 -+ st1 {v4.16b, v5.16b}, [x1], #32 -+ st1 {v16.16b, v17.16b}, [x1], #32 -+ st1 {v0.16b, v1.16b}, [x1] -+ ret -+endfunc -+ -+// VC-1 8x4 inverse transform -+// On entry: -+// x0 -> array of 8-bit samples, in row-major order -+// x1 = row stride for 8-bit sample array -+// x2 -> array of 16-bit inverse transform coefficients, in row-major order -+// On exit: -+// array at x0 updated by saturated addition of (narrowed) transformed block -+function ff_vc1_inv_trans_8x4_neon, export=1 -+ ld1 {v1.8b, v2.8b, v3.8b, 
v4.8b}, [x2], #32 -+ mov x3, x0 -+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x2] -+ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector -+ ld1 {v5.8b}, [x0], x1 -+ trn2 v6.4h, v1.4h, v3.4h -+ trn2 v7.4h, v2.4h, v4.4h -+ trn1 v1.4h, v1.4h, v3.4h -+ trn1 v2.4h, v2.4h, v4.4h -+ trn2 v3.4h, v16.4h, v18.4h -+ trn2 v4.4h, v17.4h, v19.4h -+ trn1 v16.4h, v16.4h, v18.4h -+ trn1 v17.4h, v17.4h, v19.4h -+ ld1 {v18.8b}, [x0], x1 -+ trn1 v19.2s, v6.2s, v3.2s -+ trn2 v3.2s, v6.2s, v3.2s -+ trn1 v6.2s, v7.2s, v4.2s -+ trn2 v4.2s, v7.2s, v4.2s -+ trn1 v7.2s, v1.2s, v16.2s -+ trn1 v20.2s, v2.2s, v17.2s -+ shl v21.4h, v19.4h, #4 // 16 * src[1] -+ trn2 v1.2s, v1.2s, v16.2s -+ shl v16.4h, v3.4h, #4 // 16 * src[3] -+ trn2 v2.2s, v2.2s, v17.2s -+ shl v17.4h, v6.4h, #4 // 16 * src[5] -+ ld1 {v22.8b}, [x0], x1 -+ shl v23.4h, v4.4h, #4 // 16 * src[7] -+ mul v24.4h, v1.4h, v0.h[0] // 6/2 * src[2] -+ ld1 {v25.8b}, [x0] -+ shl v26.4h, v19.4h, #2 // 4 * src[1] -+ shl v27.4h, v6.4h, #2 // 4 * src[5] -+ ssra v21.4h, v23.4h, #2 // 16 * src[1] + 4 * src[7] -+ ssra v17.4h, v16.4h, #2 // 4 * src[3] + 16 * src[5] -+ sub v23.4h, v23.4h, v26.4h // - 4 * src[1] + 16 * src[7] -+ sub v16.4h, v16.4h, v27.4h // 16 * src[3] - 4 * src[5] -+ shl v7.4h, v7.4h, #2 // 8/2 * src[0] -+ shl v20.4h, v20.4h, #2 // 8/2 * src[4] -+ mla v21.4h, v3.4h, v0.h[2] // 16 * src[1] + 15 * src[3] + 4 * src[7] -+ shl v1.4h, v1.4h, #3 // 16/2 * src[2] -+ mls v17.4h, v19.4h, v0.h[2] // - 15 * src[1] + 4 * src[3] + 16 * src[5] -+ ssra v7.4h, v7.4h, #1 // 12/2 * src[0] -+ mls v16.4h, v19.4h, v0.h[1] // - 9 * src[1] + 16 * src[3] - 4 * src[5] -+ ssra v20.4h, v20.4h, #1 // 12/2 * src[4] -+ mla v23.4h, v3.4h, v0.h[1] // - 4 * src[1] + 9 * src[3] + 16 * src[7] -+ shl v3.4h, v2.4h, #3 // 16/2 * src[6] -+ mla v1.4h, v2.4h, v0.h[0] // t3/2 = 16/2 * src[2] + 6/2 * src[6] -+ mla v21.4h, v6.4h, v0.h[1] // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7] -+ mla v17.4h, v4.4h, v0.h[1] // -t2 = - 15 * src[1] + 4 * 
src[3] + 16 * src[5] + 9 * src[7] -+ sub v2.4h, v24.4h, v3.4h // t4/2 = 6/2 * src[2] - 16/2 * src[6] -+ mls v16.4h, v4.4h, v0.h[2] // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7] -+ add v3.4h, v7.4h, v20.4h // t1/2 = 12/2 * src[0] + 12/2 * src[4] -+ mls v23.4h, v6.4h, v0.h[2] // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7] -+ sub v4.4h, v7.4h, v20.4h // t2/2 = 12/2 * src[0] - 12/2 * src[4] -+ neg v6.4h, v21.4h // -t1 -+ add v7.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2 -+ sub v19.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 -+ add v20.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 -+ sub v24.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 -+ add v26.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2 -+ add v27.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 -+ sub v2.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 -+ sub v1.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 -+ neg v3.4h, v17.4h // +t2 -+ neg v4.4h, v16.4h // +t3 -+ neg v28.4h, v23.4h // +t4 -+ ssra v7.4h, v21.4h, #1 // (t5 + t1) >> 1 -+ ssra v1.4h, v23.4h, #1 // (t8 - t4) >> 1 -+ ssra v20.4h, v3.4h, #1 // (t6 + t2) >> 1 -+ ssra v24.4h, v4.4h, #1 // (t7 + t3) >> 1 -+ ssra v19.4h, v28.4h, #1 // (t8 + t4) >> 1 -+ ssra v2.4h, v16.4h, #1 // (t7 - t3) >> 1 -+ ssra v27.4h, v17.4h, #1 // (t6 - t2) >> 1 -+ ssra v26.4h, v6.4h, #1 // (t5 - t1) >> 1 -+ trn1 v1.2d, v7.2d, v1.2d -+ trn1 v2.2d, v20.2d, v2.2d -+ trn1 v3.2d, v24.2d, v27.2d -+ trn1 v4.2d, v19.2d, v26.2d -+ srshr v1.8h, v1.8h, #2 // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3 -+ srshr v2.8h, v2.8h, #2 // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3 -+ srshr v3.8h, v3.8h, #2 // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3 -+ srshr v4.8h, v4.8h, #2 // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3 -+ trn2 v6.8h, v1.8h, v2.8h -+ trn1 v1.8h, v1.8h, v2.8h -+ trn2 v2.8h, v3.8h, v4.8h -+ trn1 v3.8h, v3.8h, v4.8h -+ trn2 v4.4s, v6.4s, v2.4s -+ trn1 v7.4s, v1.4s, v3.4s -+ trn2 v1.4s, v1.4s, v3.4s -+ mul v3.8h, v4.8h, v0.h[5] // 22/2 * src[24] -+ trn1 v2.4s, v6.4s, v2.4s -+ mul v4.8h, v4.8h, v0.h[4] // 10/2 * src[24] -+ 
mul v6.8h, v7.8h, v0.h[6] // 17 * src[0] -+ mul v1.8h, v1.8h, v0.h[6] // 17 * src[16] -+ mls v3.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] -+ mla v4.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[8] + 10/2 * src[24] -+ add v0.8h, v6.8h, v1.8h // t1 = 17 * src[0] + 17 * src[16] -+ sub v1.8h, v6.8h, v1.8h // t2 = 17 * src[0] - 17 * src[16] -+ neg v2.8h, v3.8h // -t4/2 -+ neg v6.8h, v4.8h // -t3/2 -+ ssra v4.8h, v0.8h, #1 // (t1 + t3) >> 1 -+ ssra v2.8h, v1.8h, #1 // (t2 - t4) >> 1 -+ ssra v3.8h, v1.8h, #1 // (t2 + t4) >> 1 -+ ssra v6.8h, v0.8h, #1 // (t1 - t3) >> 1 -+ srshr v0.8h, v4.8h, #6 // (t1 + t3 + 64) >> 7 -+ srshr v1.8h, v2.8h, #6 // (t2 - t4 + 64) >> 7 -+ srshr v2.8h, v3.8h, #6 // (t2 + t4 + 64) >> 7 -+ srshr v3.8h, v6.8h, #6 // (t1 - t3 + 64) >> 7 -+ uaddw v0.8h, v0.8h, v5.8b -+ uaddw v1.8h, v1.8h, v18.8b -+ uaddw v2.8h, v2.8h, v22.8b -+ uaddw v3.8h, v3.8h, v25.8b -+ sqxtun v0.8b, v0.8h -+ sqxtun v1.8b, v1.8h -+ sqxtun v2.8b, v2.8h -+ sqxtun v3.8b, v3.8h -+ st1 {v0.8b}, [x3], x1 -+ st1 {v1.8b}, [x3], x1 -+ st1 {v2.8b}, [x3], x1 -+ st1 {v3.8b}, [x3] -+ ret -+endfunc -+ -+// VC-1 4x8 inverse transform -+// On entry: -+// x0 -> array of 8-bit samples, in row-major order -+// x1 = row stride for 8-bit sample array -+// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) -+// On exit: -+// array at x0 updated by saturated addition of (narrowed) transformed block -+function ff_vc1_inv_trans_4x8_neon, export=1 -+ mov x3, #16 -+ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector -+ mov x4, x0 -+ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 -+ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 -+ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 -+ ld1 {v4.d}[0], [x2], x3 // 30 31 32 33 -+ ld1 {v1.d}[1], [x2], x3 // 40 41 42 43 -+ ld1 {v2.d}[1], [x2], x3 // 50 51 52 53 -+ ld1 {v3.d}[1], [x2], x3 // 60 61 62 63 -+ ld1 {v4.d}[1], [x2] // 70 71 72 73 -+ ld1 {v5.s}[0], [x0], x1 -+ ld1 {v6.s}[0], [x0], x1 -+ 
ld1 {v7.s}[0], [x0], x1 -+ trn2 v16.8h, v1.8h, v2.8h // 01 11 03 13 41 51 43 53 -+ trn1 v1.8h, v1.8h, v2.8h // 00 10 02 12 40 50 42 52 -+ trn2 v2.8h, v3.8h, v4.8h // 21 31 23 33 61 71 63 73 -+ trn1 v3.8h, v3.8h, v4.8h // 20 30 22 32 60 70 62 72 -+ ld1 {v4.s}[0], [x0], x1 -+ trn2 v17.4s, v16.4s, v2.4s // 03 13 23 33 43 53 63 73 -+ trn1 v18.4s, v1.4s, v3.4s // 00 10 20 30 40 50 60 70 -+ trn1 v2.4s, v16.4s, v2.4s // 01 11 21 31 41 51 61 71 -+ mul v16.8h, v17.8h, v0.h[4] // 10/2 * src[3] -+ ld1 {v5.s}[1], [x0], x1 -+ mul v17.8h, v17.8h, v0.h[5] // 22/2 * src[3] -+ ld1 {v6.s}[1], [x0], x1 -+ trn2 v1.4s, v1.4s, v3.4s // 02 12 22 32 42 52 62 72 -+ mul v3.8h, v18.8h, v0.h[6] // 17 * src[0] -+ ld1 {v7.s}[1], [x0], x1 -+ mul v1.8h, v1.8h, v0.h[6] // 17 * src[2] -+ ld1 {v4.s}[1], [x0] -+ mla v16.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[1] + 10/2 * src[3] -+ mls v17.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] -+ add v2.8h, v3.8h, v1.8h // t1 = 17 * src[0] + 17 * src[2] -+ sub v1.8h, v3.8h, v1.8h // t2 = 17 * src[0] - 17 * src[2] -+ neg v3.8h, v16.8h // -t3/2 -+ ssra v16.8h, v2.8h, #1 // (t1 + t3) >> 1 -+ neg v18.8h, v17.8h // -t4/2 -+ ssra v17.8h, v1.8h, #1 // (t2 + t4) >> 1 -+ ssra v3.8h, v2.8h, #1 // (t1 - t3) >> 1 -+ ssra v18.8h, v1.8h, #1 // (t2 - t4) >> 1 -+ srshr v1.8h, v16.8h, #2 // (t1 + t3 + 64) >> 3 -+ srshr v2.8h, v17.8h, #2 // (t2 + t4 + 64) >> 3 -+ srshr v3.8h, v3.8h, #2 // (t1 - t3 + 64) >> 3 -+ srshr v16.8h, v18.8h, #2 // (t2 - t4 + 64) >> 3 -+ trn2 v17.8h, v2.8h, v3.8h // 12 13 32 33 52 53 72 73 -+ trn2 v18.8h, v1.8h, v16.8h // 10 11 30 31 50 51 70 71 -+ trn1 v1.8h, v1.8h, v16.8h // 00 01 20 21 40 41 60 61 -+ trn1 v2.8h, v2.8h, v3.8h // 02 03 22 23 42 43 62 63 -+ trn1 v3.4s, v18.4s, v17.4s // 10 11 12 13 50 51 52 53 -+ trn2 v16.4s, v18.4s, v17.4s // 30 31 32 33 70 71 72 73 -+ trn1 v17.4s, v1.4s, v2.4s // 00 01 02 03 40 41 42 43 -+ mov d18, v3.d[1] // 50 51 52 53 -+ shl v19.4h, v3.4h, #4 // 16 * src[8] -+ mov d20, v16.d[1] // 70 71 72 73 -+ shl 
v21.4h, v16.4h, #4 // 16 * src[24] -+ mov d22, v17.d[1] // 40 41 42 43 -+ shl v23.4h, v3.4h, #2 // 4 * src[8] -+ shl v24.4h, v18.4h, #4 // 16 * src[40] -+ shl v25.4h, v20.4h, #4 // 16 * src[56] -+ shl v26.4h, v18.4h, #2 // 4 * src[40] -+ trn2 v1.4s, v1.4s, v2.4s // 20 21 22 23 60 61 62 63 -+ ssra v24.4h, v21.4h, #2 // 4 * src[24] + 16 * src[40] -+ sub v2.4h, v25.4h, v23.4h // - 4 * src[8] + 16 * src[56] -+ shl v17.4h, v17.4h, #2 // 8/2 * src[0] -+ sub v21.4h, v21.4h, v26.4h // 16 * src[24] - 4 * src[40] -+ shl v22.4h, v22.4h, #2 // 8/2 * src[32] -+ mov d23, v1.d[1] // 60 61 62 63 -+ ssra v19.4h, v25.4h, #2 // 16 * src[8] + 4 * src[56] -+ mul v25.4h, v1.4h, v0.h[0] // 6/2 * src[16] -+ shl v1.4h, v1.4h, #3 // 16/2 * src[16] -+ mls v24.4h, v3.4h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] -+ ssra v17.4h, v17.4h, #1 // 12/2 * src[0] -+ mls v21.4h, v3.4h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] -+ ssra v22.4h, v22.4h, #1 // 12/2 * src[32] -+ mla v2.4h, v16.4h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] -+ shl v3.4h, v23.4h, #3 // 16/2 * src[48] -+ mla v19.4h, v16.4h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] -+ mla v1.4h, v23.4h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] -+ mla v24.4h, v20.4h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] -+ add v16.4h, v17.4h, v22.4h // t1/2 = 12/2 * src[0] + 12/2 * src[32] -+ sub v3.4h, v25.4h, v3.4h // t4/2 = 6/2 * src[16] - 16/2 * src[48] -+ sub v17.4h, v17.4h, v22.4h // t2/2 = 12/2 * src[0] - 12/2 * src[32] -+ mls v21.4h, v20.4h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] -+ mla v19.4h, v18.4h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] -+ add v20.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 -+ mls v2.4h, v18.4h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] -+ sub v0.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 -+ add v18.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 -+ sub v22.4h, 
v17.4h, v3.4h // t7/2 = t2/2 - t4/2 -+ neg v23.4h, v24.4h // +t2 -+ sub v25.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2 -+ add v3.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 -+ neg v17.4h, v21.4h // +t3 -+ sub v26.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 -+ add v1.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 -+ neg v16.4h, v19.4h // -t1 -+ neg v27.4h, v2.4h // +t4 -+ ssra v20.4h, v19.4h, #1 // (t5 + t1) >> 1 -+ srsra v0.4h, v2.4h, #1 // (t8 - t4 + 1) >> 1 -+ ssra v18.4h, v23.4h, #1 // (t6 + t2) >> 1 -+ srsra v22.4h, v21.4h, #1 // (t7 - t3 + 1) >> 1 -+ ssra v25.4h, v17.4h, #1 // (t7 + t3) >> 1 -+ srsra v3.4h, v24.4h, #1 // (t6 - t2 + 1) >> 1 -+ ssra v26.4h, v27.4h, #1 // (t8 + t4) >> 1 -+ srsra v1.4h, v16.4h, #1 // (t5 - t1 + 1) >> 1 -+ trn1 v0.2d, v20.2d, v0.2d -+ trn1 v2.2d, v18.2d, v22.2d -+ trn1 v3.2d, v25.2d, v3.2d -+ trn1 v1.2d, v26.2d, v1.2d -+ srshr v0.8h, v0.8h, #6 // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7 -+ srshr v2.8h, v2.8h, #6 // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7 -+ srshr v3.8h, v3.8h, #6 // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7 -+ srshr v1.8h, v1.8h, #6 // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7 -+ uaddw v0.8h, v0.8h, v5.8b -+ uaddw v2.8h, v2.8h, v6.8b -+ uaddw v3.8h, v3.8h, v7.8b -+ uaddw v1.8h, v1.8h, v4.8b -+ sqxtun v0.8b, v0.8h -+ sqxtun v2.8b, v2.8h -+ sqxtun v3.8b, v3.8h -+ sqxtun v1.8b, v1.8h -+ st1 {v0.s}[0], [x4], x1 -+ st1 {v2.s}[0], [x4], x1 -+ st1 {v3.s}[0], [x4], x1 -+ st1 {v1.s}[0], [x4], x1 -+ st1 {v0.s}[1], [x4], x1 -+ st1 {v2.s}[1], [x4], x1 -+ st1 {v3.s}[1], [x4], x1 -+ st1 {v1.s}[1], [x4] -+ ret -+endfunc -+ -+// VC-1 4x4 inverse transform -+// On entry: -+// x0 -> array of 8-bit samples, in row-major order -+// x1 = row stride for 8-bit sample array -+// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) -+// On exit: -+// array at x0 updated by saturated addition of (narrowed) transformed block -+function ff_vc1_inv_trans_4x4_neon, export=1 -+ mov x3, #16 -+ ldr d0, .Lcoeffs_it4 
-+ mov x4, x0 -+ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 -+ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 -+ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 -+ ld1 {v4.d}[0], [x2] // 30 31 32 33 -+ ld1 {v5.s}[0], [x0], x1 -+ ld1 {v5.s}[1], [x0], x1 -+ ld1 {v6.s}[0], [x0], x1 -+ trn2 v7.4h, v1.4h, v2.4h // 01 11 03 13 -+ trn1 v1.4h, v1.4h, v2.4h // 00 10 02 12 -+ ld1 {v6.s}[1], [x0] -+ trn2 v2.4h, v3.4h, v4.4h // 21 31 23 33 -+ trn1 v3.4h, v3.4h, v4.4h // 20 30 22 32 -+ trn2 v4.2s, v7.2s, v2.2s // 03 13 23 33 -+ trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30 -+ trn1 v2.2s, v7.2s, v2.2s // 01 11 21 31 -+ trn2 v1.2s, v1.2s, v3.2s // 02 12 22 32 -+ mul v3.4h, v4.4h, v0.h[0] // 10/2 * src[3] -+ mul v4.4h, v4.4h, v0.h[1] // 22/2 * src[3] -+ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] -+ mul v1.4h, v1.4h, v0.h[2] // 17 * src[2] -+ mla v3.4h, v2.4h, v0.h[1] // t3/2 = 22/2 * src[1] + 10/2 * src[3] -+ mls v4.4h, v2.4h, v0.h[0] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] -+ add v2.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[2] -+ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[2] -+ neg v7.4h, v3.4h // -t3/2 -+ neg v16.4h, v4.4h // -t4/2 -+ ssra v3.4h, v2.4h, #1 // (t1 + t3) >> 1 -+ ssra v4.4h, v1.4h, #1 // (t2 + t4) >> 1 -+ ssra v16.4h, v1.4h, #1 // (t2 - t4) >> 1 -+ ssra v7.4h, v2.4h, #1 // (t1 - t3) >> 1 -+ srshr v1.4h, v3.4h, #2 // (t1 + t3 + 64) >> 3 -+ srshr v2.4h, v4.4h, #2 // (t2 + t4 + 64) >> 3 -+ srshr v3.4h, v16.4h, #2 // (t2 - t4 + 64) >> 3 -+ srshr v4.4h, v7.4h, #2 // (t1 - t3 + 64) >> 3 -+ trn2 v7.4h, v1.4h, v3.4h // 10 11 30 31 -+ trn1 v1.4h, v1.4h, v3.4h // 00 01 20 21 -+ trn2 v3.4h, v2.4h, v4.4h // 12 13 32 33 -+ trn1 v2.4h, v2.4h, v4.4h // 02 03 22 23 -+ trn2 v4.2s, v7.2s, v3.2s // 30 31 32 33 -+ trn1 v16.2s, v1.2s, v2.2s // 00 01 02 03 -+ trn1 v3.2s, v7.2s, v3.2s // 10 11 12 13 -+ trn2 v1.2s, v1.2s, v2.2s // 20 21 22 23 -+ mul v2.4h, v4.4h, v0.h[1] // 22/2 * src[24] -+ mul v4.4h, v4.4h, v0.h[0] // 10/2 * src[24] -+ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] -+ mul v1.4h, 
v1.4h, v0.h[2] // 17 * src[16] -+ mls v2.4h, v3.4h, v0.h[0] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] -+ mla v4.4h, v3.4h, v0.h[1] // t3/2 = 22/2 * src[8] + 10/2 * src[24] -+ add v0.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[16] -+ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[16] -+ neg v3.4h, v2.4h // -t4/2 -+ neg v7.4h, v4.4h // -t3/2 -+ ssra v4.4h, v0.4h, #1 // (t1 + t3) >> 1 -+ ssra v3.4h, v1.4h, #1 // (t2 - t4) >> 1 -+ ssra v2.4h, v1.4h, #1 // (t2 + t4) >> 1 -+ ssra v7.4h, v0.4h, #1 // (t1 - t3) >> 1 -+ trn1 v0.2d, v4.2d, v3.2d -+ trn1 v1.2d, v2.2d, v7.2d -+ srshr v0.8h, v0.8h, #6 // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7 -+ srshr v1.8h, v1.8h, #6 // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7 -+ uaddw v0.8h, v0.8h, v5.8b -+ uaddw v1.8h, v1.8h, v6.8b -+ sqxtun v0.8b, v0.8h -+ sqxtun v1.8b, v1.8h -+ st1 {v0.s}[0], [x4], x1 -+ st1 {v0.s}[1], [x4], x1 -+ st1 {v1.s}[0], [x4], x1 -+ st1 {v1.s}[1], [x4] -+ ret -+endfunc -+ -+// VC-1 8x8 inverse transform, DC case -+// On entry: -+// x0 -> array of 8-bit samples, in row-major order -+// x1 = row stride for 8-bit sample array -+// x2 -> 16-bit inverse transform DC coefficient -+// On exit: -+// array at x0 updated by saturated addition of (narrowed) transformed block -+function ff_vc1_inv_trans_8x8_dc_neon, export=1 -+ ldrsh w2, [x2] -+ mov x3, x0 -+ ld1 {v0.8b}, [x0], x1 -+ ld1 {v1.8b}, [x0], x1 -+ ld1 {v2.8b}, [x0], x1 -+ add w2, w2, w2, lsl #1 -+ ld1 {v3.8b}, [x0], x1 -+ ld1 {v4.8b}, [x0], x1 -+ add w2, w2, #1 -+ ld1 {v5.8b}, [x0], x1 -+ asr w2, w2, #1 -+ ld1 {v6.8b}, [x0], x1 -+ add w2, w2, w2, lsl #1 -+ ld1 {v7.8b}, [x0] -+ add w0, w2, #16 -+ asr w0, w0, #5 -+ dup v16.8h, w0 -+ uaddw v0.8h, v16.8h, v0.8b -+ uaddw v1.8h, v16.8h, v1.8b -+ uaddw v2.8h, v16.8h, v2.8b -+ uaddw v3.8h, v16.8h, v3.8b -+ uaddw v4.8h, v16.8h, v4.8b -+ uaddw v5.8h, v16.8h, v5.8b -+ sqxtun v0.8b, v0.8h -+ uaddw v6.8h, v16.8h, v6.8b -+ sqxtun v1.8b, v1.8h -+ uaddw v7.8h, v16.8h, v7.8b -+ sqxtun v2.8b, v2.8h -+ sqxtun v3.8b, 
v3.8h -+ sqxtun v4.8b, v4.8h -+ st1 {v0.8b}, [x3], x1 -+ sqxtun v0.8b, v5.8h -+ st1 {v1.8b}, [x3], x1 -+ sqxtun v1.8b, v6.8h -+ st1 {v2.8b}, [x3], x1 -+ sqxtun v2.8b, v7.8h -+ st1 {v3.8b}, [x3], x1 -+ st1 {v4.8b}, [x3], x1 -+ st1 {v0.8b}, [x3], x1 -+ st1 {v1.8b}, [x3], x1 -+ st1 {v2.8b}, [x3] -+ ret -+endfunc -+ -+// VC-1 8x4 inverse transform, DC case -+// On entry: -+// x0 -> array of 8-bit samples, in row-major order -+// x1 = row stride for 8-bit sample array -+// x2 -> 16-bit inverse transform DC coefficient -+// On exit: -+// array at x0 updated by saturated addition of (narrowed) transformed block -+function ff_vc1_inv_trans_8x4_dc_neon, export=1 -+ ldrsh w2, [x2] -+ mov x3, x0 -+ ld1 {v0.8b}, [x0], x1 -+ ld1 {v1.8b}, [x0], x1 -+ ld1 {v2.8b}, [x0], x1 -+ add w2, w2, w2, lsl #1 -+ ld1 {v3.8b}, [x0] -+ add w0, w2, #1 -+ asr w0, w0, #1 -+ add w0, w0, w0, lsl #4 -+ add w0, w0, #64 -+ asr w0, w0, #7 -+ dup v4.8h, w0 -+ uaddw v0.8h, v4.8h, v0.8b -+ uaddw v1.8h, v4.8h, v1.8b -+ uaddw v2.8h, v4.8h, v2.8b -+ uaddw v3.8h, v4.8h, v3.8b -+ sqxtun v0.8b, v0.8h -+ sqxtun v1.8b, v1.8h -+ sqxtun v2.8b, v2.8h -+ sqxtun v3.8b, v3.8h -+ st1 {v0.8b}, [x3], x1 -+ st1 {v1.8b}, [x3], x1 -+ st1 {v2.8b}, [x3], x1 -+ st1 {v3.8b}, [x3] -+ ret -+endfunc -+ -+// VC-1 4x8 inverse transform, DC case -+// On entry: -+// x0 -> array of 8-bit samples, in row-major order -+// x1 = row stride for 8-bit sample array -+// x2 -> 16-bit inverse transform DC coefficient -+// On exit: -+// array at x0 updated by saturated addition of (narrowed) transformed block -+function ff_vc1_inv_trans_4x8_dc_neon, export=1 -+ ldrsh w2, [x2] -+ mov x3, x0 -+ ld1 {v0.s}[0], [x0], x1 -+ ld1 {v1.s}[0], [x0], x1 -+ ld1 {v2.s}[0], [x0], x1 -+ add w2, w2, w2, lsl #4 -+ ld1 {v3.s}[0], [x0], x1 -+ add w2, w2, #4 -+ asr w2, w2, #3 -+ add w2, w2, w2, lsl #1 -+ ld1 {v0.s}[1], [x0], x1 -+ add w2, w2, #16 -+ asr w2, w2, #5 -+ dup v4.8h, w2 -+ ld1 {v1.s}[1], [x0], x1 -+ ld1 {v2.s}[1], [x0], x1 -+ ld1 {v3.s}[1], [x0] -+ uaddw 
v0.8h, v4.8h, v0.8b -+ uaddw v1.8h, v4.8h, v1.8b -+ uaddw v2.8h, v4.8h, v2.8b -+ uaddw v3.8h, v4.8h, v3.8b -+ sqxtun v0.8b, v0.8h -+ sqxtun v1.8b, v1.8h -+ sqxtun v2.8b, v2.8h -+ sqxtun v3.8b, v3.8h -+ st1 {v0.s}[0], [x3], x1 -+ st1 {v1.s}[0], [x3], x1 -+ st1 {v2.s}[0], [x3], x1 -+ st1 {v3.s}[0], [x3], x1 -+ st1 {v0.s}[1], [x3], x1 -+ st1 {v1.s}[1], [x3], x1 -+ st1 {v2.s}[1], [x3], x1 -+ st1 {v3.s}[1], [x3] -+ ret -+endfunc -+ -+// VC-1 4x4 inverse transform, DC case -+// On entry: -+// x0 -> array of 8-bit samples, in row-major order -+// x1 = row stride for 8-bit sample array -+// x2 -> 16-bit inverse transform DC coefficient -+// On exit: -+// array at x0 updated by saturated addition of (narrowed) transformed block -+function ff_vc1_inv_trans_4x4_dc_neon, export=1 -+ ldrsh w2, [x2] -+ mov x3, x0 -+ ld1 {v0.s}[0], [x0], x1 -+ ld1 {v1.s}[0], [x0], x1 -+ ld1 {v0.s}[1], [x0], x1 -+ add w2, w2, w2, lsl #4 -+ ld1 {v1.s}[1], [x0] -+ add w0, w2, #4 -+ asr w0, w0, #3 -+ add w0, w0, w0, lsl #4 -+ add w0, w0, #64 -+ asr w0, w0, #7 -+ dup v2.8h, w0 -+ uaddw v0.8h, v2.8h, v0.8b -+ uaddw v1.8h, v2.8h, v1.8b -+ sqxtun v0.8b, v0.8h -+ sqxtun v1.8b, v1.8h -+ st1 {v0.s}[0], [x3], x1 -+ st1 {v1.s}[0], [x3], x1 -+ st1 {v0.s}[1], [x3], x1 -+ st1 {v1.s}[1], [x3] -+ ret -+endfunc -+ -+.align 5 -+.Lcoeffs_it8: -+.quad 0x000F00090003 -+.Lcoeffs_it4: -+.quad 0x0011000B0005 -+.Lcoeffs: -+.quad 0x00050002 -+ -+// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks -+// On entry: -+// x0 -> top-left pel of lower block -+// x1 = row stride, bytes -+// w2 = PQUANT bitstream parameter -+function ff_vc1_v_loop_filter4_neon, export=1 -+ sub x3, x0, w1, sxtw #2 -+ ldr d0, .Lcoeffs -+ ld1 {v1.s}[0], [x0], x1 // P5 -+ ld1 {v2.s}[0], [x3], x1 // P1 -+ ld1 {v3.s}[0], [x3], x1 // P2 -+ ld1 {v4.s}[0], [x0], x1 // P6 -+ ld1 {v5.s}[0], [x3], x1 // P3 -+ ld1 {v6.s}[0], [x0], x1 // P7 -+ ld1 {v7.s}[0], [x3] // P4 -+ ld1 {v16.s}[0], [x0] // P8 -+ ushll v17.8h, 
v1.8b, #1 // 2*P5 -+ dup v18.8h, w2 // pq -+ ushll v2.8h, v2.8b, #1 // 2*P1 -+ uxtl v3.8h, v3.8b // P2 -+ uxtl v4.8h, v4.8b // P6 -+ uxtl v19.8h, v5.8b // P3 -+ mls v2.4h, v3.4h, v0.h[1] // 2*P1-5*P2 -+ uxtl v3.8h, v6.8b // P7 -+ mls v17.4h, v4.4h, v0.h[1] // 2*P5-5*P6 -+ ushll v5.8h, v5.8b, #1 // 2*P3 -+ uxtl v6.8h, v7.8b // P4 -+ mla v17.4h, v3.4h, v0.h[1] // 2*P5-5*P6+5*P7 -+ uxtl v3.8h, v16.8b // P8 -+ mla v2.4h, v19.4h, v0.h[1] // 2*P1-5*P2+5*P3 -+ uxtl v1.8h, v1.8b // P5 -+ mls v5.4h, v6.4h, v0.h[1] // 2*P3-5*P4 -+ mls v17.4h, v3.4h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 -+ sub v3.4h, v6.4h, v1.4h // P4-P5 -+ mls v2.4h, v6.4h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 -+ mla v5.4h, v1.4h, v0.h[1] // 2*P3-5*P4+5*P5 -+ mls v5.4h, v4.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 -+ abs v4.4h, v3.4h -+ srshr v7.4h, v17.4h, #3 -+ srshr v2.4h, v2.4h, #3 -+ sshr v4.4h, v4.4h, #1 // clip -+ srshr v5.4h, v5.4h, #3 -+ abs v7.4h, v7.4h // a2 -+ sshr v3.4h, v3.4h, #8 // clip_sign -+ abs v2.4h, v2.4h // a1 -+ cmeq v16.4h, v4.4h, #0 // test clip == 0 -+ abs v17.4h, v5.4h // a0 -+ sshr v5.4h, v5.4h, #8 // a0_sign -+ cmhs v19.4h, v2.4h, v7.4h // test a1 >= a2 -+ cmhs v18.4h, v17.4h, v18.4h // test a0 >= pq -+ sub v3.4h, v3.4h, v5.4h // clip_sign - a0_sign -+ bsl v19.8b, v7.8b, v2.8b // a3 -+ orr v2.8b, v16.8b, v18.8b // test clip == 0 || a0 >= pq -+ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 -+ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 -+ orr v5.8b, v2.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 -+ mov w0, v5.s[1] // move to gp reg -+ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 -+ cmhs v5.4h, v0.4h, v4.4h -+ tbnz w0, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered -+ bsl v5.8b, v4.8b, v0.8b // FFMIN(d, clip) -+ bic v0.8b, v5.8b, v2.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) -+ mls v6.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 -+ mla v1.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 -+ sqxtun v0.8b, v6.8h -+ sqxtun v1.8b, v1.8h -+ st1 {v0.s}[0], [x3], x1 -+ st1 {v1.s}[0], [x3] -+1: ret -+endfunc -+ -+// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks -+// On entry: -+// x0 -> top-left pel of right block -+// x1 = row stride, bytes -+// w2 = PQUANT bitstream parameter -+function ff_vc1_h_loop_filter4_neon, export=1 -+ sub x3, x0, #4 // where to start reading -+ ldr d0, .Lcoeffs -+ ld1 {v1.8b}, [x3], x1 -+ sub x0, x0, #1 // where to start writing -+ ld1 {v2.8b}, [x3], x1 -+ ld1 {v3.8b}, [x3], x1 -+ ld1 {v4.8b}, [x3] -+ dup v5.8h, w2 // pq -+ trn1 v6.8b, v1.8b, v2.8b -+ trn2 v1.8b, v1.8b, v2.8b -+ trn1 v2.8b, v3.8b, v4.8b -+ trn2 v3.8b, v3.8b, v4.8b -+ trn1 v4.4h, v6.4h, v2.4h // P1, P5 -+ trn1 v7.4h, v1.4h, v3.4h // P2, P6 -+ trn2 v2.4h, v6.4h, v2.4h // P3, P7 -+ trn2 v1.4h, v1.4h, v3.4h // P4, P8 -+ ushll v3.8h, v4.8b, #1 // 2*P1, 2*P5 -+ uxtl v6.8h, v7.8b // P2, P6 -+ uxtl v7.8h, v2.8b // P3, P7 -+ uxtl v1.8h, v1.8b // P4, P8 -+ mls v3.8h, v6.8h, v0.h[1] // 2*P1-5*P2, 2*P5-5*P6 -+ ushll v2.8h, v2.8b, #1 // 2*P3, 2*P7 -+ uxtl v4.8h, v4.8b // P1, P5 -+ mla v3.8h, v7.8h, v0.h[1] // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 -+ mov d6, v6.d[1] // P6 -+ mls v3.8h, v1.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 -+ mov d4, v4.d[1] // P5 -+ mls v2.4h, v1.4h, v0.h[1] // 2*P3-5*P4 -+ mla v2.4h, v4.4h, v0.h[1] // 2*P3-5*P4+5*P5 -+ 
sub v7.4h, v1.4h, v4.4h // P4-P5 -+ mls v2.4h, v6.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 -+ srshr v3.8h, v3.8h, #3 -+ abs v6.4h, v7.4h -+ sshr v7.4h, v7.4h, #8 // clip_sign -+ srshr v2.4h, v2.4h, #3 -+ abs v3.8h, v3.8h // a1, a2 -+ sshr v6.4h, v6.4h, #1 // clip -+ mov d16, v3.d[1] // a2 -+ abs v17.4h, v2.4h // a0 -+ cmeq v18.4h, v6.4h, #0 // test clip == 0 -+ sshr v2.4h, v2.4h, #8 // a0_sign -+ cmhs v19.4h, v3.4h, v16.4h // test a1 >= a2 -+ cmhs v5.4h, v17.4h, v5.4h // test a0 >= pq -+ sub v2.4h, v7.4h, v2.4h // clip_sign - a0_sign -+ bsl v19.8b, v16.8b, v3.8b // a3 -+ orr v3.8b, v18.8b, v5.8b // test clip == 0 || a0 >= pq -+ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 -+ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 -+ orr v5.8b, v3.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 -+ mov w2, v5.s[1] // move to gp reg -+ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 -+ cmhs v5.4h, v0.4h, v6.4h -+ tbnz w2, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered -+ bsl v5.8b, v6.8b, v0.8b // FFMIN(d, clip) -+ bic v0.8b, v5.8b, v3.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) -+ mla v4.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 -+ mls v1.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 -+ sqxtun v3.8b, v4.8h -+ sqxtun v2.8b, v1.8h -+ st2 {v2.b, v3.b}[0], [x0], x1 -+ st2 {v2.b, v3.b}[1], [x0], x1 -+ st2 {v2.b, v3.b}[2], [x0], x1 -+ st2 {v2.b, v3.b}[3], [x0] -+1: ret -+endfunc -+ -+// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks -+// On entry: -+// x0 -> top-left pel of lower block -+// x1 = row stride, bytes -+// w2 = PQUANT bitstream parameter -+function ff_vc1_v_loop_filter8_neon, export=1 -+ sub x3, x0, w1, sxtw #2 -+ ldr d0, .Lcoeffs -+ ld1 {v1.8b}, [x0], x1 // P5 -+ movi v2.2d, #0x0000ffff00000000 -+ ld1 {v3.8b}, [x3], x1 // P1 -+ ld1 {v4.8b}, [x3], x1 // P2 -+ ld1 {v5.8b}, [x0], x1 // P6 -+ ld1 {v6.8b}, [x3], x1 // P3 -+ ld1 {v7.8b}, [x0], x1 // P7 -+ ushll v16.8h, v1.8b, #1 // 2*P5 -+ ushll v3.8h, v3.8b, #1 // 2*P1 -+ ld1 {v17.8b}, [x3] // P4 -+ uxtl v4.8h, v4.8b // P2 -+ ld1 {v18.8b}, [x0] // P8 -+ uxtl v5.8h, v5.8b // P6 -+ dup v19.8h, w2 // pq -+ uxtl v20.8h, v6.8b // P3 -+ mls v3.8h, v4.8h, v0.h[1] // 2*P1-5*P2 -+ uxtl v4.8h, v7.8b // P7 -+ ushll v6.8h, v6.8b, #1 // 2*P3 -+ mls v16.8h, v5.8h, v0.h[1] // 2*P5-5*P6 -+ uxtl v7.8h, v17.8b // P4 -+ uxtl v17.8h, v18.8b // P8 -+ mla v16.8h, v4.8h, v0.h[1] // 2*P5-5*P6+5*P7 -+ uxtl v1.8h, v1.8b // P5 -+ mla v3.8h, v20.8h, v0.h[1] // 2*P1-5*P2+5*P3 -+ sub v4.8h, v7.8h, v1.8h // P4-P5 -+ mls v6.8h, v7.8h, v0.h[1] // 2*P3-5*P4 -+ mls v16.8h, v17.8h, v0.h[0] // 
2*P5-5*P6+5*P7-2*P8 -+ abs v17.8h, v4.8h -+ sshr v4.8h, v4.8h, #8 // clip_sign -+ mls v3.8h, v7.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 -+ sshr v17.8h, v17.8h, #1 // clip -+ mla v6.8h, v1.8h, v0.h[1] // 2*P3-5*P4+5*P5 -+ srshr v16.8h, v16.8h, #3 -+ mls v6.8h, v5.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 -+ cmeq v5.8h, v17.8h, #0 // test clip == 0 -+ srshr v3.8h, v3.8h, #3 -+ abs v16.8h, v16.8h // a2 -+ abs v3.8h, v3.8h // a1 -+ srshr v6.8h, v6.8h, #3 -+ cmhs v18.8h, v3.8h, v16.8h // test a1 >= a2 -+ abs v20.8h, v6.8h // a0 -+ sshr v6.8h, v6.8h, #8 // a0_sign -+ bsl v18.16b, v16.16b, v3.16b // a3 -+ cmhs v3.8h, v20.8h, v19.8h // test a0 >= pq -+ sub v4.8h, v4.8h, v6.8h // clip_sign - a0_sign -+ uqsub v6.8h, v20.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ cmhs v16.8h, v18.8h, v20.8h // test a3 >= a0 -+ orr v3.16b, v5.16b, v3.16b // test clip == 0 || a0 >= pq -+ mul v0.8h, v6.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 -+ orr v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0 -+ cmtst v2.2d, v5.2d, v2.2d // if 2nd of each group of is not filtered, then none of the others in the group should be either -+ mov w0, v5.s[1] // move to gp reg -+ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 -+ mov w2, v5.s[3] -+ orr v2.16b, v3.16b, v2.16b -+ cmhs v3.8h, v0.8h, v17.8h -+ and w0, w0, w2 -+ bsl v3.16b, v17.16b, v0.16b // FFMIN(d, clip) -+ tbnz w0, #0, 1f // none of the 8 pixel pairs should be updated in this case -+ bic v0.16b, v3.16b, v2.16b // set each d to zero if it should not be filtered -+ mls v7.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 -+ mla v1.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 -+ sqxtun v0.8b, v7.8h -+ sqxtun v1.8b, v1.8h -+ st1 {v0.8b}, [x3], x1 -+ st1 {v1.8b}, [x3] -+1: ret -+endfunc -+ -+// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks -+// On entry: -+// x0 -> top-left pel of right block -+// x1 = row stride, bytes -+// w2 = PQUANT bitstream parameter -+function ff_vc1_h_loop_filter8_neon, export=1 -+ sub x3, x0, #4 // where to start reading -+ ldr d0, .Lcoeffs -+ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... -+ sub x0, x0, #1 // where to start writing -+ ld1 {v2.8b}, [x3], x1 -+ add x4, x0, x1, lsl #2 -+ ld1 {v3.8b}, [x3], x1 -+ ld1 {v4.8b}, [x3], x1 -+ ld1 {v5.8b}, [x3], x1 -+ ld1 {v6.8b}, [x3], x1 -+ ld1 {v7.8b}, [x3], x1 -+ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... -+ ld1 {v17.8b}, [x3] -+ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... -+ trn1 v2.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... -+ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... -+ dup v4.8h, w2 // pq -+ trn1 v18.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... -+ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... -+ trn1 v6.4h, v16.4h, v2.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... -+ trn1 v19.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... -+ trn1 v20.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... -+ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... -+ trn2 v2.4h, v16.4h, v2.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... 
-+ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... -+ trn1 v3.4h, v18.4h, v20.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... -+ trn1 v16.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... -+ trn2 v17.4h, v18.4h, v20.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... -+ trn2 v5.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... -+ trn1 v7.2s, v6.2s, v3.2s // P1 -+ trn1 v18.2s, v19.2s, v16.2s // P2 -+ trn2 v3.2s, v6.2s, v3.2s // P5 -+ trn2 v6.2s, v19.2s, v16.2s // P6 -+ trn1 v16.2s, v2.2s, v17.2s // P3 -+ trn2 v2.2s, v2.2s, v17.2s // P7 -+ ushll v7.8h, v7.8b, #1 // 2*P1 -+ trn1 v17.2s, v1.2s, v5.2s // P4 -+ ushll v19.8h, v3.8b, #1 // 2*P5 -+ trn2 v1.2s, v1.2s, v5.2s // P8 -+ uxtl v5.8h, v18.8b // P2 -+ uxtl v6.8h, v6.8b // P6 -+ uxtl v18.8h, v16.8b // P3 -+ mls v7.8h, v5.8h, v0.h[1] // 2*P1-5*P2 -+ uxtl v2.8h, v2.8b // P7 -+ ushll v5.8h, v16.8b, #1 // 2*P3 -+ mls v19.8h, v6.8h, v0.h[1] // 2*P5-5*P6 -+ uxtl v16.8h, v17.8b // P4 -+ uxtl v1.8h, v1.8b // P8 -+ mla v19.8h, v2.8h, v0.h[1] // 2*P5-5*P6+5*P7 -+ uxtl v2.8h, v3.8b // P5 -+ mla v7.8h, v18.8h, v0.h[1] // 2*P1-5*P2+5*P3 -+ sub v3.8h, v16.8h, v2.8h // P4-P5 -+ mls v5.8h, v16.8h, v0.h[1] // 2*P3-5*P4 -+ mls v19.8h, v1.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 -+ abs v1.8h, v3.8h -+ sshr v3.8h, v3.8h, #8 // clip_sign -+ mls v7.8h, v16.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 -+ sshr v1.8h, v1.8h, #1 // clip -+ mla v5.8h, v2.8h, v0.h[1] // 2*P3-5*P4+5*P5 -+ srshr v17.8h, v19.8h, #3 -+ mls v5.8h, v6.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 -+ cmeq v6.8h, v1.8h, #0 // test clip == 0 -+ srshr v7.8h, v7.8h, #3 -+ abs v17.8h, v17.8h // a2 -+ abs v7.8h, v7.8h // a1 -+ srshr v5.8h, v5.8h, #3 -+ cmhs v18.8h, v7.8h, v17.8h // test a1 >= a2 -+ abs v19.8h, v5.8h // a0 -+ sshr v5.8h, v5.8h, #8 // a0_sign -+ bsl v18.16b, v17.16b, v7.16b // a3 -+ cmhs v4.8h, v19.8h, v4.8h // test a0 >= pq -+ sub v3.8h, v3.8h, v5.8h // clip_sign - a0_sign -+ uqsub v5.8h, v19.8h, v18.8h // a0 >= a3 ? 
a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ cmhs v7.8h, v18.8h, v19.8h // test a3 >= a0 -+ orr v4.16b, v6.16b, v4.16b // test clip == 0 || a0 >= pq -+ mul v0.8h, v5.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 -+ orr v5.16b, v4.16b, v7.16b // test clip == 0 || a0 >= pq || a3 >= a0 -+ mov w2, v5.s[1] // move to gp reg -+ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 -+ mov w3, v5.s[3] -+ cmhs v5.8h, v0.8h, v1.8h -+ and w5, w2, w3 -+ bsl v5.16b, v1.16b, v0.16b // FFMIN(d, clip) -+ tbnz w5, #0, 2f // none of the 8 pixel pairs should be updated in this case -+ bic v0.16b, v5.16b, v4.16b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) -+ mla v2.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 -+ mls v16.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 -+ sqxtun v1.8b, v2.8h -+ sqxtun v0.8b, v16.8h -+ tbnz w2, #0, 1f // none of the first 4 pixel pairs should be updated if so -+ st2 {v0.b, v1.b}[0], [x0], x1 -+ st2 {v0.b, v1.b}[1], [x0], x1 -+ st2 {v0.b, v1.b}[2], [x0], x1 -+ st2 {v0.b, v1.b}[3], [x0] -+1: tbnz w3, #0, 2f // none of the second 4 pixel pairs should be updated if so -+ st2 {v0.b, v1.b}[4], [x4], x1 -+ st2 {v0.b, v1.b}[5], [x4], x1 -+ st2 {v0.b, v1.b}[6], [x4], x1 -+ st2 {v0.b, v1.b}[7], [x4] -+2: ret -+endfunc -+ -+// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks -+// On entry: -+// x0 -> top-left pel of lower block -+// x1 = row stride, bytes -+// w2 = PQUANT bitstream parameter -+function ff_vc1_v_loop_filter16_neon, export=1 -+ sub x3, x0, w1, sxtw #2 -+ ldr d0, .Lcoeffs -+ ld1 {v1.16b}, [x0], x1 // P5 -+ movi v2.2d, #0x0000ffff00000000 -+ ld1 {v3.16b}, [x3], x1 // P1 -+ ld1 {v4.16b}, [x3], 
x1 // P2 -+ ld1 {v5.16b}, [x0], x1 // P6 -+ ld1 {v6.16b}, [x3], x1 // P3 -+ ld1 {v7.16b}, [x0], x1 // P7 -+ ushll v16.8h, v1.8b, #1 // 2*P5[0..7] -+ ushll v17.8h, v3.8b, #1 // 2*P1[0..7] -+ ld1 {v18.16b}, [x3] // P4 -+ uxtl v19.8h, v4.8b // P2[0..7] -+ ld1 {v20.16b}, [x0] // P8 -+ uxtl v21.8h, v5.8b // P6[0..7] -+ dup v22.8h, w2 // pq -+ ushll2 v3.8h, v3.16b, #1 // 2*P1[8..15] -+ mls v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] -+ ushll2 v19.8h, v1.16b, #1 // 2*P5[8..15] -+ uxtl2 v4.8h, v4.16b // P2[8..15] -+ mls v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] -+ uxtl2 v5.8h, v5.16b // P6[8..15] -+ uxtl v23.8h, v6.8b // P3[0..7] -+ uxtl v24.8h, v7.8b // P7[0..7] -+ mls v3.8h, v4.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] -+ ushll v4.8h, v6.8b, #1 // 2*P3[0..7] -+ uxtl v25.8h, v18.8b // P4[0..7] -+ mls v19.8h, v5.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] -+ uxtl2 v26.8h, v6.16b // P3[8..15] -+ mla v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] -+ uxtl2 v7.8h, v7.16b // P7[8..15] -+ ushll2 v6.8h, v6.16b, #1 // 2*P3[8..15] -+ mla v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] -+ uxtl2 v18.8h, v18.16b // P4[8..15] -+ uxtl v23.8h, v20.8b // P8[0..7] -+ mls v4.8h, v25.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] -+ uxtl v24.8h, v1.8b // P5[0..7] -+ uxtl2 v20.8h, v20.16b // P8[8..15] -+ mla v3.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] -+ uxtl2 v1.8h, v1.16b // P5[8..15] -+ sub v26.8h, v25.8h, v24.8h // P4[0..7]-P5[0..7] -+ mla v19.8h, v7.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] -+ sub v7.8h, v18.8h, v1.8h // P4[8..15]-P5[8..15] -+ mls v6.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] -+ abs v27.8h, v26.8h -+ sshr v26.8h, v26.8h, #8 // clip_sign[0..7] -+ mls v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] -+ abs v28.8h, v7.8h -+ sshr v27.8h, v27.8h, #1 // clip[0..7] -+ mls v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] -+ sshr v7.8h, v7.8h, #8 // clip_sign[8..15] -+ sshr v23.8h, 
v28.8h, #1 // clip[8..15] -+ mla v4.8h, v24.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] -+ cmeq v28.8h, v27.8h, #0 // test clip[0..7] == 0 -+ srshr v17.8h, v17.8h, #3 -+ mls v3.8h, v18.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] -+ cmeq v29.8h, v23.8h, #0 // test clip[8..15] == 0 -+ srshr v16.8h, v16.8h, #3 -+ mls v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] -+ abs v17.8h, v17.8h // a1[0..7] -+ mla v6.8h, v1.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] -+ srshr v3.8h, v3.8h, #3 -+ mls v4.8h, v21.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] -+ abs v16.8h, v16.8h // a2[0..7] -+ srshr v19.8h, v19.8h, #3 -+ mls v6.8h, v5.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] -+ cmhs v5.8h, v17.8h, v16.8h // test a1[0..7] >= a2[0..7] -+ abs v3.8h, v3.8h // a1[8..15] -+ srshr v4.8h, v4.8h, #3 -+ abs v19.8h, v19.8h // a2[8..15] -+ bsl v5.16b, v16.16b, v17.16b // a3[0..7] -+ srshr v6.8h, v6.8h, #3 -+ cmhs v16.8h, v3.8h, v19.8h // test a1[8..15] >= a2[8.15] -+ abs v17.8h, v4.8h // a0[0..7] -+ sshr v4.8h, v4.8h, #8 // a0_sign[0..7] -+ bsl v16.16b, v19.16b, v3.16b // a3[8..15] -+ uqsub v3.8h, v17.8h, v5.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ abs v19.8h, v6.8h // a0[8..15] -+ cmhs v20.8h, v17.8h, v22.8h // test a0[0..7] >= pq -+ cmhs v5.8h, v5.8h, v17.8h // test a3[0..7] >= a0[0..7] -+ sub v4.8h, v26.8h, v4.8h // clip_sign[0..7] - a0_sign[0..7] -+ sshr v6.8h, v6.8h, #8 // a0_sign[8..15] -+ mul v3.8h, v3.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 -+ uqsub v17.8h, v19.8h, v16.8h // a0[8..15] >= a3[8..15] ? 
a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ orr v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq -+ cmhs v21.8h, v19.8h, v22.8h // test a0[8..15] >= pq -+ cmhs v16.8h, v16.8h, v19.8h // test a3[8..15] >= a0[8..15] -+ mul v0.8h, v17.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 -+ sub v6.8h, v7.8h, v6.8h // clip_sign[8..15] - a0_sign[8..15] -+ orr v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] -+ ushr v3.8h, v3.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 -+ orr v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq -+ cmtst v17.2d, v5.2d, v2.2d // if 2nd of each group of is not filtered, then none of the others in the group should be either -+ mov w0, v5.s[1] // move to gp reg -+ cmhs v19.8h, v3.8h, v27.8h -+ ushr v0.8h, v0.8h, #3 // a0[8..15] >= a3[8..15] ? 
(5*(a0[8..15]-a3[8..15]))>>3 : 0 -+ mov w2, v5.s[3] -+ orr v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] -+ orr v16.16b, v20.16b, v17.16b -+ bsl v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7]) -+ cmtst v2.2d, v5.2d, v2.2d -+ cmhs v3.8h, v0.8h, v23.8h -+ mov w4, v5.s[1] -+ mov w5, v5.s[3] -+ and w0, w0, w2 -+ bic v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) -+ orr v2.16b, v7.16b, v2.16b -+ bsl v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15]) -+ mls v25.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] -+ and w2, w4, w5 -+ bic v0.16b, v3.16b, v2.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) -+ mla v24.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] -+ and w0, w0, w2 -+ mls v18.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] -+ sqxtun v2.8b, v25.8h -+ tbnz w0, #0, 1f // none of the 16 pixel pairs should be updated in this case -+ mla v1.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] -+ sqxtun v0.8b, v24.8h -+ sqxtun2 v2.16b, v18.8h -+ sqxtun2 v0.16b, v1.8h -+ st1 {v2.16b}, [x3], x1 -+ st1 {v0.16b}, [x3] -+1: ret -+endfunc -+ -+// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks -+// On entry: -+// x0 -> top-left pel of right block -+// x1 = row stride, bytes -+// w2 = PQUANT bitstream parameter -+function ff_vc1_h_loop_filter16_neon, export=1 -+ sub x3, x0, #4 // where to 
start reading -+ ldr d0, .Lcoeffs -+ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... -+ sub x0, x0, #1 // where to start writing -+ ld1 {v2.8b}, [x3], x1 -+ add x4, x0, x1, lsl #3 -+ ld1 {v3.8b}, [x3], x1 -+ add x5, x0, x1, lsl #2 -+ ld1 {v4.8b}, [x3], x1 -+ add x6, x4, x1, lsl #2 -+ ld1 {v5.8b}, [x3], x1 -+ ld1 {v6.8b}, [x3], x1 -+ ld1 {v7.8b}, [x3], x1 -+ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... -+ ld1 {v17.8b}, [x3], x1 -+ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... -+ ld1 {v2.8b}, [x3], x1 -+ trn1 v18.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... -+ ld1 {v19.8b}, [x3], x1 -+ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... -+ ld1 {v4.8b}, [x3], x1 -+ trn1 v20.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... -+ ld1 {v21.8b}, [x3], x1 -+ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... -+ ld1 {v6.8b}, [x3], x1 -+ trn1 v22.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... -+ ld1 {v23.8b}, [x3], x1 -+ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... -+ ld1 {v17.8b}, [x3], x1 -+ trn1 v24.8b, v2.8b, v19.8b // P1[8], P1[9], P3[8]... -+ ld1 {v25.8b}, [x3] -+ trn2 v2.8b, v2.8b, v19.8b // P2[8], P2[9], P4[8]... -+ trn1 v19.4h, v16.4h, v18.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... -+ trn1 v26.8b, v4.8b, v21.8b // P1[10], P1[11], P3[10]... -+ trn2 v4.8b, v4.8b, v21.8b // P2[10], P2[11], P4[10]... -+ trn1 v21.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... -+ trn1 v27.4h, v20.4h, v22.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... -+ trn1 v28.8b, v6.8b, v23.8b // P1[12], P1[13], P3[12]... -+ trn2 v6.8b, v6.8b, v23.8b // P2[12], P2[13], P4[12]... -+ trn1 v23.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... -+ trn1 v29.4h, v24.4h, v26.4h // P1[8], P1[9], P1[10], P1[11], P5[8]... -+ trn1 v30.8b, v17.8b, v25.8b // P1[14], P1[15], P3[14]... -+ trn2 v17.8b, v17.8b, v25.8b // P2[14], P2[15], P4[14]... -+ trn1 v25.4h, v2.4h, v4.4h // P2[8], P2[9], P2[10], P2[11], P6[8]... 
-+ trn1 v31.2s, v19.2s, v27.2s // P1[0..7] -+ trn2 v19.2s, v19.2s, v27.2s // P5[0..7] -+ trn1 v27.2s, v21.2s, v23.2s // P2[0..7] -+ trn2 v21.2s, v21.2s, v23.2s // P6[0..7] -+ trn1 v23.4h, v28.4h, v30.4h // P1[12], P1[13], P1[14], P1[15], P5[12]... -+ trn2 v16.4h, v16.4h, v18.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... -+ trn1 v18.4h, v6.4h, v17.4h // P2[12], P2[13], P2[14], P2[15], P6[12]... -+ trn2 v20.4h, v20.4h, v22.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... -+ trn2 v22.4h, v24.4h, v26.4h // P3[8], P3[9], P3[10], P3[11], P7[8]... -+ trn1 v24.2s, v29.2s, v23.2s // P1[8..15] -+ trn2 v23.2s, v29.2s, v23.2s // P5[8..15] -+ trn1 v26.2s, v25.2s, v18.2s // P2[8..15] -+ trn2 v18.2s, v25.2s, v18.2s // P6[8..15] -+ trn2 v25.4h, v28.4h, v30.4h // P3[12], P3[13], P3[14], P3[15], P7[12]... -+ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... -+ trn2 v3.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... -+ trn2 v2.4h, v2.4h, v4.4h // P4[8], P4[9], P4[10], P4[11], P8[8]... -+ trn2 v4.4h, v6.4h, v17.4h // P4[12], P4[13], P4[14], P4[15], P8[12]... 
-+ ushll v5.8h, v31.8b, #1 // 2*P1[0..7] -+ ushll v6.8h, v19.8b, #1 // 2*P5[0..7] -+ trn1 v7.2s, v16.2s, v20.2s // P3[0..7] -+ uxtl v17.8h, v27.8b // P2[0..7] -+ trn2 v16.2s, v16.2s, v20.2s // P7[0..7] -+ uxtl v20.8h, v21.8b // P6[0..7] -+ trn1 v21.2s, v22.2s, v25.2s // P3[8..15] -+ ushll v24.8h, v24.8b, #1 // 2*P1[8..15] -+ trn2 v22.2s, v22.2s, v25.2s // P7[8..15] -+ ushll v25.8h, v23.8b, #1 // 2*P5[8..15] -+ trn1 v27.2s, v1.2s, v3.2s // P4[0..7] -+ uxtl v26.8h, v26.8b // P2[8..15] -+ mls v5.8h, v17.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] -+ uxtl v17.8h, v18.8b // P6[8..15] -+ mls v6.8h, v20.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] -+ trn1 v18.2s, v2.2s, v4.2s // P4[8..15] -+ uxtl v28.8h, v7.8b // P3[0..7] -+ mls v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] -+ uxtl v16.8h, v16.8b // P7[0..7] -+ uxtl v26.8h, v21.8b // P3[8..15] -+ mls v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] -+ uxtl v22.8h, v22.8b // P7[8..15] -+ ushll v7.8h, v7.8b, #1 // 2*P3[0..7] -+ uxtl v27.8h, v27.8b // P4[0..7] -+ trn2 v1.2s, v1.2s, v3.2s // P8[0..7] -+ ushll v3.8h, v21.8b, #1 // 2*P3[8..15] -+ trn2 v2.2s, v2.2s, v4.2s // P8[8..15] -+ uxtl v4.8h, v18.8b // P4[8..15] -+ mla v5.8h, v28.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] -+ uxtl v1.8h, v1.8b // P8[0..7] -+ mla v6.8h, v16.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] -+ uxtl v2.8h, v2.8b // P8[8..15] -+ uxtl v16.8h, v19.8b // P5[0..7] -+ mla v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] -+ uxtl v18.8h, v23.8b // P5[8..15] -+ dup v19.8h, w2 // pq -+ mla v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] -+ sub v21.8h, v27.8h, v16.8h // P4[0..7]-P5[0..7] -+ sub v22.8h, v4.8h, v18.8h // P4[8..15]-P5[8..15] -+ mls v7.8h, v27.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] -+ abs v23.8h, v21.8h -+ mls v3.8h, v4.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] -+ abs v26.8h, v22.8h -+ sshr v21.8h, v21.8h, #8 // clip_sign[0..7] -+ mls v5.8h, v27.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] -+ sshr v23.8h, 
v23.8h, #1 // clip[0..7] -+ sshr v26.8h, v26.8h, #1 // clip[8..15] -+ mls v6.8h, v1.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] -+ sshr v1.8h, v22.8h, #8 // clip_sign[8..15] -+ cmeq v22.8h, v23.8h, #0 // test clip[0..7] == 0 -+ mls v24.8h, v4.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] -+ cmeq v28.8h, v26.8h, #0 // test clip[8..15] == 0 -+ srshr v5.8h, v5.8h, #3 -+ mls v25.8h, v2.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] -+ srshr v2.8h, v6.8h, #3 -+ mla v7.8h, v16.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] -+ srshr v6.8h, v24.8h, #3 -+ mla v3.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] -+ abs v5.8h, v5.8h // a1[0..7] -+ srshr v24.8h, v25.8h, #3 -+ mls v3.8h, v17.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] -+ abs v2.8h, v2.8h // a2[0..7] -+ abs v6.8h, v6.8h // a1[8..15] -+ mls v7.8h, v20.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] -+ abs v17.8h, v24.8h // a2[8..15] -+ cmhs v20.8h, v5.8h, v2.8h // test a1[0..7] >= a2[0..7] -+ srshr v3.8h, v3.8h, #3 -+ cmhs v24.8h, v6.8h, v17.8h // test a1[8..15] >= a2[8.15] -+ srshr v7.8h, v7.8h, #3 -+ bsl v20.16b, v2.16b, v5.16b // a3[0..7] -+ abs v2.8h, v3.8h // a0[8..15] -+ sshr v3.8h, v3.8h, #8 // a0_sign[8..15] -+ bsl v24.16b, v17.16b, v6.16b // a3[8..15] -+ abs v5.8h, v7.8h // a0[0..7] -+ sshr v6.8h, v7.8h, #8 // a0_sign[0..7] -+ cmhs v7.8h, v2.8h, v19.8h // test a0[8..15] >= pq -+ sub v1.8h, v1.8h, v3.8h // clip_sign[8..15] - a0_sign[8..15] -+ uqsub v3.8h, v2.8h, v24.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ cmhs v2.8h, v24.8h, v2.8h // test a3[8..15] >= a0[8..15] -+ uqsub v17.8h, v5.8h, v20.8h // a0[0..7] >= a3[0..7] ? 
a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ cmhs v19.8h, v5.8h, v19.8h // test a0[0..7] >= pq -+ orr v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq -+ sub v6.8h, v21.8h, v6.8h // clip_sign[0..7] - a0_sign[0..7] -+ mul v3.8h, v3.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 -+ cmhs v5.8h, v20.8h, v5.8h // test a3[0..7] >= a0[0..7] -+ orr v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq -+ mul v0.8h, v17.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 -+ orr v2.16b, v7.16b, v2.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] -+ orr v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] -+ ushr v3.8h, v3.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 -+ mov w7, v2.s[1] -+ mov w8, v2.s[3] -+ ushr v0.8h, v0.8h, #3 // a0[0..7] >= a3[0..7] ? 
(5*(a0[0..7]-a3[0..7]))>>3 : 0 -+ mov w2, v5.s[1] // move to gp reg -+ cmhs v2.8h, v3.8h, v26.8h -+ mov w3, v5.s[3] -+ cmhs v5.8h, v0.8h, v23.8h -+ bsl v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15]) -+ and w9, w7, w8 -+ bsl v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7]) -+ and w10, w2, w3 -+ bic v0.16b, v2.16b, v7.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) -+ and w9, w10, w9 -+ bic v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) -+ mls v4.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 -+ tbnz w9, #0, 4f // none of the 16 pixel pairs should be updated in this case -+ mls v27.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 -+ mla v16.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 -+ sqxtun v2.8b, v4.8h -+ mla v18.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 -+ sqxtun v0.8b, v27.8h -+ sqxtun v1.8b, v16.8h -+ sqxtun v3.8b, v18.8h -+ tbnz w2, #0, 1f -+ st2 {v0.b, v1.b}[0], [x0], x1 -+ st2 {v0.b, v1.b}[1], [x0], x1 -+ st2 {v0.b, v1.b}[2], [x0], x1 -+ st2 {v0.b, v1.b}[3], [x0] -+1: tbnz w3, #0, 2f -+ st2 {v0.b, v1.b}[4], [x5], x1 -+ st2 {v0.b, v1.b}[5], [x5], x1 -+ st2 {v0.b, v1.b}[6], [x5], x1 -+ st2 {v0.b, v1.b}[7], [x5] -+2: tbnz w7, #0, 3f -+ st2 {v2.b, v3.b}[0], [x4], x1 -+ st2 {v2.b, v3.b}[1], [x4], x1 -+ st2 {v2.b, v3.b}[2], [x4], x1 -+ st2 {v2.b, v3.b}[3], [x4] -+3: tbnz w8, #0, 4f -+ st2 {v2.b, v3.b}[4], [x6], x1 -+ st2 {v2.b, v3.b}[5], [x6], x1 -+ st2 {v2.b, v3.b}[6], [x6], x1 -+ st2 
{v2.b, v3.b}[7], [x6] -+4: ret -+endfunc -+ -+// Copy at most the specified number of bytes from source to destination buffer, -+// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence -+// On entry: -+// x0 -> source buffer -+// w1 = max number of bytes to copy -+// x2 -> destination buffer, optimally 8-byte aligned -+// On exit: -+// w0 = number of bytes not copied -+function ff_vc1_unescape_buffer_helper_neon, export=1 -+ // Offset by 80 to screen out cases that are too short for us to handle, -+ // and also make it easy to test for loop termination, or to determine -+ // whether we need an odd number of half-iterations of the loop. -+ subs w1, w1, #80 -+ b.mi 90f -+ -+ // Set up useful constants -+ movi v20.4s, #3, lsl #24 -+ movi v21.4s, #3, lsl #16 -+ -+ tst w1, #32 -+ b.ne 1f -+ -+ ld1 {v0.16b, v1.16b, v2.16b}, [x0], #48 -+ ext v25.16b, v0.16b, v1.16b, #1 -+ ext v26.16b, v0.16b, v1.16b, #2 -+ ext v27.16b, v0.16b, v1.16b, #3 -+ ext v29.16b, v1.16b, v2.16b, #1 -+ ext v30.16b, v1.16b, v2.16b, #2 -+ ext v31.16b, v1.16b, v2.16b, #3 -+ bic v24.16b, v0.16b, v20.16b -+ bic v25.16b, v25.16b, v20.16b -+ bic v26.16b, v26.16b, v20.16b -+ bic v27.16b, v27.16b, v20.16b -+ bic v28.16b, v1.16b, v20.16b -+ bic v29.16b, v29.16b, v20.16b -+ bic v30.16b, v30.16b, v20.16b -+ bic v31.16b, v31.16b, v20.16b -+ eor v24.16b, v24.16b, v21.16b -+ eor v25.16b, v25.16b, v21.16b -+ eor v26.16b, v26.16b, v21.16b -+ eor v27.16b, v27.16b, v21.16b -+ eor v28.16b, v28.16b, v21.16b -+ eor v29.16b, v29.16b, v21.16b -+ eor v30.16b, v30.16b, v21.16b -+ eor v31.16b, v31.16b, v21.16b -+ cmeq v24.4s, v24.4s, #0 -+ cmeq v25.4s, v25.4s, #0 -+ cmeq v26.4s, v26.4s, #0 -+ cmeq v27.4s, v27.4s, #0 -+ add w1, w1, #32 -+ b 3f -+ -+1: ld1 {v3.16b, v4.16b, v5.16b}, [x0], #48 -+ ext v25.16b, v3.16b, v4.16b, #1 -+ ext v26.16b, v3.16b, v4.16b, #2 -+ ext v27.16b, v3.16b, v4.16b, #3 -+ ext v29.16b, v4.16b, v5.16b, #1 -+ ext v30.16b, v4.16b, v5.16b, #2 -+ ext v31.16b, v4.16b, v5.16b, 
#3 -+ bic v24.16b, v3.16b, v20.16b -+ bic v25.16b, v25.16b, v20.16b -+ bic v26.16b, v26.16b, v20.16b -+ bic v27.16b, v27.16b, v20.16b -+ bic v28.16b, v4.16b, v20.16b -+ bic v29.16b, v29.16b, v20.16b -+ bic v30.16b, v30.16b, v20.16b -+ bic v31.16b, v31.16b, v20.16b -+ eor v24.16b, v24.16b, v21.16b -+ eor v25.16b, v25.16b, v21.16b -+ eor v26.16b, v26.16b, v21.16b -+ eor v27.16b, v27.16b, v21.16b -+ eor v28.16b, v28.16b, v21.16b -+ eor v29.16b, v29.16b, v21.16b -+ eor v30.16b, v30.16b, v21.16b -+ eor v31.16b, v31.16b, v21.16b -+ cmeq v24.4s, v24.4s, #0 -+ cmeq v25.4s, v25.4s, #0 -+ cmeq v26.4s, v26.4s, #0 -+ cmeq v27.4s, v27.4s, #0 -+ // Drop through... -+2: mov v0.16b, v5.16b -+ ld1 {v1.16b, v2.16b}, [x0], #32 -+ cmeq v28.4s, v28.4s, #0 -+ cmeq v29.4s, v29.4s, #0 -+ cmeq v30.4s, v30.4s, #0 -+ cmeq v31.4s, v31.4s, #0 -+ orr v24.16b, v24.16b, v25.16b -+ orr v26.16b, v26.16b, v27.16b -+ orr v28.16b, v28.16b, v29.16b -+ orr v30.16b, v30.16b, v31.16b -+ ext v25.16b, v0.16b, v1.16b, #1 -+ orr v22.16b, v24.16b, v26.16b -+ ext v26.16b, v0.16b, v1.16b, #2 -+ ext v27.16b, v0.16b, v1.16b, #3 -+ ext v29.16b, v1.16b, v2.16b, #1 -+ orr v23.16b, v28.16b, v30.16b -+ ext v30.16b, v1.16b, v2.16b, #2 -+ ext v31.16b, v1.16b, v2.16b, #3 -+ bic v24.16b, v0.16b, v20.16b -+ bic v25.16b, v25.16b, v20.16b -+ bic v26.16b, v26.16b, v20.16b -+ orr v22.16b, v22.16b, v23.16b -+ bic v27.16b, v27.16b, v20.16b -+ bic v28.16b, v1.16b, v20.16b -+ bic v29.16b, v29.16b, v20.16b -+ bic v30.16b, v30.16b, v20.16b -+ bic v31.16b, v31.16b, v20.16b -+ addv s22, v22.4s -+ eor v24.16b, v24.16b, v21.16b -+ eor v25.16b, v25.16b, v21.16b -+ eor v26.16b, v26.16b, v21.16b -+ eor v27.16b, v27.16b, v21.16b -+ eor v28.16b, v28.16b, v21.16b -+ mov w3, v22.s[0] -+ eor v29.16b, v29.16b, v21.16b -+ eor v30.16b, v30.16b, v21.16b -+ eor v31.16b, v31.16b, v21.16b -+ cmeq v24.4s, v24.4s, #0 -+ cmeq v25.4s, v25.4s, #0 -+ cmeq v26.4s, v26.4s, #0 -+ cmeq v27.4s, v27.4s, #0 -+ cbnz w3, 90f -+ st1 {v3.16b, v4.16b}, [x2], #32 -+3: 
mov v3.16b, v2.16b -+ ld1 {v4.16b, v5.16b}, [x0], #32 -+ cmeq v28.4s, v28.4s, #0 -+ cmeq v29.4s, v29.4s, #0 -+ cmeq v30.4s, v30.4s, #0 -+ cmeq v31.4s, v31.4s, #0 -+ orr v24.16b, v24.16b, v25.16b -+ orr v26.16b, v26.16b, v27.16b -+ orr v28.16b, v28.16b, v29.16b -+ orr v30.16b, v30.16b, v31.16b -+ ext v25.16b, v3.16b, v4.16b, #1 -+ orr v22.16b, v24.16b, v26.16b -+ ext v26.16b, v3.16b, v4.16b, #2 -+ ext v27.16b, v3.16b, v4.16b, #3 -+ ext v29.16b, v4.16b, v5.16b, #1 -+ orr v23.16b, v28.16b, v30.16b -+ ext v30.16b, v4.16b, v5.16b, #2 -+ ext v31.16b, v4.16b, v5.16b, #3 -+ bic v24.16b, v3.16b, v20.16b -+ bic v25.16b, v25.16b, v20.16b -+ bic v26.16b, v26.16b, v20.16b -+ orr v22.16b, v22.16b, v23.16b -+ bic v27.16b, v27.16b, v20.16b -+ bic v28.16b, v4.16b, v20.16b -+ bic v29.16b, v29.16b, v20.16b -+ bic v30.16b, v30.16b, v20.16b -+ bic v31.16b, v31.16b, v20.16b -+ addv s22, v22.4s -+ eor v24.16b, v24.16b, v21.16b -+ eor v25.16b, v25.16b, v21.16b -+ eor v26.16b, v26.16b, v21.16b -+ eor v27.16b, v27.16b, v21.16b -+ eor v28.16b, v28.16b, v21.16b -+ mov w3, v22.s[0] -+ eor v29.16b, v29.16b, v21.16b -+ eor v30.16b, v30.16b, v21.16b -+ eor v31.16b, v31.16b, v21.16b -+ cmeq v24.4s, v24.4s, #0 -+ cmeq v25.4s, v25.4s, #0 -+ cmeq v26.4s, v26.4s, #0 -+ cmeq v27.4s, v27.4s, #0 -+ cbnz w3, 91f -+ st1 {v0.16b, v1.16b}, [x2], #32 -+ subs w1, w1, #64 -+ b.pl 2b -+ -+90: add w0, w1, #80 -+ ret -+ -+91: sub w1, w1, #32 -+ b 90b -+endfunc -diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c -index 2e9a3581de..d9571b437f 100644 ---- a/libavcodec/allcodecs.c -+++ b/libavcodec/allcodecs.c -@@ -153,6 +153,7 @@ extern AVCodec ff_hap_decoder; - extern AVCodec ff_hevc_decoder; - extern AVCodec ff_hevc_qsv_decoder; - extern AVCodec ff_hevc_rkmpp_decoder; -+extern AVCodec ff_hevc_rpi_decoder; - extern AVCodec ff_hevc_v4l2m2m_decoder; - extern AVCodec ff_hnm4_video_decoder; - extern AVCodec ff_hq_hqa_decoder; -@@ -917,6 +918,41 @@ static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID 
id) - } - } - -+static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt) -+{ -+ const enum AVPixelFormat *pf = p->pix_fmts; -+ -+ // Assume good if we lack info -+ if (pf == NULL) -+ return 1; -+ if (fmt == AV_PIX_FMT_NONE) -+ return 0; -+ -+ for (; *pf != AV_PIX_FMT_NONE; ++pf) { -+ if (*pf == fmt) -+ return 1; -+ } -+ return 0; -+} -+ -+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt) -+{ -+ const AVCodec *p, *experimental = NULL; -+ void *i = 0; -+ -+ id= remap_deprecated_codec_id(id); -+ while ((p = av_codec_iterate(&i))) { -+ if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) { -+ if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) { -+ experimental = p; -+ } else -+ return (AVCodec *)p; -+ } -+ p = p->next; -+ } -+ return (AVCodec *)experimental; -+} -+ - static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *)) - { - const AVCodec *p, *experimental = NULL; -diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index c4ab93aeeb..cd926f7b33 100644 ---- a/libavcodec/arm/Makefile -+++ b/libavcodec/arm/Makefile -@@ -39,6 +39,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ - arm/sbrdsp_init_arm.o - OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o - OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o -+OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \ -+ arm/rpi_hevcpred_init_arm.o - OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o - OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o - OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o -@@ -137,10 +139,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ - NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o - NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o - NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ -+ arm/hevcdsp_idct_neon.o \ - arm/hevcdsp_deblock_neon.o \ - arm/hevcdsp_idct_neon.o \ - 
arm/hevcdsp_qpel_neon.o \ - arm/hevcdsp_sao_neon.o -+NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_neon.o \ -+ arm/rpi_hevc_misc_neon.o \ -+ arm/rpi_hevcdsp_deblock_neon.o \ -+ arm/rpi_hevcdsp_idct_neon.o \ -+ arm/rpi_hevcdsp_res8_neon.o \ -+ arm/rpi_hevcdsp_res16_neon.o \ -+ arm/rpi_hevcdsp_sao_neon.o \ -+ arm/rpi_hevcpred_init_neon.o \ -+ arm/rpi_hevcpred_intra_angular_neon.o \ -+ arm/rpi_hevcpred_intra_dc_neon.o \ -+ arm/rpi_hevcpred_intra_filter_neon.o \ -+ arm/rpi_hevcpred_intra_hv_neon.o \ -+ arm/rpi_hevcpred_intra_planar_neon.o - NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o - NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ - arm/rv40dsp_neon.o -diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h -index fdbf86b45e..4755f20e2e 100644 ---- a/libavcodec/arm/cabac.h -+++ b/libavcodec/arm/cabac.h -@@ -26,83 +26,209 @@ - #include "libavutil/internal.h" - #include "libavcodec/cabac.h" - -+ - #define get_cabac_inline get_cabac_inline_arm - static av_always_inline int get_cabac_inline_arm(CABACContext *c, -- uint8_t *const state) -+ uint8_t *state) - { -- int bit; -- void *reg_b, *reg_c, *tmp; -+ const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128; -+ int bit, ptr, low, tmp1, tmp2; -+ __asm__ volatile ( -+ "ldr %[bit], [%[c], %[range_off]] \n\t" -+ "ldrb %[ptr], [%[state]] \n\t" -+ "sub %[tmp1], %[mlps_tables], %[lps_off] \n\t" -+ "and %[tmp2], %[bit], #0xc0 \n\t" -+ "add %[tmp1], %[tmp1], %[ptr] \n\t" -+ "ldr %[low], [%[c], %[low_off]] \n\t" -+ "ldrb %[tmp2], [%[tmp1], %[tmp2], lsl #1] \n\t" -+ "sub %[bit], %[bit], %[tmp2] \n\t" -+ "mov %[tmp1], %[bit] \n\t" -+ "cmp %[low], %[bit], lsl #17 \n\t" -+ "itt ge \n\t" -+ "movge %[tmp1], %[tmp2] \n\t" -+ "mvnge %[ptr], %[ptr] \n\t" -+ "clz %[tmp2], %[tmp1] \n\t" -+ "it ge \n\t" -+ "subge %[low], %[low], %[bit], lsl #17 \n\t" -+ "sub %[tmp2], %[tmp2], #23 \n\t" -+ "and %[bit], %[ptr], #1 \n\t" -+ "ldrb %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t" -+ 
"lsl %[low], %[low], %[tmp2] \n\t" -+ "lsls %[ptr], %[low], #16 \n\t" -+ "bne 1f \n\t" -+ "ldr %[ptr], [%[c], %[ptr_off]] \n\t" -+ "lsl %[tmp2], %[tmp1], %[tmp2] \n\t" -+#if UNCHECKED_BITSTREAM_READER -+ "strb %[mlps_tables], [%[state]] \n\t" -+ "rbit %[state], %[low] \n\t" -+ "ldrh %[tmp1], [%[ptr]], #2 \n\t" -+#else -+ "ldr %[tmp1], [%[c], %[end_off]] \n\t" -+ "strb %[mlps_tables], [%[state]] \n\t" -+ "rbit %[state], %[low] \n\t" -+ "cmp %[tmp1], %[ptr] \n\t" -+#if CONFIG_THUMB -+ "it cs \n\t" -+ "ldrhcs %[tmp1], [%[ptr]], #2 \n\t" -+#else -+ "ldrcsh %[tmp1], [%[ptr]], #2 \n\t" -+#endif -+#endif -+ "clz %[state], %[state] \n\t" -+ "movw %[mlps_tables], #0xffff \n\t" -+ "sub %[state], %[state], #16 \n\t" -+ "str %[tmp2], [%[c], %[range_off]] \n\t" -+ "rev %[tmp1], %[tmp1] \n\t" -+ "str %[ptr], [%[c], %[ptr_off]] \n\t" -+ "lsr %[tmp1], %[tmp1], #15 \n\t" -+ "sub %[tmp1], %[tmp1], %[mlps_tables] \n\t" -+#if CONFIG_THUMB -+ "lsl %[tmp1], %[tmp1], %[state] \n\t" -+ "add %[low], %[low], %[tmp1] \n\t" -+#else -+ "add %[low], %[low], %[tmp1], lsl %[state] \n\t" -+#endif -+ "str %[low], [%[c], %[low_off]] \n\t" -+ "b 2f \n\t" -+ "1: \n\t" -+ "strb %[mlps_tables], [%[state]] \n\t" -+ "lsl %[tmp1], %[tmp1], %[tmp2] \n\t" -+ "str %[low], [%[c], %[low_off]] \n\t" -+ "str %[tmp1], [%[c], %[range_off]] \n\t" -+ "2: \n\t" -+ : // Outputs -+ [state]"+r"(state), -+ [mlps_tables]"+r"(mlps_tables), -+ [bit]"=&r"(bit), -+ [ptr]"=&r"(ptr), -+ [low]"=&r"(low), -+ [tmp1]"=&r"(tmp1), -+ [tmp2]"=&r"(tmp2) -+ : // Inputs -+ [c]"r"(c), -+ [low_off]"J"(offsetof(CABACContext, low)), -+ [range_off]"J"(offsetof(CABACContext, range)), -+ [ptr_off]"J"(offsetof(CABACContext, bytestream)), -+ [end_off]"J"(offsetof(CABACContext, bytestream_end)), -+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ return bit; -+} - -- __asm__ volatile( -- "ldrb %[bit] , [%[state]] \n\t" -- "add %[r_b] , %[tables] , %[lps_off] \n\t" -- "mov %[tmp] , 
%[range] \n\t" -- "and %[range] , %[range] , #0xC0 \n\t" -- "add %[r_b] , %[r_b] , %[bit] \n\t" -- "ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t" -- "add %[r_b] , %[tables] , %[norm_off] \n\t" -- "sub %[r_c] , %[tmp] , %[range] \n\t" -- "lsl %[tmp] , %[r_c] , #17 \n\t" -- "cmp %[tmp] , %[low] \n\t" -- "it gt \n\t" -- "movgt %[range] , %[r_c] \n\t" -- "itt cc \n\t" -- "mvncc %[bit] , %[bit] \n\t" -- "subcc %[low] , %[low] , %[tmp] \n\t" -- "add %[r_c] , %[tables] , %[mlps_off] \n\t" -- "ldrb %[tmp] , [%[r_b], %[range]] \n\t" -- "ldrb %[r_b] , [%[r_c], %[bit]] \n\t" -- "lsl %[low] , %[low] , %[tmp] \n\t" -- "lsl %[range] , %[range] , %[tmp] \n\t" -- "uxth %[r_c] , %[low] \n\t" -- "strb %[r_b] , [%[state]] \n\t" -- "tst %[r_c] , %[r_c] \n\t" -- "bne 2f \n\t" -- "ldr %[r_c] , [%[c], %[byte]] \n\t" -+#define get_cabac_bypass get_cabac_bypass_arm -+static inline int get_cabac_bypass_arm(CABACContext * const c) -+{ -+ uint32_t low = c->low, range, ptr, tmp; -+ int rv; -+ __asm volatile ( -+ "ldr %[range] , [%[c], %[range_off]] \n\t" -+ "mov %[rv] , #0 \n\t" -+ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t" -+ "lsl %[low] , #1 \n\t" -+#if !UNCHECKED_BITSTREAM_READER -+ "ldr %[tmp] , [%[c], %[end_off]] \n\t" -+#endif -+ "cmp %[low] , %[range], lsl #17 \n\t" -+ "itt cs \n\t" -+ "subcs %[low] , %[low], %[range], lsl #17 \n\t" -+ "movcs %[rv] , #1 \n\t" - #if UNCHECKED_BITSTREAM_READER -- "ldrh %[tmp] , [%[r_c]] \n\t" -- "add %[r_c] , %[r_c] , #2 \n\t" -- "str %[r_c] , [%[c], %[byte]] \n\t" -+ "ldrh %[tmp] , [%[ptr]], #2 \n\t" -+#else -+ "cmp %[tmp] , %[ptr] \n\t" -+#if CONFIG_THUMB -+ "it cs \n\t" -+ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" - #else -- "ldr %[r_b] , [%[c], %[end]] \n\t" -- "ldrh %[tmp] , [%[r_c]] \n\t" -- "cmp %[r_c] , %[r_b] \n\t" -- "itt lt \n\t" -- "addlt %[r_c] , %[r_c] , #2 \n\t" -- "strlt %[r_c] , [%[c], %[byte]] \n\t" -+ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" -+#endif - #endif -- "sub %[r_c] , %[low] , #1 \n\t" -- "add %[r_b] , %[tables] , %[norm_off] \n\t" -- "eor 
%[r_c] , %[low] , %[r_c] \n\t" -- "rev %[tmp] , %[tmp] \n\t" -- "lsr %[r_c] , %[r_c] , #15 \n\t" -- "lsr %[tmp] , %[tmp] , #15 \n\t" -- "ldrb %[r_c] , [%[r_b], %[r_c]] \n\t" -- "movw %[r_b] , #0xFFFF \n\t" -- "sub %[tmp] , %[tmp] , %[r_b] \n\t" -- "rsb %[r_c] , %[r_c] , #7 \n\t" -- "lsl %[tmp] , %[tmp] , %[r_c] \n\t" -- "add %[low] , %[low] , %[tmp] \n\t" -- "2: \n\t" -- : [bit]"=&r"(bit), -- [low]"+&r"(c->low), -- [range]"+&r"(c->range), -- [r_b]"=&r"(reg_b), -- [r_c]"=&r"(reg_c), -- [tmp]"=&r"(tmp) -- : [c]"r"(c), -- [state]"r"(state), -- [tables]"r"(ff_h264_cabac_tables), -- [byte]"M"(offsetof(CABACContext, bytestream)), -- [end]"M"(offsetof(CABACContext, bytestream_end)), -- [norm_off]"I"(H264_NORM_SHIFT_OFFSET), -- [lps_off]"I"(H264_LPS_RANGE_OFFSET), -- [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128) -- : "memory", "cc" -- ); -+ "lsls %[range] , %[low], #16 \n\t" -+ "bne 1f \n\t" - -- return bit & 1; -+ "str %[ptr] , [%[c], %[ptr_off]] \n\t" -+ "rev %[tmp] , %[tmp] \n\t" -+ "add %[low] , %[low], %[tmp], lsr #15 \n\t" -+ "movw %[tmp] , 0xFFFF \n\t" -+ "sub %[low] , %[tmp] \n\t" -+ "1: \n\t" -+ "str %[low] , [%[c], %[low_off]] \n\t" -+ : // Outputs -+ [rv]"=&r"(rv), -+ [low]"+r"(low), -+ [range]"=&r"(range), -+ [ptr]"=&r"(ptr), -+ [tmp]"=&r"(tmp) -+ : // Inputs -+ [c]"r"(c), -+ [low_off]"J"(offsetof(CABACContext, low)), -+ [range_off]"J"(offsetof(CABACContext, range)), -+ [ptr_off]"J"(offsetof(CABACContext, bytestream)), -+ [end_off]"J"(offsetof(CABACContext, bytestream_end)) -+ : // Clobbers -+ "memory", "cc" -+ ); -+ return rv; - } -+ -+ -+#define get_cabac_bypass_sign get_cabac_bypass_sign_arm -+static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv) -+{ -+ uint32_t low = c->low, range, ptr, tmp; -+ __asm volatile ( -+ "ldr %[range] , [%[c], %[range_off]] \n\t" -+ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t" -+ "lsl %[low] , #1 \n\t" -+#if !UNCHECKED_BITSTREAM_READER -+ "ldr %[tmp] , [%[c], %[end_off]] \n\t" -+#endif -+ "cmp %[low] , %[range], lsl 
#17 \n\t" -+ "it cs \n\t" -+ "subcs %[low] , %[low], %[range], lsl #17 \n\t" -+ "it cc \n\t" -+ "rsbcc %[rv] , %[rv], #0 \n\t" -+#if UNCHECKED_BITSTREAM_READER -+ "ldrh %[tmp] , [%[ptr]], #2 \n\t" -+#else -+ "cmp %[tmp] , %[ptr] \n\t" -+#if CONFIG_THUMB -+ "it cs \n\t" -+ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" -+#else -+ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" -+#endif -+#endif -+ "lsls %[range] , %[low], #16 \n\t" -+ "bne 1f \n\t" -+ -+ "str %[ptr] , [%[c], %[ptr_off]] \n\t" -+ "rev %[tmp] , %[tmp] \n\t" -+ "add %[low] , %[low], %[tmp], lsr #15 \n\t" -+ "movw %[tmp] , 0xFFFF \n\t" -+ "sub %[low] , %[tmp] \n\t" -+ "1: \n\t" -+ "str %[low] , [%[c], %[low_off]] \n\t" -+ : // Outputs -+ [rv]"+r"(rv), -+ [low]"+r"(low), -+ [range]"=&r"(range), -+ [ptr]"=&r"(ptr), -+ [tmp]"=&r"(tmp) -+ : // Inputs -+ [c]"r"(c), -+ [low_off]"J"(offsetof(CABACContext, low)), -+ [range_off]"J"(offsetof(CABACContext, range)), -+ [ptr_off]"J"(offsetof(CABACContext, bytestream)), -+ [end_off]"J"(offsetof(CABACContext, bytestream_end)) -+ : // Clobbers -+ "memory", "cc" -+ ); -+ return rv; -+} -+ - #endif /* HAVE_ARMV6T2_INLINE */ - - #endif /* AVCODEC_ARM_CABAC_H */ -diff --git a/libavcodec/arm/rpi_hevc_cabac.h b/libavcodec/arm/rpi_hevc_cabac.h -new file mode 100644 -index 0000000000..c88dec6eff ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_cabac.h -@@ -0,0 +1,607 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_ARM_HEVC_CABAC_H -+#define AVCODEC_ARM_HEVC_CABAC_H -+ -+#include "config.h" -+#if HAVE_ARMV6T2_INLINE -+ -+#define hevc_mem_bits32 hevc_mem_bits32_arm -+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits) -+{ -+ unsigned int n; -+ __asm__ ( -+ "rev %[n], %[x] \n\t" -+ : [n]"=r"(n) -+ : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3))) -+ : -+ ); -+ return n << (bits & 7); -+} -+ -+ -+// --------------------------------------------------------------------------- -+// -+// Helper fns - little bits of code where ARM has an instraction that the -+// compiler doesn't know about / use -+ -+#define trans_scale_sat trans_scale_sat_arm -+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) -+{ -+ int rv; -+ int t = ((level * (int)(scale * scale_m)) >> shift) + 1; -+ -+ __asm__ ( -+ "ssat %[rv], #16, %[t], ASR #1 \n\t" -+ : [rv]"=r"(rv) -+ : [t]"r"(t) -+ : -+ ); -+ return rv; -+} -+ -+#define update_rice update_rice_arm -+static inline void update_rice_arm(uint8_t * const stat_coeff, -+ const unsigned int last_coeff_abs_level_remaining, -+ const unsigned int c_rice_param) -+{ -+ int t = last_coeff_abs_level_remaining << 1; -+ __asm__ ( -+ "lsrs %[t], %[t], %[shift] \n\t" -+ -+ "it eq \n\t" -+ "subeq %[stat], %[stat], #1 \n\t" -+ "cmp %[t], #6 \n\t" -+ "adc %[stat], %[stat], #0 \n\t" -+ "usat %[stat], #8, %[stat] \n\t" -+ : [stat]"+r"(*stat_coeff), -+ [t]"+r"(t) -+ : [shift]"r"(c_rice_param) -+ : "cc" -+ ); -+} -+ -+// --------------------------------------------------------------------------- -+// -+// CABAC get loops -+// -+// Where the loop is simple enough we can normally do 10-30% better than the 
-+// compiler -+ -+// Get the residual greater than 1 bits -+ -+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm -+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n, -+ uint8_t * const state0) -+{ -+ unsigned int i, reg_b, st, tmp, bit, rv; -+ __asm__ ( -+ "mov %[i] , #0 \n\t" -+ "mov %[rv] , #0 \n\t" -+ "1: \n\t" -+ "add %[i] , %[i] , #1 \n\t" -+ "cmp %[rv] , #0 \n\t" -+ "ite eq \n\t" -+ "usateq %[st] , #2 , %[i] \n\t" -+ "movne %[st] , #0 \n\t" -+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" -+ "and %[tmp] , %[range] , #0xC0 \n\t" -+ -+ "ldrb %[bit] , [%[state0], %[st]] \n\t" -+ "add %[r_b] , %[r_b] , %[bit] \n\t" -+ "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t" -+ "sub %[range] , %[range] , %[tmp] \n\t" -+ -+ "cmp %[low] , %[range], lsl #17 \n\t" -+ "ittt ge \n\t" -+ "subge %[low] , %[low] , %[range], lsl #17 \n\t" -+ "movge %[range] , %[tmp] \n\t" -+ "mvnge %[bit] , %[bit] \n\t" -+ -+ "clz %[tmp] , %[range] \n\t" -+ "sub %[tmp] , #23 \n\t" -+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" -+ "and %[bit] , %[bit] , #1 \n\t" -+ "strb %[r_b] , [%[state0], %[st]] \n\t" -+ "lsl %[low] , %[low] , %[tmp] \n\t" -+ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t" -+ "lsl %[range] , %[range] , %[tmp] \n\t" -+ -+// There is a small speed gain from combining both conditions, using a single -+// branch and then working out what that meant later -+ "lsls %[tmp] , %[low] , #16 \n\t" -+ "it ne \n\t" -+ "cmpne %[n] , %[i] \n\t" -+ "bne 1b \n\t" -+ -+// If reload is not required then we must have run out of flags to decode -+ "tst %[tmp] , %[tmp] \n\t" -+ "bne 2f \n\t" -+ -+// Do reload -+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" -+ "rbit %[bit] , %[low] \n\t" -+ "movw %[r_b] , #0xFFFF \n\t" -+ "clz %[bit] , %[bit] \n\t" -+ "rev %[tmp] , %[tmp] \n\t" -+ "sub %[bit] , %[bit] , #16 \n\t" -+ "cmp %[n] , %[i] \n\t" -+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" -+ -+#if CONFIG_THUMB -+ "lsl %[tmp] , %[tmp] , %[bit] \n\t" -+ "add 
%[low] , %[low] , %[tmp] \n\t" -+#else -+ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" -+#endif -+ -+ "bne 1b \n\t" -+ "2: \n\t" -+ : [bit]"=&r"(bit), -+ [low]"+r"(c->low), -+ [range]"+r"(c->range), -+ [r_b]"=&r"(reg_b), -+ [bptr]"+r"(c->bytestream), -+ [i]"=&r"(i), -+ [tmp]"=&r"(tmp), -+ [st]"=&r"(st), -+ [rv]"=&r"(rv) -+ : [state0]"r"(state0), -+ [n]"r"(n), -+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), -+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) -+ : "memory", "cc" -+ ); -+ return rv; -+} -+ -+ -+// n must be > 0 on entry -+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm -+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0, -+ unsigned int n, -+ const uint8_t * ctx_map, -+ uint8_t * p) -+{ -+ unsigned int reg_b, tmp, st, bit; -+ __asm__ ( -+// Get bin from map -+#if CONFIG_THUMB -+ "add %[ctx_map] , %[n] \n\t" -+ "ldrb %[st] , [%[ctx_map]] \n\t" -+#else -+ "ldrb %[st] , [%[ctx_map], %[n]]! \n\t" -+#endif -+ "1: \n\t" -+ -+// Load state & ranges -+ "ldrb %[bit] , [%[state0], %[st]] \n\t" -+ "and %[tmp] , %[range] , #0xC0 \n\t" -+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" -+ "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t" -+ "ldrb %[tmp] , [%[r_b], %[bit]] \n\t" -+ "sub %[range] , %[range] , %[tmp] \n\t" -+ -+ "cmp %[low] , %[range], lsl #17 \n\t" -+ "ittt ge \n\t" -+ "mvnge %[bit] , %[bit] \n\t" -+ "subge %[low] , %[low] , %[range], lsl #17 \n\t" -+ "movge %[range] , %[tmp] \n\t" -+ -+// Renorm -+ "clz %[tmp] , %[range] \n\t" -+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" -+ "sub %[tmp] , #23 \n\t" -+ "strb %[r_b] , [%[state0], %[st]] \n\t" -+ "tst %[bit] , #1 \n\t" -+ "ldrb %[st] , [%[ctx_map], #-1]! 
\n\t" -+ "lsl %[low] , %[low] , %[tmp] \n\t" -+// GCC asm seems to need strbne written differently for thumb and arm -+#if CONFIG_THUMB -+ "it ne \n\t" -+ "strbne %[n] , [%[idx]] , #1 \n\t" -+#else -+ "strneb %[n] , [%[idx]] , #1 \n\t" -+#endif -+ -+// There is a small speed gain from combining both conditions, using a single -+// branch and then working out what that meant later -+ "subs %[n] , %[n] , #1 \n\t" -+ "lsl %[range] , %[range] , %[tmp] \n\t" -+#if CONFIG_THUMB -+ "itt ne \n\t" -+ "lslsne %[tmp] , %[low] , #16 \n\t" -+#else -+ "lslnes %[tmp] , %[low] , #16 \n\t" -+#endif -+ "bne 1b \n\t" -+ -+// If we have bits left then n must be 0 so give up now -+ "lsls %[tmp] , %[low] , #16 \n\t" -+ "bne 2f \n\t" -+ -+// Do reload -+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" -+ "rbit %[bit] , %[low] \n\t" -+ "movw %[r_b] , #0xFFFF \n\t" -+ "clz %[bit] , %[bit] \n\t" -+ "cmp %[n] , #0 \n\t" -+ "rev %[tmp] , %[tmp] \n\t" -+ "sub %[bit] , %[bit] , #16 \n\t" -+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" -+ -+#if CONFIG_THUMB -+ "lsl %[tmp] , %[tmp] , %[bit] \n\t" -+ "add %[low] , %[low] , %[tmp] \n\t" -+#else -+ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" -+#endif -+ -+// Check to see if we still have more to do -+ "bne 1b \n\t" -+ "2: \n\t" -+ : [bit]"=&r"(bit), -+ [low]"+r"(c->low), -+ [range]"+r"(c->range), -+ [r_b]"=&r"(reg_b), -+ [bptr]"+r"(c->bytestream), -+ [idx]"+r"(p), -+ [n]"+r"(n), -+ [tmp]"=&r"(tmp), -+ [st]"=&r"(st), -+ [ctx_map]"+r"(ctx_map) -+ : [state0]"r"(state0), -+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), -+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) -+ : "memory", "cc" -+ ); -+ -+ return p; -+} -+ -+// --------------------------------------------------------------------------- -+// -+// CABAC_BY22 functions -+ -+ -+#define get_cabac_by22_start get_cabac_by22_start_arm -+static inline void get_cabac_by22_start_arm(CABACContext * const c) -+{ -+ const uint8_t *ptr = c->bytestream; -+ register 
uint32_t low __asm__("r1"), range __asm__("r2"); -+ uint32_t m, range8, bits; -+#if !USE_BY22_DIV -+ uintptr_t inv; -+#endif -+ -+ av_assert2(offsetof (CABACContext, low) == 0); -+ av_assert2(offsetof (CABACContext, range) == 4); -+ av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2); -+ __asm__ volatile ( -+ "ldmia %[c], {%[low], %[range]} \n\t" -+ : // Outputs -+ [low]"=r"(low), -+ [range]"=r"(range) -+ : // Inputs -+ [c]"r"(c) -+ : // Clobbers -+ ); -+#if !USE_BY22_DIV -+ inv = (uintptr_t)cabac_by22_inv_range; -+#endif -+ __asm__ volatile ( -+ "ldr %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t" -+#if !USE_BY22_DIV -+ "uxtb %[range8], %[range] \n\t" -+#endif -+ "rbit %[bits], %[low] \n\t" -+ "lsl %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t" -+ "clz %[bits], %[bits] \n\t" -+ "str %[ptr], [%[c], %[ptr_off]] \n\t" -+ "rev %[m], %[m] \n\t" -+ "rsb %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t" -+ "eor %[m], %[m], #0x80000000 \n\t" -+#if !USE_BY22_DIV -+ "ldr %[inv], [%[inv], %[range8], lsl #2] \n\t" -+ "pkhbt %[range], %[bits], %[range], lsl #16 \n\t" -+ "str %[range], [%[c], %[bits_off]] \n\t" -+#else -+ "strh %[bits], [%[c], %[bits_off]] \n\t" -+#endif -+#if CONFIG_THUMB -+ "lsr %[m], %[ptr] \n\t" -+ "eor %[range], %[low], %[m] \n\t" -+#else -+ "eor %[range], %[low], %[m], lsr %[ptr] \n\t" -+#endif -+ : // Outputs -+ [ptr]"+&r"(ptr), -+ [low]"+&r"(low), -+ [range]"+&r"(range), -+#if !USE_BY22_DIV -+ [inv]"+&r"(inv), -+#endif -+ [m]"=&r"(m), -+ [range8]"=&r"(range8), -+ [bits]"=&r"(bits) -+ : // Inputs -+ [c]"r"(c), -+ [bits_off]"J"(offsetof (CABACContext, by22.bits)), -+ [ptr_off]"J"(offsetof (CABACContext, bytestream)) -+ : // Clobbers -+ "memory" -+ ); -+ c->low = range; -+#if !USE_BY22_DIV -+ c->range = inv; -+#endif -+} -+ -+#define get_cabac_by22_peek get_cabac_by22_peek_arm -+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c) -+{ -+ uint32_t rv = c->low &~ 1, tmp; -+ 
__asm__ ( -+ "cmp %[inv] , #0 \n\t" -+ "it ne \n\t" -+ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t" -+ : // Outputs -+ [rv]"+r"(rv), -+ [tmp]"=r"(tmp) -+ : // Inputs -+ [inv]"r"(c->range) -+ : // Clobbers -+ "cc" -+ ); -+ return rv << 1; -+} -+ -+#define get_cabac_by22_flush get_cabac_by22_flush_arm -+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val) -+{ -+ uint32_t bits, ptr, tmp1, tmp2; -+ __asm__ volatile ( -+ "ldrh %[bits], [%[cc], %[bits_off]] \n\t" -+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" -+ "rsb %[tmp1], %[n], #32 \n\t" -+ "add %[bits], %[bits], %[n] \n\t" -+ "ldrh %[tmp2], [%[cc], %[range_off]] \n\t" -+ "lsr %[tmp1], %[val], %[tmp1] \n\t" -+ "ldr %[val], [%[cc], %[low_off]] \n\t" -+#if CONFIG_THUMB -+ "add %[ptr], %[ptr], %[bits], lsr #3 \n\t" -+ "ldr %[ptr], [%[ptr]] \n\t" -+#else -+ "ldr %[ptr], [%[ptr], %[bits], lsr #3] \n\t" -+#endif -+ "mul %[tmp1], %[tmp2], %[tmp1] \n\t" -+ "and %[tmp2], %[bits], #7 \n\t" -+ "strh %[bits], [%[cc], %[bits_off]] \n\t" -+ "rev %[ptr], %[ptr] \n\t" -+ "lsl %[tmp1], %[tmp1], #23 \n\t" -+#if CONFIG_THUMB -+ "lsl %[val], %[n] \n\t" -+ "sub %[val], %[tmp1] \n\t" -+#else -+ "rsb %[val], %[tmp1], %[val], lsl %[n] \n\t" -+#endif -+ "lsl %[ptr], %[ptr], %[tmp2] \n\t" -+ "orr %[val], %[val], %[ptr], lsr #9 \n\t" -+ "str %[val], [%[cc], %[low_off]] \n\t" -+ : // Outputs -+ [val]"+r"(val), -+ [bits]"=&r"(bits), -+ [ptr]"=&r"(ptr), -+ [tmp1]"=&r"(tmp1), -+ [tmp2]"=&r"(tmp2) -+ : // Inputs -+ [cc]"r"(c), -+ [n]"r"(n), -+ [bits_off]"J"(offsetof(CABACContext, by22.bits)), -+ [ptr_off]"J"(offsetof(CABACContext, bytestream)), -+ [range_off]"J"(offsetof(CABACContext, by22.range)), -+ [low_off]"J"(offsetof(CABACContext, low)) -+ : // Clobbers -+ "memory" -+ ); -+} -+ -+#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm -+static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param) -+{ -+ uint32_t 
last_coeff_abs_level_remaining; -+ uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2; -+ __asm__ volatile ( -+ "ldr %[remain], [%[cc], %[low_off]] \n\t" -+ "ldr %[prefix], [%[cc], %[range_off]] \n\t" -+ "bic %[remain], %[remain], #1 \n\t" -+ "ldrh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" -+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" -+ "cmp %[prefix], #0 \n\t" -+ "it ne \n\t" -+ "umullne %[prefix], %[remain], %[prefix], %[remain] \n\t" -+ "ldrh %[range], [%[cc], %[by22_range_off]] \n\t" -+ "lsl %[remain], %[remain], #1 \n\t" -+ "mvn %[prefix], %[remain] \n\t" -+ "clz %[prefix], %[prefix] \n\t" -+ "rsbs %[n1], %[prefix], #2 \n\t" -+ "bcc 1f \n\t" -+ "adc %[n1], %[rice], %[prefix] \n\t" -+ "add %[tmp2], %[tmp2], %[n1] \n\t" -+ "rsb %[n2], %[n1], #32 \n\t" -+ "and %[tmp1], %[tmp2], #7 \n\t" -+ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" -+ "lsr %[tmp2], %[tmp2], #3 \n\t" -+ "lsr %[n2], %[remain], %[n2] \n\t" -+ "mul %[n2], %[range], %[n2] \n\t" -+ "ldr %[range], [%[cc], %[low_off]] \n\t" -+ "ldr %[ptr], [%[ptr], %[tmp2]] \n\t" -+ "rsb %[tmp2], %[rice], #31 \n\t" -+ "lsl %[remain], %[remain], %[prefix] \n\t" -+ "lsl %[n2], %[n2], #23 \n\t" -+#if CONFIG_THUMB -+ "lsl %[range], %[n1] \n\t" -+ "sub %[range], %[n2] \n\t" -+#else -+ "rsb %[range], %[n2], %[range], lsl %[n1] \n\t" -+#endif -+ "rev %[ptr], %[ptr] \n\t" -+ "lsl %[n2], %[prefix], %[rice] \n\t" -+#if CONFIG_THUMB -+ "lsr %[remain], %[tmp2] \n\t" -+ "add %[remain], %[n2] \n\t" -+#else -+ "add %[remain], %[n2], %[remain], lsr %[tmp2] \n\t" -+#endif -+ "b 3f \n\t" -+ "1: \n\t" -+ "add %[n2], %[rice], %[prefix], lsl #1 \n\t" -+ "cmp %[n2], %[peek_bits_plus_2] \n\t" -+ "bhi 2f \n\t" -+ "sub %[n1], %[n2], #2 \n\t" -+ "add %[tmp2], %[tmp2], %[n1] \n\t" -+ "rsb %[n2], %[n1], #32 \n\t" -+ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" -+ "lsr %[tmp1], %[tmp2], #3 \n\t" -+ "lsr %[n2], %[remain], %[n2] \n\t" -+ "mul %[n2], %[range], %[n2] \n\t" -+ "rsb %[range], %[rice], #34 \n\t" -+ "ldr %[ptr], [%[ptr], %[tmp1]] \n\t" -+ 
"and %[tmp1], %[tmp2], #7 \n\t" -+ "lsl %[remain], %[remain], %[prefix] \n\t" -+ "ldr %[tmp2], [%[cc], %[low_off]] \n\t" -+ "rsb %[prefix], %[prefix], %[range] \n\t" -+ "orr %[remain], %[remain], #0x80000000 \n\t" -+ "rev %[ptr], %[ptr] \n\t" -+ "lsl %[n2], %[n2], #23 \n\t" -+ "mov %[range], #2 \n\t" -+#if CONFIG_THUMB -+ "lsl %[tmp2], %[n1] \n\t" -+ "sub %[tmp2], %[n2] \n\t" -+#else -+ "rsb %[tmp2], %[n2], %[tmp2], lsl %[n1] \n\t" -+#endif -+ "lsl %[ptr], %[ptr], %[tmp1] \n\t" -+ "lsl %[rice], %[range], %[rice] \n\t" -+ "orr %[range], %[tmp2], %[ptr], lsr #9 \n\t" -+#if CONFIG_THUMB -+ "lsr %[remain], %[prefix] \n\t" -+ "add %[remain], %[rice] \n\t" -+#else -+ "add %[remain], %[rice], %[remain], lsr %[prefix] \n\t" -+#endif -+ "b 4f \n\t" -+ "2: \n\t" -+ "add %[n1], %[tmp2], %[prefix] \n\t" -+#if CONFIG_THUMB -+ "add %[tmp2], %[ptr], %[n1], lsr #3 \n\t" -+ "ldr %[tmp2], [%[tmp2]] \n\t" -+#else -+ "ldr %[tmp2], [%[ptr], %[n1], lsr #3] \n\t" -+#endif -+ "rsb %[tmp1], %[prefix], #32 \n\t" -+ "push {%[rice]} \n\t" -+ "and %[rice], %[n1], #7 \n\t" -+ "lsr %[tmp1], %[remain], %[tmp1] \n\t" -+ "ldr %[ptr], [%[cc], %[low_off]] \n\t" -+ "mul %[remain], %[range], %[tmp1] \n\t" -+ "rev %[tmp2], %[tmp2] \n\t" -+ "rsb %[n2], %[prefix], %[n2] \n\t" -+ "ldr %[tmp1], [%[cc], %[range_off]] \n\t" -+ "lsl %[rice], %[tmp2], %[rice] \n\t" -+ "sub %[tmp2], %[n2], #2 \n\t" -+ "lsl %[remain], %[remain], #23 \n\t" -+#if CONFIG_THUMB -+ "lsl %[ptr], %[prefix] \n\t" -+ "rsb %[remain], %[ptr] \n\t" -+#else -+ "rsb %[remain], %[remain], %[ptr], lsl %[prefix] \n\t" -+#endif -+ "orr %[remain], %[remain], %[rice], lsr #9 \n\t" -+ "add %[prefix], %[n1], %[tmp2] \n\t" -+ "bic %[n1], %[remain], #1 \n\t" -+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" -+ "cmp %[tmp1], #0 \n\t" -+ "rsb %[rice], %[tmp2], #32 \n\t" -+ "it ne \n\t" -+ "umullne %[tmp1], %[n1], %[tmp1], %[n1] \n\t" -+ "and %[tmp1], %[prefix], #7 \n\t" -+#if CONFIG_THUMB -+ "add %[ptr], %[ptr], %[prefix], lsr #3 \n\t" -+ "ldr %[ptr], [%[ptr]] 
\n\t" -+#else -+ "ldr %[ptr], [%[ptr], %[prefix], lsr #3] \n\t" -+#endif -+ "lsl %[n1], %[n1], #1 \n\t" -+ "lsr %[rice], %[n1], %[rice] \n\t" -+ "rsb %[n2], %[n2], #34 \n\t" -+ "mul %[range], %[range], %[rice] \n\t" -+ "pop {%[rice]} \n\t" -+ "rev %[ptr], %[ptr] \n\t" -+ "orr %[n1], %[n1], #0x80000000 \n\t" -+ "strh %[prefix], [%[cc], %[by22_bits_off]] \n\t" -+ "mov %[prefix], #2 \n\t" -+ "lsl %[range], %[range], #23 \n\t" -+#if CONFIG_THUMB -+ "lsl %[remain], %[tmp2] \n\t" -+ "rsb %[range], %[remain] \n\t" -+#else -+ "rsb %[range], %[range], %[remain], lsl %[tmp2] \n\t" -+#endif -+ "lsl %[remain], %[prefix], %[rice] \n\t" -+#if CONFIG_THUMB -+ "lsr %[n1], %[n2] \n\t" -+ "add %[remain], %[n1] \n\t" -+#else -+ "add %[remain], %[remain], %[n1], lsr %[n2] \n\t" -+#endif -+ "3: \n\t" -+ "lsl %[ptr], %[ptr], %[tmp1] \n\t" -+ "orr %[range], %[range], %[ptr], lsr #9 \n\t" -+ "4: \n\t" -+ "str %[range], [%[cc], %[low_off]] \n\t" -+ : // Outputs -+ [remain]"=&r"(last_coeff_abs_level_remaining), -+ [rice]"+r"(rice_param), -+ [prefix]"=&r"(prefix), -+ [n1]"=&r"(n1), -+ [range]"=&r"(range), -+ [n2]"=&r"(n2), -+ [ptr]"=&r"(ptr), -+ [tmp1]"=&r"(tmp1), -+ [tmp2]"=&r"(tmp2) -+ : // Inputs -+ [cc]"r"(c), -+ [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2), -+ [low_off]"J"(offsetof(CABACContext, low)), -+ [range_off]"J"(offsetof(CABACContext, range)), -+ [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)), -+ [by22_range_off]"J"(offsetof(CABACContext, by22.range)), -+ [ptr_off]"J"(offsetof(CABACContext, bytestream)) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ return last_coeff_abs_level_remaining; -+} -+ -+#endif /* HAVE_ARMV6T2_INLINE */ -+ -+#endif /* AVCODEC_ARM_HEVC_CABAC_H */ -diff --git a/libavcodec/arm/rpi_hevc_idct_fn_neon.S b/libavcodec/arm/rpi_hevc_idct_fn_neon.S -new file mode 100644 -index 0000000000..978b7b6947 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S -@@ -0,0 +1,183 @@ -+/* -+ * ARM NEON optimised IDCT functions for HEVC decoding -+ * Copyright (c) 
2014 Seppo Tomperi -+ * Copyright (C) 2018 John Cox, ben Avison for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+@ Included multiple times from hevc_idct_neon.S -+@ Macros defined there -+ -+#define DC_SHIFT (15 - BIT_DEPTH) -+#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) -+#define TRN_SHIFT (20 - BIT_DEPTH) -+ -+function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1 -+ ldrsh r1, [r0] -+ add r1, #DC_ADD -+ asr r1, #DC_SHIFT -+ vdup.16 q0, r1 -+ vdup.16 q1, r1 -+ vst1.16 {q0, q1}, [r0] -+ bx lr -+endfunc -+ -+function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1 -+ ldrsh r1, [r0] -+ add r2, r0, #32 -+ mov r3, #64 -+ add r1, #DC_ADD -+ asr r1, #DC_SHIFT -+ vdup.16 q8, r1 -+ vdup.16 q9, r1 -+ vst1.16 {q8, q9}, [r0], r3 -+ vst1.16 {q8, q9}, [r2], r3 -+ vst1.16 {q8, q9}, [r0] -+ vst1.16 {q8, q9}, [r2] -+ bx lr -+endfunc -+ -+function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1 -+ ldrsh r1, [r0] -+ add r2, r0, #32 -+ mov r3, #64 -+ add r1, #DC_ADD -+ mov ip, #16*16 -+ asr r1, #DC_SHIFT -+ vdup.16 q8, r1 -+ vdup.16 q9, r1 -+1: vst1.16 {q8, q9}, [r0], r3 -+ subs ip, ip, #32 -+ vst1.16 {q8, q9}, [r2], r3 -+ bhi 1b -+ bx lr -+endfunc -+ -+function 
JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1 -+ ldrsh r1, [r0] -+ add r2, r0, #32 -+ mov r3, #64 -+ add r1, #DC_ADD -+ mov ip, #32*32 -+ asr r1, #DC_SHIFT -+ vdup.16 q8, r1 -+ vdup.16 q9, r1 -+1: vst1.16 {q8, q9}, [r0], r3 -+ subs ip, ip, #32 -+ vst1.16 {q8, q9}, [r2], r3 -+ bhi 1b -+ bx lr -+endfunc -+ -+ -+function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1 -+ vldr.i32 s0, =0x00240053 // 36 and 83 -+ vld1.16 {q14, q15}, [r0 :256] // coeffs -+ -+ tr4_shift #7 -+ -+ vzip.16 d28, d29 -+ vzip.16 d30, d31 -+ vzip.32 q14, q15 -+ -+ tr4_shift #TRN_SHIFT -+ -+ vst4.16 {q14, q15}, [r0 :256] -+ bx lr -+ -+ .ltorg -+endfunc -+ -+ -+ -+function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1 -+ vmov.i32 d0, #0x4a // 74 -+ vld1.16 {q14, q15}, [r0 :256] // coeffs -+ vmov.i32 d1, #0x1d // 29 -+ vmov.i32 d2, #0x37 // 55 -+ -+ tr4_luma_shift #7 -+ -+ vzip.16 d28, d29 -+ vzip.16 d30, d31 -+ vzip.32 q14, q15 -+ -+ tr4_luma_shift #TRN_SHIFT -+ -+ vst4.16 {q14, q15}, [r0 :256] -+ bx lr -+endfunc -+ -+function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1 -+ add r2, r0, #16 -+ adr r3, tr4f -+ vpush {d8-d15} -+ vld1.16 {d0, d1}, [r3] -+ mov r3, #32 -+ -+ tr8_vert d16, d17, d18, d19, d24, d25, d26, d27, q8, q9, \ -+ "sub r0, r0, #128-8", \ -+ "sub r2, r2, #128-8", \ -+ "cmp r1, #4" -+ ble 2f -+ -+ tr8_vert d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \ -+ "sub r0, r0, #128+8", \ -+ "sub r2, r2, #128+8+16-32", \ -+ "mov r3, #64" -+ -+ vzip.16 d16, d17 -+ vzip.16 d18, d19 -+ -+ vzip.16 d20, d21 -+ vzip.16 d22, d23 -+ vzip.16 d28, d29 -+ vzip.16 d30, d31 -+ vzip.32 q10, q11 -+ vzip.32 q14, q15 -+1: -+ vzip.16 d24, d25 -+ vzip.16 d26, d27 -+ vzip.32 q8, q9 -+ vzip.32 q12, q13 -+ -+ tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8, q9, TRN_SHIFT -+ tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT -+ -+ vpop {d8-d15} -+ bx lr -+ -+2: vmov.i64 q10, #0 -+ sub r0, r0, #8 -+ vmov.i64 q11, #0 -+ sub r2, r2, 
#8+16-32 -+ vmov.i64 q14, #0 -+ mov r3, #64 -+ vmov.i64 q15, #0 -+ -+ vzip.16 d16, d17 -+ vzip.16 d18, d19 -+ -+ b 1b -+ -+endfunc -+ -+#undef DC_SHIFT -+#undef DC_ADD -+#undef TRN_SHIFT -+ -diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S -new file mode 100644 -index 0000000000..161bb0d7c9 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_misc_neon.S -@@ -0,0 +1,267 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Written by John Cox, Ben Avison -+*/ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+@ rpi_zap_coeff_vals_neon( -+@ uint16_t * buf, [r0] -+@ unsigned int log_n_m2) [r1] -+ -+function rpi_zap_coeff_vals_neon, export=1 -+ mov ip, #1 -+ vmov.i64 q0, #0 -+ teq r1, #0 -+ vmov.i64 q1, #0 -+ beq 2f -+ -+ lsl ip, r1 @ 2, 4 or 8 -+ add r2, r0, #32 -+ lsl ip, r1 @ 4, 16 or 64 = number of 32-byte blocks to zero -+ mov r3, #64 -+1: vst1.8 {q0,q1}, [r0:256], r3 -+ subs ip, #2 -+ vst1.8 {q0,q1}, [r2:256], r3 -+ bne 1b -+ bx lr -+ -+2: vst1.8 {q0,q1}, [r0:256] -+ bx lr -+endfunc -+ -+@ PIC jump tables are more expensive than absolute for A32 code -+.set jent_pic, CONFIG_PIC || CONFIG_THUMB -+ -+@ Jump table entry - if in neon mode the bottom bit must be set -+@ ? There is probably a real asm instruction to do this but I haven't found it -+.macro jent lab -+.if jent_pic -+T .short ((0 + \lab) - (0 + 98b)) / 2 -+A .short (0 + \lab) - (4 + 98b) -+.else -+T .word 1 + \lab -+A .word \lab -+.endif -+.endm -+ -+.set expected_next, 0 -+ -+.macro cpy_compound val, p1, p2, drop_thru=0 -+.if \p1 + \p2 != \val -+.error "Bad addition! 
\p1 + \p2 != \val" -+.endif -+.if expected_next != 0 && expected_next != \val -+.error "Drop thru failure" -+.endif -+\val\(): -+ push {r0-r3} -+ bl 100\p1\()b -+ pop {r0-r3} -+ add r0, #\p1 -+ add r2, #\p1 -+.if \drop_thru == 0 -+ b \p2\()b -+.set expected_next, 0 -+.else -+.set expected_next, \p2 -+.endif -+.endm -+ -+@ ff_hevc_cpy_blks8x4_neon( -+@ dst [r0] -+@ dst_stride [r1] -+@ src [r2] -+@ src_stride [r3] -+@ width [sp, #0] (bytes) -+@ height) [sp, #4] -+@ -+@ Power of 2 widths are directly coded, all others are done in stripes -+@ We expect the vast majority of calls to be power of 2 -+@ -+@ Currently has min width of 8, but we could make that 4 without issue -+@ Min height is 4 -+ -+function ff_hevc_rpi_cpy_blks8x4_neon, export=1 -+ ldr r12, [sp, #0] -+ push {r11, lr} -+.if jent_pic -+A adr lr, 98f - 2 -+.else -+A adr lr, 98f - 4 -+.endif -+ lsr r12, #3 -+ ldr r11, [sp, #(8 + 4)] -+.if jent_pic -+A lsl r12, #1 -+A ldrsh lr, [lr, r12] -+A add pc, lr -+T tbh [pc, r12, lsl #1] -+.else -+ @ A32 only, Thumb is always PIC -+ ldr pc, [lr, r12, lsl #2] -+.endif -+ -+98: -+T .short 0 @ unused -+ jent 8f -+ jent 16f -+ jent 24f -+ jent 32f -+ jent 40f -+ jent 48f -+ jent 56f -+ jent 64f -+ jent 72f -+ jent 80f -+ jent 88f -+ jent 96f -+ jent 104f -+ jent 112f -+ jent 120f -+ jent 128f -+ -+1008: -+ push {r11, lr} -+8: -+ add lr, r2, r3 -+ lsl r3, #1 -+ add r12, r0, r1 -+ lsl r1, #1 -+1: -+ vld1.32 {d0 }, [r2], r3 -+ vld1.32 {d1 }, [lr], r3 -+ vld1.32 {d2 }, [r2], r3 -+ vld1.32 {d3 }, [lr], r3 -+ subs r11, #4 -+ vst1.32 {d0 }, [r0], r1 -+ vst1.32 {d1 }, [r12], r1 -+ vst1.32 {d2 }, [r0], r1 -+ vst1.32 {d3 }, [r12], r1 -+ bgt 1b -+ pop {r11, pc} -+ -+10016: -+ push {r11, lr} -+16: -+ add lr, r2, r3 -+ lsl r3, #1 -+ add r12, r0, r1 -+ lsl r1, #1 -+1: -+ vld1.32 {q0 }, [r2], r3 -+ vld1.32 {q1 }, [lr], r3 -+ vld1.32 {q2 }, [r2], r3 -+ vld1.32 {q3 }, [lr], r3 -+ subs r11, #4 -+ vst1.32 {q0 }, [r0], r1 -+ vst1.32 {q1 }, [r12], r1 -+ vst1.32 {q2 }, [r0], r1 -+ vst1.32 {q3 }, 
[r12], r1 -+ bgt 1b -+ pop {r11, pc} -+ -+10032: -+ push {r11, lr} -+32: -+ add lr, r2, r3 -+ lsl r3, #1 -+ add r12, r0, r1 -+ lsl r1, #1 -+1: -+ vld1.32 {q8, q9 }, [r2], r3 -+ vld1.32 {q10, q11}, [lr], r3 -+ vld1.32 {q12, q13}, [r2], r3 -+ vld1.32 {q14, q15}, [lr], r3 -+ subs r11, #4 -+ vst1.32 {q8, q9 }, [r0], r1 -+ vst1.32 {q10, q11}, [r12], r1 -+ vst1.32 {q12, q13}, [r0], r1 -+ vst1.32 {q14, q15}, [r12], r1 -+ bgt 1b -+ pop {r11, pc} -+ -+10064: -+ push {r11, lr} -+64: -+ add lr, r2, #32 -+ add r12, r0, #32 -+1: -+ vld1.32 {q8, q9 }, [r2], r3 -+ vld1.32 {q10, q11}, [lr], r3 -+ vld1.32 {q12, q13}, [r2], r3 -+ vld1.32 {q14, q15}, [lr], r3 -+ subs r11, #2 -+ vst1.32 {q8, q9 }, [r0], r1 -+ vst1.32 {q10, q11}, [r12], r1 -+ vst1.32 {q12, q13}, [r0], r1 -+ vst1.32 {q14, q15}, [r12], r1 -+ bgt 1b -+ pop {r11, pc} -+ -+128: -+ push {r4, r5} -+ @ We could do this with fewer registers if we jump around but I -+ @ have a primative urge to load sequentially -+ mov r4, #64 -+ add lr, r2, #32 -+ add r12, r0, #32 -+ sub r3, r4 -+ sub r1, r4 -+1: -+ vld1.32 {q8, q9 }, [r2], r4 -+ vld1.32 {q10, q11}, [lr], r4 -+ vld1.32 {q12, q13}, [r2], r3 -+ vld1.32 {q14, q15}, [lr], r3 -+ subs r11, #1 -+ vst1.32 {q8, q9 }, [r0], r4 -+ vst1.32 {q10, q11}, [r12], r4 -+ vst1.32 {q12, q13}, [r0], r1 -+ vst1.32 {q14, q15}, [r12], r1 -+ bgt 1b -+ pop {r4, r5, r11, pc} -+ -+@ Use drop_thru where we can -+cpy_compound 104, 64, 40, 1 -+cpy_compound 40, 32, 8 -+ -+cpy_compound 112, 64, 48, 1 -+cpy_compound 48, 32, 16 -+ -+cpy_compound 120, 64, 56, 1 -+cpy_compound 56, 32, 24, 1 -+cpy_compound 24, 16, 8 -+ -+cpy_compound 72, 64, 8 -+cpy_compound 80, 64, 16 -+cpy_compound 88, 64, 24 -+cpy_compound 96, 64, 32 -+ -+ -+endfunc -+ -diff --git a/libavcodec/arm/rpi_hevc_misc_neon.h b/libavcodec/arm/rpi_hevc_misc_neon.h -new file mode 100644 -index 0000000000..9d21f6a882 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_misc_neon.h -@@ -0,0 +1,438 @@ -+/* -+ * This file is part of FFmpeg. 
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H -+#define AVCODEC_ARM_RPI_HEVC_MISC_H -+ -+#include "config.h" -+#if HAVE_NEON_INLINE && !CONFIG_THUMB -+ -+static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_src) -+{ -+ const uint8_t *src2 = src + stride_src; -+ stride_src <<= 1; -+ switch (pixel_shift) -+ { -+ case 2: -+ __asm__ volatile ( -+ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" -+ "subs %[height], #4 \n\t" -+ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.32 {d2[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d2[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.32 {d3[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d3[1]}, [%[src2]], %[stride_src] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.32 {q0}, [%[dst]]! \n\t" -+ "beq 3f \n\t" -+ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.32 {q1}, [%[dst]]! 
\n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vst1.32 {q0}, [%[dst]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vst1.32 {q1}, [%[dst]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [src]"+r"(src), -+ [src2]"+r"(src2), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ case 1: -+ __asm__ volatile ( -+ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" -+ "subs %[height], #4 \n\t" -+ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.16 {d2[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d3[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.16 {d2[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d3[1]}, [%[src2]], %[stride_src] \n\t" -+ "vzip.16 d0, d1 \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.16 {d0}, [%[dst]]! \n\t" -+ "beq 3f \n\t" -+ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "vzip.16 d2, d3 \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.16 {d2}, [%[dst]]! 
\n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vzip.16 d0, d1 \n\t" -+ "vst1.16 {d0}, [%[dst]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vzip.16 d2, d3 \n\t" -+ "vst1.16 {d2}, [%[dst]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [src]"+r"(src), -+ [src2]"+r"(src2), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ default: -+ __asm__ volatile ( -+ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" -+ "subs %[height], #8 \n\t" -+ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.8 {d2[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d3[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d2[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d3[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d2[2]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d3[2]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d2[3]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d3[3]}, [%[src2]], %[stride_src] \n\t" -+ "vzip.8 d0, d1 \n\t" -+ "subs %[height], #8 \n\t" -+ "vst1.8 {d0}, [%[dst]]! \n\t" -+ "beq 3f \n\t" -+ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" -+ "vzip.8 d2, d3 \n\t" -+ "subs %[height], #8 \n\t" -+ "vst1.8 {d2}, [%[dst]]! 
\n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vzip.8 d0, d1 \n\t" -+ "vst1.8 {d0}, [%[dst]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vzip.8 d2, d3 \n\t" -+ "vst1.8 {d2}, [%[dst]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [src]"+r"(src), -+ [src2]"+r"(src2), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ } -+} -+ -+static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_dst) -+{ -+ uint8_t *dst2 = dst + stride_dst; -+ stride_dst <<= 1; -+ switch (pixel_shift) -+ { -+ case 2: -+ __asm__ volatile ( -+ "subs %[height], #4 \n\t" -+ "vld1.32 {q0}, [%[src]]! \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.32 {q1}, [%[src]]! \n\t" -+ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.32 {d1[0]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.32 {d1[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "beq 3f \n\t" -+ "vld1.32 {q0}, [%[src]]! 
\n\t" -+ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.32 {d3[0]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.32 {d3[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.32 {d1[0]}, [%[dst]] \n\t" -+ "vst1.32 {d1[1]}, [%[dst2]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.32 {d3[0]}, [%[dst]] \n\t" -+ "vst1.32 {d3[1]}, [%[dst2]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [dst]"+r"(dst), -+ [dst2]"+r"(dst2), -+ [src]"+r"(src), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ case 1: -+ __asm__ volatile ( -+ "subs %[height], #4 \n\t" -+ "vld1.16 {d0}, [%[src]]! \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.16 {d2}, [%[src]]! \n\t" -+ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.16 {d0[2]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.16 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "beq 3f \n\t" -+ "vld1.16 {d0}, [%[src]]! 
\n\t" -+ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.16 {d2[2]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.16 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.16 {d0[2]}, [%[dst]] \n\t" -+ "vst1.16 {d0[3]}, [%[dst2]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.16 {d2[2]}, [%[dst]] \n\t" -+ "vst1.16 {d2[3]}, [%[dst2]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [dst]"+r"(dst), -+ [dst2]"+r"(dst2), -+ [src]"+r"(src), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ default: -+ __asm__ volatile ( -+ "subs %[height], #8 \n\t" -+ "vld1.8 {d0}, [%[src]]! \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.8 {d2}, [%[src]]! \n\t" -+ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[6]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #8 \n\t" -+ "vst1.8 {d0[7]}, [%[dst2]], %[stride_dst] \n\t" -+ "beq 3f \n\t" -+ "vld1.8 {d0}, [%[src]]! 
\n\t" -+ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[6]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #8 \n\t" -+ "vst1.8 {d2[7]}, [%[dst2]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[6]}, [%[dst]] \n\t" -+ "vst1.8 {d0[7]}, [%[dst2]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[6]}, [%[dst]] \n\t" -+ "vst1.8 {d2[7]}, [%[dst2]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [dst]"+r"(dst), -+ [dst2]"+r"(dst2), -+ [src]"+r"(src), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ } -+} -+ -+static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src) -+{ -+ int x, y; -+ switch (pixel_shift) -+ { -+ case 2: -+ __asm__ volatile ( -+ "ldr %[x], [%[src]], %[stride_src] \n\t" -+ "ldr %[y], [%[src]], %[stride_src] \n\t" -+ "str %[x], [%[dst]], %[stride_dst] \n\t" -+ "sub %[height], #2 \n\t" -+ "1: \n\t" -+ "ldr %[x], [%[src]], %[stride_src] \n\t" -+ "str %[y], [%[dst]], %[stride_dst] \n\t" -+ "ldr %[y], [%[src]], 
%[stride_src] \n\t" -+ "subs %[height], #2 \n\t" -+ "str %[x], [%[dst]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "str %[y], [%[dst]] \n\t" -+ : // Outputs -+ [x]"=&r"(x), -+ [y]"=&r"(y), -+ [src]"+r"(src), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src), -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ case 1: -+ __asm__ volatile ( -+ "ldrh %[x], [%[src]], %[stride_src] \n\t" -+ "ldrh %[y], [%[src]], %[stride_src] \n\t" -+ "strh %[x], [%[dst]], %[stride_dst] \n\t" -+ "sub %[height], #2 \n\t" -+ "1: \n\t" -+ "ldrh %[x], [%[src]], %[stride_src] \n\t" -+ "strh %[y], [%[dst]], %[stride_dst] \n\t" -+ "ldrh %[y], [%[src]], %[stride_src] \n\t" -+ "subs %[height], #2 \n\t" -+ "strh %[x], [%[dst]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "strh %[y], [%[dst]] \n\t" -+ : // Outputs -+ [x]"=&r"(x), -+ [y]"=&r"(y), -+ [src]"+r"(src), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src), -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ default: -+ __asm__ volatile ( -+ "ldrb %[x], [%[src]], %[stride_src] \n\t" -+ "ldrb %[y], [%[src]], %[stride_src] \n\t" -+ "strb %[x], [%[dst]], %[stride_dst] \n\t" -+ "sub %[height], #2 \n\t" -+ "1: \n\t" -+ "ldrb %[x], [%[src]], %[stride_src] \n\t" -+ "strb %[y], [%[dst]], %[stride_dst] \n\t" -+ "ldrb %[y], [%[src]], %[stride_src] \n\t" -+ "subs %[height], #2 \n\t" -+ "strb %[x], [%[dst]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "strb %[y], [%[dst]] \n\t" -+ : // Outputs -+ [x]"=&r"(x), -+ [y]"=&r"(y), -+ [src]"+r"(src), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src), -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ } -+} -+ -+#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon -+static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_dst, 
ptrdiff_t stride_src) -+{ -+ if (stride_dst == 1 << pixel_shift) -+ ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src); -+ else if (stride_src == 1 << pixel_shift) -+ ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst); -+ else -+ ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src); -+} -+ -+#endif /* HAVE_NEON_INLINE */ -+ -+#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */ -diff --git a/libavcodec/arm/rpi_hevc_mv_arm.h b/libavcodec/arm/rpi_hevc_mv_arm.h -new file mode 100644 -index 0000000000..325c26a49b ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_mv_arm.h -@@ -0,0 +1,93 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Written by John Cox, Ben Avison -+*/ -+ -+#ifndef AVCODEC_ARM_RPI_HEVC_MV_H -+#define AVCODEC_ARM_RPI_HEVC_MV_H -+ -+#if HAVE_ARMV6T2_INLINE -+static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b) -+{ -+ MvXY r; -+ __asm__ ( -+ "sadd16 %[r], %[a], %[b] \n\t" -+ : [r]"=r"(r) -+ : [a]"r"(a), -+ [b]"r"(b) -+ : -+ ); -+ return r; -+} -+#define mvxy_add mvxy_add_arm -+#endif -+ -+#if HAVE_ARMV6T2_INLINE -+#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV)) -+static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb) -+{ -+ int t; -+ __asm__ ( -+ "ssat %[td], #8, %[td] \n\t" -+ "ssat %[tb], #8, %[tb] \n\t" -+ "eor %[t], %[td], %[td], asr #31 \n\t" -+ "adds %[t], %[t], %[td], lsr #31 \n\t" -+ "asr %[t], #1 \n\t" -+ "add %[t], #0x4000 \n\t" -+ "it ne \n\t" -+ "sdivne %[t], %[t], %[td] \n\t" -+ "mov %[td], #32 \n\t" -+ "smlabb %[td], %[t], %[tb], %[td] \n\t" -+ "ssat %[td], #13, %[td], asr #6 \n\t" -+ "mov %[tb], #127 \n\t" -+ "smlatb %[t], %[xy], %[td], %[tb] \n\t" -+ "smlabb %[tb], %[xy], %[td], %[tb] \n\t" -+// This takes the sign of x & y for rounding at the "wrong" point -+// (i.e. 
after adding 127) but for the range of values (-1,-127) -+// where it does the wrong thing you get the right answer (0) anyway -+ "add %[t], %[t], %[t], lsr #31 \n\t" -+ "add %[xy], %[tb], %[tb], lsr #31 \n\t" -+ "ssat %[t], #16, %[t], asr #8 \n\t" -+ "ssat %[xy], #16, %[xy], asr #8 \n\t" -+ "pkhbt %[xy], %[xy], %[t], lsl #16 \n\t" -+ : -+ [t]"=&r"(t), -+ [xy]"+r"(xy), -+ [td]"+r"(td), -+ [tb]"+r"(tb) -+ : -+ : -+ "cc" -+ ); -+ return xy; -+} -+#define mv_scale_xy mv_scale_xy_arm -+#endif -+#endif -+ -+#endif // AVCODEC_ARM_RPI_HEVC_MV_H -+ -diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h -new file mode 100644 -index 0000000000..62b9326532 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_arm.h -@@ -0,0 +1,26 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_ARM_HEVCDSP_ARM_H -+#define AVCODEC_ARM_HEVCDSP_ARM_H -+ -+#include "libavcodec/rpi_hevcdsp.h" -+ -+void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth); -+ -+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */ -diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S -new file mode 100644 -index 0000000000..88a3b4e5e7 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S -@@ -0,0 +1,1634 @@ -+/* -+ * Copyright (c) 2014 Seppo Tomperi -+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1 -+ */ -+ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8 -+ vsubl.u8 q0, \Q0a, \P0a -+ vsubl.u8 q1, \P1a, \Q1a -+ vdup.16 d4, r2 -+ \I1 -+ vshl.i16 q0, #2 -+ \I2 -+ vadd.i16 q0, q1 -+ \I3 -+ vmovl.u8 q2, d4 -+ \I4 -+ vneg.s16 q1, q2 -+ \I5 -+ vrshr.s16 q0, #3 -+ \I6 -+ \I7 -+ \I8 -+ vmin.s16 q0, q2 -+ vmovl.u8 q2, \Q0a -+ vmax.s16 q0, q1 -+ vaddw.u8 q1, q0, \P0a -+ vsub.i16 q0, q2, q0 -+ vqmovun.s16 \P0a, q1 -+ vqmovun.s16 \Q0a, q0 -+.endm -+ -+ -+.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7 -+ vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a -+ lsr r12, r2, #16 -+ vsubl.u8 q1, \Q0b, \P0b @ q0b - p0b -+ vsubl.u8 q2, \P1a, \Q1a @ p1a - q1a -+ vsubl.u8 q3, \P1b, \Q1b @ p1b - q1b -+ vshl.i16 q0, #2 @ (q0a - p0a) * 4 -+ vshl.i16 q1, #2 @ (q0b - p0b) * 4 -+ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a -+ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b -+ vdup.16 d4, r2 @ tc0a, tc0b -+ vdup.16 d6, r12 @ tc1a, tc1b -+ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 -+ \I1 -+ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 -+ \I2 -+ vmovl.u8 q2, d4 @ tc0a, tc0b -+ \I3 -+ vmovl.u8 q3, d6 @ tc1a, tc1b -+ \I4 -+ vmin.s16 q0, q2 -+ \I5 -+ vneg.s16 q2, q2 @ -tc0a, -tc0b -+ \I6 -+ vmin.s16 q1, q3 -+ \I7 -+ vneg.s16 q3, q3 @ -tc1a, -tc1b -+ vmax.s16 q0, q2 @ delta0a -+ vmovl.u8 q2, \Q0a -+ vmax.s16 q1, q3 @ delta0b -+ vaddw.u8 q3, q0, \P0a @ p0a + delta0a -+ vsub.i16 q0, q2, q0 @ q0a - delta0a -+ vmovl.u8 q2, \Q0b -+ vsub.i16 q2, q1 @ q0b - delta0b -+ vaddw.u8 q1, \P0b @ p0b + delta0b -+ vqmovun.s16 \Q0a, q0 -+ vqmovun.s16 \P0a, q3 -+ vqmovun.s16 \Q0b, q2 -+ vqmovun.s16 \P0b, q1 -+.endm -+ -+ -+@ 
Preserves r12 -+@ Clobbers r2 -+@ P0a et al all contain UVUVUVUV -+@ r2 (tc4) contains -+@ [0..7] tc U a -+@ [8..15] tc V a -+ -+.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8 -+ vsub.i16 q0, \Q0a, \P0a -+ vsub.i16 q1, \P1a, \Q1a -+ vdup.16 d4, r2 -+ \I1 -+ vshl.i16 q0, #2 -+ \I2 -+ vadd.i16 q0, q1 -+ \I3 -+ vshll.u8 q2, d4, #\bit_depth - 8 -+ \I4 -+ vneg.s16 q1, q2 -+ \I5 -+ vrshr.s16 q0, #3 -+ \I6 -+ \I7 -+ \I8 -+ vmin.s16 q0, q2 -+ vmov.i16 q2, #0 -+ vmax.s16 q0, q1 -+ vadd.i16 \P0a, q0 -+ vsub.i16 \Q0a, q0 -+ vmov.i16 q1, #(1 << \bit_depth) - 1 -+ vmax.s16 \P0a, q2 -+ vmax.s16 \Q0a, q2 -+ vmin.s16 \P0a, q1 -+ vmin.s16 \Q0a, q1 -+.endm -+ -+@ Clobbers r2, r12 -+@ P0a et al all contain UVUVUVUV -+@ r2 (tc4) contains -+@ [0..7] tc U a -+@ [8..15] tc V a -+@ [16..23] tc U b -+@ [24..31] tc V b -+ -+.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7 -+ vsub.i16 q0, \Q0a, \P0a @ q0a - p0a -+ lsr r12, r2, #16 -+ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b -+ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a -+ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b -+ vshl.i16 q0, #2 @ (q0a - p0a) * 4 -+ vshl.i16 q1, #2 @ (q0b - p0b) * 4 -+ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a -+ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b -+ vdup.16 d4, r2 @ tc0a, tc0b -+ vdup.16 d6, r12 @ tc1a, tc1b -+ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 -+ \I1 -+ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 -+ \I2 -+ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b -+ \I3 -+ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b -+ \I4 -+ vmin.s16 q0, q2 -+ \I5 -+ vneg.s16 q2, q2 @ -tc0a, -tc0b -+ \I6 -+ vmin.s16 q1, q3 -+ \I7 -+ vneg.s16 q3, q3 @ -tc1a, -tc1b -+ vmax.s16 q0, q2 @ delta0a -+ vadd.i16 \P0a, q0 @ p0a + delta0a -+ vsub.i16 \Q0a, q0 @ q0a - delta0a -+ vmax.s16 q1, q3 @ delta0b -+ vadd.i16 \P0b, q1 @ p0b + delta0b -+ vsub.i16 \Q0b, q1 @ q0b - delta0b -+ vmov.i16 q2, #0 -+ 
vmov.i16 q3, #(1 << \bit_depth) - 1 -+ vmax.s16 \P0a, q2 -+ vmax.s16 \Q0a, q2 -+ vmax.s16 \P0b, q2 -+ vmax.s16 \Q0b, q2 -+ vmin.s16 \P0a, q3 -+ vmin.s16 \Q0a, q3 -+ vmin.s16 \P0b, q3 -+ vmin.s16 \Q0b, q3 -+.endm -+ -+ -+ -+@ uint8_t *_no_p, [sp+0] -+@ uint8_t *_no_q) [sp+4] -+ -+.macro hevc_loop_filter_luma_start -+ ldr r12, [r3] -+ ldr r3, [r3, #4] -+ orrs r3, r12, r3, lsl #16 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldrd r4, r5, [sp, #32] @ &_no_p -+ ldrb r4, [r4] -+ ldrb r5, [r5] -+ movs r10, r4 -+ it ne -+ movne r10, #1 -+ cmp r5, #0 -+ it ne -+ orrne r10, #2 -+.endm -+ -+@ Input: -+@ r2 beta (raw: needs shift for bitdepth > 8) -+@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8) -+@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8) -+@ -+@ Input & output -+@ 8-bit: d16-d23 (Q3,Q2,Q1,Q0,P0,P1,P2,P3) -+@ 16-bit: q8-q15 -+@ -+@ r1 -r1 -+@ r10 b1->C, b0->N (r10 junk) -+@ -+@ Junks: -+@ r5, r6, r7, r8, r9 -+ -+.macro m_filter_luma bit_depth, Q11, Q15 -+.if \bit_depth == 8 -+ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2 -+ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1 -+ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0 -+ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0 -+ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1 -+ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2 -+.endif -+ vadd.i16 q0, q9, \Q11 @ P2 + P0 -+.if \bit_depth > 8 -+ lsl r3, r3, #(\bit_depth - 8) -+.endif -+ vadd.i16 q1, q14, q12 @ Q2 + Q0 -+.if \bit_depth > 8 -+ lsl r2, r2, #(\bit_depth - 8) -+.endif -+ vsub.i16 q0, q10 @ P2 - P1 + P0 -+ lsr r5, r3, #16 -+ vsub.i16 q1, q13 @ Q2 - Q1 + Q0 -+.if \bit_depth == 8 -+ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3 -+ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... 
Q3 -+.endif -+ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0) -+ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0) -+ vmov.i64 q2, #0xffffffff0000 -+ vbic q0, q2 @ only dp0(') and dp3(') -+ vbic q1, q2 @ only dq0(') and dq3(') -+ vsra.u64 q0, #16 -+ vsra.u64 q1, #16 -+ vdup.16 q3, r2 @ beta -+ vdup.16 d14, r3 @ tC[0] -+ vdup.16 d15, r5 @ tC[1] -+ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0) -+ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0 -+ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0 -+ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0 -+ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0) -+ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0) -+ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3 -+ vshl.s16 q6, q7, #2 @ tC[] * 4 -+ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1 -+ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta) -+ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block) -+ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3 -+ cmp r7, #0 -+ beq .Lbypasswrite -+ -+ vcgt.s16 q5, q6, q5 @ if < tc25 -+ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3) -+ vand q4, q5 -+ vbic d8, d4 -+ vbic d9, d4 -+ vshr.s16 q3, #2 @ beta_2 = beta >> 2 -+ vsra.u64 q4, #16 -+ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1 -+ vshl.i16 q7, #1 @ tc2 = tC[] << 1 -+ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc -+ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half -+ vand d6, d8 @ && beta_2 tests, prime in ms half -+ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3 -+ vneg.s16 q6, q7 @ -tc2 -+ vmovn.i32 d8, q3 -+ vshrn.i32 d6, q3, #16 -+ vand d6, d8 -+ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3 -+ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block) -+ vadd.i16 q0, \Q11, q12 @ p0 + q0 -+ ands r9, r7, r8 -+ beq 1f -+ -+ vadd.i16 q2, q0, q10 @ p1 + p0 + q0 -+ vadd.i16 q3, q0, q13 @ p0 + q0 + q1 -+ lsr r3, r9, #16 -+ 
vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping) -+ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping) -+ vadd.i16 q0, q8, q9 @ p3 + p2 -+ vadd.i16 q5, \Q15, q14 @ q2 + q3 -+ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 -+ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2 -+ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2 -+ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3 -+ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping) -+ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping) -+ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping) -+ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping) -+ vrshr.s16 q0, #3 @ scale, with rounding -+ vrshr.s16 q5, #3 -+ vrshr.s16 q1, #2 -+ vrshr.s16 q4, #2 -+ vrshr.s16 q2, #3 -+ vrshr.s16 q3, #3 -+ vsub.i16 q0, q9 @ find difference -+ vsub.i16 q5, q14 -+ vsub.i16 q1, q10 -+ vsub.i16 q4, q13 -+ vsub.i16 q2, \Q11 -+ vsub.i16 q3, q12 -+ vmax.s16 q0, q6 @ clip difference to -tc2 .. 
tc2 -+ vmax.s16 q5, q6 -+ vmax.s16 q1, q6 -+ vmax.s16 q4, q6 -+ vmax.s16 q2, q6 -+ vmax.s16 q3, q6 -+ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure -+ vdup.16 d13, r3 -+ vmin.s16 q0, q7 -+ vmin.s16 q5, q7 -+ vmin.s16 q1, q7 -+ vmin.s16 q4, q7 -+ vmin.s16 q2, q7 -+ vmin.s16 q3, q7 -+ vadd.i16 q0, q9 @ apply difference -+ vadd.i16 q5, q14 -+ vadd.i16 q1, q10 -+ vadd.i16 q4, q13 -+ vadd.i16 q2, \Q11 -+ vadd.i16 q3, q12 -+ vbit q9, q0, q6 @ apply filtered values according to mask -+ vbit q14, q5, q6 -+ vbit q10, q1, q6 -+ vbit q13, q4, q6 -+ vbit \Q11, q2, q6 -+ vbit q12, q3, q6 -+ vneg.s16 q6, q7 @ restore -tc2 -+ -+1: -+ bics r9, r7, r8 -+ beq 2f -+ -+ vsub.i16 q0, q12, \Q11 @ q0 - p0 -+ vsub.i16 q1, q13, q10 @ q1 - p1 -+ lsr r3, r9, #16 -+ vshl.i16 q2, q0, #3 -+ lsr r7, r5, #16 -+ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0) -+ lsr r8, r6, #16 -+ vshl.i16 q2, q1, #1 -+ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1) -+ vshr.s16 q6, #1 @ -tc = -tc2 >> 1 -+ vsub.i16 q5, q3, q4 -+ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1 -+ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1 -+ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4 -+ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1 -+ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1 -+ vmax.s16 q6, q5 @ -+ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1 -+ vdup.16 q0, r2 @ beta -+ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc] -+ vshr.s16 q4, #1 @ tc_2 = tc >> 1 -+ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1 -+ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1 -+ vshr.s16 q2, q0, #1 @ beta >> 1 -+ vadd.i16 q2, q0 @ beta + (beta >> 1) -+ vneg.s16 q0, q4 @ -tc_2 -+ vabs.s16 q5, q5 @ abs(original delta0) -+ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3 -+ vmax.s16 q1, q0 -+ vmax.s16 q3, q0 -+ vshl.s16 q0, q7, #2 @ 8 * tc -+ vadd.i16 q7, q0 @ 10 * tc -+ vdup.16 d0, r9 -+ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering -+ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, 
-tc_2, tc_2) -+ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 + delta0) >> 1, -tc_2, tc_2) -+ vdup.16 d8, r5 @ dp0 + dp3 -+ vdup.16 d9, r7 @ dp0' + dp3' -+ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0)) -+ vdup.16 d10, r6 @ dq0 + dq3 -+ vdup.16 d11, r8 @ dq0' + dq3' -+ vand q7, q0 @ AND block and line masks -+ vcgt.s16 q4, q2, q4 @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1) -+ vadd.i16 q0, q1, q10 @ p1 + deltap1 -+ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1) -+ vadd.i16 q3, q3, q13 @ q1 + deltaq1 -+ vadd.i16 q1, \Q11, q6 @ p0 + delta0 -+ vsub.i16 q2, q12, q6 @ q0 - delta0 -+ vand q4, q7 @ AND nd_p test with block/line masks -+ vand q5, q7 @ AND nd_q test with block/line masks -+ vbit q10, q0, q4 -+ vbit \Q11, q1, q7 -+ vbit q12, q2, q7 -+ vbit q13, q3, q5 -+ -+2: -+.if \bit_depth == 8 -+ vmovn.i16 d16, q8 -+ vmovn.i16 d23, \Q15 -+ neg r1, r1 -+ vqmovun.s16 d17, q9 -+ vqmovun.s16 d18, q10 -+ vqmovun.s16 d19, \Q11 -+ lsls r10, #31 -+ vqmovun.s16 d20, q12 -+ vqmovun.s16 d21, q13 -+ vqmovun.s16 d22, q14 -+.else -+ vmov.i16 q0, #0 -+ vmov.i16 q1, #(1 << \bit_depth - 1) -+ @ q8 & q15 should be unaltered and so don't require clipping -+ neg r1, r1 -+ vmax.s16 q9, q0 -+ vmax.s16 q10, q0 -+ vmax.s16 q11, q0 -+ vmax.s16 q12, q0 -+ vmax.s16 q13, q0 -+ vmax.s16 q14, q0 -+ lsls r10, #31 -+ vmin.s16 q9, q1 -+ vmin.s16 q10, q1 -+ vmin.s16 q11, q1 -+ vmin.s16 q12, q1 -+ vmin.s16 q13, q1 -+ vmin.s16 q14, q1 -+.endif -+ bx lr -+.endm -+ -+function hevc_loop_filter_luma_body -+ m_filter_luma 8, q15, q11 -+endfunc -+ -+@ void ff_hevc_rpi_v_loop_filter_luma_neon_8( -+@ uint8_t *_pix, [r0] -+@ ptrdiff_t _stride, [r1] -+@ int _beta, [r2] -+@ int *_tc, [r3] -+@ uint8_t *_no_p, [sp+0] -+@ uint8_t *_no_q) [sp+4] -+ -+function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1 -+ hevc_loop_filter_luma_start -+ -+ sub r4, r0, #4 -+ b .Lv_loop_luma_common -+endfunc -+ -+@ void ff_hevc_rpi_v_loop_filter2_luma_neon( 
-+@ uint8_t * pix_r, [r0] -+@ ptrdiff_t _stride, [r1] -+@ int _beta, [r2] -+@ int tc2, [r3] -+@ int no_f, [sp+0] -+@ uint8_t * pix_l) [sp+4] -+ -+function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1 -+ cmp r3, #0 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldr r4, [sp, #36] -+ ldr r10, [sp, #32] -+ -+.Lv_loop_luma_common: -+ vpush {d8-d15} -+ -+ @ It's slightly faster to do unlaned loads and transpose in the -+ @ 8-bit case, even though it needs more instructions, because -+ @ VLD4.8 is a really slow way to read from memory. -+ vld1.32 {d16[0]}, [r4:32], r1 -+ vld1.32 {d20[0]}, [r0:32], r1 -+ vld1.32 {d16[1]}, [r4:32], r1 -+ vld1.32 {d20[1]}, [r0:32], r1 -+ vld1.32 {d17[0]}, [r4:32], r1 -+ vld1.32 {d21[0]}, [r0:32], r1 -+ vld1.32 {d17[1]}, [r4:32], r1 -+ vld1.32 {d21[1]}, [r0:32], r1 -+ vld1.32 {d18[0]}, [r4:32], r1 -+ vld1.32 {d22[0]}, [r0:32], r1 -+ vld1.32 {d18[1]}, [r4:32], r1 -+ vld1.32 {d22[1]}, [r0:32], r1 -+ vld1.32 {d19[0]}, [r4:32], r1 -+ vld1.32 {d23[0]}, [r0:32], r1 -+ vld1.32 {d19[1]}, [r4:32] -+ vld1.32 {d23[1]}, [r0:32] -+ vuzp.16 q8, q9 -+ vuzp.16 q10, q11 -+ vuzp.8 q8, q9 -+ vuzp.8 q10, q11 -+ vswp d17, d18 -+ vswp d21, d22 -+ -+ bl hevc_loop_filter_luma_body -+ -+ add r6, r4, r1 -+ add r2, r0, r1 -+ lsl r1, #1 -+ -+ vpop {d8-d15} -+ -+ @ no_p[1] -+ bmi 1f -+ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 -+ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1 -+ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 -+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1 -+ -+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 -+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1 -+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 -+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32] -+1: -+ @ no_q[1] -+ bcs 1f -+ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 -+ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1 -+ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 -+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1 -+ -+ vst4.8 
{d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 -+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 -+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 -+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] -+1: -+ pop {r4-r10,pc} -+ -+.Lbypasswrite: -+ vpop {d8-d15} -+ pop {r4-r10,pc} -+endfunc -+ -+.macro m_filter_v_luma_16 bit_depth -+ vpush {d8-d15} -+ -+ @ Uses slightly fewer instructions to do laned loads than unlaned -+ @ and transpose. This also means that we can use the same code for -+ @ both split & unsplit deblock -+ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1 -+ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 -+ -+ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 -+ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 -+ -+ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 -+ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 -+ -+ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 -+ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 -+ -+ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 -+ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 -+ -+ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 -+ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 -+ -+ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 -+ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 -+ -+ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4] -+ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] -+ -+ bl hevc_loop_filter_luma_body_\bit_depth -+ -+ add r6, r4, r1 -+ add r2, r0, r1 -+ lsl r1, #1 -+ -+ vpop {d8-d15} -+ -+ @ p[1] -+ bmi 1f -+ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 -+ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1 -+ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 -+ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1 -+ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 -+ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1 -+ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 -+ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6] -+1: -+ @ q[1] -+ bcs 1f -+ 
vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 -+ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1 -+ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 -+ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1 -+ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 -+ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 -+ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 -+ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] -+1: -+ pop {r4-r10,pc} -+.endm -+ -+ -+ -+ -+@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] -+@ ptrdiff_t stride, [r1] -+@ int beta, [r2] -+@ int32_t *tc, [r3] -+@ uint8_t *no_p, sp[0] -+@ uint8_t *no_q); sp[4] -+@ -+@ Src should always be on 8 byte boundry & all in the same slice -+ -+function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1 -+ hevc_loop_filter_luma_start -+ b .Lh_loop_filter_luma_common_8 -+endfunc -+ -+function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1 -+ cmp r3, #0 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldr r10, [sp, #32] -+ -+.Lh_loop_filter_luma_common_8: -+ sub r4, r0, r1, lsl #2 -+ add r0, r4, r1 -+ lsl r1, #1 -+ vpush {d8-d15} -+ -+ vld1.8 {d16}, [r4], r1 -+ vld1.8 {d17}, [r0], r1 -+ vld1.8 {d18}, [r4], r1 -+ vld1.8 {d19}, [r0], r1 -+ vld1.8 {d20}, [r4], r1 -+ vld1.8 {d21}, [r0], r1 -+ vld1.8 {d22}, [r4] -+ vld1.8 {d23}, [r0] -+ -+ bl hevc_loop_filter_luma_body -+ -+ add r0, r0, r1, lsl #1 -+ add r2, r4, r1, lsl #1 -+ add r6, r4, r1, asr #1 -+ vpop {d8-d15} -+ -+ @ P2-P0 -+ bcs 1f -+ vst1.8 {d22}, [r4], r1 -+ vst1.8 {d21}, [r6] -+ vst1.8 {d20}, [r4] -+1: -+ @ Q0-Q2 -+ bmi 1f -+ vst1.8 {d19}, [r0], r1 -+ vst1.8 {d18}, [r2] -+ vst1.8 {d17}, [r0] -+1: -+ pop {r4-r10,pc} -+endfunc -+ -+ -+.macro m_filter_h_luma_16 bit_depth -+ sub r4, r0, r1, lsl #2 -+ add r0, r4, r1 -+ lsl r1, #1 -+ vpush {d8-d15} -+ -+ vld1.16 { q8}, [r4], r1 -+ vld1.16 { q9}, [r0], r1 -+ vld1.16 {q10}, [r4], r1 -+ vld1.16 {q11}, [r0], r1 -+ vld1.16 {q12}, [r4], r1 -+ vld1.16 {q13}, [r0], r1 -+ vld1.16 {q14}, [r4] -+ vld1.16 
{q15}, [r0] -+ -+ bl hevc_loop_filter_luma_body_\bit_depth -+ -+ add r0, r0, r1, lsl #1 -+ add r2, r4, r1, lsl #1 -+ add r6, r4, r1, asr #1 -+ vpop {d8-d15} -+ -+ @ P2-P0 -+ bcs 1f -+ vst1.16 {q14}, [r4], r1 -+ vst1.16 {q13}, [r6] -+ vst1.16 {q12}, [r4] -+1: -+ bmi 1f -+ vst1.16 {q11}, [r0], r1 -+ vst1.16 {q10}, [r2] -+ vst1.16 { q9}, [r0] -+1: -+ pop {r4-r10,pc} -+.endm -+ -+ -+@ void ff_hevc_rpi_h_loop_filter_uv_neon(uint8_t * src_r, // r0 -+@ unsigned int stride, // r1 -+@ uint32_t tc4, // r2 -+@ unsigned int no_f); // r3 -+@ -+@ no_f -+@ 0 tl P0 -+@ 1 tr P1 -+@ 2 bl Q0 -+@ 3 br Q1 -+@ -+@ Probably not worth having the P/Qa only special case in this direction -+@ Given layout we won't save any memory reads or avoid any cache dirtying -+@ We would save a bit of computation but I expect the partials to be less -+@ common in the H direction than V due to how we arrange deblock. -+ -+function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1 -+ sub r12, r0, r1 -+ cmp r2, #0 -+ it eq -+ bxeq lr -+ vld1.8 {d26,d27}, [r0] -+ lsl r1, #1 -+ sub r0, r1 -+ vld1.8 {d18,d19}, [r12], r1 -+ vld1.8 {d16,d17}, [r0], r1 -+ vld1.8 {d28,d29}, [r12] -+ -+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \ -+ "sub r12, r0, r1, asr #1" -+ -+ lsls r3, #29 @ b2 -> N, b3 -> C -+ it pl -+ vstrpl d26, [r0, #0] -+ it cc -+ vstrcc d27, [r0, #8] -+ lsls r3, #2 @ b0 -> N, b1 -> C -+ it pl -+ vstrpl d18, [r12, #0] -+ it cc -+ vstrcc d19, [r12, #8] -+ bx lr -+ -+endfunc -+ -+ -+@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 -+@ unsigned int stride, // r1 -+@ uint32_t tc4, // r2 -+@ unsigned int no_f); // r3 -+@ -+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] -+@ -+@ Macro here actual function near bottom -+ -+.macro m_filter_h_uv_16 bit_depth -+ sub r12, r0, r1 -+ cmp r2, #0 -+ it eq -+ bxeq lr -+ vld1.16 {q12, q13}, [r0] -+ lsl r1, #1 -+ sub r0, r1 -+ vld1.16 {q10, q11}, [r12], r1 -+ vld1.16 {q8, q9 }, [r0], r1 -+ vld1.16 {q14, q15}, [r12] -+ -+ 
hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \ -+ "sub r12, r0, r1, asr #1", \ -+ "cmp r3, #0" -+ -+ bne 1f -+ vst1.16 {q10, q11}, [r12] -+ vst1.16 {q12, q13}, [r0] -+ bx lr -+ -+ @ At least one no_f bit is set -+ @ Which means we need to break this apart in an ugly fashion -+1: -+ lsls r3, #29 @ b2 -> N, b3 -> C -+ itt pl -+ vstrpl d24, [r0, #0] -+ vstrpl d25, [r0, #8] -+ itt cc -+ vstrcc d26, [r0, #16] -+ vstrcc d27, [r0, #24] -+ lsls r3, #2 @ b0 -> N, b1 -> C -+ itt pl -+ vstrpl d20, [r12, #0] -+ vstrpl d21, [r12, #8] -+ itt cc -+ vstrcc d22, [r12, #16] -+ vstrcc d23, [r12, #24] -+ bx lr -+.endm -+ -+ -+@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 -+@ unsigned int stride, // r1 -+@ uint32_t tc4, // r2 -+@ uint8_t * src_l, // r3 -+@ unsigned int no_f); // sp[0] -+@ -+@ no_f: -+@ 0 tl P0 -+@ 1 tr Q0 -+@ 2 bl P1 -+@ 3 br Q1 -+ -+function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1 -+ cmp r2, #0 -+ it eq -+ bxeq lr -+ push {lr} -+ vld2.16 {d16[0], d18[0]}, [r3], r1 -+ vld2.16 {d20[0], d22[0]}, [r0], r1 -+ -+ cmp r2, #0x10000 -+ vld2.16 {d16[1], d18[1]}, [r3], r1 -+ vld2.16 {d20[1], d22[1]}, [r0], r1 -+ -+ vld2.16 {d16[2], d18[2]}, [r3], r1 -+ vld2.16 {d20[2], d22[2]}, [r0], r1 -+ -+ vld2.16 {d16[3], d18[3]}, [r3], r1 -+ vld2.16 {d20[3], d22[3]}, [r0], r1 -+ blo 10f -+ -+ vld2.16 {d17[0], d19[0]}, [r3], r1 -+ vld2.16 {d21[0], d23[0]}, [r0], r1 -+ -+ sub ip, r0, r3 -+ vld2.16 {d17[1], d19[1]}, [r3], r1 -+ vld2.16 {d21[1], d23[1]}, [r0], r1 -+ -+ cmp ip, #4 -+ vld2.16 {d17[2], d19[2]}, [r3], r1 -+ vld2.16 {d21[2], d23[2]}, [r0], r1 -+ -+ vld2.16 {d17[3], d19[3]}, [r3] -+ vld2.16 {d21[3], d23[3]}, [r0] -+ -+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \ -+ "ldr lr, [sp, #4]", \ -+ "neg r1, r1", \ -+ "it eq; cmpeq lr, #0", \ -+ "add r3, #2", \ -+ "add ip, r3, r1", \ -+ "add r2, r0, r1", \ -+ "lsl r1, #1" -+ -+ bne 1f -+ -+@ Much/most of the time r0 == r3 + 4 and no_f == 0 -+@ so it is worth 
having this special case -+ vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b -+ vst2.16 {d19[2], d21[2]}, [ip], r1 -+ vst2.16 {d19[1], d21[1]}, [r3], r1 -+ vst2.16 {d19[0], d21[0]}, [ip], r1 -+ vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a -+ vst2.16 {d18[2], d20[2]}, [ip], r1 -+ vst2.16 {d18[1], d20[1]}, [r3] -+ vst2.16 {d18[0], d20[0]}, [ip] -+ pop {pc} -+ -+@ Either split or partial -+1: -+ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 -+ ittt cs -+ addcs r0, r0, r1, lsl #1 -+ addcs r2, r2, r1, lsl #1 -+ bcs 1f -+ @ Q0b -+ vst1.16 {d21[3]}, [r0], r1 -+ vst1.16 {d21[2]}, [r2], r1 -+ vst1.16 {d21[1]}, [r0], r1 -+ vst1.16 {d21[0]}, [r2], r1 -+1: -+ ittt mi -+ addmi r3, r3, r1, lsl #1 -+ addmi ip, ip, r1, lsl #1 -+ bmi 1f -+ @ P0b -+ vst1.16 {d19[3]}, [r3], r1 -+ vst1.16 {d19[2]}, [ip], r1 -+ vst1.16 {d19[1]}, [r3], r1 -+ vst1.16 {d19[0]}, [ip], r1 -+1: -+ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 -+ bcs 1f -+ @ Q0a -+ vst1.16 {d20[3]}, [r0], r1 -+ vst1.16 {d20[2]}, [r2], r1 -+ vst1.16 {d20[1]}, [r0] -+ vst1.16 {d20[0]}, [r2] -+1: -+ it mi -+ popmi {pc} -+ @ P0a -+ vst1.16 {d18[3]}, [r3], r1 -+ vst1.16 {d18[2]}, [ip], r1 -+ vst1.16 {d18[1]}, [r3] -+ vst1.16 {d18[0]}, [ip] -+ pop {pc} -+ -+@ Single lump (rather than double) -+10: -+ @ As we have post inced r0/r3 in the load the easiest thing to do is -+ @ to subtract and write forwards, rather than backwards (as above) -+ @ b0 (P0a) -> N, b1 (Q0a) -> C -+ -+ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \ -+ "ldr lr, [sp, #4]", \ -+ "add r3, #2", \ -+ "sub r0, r0, r1, lsl #2", \ -+ "sub r3, r3, r1, lsl #2", \ -+ "lsls lr, #31", \ -+ "add r2, r0, r1", \ -+ "add ip, r3, r1", \ -+ "lsl r1, #1" -+ -+ bcs 3f -+ @ Q0a -+ vst1.16 {d20[0]}, [r0], r1 -+ vst1.16 {d20[1]}, [r2], r1 -+ vst1.16 {d20[2]}, [r0] -+ vst1.16 {d20[3]}, [r2] -+3: -+ it mi -+ popmi {pc} -+ @ P0a -+ vst1.16 {d18[0]}, [r3], r1 -+ vst1.16 {d18[1]}, [ip], r1 -+ vst1.16 {d18[2]}, [r3] -+ vst1.16 {d18[3]}, [ip] -+ pop 
{pc} -+ -+endfunc -+ -+ -+@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 -+@ unsigned int stride, // r1 -+@ uint32_t tc4, // r2 -+@ uint8_t * src_l, // r3 -+@ unsigned int no_f); // sp[0] -+@ -+ -+@ no_f -+@ 0 tl P0a -+@ 1 tr Q0a -+@ 2 bl P0b -+@ 3 br Q0b -+ -+@ P1: q8, q12 -+@ P0: q9, q13 -+@ Q0: q10, q14 -+@ Q1: q11, q15 -+ -+.macro m_filter_v_uv2_16 bit_depth -+ cmp r2, #0 -+ it eq -+ bxeq lr -+ push {lr} -+ vld2.32 {d16[0], d18[0]}, [r3], r1 -+ vld2.32 {d20[0], d22[0]}, [r0], r1 -+ -+ cmp r2, #0x10000 -+ vld2.32 {d16[1], d18[1]}, [r3], r1 -+ vld2.32 {d20[1], d22[1]}, [r0], r1 -+ -+ vld2.32 {d17[0], d19[0]}, [r3], r1 -+ vld2.32 {d21[0], d23[0]}, [r0], r1 -+ -+ vld2.32 {d17[1], d19[1]}, [r3], r1 -+ vld2.32 {d21[1], d23[1]}, [r0], r1 -+ blo 10f -+ -+ vld2.32 {d24[0], d26[0]}, [r3], r1 -+ vld2.32 {d28[0], d30[0]}, [r0], r1 -+ -+ sub ip, r0, r3 -+ vld2.32 {d24[1], d26[1]}, [r3], r1 -+ vld2.32 {d28[1], d30[1]}, [r0], r1 -+ -+ cmp ip, #8 -+ vld2.32 {d25[0], d27[0]}, [r3], r1 -+ vld2.32 {d29[0], d31[0]}, [r0], r1 -+ -+ vld2.32 {d25[1], d27[1]}, [r3] -+ vld2.32 {d29[1], d31[1]}, [r0] -+ -+ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \ -+ "ldr lr, [sp, #4]", \ -+ "neg r1, r1", \ -+ "it eq; cmpeq lr, #0", \ -+ "add r3, #4", \ -+ "add ip, r3, r1", \ -+ "add r2, r0, r1", \ -+ "lsl r1, #1" -+ -+ bne 1f -+ -+@ Much/most of the time r0 == r3 + 8 and no_f == 0 -+@ so it is worth having this special case -+ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b -+ vst2.32 {d27[0], d29[0]}, [ip], r1 -+ vst2.32 {d26[1], d28[1]}, [r3], r1 -+ vst2.32 {d26[0], d28[0]}, [ip], r1 -+ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a -+ vst2.32 {d19[0], d21[0]}, [ip], r1 -+ vst2.32 {d18[1], d20[1]}, [r3] -+ vst2.32 {d18[0], d20[0]}, [ip] -+ pop {pc} -+ -+@ Either split or partial -+1: -+ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 -+ ittt cs -+ addcs r0, r0, r1, lsl #1 -+ addcs r2, r2, r1, lsl #1 -+ bcs 1f -+ 
@ Q0b -+ vst1.32 {d29[1]}, [r0], r1 -+ vst1.32 {d29[0]}, [r2], r1 -+ vst1.32 {d28[1]}, [r0], r1 -+ vst1.32 {d28[0]}, [r2], r1 -+1: -+ ittt mi -+ addmi r3, r3, r1, lsl #1 -+ addmi ip, ip, r1, lsl #1 -+ bmi 1f -+ @ P0b -+ vst1.32 {d27[1]}, [r3], r1 -+ vst1.32 {d27[0]}, [ip], r1 -+ vst1.32 {d26[1]}, [r3], r1 -+ vst1.32 {d26[0]}, [ip], r1 -+1: -+ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 -+ bcs 1f -+ @ Q0a -+ vst1.32 {d21[1]}, [r0], r1 -+ vst1.32 {d21[0]}, [r2], r1 -+ vst1.32 {d20[1]}, [r0] -+ vst1.32 {d20[0]}, [r2] -+1: -+ it mi -+ popmi {pc} -+ @ P0a -+ vst1.32 {d19[1]}, [r3], r1 -+ vst1.32 {d19[0]}, [ip], r1 -+ vst1.32 {d18[1]}, [r3] -+ vst1.32 {d18[0]}, [ip] -+ pop {pc} -+ -+@ Single lump (rather than double) -+10: -+ @ As we have post inced r0/r3 in the load the easiest thing to do is -+ @ to subtract and write forwards, rather than backwards (as above) -+ @ b0 (P0a) -> N, b1 (Q0a) -> C -+ -+ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \ -+ "ldr lr, [sp, #4]", \ -+ "add r3, #4", \ -+ "sub r0, r0, r1, lsl #2", \ -+ "sub r3, r3, r1, lsl #2", \ -+ "lsls lr, #31", \ -+ "add r2, r0, r1", \ -+ "add ip, r3, r1", \ -+ "lsl r1, #1" -+ -+ bcs 3f -+ @ Q0a -+ vst1.32 {d20[0]}, [r0], r1 -+ vst1.32 {d20[1]}, [r2], r1 -+ vst1.32 {d21[0]}, [r0] -+ vst1.32 {d21[1]}, [r2] -+3: -+ it mi -+ popmi {pc} -+ @ P0a -+ vst1.32 {d18[0]}, [r3], r1 -+ vst1.32 {d18[1]}, [ip], r1 -+ vst1.32 {d19[0]}, [r3] -+ vst1.32 {d19[1]}, [ip] -+ pop {pc} -+.endm -+ -+ -+@ The NEON version is faster under ideal circumstances (i.e. 
everything in L1) -+@ But in real world testing it is ~20% slower, presumably due to code size -+ -+#if 0 // NEON version -+ -+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, -+ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ * int in_inc0, int in_inc1) -+ */ -+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 -+ mov ip, sp -+ push {a1-a3,v1-v8,lr} -+ ldm ip, {v1-v6} -+ cmp a1, #2 -+ bls 2f -+ vpush {d8-d13} -+ sub v5, v5, #10 -+ sub v6, v6, #10 -+1: -+ vld2.32 {d0[0], d2[0]}, [a3]! -+ vld2.32 {d4[0], d6[0]}, [a4]! -+ vmov.u8 q12, #0 -+ ldrb a2, [a3], #1 -+ ldrb ip, [a4], #1 -+ ldrb v8, [a3], #1 -+ ldrb lr, [a4], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d24[0]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d25[0]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d16[0]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d20[0]}, [ip] -+ vld1.32 {d18[0]}, [v8] -+ vld1.32 {d22[0]}, [lr] -+ -+ vld2.32 {d0[1], d2[1]}, [a3]! -+ vld2.32 {d4[1], d6[1]}, [a4]! -+ ldrb a2, [a3], #1 -+ vmov.u16 d12, #1 -+ ldrb ip, [a4], #1 -+ vmov.u16 d13, #2 -+ ldrb v8, [a3], #1 -+ vmov.u16 d27, #4 -+ ldrb lr, [a4], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d24[2]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d25[2]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d16[1]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d20[1]}, [ip] -+ vld1.32 {d18[1]}, [v8] -+ vld1.32 {d22[1]}, [lr] -+ -+ vld2.32 {d1[0], d3[0]}, [a3]! -+ vld2.32 {d5[0], d7[0]}, [a4]! -+ ldrb a2, [a3], #1 -+ ldrb ip, [a4], #1 -+ ldrb lr, [a4], #1 -+ ldrb v8, [a3], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d24[4]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d25[4]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d17[0]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d21[0]}, [ip] -+ vld1.32 {d19[0]}, [v8] -+ vld1.32 {d23[0]}, [lr] -+ -+ vld2.32 {d1[1], d3[1]}, [a3]! 
-+ vld2.32 {d5[1], d7[1]}, [a4]! -+ ldrb a2, [a3], #1 -+ ldrb ip, [a4], #1 -+ ldrb v8, [a3], #1 -+ ldrb lr, [a4], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d24[6]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d25[6]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d17[1]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d21[1]}, [ip] -+ vld1.32 {d19[1]}, [v8] -+ vld1.32 {d23[1]}, [lr] -+ -+ @ So now we have: -+ @ q0.32[i] = curr[i].mv[0] -+ @ q1.32[i] = curr[i].mv[1] -+ @ q2.32[i] = neigh[i].mv[0] -+ @ q3.32[i] = neigh[i].mv[1] -+ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]] -+ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]] -+ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] -+ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] -+ @ d24.16[i] = curr[i].pred_flag -+ @ d25.16[i] = neigh[i].pred_flag -+ -+ vtst.16 d28, d24, d12 -+ vtst.16 d29, d24, d13 -+ vadd.i16 d8, d24, d12 -+ vadd.i16 d9, d25, d12 -+ vtst.16 d30, d25, d12 -+ vtst.16 d31, d25, d13 -+ veor d26, d8, d9 -+ ldr lr, [sp, 6*8 + 1*4] -+ vmovl.s16 q4, d28 -+ vmovl.s16 q5, d29 -+ teq lr, #1 -+ vmovl.s16 q14, d30 -+ it ne -+ lslne v1, lr, #1 -+ vmovl.s16 q15, d31 -+ it ne -+ rsbne v2, v1, #32 -+ vbif q0, q1, q4 -+ vbif q2, q3, q14 -+ vbif q1, q0, q5 -+ vbif q3, q2, q15 -+ vabd.s16 q12, q0, q2 -+ vabd.s16 q2, q1 -+ vabd.s16 q0, q3 -+ vabd.s16 q1, q3 -+ vbif q8, q9, q4 -+ vbif q10, q11, q14 -+ vbif q9, q8, q5 -+ vbif q11, q10, q15 -+ vclt.u16 d6, d24, d27 -+ vclt.u16 d8, d2, d27 -+ vclt.u16 d7, d25, d27 -+ vclt.u16 d9, d3, d27 -+ vclt.u16 d2, d0, d27 -+ vclt.u16 d0, d4, d27 -+ vclt.u16 d3, d1, d27 -+ vclt.u16 d1, d5, d27 -+ vceq.i32 q12, q10, q8 -+ vceq.i32 q10, q9 -+ vceq.i32 q8, q11 -+ vceq.i32 q9, q11 -+ vshrn.i32 d6, q3, #8 -+ vshrn.i32 d7, q4, #8 -+ vshrn.i32 d8, q1, #8 -+ vshrn.i32 d9, q0, #8 -+ vmovn.i32 d4, q12 -+ vmovn.i32 d2, q10 -+ vmovn.i32 d3, q8 -+ vmovn.i32 d5, q9 -+ vand q2, q3 -+ vrev16.8 q3, q3 -+ vand q2, q3 -+ vand q1, q4 -+ vrev16.8 q4, q4 -+ vand q1, q4 -+ vand d4, d5 -+ vand d2, d3 -+ vbic d0, d12, d4 -+ 
vshr.u16 d26, #2 -+ vbic d0, d2 -+ vmov.i16 d1, #0x5555 -+ vorr d0, d26 -+ bne 10f -+ -+ @ Merge results into result word, no duplicates -+ vmov a2, s0 -+ vmov v8, s1 -+ vmov.u16 ip, d0[1] -+ vmov.u16 lr, d0[3] -+ lsl a2, #30 -+ lsl v8, #30 -+ lsl ip, #30 -+ lsl lr, #30 -+ orr a2, ip, a2, lsr #2 -+ orr v8, lr, v8, lsr #2 -+ orr a2, v8, a2, lsr #4 -+ subs a1, #4 -+ orr v7, a2, v7, lsr #8 -+ bhi 1b -+ -+ mov a1, #32 -+ ldr a3, [sp, #6*8] -+ vpop {d8-d13} -+ sub a1, a1, a3, lsl #1 -+ mov a1, v7, lsr a1 -+ pop {a2-a4,v1-v8,pc} -+10: -+ @ Merge results into result word, with duplicates -+ vmul.i16 d0, d1 -+ vmov a2, s0 -+ vmov v8, s1 -+ vmov.u16 ip, d0[1] -+ vmov.u16 lr, d0[3] -+ lsl a2, v2 -+ subs a1, #4 -+ lsl v8, v2 -+ lsl ip, v2 -+ lsl lr, v2 -+ ldr v2, [sp, #6*8 + 12*4 + 1*4] -+T lsr a2, v1 -+T orr a2, ip, a2 -+A orr a2, ip, a2, lsr v1 -+ lsl ip, v1, #1 -+T lsr v8, v1 -+T orr v8, lr, v8 -+A orr v8, lr, v8, lsr v1 -+ lsl lr, v1, #2 -+T lsr a2, ip -+T orr a2, v8, a2 -+A orr a2, v8, a2, lsr ip -+ ldr v1, [sp, #6*8 + 12*4] -+T lsr v7, lr -+T orr v7, a2, v7 -+A orr v7, a2, v7, lsr lr -+ bhi 1b -+ -+ mov a1, #32 -+ ldrd a3, a4, [sp, #6*8] -+ vpop {d8-d13} -+ mls a1, a3, a4, a1 -+ mls a1, a3, a4, a1 -+ mov a1, v7, lsr a1 -+ pop {a2-a4,v1-v8,pc} -+ -+ -+2: -+ sub v5, v5, #10 -+ sub v6, v6, #10 -+ vmov.u8 d16, #0 -+ blo 3f -+ vld2.32 {d0[0], d1[0]}, [a3]! -+ vld2.32 {d2[0], d3[0]}, [a4]! -+ ldrb a2, [a3], #1 -+ ldrb ip, [a4], #1 -+ ldrb lr, [a4], #1 -+ ldrb v8, [a3], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d16[0]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d16[4]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d4[0]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d5[0]}, [ip] -+ vld1.32 {d6[0]}, [v8] -+ vld1.32 {d7[0]}, [lr] -+ -+3: -+ vld2.32 {d0[1], d1[1]}, [a3]! -+ vld2.32 {d2[1], d3[1]}, [a4]! 
-+ ldrb a2, [a3], #1 -+ vmov.u16 d17, #1 -+ ldrb ip, [a4], #1 -+ vmov.u16 d18, #2 -+ ldrb v8, [a3], #1 -+ vmov.u16 d19, #4 -+ ldrb lr, [a4], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d16[2]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d16[6]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d4[1]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d5[1]}, [ip] -+ vld1.32 {d6[1]}, [v8] -+ vld1.32 {d7[1]}, [lr] -+ -+ @ So now we have: -+ @ d0.32[i] = curr[i].mv[0] -+ @ d1.32[i] = curr[i].mv[1] -+ @ d2.32[i] = neigh[i].mv[0] -+ @ d3.32[i] = neigh[i].mv[1] -+ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]] -+ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] -+ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]] -+ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] -+ @ d16.16[i] = curr[i].pred_flag -+ @ d16.16[2+i] = neigh[i].pred_flag -+ -+ vtst.16 d20, d16, d17 -+ vtst.16 d22, d16, d18 -+ vadd.i16 d30, d16, d17 -+ vswp d2, d3 -+ ldr lr, [sp, #1*4] -+ vmovl.s16 q10, d20 -+ teq lr, #1 -+ vmovl.s16 q11, d22 -+ it ne -+ lslne v1, lr, #1 -+ vbif d0, d1, d20 -+ vbif d4, d6, d20 -+ vbif d3, d2, d21 -+ vbif d5, d7, d21 -+ vbif d1, d0, d22 -+ vbif d6, d4, d22 -+ vbif d2, d3, d23 -+ vbif d7, d5, d23 -+ vshr.u16 d30, #2 -+ vabd.s16 d24, d0, d3 -+ vabd.s16 d25, d1, d2 -+ vabd.s16 q0, q0, q1 -+ vceq.i32 d2, d4, d5 -+ vceq.i32 d20, d5, d6 -+ vceq.i32 d21, d4, d7 -+ vceq.i32 d3, d6, d7 -+ vclt.u16 d6, d24, d19 -+ vclt.u16 d7, d25, d19 -+ vclt.u16 d22, d1, d19 -+ vclt.u16 d23, d0, d19 -+ vshrn.i32 d6, q3, #8 -+ vmovn.i32 d2, q1 -+ vshrn.i32 d7, q11, #8 -+ vmovn.i32 d3, q10 -+ vand q0, q3, q1 -+ it ne -+ rsbne v2, v1, #32 -+ vrev16.8 q3, q3 -+ vand q0, q3 -+ vsra.u64 d30, #32 -+ vshr.u64 q1, q0, #32 -+ vand q0, q1 -+ vbic d0, d17, d0 -+ vand d30, d30, d17 -+ vbic d0, d1 -+ vmov.i16 d1, #0x5555 -+ vorr d0, d30 -+ bne 10f -+ -+ @ Construct result word, no duplicates -+ cmp a1, #2 -+ vmov.u16 a1, d0[1] -+ vmov.u16 a2, d0[0] -+ it eq -+ orreq a1, a2, a1, lsl #2 -+ pop {a2-a4,v1-v8,pc} -+10: -+ @ Construct result word, 
with duplicates -+ cmp a1, #2 -+ vmul.i16 d0, d1 -+ vmov a2, s0 -+ vmov.u16 a1, d0[1] -+ lsl a2, #16 -+ pkhbt a1, a1, a1, lsl #16 -+ lsr a2, v2 -+ lsr a1, v2 -+T itt eq -+T lsleq a1, v1 -+T orreq a1, a2, a1 -+A orreq a1, a2, a1, lsl v1 -+ pop {a2-a4,v1-v8,pc} -+endfunc -+ -+ -+ -+#else // non-NEON version -+ -+ -+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, -+ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ * int in_inc0, in_inc1) -+ */ -+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 -+ add ip, sp, #4*4 -+ push {a2-a4,v1-v8,lr} -+ mov v6, #32 -+1: ldmdb ip, {v1-v4} -+ ldrsb v5, [a3, #8] @ curr->ref_idx -+ ldrsb v8, [a3, #9] -+ ldrsb ip, [a4, #8] @ neigh->ref_idx -+ ldrsb lr, [a4, #9] -+ ldr v1, [v1, v5, lsl #2] -+ ldrb v5, [a3, #10] @ curr->pred_flag -+ ldr v2, [v2, v8, lsl #2] -+ ldrb v8, [a4, #10] @ neigh->pred_flag -+ ldr v3, [v3, ip, lsl #2] -+ ldr v4, [v4, lr, lsl #2] -+ teq v5, #3 -+ beq 20f -+ teq v8, #3 -+ beq 90f -+ -+ tst v5, #1 -+ itee ne -+ ldrne v5, [a3, #0] @ curr->mv[0] -+ moveq v1, v2 -+ ldreq v5, [a3, #4] @ curr->mv[1] -+ tst v8, #1 -+ itee ne -+ ldrne v8, [a4, #0] @ neigh->mv[0] -+ moveq v3, v4 -+ ldreq v8, [a4, #4] @ neigh->mv[1] -+ teq v1, v3 -+ bne 10f -+ ldr lr, =0xFFFCFFFC -+ ssub16 ip, v8, v5 -+ ssub16 v5, v5, v8 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ @ drop through -+10: it ne -+ movne v5, #1<<30 -+11: -+ sub v6, v6, #2 -+T mov v7, v7, lsr #2 -+ subs a2, a2, #1 -+A orr v7, v5, v7, lsr #2 -+T orr v7, v5, v7 -+ bhi 11b -+ -+ ldrd v3, v4, [sp, #16*4] -+ ldr a2, [sp] -+ add ip, sp, #16*4 -+ subs a1, a1, #1 -+ add a3, a3, v3 -+ add a4, a4, v4 -+ bhi 1b -+ mov a1, v7, lsr v6 -+ pop {a2-a4,v1-v8,pc} -+ -+20: teq v8, #3 -+ bne 10b -+ -+ teq v1, v3 -+ it eq -+ teqeq v2, v4 -+ bne 40f -+ teq v1, v2 -+ bne 30f -+ -+ ldrd v1, v2, [a3] @ curr->mv -+ ldrd v3, v4, [a4] @ neigh->mv -+ ldr lr, 
=0xFFFCFFFC -+ ssub16 ip, v3, v1 -+ ssub16 v5, v1, v3 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ bne 25f -+ ssub16 ip, v4, v2 -+ ssub16 v5, v2, v4 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ beq 11b -+ @ drop through -+25: ssub16 ip, v4, v1 -+ ssub16 v5, v1, v4 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ bne 10b -+ ssub16 ip, v3, v2 -+ ssub16 v5, v2, v3 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ b 10b -+ -+30: ldrd v1, v2, [a3] @ curr->mv -+ ldrd v3, v4, [a4] @ neigh->mv -+ ldr lr, =0xFFFCFFFC -+ ssub16 ip, v3, v1 -+ ssub16 v5, v1, v3 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ bne 10b -+ ssub16 ip, v4, v2 -+ ssub16 v5, v2, v4 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ b 10b -+ -+40: teq v1, v4 -+ ite eq -+ teqeq v2, v3 -+ bne 10b -+ -+ ldrd v1, v2, [a3] @ curr->mv -+ ldrd v3, v4, [a4] @ neigh->mv -+ ldr lr, =0xFFFCFFFC -+ b 25b -+ -+90: -+ mov v5, #1<<30 -+ b 11b -+endfunc -+ -+ -+#endif -+ -+ -+@ ============================================================================= -+@ -+@ 10 bit -+ -+function hevc_loop_filter_luma_body_10 -+ m_filter_luma 10, q11, q15 -+endfunc -+ -+function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1 -+ hevc_loop_filter_luma_start -+ b .Lh_loop_luma_common_10 -+endfunc -+ -+function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1 -+ cmp r3, #0 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldr r10, [sp, #32] -+.Lh_loop_luma_common_10: -+ m_filter_h_luma_16 10 -+endfunc -+ -+function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1 -+ hevc_loop_filter_luma_start -+ sub r4, r0, #8 -+ b .Lv_loop_luma_common_10 -+endfunc -+ -+function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1 -+ cmp r3, #0 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldr r4, [sp, #36] -+ ldr r10, [sp, #32] -+ -+.Lv_loop_luma_common_10: -+ m_filter_v_luma_16 10 -+endfunc -+ -+function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1 -+ m_filter_h_uv_16 10 -+endfunc -+ -+function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1 -+ m_filter_v_uv2_16 10 -+endfunc -+ 
-diff --git a/libavcodec/arm/rpi_hevcdsp_idct_neon.S b/libavcodec/arm/rpi_hevcdsp_idct_neon.S -new file mode 100644 -index 0000000000..7ed5c7dc52 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S -@@ -0,0 +1,184 @@ -+/* -+ * Copyright (c) 2014 Seppo Tomperi -+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+/* uses registers q8 - q13 for temp values */ -+.macro tr4_luma_shift shift -+ vaddl.s16 q8, d28, d30 // c0 = src0 + src2 -+ vaddl.s16 q9, d30, d31 // c1 = src2 + src3 -+ vsubl.s16 q10, d28, d31 // c2 = src0 - src3 -+ vaddl.s16 q11, d28, d31 // src0 + src3 -+ -+ vmul.i32 q12, q8, d1[0] // 29 * c0 -+ vmul.i32 q13, q10, d2[0] // 55 * c2 -+ vmul.i32 q8, q8, d2[0] // 55 * c0 -+ vmull.s16 q14, d29, d0[0] // c3 = 74 * src1 -+ -+ vsubw.s16 q11, q11, d30 // src0 - src2 + src3 -+ vmla.i32 q12, q9, d2[0] // 29 * c0 + 55 * c1 -+ vmls.i32 q13, q9, d1[0] // 55 * c2 - 29 * c1 -+ vmla.i32 q8, q10, d1[0] // 55 * c0 + 29 * c2 -+ -+ vmul.i32 q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3) -+ vadd.i32 q12, q12, q14 // dst0 = 29 * c0 + 55 * c1 + c3 -+ vadd.i32 q13, q13, q14 // dst1 = 55 * c2 - 29 * c1 + c3 
-+ vsub.i32 q8, q8, q14 // dst3 = 55 * c0 + 29 * c2 - c3 -+ -+ vqrshrn.s32 d28, q12, \shift -+ vqrshrn.s32 d29, q13, \shift -+ vqrshrn.s32 d30, q11, \shift -+ vqrshrn.s32 d31, q8, \shift -+.endm -+ -+/* uses registers q8 - q11 for temp values */ -+.macro tr4_shift shift -+ vmull.s16 q9, d29, d0[0] // 83 * src1 -+ vmull.s16 q8, d29, d0[1] // 36 * src1 -+ vshll.s16 q14, d28, #6 // 64 * src0 -+ vshll.s16 q10, d30, #6 // 64 * src2 -+ vmlal.s16 q9, d31, d0[1] // 83 * src1 + 36 * src3 o0 -+ vmlsl.s16 q8, d31, d0[0] // 36 * src1 - 83 * src3 o1 -+ vadd.s32 q11, q14, q10 // 64 * (src0 + src2) e0 -+ vsub.s32 q10, q14, q10 // 64 * (src0 - src2) e1 -+ vadd.s32 q14, q11, q9 // e0 + o0 -+ vadd.s32 q15, q10, q8 // e1 + o1 -+ vsub.s32 q8, q10, q8 // e1 - o1 -+ vsub.s32 q9, q11, q9 // e0 - o0 -+ -+ vqrshrn.s32 d28, q14, \shift -+ vqrshrn.s32 d29, q15, \shift -+ vqrshrn.s32 d30, q8, \shift -+ vqrshrn.s32 d31, q9, \shift -+.endm -+ -+.macro tr8_process d0, d1, d2, d3, d4, d5, d6, d7, \ -+ tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \ -+ tmp1, /* Q reg which doesn't alias with d7 or d0 */ \ -+ shift, I1, I2, I3 -+ -+ vmull.s16 q4, \d1, d1[1] // 89 * src1 -+ \I1 -+ vmull.s16 q5, \d1, d1[0] // 75 * src1 -+ \I2 -+ vmull.s16 q6, \d1, d1[3] // 50 * src1 -+ \I3 -+ vmull.s16 q7, \d1, d1[2] // 18 * src1 -+ vmlal.s16 q4, \d3, d1[0] // 75 * src3 -+ vmlsl.s16 q5, \d3, d1[2] //-18 * src3 -+ vmlsl.s16 q6, \d3, d1[1] //-89 * src3 -+ vmlsl.s16 q7, \d3, d1[3] //-50 * src3 -+ -+ // tr4 -+ vmull.s16 q1, \d2, d0[0] // 83 * src(1*2) -+ vmull.s16 q2, \d2, d0[1] // 36 * src(1*2) -+ -+ vmlal.s16 q4, \d5, d1[3] // 50 * src5 -+ vmlsl.s16 q5, \d5, d1[1] //-89 * src5 -+ vmlal.s16 q6, \d5, d1[2] // 18 * src5 -+ vmlal.s16 q7, \d5, d1[0] // 75 * src5 -+ -+ vshll.s16 q3, \d0, #6 // 64 * src(0*2) -+ vshll.s16 \tmp0, \d4, #6 // 64 * src(2*2) -+ vmlal.s16 q1, \d6, d0[1] // 83 * src(1*2) + 36 * src(3*2) o0 -+ vmlsl.s16 q2, \d6, d0[0] // 36 * src(1*2) - 83 * src(3*2) o1 -+ vadd.i32 \tmp1, q3, \tmp0 // 64 * 
(src(0*2) + src(2*2)) e0 -+ vsub.i32 \tmp0, q3, \tmp0 // 64 * (src(0*2) - src(2*2)) e1 -+ -+ vmlal.s16 q4, \d7, d1[2] // 18 * src7 -+ vmlsl.s16 q5, \d7, d1[3] //-50 * src7 -+ vmlal.s16 q6, \d7, d1[0] // 75 * src7 -+ vmlsl.s16 q7, \d7, d1[1] //-89 * src7 -+ -+ vsub.i32 q3, \tmp1, q1 // e0 - o0 -+ vadd.i32 \tmp1, \tmp1, q1 // e0 + o0 -+ vadd.i32 q1, \tmp0, q2 // e1 + o1 -+ vsub.i32 q2, \tmp0, q2 // e1 - o1 -+ -+ vadd.i32 \tmp0, \tmp1, q4 // e_8[0] + o_8[0], dst[0] -+ vsub.i32 q4, \tmp1, q4 // e_8[0] - o_8[0], dst[7] -+ vsub.i32 \tmp1, q3, q7 // e_8[3] - o_8[3], dst[4] -+ vadd.i32 q7, q3, q7 // e_8[3] + o_8[3], dst[3] -+ vadd.i32 q3, q1, q5 // e_8[1] + o_8[1], dst[1] -+ vsub.i32 q5, q1, q5 // e_8[1] - o_8[1], dst[6] -+ vsub.i32 q1, q2, q6 // e_8[2] - o_8[2], dst[5] -+ vadd.i32 q6, q2, q6 // e_8[2] + o_8[2], dst[2] -+ vqrshrn.s32 \d0, \tmp0, #\shift -+ vqrshrn.s32 \d4, \tmp1, #\shift -+ vqrshrn.s32 \d1, q3, #\shift -+ vqrshrn.s32 \d5, q1, #\shift -+ vqrshrn.s32 \d2, q6, #\shift -+ vqrshrn.s32 \d6, q5, #\shift -+ vqrshrn.s32 \d3, q7, #\shift -+ vqrshrn.s32 \d7, q4, #\shift -+.endm -+ -+.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3 -+ vld1.16 {\d0}, [r0 :64], r3 -+ vld1.16 {\d1}, [r2 :64], r3 -+ vld1.16 {\d2}, [r0 :64], r3 -+ vld1.16 {\d3}, [r2 :64], r3 -+ vld1.16 {\d4}, [r0 :64], r3 -+ vld1.16 {\d5}, [r2 :64], r3 -+ vld1.16 {\d6}, [r0 :64], r3 -+ vld1.16 {\d7}, [r2 :64], r3 -+ -+ tr8_process \ -+ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \ -+ \q01, \q23, 7, "\I1", "\I2", "\I3" -+.endm -+ -+.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift -+ tr8_process \ -+ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \ -+ \q01, \q23, \shift -+ -+ vzip.16 \d0, \d4 -+ vzip.16 \d1, \d5 -+ vzip.16 \d2, \d6 -+ vzip.16 \d3, \d7 -+ vst4.16 {\d0-\d3}, [r0 :128], r3 -+ vst4.16 {\d4-\d7}, [r2 :128], r3 -+.endm -+ -+#define BIT_DEPTH 8 -+#include "rpi_hevc_idct_fn_neon.S" -+ -+.text -+ -+.align 4 -+tr4f: -+.word 0x00240053 // 36 and d1[0] = 83 -+.word 0x00000000 
-+tr8f: -+.word 0x0059004b // 89, d0[0] = 75 -+.word 0x00320012 // 50, d0[2] = 18 -+tr16: -+.word 0x005a0057 // 90, d2[0] = 87 -+.word 0x00500046 // 80, d2[2] = 70 -+.word 0x0039002b // 57, d2[0] = 43 -+.word 0x00190009 // 25, d2[2] = 9 -+ -+#undef BIT_DEPTH -+#define BIT_DEPTH 10 -+#include "rpi_hevc_idct_fn_neon.S" -+ -diff --git a/libavcodec/arm/rpi_hevcdsp_init_arm.c b/libavcodec/arm/rpi_hevcdsp_init_arm.c -new file mode 100644 -index 0000000000..109fa98c29 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c -@@ -0,0 +1,32 @@ -+/* -+ * Copyright (c) 2014 Seppo Tomperi -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/attributes.h" -+#include "libavutil/arm/cpu.h" -+#include "libavcodec/rpi_hevcdsp.h" -+#include "rpi_hevcdsp_arm.h" -+ -+av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth) -+{ -+ int cpu_flags = av_get_cpu_flags(); -+ -+ if (have_neon(cpu_flags)) -+ ff_hevcdsp_rpi_init_neon(c, bit_depth); -+} -diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c -new file mode 100644 -index 0000000000..9294ab8010 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c -@@ -0,0 +1,467 @@ -+/* -+ * Copyright (c) 2014 Seppo Tomperi -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "config.h" -+#include "libavutil/attributes.h" -+#include "libavutil/arm/cpu.h" -+#include "libavcodec/rpi_hevcdsp.h" -+#include "rpi_hevcdsp_arm.h" -+#include "libavcodec/avcodec.h" -+#include "libavcodec/bit_depth_template.c" -+ -+// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but -+// have been removed from head as we never use them. -+ -+void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+ -+void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+ -+void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); -+void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, -+ uint8_t * _pix_l); -+void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, -+ unsigned int no_f); -+void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, -+ uint8_t * src_l, -+ unsigned int no_f); -+ -+void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); -+void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, 
unsigned int no_f, -+ uint8_t * _pix_l); -+void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, -+ unsigned int no_f); -+void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, -+ uint8_t * src_l, -+ unsigned int no_f); -+ -+void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit); -+void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit); -+void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs); -+void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs); -+void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs); -+void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs); -+void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs); -+ -+void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit); -+void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit); -+void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs); -+void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs); -+void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs); -+void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs); -+void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs); -+ -+void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+ -+void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+ -+ -+void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t 
*_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+ -+void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+ -+ -+void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t 
*_dst, ptrdiff_t stride, int32_t dc); -+ -+ -+void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+ -+void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+ -+void 
ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+ -+void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+ -+void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+ -+void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int 
sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+ -+void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+ -+void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void 
ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+ -+void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+ -+ -+uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh, -+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ int in_inc0, int in_inc1); -+void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height); -+ -+ -+static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); -+ ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); -+} -+static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); -+ 
ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); -+} -+ -+static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) -+{ -+ ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); -+ ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); -+} -+static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) -+{ -+ ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); -+ ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); -+} -+ -+#if SAO_FILTER_N == 6 -+static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); -+ ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); -+} -+static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); -+ ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); -+} -+ -+static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) -+{ -+ ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, 
sao_offset_val, sao_left_class, 16, height); -+ ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); -+} -+static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) -+{ -+ ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); -+ ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); -+} -+ -+static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); -+ ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); -+} -+static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); -+ ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); -+} -+ -+static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height) -+{ -+ ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, -+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); -+ 
ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, -+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); -+} -+static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height) -+{ -+ ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src, -+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); -+ ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, -+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); -+} -+#endif -+ -+ -+ -+#if RPI_HEVC_SAO_BUF_STRIDE != 160 -+#error SAO edge src stride not 160 - value used in .S -+#endif -+ -+av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth) -+{ -+ if (bit_depth == 8) { -+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8; -+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8; -+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8; -+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8; -+ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8; -+ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8; -+ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8; -+ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_8; -+ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_8; -+ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_8; -+ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_8; -+ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_8; -+ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_8; -+ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_8; -+ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_8; -+ c->add_residual[1] = 
ff_hevc_rpi_add_residual_8x8_neon_8; -+ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_8; -+ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_8; -+ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_8; -+ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_8; -+ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_8; -+ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_8; -+ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_8; -+ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_8; -+ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_8; -+ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_8; -+ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_8; -+ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_8; -+ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_8; -+ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_8; -+ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_8; -+ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8; -+ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8; -+ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8; -+ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_8; -+ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_8; -+ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_8; -+ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_8; -+ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_8; -+ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_8; -+ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_8; -+ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_8; -+ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_8; -+ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_8; -+ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_8; -+#if SAO_FILTER_N == 6 -+ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_8; -+ 
c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_8; -+#endif -+ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_8; -+ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_8; -+ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_8; -+ -+ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_8; -+ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_8; -+ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_8; -+ -+#if SAO_FILTER_N == 6 -+ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_8; -+ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_8; -+#endif -+ } -+ else if (bit_depth == 10) { -+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_10; -+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_10; -+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_10; -+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_10; -+ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_10; -+ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_10; -+ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_10; -+ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_10; -+ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_10; -+ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_10; -+ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_10; -+ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_10; -+ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_10; -+ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_10; -+ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_10; -+ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_10; -+ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_10; -+ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_10; -+ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_10; -+ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_10; -+ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_10; -+ 
c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_10; -+ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_10; -+ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_10; -+ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_10; -+ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_10; -+ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_10; -+ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_10; -+ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_10; -+ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_10; -+ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_10; -+ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10; -+ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10; -+ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10; -+ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_10; -+ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_10; -+ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_10; -+ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_10; -+ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_10; -+ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_10; -+ -+ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_10; -+ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_10; -+ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_10; -+ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_10; -+ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_10; -+#if SAO_FILTER_N == 6 -+ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_10; -+ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_10; -+#endif -+ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_10; -+ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_10; -+ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_10; -+ -+ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_10; -+ c->sao_edge_filter_c[1] = 
ff_hevc_rpi_sao_edge_c_16_neon_10; -+ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_10; -+ -+#if SAO_FILTER_N == 6 -+ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_10; -+ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_10; -+#endif -+ } -+ -+ assert(offsetof(HEVCRpiMvField, mv) == 0); -+ assert(offsetof(HEVCRpiMvField, ref_idx) == 8); -+ assert(offsetof(HEVCRpiMvField, pred_flag) == 10); -+ c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; -+ c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon; -+} -diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S -new file mode 100644 -index 0000000000..93876d14c0 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S -@@ -0,0 +1,620 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox, Ben Avison -+*/ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+ .arch_extension mp @ enable PLDW -+ -+#define BIT_DEPTH 10 -+ -+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX -+ vmax.s16 \Q0, \Q_MIN -+ vmax.s16 \Q1, \Q_MIN -+ vmax.s16 \Q2, \Q_MIN -+ vmax.s16 \Q3, \Q_MIN -+ vmin.s16 \Q0, \Q_MAX -+ vmin.s16 \Q1, \Q_MAX -+ vmin.s16 \Q2, \Q_MAX -+ vmin.s16 \Q3, \Q_MAX -+.endm -+ -+@ add_residual4x4( -+@ uint16_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1 -+ add ip, r0, r2 -+ vld1.16 {q10, q11}, [r1] -+ lsl r2, #1 -+ vld1.16 {d0}, [r0 :64], r2 -+ vld1.16 {d1}, [ip :64], r2 -+ vld1.16 {d2}, [r0 :64] -+ vld1.16 {d3}, [ip :64] -+ sub r0, r2 -+ vqadd.s16 q0, q10 -+ sub ip, r2 -+ vqadd.s16 q1, q11 -+ vmov.i16 q8, #0 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ vmax.s16 q0, q0, q8 -+ vmax.s16 q1, q1, q8 -+ vmin.s16 q0, q0, q9 -+ vmin.s16 q1, q1, q9 -+ vst1.16 {d0}, [r0 :64], r2 -+ vst1.16 {d1}, [ip :64], r2 -+ vst1.16 {d2}, [r0 :64] -+ vst1.16 {d3}, [ip :64] -+ bx lr -+ -+endfunc -+ -+@ add_residual4x4_dc( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1 -+ add ip, r0, r1 -+ vdup.16 q15, r2 -+ lsl r1, #1 -+ vld1.16 {d0}, [r0 :64], r1 -+ vld1.16 {d1}, [ip :64], r1 -+ vld1.16 {d2}, [r0 :64] -+ vld1.16 {d3}, [ip :64] -+ sub r0, r1 -+ 
vqadd.s16 q0, q15 -+ sub ip, r1 -+ vqadd.s16 q1, q15 -+ vmov.i16 q8, #0 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ vmax.s16 q0, q0, q8 -+ vmax.s16 q1, q1, q8 -+ vmin.s16 q0, q0, q9 -+ vmin.s16 q1, q1, q9 -+ vst1.16 {d0}, [r0 :64], r1 -+ vst1.16 {d1}, [ip :64], r1 -+ vst1.16 {d2}, [r0 :64] -+ vst1.16 {d3}, [ip :64] -+ bx lr -+ -+endfunc -+ -+ -+@ add_residual8x8( -+@ uint16_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1 -+ mov r3, #8 -+ vmov.i64 q8, #0 -+ add ip, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r2, #1 -+1: -+ vldm r1!, {q10-q13} -+ vld1.16 {q0}, [r0 :128], r2 -+ vld1.16 {q1}, [ip :128], r2 -+ vld1.16 {q2}, [r0 :128] -+ vld1.16 {q3}, [ip :128] -+ sub r0, r2 -+ vqadd.s16 q0, q10 -+ sub ip, r2 -+ vqadd.s16 q1, q11 -+ subs r3, #4 -+ vqadd.s16 q2, q12 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0}, [r0 :128], r2 -+ vst1.16 {q1}, [ip :128], r2 -+ vst1.16 {q2}, [r0 :128], r2 -+ vst1.16 {q3}, [ip :128], r2 -+ bne 1b -+ bx lr -+ -+endfunc -+ -+@ add_residual4x4_dc_c( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc_uv) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 -+ mov r3, #4 -+ vdup.32 q15, r2 -+ b 9f -+endfunc -+ -+@ add_residual8x8_dc( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r2 -+ mov r3, #8 -+9: -+ vmov.i16 q8, #0 -+ add ip, r0, r1 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r1, #1 -+1: -+ vld1.16 {q0}, [r0 :128], r1 -+ vld1.16 {q1}, [ip :128], r1 -+ vld1.16 {q2}, [r0 :128] -+ vld1.16 {q3}, [ip :128] -+ sub r0, r1 -+ vqadd.s16 q0, q15 -+ sub ip, r1 -+ vqadd.s16 q1, q15 -+ subs r3, #4 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0}, [r0 :128], r1 -+ vst1.16 {q1}, [ip :128], r1 -+ vst1.16 {q2}, [r0 :128], r1 -+ 
vst1.16 {q3}, [ip :128], r1 -+ bne 1b -+ bx lr -+ -+endfunc -+ -+@ add_residual16x16( -+@ uint16_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1 -+ add ip, r0, r2 -+ vmov.i16 q8, #0 -+ lsl r2, #1 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ mov r3, #16 -+1: -+ vldm r1!, {q10-q13} -+ @ For RPI Sand we could guarantee :256 but not for general -+ @ non-RPI allocation. :128 is as good as we can claim -+ vld1.16 {q0, q1}, [r0 :128] -+ subs r3, #2 -+ vld1.16 {q2, q3}, [ip :128] -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q11 -+ vqadd.s16 q2, q12 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0, q1}, [r0 :128], r2 -+ vst1.16 {q2, q3}, [ip :128], r2 -+ bne 1b -+ bx lr -+endfunc -+ -+@ add_residual8x8_dc_c( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc_uv) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 -+ mov r3, #8 -+ vdup.32 q15, r2 -+ b 9f -+endfunc -+ -+@ add_residual16x16_dc( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 -+ vdup.i16 q15, r2 -+ mov r3, #16 -+9: -+ vmov.i16 q8, #0 -+ add ip, r0, r1 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r1, #1 -+1: -+ @ For RPI Sand we could guarantee :256 but not for general -+ @ non-RPI allocation. 
:128 is as good as we can claim -+ vld1.16 {q0, q1}, [r0 :128] -+ subs r3, #2 -+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q15 -+ vld1.16 {q2, q3}, [ip :128] -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0, q1}, [r0 :128], r1 -+ vst1.16 {q2, q3}, [ip :128], r1 -+ bne 1b -+ bx lr -+ -+endfunc -+ -+ -+@ add_residual32x32( -+@ uint16_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1 -+ push {lr} -+ mov r3, #32 -+ vmov.i16 q8, #0 -+ add lr, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+1: -+ vldm r1!, {q10-q13} -+ vldm r0, {q0-q3} -+ vqadd.s16 q0, q10 -+ pldw [lr] -+ vqadd.s16 q1, q11 -+ add lr, r2 -+ vqadd.s16 q2, q12 -+ subs r3, #1 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0-q1}, [r0], r2 -+ vst1.16 {q2-q3}, [ip], r2 -+ bne 1b -+ pop {pc} -+ -+endfunc -+ -+@ add_residual16x16_dc_c( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc_uv) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1 -+ mov r3, #16 -+ vdup.32 q15, r2 -+ b 9f -+endfunc -+ -+@ add_residual32x32_dc( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r2 -+ mov r3, #32 -+9: -+ vmov.i16 q8, #0 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+1: -+ vldm r0, {q0-q3} -+ vqadd.s16 q0, q15 -+ subs r3, #1 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0-q1}, [r0], r1 -+ vst1.16 {q2-q3}, [ip], r1 -+ bne 1b -+ bx lr -+ -+endfunc -+ -+@ ============================================================================ -+@ U add -+ -+@ add_residual4x4_u( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, 
BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ vld1.16 {q10, q11}, [r1 :256] -+ lsl r2, #1 -+ vld2.16 {d0, d2}, [r0 :128], r2 -+ vld2.16 {d1, d3}, [ip :128], r2 -+ vld2.16 {d4, d6}, [r0 :128] -+ vld2.16 {d5, d7}, [ip :128] -+ sub r0, r2 -+ vmov.i16 q8, #0 -+ sub ip, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ -+ vst2.16 {d0, d2}, [r0 :128], r2 -+ vst2.16 {d1, d3}, [ip :128], r2 -+ vst2.16 {d4, d6}, [r0 :128] -+ vst2.16 {d5, d7}, [ip :128] -+ bx lr -+endfunc -+ -+@ add_residual8x8_u( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ mov r3, #8 -+ vmov.i16 q8, #0 -+ add ip, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r2, #1 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ subs r3, #2 -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ bx lr -+endfunc -+ -+@ add_residual16x16_u( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 -+ push {lr} -+ vdup.16 q15, r3 -+ mov r3, #16 -+ vmov.i16 q8, #0 -+ add lr, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! 
-+ vqadd.s16 q0, q10 -+ pldw [lr] -+ vqadd.s16 q1, q15 -+ add lr, r2 -+ vqadd.s16 q2, q11 -+ subs r3, #1 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ pop {pc} -+endfunc -+ -+@ ============================================================================ -+@ V add -+ -+@ add_residual4x4_v( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ vld1.16 {q10, q11}, [r1 :256] -+ lsl r2, #1 -+ vld2.16 {d0, d2}, [r0 :128], r2 -+ vld2.16 {d1, d3}, [ip :128], r2 -+ vld2.16 {d4, d6}, [r0 :128] -+ vld2.16 {d5, d7}, [ip :128] -+ sub r0, r2 -+ vmov.i16 q8, #0 -+ sub ip, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ -+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q10 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q11 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ -+ vst2.16 {d0, d2}, [r0 :128], r2 -+ vst2.16 {d1, d3}, [ip :128], r2 -+ vst2.16 {d4, d6}, [r0 :128] -+ vst2.16 {d5, d7}, [ip :128] -+ bx lr -+endfunc -+ -+@ add_residual8x8_v( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ mov r3, #8 -+ vmov.i16 q8, #0 -+ add ip, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r2, #1 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ subs r3, #2 -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! 
-+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q10 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q11 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ bx lr -+endfunc -+ -+@ add_residual16x16_v( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 -+ push {lr} -+ vdup.16 q15, r3 -+ mov r3, #16 -+ vmov.i16 q8, #0 -+ add lr, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! -+ vqadd.s16 q0, q15 -+ pldw [lr] -+ vqadd.s16 q1, q10 -+ add lr, r2 -+ vqadd.s16 q2, q15 -+ subs r3, #1 -+ vqadd.s16 q3, q11 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ pop {pc} -+endfunc -+ -+@ ============================================================================ -+@ U & V add -+ -+@ add_residual4x4_c( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 -+ vmov.i16 q8, #0 -+ add ip, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r2, #1 -+ vldm r1, {q10-q13} -+ vld2.16 {d0, d2}, [r0 :128], r2 -+ vld2.16 {d1, d3}, [ip :128], r2 -+ vld2.16 {d4, d6}, [r0 :128] -+ vld2.16 {d5, d7}, [ip :128] -+ -+ sub r0, r2 -+ vqadd.s16 q0, q10 -+ sub ip, r2 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ -+ vst2.16 {d0, d2}, [r0 :128], r2 -+ vst2.16 {d1, d3}, [ip :128], r2 -+ vst2.16 {d4, d6}, [r0 :128] -+ vst2.16 {d5, d7}, [ip :128] -+ bx lr -+endfunc -+ -+@ add_residual8x8_c( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 -+ push {lr} -+ add ip, r0, r2 -+ lsl r2, #1 -+ 
vmov.i16 q8, #0 -+ add r3, r1, #(8*8*2) @ Offset to V -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ mov lr, #8 -+1: -+ vld1.16 {q10, q11}, [r1 :256]! -+ subs lr, #2 -+ vld2.16 {q0, q1}, [r0 :256] -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q12, q13}, [r3 :256]! -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ pop {pc} -+endfunc -+ -+@ add_residual16x16_c( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 -+ push {r4, lr} -+ vmov.i16 q8, #0 -+ add r3, r1, #(16*16*2) @ Offset to V -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+ add r4, r0, r2 -+ mov lr, #16 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! -+ vld1.16 {q12, q13}, [r3 :256]! -+ vqadd.s16 q0, q10 -+ pldw [r4] -+ vqadd.s16 q1, q12 -+ add r4, r2 -+ vqadd.s16 q2, q11 -+ subs lr, #1 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ pop {r4,pc} -+endfunc -+ -diff --git a/libavcodec/arm/rpi_hevcdsp_res8_neon.S b/libavcodec/arm/rpi_hevcdsp_res8_neon.S -new file mode 100644 -index 0000000000..d9a1d7d98c ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S -@@ -0,0 +1,741 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. 
-+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox, Ben Avison -+*/ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+ .arch_extension mp @ enable PLDW -+ -+@ General notes: -+@ -+@ Residual is generally only guaranteed to be clipped to 16 bits. -+@ This means that we do need to do vmovl, vqadd, vqmovun -+@ rather than vaddw, vqmovun (if we were clipped to 15 then we could get away -+@ with this). -+@ -+@ There is an exception for the DC case because its transform is guaranteed -+@ to be small enough that overflow cannot occur during the first add. 
-+ -+@ ============================================================================ -+@ Y add -+ -+function ff_hevc_rpi_add_residual_4x4_neon_8, export=1 -+ add ip, r0, r2 -+ vld1.16 {q0, q1}, [r1] -+ lsl r2, #1 -+ vld1.32 d4[0], [r0], r2 -+ rsb r3, r2, #0 -+ vld1.32 d4[1], [ip], r2 -+ vld1.32 d5[0], [r0], r3 -+ vld1.32 d5[1], [ip], r3 -+ vmovl.u8 q8, d4 -+ vmovl.u8 q9, d5 -+ vqadd.s16 q0, q8 -+ vqadd.s16 q1, q9 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q1 -+ vst1.32 d0[0], [r0], r2 -+ vst1.32 d0[1], [ip], r2 -+ vst1.32 d1[0], [r0] -+ vst1.32 d1[1], [ip] -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_add_residual_8x8_neon_8, export=1 -+ push {r4, lr} -+ vld1.16 {q0, q1}, [r1]! -+ add ip, r0, r2 -+ vld1.8 {d6}, [r0] -+ add r4, r0, r2, lsl #1 -+ vld1.8 {d7}, [ip] -+ add lr, ip, r2, lsl #1 -+ lsl r2, #1 -+ mov r3, #8-2 -+ vmovl.u8 q2, d6 -+ vmovl.u8 q3, d7 -+ vqadd.s16 q2, q0 -+ vqadd.s16 q3, q1 -+1: -+ vld1.16 {q0, q1}, [r1]! -+ subs r3, #2 -+ vqmovun.s16 d4, q2 -+ vqmovun.s16 d5, q3 -+ vld1.8 {d6}, [r4], r2 -+ vld1.8 {d7}, [lr], r2 -+ vst1.8 {d4}, [r0], r2 -+ vst1.8 {d5}, [ip], r2 -+ vmovl.u8 q2, d6 -+ pldw [r4] -+ vmovl.u8 q3, d7 -+ vqadd.s16 q2, q0 -+ vqadd.s16 q3, q1 -+ bne 1b -+ -+ vqmovun.s16 d4, q2 -+ vqmovun.s16 d5, q3 -+ vst1.8 {d4}, [r0] -+ vst1.8 {d5}, [ip] -+ pop {r4, pc} -+endfunc -+ -+function ff_hevc_rpi_add_residual_16x16_neon_8, export=1 -+ vld1.16 {q0, q1}, [r1]! -+ add ip, r0, r2 -+ vld1.8 {q3}, [r0] -+ mov r3, #16-1 -+ vmovl.u8 q2, d6 -+ vmovl.u8 q3, d7 -+ vqadd.s16 q2, q0 -+ vqadd.s16 q3, q1 -+1: -+ vld1.16 {q0, q1}, [r1]! 
-+ subs r3, #1 -+ vqmovun.s16 d4, q2 -+ vqmovun.s16 d5, q3 -+ vld1.8 {q3}, [ip], r2 -+ vst1.8 {q2}, [r0], r2 -+ vmovl.u8 q2, d6 -+ pldw [ip] -+ vmovl.u8 q3, d7 -+ vqadd.s16 q2, q0 -+ vqadd.s16 q3, q1 -+ bne 1b -+ -+ vqmovun.s16 d4, q2 -+ vqmovun.s16 d5, q3 -+ vst1.8 {q2}, [r0] -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_add_residual_32x32_neon_8, export=1 -+ vldm r1!, {q0-q3} -+ vld1.8 {q8, q9}, [r0] -+ add ip, r0, r2 -+ vmovl.u8 q10, d16 -+ mov r3, #32-1 -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vqadd.s16 q10, q0 -+ vqadd.s16 q11, q1 -+ vqadd.s16 q12, q2 -+ vqadd.s16 q13, q3 -+1: -+ vldm r1!, {q0-q3} -+ vqmovun.s16 d20, q10 -+ vqmovun.s16 d21, q11 -+ vqmovun.s16 d22, q12 -+ vqmovun.s16 d23, q13 -+ vld1.8 {q8, q9}, [ip], r2 -+ subs r3, #1 -+ vst1.8 {q10, q11}, [r0], r2 -+ vmovl.u8 q10, d16 -+ pldw [ip] -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vqadd.s16 q10, q0 -+ vqadd.s16 q11, q1 -+ vqadd.s16 q12, q2 -+ vqadd.s16 q13, q3 -+ bne 1b -+ -+ vqmovun.s16 d20, q10 -+ vqmovun.s16 d21, q11 -+ vqmovun.s16 d22, q12 -+ vqmovun.s16 d23, q13 -+ vst1.8 {q10, q11}, [r0] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_add_residual_4x4_dc_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1 -+ add ip, r0, r1 -+ vdup.16 q15, r2 -+ lsl r1, #1 -+ vld1.32 d4[0], [r0], r1 -+ rsb r3, r1, #0 -+ vld1.32 d4[1], [ip], r1 -+ vld1.32 d5[0], [r0], r3 -+ vld1.32 d5[1], [ip], r3 -+ vaddw.u8 q0, q15, d4 -+ vaddw.u8 q1, q15, d5 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q1 -+ vst1.32 d0[0], [r0], r1 -+ vst1.32 d0[1], [ip], r1 -+ vst1.32 d1[0], [r0] -+ vst1.32 d1[1], [ip] -+ bx lr -+endfunc -+ -+@ ============================================================================ -+@ DC Y or C add -+ -+@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function 
ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1 -+ mov r3, #4-2 -+ vdup.32 q15, r2 -+ b 1f -+endfunc -+ -+@ ff_hevc_rpi_add_residual_8x8_dc_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1 -+ vdup.16 q15, r2 -+ mov r3, #8-2 -+1: vld1.8 d16, [r0] -+ add ip, r0, r1 -+ push {r4, lr} -+ vld1.8 d17, [ip] -+ add r4, r0, r1, lsl #1 -+ vaddw.u8 q0, q15, d16 -+ lsl r1, #1 -+ vaddw.u8 q1, q15, d17 -+ add lr, ip, r1 -+1: -+ vld1.8 {d16}, [r4], r1 -+ vld1.8 {d17}, [lr], r1 -+ subs r3, #2 -+ vqmovun.s16 d4, q0 -+ vqmovun.s16 d5, q1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+ vst1.8 {d4}, [r0], r1 -+ vst1.8 {d5}, [ip], r1 -+ bne 1b -+ -+ vqmovun.s16 d4, q0 -+ vqmovun.s16 d5, q1 -+ vst1.8 {d4}, [r0] -+ vst1.8 {d5}, [ip] -+ pop {r4, pc} -+endfunc -+ -+ -+@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1 -+ mov r3, #8-1 -+ vdup.32 q15, r2 -+ b 1f -+endfunc -+ -+@ ff_hevc_rpi_add_residual_16x16_dc_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1 -+ vdup.16 q15, r2 -+ mov r3, #16-1 -+1: vld1.8 {q8}, [r0] -+ add ip, r0, r1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+1: -+ vld1.8 {q8}, [ip], r1 -+ subs r3, #1 -+ vqmovun.s16 d4, q0 -+ vqmovun.s16 d5, q1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+ vst1.8 {q2}, [r0], r1 -+ bne 1b -+ -+ vqmovun.s16 d4, q0 -+ vqmovun.s16 d5, q1 -+ vst1.8 {q2}, [r0] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1 -+ mov r3, #16-1 -+ vdup.32 q15, r2 -+ b 1f -+endfunc -+ -+@ ff_hevc_rpi_add_residual_32x32_dc_neon_8( -+@ 
uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1 -+ vdup.16 q15, r2 -+ mov r3, #32-1 -+1: vld1.8 {q8, q9}, [r0] -+ add ip, r0, r1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+ vaddw.u8 q2, q15, d18 -+ vaddw.u8 q3, q15, d19 -+1: -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d22, q2 -+ vqmovun.s16 d23, q3 -+ vld1.8 {q8, q9}, [ip], r1 -+ subs r3, #1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+ vaddw.u8 q2, q15, d18 -+ vaddw.u8 q3, q15, d19 -+ vst1.8 {q10, q11}, [r0], r1 -+ bne 1b -+ -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d22, q2 -+ vqmovun.s16 d23, q3 -+ vst1.8 {q10, q11}, [r0] -+ bx lr -+endfunc -+ -+@ ============================================================================ -+@ U add -+ -+@ add_residual4x4_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc_v) [r3] -+ -+function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1 -+ add ip, r0, r2 -+ vld1.16 {q0, q1}, [r1] -+ lsl r2, #1 -+ vld1.8 {d16}, [r0 :64], r2 -+ vld1.8 {d17}, [ip :64], r2 -+ vld1.8 {d18}, [r0 :64] -+ sub r0, r2 -+ vld1.8 {d19}, [ip :64] -+ sub ip, r2 -+ vdup.16 q2, r3 -+ vdup.16 q3, r3 -+ vmovl.u8 q10, d16 -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vzip.16 q0, q2 -+ vzip.16 q1, q3 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q3, q13 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q2 -+ vqmovun.s16 d2, q1 -+ vqmovun.s16 d3, q3 -+ vst1.8 {d0}, [r0 :64], r2 -+ vst1.8 {d1}, [ip :64], r2 -+ vst1.8 {d2}, [r0 :64] -+ vst1.8 {d3}, [ip :64] -+ bx lr -+endfunc -+ -+@ add_residual8x8_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+@ int dc_v) [r3] -+ -+function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ push {r4, lr} -+ vld2.8 {d16, d17}, [r0 :128] -+ lsl r2, #1 -+ vld2.8 {d18, d19}, [ip :128] 
-+ mov r3, #8-2 -+ vld1.16 {q0, q1}, [r1 :256]! -+ add r4, r0, r2 -+ vmovl.u8 q10, d16 -+ add lr, ip, r2 -+ vmovl.u8 q11, d18 -+ vqadd.s16 q0, q10 -+ vaddw.u8 q2, q15, d17 -+ vqadd.s16 q1, q11 -+ vaddw.u8 q3, q15, d19 -+1: -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d21, q2 -+ vld2.8 {d16, d17}, [r4 :128], r2 -+ subs r3, #2 -+ vqmovun.s16 d22, q1 -+ vqmovun.s16 d23, q3 -+ vst2.8 {d20, d21}, [r0 :128], r2 -+ vld2.8 {d18, d19}, [lr :128], r2 -+ vst2.8 {d22, d23}, [ip :128], r2 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vmovl.u8 q10, d16 -+ vmovl.u8 q11, d18 -+ vqadd.s16 q0, q10 -+ vaddw.u8 q2, q15, d17 -+ vqadd.s16 q1, q11 -+ vaddw.u8 q3, q15, d19 -+ bne 1b -+ -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d21, q2 -+ vqmovun.s16 d22, q1 -+ vqmovun.s16 d23, q3 -+ vst2.8 {d20, d21}, [r0 :128] -+ vst2.8 {d22, d23}, [ip :128] -+ pop {r4, pc} -+endfunc -+ -+@ add_residual16x16_u( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+@ int dc_v) [r3] -+ -+function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ vld2.8 {q8, q9}, [r0 :256] -+ mov r3, #16-1 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vmovl.u8 q11, d16 -+ vmovl.u8 q12, d17 -+ vqadd.s16 q0, q11 -+ vaddw.u8 q11, q15, d18 -+ vqadd.s16 q1, q12 -+ vaddw.u8 q12, q15, d19 -+1: -+ vld2.8 {q8, q9}, [ip :256], r2 -+ subs r3, #1 -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d22, q11 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d23, q12 -+ vld1.16 {q0, q1}, [r1 :256]! 
-+ vst2.8 {q10, q11}, [r0 :256], r2 -+ vmovl.u8 q11, d16 -+ pldw [ip] -+ vmovl.u8 q12, d17 -+ vqadd.s16 q0, q11 -+ vaddw.u8 q11, q15, d18 -+ vqadd.s16 q1, q12 -+ vaddw.u8 q12, q15, d19 -+ bne 1b -+ -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d22, q11 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d23, q12 -+ vst2.8 {q10, q11}, [r0 :256] -+ bx lr -+endfunc -+ -+@ ============================================================================ -+@ V add -+ -+@ add_residual4x4_v( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1 -+ add ip, r0, r2 -+ vld1.16 {q2, q3}, [r1] -+ lsl r2, #1 -+ vld1.8 {d16}, [r0 :64], r2 -+ vld1.8 {d17}, [ip :64], r2 -+ vld1.8 {d18}, [r0 :64] -+ sub r0, r2 -+ vld1.8 {d19}, [ip :64] -+ sub ip, r2 -+ vdup.16 q0, r3 -+ vdup.16 q1, r3 -+ vmovl.u8 q10, d16 -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vzip.16 q0, q2 -+ vzip.16 q1, q3 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q3, q13 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q2 -+ vqmovun.s16 d2, q1 -+ vqmovun.s16 d3, q3 -+ vst1.8 {d0}, [r0 :64], r2 -+ vst1.8 {d1}, [ip :64], r2 -+ vst1.8 {d2}, [r0 :64] -+ vst1.8 {d3}, [ip :64] -+ bx lr -+endfunc -+ -+@ add_residual8x8_v( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ push {r4, lr} -+ vld2.8 {d16, d17}, [r0 :128] -+ lsl r2, #1 -+ vld2.8 {d18, d19}, [ip :128] -+ mov r3, #8-2 -+ vld1.16 {q0, q1}, [r1 :256]! 
-+ add r4, r0, r2 -+ vmovl.u8 q10, d17 -+ add lr, ip, r2 -+ vmovl.u8 q11, d19 -+ vqadd.s16 q0, q10 -+ vaddw.u8 q2, q15, d16 -+ vqadd.s16 q1, q11 -+ vaddw.u8 q3, q15, d18 -+1: -+ vqmovun.s16 d20, q2 -+ vqmovun.s16 d21, q0 -+ vld2.8 {d16, d17}, [r4 :128], r2 -+ subs r3, #2 -+ vqmovun.s16 d22, q3 -+ vqmovun.s16 d23, q1 -+ vst2.8 {d20, d21}, [r0 :128], r2 -+ vld2.8 {d18, d19}, [lr :128], r2 -+ vst2.8 {d22, d23}, [ip :128], r2 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vmovl.u8 q10, d17 -+ vmovl.u8 q11, d19 -+ vqadd.s16 q0, q10 -+ vaddw.u8 q2, q15, d16 -+ vqadd.s16 q1, q11 -+ vaddw.u8 q3, q15, d18 -+ bne 1b -+ -+ vqmovun.s16 d20, q2 -+ vqmovun.s16 d21, q0 -+ vqmovun.s16 d22, q3 -+ vqmovun.s16 d23, q1 -+ vst2.8 {d20, d21}, [r0 :128] -+ vst2.8 {d22, d23}, [ip :128] -+ pop {r4, pc} -+endfunc -+ -+@ add_residual16x16_v( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ vld2.8 {q8, q9}, [r0 :256] -+ mov r3, #16-1 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vmovl.u8 q11, d18 -+ vmovl.u8 q12, d19 -+ vqadd.s16 q0, q11 -+ vaddw.u8 q11, q15, d16 -+ vqadd.s16 q1, q12 -+ vaddw.u8 q12, q15, d17 -+1: -+ vld2.8 {q8, q9}, [ip :256], r2 -+ subs r3, #1 -+ vqmovun.s16 d20, q11 -+ vqmovun.s16 d22, q0 -+ vqmovun.s16 d21, q12 -+ vqmovun.s16 d23, q1 -+ vld1.16 {q0, q1}, [r1 :256]! 
-+ vst2.8 {q10, q11}, [r0 :256], r2 -+ vmovl.u8 q11, d18 -+ pldw [ip] -+ vmovl.u8 q12, d19 -+ vqadd.s16 q0, q11 -+ vaddw.u8 q11, q15, d16 -+ vqadd.s16 q1, q12 -+ vaddw.u8 q12, q15, d17 -+ bne 1b -+ -+ vqmovun.s16 d20, q11 -+ vqmovun.s16 d22, q0 -+ vqmovun.s16 d21, q12 -+ vqmovun.s16 d23, q1 -+ vst2.8 {q10, q11}, [r0 :256] -+ bx lr -+endfunc -+ -+@ ============================================================================ -+@ U & V add -+ -+@ add_residual4x4_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1 -+ add ip, r0, r2 -+ vld1.16 {q0, q1}, [r1]! @ all of U -+ lsl r2, #1 -+ vld1.8 {d16}, [r0 :64], r2 -+ rsb r3, r2, #0 -+ vld1.8 {d17}, [ip :64], r2 -+ vld1.16 {q2, q3}, [r1] @ all of V -+ vld1.8 {d18}, [r0 :64], r3 -+ vld1.8 {d19}, [ip :64], r3 -+ vmovl.u8 q10, d16 -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vzip.16 q0, q2 -+ vzip.16 q1, q3 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q3, q13 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q2 -+ vqmovun.s16 d2, q1 -+ vqmovun.s16 d3, q3 -+ vst1.8 {d0}, [r0 :64], r2 -+ vst1.8 {d1}, [ip :64], r2 -+ vst1.8 {d2}, [r0 :64] -+ vst1.8 {d3}, [ip :64] -+ bx lr -+endfunc -+ -+@ add_residual8x8_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1 -+ vld2.8 {d16, d17}, [r0 :128] -+ add r3, r1, #(8*8*2) @ Offset to V -+ vld1.16 {q0}, [r1 :128]! -+ add ip, r0, r2 -+ vld1.16 {q1}, [r3 :128]! -+ vmovl.u8 q10, d16 -+ push {lr} -+ vmovl.u8 q8, d17 -+ mov lr, #8-1 -+ vqadd.s16 q10, q0 -+ vqadd.s16 q1, q8 -+1: -+ vld2.8 {d16, d17}, [ip :128], r2 -+ subs lr, #1 -+ vld1.16 {q0}, [r1 :128]! -+ vqmovun.s16 d20, q10 -+ vqmovun.s16 d21, q1 -+ vld1.16 {q1}, [r3 :128]! 
-+ vst2.8 {d20, d21}, [r0 :128], r2 -+ vmovl.u8 q10, d16 -+ pldw [ip] -+ vmovl.u8 q8, d17 -+ vqadd.s16 q10, q0 -+ vqadd.s16 q1, q8 -+ bne 1b -+ -+ vqmovun.s16 d20, q10 -+ vqmovun.s16 d21, q1 -+ vst2.8 {d20, d21}, [r0 :128] -+ pop {pc} -+endfunc -+ -+@ add_residual16x16_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1 -+ vld2.8 {q8, q9}, [r0 :256] -+ add r3, r1, #(16*16*2) @ Offset to V -+ vld1.16 {q0, q1}, [r1 :256]! -+ add ip, r0, r2 -+ vld1.16 {q2, q3}, [r3 :256]! -+ vmovl.u8 q10, d16 -+ push {lr} -+ vmovl.u8 q8, d17 -+ mov lr, #16-1 -+ vmovl.u8 q11, d18 -+ vmovl.u8 q9, d19 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q8 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q9 -+1: -+ vld2.8 {q8, q9}, [ip :256], r2 -+ subs lr, #1 -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d22, q2 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d23, q3 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vst2.8 {d20-d23}, [r0 :256], r2 -+ vld1.16 {q2, q3}, [r3 :256]! -+ vmovl.u8 q10, d16 -+ pldw [ip] -+ vmovl.u8 q8, d17 -+ vmovl.u8 q11, d18 -+ vmovl.u8 q9, d19 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q8 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q9 -+ bne 1b -+ -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d22, q2 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d23, q3 -+ vst2.8 {d20-d23}, [r0 :256] -+ pop {pc} -+endfunc -+ -+@ 32x32 chroma never occurs so NIF -+ -+@ ============================================================================ -diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S -new file mode 100644 -index 0000000000..b56e0f9644 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S -@@ -0,0 +1,2245 @@ -+/* -+ * Copyright (c) 2014 - 2015 Seppo Tomperi -+ * 2017 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. 
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+.set EDGE_SRC_STRIDE, 160 -+ -+@ PIC jump tables are fractionally more expensive than absolute in our code -+.set jent_pic, CONFIG_PIC -+ -+ -+.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4 -+ vshr.u8 q12, q8, #3 -+ \I1 -+ vadd.i8 q8, \Q_K128 -+ \I2 -+ vshr.u8 q13, q9, #3 -+ \I3 -+ vadd.i8 q9, \Q_K128 -+ \I4 -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT0, d25 -+ vtbl.8 d26, \XLAT1, d26 -+ vtbl.8 d27, \XLAT1, d27 -+ -+ vqadd.s8 q8, q12 -+ vshr.u8 q12, q10, #3 -+ vadd.i8 q10, \Q_K128 -+ vqadd.s8 q9, q13 -+ vshr.u8 q13, q11, #3 -+ vadd.i8 q11, \Q_K128 -+ -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT0, d25 -+ vtbl.8 d26, \XLAT1, d26 -+ vtbl.8 d27, \XLAT1, d27 -+ vqadd.s8 q10, q12 -+ vsub.i8 q8, \Q_K128 -+ vqadd.s8 q11, q13 -+ vsub.i8 q9, \Q_K128 -+ vsub.i8 q10, \Q_K128 -+ vsub.i8 q11, \Q_K128 -+.endm -+ -+.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4 -+ \L1 -+ \L2 -+ \L3 -+ \L4 -+ \L5 -+ vadd.i8 q12, q8, \Q_K128 -+ vshr.u8 q8, #3 -+ vtbl.8 d16, \XLAT0, d16 -+ vtbl.8 d17, \XLAT1, d17 -+ vqadd.s8 q12, q8 -+ bmi 2f -+1: \L1 -+ \L2 -+ \L3 -+ \L4 -+ \L5 -+ vsub.i8 q13, q12, \Q_K128 -+ vadd.i8 q12, q8, \Q_K128 -+ vshr.u8 q8, 
#3 -+ \S1 -+ \S2 -+ \S3 -+ \S4 -+ vtbl.8 d16, \XLAT0, d16 -+ vtbl.8 d17, \XLAT1, d17 -+ vqadd.s8 q12, q8 -+ bpl 1b -+2: vsub.i8 q13, q12, \Q_K128 -+ \S1 -+ \S2 -+ \S3 -+ \S4 -+.endm -+ -+ -+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX -+ vmax.s16 \Q0, \Q_MIN -+ vmax.s16 \Q1, \Q_MIN -+ vmax.s16 \Q2, \Q_MIN -+ vmax.s16 \Q3, \Q_MIN -+ vmin.s16 \Q0, \Q_MAX -+ vmin.s16 \Q1, \Q_MAX -+ vmin.s16 \Q2, \Q_MAX -+ vmin.s16 \Q3, \Q_MAX -+.endm -+ -+@ Clobbers q12, q13 -+.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2 -+ vshrn.i16 d24, \Q0, #(\bit_depth - 5) -+ vshrn.i16 d25, \Q1, #(\bit_depth - 5) -+ vshrn.i16 d26, \Q2, #(\bit_depth - 5) -+ \I1 -+ vtbl.8 d24, \XLAT0, d24 -+ vshrn.i16 d27, \Q3, #(\bit_depth - 5) -+ vtbl.8 d25, \XLAT1, d25 -+ \I2 -+ vtbl.8 d26, \XLAT0, d26 -+ vtbl.8 d27, \XLAT1, d27 -+ vaddw.s8 \Q0, d24 -+ vaddw.s8 \Q1, d25 -+ vaddw.s8 \Q2, d26 -+ vaddw.s8 \Q3, d27 -+ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX -+.endm -+ -+@ Clobbers q10, q11, q12 -+.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4 -+ \L1 -+ \L2 -+ \L3 -+ \L4 -+ \L5 -+ vshrn.i16 d24, \Q0, #\bit_depth - 5 -+ vshrn.i16 d25, \Q1, #\bit_depth - 5 -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT1, d25 -+ vaddw.s8 q10, \Q0, d24 -+ vaddw.s8 q11, \Q1, d25 -+ bmi 2f -+1: \L1 -+ \L2 -+ \L3 -+ \L4 -+ \L5 -+ vmax.s16 q10, \Q_MIN -+ vmax.s16 q11, \Q_MIN -+ vshrn.i16 d24, \Q0, #\bit_depth - 5 -+ vshrn.i16 d25, \Q1, #\bit_depth - 5 -+ vmin.s16 q10, \Q_MAX -+ vmin.s16 q11, \Q_MAX -+ \S1 -+ \S2 -+ \S3 -+ \S4 -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT1, d25 -+ vaddw.s8 q10, \Q0, d24 -+ vaddw.s8 q11, \Q1, d25 -+ bpl 1b -+2: vmax.s16 q10, \Q_MIN -+ vmax.s16 q11, \Q_MIN -+ vmin.s16 q10, \Q_MAX -+ vmin.s16 q11, \Q_MAX -+ \S1 -+ \S2 -+ \S3 -+ \S4 -+.endm -+ -+ -+@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) -+@ so we are quite safe stuffing it into a byte array -+@ There may be a subsequent shl by 
log2_sao_offset_scale_luma/chroma -+@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of -+@ precision -+ -+@ This, somewhat nasty, bit of code builds the {d0-d3} translation -+@ array via the stack -+@ Given that sao_left_class > 28 can cause wrap we can't just poke -+@ all 4 bytes in at once -+@ -+@ It also loads other common regs -+ -+@ Beware that the offset read here overrreads by 6 bytes so source must be sized appropriately -+function band_load_y -+ ldr ip, [sp, #16] @ &sao_offset_val[0] -+ ldr r4, [sp, #20] @ sao_left_class -+ vmov.i64 d4, #0 -+ vmov.i64 q0, #0 -+ pld [r1] -+ vld2.8 {q8}, [ip] -+ sub ip, sp, #8*5 -+ vmov.i64 q1, #0 -+ add r4, ip, r4 -+ vpush {d0-d4} @ Put zero array on stack -+ vshr.u64 d16, d16, #8 @ 1st interesting val is [1] -+ ldr ip, [ip, #8*5 + 28] @ height -+ vst1.32 {d16[0]}, [r4] -+ add r4, r1, r3 -+ vpop {d0-d4} @ Pop modified array -+ sub ip, ip, #1 -+ vorr d0, d0, d4 -+ bx lr -+endfunc -+ -+@ Beware that offset reads here overrread by 6 bytes so source must be sized appropriately -+function band_load_c -+ ldr ip, [sp, #16] @ &sao_offset_val1[0] -+ ldr r4, [sp, #20] @ sao_left_class1 -+ vmov.i64 d24, #0 -+ vmov.i64 q10, #0 -+ pld [r1] -+ vld2.8 {q8}, [ip] -+ sub ip, sp, #8*5 -+ vmov.i64 q11, #0 -+ add r4, ip, r4 -+ ldr ip, [sp, #24] @ &sao_offset_val2[0] -+ vpush {d20-d24} @ Put zero array on stack -+ vld2.8 {q9}, [ip] -+ vshr.u64 d16, d16, #8 @ 1st interesting val is [1] -+ ldr ip, [sp, #8*5 + 28] @ sao_left_class2 -+ vst1.32 {d16[0]}, [r4] -+ add ip, sp, ip -+ vshr.u64 d18, d18, #8 @ 1st interesting val is [1] -+ vldmia sp, {d0-d3} @ Load modified array -+ vldr d16, [sp, #8*4] -+ add r4, r1, r3 -+ vstmia sp, {d20-d24} @ Put zero array on stack (again) -+ vst1.32 {d18[0]}, [ip] -+ vorr d0, d0, d16 -+ vldmia sp, {d4-d7} @ Load modified array -+ vldr d18, [sp, #8*4] -+ ldr ip, [sp, #8*5 + 36] @ height -+ add sp, sp, #8*5 -+ vorr d4, d4, d18 -+ sub ip, ip, #1 -+ bx lr -+endfunc -+ -+ -+@ 
ff_hevc_rpi_sao_band_64_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_band_64_neon_8, export=1 -+ push {r4-r6, lr} -+ vmov.u8 q15, #128 -+ bl band_load_y -+ -+1: vldmia r1, {q8-q11} -+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \ -+ "pld [r4]", \ -+ "subs ip, #1", \ -+ "it ne; addne r4, r3", \ -+ "add r1, r3" -+ vstmia r0, {q8-q11} -+ add r0, r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_32_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_band_32_neon_8, export=1 -+ push {r4-r6, lr} -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ vmov.u8 q15, #128 -+ bl band_load_y -+ -+1: vld1.8 { q8, q9 }, [r1, :128], r3 -+ subs ip, #2 -+ vld1.8 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 -+ -+ vst1.8 { q8, q9 }, [r0, :128], r2 -+ vst1.8 {q10, q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_16_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_band_16_neon_8, export=1 -+ push {r4-r6, lr} -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ vmov.u8 q15, #128 -+ bl band_load_y -+ -+1: vld1.8 { q8}, [r1, :128], r3 -+ subs ip, #4 -+ vld1.8 { q9}, [r6, :128], r3 -+ vld1.8 {q10}, [r1, :128], r3 -+ vld1.8 {q11}, [r6, :128], r3 -+ -+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 -+ -+ vst1.8 { q8}, [r0, :128], r2 -+ vst1.8 
{ q9}, [r5, :128], r2 -+ vst1.8 {q10}, [r0, :128], r2 -+ vst1.8 {q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_8_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_band_8_neon_8, export=1 -+ ldr ip, [sp, #8] @ width -+ push {r4-r6, lr} -+ vmov.u8 q15, #128 -+ cmp ip, #8 -+ bl band_load_y -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ blt 4f -+ -+ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ -+ "vld1.8 {d16}, [r1, :64], r3", \ -+ "subs ip, #2", \ -+ "vld1.8 {d17}, [r6, :64], r3", \ -+ "", \ -+ "", \ -+ "vst1.8 {d26}, [r0, :64], r2", \ -+ "vst1.8 {d27}, [r5, :64], r2" -+ pop {r4-r6, pc} -+4: -+ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ -+ "vld1.32 {d16[0]}, [r1, :32], r3", \ -+ "subs ip, #4", \ -+ "vld1.32 {d16[1]}, [r6, :32], r3", \ -+ "vld1.32 {d17[0]}, [r1, :32], r3", \ -+ "vld1.32 {d17[1]}, [r6, :32], r3", \ -+ "vst1.32 {d26[0]}, [r0, :32], r2", \ -+ "vst1.32 {d26[1]}, [r5, :32], r2", \ -+ "vst1.32 {d27[0]}, [r0, :32], r2", \ -+ "vst1.32 {d27[1]}, [r5, :32], r2" -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_c_32_neon_8( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+function ff_hevc_rpi_sao_band_c_32_neon_8, export=1 -+ push {r4-r6, lr} -+ add r5, r0, #32 -+ add r6, r1, #32 -+ vmov.u8 q15, #128 -+ bl band_load_c -+ -+1: vld2.8 { q8, q9 }, [r1, :128], r3 -+ subs ip, #1 -+ vld2.8 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \ -+ "pld [r4]", \ -+ "it ne; addne r4, r3" -+ -+ vst2.8 { q8, q9 }, [r0, :128], r2 -+ vst2.8 {q10, q11}, [r5, 
:128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_c_16_neon_8( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+function ff_hevc_rpi_sao_band_c_16_neon_8, export=1 -+ push {r4-r6, lr} -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ vmov.u8 q15, #128 -+ bl band_load_c -+ -+1: vld2.8 { q8, q9 }, [r1, :128], r3 -+ subs ip, #2 -+ vld2.8 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_8 {d0-d3}, {d4-d7}, q15 -+ -+ vst2.8 { q8, q9 }, [r0, :128], r2 -+ vst2.8 {q10, q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_c_8_neon_8( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+function ff_hevc_rpi_sao_band_c_8_neon_8, export=1 -+ ldr ip, [sp, #16] @ width -+ push {r4-r6, lr} -+ vmov.u8 q15, #128 -+ cmp ip, #8 -+ bl band_load_c -+ blt 4f -+ -+ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ -+ "vld2.8 {d16-d17}, [r1, :128], r3", \ -+ "subs ip, #1", \ -+ "", \ -+ "", \ -+ "", \ -+ "vst2.8 {d26-d27}, [r0, :128], r2" -+ pop {r4-r6, pc} -+4: -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ -+ "vld1.8 {d16}, [r1, :64], r3", \ -+ "subs ip, #2", \ -+ "vld1.8 {d17}, [r6, :64], r3", \ -+ "vuzp.8 d16, d17", \ -+ "", \ -+ "vzip.8 d26, d27", \ -+ "vst1.8 {d26}, [r0, :64], r2", \ -+ "vst1.8 {d27}, [r5, :64], r2" -+ pop {r4-r6, pc} -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_64_neon_10 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t 
*sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+.macro band_64_16 bit_depth -+ push {r4-r6, lr} -+ vmov.i64 q2, #0 -+ vmov.i16 q3, #(1 << \bit_depth) - 1 -+ bl band_load_y -+ vpush {q4-q7} -+ -+1: vldm r1, {q4-q11} -+ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ -+ "subs ip, #1", \ -+ "add r1, r3" -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth -+ vstm r0, {q4-q11} -+ add r0, r2 -+ bpl 1b -+ -+ vpop {q4-q7} -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_64_neon_10, export=1 -+ band_64_16 10 -+endfunc -+ -+@ ff_hevc_rpi_sao_band_32_neon_10 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+.macro band_32_16 bit_depth -+ push {r4-r6, lr} -+ vmov.i64 q2, #0 -+ vmov.i16 q3, #(1 << \bit_depth) - 1 -+ bl band_load_y -+ -+1: vldm r1, {q8-q11} -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ -+ "subs ip, #1", \ -+ "add r1, r3" -+ vstm r0, {q8-q11} -+ add r0, r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_32_neon_10, export=1 -+ band_32_16 10 -+endfunc -+ -+@ ff_hevc_rpi_sao_band_16_neon_10 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+.macro band_16_16 bit_depth -+ push {r4-r6, lr} -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ vmov.i64 q14, #0 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_y -+ -+1: vld1.16 { q8, q9 }, [r1, :128], r3 -+ subs r12, #2 -+ vld1.16 {q10, q11}, [r6, :128], r3 -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth -+ vst1.16 { q8, q9 
}, [r0, :128], r2 -+ vst1.16 {q10, q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_16_neon_10, export=1 -+ band_16_16 10 -+endfunc -+ -+@ ff_hevc_rpi_sao_band_8_neon_10 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+.macro band_8_16 bit_depth -+ ldr ip, [sp, #8] @ width -+ push {r4-r6, lr} -+ vmov.i64 q14, #0 -+ cmp ip, #8 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_y -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ blt 4f -+ -+ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ -+ "vld1.16 {q8}, [r1, :128], r3", \ -+ "subs ip, #2", \ -+ "vld1.16 {q9}, [r6, :128], r3", \ -+ "", \ -+ "", \ -+ "vst1.16 {q10}, [r0, :128], r2", \ -+ "vst1.16 {q11}, [r5, :128], r2" -+ pop {r4-r6, pc} -+4: -+ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ -+ "vld1.16 {d16}, [r1, :64], r3", \ -+ "subs ip, #4", \ -+ "vld1.16 {d17}, [r6, :64], r3", \ -+ "vld1.16 {d18}, [r1, :64], r3", \ -+ "vld1.16 {d19}, [r6, :64], r3", \ -+ "vst1.16 {d20}, [r0, :64], r2", \ -+ "vst1.16 {d21}, [r5, :64], r2", \ -+ "vst1.16 {d22}, [r0, :64], r2", \ -+ "vst1.16 {d23}, [r5, :64], r2" -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_8_neon_10, export=1 -+ band_8_16 10 -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_c_32_neon_10( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+.macro band_c_32_16 bit_depth -+ push {r4-r6, lr} -+ add r5, r0, #32 -+ add r6, r1, #32 -+ sub r2, #64 -+ sub r3, #64 -+ vmov.i64 q14, #0 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_c -+ mov lr, #64 -+ 
vpush {q4-q7} -+ -+1: vld2.16 { q4, q5 }, [r1, :128], lr -+ subs ip, #1 -+ vld2.16 { q6, q7 }, [r6, :128], lr -+ vld2.16 { q8, q9 }, [r1, :128], r3 -+ vld2.16 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ -+ "pld [r4]", \ -+ "it ne; addne r4, r3" -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth -+ -+ vst2.16 { q4, q5 }, [r0, :128], lr -+ vst2.16 { q6, q7 }, [r5, :128], lr -+ vst2.16 { q8, q9 }, [r0, :128], r2 -+ vst2.16 {q10, q11}, [r5, :128], r2 -+ -+ bpl 1b -+ -+ vpop {q4-q7} -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_c_32_neon_10, export=1 -+ band_c_32_16 10 -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_c_16_neon_10( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+.macro band_c_16_16 bit_depth -+ push {r4-r6, lr} -+ add r5, r0, #32 -+ add r6, r1, #32 -+ vmov.i64 q14, #0 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_c -+ -+1: vld2.16 { q8, q9 }, [r1, :128], r3 -+ subs ip, #1 -+ vld2.16 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth -+ -+ vst2.16 { q8, q9 }, [r0, :128], r2 -+ vst2.16 {q10, q11}, [r5, :128], r2 -+ -+ bpl 1b -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_c_16_neon_10, export=1 -+ band_c_16_16 10 -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_c_8_neon_10( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+.macro band_c_8_16 bit_depth -+ ldr ip, [sp, #16] @ width 
-+ push {r4-r6, lr} -+ vmov.i64 q14, #0 -+ cmp ip, #8 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_c -+ blt 4f -+ -+ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ -+ "vld2.16 {q8,q9}, [r1, :128], r3", \ -+ "subs ip, #1", \ -+ "", \ -+ "", \ -+ "", \ -+ "vst2.16 {q10,q11}, [r0, :128], r2" -+ pop {r4-r6, pc} -+4: -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ -+ "vld2.16 {d16,d18}, [r1, :128], r3", \ -+ "subs ip, #2", \ -+ "vld2.16 {d17,d19}, [r6, :128], r3", \ -+ "", \ -+ "", \ -+ "vst2.16 {d20,d22}, [r0, :128], r2", \ -+ "vst2.16 {d21,d23}, [r5, :128], r2" -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_c_8_neon_10, export=1 -+ band_c_8_16 10 -+endfunc -+ -+ -+@ ============================================================================= -+@ SAO EDGE -+ -+@ r0 destination address -+@ r2 stride to post-increment r0 with -+@ [r5] translate values -+@ -+@ a <- c <- b -+@ a in q0 - q3 -+@ c in q4 - q7 -+@ b in q8 - q11 -+@ -+@ q12-15 used as temp -+@ -+@ Can be used for both Y & C as we unzip/zip the deltas and -+@ transform "u/v" separately via d26/d27. 
For Y d26=d27 -+ -+function edge_64b_body_8 -+ -+ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q1 -+ vcgt.u8 q14, q6, q2 -+ vcgt.u8 q15, q7, q3 -+ -+ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0 -+ vcgt.u8 q1, q5 -+ vcgt.u8 q2, q6 -+ vcgt.u8 q3, q7 -+ -+ vsub.s8 q0, q12 @ a = sign(c-a) -+ vsub.s8 q1, q13 -+ vsub.s8 q2, q14 -+ vsub.s8 q3, q15 -+ -+ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q9 -+ vcgt.u8 q14, q6, q10 -+ vcgt.u8 q15, q7, q11 -+ -+ vsub.s8 q0, q12 -+ vsub.s8 q1, q13 -+ vsub.s8 q2, q14 -+ vsub.s8 q3, q15 -+ -+ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0 -+ vcgt.u8 q13, q9, q5 -+ vcgt.u8 q14, q10, q6 -+ vcgt.u8 q15, q11, q7 -+ -+ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b) -+ vadd.s8 q1, q13 -+ vmov.u8 q12, #2 -+ vadd.s8 q2, q14 -+ vadd.s8 q3, q15 -+ -+ vadd.s8 q0, q12 -+ vadd.s8 q1, q12 -+ -+ vld1.8 {d26, d27}, [r5] -+ -+ vadd.s8 q2, q12 -+ vuzp.8 q0, q1 -+ vmov.u8 q15, #128 -+ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b) -+ -+ vtbl.8 d0, {d26}, d0 -+ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add -+ -+ vtbl.8 d1, {d26}, d1 -+ vadd.s8 q14, q5, q15 -+ -+ vtbl.8 d2, {d27}, d2 -+ vuzp.8 q2, q3 -+ -+ vtbl.8 d3, {d27}, d3 -+ -+ vtbl.8 d4, {d26}, d4 -+ vzip.8 q0, q1 -+ -+ vtbl.8 d5, {d26}, d5 -+ vqadd.s8 q0, q12 -+ vqadd.s8 q1, q14 -+ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add -+ -+ vtbl.8 d6, {d27}, d6 -+ vtbl.8 d7, {d27}, d7 -+ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add -+ vzip.8 q2, q3 -+ -+ vsub.s8 q0, q15 -+ vqadd.s8 q2, q12 -+ vqadd.s8 q3, q14 -+ vsub.s8 q1, q15 -+ vsub.s8 q2, q15 -+ vsub.s8 q3, q15 -+ -+ bx lr -+endfunc -+ -+@ r0 destination address -+@ r2 stride to post-increment r0 with -+@ r4 upper clip value -+@ [r5] translate values -+@ -+@ a <- c <- b -+@ a in q0 - q3 -+@ c in q4 - q7 -+@ b in q8 - q11 -+@ -+@ q12-15 used as temp -+@ -+@ Can be used for both Y & C as we unzip/zip the deltas and -+@ transform "u/v" 
separately via d26/d27. For Y d26=d27 -+ -+function edge_64b_body_16 -+ -+ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 -+ vcgt.u16 q13, q5, q1 -+ vcgt.u16 q14, q6, q2 -+ vcgt.u16 q15, q7, q3 -+ -+ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 -+ vcgt.u16 q1, q1, q5 -+ vcgt.u16 q2, q2, q6 -+ vcgt.u16 q3, q3, q7 -+ -+ vsub.s16 q0, q0, q12 // a = sign(c-a) -+ vsub.s16 q1, q1, q13 -+ vsub.s16 q2, q2, q14 -+ vsub.s16 q3, q3, q15 -+ -+ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 -+ vcgt.u16 q13, q5, q9 -+ vcgt.u16 q14, q6, q10 -+ vcgt.u16 q15, q7, q11 -+ -+ vsub.s16 q0, q0, q12 -+ vsub.s16 q1, q1, q13 -+ vsub.s16 q2, q2, q14 -+ vsub.s16 q3, q3, q15 -+ -+ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 -+ vcgt.u16 q13, q9, q5 -+ vcgt.u16 q14, q10, q6 -+ vcgt.u16 q15, q11, q7 -+ -+ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) -+ vadd.s16 q1, q1, q13 -+ vadd.s16 q2, q2, q14 -+ vadd.s16 q3, q3, q15 -+ -+ vmov.u8 q12, #2 -+ -+ vmovn.s16 d0, q0 -+ vmovn.s16 d1, q1 -+ vmovn.s16 d2, q2 -+ vmovn.s16 d3, q3 -+ -+ vldr d26, [r5] -+ -+ vuzp.8 q0, q1 -+ -+ vldr d27, [r5, #8] -+ -+ vadd.s8 q0, q0, q12 -+ vadd.s8 q1, q1, q12 -+ -+ vmov.i64 q12, #0 -+ -+ vtbl.8 d0, {d26}, d0 -+ vtbl.8 d1, {d26}, d1 -+ vtbl.8 d2, {d27}, d2 -+ vtbl.8 d3, {d27}, d3 -+ -+ vdup.i16 q13, r4 -+ -+ vzip.8 q0, q1 -+ -+ @ Avoid overwrite whilst widening -+ vaddw.s8 q2, q6, d2 -+ vaddw.s8 q3, q7, d3 -+ vaddw.s8 q1, q5, d1 -+ vaddw.s8 q0, q4, d0 -+ -+ @ now clip -+ clip16_4 q2, q3, q1, q0, q12, q13 -+ -+ bx lr -+endfunc -+ -+ -+@ a <- c <- b -+@ a in q0 -+@ c in q1 -+@ b in q2 -+@ Temp q3, q9, q10 -+@ -+@ d16, d17 (q8) xlat U, V -+@ q14.u8 #2 -+@ q15.u8 #128 -+ -+function edge_16b_body_8 -+ vcgt.u8 q9, q0, q1 @ a > c -> -1 , otherwise 0 -+ vadd.u8 q9, q14, q9 -+ vcgt.u8 q0, q1, q0 @ c > a -> -1 , otherwise 0 -+ vsub.u8 q9, q9, q0 -+ vcgt.u8 q0, q2, q1 @ c < b -> -1 , otherwise 0 -+ vadd.u8 q9, q9, q0 -+ vcgt.u8 q0, q1, q2 @ c > b -> -1 , otherwise 0 -+ vsub.u8 q0, q9, q0 -+ -+ vadd.s8 q3, q1, 
q15 @ Add -128 so we can use saturating signed add -+ -+ vuzp.8 d0, d1 -+ -+ vtbl.8 d0, {d16}, d0 -+ vtbl.8 d1, {d17}, d1 -+ -+ vzip.8 d0, d1 -+ vqadd.s8 q0, q3 -+ vsub.s8 q0, q15 -+ -+ bx lr -+endfunc -+ -+@ a <- c <- b -+@ a in q0 -+@ c in q1 -+@ b in q2 -+@ Temp q3 -+@ -+@ q12, #0 -+@ d16, d17 xlat U, V -+@ q14.u8 #2 -+@ q15.u16 max -+function edge_16b_body_16 -+ vcgt.u16 q9, q0, q1 @ a > c -> -1 , otherwise 0 -+ vadd.u16 q9, q14, q9 -+ vcgt.u16 q0, q1, q0 @ c > a -> -1 , otherwise 0 -+ vsub.u16 q9, q9, q0 -+ vcgt.u16 q0, q2, q1 @ c < b -> -1 , otherwise 0 -+ vadd.u16 q9, q9, q0 -+ vcgt.u16 q0, q1, q2 @ c > b -> -1 , otherwise 0 -+ vsub.u16 q0, q9, q0 -+ -+ vmovn.s16 d0, q0 -+ @ d1 will have random contents that we transform but -+ @ that doesn't matter as we then discard them -+ vuzp.8 d0, d1 -+ -+ vtbl.8 d0, {d16}, d0 -+ vtbl.8 d1, {d17}, d1 -+ -+ vzip.8 d0, d1 -+ -+ vaddw.s8 q0, q1, d0 -+ -+ @ now clip -+ vmax.s16 q0, q12 -+ vmin.s16 q0, q15 -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_edge_[c_]xx_neon( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only -+@ int eo, [sp, #sp_base + 0] -+@ int width, [sp, #sp_base + 4] -+@ int height) [sp, #sp_base + 8] -+ -+@ Jumps via jump_tab with -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ EDGE_SRC_STRIDE [r3] -+@ (1 << \bit_depth) - 1 [r4] -+@ * xlat_table [r5] // setup_64b only -+@ int height [r12] -+@ -+@ 0 [q12] // > 8 bit -+@ 2 [q14] -+@ 128 [q15] // = 8 bit -+@ r4 [q15] // > 8 bit -+ -+.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0 -+ -+@ Build translate registers -+@ As translate values can only be 0-4 we don't care about junk in the rest -+@ of the register -+.if \is_chroma -+ ldr ip, [sp, #0] -+ push {r4-r6, lr} @ 16 bytes -+ vld1.8 {d16[2]}, [r3] -+ add r3, r3, #2 
-+ vld1.8 {d17[2]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[0]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[0]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[1]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[1]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[3]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[3]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[4]}, [r3] -+ vld1.8 {d17[4]}, [ip] -+ movw r3, EDGE_SRC_STRIDE -+.set sp_base, 20 -+.else -+ add ip, r3, #4 -+ vld1.8 {d16[1]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[0]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[0]}, [r3] -+ add r3, r3, #6 -+ vld1.8 {d17[1]}, [ip] -+ vld1.8 {d16[2]}, [r3] -+ movw r3, EDGE_SRC_STRIDE -+ push {r4-r6, lr} @ 16 bytes -+ vzip.8 d16, d17 -+ vmov d17, d16 -+.set sp_base, 16 -+.endif -+ -+@ If setup_64b we need the xlat table on the stack -+.if \setup_64b -+ sub r5, sp, #16 -+.endif -+ -+@ Get jump address -+@ We have a special case for width 4 as the calling code doesn't detect it -+@ If we may have w4 then we add a 2nd jump table after the 1st -+.if \check_w4 -+ ldr r12, [sp, #sp_base + 4] @ width -+ adr r6, \jump_tab -+ ldr lr, [sp, #sp_base + 0] @ e0 -+ cmp r12, #8 -+ it lt -+ addlt r6, #16 -+.else -+ ldr lr, [sp, #sp_base + 0] @ e0 -+ adr r6, \jump_tab -+.endif -+ -+ ldr r12, [sp, #sp_base + 8] @ height -+ -+.if \bit_depth > 8 -+ movw r4, (1 << \bit_depth) - 1 -+.endif -+.if \setup_16b -+.if \bit_depth > 8 -+ vmov.i64 q12, #0 -+ vdup.16 q15, r4 -+ vmov.u16 q14, #2 -+.else -+ vmov.u8 q15, #128 -+ vmov.u8 q14, #2 -+.endif -+.endif -+ -+@ If setup_64b we need q4-q7 saved. 
-+.if \setup_64b -+ vpush {q4-q8} @ 80 bytes, q8 pushed first -+.set sp_base, sp_base + 80 -+.endif -+ -+ ldr r6, [r6, lr, lsl #2] -+ -+@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes -+.if \do2 -+ push {r0, r1, r6, r12} -+.if jent_pic -+ bl 98f -+.else -+ blx r6 -+.endif -+ pop {r0, r1, r6, r12} -+ -+ add r0, #64 -+ add r1, #64 -+.endif -+ -+.if jent_pic -+ bl 98f -+.else -+ blx r6 -+.endif -+ -+@ Tidy up & return -+.if \setup_64b -+ vpop {q4-q8} @ spurious but harmless load of q8 -+.endif -+ pop {r4-r6, pc} -+ -+.if jent_pic && !\xjump -+@ Magic label - used as 98b in jent macro -+98: -+ add pc, r6 -+.endif -+.endm -+ -+ -+.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab -+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 -+.endm -+ -+.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0 -+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump -+.endm -+ -+ -+.macro edge_64b_e0, body_fn, pb -+ sub r1, #8 -+ mov r6, lr -+1: vldm r1, {d7-d16} -+ // load a -+ vext.8 q0, q3, q4, #(16 - \pb) -+ add r1, r3 -+ vext.8 q1, q4, q5, #(16 - \pb) -+ subs r12, #1 -+ vext.8 q2, q5, q6, #(16 - \pb) -+ vext.8 q3, q6, q7, #(16 - \pb) -+ pld [r1] -+ // load b -+ vext.8 q11, q7, q8, #\pb @ Avoid overwrite -+ pld [r1, #64] -+ vext.8 q8, q4, q5, #\pb -+ vext.8 q9, q5, q6, #\pb -+ vext.8 q10, q6, q7, #\pb -+ bl \body_fn -+ vstm r0, {q0-q3} -+ add r0, r0, r2 -+ bgt 1b -+ bx r6 -+.endm -+ -+.macro edge_32bx2_e0, body_fn, pb -+ add r6, r1, r3 -+ push {r7,lr} -+ sub r1, #8 -+ add r7, r0, r2 -+ lsl r2, #1 -+1: vldmia r1, {d7-d12} -+ // load a -+ vext.8 q0, q3, q4, #16 - \pb -+ add r1, r1, r3, lsl #1 -+ vext.8 q1, q4, q5, #16 - \pb -+ subs r12, #2 -+ // load b -+ vext.8 q8, q4, q5, #\pb -+ vext.8 q9, q5, q6, #\pb -+ vldr d25, [r6, #-8] -+ vldmia r6, {d12-d15} -+ vldr d26, [r6, #32] -+ // load a -+ vext.8 q2, q12, q6, #16 - \pb -+ add r6, r6, r3, lsl #1 -+ vext.8 q3, q6, q7, #16 - \pb 
-+ // load b -+ vext.8 q10, q6, q7, #\pb -+ vext.8 q11, q7, q13, #\pb -+ bl \body_fn -+ vst1.8 {q0-q1}, [r0, :256], r2 -+ vst1.8 {q2-q3}, [r7, :256], r2 -+ bgt 1b -+ pop {r7,pc} -+.endm -+ -+.macro edge_16b_e0, body_fn, pb -+ sub r1, #8 -+ mov r6, lr -+1: vldmia r1, {d1-d4} -+ add r1, r3 -+ subs r12, #1 -+ vext.8 q0, q0, q1, #16 - \pb -+ vext.8 q2, q1, q2, #\pb -+ -+ bl \body_fn -+ vst1.8 {q0}, [r0, :128], r2 -+ bgt 1b -+ bx r6 -+.endm -+ -+.macro edge_8bx2_e0, body_fn, pb -+ add r6, r1, r3 -+ push {r7,lr} -+ sub r1, #8 -+ add r7, r0, r2 -+ lsl r2, #1 -+1: vldmia r1, {d1-d2} -+ vldmia r6, {d3-d4} -+ vldr d6, [r1, #16] -+ subs r12, #2 -+ vldr d7, [r6, #-8] -+ add r1, r1, r3, lsl #1 -+ vext.8 d0, d1, d2, #8 - \pb -+ add r6, r6, r3, lsl #1 -+ vext.8 d5, d3, d4, #\pb -+ vext.8 d4, d2, d6, #\pb -+ vext.8 d1, d7, d3, #8 - \pb -+ -+ bl \body_fn -+ vst1.8 {d0}, [r0, :64], r2 -+ vst1.8 {d1}, [r7, :64], r2 -+ bgt 1b -+ pop {r7,pc} -+.endm -+ -+.macro edge_4bx4_e0, body_fn, pb -+ add r6, r1, r3 -+ push {r7,lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ -+ tst r1, #4 -+ bne 2f -+1: // r1 (and assumed r6) are 64-bit aligned -+ vldr d2, [r1] -+ vldr d0, [r1, #-8] -+ add r1, r1, r3, lsl #1 -+ vldr d20, [r6] -+ subs r12, #4 -+ vldr d18, [r6, #-8] -+ add r6, r6, r3, lsl #1 -+ vldr d3, [r1] -+ vshr.u64 d4, d2, #\pb * 8 -+ vldr d1, [r1, #-8] -+ add r1, r1, r3, lsl #1 -+ vldr d21, [r6] -+ vext.8 d0, d0, d2, #8 - \pb -+ vldr d19, [r6,#-8] -+ add r6, r6, r3, lsl #1 -+ vshr.u64 d22, d20, #\pb * 8 -+ vext.8 d18, d18, d20, #8 - \pb -+ vshr.u64 d5, d3, #\pb * 8 -+ vext.8 d1, d1, d3, #8 - \pb -+ vshr.u64 d23, d21, #\pb * 8 -+ vext.8 d19, d19, d21, #8 - \pb -+ vsli.64 q1, q10, #32 -+ vsli.64 q2, q11, #32 -+ vsli.64 q0, q9, #32 -+ -+ bl \body_fn -+ vst1.32 {d0[0]}, [r0, :32], r2 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vst1.32 {d1[0]}, [r0, :32], r2 -+ vst1.32 {d1[1]}, [r7, :32], r2 -+ bgt 1b -+ pop {r7,pc} -+ -+2: // r1 (and assumed r6) are 32-bit but not 64-bit aligned -+ vldr d20, [r1, #-4] -+ vldr 
d22, [r1, #4] -+ add r1, r1, r3, lsl #1 -+ vldr d2, [r6, #-4] -+ subs r12, #4 -+ vldr d4, [r6, #4] -+ add r6, r6, r3, lsl #1 -+ vldr d21, [r1, #-4] -+ vshl.i64 d18, d20, #\pb * 8 -+ vldr d23, [r1, #4] -+ add r1, r1, r3, lsl #1 -+ vldr d3, [r6, #-4] -+ vext.8 d22, d20, d22, #\pb -+ vldr d5, [r6, #4] -+ add r6, r6, r3, lsl #1 -+ vshl.i64 d0, d2, #\pb * 8 -+ vext.8 d4, d2, d4, #\pb -+ vshl.i64 d19, d21, #\pb * 8 -+ vext.8 d23, d21, d23, #\pb -+ vshl.i64 d1, d3, #\pb * 8 -+ vext.8 d5, d3, d5, #\pb -+ vsri.64 q1, q10, #32 -+ vsri.64 q0, q9, #32 -+ vsri.64 q2, q11, #32 -+ -+ bl \body_fn -+ vst1.32 {d0[0]}, [r0, :32], r2 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vst1.32 {d1[0]}, [r0, :32], r2 -+ vst1.32 {d1[1]}, [r7, :32], r2 -+ bgt 2b -+ pop {r7,pc} -+.endm -+ -+ -+.macro edge_64b_e1, body_fn -+ sub r1, r3 -+ push {lr} -+ add r6, r1, #32 -+ // load a -+ vld1.8 {q0-q1}, [r1, :256], r3 -+ vld1.8 {q2-q3}, [r6, :256], r3 -+ // load c -+ vld1.8 {q4-q5}, [r1, :256], r3 -+ vld1.8 {q6-q7}, [r6, :256], r3 -+1: // load b -+ vld1.8 {q8-q9}, [r1, :256], r3 -+ subs r12, #1 -+ vld1.8 {q10-q11}, [r6, :256], r3 -+ bl \body_fn -+ vstm r0, {q0-q3} -+ // copy c to a -+ vmov.64 q0, q4 -+ pld [r1, r3] -+ vmov.64 q1, q5 -+ it le -+ pople {lr} -+ vmov.64 q2, q6 -+ it le -+ bxle lr -+ vmov.64 q3, q7 -+ add r0, r0, r2 -+ // copy b to c -+ vmov.64 q4, q8 -+ vmov.64 q5, q9 -+ vmov.64 q6, q10 -+ vmov.64 q7, q11 -+ b 1b -+.endm -+ -+.macro edge_32bx2_e1, body_fn -+ sub r6, r1, r3 -+ vld1.8 {q2-q3}, [r1, :256], r3 -+ vld1.8 {q0-q1}, [r6, :256] -+ mov r6, lr -+ -+1: @ Given the data duplication here we could obviously do better than -+ @ using the generic body_fn but it almost certainly isn't worth it -+ vld1.8 {q8-q9}, [r1, :256], r3 -+ subs r12, #2 -+ vmov q4, q2 -+ vmov q5, q3 -+ vld1.8 {q10-q11}, [r1, :256], r3 -+ vmov q6, q8 -+ vmov q7, q9 -+ -+ bl \body_fn -+ -+ vst1.8 {q0-q1}, [r0, :256], r2 -+ // copy b to a -+ vmov q0, q8 -+ vmov q1, q9 -+ vst1.8 {q2-q3}, [r0, :256], r2 -+ vmov q2, q10 -+ it le -+ 
bxle r6 -+ vmov q3, q11 -+ b 1b -+.endm -+ -+.macro edge_16b_e1, body_fn -+ sub r6, r1, r3 -+ // load c -+ vld1.8 {q1}, [r1, :128], r3 -+ // load a -+ vld1.8 {q0}, [r6, :128] -+ mov r6, lr -+1: // load b -+ vld1.8 {q2}, [r1, :128], r3 -+ bl \body_fn -+ vst1.8 {q0}, [r0, :128], r2 -+ subs r12, #1 -+ // copy c to a -+ vmov.64 q0, q1 -+ it le -+ bxle r6 -+ // copy b to c -+ vmov.64 q1, q2 -+ b 1b -+.endm -+ -+.macro edge_8bx2_e1, body_fn -+ sub r6, r1, r3 -+ lsl r3, #1 -+ push {r7, lr} -+ vld1.8 {d1}, [r1, :64], r3 -+ vld1.8 {d0}, [r6, :64], r3 -+ add r7, r0, r2 -+ lsl r2, #1 -+1: @ Given the data duplication here we could obviously do better than -+ @ using the generic body_fn but it almost certainly isn't worth it -+ vld1.8 {d4}, [r6, :64], r3 -+ vmov d2, d1 -+ vld1.8 {d5}, [r1, :64], r3 -+ subs r12, #2 -+ vmov d3, d4 -+ -+ bl \body_fn -+ -+ vst1.8 {d0}, [r0, :64], r2 -+ vst1.8 {d1}, [r7, :64], r2 -+ -+ // copy b to a -+ vmov q0, q2 -+ bgt 1b -+ pop {r7, pc} -+.endm -+ -+.macro edge_4bx4_e1, body_fn -+ sub r6, r1, r3 -+ lsl r3, #1 -+ push {r7, lr} -+ vld1.32 {d0[1]}, [r1, :32], r3 -+ add r7, r0, r2 -+ vld1.32 {d0[0]}, [r6, :32], r3 -+ lsl r2, #1 -+ vld1.32 {d4[1]}, [r1, :32], r3 -+ vld1.32 {d4[0]}, [r6, :32], r3 -+ vld1.32 {d5[1]}, [r1, :32], r3 -+ vld1.32 {d5[0]}, [r6, :32], r3 -+ vmov d1, d4 -+ vext.32 d2, d0, d4, #1 -+ subs r12, #4 -+ vmov d22, d5 -+ vext.32 d3, d4, d5, #1 -+ b 2f -+ -+1: vst1.32 {d0[0]}, [r0, :32], r2 -+ vext.32 d2, d22, d4, #1 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vmov d0, d22 -+ vst1.32 {d1[0]}, [r0, :32], r2 -+ vext.32 d3, d4, d5, #1 -+ vst1.32 {d1[1]}, [r7, :32], r2 -+ vmov d1, d4 -+ vmov d22, d5 -+2: @ Given the data duplication here we could probably do better than -+ @ using the generic body_fn but it almost certainly isn't worth it -+ bl \body_fn -+ ble 3f -+ vld1.32 {d4[0]}, [r6, :32], r3 -+ subs r12, #4 -+ vld1.32 {d4[1]}, [r1, :32], r3 -+ vld1.32 {d5[0]}, [r6, :32], r3 -+ vld1.32 {d5[1]}, [r1, :32], r3 -+ b 1b -+ -+3: vst1.32 {d0[0]}, 
[r0, :32], r2 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vst1.32 {d1[0]}, [r0, :32] -+ vst1.32 {d1[1]}, [r7, :32] -+ pop {r7, pc} -+.endm -+ -+.macro edge_64b_e2, body_fn, pb -+ push {lr} -+ sub r6, r1, r3 -+ // load c and a -+ vld1.8 {q4-q5}, [r1, :128] -+ vldr d25, [r6, #-8] -+ vldmia r6, {d16-d23} -+ vext.8 q0, q12, q8, #16 - \pb -+ add r6, r1, #32 -+ vext.8 q1, q8, q9, #16 - \pb -+ add r1, r1, r3 -+ vext.8 q2, q9, q10, #16 - \pb -+ vld1.8 {q6-q7}, [r6, :128] -+ sub r6, r1, r3 -+ vext.8 q3, q10, q11, #16 - \pb -+ -+1: // load b -+ vldmia r1, {d16-d24} -+ vext.8 q8, q8, q9, #\pb -+ pld [r1, r3] -+ vext.8 q9, q9, q10, #\pb -+ subs r12, #1 -+ vext.8 q10, q10, q11, #\pb -+ vext.8 q11, q11, q12, #\pb -+ bl \body_fn -+ // next a is mostly available in c -+ vldr d25, [r6, #-8] -+ vstmia r0, {q0-q3} -+ vext.8 q3, q6, q7, #16 - \pb -+ it le -+ pople {lr} -+ vext.8 q2, q5, q6, #16 - \pb -+ it le -+ bxle lr -+ vext.8 q1, q4, q5, #16 - \pb -+ add r6, r6, r3 -+ vext.8 q0, q12, q4, #16 - \pb -+ add r0, r0, r2 -+ // next c is mostly available in b -+ vldr d8, [r1] -+ vext.8 d9, d16, d17, #8 - \pb -+ vext.8 q5, q8, q9, #16 - \pb -+ add r1, r1, r3 -+ vext.8 q6, q9, q10, #16 - \pb -+ pld [r6, #-8] -+ vext.8 q7, q10, q11, #16 - \pb -+ b 1b -+.endm -+ -+.macro edge_32bx2_e2, body_fn, pb -+ sub r6, r1, r3 -+ push {r7, lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ // load a and first 32b of c -+ vld1.8 {q4-q5}, [r1, :256] -+ vldr d25, [r6, #-8] -+ vld1.8 {q13-q14}, [r6, :256] -+ vldr d31, [r1, #-8] -+ add r6, r6, r3, lsl #1 -+ vext.8 q0, q12, q13, #16 - \pb -+ add r1, r1, r3, lsl #1 -+ vext.8 q1, q13, q14, #16 - \pb -+ vext.8 q2, q15, q4, #16 - \pb -+ vext.8 q3, q4, q5, #16 - \pb -+1: -+ // load second 32b of c and second 32b of b -+ vldmia r6, {d12-d16} -+ vldmia r1, {d20-d24} -+ // first 32b of b is mostly available in second 32b of c -+ vext.8 q9, q7, q8, #\pb -+ subs r12, #2 -+ vext.8 q8, q6, q7, #\pb -+ vext.8 q10, q10, q11, #\pb -+ vext.8 q11, q11, q12, #\pb -+ -+ bl \body_fn -+ -+ vst1.8 
{q0-q1}, [r0, :256], r2 -+ vst1.8 {q2-q3}, [r7, :256], r2 -+ ble 2f -+ -+ vldr d25, [r6, #-8] -+ add r6, r6, r3, lsl #1 -+ vldr d8, [r1] -+ vext.8 d9, d20, d21, #8 - \pb -+ vldr d31, [r1, #-8] -+ add r1, r1, r3, lsl #1 -+ // first 32b of a is mostly available in second 32b of c -+ vext.8 q1, q6, q7, #16 - \pb -+ vext.8 q0, q12, q6, #16 - \pb -+ // first 32b of c is mostly available in second 32b of b -+ vext.8 q5, q10, q11, #16 - \pb -+ // second 32b of a is mostly available in first 32b of c -+ vext.8 q2, q15, q4, #16 - \pb -+ vext.8 q3, q4, q5, #16 - \pb -+ b 1b -+ -+2: pop {r7, pc} -+.endm -+ -+.macro edge_16b_e2, body_fn, pb -+ push {lr} -+ sub r6, r1, r3 -+ vld1.8 {q1}, [r1, :128], r3 -+ vldr d19, [r6, #-8] -+ vld1.8 {q10}, [r6, :128], r3 -+ -+1: vldmia r1, {d4-d6} -+ vext.8 q0, q9, q10, #16 - \pb -+ subs r12, #1 -+ vext.8 q2, q2, q3, #\pb -+ bl \body_fn -+ vst1.8 {q0}, [r0, :128], r2 -+ ble 2f -+ vmov q10, q1 -+ vldr d2, [r1] -+ add r1, r1, r3 -+ vldr d19, [r6, #-8] -+ add r6, r6, r3 -+ vext.8 d3, d4, d5, #8 - \pb -+ b 1b -+ -+2: pop {pc} -+.endm -+ -+.macro edge_8bx2_e2, body_fn, pb -+ sub r6, r1, r3 -+ push {r7, lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ vldr d18, [r6, #-8] -+ vldr d19, [r6] -+ add r6, r6, r3, lsl #1 -+ vldr d20, [r1, #-8] -+ vldr d2, [r1] -+ add r1, r1, r3, lsl #1 -+ vldmia r6, {d3-d4} -+ vld1.8 {d21-d22}, [r1, :128] -+ -+1: vext.8 d0, d18, d19, #8 - \pb -+ vext.8 d4, d3, d4, #\pb -+ vext.8 d1, d20, d2, #8 - \pb -+ subs r12, #2 -+ vext.8 d5, d21, d22, #\pb -+ -+ bl \body_fn -+ -+ vst1.8 {d0}, [r0, :64], r2 -+ vst1.8 {d1}, [r7, :64], r2 -+ ble 2f -+ -+ vldr d18, [r6, #-8] -+ add r6, r6, r3, lsl #1 -+ vldr d20, [r1, #-8] -+ vmov d19, d3 -+ vldr d2, [r1] -+ add r1, r1, r3, lsl #1 -+ vldmia r6, {d3-d4} -+ vld1.8 {d21-d22}, [r1, :128] -+ b 1b -+ -+2: pop {r7, pc} -+.endm -+ -+.macro edge_4bx4_e2, body_fn, pb -+ sub r6, r1, r3 -+ push {r7-r9, lr} -+ add r8, r1, r3 -+ sub r6, r6, #\pb -+ add r8, r8, #\pb -+ add r7, r0, r2 -+ lsl r2, #1 -+ -+1: 
vld1.32 {d0[0]}, [r6], r3 -+ subs r12, #4 -+ vld1.32 {d2[0]}, [r1], r3 -+ vld1.32 {d4[0]}, [r8], r3 -+ vld1.32 {d0[1]}, [r6], r3 -+ vld1.32 {d2[1]}, [r1], r3 -+ vld1.32 {d4[1]}, [r8], r3 -+ vld1.32 {d1[0]}, [r6], r3 -+ vld1.32 {d3[0]}, [r1], r3 -+ vld1.32 {d5[0]}, [r8], r3 -+ vld1.32 {d1[1]}, [r6], r3 -+ vld1.32 {d3[1]}, [r1], r3 -+ vld1.32 {d5[1]}, [r8], r3 -+ -+ bl \body_fn -+ -+ vst1.32 {d0[0]}, [r0, :32], r2 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vst1.32 {d1[0]}, [r0, :32], r2 -+ vst1.32 {d1[1]}, [r7, :32], r2 -+ bgt 1b -+ -+ pop {r7-r9,pc} -+.endm -+ -+.macro edge_64b_e3, body_fn, pb -+ push {lr} -+ sub r6, r1, r3 -+ // load c and a -+ vld1.8 {q4-q5}, [r1, :128] -+ vldmia r6, {d16-d24} -+ vext.8 q0, q8, q9, #\pb -+ add r6, r1, #32 -+ vext.8 q1, q9, q10, #\pb -+ add r1, r1, r3 -+ vext.8 q2, q10, q11, #\pb -+ vld1.8 {q6-q7}, [r6, :128] -+ sub r6, r1, r3 -+ vext.8 q3, q11, q12, #\pb -+ -+1: // load b -+ vldr d17, [r1, #-8] -+ vldmia r1, {d18-d25} -+ vext.8 q8, q8, q9, #16 - \pb -+ pld [r1, r3] -+ vext.8 q9, q9, q10, #16 - \pb -+ subs r12, #1 -+ vext.8 q10, q10, q11, #16 - \pb -+ vext.8 q11, q11, q12, #16 - \pb -+ bl \body_fn -+ // next a is mostly available in c -+ vldr d24, [r6, #64] -+ vstmia r0, {q0-q3} -+ vext.8 q0, q4, q5, #\pb -+ it le -+ pople {lr} -+ vext.8 q1, q5, q6, #\pb -+ it le -+ bxle lr -+ vext.8 q2, q6, q7, #\pb -+ add r6, r6, r3 -+ vext.8 q3, q7, q12, #\pb -+ add r0, r0, r2 -+ // next c is mostly available in b -+ vext.8 d14, d22, d23, #\pb -+ vldr d15, [r1, #56] -+ vext.8 q4, q8, q9, #\pb -+ add r1, r1, r3 -+ vext.8 q5, q9, q10, #\pb -+ vext.8 q6, q10, q11, #\pb -+ b 1b -+.endm -+ -+.macro edge_32bx2_e3, body_fn, pb -+ sub r6, r1, r3 -+ push {r7, lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ // load a and first 32b of c -+ vldmia r1, {d8-d12} -+ vldmia r6, {d24-d28} -+ vext.8 q2, q4, q5, #\pb -+ add r6, r6, r3, lsl #1 -+ vext.8 q3, q5, q6, #\pb -+ add r1, r1, r3, lsl #1 -+ vext.8 q0, q12, q13, #\pb -+ vext.8 q1, q13, q14, #\pb -+1: -+ // load second 32b 
of c and second 32b of b -+ vldr d25, [r6, #-8] -+ subs r12, #2 -+ vldmia r6, {d12-d15} -+ vldr d27, [r1, #-8] -+ vldmia r1, {d20-d23} -+ // first 32b of b is mostly available in second 32b of c -+ vext.8 q8, q12, q6, #16 - \pb -+ vext.8 q9, q6, q7, #16 - \pb -+ vext.8 q11, q10, q11, #16 - \pb -+ vext.8 q10, q13, q10, #16 - \pb -+ -+ bl \body_fn -+ -+ vst1.8 {q0-q1}, [r0, :256], r2 -+ vst1.8 {q2-q3}, [r7, :256], r2 -+ ble 2f -+ -+ vldr d24, [r6, #32] -+ add r6, r6, r3, lsl #1 -+ vldr d11, [r1, #24] -+ vext.8 d10, d22, d23, #\pb -+ vldr d30, [r1, #32] -+ add r1, r1, r3, lsl #1 -+ // first 32b of a is mostly available in second 32b of c -+ vext.8 q0, q6, q7, #\pb -+ vext.8 q1, q7, q12, #\pb -+ // first 32b of c is mostly available in second 32b of b -+ vext.8 q4, q10, q11, #\pb -+ // second 32b of a is mostly available in first 32b of c -+ vext.8 q3, q5, q15, #\pb -+ vext.8 q2, q4, q5, #\pb -+ b 1b -+ -+2: pop {r7, pc} -+.endm -+ -+.macro edge_16b_e3, body_fn, pb -+ push {lr} -+ sub r6, r1, r3 -+ vld1.8 {q1}, [r1, :128], r3 -+ vldmia r6, {d18-d20} -+ add r6, r6, r3 -+ -+1: vldr d5, [r1, #-8] -+ vld1.8 {q3}, [r1, :128] -+ subs r12, #1 -+ vext.8 q0, q9, q10, #\pb -+ vext.8 q2, q2, q3, #16 - \pb -+ bl \body_fn -+ vst1.8 {q0}, [r0, :128], r2 -+ ble 2f -+ vmov q9, q1 -+ vldr d3, [r1, #8] -+ add r1, r1, r3 -+ vldr d20, [r6, #16] -+ add r6, r6, r3 -+ vext.8 d2, d4, d5, #\pb -+ b 1b -+ -+2: pop {pc} -+.endm -+ -+.macro edge_8bx2_e3, body_fn, pb -+ sub r6, r1, r3 -+ push {r7, lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ vld1.8 {d18-d19}, [r6] -+ add r6, r6, r3, lsl #1 -+ vldr d20, [r1, #8] -+ vldr d2, [r1] -+ add r1, r1, r3, lsl #1 -+ vldr d4, [r6, #-8] -+ vldr d3, [r6] -+ vldr d21, [r1, #-8] -+ vldr d22, [r1] -+ -+1: vext.8 d0, d18, d19, #\pb -+ vext.8 d4, d4, d3, #8 - \pb -+ vext.8 d1, d2, d20, #\pb -+ subs r12, #2 -+ vext.8 d5, d21, d22, #8 - \pb -+ -+ bl \body_fn -+ -+ vst1.8 {d0}, [r0, :64], r2 -+ vst1.8 {d1}, [r7, :64], r2 -+ ble 2f -+ -+ vldr d19, [r6, #8] -+ add r6, r6, r3, 
lsl #1 -+ vldr d20, [r1, #8] -+ vmov d18, d3 -+ vldr d2, [r1] -+ add r1, r1, r3, lsl #1 -+ vldr d4, [r6, #-8] -+ vldr d3, [r6] -+ vldr d21, [r1, #-8] -+ vldr d22, [r1] -+ b 1b -+ -+2: pop {r7, pc} -+.endm -+ -+.macro edge_4bx4_e3, body_fn, pb -+ @ e3 is the same as e2 but with the X offset reversed -+ edge_4bx4_e2 \body_fn, (-\pb) -+.endm -+ -+@ Jump table entry - if in neon mode the bottom bit must be set -+@ ? There is probably a real asm instruction to do this but I haven't found it -+.macro jent lab -+.if jent_pic -+@ Could use .short here but due to A32 not supporting ldrh [lsl#1] it is -+@ simpler and clearer in the code to stick with .word -+T .word (0 + \lab) - (4 + 98b) -+A .word (0 + \lab) - (8 + 98b) -+.else -+T .word 1 + \lab -+A .word \lab -+.endif -+.endm -+ -+.macro edge_64b_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ -+0: edge_64b_e0 \body_fn, \pb -+10: edge_64b_e1 \body_fn -+20: edge_64b_e2 \body_fn, \pb -+30: edge_64b_e3 \body_fn, \pb -+.endm -+ -+.macro edge_32bx2_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ -+0: edge_32bx2_e0 \body_fn, \pb -+10: edge_32bx2_e1 \body_fn -+20: edge_32bx2_e2 \body_fn, \pb -+30: edge_32bx2_e3 \body_fn, \pb -+.endm -+ -+.macro edge_16b_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ -+0: edge_16b_e0 \body_fn, \pb -+10: edge_16b_e1 \body_fn -+20: edge_16b_e2 \body_fn, \pb -+30: edge_16b_e3 \body_fn, \pb -+.endm -+ -+.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ jent 5f -+ jent 15f -+ jent 25f -+ jent 35f -+ -+0: edge_32bx2_e0 \body_fn_64b, \pb -+10: edge_32bx2_e1 \body_fn_64b -+20: edge_32bx2_e2 \body_fn_64b, \pb -+30: edge_32bx2_e3 \body_fn_64b, \pb -+5: edge_16b_e0 \body_fn_16b, \pb -+15: edge_16b_e1 \body_fn_16b -+25: edge_16b_e2 \body_fn_16b, \pb -+35: edge_16b_e3 \body_fn_16b, \pb -+.endm -+ -+.macro edge_16b_8bx2_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f 
-+ jent 5f -+ jent 15f -+ jent 25f -+ jent 35f -+ -+0: edge_16b_e0 \body_fn, \pb -+10: edge_16b_e1 \body_fn -+20: edge_16b_e2 \body_fn, \pb -+30: edge_16b_e3 \body_fn, \pb -+5: edge_8bx2_e0 \body_fn, \pb -+15: edge_8bx2_e1 \body_fn -+25: edge_8bx2_e2 \body_fn, \pb -+35: edge_8bx2_e3 \body_fn, \pb -+.endm -+ -+.macro edge_8bx2_4bx4_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ jent 5f -+ jent 15f -+ jent 25f -+ jent 35f -+ -+0: edge_8bx2_e0 \body_fn, \pb -+10: edge_8bx2_e1 \body_fn -+20: edge_8bx2_e2 \body_fn, \pb -+30: edge_8bx2_e3 \body_fn, \pb -+5: edge_4bx4_e0 \body_fn, \pb -+15: edge_4bx4_e1 \body_fn -+25: edge_4bx4_e2 \body_fn, \pb -+35: edge_4bx4_e3 \body_fn, \pb -+.endm -+ -+@ void ff_hevc_rpi_sao_edge_8_neon_8( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_8_neon_8, export=1 -+ edge_16b_init 8, 0, 1, 99f -+99: -+ edge_8bx2_4bx4_bodies edge_16b_body_8, 1 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_16_neon_8( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_16_neon_8, export=1 -+ edge_16b_init 8, 0, 0, 99f -+99: -+ edge_16b_bodies edge_16b_body_8, 1 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_32_neon_8( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_32_neon_8, export=1 -+ edge_64b_init 8, 0, 0, 99f -+99: -+ edge_32bx2_bodies edge_64b_body_8, 1 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_64_neon_8( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ 
int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_64_neon_8, export=1 -+ edge_64b_init 8, 0, 0, 99f -+99: -+ edge_64b_bodies edge_64b_body_8, 1 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_8_neon_8( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1 -+ edge_16b_init 8, 1, 1, 99f -+99: -+ edge_16b_8bx2_bodies edge_16b_body_8, 2 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_16_neon_8( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1 -+ edge_64b_init 8, 1, 0, 99f -+99: -+ edge_32bx2_bodies edge_64b_body_8, 2 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_32_neon_8( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1 -+ edge_64b_init 8, 1, 0, 99f -+99: -+ edge_64b_bodies edge_64b_body_8, 2 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_8_neon_10( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_8_neon_10, export=1 -+ edge_16b_init 10, 0, 1, 99f -+99: -+ edge_16b_8bx2_bodies edge_16b_body_16, 2 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_16_neon_10( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int 
width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_16_neon_10, export=1 -+ edge_64b_init 10, 0, 0, 99f -+99: -+ edge_32bx2_bodies edge_64b_body_16, 2 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_64_neon_10( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+@ We simply split the 32 case into 2 vertical stripes -+@ and call the fns for w32 -+@ -+@ Calling code will always have src != dst so we don't have to worry -+@ about edge effects -+ -+function ff_hevc_rpi_sao_edge_64_neon_10, export=1 -+ edge_64b_init 10, 0, 1, 99f, xjump=1 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_32_neon_10( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_32_neon_10, export=1 -+ edge_64b_init 10, 0, 0, 99f -+99: -+ edge_64b_bodies edge_64b_body_16, 2 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_8_neon_10( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1 -+ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 -+99: -+ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_32_neon_10( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1 -+ edge_64b_init 10, 1, 1, 99f, xjump=1 -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_edge_c_16_neon_10( -+@ 
uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1 -+ edge_64b_init 10, 1, 0, 99f -+99: -+ edge_64b_bodies edge_64b_body_16, 4 -+endfunc -+ -diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h -new file mode 100644 -index 0000000000..36a23a5bf9 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_arm.h -@@ -0,0 +1,28 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_ARM_HEVCPRED_ARM_H -+#define AVCODEC_ARM_HEVCPRED_ARM_H -+ -+#include "libavcodec/rpi_hevcpred.h" -+ -+void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth); -+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth); -+ -+#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */ -+ -diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c -new file mode 100644 -index 0000000000..80724d4cf3 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_init_arm.c -@@ -0,0 +1,35 @@ -+/* -+ * Copyright (c) 2018 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/attributes.h" -+#include "libavutil/cpu.h" -+#include "libavutil/arm/cpu.h" -+ -+#include "libavcodec/rpi_hevcpred.h" -+#include "rpi_hevcpred_arm.h" -+ -+av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth) -+{ -+ int cpu_flags = av_get_cpu_flags(); -+ -+ if (have_neon(cpu_flags)) -+ ff_hevc_rpi_pred_init_neon(c, bit_depth); -+} -+ -diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c -new file mode 100644 -index 0000000000..21e7700174 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_init_neon.c -@@ -0,0 +1,210 @@ -+/* -+ * Copyright (c) 2018 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "rpi_hevcpred_arm.h" -+ -+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32; -+ -+void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void 
ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+ -+void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const 
uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+ -+void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void 
ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+ -+void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+ -+void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void 
ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+ -+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth) -+{ -+ switch (bit_depth) -+ { -+ case 8: -+ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8; -+ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8; -+ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8 -+ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16; -+ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16; -+ -+ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8; -+ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8; -+ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8; -+ 
c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8; -+ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8; -+ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8; -+ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8; -+ -+ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8; -+ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8; -+ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8; -+ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8; -+ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8; -+ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8; -+ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8; -+ -+ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8; -+ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8; -+ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8; -+ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8; -+ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8; -+ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8; -+ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8; -+ -+ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8; -+ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8; -+ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8; -+ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8; -+ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8; -+ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8; -+ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8; -+ -+ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8; -+ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8; -+ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8; -+ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8; -+ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8; -+ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8; -+ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8; -+ break; -+ case 10: -+ c->intra_filter[0] = 
ff_hevc_rpi_intra_filter_4_neon_16; -+ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16; -+ c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16; -+ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32; -+ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32; -+ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32; -+ -+ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10; -+ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10; -+ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10; -+ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10; -+ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10; -+ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10; -+ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10; -+ -+ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10; -+ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10; -+ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10; -+ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10; -+ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10; -+ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10; -+ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10; -+ -+ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10; -+ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10; -+ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10; -+ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10; -+ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10; -+ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10; -+ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10; -+ -+ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10; -+ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10; -+ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10; -+ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10; -+ c->pred_planar_c[0] = 
ff_hevc_rpi_pred_planar_c_4_neon_10; -+ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10; -+ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10; -+ -+ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10; -+ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10; -+ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10; -+ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10; -+ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10; -+ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10; -+ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10; -+ break; -+ default: -+ break; -+ } -+} -+ -diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S -new file mode 100644 -index 0000000000..fa8f67cf03 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S -@@ -0,0 +1,2984 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox, Ben Avison -+*/ -+ -+/* -+ * General angular pred -+ * -+ * Horizontal (10) & Vertical (26) cases have their own file -+ * and are not dealt with properly here (luma filtering is missing) -+ * -+ * The inv_angle calculations are annoying - if it wasn't for the +128 -+ * rounding step then the result would simply be the loop counter :-( -+ */ -+ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+.text -+ -+@ Horizontal Patch functions -+@ These need a transpose before store so exist as smaller patches -+@ Patches can be called repeatedly without any intermediate setup -+@ to generate a horizontal block -+@ -+@ It is almost certainly the case that larger patch fns can be built -+@ and they would be a little faster, but we would still need the small -+@ fns and code size (or at least instruction cache size) is an issue -+@ given how much code we already have here -+ -+@ Generate 8x8 luma 8 patch -+@ -+@ r3 Out stride -+@ r4 Angle add -+@ r7 Inv angle (_up only) -+@ -+@ In/Out (updated) -+@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. 
r0 + patch width) -+@ r2 Left ptr - updated -+@ r10 Inv angle accumulator (_up only) -+@ r12 32 - angle frac (_down) or angle frac (_up) -+@ d0 Older reference samples -+@ d1=r8+r9 Newer reference samples -+@ d2 32 - angle frac -+@ d3 Angle frac -+@ q2 Partially computed next result (_up only) -+@ -+@ Temps -+@ r5 Loop counter -+@ r6 -+@ r7 (_down only) -+@ r11 (_up only) -+@ q2, q8-q11 -+ -+patch_h_down_8x8_8: -+ ldrd r8, r9, [r2] @ Left -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r6 -+ lsr r8, #8 -+ vdup.8 d2, r12 -+ orr r8, r8, r9, lsl #24 -+ ldr r9, [r2, #5]! -+ vmov d1, r8, r9 -+ // drop through... -+patch_h_down_8x8_8_continue: -+ mov r5, #8 -+1: -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ vext.8 q8, q8, q9, #8 -+ itt mi -+ lsrmi r7, r8, #8 -+ vmovmi d0, r8, r9 -+ vdup.8 d2, r12 -+ vext.8 q9, q9, q10, #8 -+ it mi -+ orrmi r8, r7, r9, lsl #24 -+ vext.8 q10, q10, q11, #8 -+ it mi -+ ldrmi r9, [r2, #1]! -+ vmov d22, d23 -+ vrshrn.u16 d23, q2, #5 -+ it mi -+ vmovmi d1, r8, r9 -+ subs r5, #1 -+ vdup.8 d3, r6 -+ bne 1b -+ // drop through... -+store_tran_8x8_8: -+ vzip.8 d16, d17 -+ add r6, r0, r3 -+ vzip.8 d18, d19 -+ lsl r3, #1 -+ vzip.8 d20, d21 -+ add r5, r0, r3 -+ vzip.8 d22, d23 -+ vzip.16 q8, q9 -+ vzip.16 q10, q11 -+ vzip.32 q8, q10 -+ vzip.32 q9, q11 -+ vst1.8 {d16}, [r0]! -+ vst1.8 {d17}, [r6], r3 -+ vst1.8 {d20}, [r5], r3 -+ vst1.8 {d21}, [r6], r3 -+ vst1.8 {d18}, [r5], r3 -+ vst1.8 {d19}, [r6], r3 -+ vst1.8 {d22}, [r5] -+ asr r3, #1 -+ vst1.8 {d23}, [r6] -+ -+ bx lr -+ -+patch_h_up_8x8_8: -+ ldrd r8, r9, [r2] -+ rsb r6, r4, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r4 -+ lsr r11, r8, #24 -+ vdup.8 d2, r6 -+ ldr r8, [r2, #-1]! 
-+ orr r9, r11, r9, lsl #8 -+ vmov d1, r8, r9 -+ mov r12, r4 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+patch_h_up_8x8_8_continue: -+ mov r5, #8 -+1: -+ add r12, r4 -+ mov r11, #0 -+ cmp r12, #33 -+ it cs -+ addcs r10, r7 -+ vext.8 q8, q8, q9, #8 -+ itt cs -+ subcs r12, #32 -+ tstcs r10, #1<<31 -+ rsb r6, r12, #32 -+ it eq -+ asreq r11, r10, #8 -+ it cs -+ vmovcs d0, r8, r9 -+ vdup.8 d2, r6 -+ it cs -+ lsrcs r6, r8, #24 -+ vext.8 q9, q9, q10, #8 -+ itt cs -+ orrcs r9, r6, r9, lsl #8 -+ ldrbcs r11, [r1, r11] -+ vdup.8 d3, r12 -+ vext.8 q10, q10, q11, #8 -+ it hi -+ ldrbhi r11, [r2, #-1]! -+ vmov d22, d23 -+ vrshrn.u16 d23, q2, #5 -+ itt cs -+ orrcs r8, r11, r8, lsl #8 -+ vmovcs d1, r8, r9 -+ vmull.u8 q2, d0, d2 -+ subs r5, #1 -+ vmlal.u8 q2, d1, d3 -+ bne 1b -+ -+ b store_tran_8x8_8 -+ -+ -+.macro ADRT reg, val -+@ adr in T32 has enough range but not in A32 -+A adrl \reg, \val -+T adr \reg, \val -+.endm -+ -+@ ff_hevc_rpi_pred_angular_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_4_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r8, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ ldr lr, [r2], #1 @ Top -+ rsb r12, r6, #32 -+ vmov s0, lr -+ vdup.8 d3, r6 -+ ldr lr, [r2], #1 -+ vdup.8 d2, r12 -+ vmov s2, lr -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi s0, lr -+ ldrmi lr, [r2], #1 -+ vdup.8 d2, r12 -+ it mi -+ vmovmi s2, lr -+ vdup.8 d3, r6 -+ mov r5, #2 -+1: -+ vrshrn.u16 d20, q2, #5 -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ vext.64 q8, q8, q9, #1 -+ it mi -+ 
vmovmi s0, lr -+ vext.64 q9, q9, q10, #1 -+ it mi -+ ldrmi lr, [r2], #1 -+ vdup.8 d2, r12 -+ it mi -+ vmovmi s2, lr -+ subs r5, #1 -+ vdup.8 d3, r6 -+ bne 1b -+ -+ vrshrn.u16 d20, q2, #5 -+ vmull.u8 q2, d0, d2 -+ add r12, r0, r3 -+ vmlal.u8 q2, d1, d3 -+ lsl r3, #1 -+ vext.64 q8, q8, q9, #1 -+ vext.64 q9, q9, q10, #1 -+ vrshrn.u16 d20, q2, #5 -+ -+98: -+ vst4.8 {d17[0], d18[0], d19[0], d20[0]}, [r0], r3 -+ vst4.8 {d17[1], d18[1], d19[1], d20[1]}, [r12], r3 -+ vst4.8 {d17[2], d18[2], d19[2], d20[2]}, [r0] -+ vst4.8 {d17[3], d18[3], d19[3], d20[3]}, [r12] -+ pop {r4-r8, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ rsb r12, r6, #32 -+ ldr lr, [r2] @ Left -+ ldrb r2, [r2, #-1] @ Top-left -+ vmov s0, lr -+ vdup.8 d2, r12 -+ vdup.8 d3, r6 -+ orr lr, r2, lr, lsl #8 -+ vmov s2, lr -+ sub r8, r7, #128 -+ mov r5, #3 -+2: -+ vmull.u8 q2, d0, d2 -+ subs r12, r4 -+ vmlal.u8 q2, d1, d3 -+T it mi -+ addmi r12, #32 -+T asr r6, r8, #8 -+T it mi -+T ldrbmi r2, [r1, r6] -+A ldrbmi r2, [r1, r8, asr #8] -+ rsb r6, r12, #32 -+ vdup.8 d2, r12 -+ ittt mi -+ vmovmi s0, lr -+ orrmi lr, r2, lr, lsl #8 -+ vmovmi s2, lr -+ vrshrn.u16 d20, q2, #5 -+ vdup.8 d3, r6 -+ it mi -+ addmi r8, r7 -+ subs r5, #1 -+ vext.64 q8, q8, q9, #1 -+ vext.64 q9, q9, q10, #1 -+ bne 2b -+ -+ vmull.u8 q2, d0, d2 -+ add r12, r0, r3 -+ vmlal.u8 q2, d1, d3 -+ lsl r3, #1 -+ vrshrn.u16 d20, q2, #5 -+ b 98b -+ -+@ Left of vertical - works down left -+18: -+ ldrh r7, [r7] -+ rsb r12, r6, #32 -+ ldr lr, [r1] @ Top -+ ldrb r1, [r2, #-1] @ Top-left -+ vmov s0, lr -+ vdup.8 d2, r12 -+ vdup.8 d3, r6 -+ orr lr, r1, lr, lsl #8 -+ vmov s2, lr -+ sub r8, r7, #128 -+ mov r5, #3 -+2: -+ vmull.u8 q2, d0, d2 -+ subs r12, r4 -+ vmlal.u8 q2, d1, d3 -+T it mi -+ addmi r12, #32 -+T asr r6, r8, #8 -+T it mi -+T ldrbmi r1, [r2, r6] -+A ldrbmi r1, [r2, r8, asr #8] -+ rsb r6, r12, #32 -+ vdup.8 d2, r12 -+ ittt mi -+ vmovmi s0, lr -+ orrmi lr, r1, lr, lsl #8 -+ vmovmi s2, lr -+ vrshrn.u16 d4, q2, #5 -+ vdup.8 d3, r6 -+ 
it mi -+ addmi r8, r7 -+ subs r5, #1 -+ vst1.32 {d4[0]}, [r0], r3 -+ bne 2b -+ -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d4, q2, #5 -+ vst1.32 {d4[0]}, [r0] -+ -+ pop {r4-r8, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ ldr lr, [r1], #1 @ Top -+ rsb r12, r6, #32 -+ vmov s0, lr -+ vdup.8 d3, r6 -+ ldr lr, [r1], #1 -+ vdup.8 d2, r12 -+ vmov s2, lr -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi s0, lr -+ ldrmi lr, [r1], #1 -+ vdup.8 d2, r12 -+ it mi -+ vmovmi s2, lr -+ vdup.8 d3, r6 -+ mov r5, #2 -+1: -+ vrshrn.u16 d6, q2, #5 -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ vst1.32 {d6[0]}, [r0], r3 -+ itt mi -+ vmovmi s0, lr -+ ldrmi lr, [r1], #1 -+ vdup.8 d2, r12 -+ it mi -+ vmovmi s2, lr -+ subs r5, #1 -+ vdup.8 d3, r6 -+ bne 1b -+ -+ vrshrn.u16 d6, q2, #5 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vst1.32 {d6[0]}, [r0], r3 -+ vrshrn.u16 d6, q2, #5 -+ vst1.32 {d6[0]}, [r0] -+ -+ pop {r4-r8, pc} -+ -+endfunc -+ -+ -+ -+@ ff_hevc_rpi_pred_angular_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_8_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ bl patch_h_down_8x8_8 -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ bl patch_h_up_8x8_8 -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ ldrb lr, [r2, #-1] @ Top-left -+ ldrh r7, [r7] -+ vmov d0, r8, r9 -+ lsl 
r9, r9, #8 -+ vdup.8 d2, r12 -+ orr r9, r9, r8, lsr #24 -+ orr r8, lr, r8, lsl #8 -+ vmov d1, r8, r9 -+ sub r1, r7, #128 -+ mov r5, #7 -+1: -+ vdup.8 d3, r6 -+ vmull.u8 q2, d0, d2 -+ subs r12, r12, r4 -+ vmlal.u8 q2, d1, d3 -+ ittt mi -+ addmi lr, r2, r1, asr #8 -+ addmi r12, r12, #32 -+ vmovmi d0, r8, r9 -+ rsb r6, r12, #32 -+ itt mi -+ lslmi r9, r9, #8 -+ ldrbmi lr, [lr] -+ vdup.8 d2, r12 -+ vrshrn.u16 d4, q2, #5 -+ itttt mi -+ orrmi r9, r9, r8, lsr #24 -+ orrmi r8, lr, r8, lsl #8 -+ vmovmi d1, r8, r9 -+ addmi r1, r1, r7 -+ subs r5, r5, #1 -+ vst1.8 {d4}, [r0], r3 -+ bne 1b -+ -+ vdup.8 d3, r6 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d4, q2, #5 -+ vst1.8 {d4}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r6 -+ mov r5, #7 -+ lsr r8, #8 -+ vdup.8 d2, r12 -+ orr r8, r8, r9, lsl #24 -+ ldr r9, [r1, #5]! -+ vmov d1, r8, r9 -+1: -+ vmull.u8 q2, d0, d2 -+ subs r12, r4 -+ vmlal.u8 q2, d1, d3 -+ it mi -+ addmi r12, #32 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi d0, r8, r9 -+ lsrmi r8, #8 -+ vdup.8 d2, r12 -+ itt mi -+ orrmi r8, r8, r9, lsl #24 -+ ldrmi r9, [r1, #1]! 
-+ vrshrn.u16 d6, q2, #5 -+ it mi -+ vmovmi d1, r8, r9 -+ vdup.8 d3, r6 -+ subs r5, #1 -+ vst1.8 {d6}, [r0], r3 -+ bne 1b -+ -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d6, q2, #5 -+ vst1.8 {d6}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_16_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r1, r2 @ save r2 - r1 unused by patch_down -+ -+ bl patch_h_down_8x8_8 -+ bl patch_h_down_8x8_8_continue -+ -+ add r2, r1, #8 @ restore r2, but 8 rows further down left -+ sub r0, #16 -+ mov r6, r4 -+ add r0, r0, r3, lsl #3 -+ -+ bl patch_h_down_8x8_8 -+ bl patch_h_down_8x8_8_continue -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ -+ push {r2} -+ bl patch_h_up_8x8_8 -+ bl patch_h_up_8x8_8_continue -+ pop {r2} -+ -+ sub r0, #16 -+ mov r10, #-128 -+ add r2, #8 -+ add r0, r0, r3, lsl #3 -+ sub r10, r10, r7, lsl #3 -+ -+ bl patch_h_up_8x8_8 -+ bl patch_h_up_8x8_8_continue -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.8 {q9}, [r1] -+ sub r1, r2, #1 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ vdup.8 d6, r6 -+ vext.8 q8, q9, q9, #15 -+ sub r8, r7, #128 -+ vld1.8 {d16[0]}, [r1] -+ vdup.8 d7, r12 -+ mov r5, #15 -+1: -+ vmull.u8 q0, d18, d7 -+ subs r12, r4 -+ vmlal.u8 q0, d16, d6 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d19, d7 -+ it cc -+ addcc r1, r2, r8, asr #8 -+ vmlal.u8 q1, d17, d6 -+ rsb r6, r12, #32 -+ vext.8 q10, q8, q8, #15 -+ sub r5, #1 -+ vld1.8 {d20[0]}, [r1] -+ it cc -+ addcc r8, r7 -+ 
vmov q11, q8 -+ teq r5, #0 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmull.u8 q0, d22, d7 -+ subs r12, r4 -+ vmlal.u8 q0, d20, d6 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d23, d7 -+ it cc -+ addcc r1, r2, r8, asr #8 -+ vmlal.u8 q1, d21, d6 -+ rsb r6, r12, #32 -+ vext.8 q8, q10, q10, #15 -+ sub r5, #1 -+ vld1.8 {d16[0]}, [r1] -+ it cc -+ addcc r8, r7 -+ vmov q9, q10 -+ teq r5, #0 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmull.u8 q0, d22, d7 -+ vmlal.u8 q0, d20, d6 -+ vmull.u8 q1, d23, d7 -+ vmlal.u8 q1, d21, d6 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmull.u8 q0, d18, d7 -+ vmlal.u8 q0, d16, d6 -+ vmull.u8 q1, d19, d7 -+ vmlal.u8 q1, d17, d6 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ vld1.8 {q9}, [r1]! -+ rsb r12, r6, #32 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vext.8 q8, q9, q9, #1 -+ vld1.8 {d17[7]}, [r1]! 
-+ mov r5, #15 -+1: -+ vmull.u8 q0, d16, d6 -+ subs r12, r4 -+ vmlal.u8 q0, d18, d7 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d17, d6 -+ rsb r6, r12, #32 -+ vmlal.u8 q1, d19, d7 -+ sub r5, #1 -+ vext.8 q10, q8, q8, #1 -+ teq r5, #0 -+ vld1.8 {d21[7]}, [r1] -+ it cc -+ addcc r1, #1 -+ vmov q11, q8 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmull.u8 q0, d20, d6 -+ subs r12, r4 -+ vmlal.u8 q0, d22, d7 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d21, d6 -+ rsb r6, r12, #32 -+ vmlal.u8 q1, d23, d7 -+ sub r5, #1 -+ vext.8 q8, q10, q10, #1 -+ teq r5, #0 -+ vld1.8 {d17[7]}, [r1] -+ it cc -+ addcc r1, #1 -+ vmov q9, q10 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmull.u8 q0, d20, d6 -+ vmlal.u8 q0, d22, d7 -+ vmull.u8 q1, d21, d6 -+ vmlal.u8 q1, d23, d7 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmull.u8 q0, d16, d6 -+ vmlal.u8 q0, d18, d7 -+ vmull.u8 q1, d17, d6 -+ vmlal.u8 q1, d19, d7 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_32_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_32_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r10, #4 -+ mov r1, r2 -+1: -+ bl patch_h_down_8x8_8 -+ bl patch_h_down_8x8_8_continue -+ bl patch_h_down_8x8_8_continue -+ bl patch_h_down_8x8_8_continue -+ -+ add r2, r1, #8 @ 
restore r2, but 8 rows further down left -+ add r1, r1, #8 -+ mov r6, r4 -+ sub r0, #32 -+ subs r10, #1 -+ add r0, r0, r3, lsl #3 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ vmov.i8 d6, #1<<2 -+1: -+ push {r2,r10} -+ bl patch_h_up_8x8_8 -+ bl patch_h_up_8x8_8_continue -+ bl patch_h_up_8x8_8_continue -+ bl patch_h_up_8x8_8_continue -+ pop {r2,r10} -+ -+ vmov r8, s12 -+ sub r0, #32 -+ add r2, #8 -+ add r0, r0, r3, lsl #3 -+ sub r10, r10, r7, lsl #3 -+ vshr.u8 d6, #1 -+ teq r8, #0 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.8 {q0-q1}, [r1] -+ sub r9, r2, #1 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ mov r5, #32 -+1: -+ vld1.8 {d17[7]}, [r9] -+ add r8, r7 -+ vmov q2, q0 -+ vmov q3, q1 -+ add r9, r2, r8, asr #8 -+ vext.8 q1, q0, q1, #15 -+ vext.8 q0, q8, q0, #15 -+2: -+ vmull.u8 q10, d4, d19 -+ subs r12, r4 -+ vmlal.u8 q10, d0, d18 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q11, d5, d19 -+ rsb r6, r12, #32 -+ vmlal.u8 q11, d1, d18 -+ sub r5, #1 -+ vmull.u8 q12, d6, d19 -+ teq r5, #0 -+ vmlal.u8 q12, d2, d18 -+ vmull.u8 q13, d7, d19 -+ vmlal.u8 q13, d3, d18 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vrshrn.u16 d22, q12, #5 -+ vrshrn.u16 d23, q13, #5 -+ vst1.8 {q10-q11}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.8 {q0-q1}, [r1]! 
-+ rsb r12, r6, #32 -+ vld1.8 {d16[0]}, [r5] -+ mov r5, #32 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+1: -+ vmov q2, q0 -+ add r1, #1 -+ vmov q3, q1 -+ vext.8 q0, q0, q1, #1 -+ vext.8 q1, q1, q8, #1 -+2: -+ vmull.u8 q10, d0, d18 -+ subs r12, r4 -+ vmlal.u8 q10, d4, d19 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q11, d1, d18 -+ rsb r6, r12, #32 -+ vmlal.u8 q11, d5, d19 -+ sub r5, #1 -+ vmull.u8 q12, d2, d18 -+ teq r5, #0 -+ vmlal.u8 q12, d6, d19 -+ vmull.u8 q13, d3, d18 -+ vmlal.u8 q13, d7, d19 -+ vld1.8 {d16[0]}, [r1] -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vrshrn.u16 d22, q12, #5 -+ vrshrn.u16 d23, q13, #5 -+ vst1.8 {q10-q11}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ Chroma 8 bit 4x4 patch fns -+ .text -+ -+patch_h_down_c_4x4_8: -+ ldrd r8, r9, [r2] @ Left -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r6 -+ lsr r8, #16 -+ vdup.8 d2, r12 -+ orr r8, r8, r9, lsl #16 -+ ldr r9, [r2, #6]! -+ vmov d1, r8, r9 -+ // drop through... -+patch_h_down_c_4x4_8_continue: -+ mov r5, #4 -+1: -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ vext.8 q8, q8, q9, #8 -+ it mi -+ lsrmi r7, r8, #16 -+ vmov d18, d19 -+ it mi -+ vmovmi d0, r8, r9 -+ vdup.8 d2, r12 -+ it mi -+ orrmi r8, r7, r9, lsl #16 -+ vrshrn.u16 d19, q2, #5 -+ itt mi -+ ldrmi r9, [r2, #2]! -+ vmovmi d1, r8, r9 -+ subs r5, #1 -+ vdup.8 d3, r6 -+ bne 1b -+ // drop through... -+store_tran_c_4x4_8: -+ vzip.16 d16, d17 -+ add r6, r0, r3 -+ vzip.16 d18, d19 -+ lsl r3, #1 -+ vzip.32 q8, q9 -+ add r5, r0, r3 -+ vst1.16 {d16}, [r0]! -+ vst1.16 {d17}, [r6], r3 -+ vst1.16 {d18}, [r5] -+ asr r3, #1 -+ vst1.16 {d19}, [r6] -+ -+ bx lr -+ -+patch_h_up_c_4x4_8: -+ ldrd r8, r9, [r2] -+ rsb r6, r4, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r4 -+ lsr r11, r8, #16 -+ vdup.8 d2, r6 -+ ldr r8, [r2, #-2]! 
-+ orr r9, r11, r9, lsl #16 -+ vmov d1, r8, r9 -+ mov r12, r4 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+patch_h_up_c_4x4_8_continue: -+ mov r5, #4 -+1: -+ add r12, r4 -+ cmp r12, #33 -+ it cs -+ addcs r10, r7 -+ mov r11, #0 -+ itt cs -+ subcs r12, #32 -+ tstcs r10, #1<<31 -+ rsb r6, r12, #32 -+ it eq -+ asreq r11, r10, #7 -+ it cs -+ vmovcs d0, r8, r9 -+ it eq -+ biceq r11, #1 -+ vdup.8 d2, r6 -+ it cs -+ lsrcs r6, r8, #16 -+ vdup.8 d3, r12 -+ vext.8 q8, q8, q9, #8 -+ itt cs -+ orrcs r9, r6, r9, lsl #16 -+ ldrhcs r11, [r1, r11] -+ vmov d18, d19 -+ it hi -+ ldrhhi r11, [r2, #-2]! -+ vrshrn.u16 d19, q2, #5 -+ itt cs -+ orrcs r8, r11, r8, lsl #16 -+ vmovcs d1, r8, r9 -+ vmull.u8 q2, d0, d2 -+ subs r5, #1 -+ vmlal.u8 q2, d1, d3 -+ bne 1b -+ -+ b store_tran_c_4x4_8 -+ -+ -+@ ff_hevc_rpi_pred_angular_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ bl patch_h_down_c_4x4_8 -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ bl patch_h_up_c_4x4_8 -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ ldrh lr, [r2, #-2] @ Top-left -+ ldrh r7, [r7] -+ vmov d0, r8, r9 -+ lsl r9, r9, #16 -+ vdup.8 d2, r12 -+ orr r9, r9, r8, lsr #16 -+ orr r8, lr, r8, lsl #16 -+ vmov d1, r8, r9 -+ sub r1, r7, #128 -+ mov r5, #3 -+1: -+ vdup.8 d3, r6 -+ vmull.u8 q2, d0, d2 -+ subs r12, r12, r4 -+ vmlal.u8 q2, d1, d3 -+ itttt mi -+ addmi lr, r2, r1, asr #7 -+ bicmi lr, #1 -+ addmi r12, r12, #32 -+ vmovmi d0, 
r8, r9 -+ rsb r6, r12, #32 -+ itt mi -+ lslmi r9, r9, #16 -+ ldrhmi lr, [lr] -+ vdup.8 d2, r12 -+ vrshrn.u16 d4, q2, #5 -+ itttt mi -+ orrmi r9, r9, r8, lsr #16 -+ orrmi r8, lr, r8, lsl #16 -+ vmovmi d1, r8, r9 -+ addmi r1, r1, r7 -+ subs r5, r5, #1 -+ vst1.16 {d4}, [r0], r3 -+ bne 1b -+ -+ vdup.8 d3, r6 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d4, q2, #5 -+ vst1.16 {d4}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r6 -+ mov r5, #3 -+ lsr r8, #16 -+ vdup.8 d2, r12 -+ orr r8, r8, r9, lsl #16 -+ ldr r9, [r1, #6]! -+ vmov d1, r8, r9 -+1: -+ vmull.u8 q2, d0, d2 -+ subs r12, r4 -+ vmlal.u8 q2, d1, d3 -+ it mi -+ addmi r12, #32 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi d0, r8, r9 -+ lsrmi r8, #16 -+ vdup.8 d2, r12 -+ itt mi -+ orrmi r8, r8, r9, lsl #16 -+ ldrmi r9, [r1, #2]! -+ vrshrn.u16 d6, q2, #5 -+ it mi -+ vmovmi d1, r8, r9 -+ vdup.8 d3, r6 -+ subs r5, #1 -+ vst1.16 {d6}, [r0], r3 -+ bne 1b -+ -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d6, q2, #5 -+ vst1.16 {d6}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r1, r2 @ save r2 - r1 unused by patch_down -+ -+ bl patch_h_down_c_4x4_8 -+ bl patch_h_down_c_4x4_8_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ sub r0, #16 -+ mov r6, r4 -+ add r0, r0, r3, lsl #2 -+ -+ bl 
patch_h_down_c_4x4_8 -+ bl patch_h_down_c_4x4_8_continue -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ -+ push {r2} -+ bl patch_h_up_c_4x4_8 -+ bl patch_h_up_c_4x4_8_continue -+ pop {r2} -+ -+ sub r0, #16 -+ mov r10, #-128 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ -+ bl patch_h_up_c_4x4_8 -+ bl patch_h_up_c_4x4_8_continue -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.8 {q9}, [r1] -+ sub r1, r2, #2 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ vdup.8 d6, r6 -+ vext.8 q8, q9, q9, #14 -+ sub r8, r7, #128 -+ vld1.16 {d16[0]}, [r1] -+ vdup.8 d7, r12 -+ mov r5, #7 -+1: -+ subs r12, r4 -+ vmull.u8 q0, d18, d7 -+ it cc -+ asrcc r1, r8, #8 -+ vmlal.u8 q0, d16, d6 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d19, d7 -+ it cc -+ addcc r1, r2, r1, lsl #1 -+ vmlal.u8 q1, d17, d6 -+ rsb r6, r12, #32 -+ vext.8 q10, q8, q8, #14 -+ sub r5, #1 -+ vld1.16 {d20[0]}, [r1] -+ it cc -+ addcc r8, r7 -+ vmov q11, q8 -+ teq r5, #0 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ subs r12, r4 -+ vmull.u8 q0, d22, d7 -+ it cc -+ asrcc r1, r8, #8 -+ vmlal.u8 q0, d20, d6 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d23, d7 -+ it cc -+ addcc r1, r2, r1, lsl #1 -+ vmlal.u8 q1, d21, d6 -+ rsb r6, r12, #32 -+ vext.8 q8, q10, q10, #14 -+ sub r5, #1 -+ vld1.16 {d16[0]}, [r1] -+ it cc -+ addcc r8, r7 -+ vmov q9, q10 -+ teq r5, #0 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmull.u8 q0, d22, d7 -+ vmlal.u8 q0, d20, d6 -+ vmull.u8 q1, d23, d7 -+ vmlal.u8 q1, d21, d6 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmull.u8 q0, d18, d7 -+ vmlal.u8 q0, d16, d6 -+ vmull.u8 q1, d19, d7 -+ vmlal.u8 q1, d17, d6 -+ vrshrn.u16 d0, q0, #5 -+ 
vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ vld1.8 {q9}, [r1]! -+ rsb r12, r6, #32 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vext.8 q8, q9, q9, #2 -+ vld1.16 {d17[3]}, [r1]! -+ mov r5, #7 -+1: -+ vmull.u8 q0, d16, d6 -+ subs r12, r4 -+ vmlal.u8 q0, d18, d7 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d17, d6 -+ rsb r6, r12, #32 -+ vmlal.u8 q1, d19, d7 -+ sub r5, #1 -+ vext.8 q10, q8, q8, #2 -+ teq r5, #0 -+ vld1.16 {d21[3]}, [r1] -+ it cc -+ addcc r1, #2 -+ vmov q11, q8 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmull.u8 q0, d20, d6 -+ subs r12, r4 -+ vmlal.u8 q0, d22, d7 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d21, d6 -+ rsb r6, r12, #32 -+ vmlal.u8 q1, d23, d7 -+ sub r5, #1 -+ vext.8 q8, q10, q10, #2 -+ teq r5, #0 -+ vld1.16 {d17[3]}, [r1] -+ it cc -+ addcc r1, #2 -+ vmov q9, q10 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmull.u8 q0, d20, d6 -+ vmlal.u8 q0, d22, d7 -+ vmull.u8 q1, d21, d6 -+ vmlal.u8 q1, d23, d7 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmull.u8 q0, d16, d6 -+ vmlal.u8 q0, d18, d7 -+ vmull.u8 q1, d17, d6 -+ vmlal.u8 q1, d19, d7 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp 
r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r10, #4 -+ mov r1, r2 -+1: -+ bl patch_h_down_c_4x4_8 -+ bl patch_h_down_c_4x4_8_continue -+ bl patch_h_down_c_4x4_8_continue -+ bl patch_h_down_c_4x4_8_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ add r1, r1, #4*2 -+ mov r6, r4 -+ sub r0, #32 -+ subs r10, #1 -+ add r0, r0, r3, lsl #2 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ vmov.i8 d6, #1<<2 -+1: -+ push {r2, r10} -+ bl patch_h_up_c_4x4_8 -+ bl patch_h_up_c_4x4_8_continue -+ bl patch_h_up_c_4x4_8_continue -+ bl patch_h_up_c_4x4_8_continue -+ pop {r2, r10} -+ -+ vmov r8, s12 -+ sub r0, #32 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ vshr.u8 d6, #1 -+ teq r8, #0 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.8 {q0-q1}, [r1] -+ sub r9, r2, #2 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ mov r5, #16 -+1: -+ vld1.16 {d17[3]}, [r9] -+ add r8, r7 -+ vmov q2, q0 -+ vmov q3, q1 -+ asr r9, r8, #8 -+ vext.8 q1, q0, q1, #14 -+ add r9, r2, r9, lsl #1 -+ vext.8 q0, q8, q0, #14 -+2: -+ vmull.u8 q10, d4, d19 -+ subs r12, r4 -+ vmlal.u8 q10, d0, d18 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q11, d5, d19 -+ rsb r6, r12, #32 -+ vmlal.u8 q11, d1, d18 -+ sub r5, #1 -+ vmull.u8 q12, d6, d19 -+ teq r5, #0 -+ vmlal.u8 q12, d2, d18 -+ vmull.u8 q13, d7, d19 -+ vmlal.u8 q13, d3, d18 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vrshrn.u16 d22, q12, #5 -+ vrshrn.u16 d23, q13, #5 -+ vst1.8 {q10-q11}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.8 {q0-q1}, [r1]! 
-+ rsb r12, r6, #32 -+ vld1.16 {d16[0]}, [r5] -+ mov r5, #16 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+1: -+ vmov q2, q0 -+ add r1, #2 -+ vmov q3, q1 -+ vext.8 q0, q0, q1, #2 -+ vext.8 q1, q1, q8, #2 -+2: -+ vmull.u8 q10, d0, d18 -+ subs r12, r4 -+ vmlal.u8 q10, d4, d19 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q11, d1, d18 -+ rsb r6, r12, #32 -+ vmlal.u8 q11, d5, d19 -+ sub r5, #1 -+ vmull.u8 q12, d2, d18 -+ teq r5, #0 -+ vmlal.u8 q12, d6, d19 -+ vmull.u8 q13, d3, d18 -+ vmlal.u8 q13, d7, d19 -+ vld1.16 {d16[0]}, [r1] -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vrshrn.u16 d22, q12, #5 -+ vrshrn.u16 d23, q13, #5 -+ vst1.8 {q10-q11}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+@------------------------------------------------------------------------------ -+@ Data -+ -+ .text -+ .balign 64 -+angle_2: -+ .byte 32 -+ .byte 26, 21, 17, 13, 9, 5, 2, 0 -+ @ Sign inverted from standards table -+ .byte 2, 5, 9, 13, 17, 21, 26, 32 -+ .byte 26, 21, 17, 13, 9, 5, 2, 0 -+ @ Standard sign -+ .byte 2, 5, 9, 13, 17, 21, 26, 32 -+ -+ .balign 2 -+ -+ @ Sign inverted from standards table -+inv_angle: -+ .short 4096, 1638, 910, 630, 482, 390, 315 -+ .short 256 -+ .short 315, 390, 482, 630, 910, 1638, 4096 -+ -+@------------------------------------------------------------------------------ -+@ -+@ 10 bit fns -+@ Should work for 9 & 11 bit as there is no actual bit-depth specific code -+@ but runs out of register width for 12+ bit -+ -+ .text -+ .balign 64 -+ -+patch_h_down_4x4_10: -+ ldrd r8, r9, [r2] @ Left -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.16 d3, r6 -+ lsr r8, #16 -+ vdup.16 d2, r12 -+ orr r8, r8, r9, lsl #16 -+ ldr r9, [r2, #6]! -+ vmov d1, r8, r9 -+ // drop through... 
-+patch_h_down_4x4_10_continue: -+ mov r5, #4 -+1: -+ subs r12, r4 -+ vmul.u16 d4, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmla.u16 d4, d1, d3 -+ rsb r6, r12, #32 -+ vext.16 q8, q8, q9, #4 -+ it mi -+ lsrmi r7, r8, #16 -+ vmov d18, d19 -+ it mi -+ vmovmi d0, r8, r9 -+ vdup.16 d2, r12 -+ it mi -+ orrmi r8, r7, r9, lsl #16 -+ vrshr.u16 d19, d4, #5 -+ itt mi -+ ldrmi r9, [r2, #2]! -+ vmovmi d1, r8, r9 -+ subs r5, #1 -+ vdup.16 d3, r6 -+ bne 1b -+ // drop through... -+store_tran_4x4_10: -+ vzip.16 d16, d17 -+ add r6, r0, r3 -+ vzip.16 d18, d19 -+ lsl r3, #1 -+ vzip.32 q8, q9 -+ add r5, r0, r3 -+ vst1.16 {d16}, [r0]! -+ vst1.16 {d17}, [r6], r3 -+ vst1.16 {d18}, [r5] -+ asr r3, #1 -+ vst1.16 {d19}, [r6] -+ -+ bx lr -+ -+patch_h_up_4x4_10: -+ ldrd r8, r9, [r2] -+ rsb r6, r4, #32 -+ vmov d0, r8, r9 -+ vdup.16 d3, r4 -+ lsr r11, r8, #16 -+ vdup.16 d2, r6 -+ ldr r8, [r2, #-2]! -+ orr r9, r11, r9, lsl #16 -+ vmov d1, r8, r9 -+ mov r12, r4 -+ vmul.u16 d4, d0, d2 -+ vmla.u16 d4, d1, d3 -+patch_h_up_4x4_10_continue: -+ mov r5, #4 -+1: -+ add r12, r4 -+ cmp r12, #33 -+ it cs -+ addcs r10, r7 -+ mov r11, #0 -+ itt cs -+ subcs r12, #32 -+ tstcs r10, #1<<31 -+ rsb r6, r12, #32 -+ it eq -+ asreq r11, r10, #7 -+ it cs -+ vmovcs d0, r8, r9 -+ it eq -+ biceq r11, #1 -+ vdup.16 d2, r6 -+ it cs -+ lsrcs r6, r8, #16 -+ vdup.16 d3, r12 -+ vext.16 q8, q8, q9, #4 -+ itt cs -+ orrcs r9, r6, r9, lsl #16 -+ ldrhcs r11, [r1, r11] -+ vmov d18, d19 -+ it hi -+ ldrhhi r11, [r2, #-2]! 
-+ vrshr.u16 d19, d4, #5 -+ itt cs -+ orrcs r8, r11, r8, lsl #16 -+ vmovcs d1, r8, r9 -+ vmul.u16 d4, d0, d2 -+ subs r5, #1 -+ vmla.u16 d4, d1, d3 -+ bne 1b -+ -+ b store_tran_4x4_10 -+ -+ -+@ ff_hevc_rpi_pred_angular_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_4_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ bl patch_h_down_4x4_10 -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ bl patch_h_up_4x4_10 -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ ldrh lr, [r2, #-2] @ Top-left -+ ldrh r7, [r7] -+ vmov d0, r8, r9 -+ lsl r9, r9, #16 -+ vdup.16 d2, r12 -+ orr r9, r9, r8, lsr #16 -+ orr r8, lr, r8, lsl #16 -+ vmov d1, r8, r9 -+ sub r1, r7, #128 -+ mov r5, #3 -+1: -+ sel lr, lr, lr @ force pipeline 0 on Cortex-A53 -+ vdup.16 d3, r6 -+ vmul.u16 d4, d0, d2 -+ subs r12, r12, r4 -+ vmla.u16 d4, d1, d3 -+ itttt mi -+ addmi lr, r2, r1, asr #7 -+ bicmi lr, #1 -+ addmi r12, r12, #32 -+ vmovmi d0, r8, r9 -+ rsb r6, r12, #32 -+ itt mi -+ lslmi r9, r9, #16 -+ ldrhmi lr, [lr] -+ vdup.16 d2, r12 -+ vrshr.u16 d4, d4, #5 -+ itttt mi -+ orrmi r9, r9, r8, lsr #16 -+ orrmi r8, lr, r8, lsl #16 -+ vmovmi d1, r8, r9 -+ addmi r1, r1, r7 -+ subs r5, r5, #1 -+ vst1.16 {d4}, [r0], r3 -+ bne 1b -+ -+ vdup.16 d3, r6 -+ nop @ force next insn into pipeline 0 to enable -+ vmul.u16 d4, d0, d2 @ vmla to execute back-to-back on Cortex-A53 -+ vmla.u16 d4, d1, d3 -+ vrshr.u16 d4, d4, #5 -+ vst1.16 {d4}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - 
works along top - left unused -+26: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.16 d3, r6 -+ lsr r8, #16 -+ vdup.16 d2, r12 -+ orr r8, r8, r9, lsl #16 -+ ldr r9, [r1, #6]! -+ vmov d1, r8, r9 -+ mov r5, #3 -+1: -+ vmul.u16 d4, d0, d2 -+ subs r12, r4 -+ vmla.u16 d4, d1, d3 -+ it mi -+ addmi r12, #32 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi d0, r8, r9 -+ lsrmi r8, #16 -+ vdup.16 d2, r12 -+ itt mi -+ orrmi r8, r8, r9, lsl #16 -+ ldrmi r9, [r1, #2]! -+ vrshr.u16 d4, d4, #5 -+ it mi -+ vmovmi d1, r8, r9 -+ vdup.16 d3, r6 -+ subs r5, #1 -+ vst1.16 {d4}, [r0], r3 -+ bne 1b -+ -+ vmul.u16 d4, d0, d2 -+ vmla.u16 d4, d1, d3 -+ vrshr.u16 d4, d4, #5 -+ vst1.16 {d4}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_8_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r1, r2 @ save r2 - r1 unused by patch_down -+ -+ bl patch_h_down_4x4_10 -+ bl patch_h_down_4x4_10_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ sub r0, #16 -+ mov r6, r4 -+ add r0, r0, r3, lsl #2 -+ -+ bl patch_h_down_4x4_10 -+ bl patch_h_down_4x4_10_continue -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ -+ push {r2} -+ bl patch_h_up_4x4_10 -+ bl patch_h_up_4x4_10_continue -+ pop {r2} -+ -+ sub r0, #16 -+ mov r10, #-128 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ -+ bl patch_h_up_4x4_10 -+ bl patch_h_up_4x4_10_continue -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down 
left -+18: -+ vld1.16 {q9}, [r1] -+ sub r1, r2, #2 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ vdup.16 q2, r6 -+ vext.16 q8, q9, q9, #7 -+ sub r8, r7, #128 -+ vld1.16 {d16[0]}, [r1] -+ vdup.16 q3, r12 -+ mov r5, #7 -+1: -+ vmul.u16 q0, q9, q3 -+ subs r12, r4 -+ vmla.u16 q0, q8, q2 -+ ittt cc -+ asrcc r1, r8, #8 -+ addcc r12, #32 -+ addcc r1, r2, r1, lsl #1 -+ vext.16 q10, q8, q8, #7 -+ rsb r6, r12, #32 -+ vmov q11, q8 -+ sub r5, #1 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r8, r7 -+ vld1.16 {d20[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmul.u16 q0, q11, q3 -+ subs r12, r4 -+ vmla.u16 q0, q10, q2 -+ ittt cc -+ asrcc r1, r8, #8 -+ addcc r12, #32 -+ addcc r1, r2, r1, lsl #1 -+ vext.16 q8, q10, q10, #7 -+ rsb r6, r12, #32 -+ vmov q9, q10 -+ sub r5, #1 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r8, r7 -+ vld1.16 {d16[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmul.u16 q0, q11, q3 -+ vmla.u16 q0, q10, q2 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmul.u16 q0, q9, q3 -+ vmla.u16 q0, q8, q2 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ vld1.16 {q9}, [r1]! -+ rsb r12, r6, #32 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vext.16 q8, q9, q9, #1 -+ vld1.16 {d17[3]}, [r1]! 
-+ mov r5, #7 -+1: -+ vmul.u16 q0, q8, q2 -+ subs r12, r4 -+ vmla.u16 q0, q9, q3 -+ it cc -+ addcc r12, #32 -+ vext.16 q10, q8, q8, #1 -+ rsb r6, r12, #32 -+ vld1.16 {d21[3]}, [r1] -+ sub r5, #1 -+ vmov q11, q8 -+ teq r5, #0 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r1, #2 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmul.u16 q0, q10, q2 -+ subs r12, r4 -+ vmla.u16 q0, q11, q3 -+ it cc -+ addcc r12, #32 -+ vext.16 q8, q10, q10, #1 -+ rsb r6, r12, #32 -+ vld1.16 {d17[3]}, [r1] -+ sub r5, #1 -+ vmov q9, q10 -+ teq r5, #0 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r1, #2 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmul.u16 q0, q10, q2 -+ vmla.u16 q0, q11, q3 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmul.u16 q0, q8, q2 -+ vmla.u16 q0, q9, q3 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_16_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r10, #4 -+ mov r1, r2 -+1: -+ bl patch_h_down_4x4_10 -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ add r1, r1, #4*2 -+ mov r6, r4 -+ sub r0, #32 -+ subs r10, #1 -+ add r0, r0, r3, lsl #2 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ vmov.i8 d6, #1<<2 
-+1: -+ push {r2, r10} -+ bl patch_h_up_4x4_10 -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ pop {r2, r10} -+ -+ vmov r8, s12 -+ sub r0, #32 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ vshr.u8 d6, #1 -+ teq r8, #0 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.16 {q0-q1}, [r1] -+ sub r9, r2, #2 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ mov r5, #16 -+1: -+ vld1.16 {d17[3]}, [r9] -+ add r8, r7 -+ vmov q2, q0 -+ vmov q3, q1 -+ asr r9, r8, #8 -+ vext.16 q1, q0, q1, #7 -+ add r9, r2, r9, lsl #1 -+ vext.16 q0, q8, q0, #7 -+2: -+ vmul.u16 q11, q2, q10 -+ subs r12, r4 -+ vmla.u16 q11, q0, q9 -+ it cc -+ addcc r12, #32 -+ vmul.u16 q12, q3, q10 -+ rsb r6, r12, #32 -+ vmla.u16 q12, q1, q9 -+ sub r5, #1 -+ teq r5, #0 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ vrshr.u16 q11, q11, #5 -+ vrshr.u16 q12, q12, #5 -+ vst1.16 {q11-q12}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.16 {q0-q1}, [r1]! 
-+ rsb r12, r6, #32 -+ vld1.16 {d16[0]}, [r5] -+ mov r5, #16 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+1: -+ vmov q2, q0 -+ add r1, #2 -+ vmov q3, q1 -+ vext.16 q0, q0, q1, #1 -+ vext.16 q1, q1, q8, #1 -+2: -+ vmul.u16 q11, q0, q9 -+ subs r12, r4 -+ vmla.u16 q11, q2, q10 -+ it cc -+ addcc r12, #32 -+ vmul.u16 q12, q1, q9 -+ rsb r6, r12, #32 -+ vmla.u16 q12, q3, q10 -+ sub r5, #1 -+ vld1.16 {d16[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ vrshr.u16 q11, q11, #5 -+ vrshr.u16 q12, q12, #5 -+ vst1.16 {q11-q12}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_32_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ vpush {d8} -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ add sp, #8 -+ mov r10, #8 -+ mov r1, r2 -+1: -+ bl patch_h_down_4x4_10 -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ add r1, r1, #4*2 -+ mov r6, r4 -+ sub r0, #64 -+ subs r10, #1 -+ add r0, r0, r3, lsl #2 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ add sp, #8 -+ ldrh r7, [r7] -+ mov r10, #-128 -+ vmov.i8 d6, #1<<6 -+1: -+ push {r2, r10} -+ bl patch_h_up_4x4_10 -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl 
patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ pop {r2, r10} -+ -+ vmov r8, s12 -+ sub r0, #64 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ vshr.u8 d6, #1 -+ teq r8, #0 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ add r5, r1, #32 -+ vld1.16 {q1-q2}, [r1] -+ rsb r12, r6, r6, lsl #16 -+ vld1.16 {q3-q4}, [r5] -+ sub r9, r2, #2 -+ rsb r4, r12, #0 -+ rsb r12, r12, #32 << 16 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vmov d0, d9 -+ vmov s2, r12 -+ add r10, r0, #32 -+ mov r5, #32 -+1: -+ vld1.16 {d1[3]}, [r9] -+ add r8, r7 -+ vmov q11, q4 -+ vmov q10, q3 -+ asr r9, r8, #8 -+ vmov q9, q2 -+ add r9, r2, r9, lsl #1 -+ vmov q8, q1 -+ vext.16 q4, q3, q4, #7 -+ vext.16 q3, q2, q3, #7 -+ vext.16 q2, q1, q2, #7 -+ vext.16 q1, q0, q1, #7 -+2: -+ vmul.u16 q12, q8, d1[1] -+ adds r12, r4 -+ vmla.u16 q12, q1, d1[0] -+ it cc -+ addcc r12, #32 << 16 -+ vmul.u16 q13, q9, d1[1] -+ it cc -+ subcc r12, #32 -+ vmla.u16 q13, q2, d1[0] -+ sub r5, #1 -+ vmul.u16 q14, q10, d1[1] -+ teq r5, #0 -+ vmla.u16 q14, q3, d1[0] -+ vmul.u16 q15, q11, d1[1] -+ vmla.u16 q15, q4, d1[0] -+ vmov s2, r12 -+ vrshr.u16 q12, q12, #5 -+ vrshr.u16 q13, q13, #5 -+ vrshr.u16 q14, q14, #5 -+ vrshr.u16 q15, q15, #5 -+ vst1.16 {q12-q13}, [r0], r3 -+ vst1.16 {q14-q15}, [r10], r3 -+ bhi 2b -+ bne 1b -+ -+ vpop {d8} -+ vmov d9, d0 -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.16 {q1-q2}, [r1] -+ rsb r12, r6, r6, lsl #16 -+ vld1.16 {q3-q4}, [r5] -+ add r1, r1, #64 -+ rsb r4, r12, #0 -+ rsb r12, r12, #32 << 16 -+ vmov d1, d9 -+ vmov s1, r12 -+ add r10, r0, #32 -+ mov r5, #32 -+1: -+ vld1.16 {d0[0]}, [r1]! 
-+ vmov q8, q1 -+ vmov q9, q2 -+ vmov q10, q3 -+ vmov q11, q4 -+ vext.16 q1, q1, q2, #1 -+ vext.16 q2, q2, q3, #1 -+ vext.16 q3, q3, q4, #1 -+ vext.16 q4, q4, q0, #1 -+2: -+ vmul.u16 q12, q1, d0[2] -+ adds r12, r4 -+ vmla.u16 q12, q8, d0[3] -+ it cc -+ addcc r12, #32 << 16 -+ vmul.u16 q13, q2, d0[2] -+ it cc -+ subcc r12, #32 -+ vmla.u16 q13, q9, d0[3] -+ sub r5, #1 -+ vmul.u16 q14, q3, d0[2] -+ teq r5, #0 -+ vmla.u16 q14, q10, d0[3] -+ vmul.u16 q15, q4, d0[2] -+ vmla.u16 q15, q11, d0[3] -+ vmov s1, r12 -+ vrshr.u16 q12, q12, #5 -+ vrshr.u16 q13, q13, #5 -+ vrshr.u16 q14, q14, #5 -+ vrshr.u16 q15, q15, #5 -+ vst1.16 {q12-q13}, [r0], r3 -+ vst1.16 {q14-q15}, [r10], r3 -+ bhi 2b -+ bne 1b -+ -+ vpop {d8} -+ vmov d9, d1 -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+ -+@ Generate 4x4 chroma patch -+@ -+@ In (const) -+@ r1 Up ptr (_up only) -+@ r3 Out stride -+@ r4 Angle add -+@ r7 Inv angle (_up only) -+@ -+@ In/Out (updated) -+@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) -+@ r2 Left ptr - updated -+@ r6 Angle frac (init to r4 + 32) -+@ r8 Inv angle accumulator -+@ q2 Cur Line - load before 1st call for down - set by _up -+@ q8 Cur Line - load before 1st call for up - set by _down -+@ -+@ Temps -+@ r5 Loop counter -+@ r12 -+@ d0, q1, q12-q15 -+ -+patch_h_down_c_4x4_10: -+ vld1.16 {q12}, [r2]! -+ rsb r12, r6, #32 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ mov r5, #4 -+1: -+ vmov q13, q12 -+ vext.16 q12, q12, q12, #2 -+ vld1.32 {d25[1]}, [r2]! -+patch_h_down_c_4x4_10_continue: -+2: -+ vmov q8, q9 -+ subs r12, r4 -+ vmul.u16 q0, q13, q3 -+ it cc -+ addcc r12, #32 -+ vmla.u16 q0, q12, q2 -+ rsb r6, r12, #32 -+ vmov q9, q10 -+ sub r5, #1 -+ vmov q10, q11 -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vrshr.u16 q11, q0, #5 -+ bhi 2b -+ bne 1b -+ -+ bcs 3f -+ vmov q13, q12 -+ vext.16 q12, q12, q12, #2 -+ vld1.32 {d25[1]}, [r2]! 
-+3: -+ -+store_tran_c_4x4_10: -+T add r6, r0, r3 -+ vzip.32 q8, q10 -+A add r6, r0, r3 -+T lsl r3, #1 -+ vzip.32 q9, q11 -+A add r5, r0, r3, lsl #1 -+T add r5, r0, r3 -+ vst2.32 {d16,d18}, [r0]! -+A lsl r3, #1 -+ vst2.32 {d17,d19}, [r6], r3 -+ asr r3, #1 -+ vst2.32 {d20,d22}, [r5] -+ mov r5, #4 -+ vst2.32 {d21,d23}, [r6] -+ bx lr -+ -+patch_h_up_c_4x4_10: -+ vld1.16 {q1}, [r2] -+ rsb r12, r6, #32 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ mov r5, #4 -+1: -+ adds r8, r7 -+ vmov q12, q1 -+ it mi -+ ldrmi r6, [r2, #-4]! -+ vext.16 q1, q1, q1, #6 -+ itt pl -+ asrpl r6, r8, #8 -+ ldrpl r6, [r1, r6, lsl #2] -+ vmov s4, r6 -+patch_h_up_c_4x4_10_continue: -+2: -+ vmov q8, q9 -+ subs r12, r4 -+ vmul.u16 q0, q12, q3 -+ it cc -+ addcc r12, #32 -+ vmla.u16 q0, q1, q2 -+ rsb r6, r12, #32 -+ vmov q9, q10 -+ sub r5, #1 -+ vmov q10, q11 -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vrshr.u16 q11, q0, #5 -+ bhi 2b -+ bne 1b -+ -+ bcs store_tran_c_4x4_10 -+ adds r8, r7 -+ vmov q12, q1 -+ it mi -+ ldrmi r6, [r2, #-4]! 
-+ vext.16 q1, q1, q1, #6 -+ itt pl -+ asrpl r6, r8, #8 -+ ldrpl r6, [r1, r6, lsl #2] -+ vmov s4, r6 -+ b store_tran_c_4x4_10 -+ -+ -+@ ff_hevc_rpi_pred_angular_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r8, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #2 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ bl patch_h_down_c_4x4_10 -+ pop {r4-r8, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r8, #-128 -+ sub r8, r7 -+ bl patch_h_up_c_4x4_10 -+ pop {r4-r8, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.16 {q9}, [r1] -+ sub r1, r2, #4 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ vdup.16 q2, r6 -+ vext.16 q8, q9, q9, #6 -+ sub r8, r7, #128 -+ vld1.32 {d16[0]}, [r1] -+ vdup.16 q3, r12 -+ mov r5, #3 -+1: -+ vmul.u16 q0, q9, q3 -+ subs r12, r4 -+ vmla.u16 q0, q8, q2 -+ ittt cc -+ asrcc r1, r8, #8 -+ addcc r12, #32 -+ addcc r1, r2, r1, lsl #2 -+ vext.16 q10, q8, q8, #6 -+ rsb r6, r12, #32 -+ vmov q11, q8 -+ sub r5, #1 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r8, r7 -+ vld1.32 {d20[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmul.u16 q0, q11, q3 -+ subs r12, r4 -+ vmla.u16 q0, q10, q2 -+ ittt cc -+ asrcc r1, r8, #8 -+ addcc r12, #32 -+ addcc r1, r2, r1, lsl #2 -+ vext.16 q8, q10, q10, #6 -+ rsb r6, r12, #32 -+ vmov q9, q10 -+ sub r5, #1 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r8, r7 -+ vld1.32 {d16[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmul.u16 q0, q11, q3 -+ vmla.u16 q0, q10, q2 -+ vrshr.u16 q0, q0, #5 -+ 
vst1.16 {q0}, [r0] -+ -+ pop {r4-r8, pc} -+4: -+ bcc 3b -+5: -+ vmul.u16 q0, q9, q3 -+ vmla.u16 q0, q8, q2 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r8, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ vld1.16 {q9}, [r1]! -+ rsb r12, r6, #32 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vext.16 q8, q9, q9, #2 -+ vld1.32 {d17[1]}, [r1]! -+ mov r5, #3 -+1: -+ vmul.u16 q0, q8, q2 -+ subs r12, r4 -+ vmla.u16 q0, q9, q3 -+ it cc -+ addcc r12, #32 -+ vext.16 q10, q8, q8, #2 -+ rsb r6, r12, #32 -+ vld1.32 {d21[1]}, [r1] -+ sub r5, #1 -+ vmov q11, q8 -+ teq r5, #0 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r1, #4 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmul.u16 q0, q10, q2 -+ subs r12, r4 -+ vmla.u16 q0, q11, q3 -+ it cc -+ addcc r12, #32 -+ vext.16 q8, q10, q10, #2 -+ rsb r6, r12, #32 -+ vld1.32 {d17[1]}, [r1] -+ sub r5, #1 -+ vmov q9, q10 -+ teq r5, #0 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r1, #4 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmul.u16 q0, q10, q2 -+ vmla.u16 q0, q11, q3 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r8, pc} -+4: -+ bcc 3b -+5: -+ vmul.u16 q0, q8, q2 -+ vmla.u16 q0, q9, q3 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r8, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r8, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #2 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r1, r2 @ save r2 - r1 unused by patch_down -+ -+ bl patch_h_down_c_4x4_10 -+ bl 
patch_h_down_c_4x4_10_continue -+ -+ add r2, r1, #4*4 @ restore r2, but 4 rows further down left -+ sub r0, #32 -+ mov r6, r4 -+ add r0, r0, r3, lsl #2 -+ -+ bl patch_h_down_c_4x4_10 -+ bl patch_h_down_c_4x4_10_continue -+ -+ pop {r4-r8, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r8, #-128 -+ sub r8, r7 -+ -+ push {r2, r8} -+ bl patch_h_up_c_4x4_10 -+ bl patch_h_up_c_4x4_10_continue -+ pop {r2, r8} -+ -+ sub r0, #32 -+ mov r6, r4 -+ add r2, #16 -+ sub r8, r8, r7, lsl #2 -+ add r0, r0, r3, lsl #2 -+ -+ bl patch_h_up_c_4x4_10 -+ bl patch_h_up_c_4x4_10_continue -+ -+ pop {r4-r8, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.16 {q0-q1}, [r1] -+ sub r9, r2, #4 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ mov r5, #8 -+1: -+ vld1.32 {d17[1]}, [r9] -+ add r8, r7 -+ vmov q2, q0 -+ vmov q3, q1 -+ asr r9, r8, #8 -+ vext.16 q1, q0, q1, #6 -+ add r9, r2, r9, lsl #2 -+ vext.16 q0, q8, q0, #6 -+2: -+ vmul.u16 q11, q2, q10 -+ subs r12, r4 -+ vmla.u16 q11, q0, q9 -+ it cc -+ addcc r12, #32 -+ vmul.u16 q12, q3, q10 -+ rsb r6, r12, #32 -+ vmla.u16 q12, q1, q9 -+ sub r5, #1 -+ teq r5, #0 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ vrshr.u16 q11, q11, #5 -+ vrshr.u16 q12, q12, #5 -+ vst1.16 {q11-q12}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r8, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.16 {q0-q1}, [r1]! 
-+ rsb r12, r6, #32 -+ vld1.32 {d16[0]}, [r5] -+ mov r5, #8 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+1: -+ vmov q2, q0 -+ add r1, #4 -+ vmov q3, q1 -+ vext.16 q0, q0, q1, #2 -+ vext.16 q1, q1, q8, #2 -+2: -+ vmul.u16 q11, q0, q9 -+ subs r12, r4 -+ vmla.u16 q11, q2, q10 -+ it cc -+ addcc r12, #32 -+ vmul.u16 q12, q1, q9 -+ rsb r6, r12, #32 -+ vmla.u16 q12, q3, q10 -+ sub r5, #1 -+ vld1.32 {d16[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ vrshr.u16 q11, q11, #5 -+ vrshr.u16 q12, q12, #5 -+ vst1.16 {q11-q12}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r8, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r10, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #2 -+ vpush {d8} -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ add sp, #8 -+ mov r10, #4 -+ mov r1, r2 -+1: -+ bl patch_h_down_c_4x4_10 -+ bl patch_h_down_c_4x4_10_continue -+ bl patch_h_down_c_4x4_10_continue -+ bl patch_h_down_c_4x4_10_continue -+ -+ add r2, r1, #4*4 @ restore r2, but 4 rows further down left -+ add r1, r1, #4*4 -+ mov r6, r4 -+ sub r0, #64 -+ subs r10, #1 -+ add r0, r0, r3, lsl #2 -+ bne 1b -+ -+ pop {r4-r10, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ add sp, #8 -+ mov r10, #4 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ sub r8, r7 -+2: -+ push {r2, r8} -+ bl patch_h_up_c_4x4_10 -+ bl patch_h_up_c_4x4_10_continue -+ bl patch_h_up_c_4x4_10_continue -+ bl patch_h_up_c_4x4_10_continue -+ pop {r2, r8} -+ -+ sub r0, #64 -+ mov r6, r4 -+ add r2, #16 -+ sub r8, r8, r7, lsl #2 -+ add r0, r0, r3, lsl #2 -+ subs r10, #1 -+ bne 2b -+ -+ pop {r4-r10, pc} -+ -+@ Left 
of vertical - works down left -+18: -+ add r5, r1, #32 -+ vld1.16 {q1-q2}, [r1] -+ rsb r12, r6, r6, lsl #16 -+ vld1.16 {q3-q4}, [r5] -+ sub r9, r2, #4 -+ rsb r4, r12, #0 -+ rsb r12, r12, #32 << 16 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vmov d0, d9 -+ vmov s2, r12 -+ add r10, r0, #32 -+ mov r5, #16 -+1: -+ vld1.32 {d1[1]}, [r9] -+ add r8, r7 -+ vmov q11, q4 -+ vmov q10, q3 -+ asr r9, r8, #8 -+ vmov q9, q2 -+ add r9, r2, r9, lsl #2 -+ vmov q8, q1 -+ vext.16 q4, q3, q4, #6 -+ vext.16 q3, q2, q3, #6 -+ vext.16 q2, q1, q2, #6 -+ vext.16 q1, q0, q1, #6 -+2: -+ vmul.u16 q12, q8, d1[1] -+ adds r12, r4 -+ vmla.u16 q12, q1, d1[0] -+ it cc -+ addcc r12, #32 << 16 -+ vmul.u16 q13, q9, d1[1] -+ it cc -+ subcc r12, #32 -+ vmla.u16 q13, q2, d1[0] -+ sub r5, #1 -+ vmul.u16 q14, q10, d1[1] -+ teq r5, #0 -+ vmla.u16 q14, q3, d1[0] -+ vmul.u16 q15, q11, d1[1] -+ vmla.u16 q15, q4, d1[0] -+ vmov s2, r12 -+ vrshr.u16 q12, q12, #5 -+ vrshr.u16 q13, q13, #5 -+ vrshr.u16 q14, q14, #5 -+ vrshr.u16 q15, q15, #5 -+ vst1.16 {q12-q13}, [r0], r3 -+ vst1.16 {q14-q15}, [r10], r3 -+ bhi 2b -+ bne 1b -+ -+ vpop {d8} -+ vmov d9, d0 -+ pop {r4-r10, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.16 {q1-q2}, [r1] -+ rsb r12, r6, r6, lsl #16 -+ vld1.16 {q3-q4}, [r5] -+ add r1, r1, #64 -+ rsb r4, r12, #0 -+ rsb r12, r12, #32 << 16 -+ vmov d1, d9 -+ vmov s1, r12 -+ add r10, r0, #32 -+ mov r5, #16 -+1: -+ vld1.32 {d0[0]}, [r1]! 
-+ vmov q8, q1 -+ vmov q9, q2 -+ vmov q10, q3 -+ vmov q11, q4 -+ vext.16 q1, q1, q2, #2 -+ vext.16 q2, q2, q3, #2 -+ vext.16 q3, q3, q4, #2 -+ vext.16 q4, q4, q0, #2 -+2: -+ vmul.u16 q12, q1, d0[2] -+ adds r12, r4 -+ vmla.u16 q12, q8, d0[3] -+ it cc -+ addcc r12, #32 << 16 -+ vmul.u16 q13, q2, d0[2] -+ it cc -+ subcc r12, #32 -+ vmla.u16 q13, q9, d0[3] -+ sub r5, #1 -+ vmul.u16 q14, q3, d0[2] -+ teq r5, #0 -+ vmla.u16 q14, q10, d0[3] -+ vmul.u16 q15, q4, d0[2] -+ vmla.u16 q15, q11, d0[3] -+ vmov s1, r12 -+ vrshr.u16 q12, q12, #5 -+ vrshr.u16 q13, q13, #5 -+ vrshr.u16 q14, q14, #5 -+ vrshr.u16 q15, q15, #5 -+ vst1.16 {q12-q13}, [r0], r3 -+ vst1.16 {q14-q15}, [r10], r3 -+ bhi 2b -+ bne 1b -+ -+ vpop {d8} -+ vmov d9, d1 -+ pop {r4-r10, pc} -+ -+endfunc -diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S -new file mode 100644 -index 0000000000..df8c1c25b9 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S -@@ -0,0 +1,705 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. 
-+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox, Ben Avison -+*/ -+ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+ -+@ ff_hevc_rpi_pred_dc_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_4_neon_8, export=1 -+ -+ @ Average the els of top & left -+ ldr r2, [r2] -+ vld1.32 {d0[0]}, [r1] -+ mov r1, #2 -+ vmov s1, r2 -+ vmov s2, r2 -+ vmov.i16 q2, #3 -+ add r2, r0, r3 -+ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0] -+ lsl r3, #1 -+ vmovl.u8 q0, d0 -+ vmov.i64 d7, #0xffff -+ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
-+ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) -+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] -+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vmov.i64 d7, #0xff -+ vpadd.i16 d6, d6 @ 1 (all the same) -+ vrshr.u16 d6, #3 -+ vmla.i16 q0, q2, d6[0] -+ vdup.8 d6, d6[0] -+ vrshrn.i16 d0, q0, #2 -+ -+ @ Store top line -+ vst1.32 {d0[0]}, [r0], r3 -+ -+ @ Store the rest -+ vshr.u64 d1, d0, #5*8 -+ vshr.u64 d2, d0, #6*8 -+ vshr.u64 d3, d0, #7*8 -+ vbif d1, d6, d7 -+ vbif d2, d6, d7 -+ vst1.32 {d1[0]}, [r2], r3 -+ vbif d3, d6, d7 -+ vst1.32 {d2[0]}, [r0] -+ vst1.32 {d3[0]}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {d0}, [r1] -+ vld1.8 {d1}, [r2] -+A add r2, r0, r3, lsl #1 -+A lsl r3, #2 -+T lsl r3, #1 -+T add r2, r0, r3 -+T lsl r3, #1 -+ vaddl.u8 q0, d0, d1 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ vpadd.i32 d2, d0, d0 @ This adds U & V separately -+ vpadd.i32 d3, d0, d0 -+ vrshrn.u16 d0, q1, #3 -+ -+ @ Store -+ vst1.8 {d0}, [r0], r3 -+ vst1.8 {d0}, [r2], r3 -+ vst1.8 {d0}, [r0] -+ vst1.8 {d0}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_8_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {d0}, [r1] -+ mov r1, #2 -+ vld1.8 {d16}, [r2] -+ vmov.i16 q2, #3 -+ vmov.i64 d7, #0xffff -+ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0] -+ vmovl.u8 q0, d0 -+ vadd.i16 d6, d2, d3 @ d6 has 4 vals -+ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
-+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] -+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vmov.i64 d7, #0xff -+ vmovl.u8 q1, d16 -+ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) -+ vpadd.i16 d6, d6 @ 1 (all the same) -+ vrshr.u16 d6, #4 -+ vmla.i16 q1, q2, d6[0] -+ vmla.i16 q0, q2, d6[0] -+ vdup.8 d6, d6[0] -+ vrshrn.i16 d2, q1, #2 -+ vrshrn.i16 d0, q0, #2 -+ -+ @ Store top line -+ vst1.8 {d0}, [r0], r3 -+ -+ @ Store the rest -+ vshr.u64 d2, #8 -+ vbit d6, d2, d7 -+ vshr.u64 d2, #8 -+ vst1.8 {d6}, [r0], r3 -+ mov r1, #6 -+1: -+ vbit d6, d2, d7 -+ vshr.u64 d2, #8 -+ vst1.8 {d6}, [r0], r3 -+ subs r1, #2 -+ vbit d6, d2, d7 -+ vshr.u64 d2, #8 -+ vst1.8 {d6}, [r0], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {q0}, [r1] -+ mov r1, #8 -+ vld1.8 {q1}, [r2] -+T lsl r3, #1 -+ vaddl.u8 q0, d0, d1 -+A add r2, r0, r3, lsl #1 -+A lsl r3, #2 -+T add r2, r0, r3 -+T lsl r3, #1 -+ vaddl.u8 q1, d2, d3 -+ vadd.i16 q1, q0 -+ vadd.i16 d3, d2 @ d3 has 2 val pairs -+ vpadd.i32 d2, d3, d3 @ This add U & V separately -+ vpadd.i32 d3, d3, d3 -+ vrshrn.u16 d0, q1, #4 -+ vrshrn.u16 d1, q1, #4 -+ -+ @ Store -+1: -+ vst1.8 {q0}, [r0], r3 -+ subs r1, #4 -+ vst1.8 {q0}, [r2], r3 -+ vst1.8 {q0}, [r0], r3 -+ vst1.8 {q0}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_16_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {q8}, [r1] -+ mov r1, #2 -+ vld1.8 {q9}, [r2] -+ vaddl.u8 q10, d16, d17 -+ vaddl.u8 q11, d16, d18 -+ vaddl.u8 q0, 
d18, d19 -+ vmov.i16 q1, #3 -+ vadd.i16 q10, q0 -+ vmovl.u8 q0, d18 -+ vadd.i16 d20, d21 -+ vmov.i16 d2[0], r1 @ 2, 3, 3, 3... -+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vmovl.u8 q2, d16 -+ vmovl.u8 q9, d19 -+ vpadd.i16 d20, d20 @ 2 (top & bottom of vector the same) -+ vmov.i64 d7, #0xffff -+ vmovl.u8 q8, d17 -+ vbit d4, d22, d7 @ q2 = top[0]+left[0], top[1..7] -+ vmov.i64 d7, #0xff -+ vpadd.i16 d20, d20 @ 1 (all the same) -+ vrshr.u16 d21, d20, #5 -+ vrshr.u16 d20, d20, #5 -+ vmla.i16 q0, q10, d2[1] -+ vmla.i16 q9, q10, d2[1] -+ vmla.i16 q2, q10, q1 -+ vmla.i16 q8, q10, d2[1] -+ vdup.8 q1, d20[0] -+ vrshrn.i16 d0, q0, #2 -+ vrshrn.i16 d1, q9, #2 -+ vrshrn.i16 d4, q2, #2 -+ vrshrn.i16 d5, q8, #2 -+ vext.8 q0, q0, q0, #1 -+ -+ @ Store top line -+ vst1.8 {q2}, [r0], r3 -+ -+ @ Store the rest -+ mov r1, #15 -+1: -+ vbit d2, d0, d7 -+ vext.8 q0, q0, q0, #1 -+ subs r1, #1 -+ vst1.8 {q1}, [r0], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {q0-q1}, [r1] -+ mov r1, #16 -+ vld1.8 {q2-q3}, [r2] -+T lsl r3, #1 -+ vaddl.u8 q0, d0, d1 -+A add r2, r0, r3, lsl #1 -+T add r2, r0, r3 -+ vaddl.u8 q1, d2, d3 -+A lsl r3, #2 -+T lsl r3, #1 -+ vaddl.u8 q2, d4, d5 -+ vaddl.u8 q3, d6, d7 -+ vadd.i16 q0, q1 -+ vadd.i16 q2, q3 -+ vadd.i16 q0, q2 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ vpadd.i32 d4, d0, d0 @ This adds U & V separately -+ vpadd.i32 d5, d0, d0 -+ vrshrn.u16 d0, q2, #5 -+ vrshrn.u16 d1, q2, #5 -+ vrshrn.u16 d2, q2, #5 -+ vrshrn.u16 d3, q2, #5 -+ -+ @ Store -+1: -+ vst1.8 {q0-q1}, [r0], r3 -+ subs r1, #2 -+ vst1.8 {q0-q1}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_32_neon_8 -+@ 
uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_32_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {q0-q1}, [r1] -+ mov r1, #32 -+ vld1.8 {q2-q3}, [r2] -+ add r2, r0, r3 -+ vaddl.u8 q0, d0, d1 -+ lsl r3, #1 -+ vaddl.u8 q1, d2, d3 -+ vaddl.u8 q2, d4, d5 -+ vaddl.u8 q3, d6, d7 -+ vadd.i16 q0, q1 -+ vadd.i16 q2, q3 -+ vadd.i16 q0, q2 -+ vadd.i16 d0, d1 @ d0 has 4 vals -+ vpadd.i16 d0, d0 @ 2 (top & bottom the same) -+ vpadd.i16 d4, d0, d0 @ 1 (all the same) -+ vpadd.i16 d5, d0, d0 -+ vrshrn.u16 d0, q2, #6 -+ vrshrn.u16 d1, q2, #6 -+ vrshrn.u16 d2, q2, #6 -+ vrshrn.u16 d3, q2, #6 -+ -+ @ Store -+1: -+ vst1.8 {q0-q1}, [r0], r3 -+ subs r1, #2 -+ vst1.8 {q0-q1}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ----------------------------------------------------------------------------- -+@ -+@ 10 Bit versions -+@ -+@ There is no actual bit depth dependency in this code except that our -+@ intermediate results will overflow the 16 bits they are stored in -+@ All there functions are good to 10 bits - with the worst case being -+@ in dc_32 where we use all 16 bits. -+ -+ -+@ ff_hevc_rpi_pred_dc_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_4_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vld1.16 {d0}, [r1] -+ mov r1, #2 -+ vld1.16 {d1}, [r2] -+T lsl r3, #1 -+ vmov.i16 q2, #3 -+A add r2, r0, r3, lsl #1 -+T add r2, r0, r3 -+ vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0] -+A lsl r3, #2 -+T lsl r3, #1 -+ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
-+ vmov.i64 d7, #0xffff -+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] -+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) -+ vpadd.i16 d6, d6 @ 1 (all the same) -+ vrshr.u16 d6, #3 -+ vmla.i16 q0, q2, d6[0] -+ vrshr.u16 q0, #2 -+ -+ @ Store top line -+ vst1.16 {d0}, [r0], r3 -+ -+ @ Store the rest -+ vshr.u64 d3, d1, #1*16 -+ vshr.u64 d4, d1, #2*16 -+ vshr.u64 d5, d1, #3*16 -+ vbif d3, d6, d7 -+ vbif d4, d6, d7 -+ vst1.16 {d3}, [r2], r3 -+ vbif d5, d6, d7 -+ vst1.16 {d4}, [r0] -+ vst1.16 {d5}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] (In pels - needs * 4) -+ -+function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {q0}, [r1] -+ vld1.8 {q1}, [r2] -+A add r2, r0, r3, lsl #2 -+A lsl r3, #3 -+T lsl r3, #2 -+T add r2, r0, r3 -+T lsl r3, #1 -+ vadd.i16 q0, q1 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ vpadd.i32 d2, d0, d0 @ This adds U & V separately -+ vpadd.i32 d3, d0, d0 -+ vrshr.u16 q0, q1, #3 -+ -+ vst1.16 {q0}, [r0], r3 -+ vst1.16 {q0}, [r2], r3 -+ vst1.16 {q0}, [r0] -+ vst1.16 {q0}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_8_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vld1.16 {q0}, [r1] -+ mov r1, #2 -+ vld1.16 {q8}, [r2] -+T lsl r3, #1 -+ vmov.i16 q2, #3 -+A add r2, r0, r3, lsl #1 -+T add r2, r0, r3 -+ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0] -+A lsl r3, #2 -+T lsl r3, #1 -+ vmov.i64 d7, #0xffff -+ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
-+ vadd.i16 d6, d2, d3 @ d6 has 4 vals -+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] -+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) -+ vpadd.i16 d6, d6 @ 1 (all the same) -+ vrshr.u16 d6, #4 -+ vmla.i16 q8, q2, d6[0] -+ vmla.i16 q0, q2, d6[0] -+ vdup.16 q2, d6[0] -+ vdup.16 q9, d6[0] -+ vrshr.u16 q8, q8, #2 -+ vrshr.u16 q0, q0, #2 -+ vext.16 q1, q8, q8, #1 -+ -+ @ Store top line -+ vst1.16 {q0}, [r0], r3 -+ -+ @ Store the rest -+ vbit d18, d2, d7 -+ vst1.16 {q9}, [r2], r3 -+ mov r1, #6 -+1: -+ vext.16 q8, q8, q8, #2 -+ subs r1, #2 -+ vext.16 q1, q1, q1, #2 -+ vbit d4, d16, d7 -+ vst1.16 {q2}, [r0], r3 -+ vbit d18, d2, d7 -+ vst1.16 {q9}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] (In pels - needs * 4) -+ -+function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vld1.16 {q0-q1}, [r1] -+ mov r1, #8 -+ vld1.16 {q2-q3}, [r2] -+T lsl r3, #2 -+ vadd.i16 q1, q0 -+A add r2, r0, r3, lsl #2 -+A lsl r3, #3 -+T add r2, r0, r3 -+T lsl r3, #1 -+ vadd.i16 q2, q3 -+ vadd.i16 q1, q2 -+ vadd.i16 d3, d2 @ d3 has 2 val pairs -+ vpadd.i32 d2, d3, d3 @ This add U & V separately -+ vpadd.i32 d3, d3, d3 -+ vrshr.u16 q0, q1, #4 -+ vrshr.u16 q1, q1, #4 -+ -+ @ Store -+1: -+ vst1.8 {q0-q1}, [r0], r3 -+ subs r1, #2 -+ vst1.8 {q0-q1}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_16_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vld1.16 {q8-q9}, [r1] -+ mov r1, #2 -+ vld1.16 {q10-q11}, [r2] -+ lsl r3, #1 @ stride given in pels -+ vadd.i16 q0, 
q8, q9 -+ vadd.i16 q1, q10, q11 -+ vmov.i16 q3, #3 -+ vadd.i16 q1, q0 -+ vadd.i16 d0, d16, d20 -+ vmov.i64 d31, #0xffff -+ vadd.i16 d3, d2 -+ vmov.16 d6[0], r1 @ 2, 3, 3, 3... -+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ topline[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7] -+ vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same) -+ vpadd.i16 d3, d3 @ 1 (all the same) -+ vrshr.u16 d2, d3, #5 -+ vrshr.u16 d3, d3, #5 -+ vmov q0, q1 -+ vmla.i16 q10, q1, d6[1] -+ vmla.i16 q11, q1, d6[1] -+ vmla.i16 q8, q1, q3 -+ vmla.i16 q9, q1, d6[1] -+ vrshr.u16 q2, q10, #2 -+ vrshr.u16 q3, q11, #2 -+ vrshr.u16 q8, #2 -+ vrshr.u16 q9, #2 -+ vext.16 q2, q2, q2, #1 -+ mov r1, #7<<29 -+ -+ @ Store top line -+ vst1.16 {q8-q9}, [r0], r3 -+ -+ @ Store the rest -+1: -+ vbit d0, d4, d31 -+ vext.16 q2, q2, q2, #1 -+ subs r1, #1<<29 -+ vst1.16 {q0-q1}, [r0], r3 -+ bne 1b -+1: -+ vbit d0, d6, d31 -+ vext.16 q3, q3, q3, #1 -+ subs r1, #1<<29 -+ vst1.16 {q0-q1}, [r0], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] (In pels - needs * 4) -+ -+function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vldm r1, {q0-q3} -+ vldm r2, {q8-q11} -+ vadd.i16 q0, q1 -+ mov r1, #16 -+ vadd.i16 q2, q3 -+ add r2, r0, #32 -+ vadd.i16 q8, q9 -+ lsl r3, #2 -+ vadd.i16 q10, q11 -+ vadd.u16 q0, q2 -+ vadd.u16 q8, q10 -+ vadd.i16 q0, q8 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ vpadd.i32 d4, d0, d0 @ This adds U & V separately -+ vpadd.i32 d5, d0, d0 -+ vrshr.u16 q0, q2, #5 -+ vrshr.u16 q1, q2, #5 -+ -+ @ Store -+1: -+ vst1.16 {q0-q1}, [r0], r3 -+ subs r1, #1 -+ vst1.16 {q0-q1}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const 
uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] (In pels) -+ -+function ff_hevc_rpi_pred_dc_32_neon_10, export=1 -+ -+ @ Average the els of top & left -+ @ With 10 bits we are (just) safe from overflow in i16 -+ vldm r1, {q0-q3} -+ vldm r2, {q8-q11} -+ vadd.i16 q0, q1 -+ mov r1, #32 -+ vadd.i16 q2, q3 -+ add r2, r0, #32 -+ vadd.i16 q8, q9 -+ lsl r3, #1 -+ vadd.i16 q10, q11 -+ vadd.u16 q0, q2 -+ vadd.u16 q8, q10 -+ vadd.i16 q0, q8 -+ vadd.i16 d0, d1 @ d0 has 4 vals -+ vpadd.i16 d0, d0 @ 2 (top & bottom the same) -+ vpadd.i16 d4, d0, d0 @ 1 (all the same) -+ vpadd.i16 d5, d0, d0 -+ vrshr.u16 q0, q2, #6 -+ vrshr.u16 q1, q2, #6 -+ -+ @ Store -+1: -+ vst1.16 {q0-q1}, [r0], r3 -+ subs r1, #1 -+ vst1.16 {q0-q1}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S -new file mode 100644 -index 0000000000..f6969d3591 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S -@@ -0,0 +1,881 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. 
-+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox, Ben Avison -+*/ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+@ All functions have the call -+@ -+@ int ff_hevc_rpi_intra_filter_N_neon_PW( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+@ -+@ Assumptions: -+@ (that wouldn't apply to all frame layoouts but do apply to sand, so beware -+@ if reuseing this code) -+@ -+@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for -+@ N==4, but do for chroma N>=8. As we share Y/C fns that means we can ignore -+@ N==8,PW=8 (chroma always PW>8) but have to cope for larger -+@ -+@ We always have at least 64 pixel H frame width rounding - this lets us -+@ load UR widthout having to worry about exactly how many pixels are actually -+@ within the frame. As partial loads will only occur very occasionally this -+@ should be a win in nearly all cases. 
-+@ -+@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters -+@ so we do no maths on the contents -+@ -+@ No filtering in 32bit fns as they are chroma only -+ -+ -+.equ AVAIL_UR, 1 -+.equ AVAIL_U, 2 -+.equ AVAIL_UL, 4 -+.equ AVAIL_L, 8 -+.equ AVAIL_DL, 16 -+ -+.equ FILTER_LIGHT, 0x40 -+.equ FILTER_STRONG, 0x80 -+ -+.equ AVAIL_S_UR_N_U_C, 32 - 1 -+.equ AVAIL_S_U_N_UL_C, 32 - 2 -+.equ AVAIL_S_UL_N_L_C, 32 - 3 -+.equ AVAIL_S_L_N_DL_C, 32 - 4 -+ -+.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr -+ -+@ On entry -+@ r2 req -+@ r3 avail -+@ [sp, #sp_offset...] args -+@ -+@ On Exit: -+@ -+@ Extend values: -+@ d_l scalar contains value for L & DL -+@ if DL avail then this is is DL[0] so we don't need to load that -+@ d_ul scalar containing value for UL -+@ d_u scalar containing value for U -+@ d_ur scalar containing value for UR -+@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else... -+@ This means that L-light-filter works even if nreq DL (we never filter -+@ req-DL without req-L, but we do filter req-L without req-DL) -+@ If UR avail then d_ur == a_ur so U-filter good too -+@ -+@ Data load pointers (only load if req & avail): -+@ r4 DL + stride -+@ r10 L -+@ r6 U -+@ r5 UR -+@ -+@ Others: -+@ r2 req -+@ r7 req & avail -+@ r3 L + stride -+@ r8 DL + stride * 2 -+@ r9 stride * 2 -+@ cs Load U -+@ mi Load UR -+@ -+@ Clobbered: -+@ r12 -+ -+.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur -+ -+.equ src_l\@, \sp_offset + 0 -+.equ src_u\@, \sp_offset + 4 -+.equ src_ur\@, \sp_offset + 8 -+.equ stride\@, \sp_offset + 12 -+.equ pw\@, (1 << \pw_s) @ pel width in bytes -+.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes -+ -+@ r9 stride -+@ r7 = ab_ul, r6 = a_u, r5 = a_ur -+@ r4 = b_dl, r10 = b_l, r8 = b_u -+ -+ ldr r5, [sp, #src_ur\@] -+ lsl r12, r3, #AVAIL_S_U_DL_CPSR -+ ldr r10, [sp, #src_l\@] -+ ldr r9, [sp, #stride\@] -+ ldr r6, [sp, #src_u\@] -+ -+ @ This is quite a slow instruction but it 
replaces -+ @ a decent number of tests that yield a max of 2 flags/op -+ @ It is annoying we can't branch on Q! -+ @ If L navail (ne) then DL must be navail (pl) -+ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur -+ -+ mov r4, r5 -+ sub r7, r10, r9 -+ it vs -+ movvs r4, r6 -+ add r8, r6, #b_size\@ - pw\@ -+ it cs -+ movcs r4, r7 -+ ite ne -+ movne r10, r4 -+ addeq r4, r7, r9, lsl #\log2_s -+ it cc -+ movcc r7, r10 -+ it mi -+ addmi r4, r10, r9, lsl #\log2_s -+ vld1.\d_type {\d_ul}, [r7] -+ itt vc -+ movvc r8, r7 -+ movvc r6, r7 -+ vld1.\d_type {\d_l }, [r4], r9 -+ tst r3, #AVAIL_UR -+ vld1.\d_type {\d_u }, [r6] -+ it eq -+ moveq r5, r8 -+ and r7, r2, r3 -+ add r8, r4, r9 -+ vld1.\d_type {\d_ur}, [r5] -+ lsls r12, r7, #AVAIL_S_UR_N_U_C -+ add r3, r10, r9 -+ lsl r9, #1 -+.endm -+ -+ -+ -+@ int ff_hevc_rpi_intra_filter_4_neon_8( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set pw_s, 0 -+.set pw, (1 << pw_s) -+.set log2_s, 2 -+ -+function ff_hevc_rpi_intra_filter_4_neon_8, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[] -+ -+ it cs -+ vldrcs s2, [r6] -+ ite pl -+ vmovpl s3, s4 -+ vldrmi s3, [r5] -+ -+ lsls r7, #AVAIL_S_L_N_DL_C -+ add r12, r0, #-pw -+ bpl 1f -+ -+ vld1.8 {d0[0]}, [r10], r9 -+ vld1.8 {d0[1]}, [r3], r9 -+ vld1.8 {d0[2]}, [r10] -+ vld1.8 {d0[3]}, [r3] -+1: -+ bcc 1f -+ vld1.8 {d0[5]}, [r4], r9 -+ vld1.8 {d0[6]}, [r8] -+ vld1.8 {d0[7]}, [r4] -+1: -+ vstr d1, [r1] @ Up -+ vst1.8 {d31[7]}, [r12] -+ vstr d0, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_4_neon_16( -+@ pixel * const left, [r0] -+@ pixel * 
const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set pw_s, 1 -+.set pw, (1 << pw_s) -+.set log2_s, 2 -+ -+function ff_hevc_rpi_intra_filter_4_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[] -+ -+ it cs -+ vldrcs d2, [r6] -+ it mi -+ vldrmi d3, [r5] -+ lsls r7, #AVAIL_S_L_N_DL_C -+ add r12, r0, #-pw -+ bpl 1f -+ vld1.16 {d0[0]}, [r10], r9 -+ vld1.16 {d0[1]}, [r3], r9 -+ vld1.16 {d0[2]}, [r10] -+ vld1.16 {d0[3]}, [r3] -+1: -+ bcc 1f -+ vld1.16 {d1[1]}, [r4], r9 -+ vld1.16 {d1[2]}, [r8] -+ vld1.16 {d1[3]}, [r4] -+1: -+ vst1.16 {q1}, [r1] @ Up -+ vst1.16 {d31[3]}, [r12] -+ vst1.16 {q0}, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_8_neon_8( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set pw_s, 0 -+.set pw, (1 << pw_s) -+.set log2_s, 3 -+ -+function ff_hevc_rpi_intra_filter_8_neon_8, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[] -+ -+ it cs -+ vldrcs d4, [r6] -+ it mi -+ vldrmi d5, [r5] -+ -+ lsls r7, #AVAIL_S_L_N_DL_C -+ bpl 1f -+ vld1.8 {d0[0]}, [r10], r9 -+ vld1.8 {d0[1]}, [r3], r9 -+ vld1.8 {d0[2]}, [r10], r9 -+ vld1.8 {d0[3]}, [r3], r9 -+ vld1.8 {d0[4]}, [r10], r9 -+ vld1.8 {d0[5]}, [r3], r9 -+ vld1.8 {d0[6]}, [r10] 
-+ vld1.8 {d0[7]}, [r3] -+1: -+ bcc 1f -+ vld1.8 {d1[1]}, [r4], r9 -+ vld1.8 {d1[2]}, [r8], r9 -+ vld1.8 {d1[3]}, [r4], r9 -+ vld1.8 {d1[4]}, [r8], r9 -+ vld1.8 {d1[5]}, [r4], r9 -+ vld1.8 {d1[6]}, [r8] -+ vld1.8 {d1[7]}, [r4] -+1: -+ tst r2, #FILTER_LIGHT -+ add r12, r0, #-pw -+ beq 10f -+ -+ @ Luma light filter -+ vext.8 q8, q15, q2, #15 -+ vext.8 q12, q15, q0, #15 -+ vaddl.u8 q9, d17, d5 -+ vaddl.u8 q8, d16, d4 -+ vaddl.u8 q13, d25, d1 -+ vaddl.u8 q12, d24, d0 -+ vmov.u8 r3, d5[7] @ Save final pel -+ vmov.u8 r2, d1[7] @ Save final pel -+ -+ vext.16 q2, q8, q9, #1 -+ vext.16 q3, q9, q9, #1 -+ vext.16 q0, q12, q13, #1 -+ vext.16 q1, q13, q13, #1 -+ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] -+ vadd.u16 q2, q8 -+ vadd.u16 q3, q9 -+ vadd.u16 q0, q12 -+ vadd.u16 q1, q13 -+ -+ vrshrn.u16 d4, q2, #2 -+ vrshrn.u16 d5, q3, #2 -+ vrshrn.u16 d0, q0, #2 -+ vrshrn.u16 d1, q1, #2 -+ vrshr.u16 d30, #2 -+ vmov.u8 d5[7], r3 @ Restore final pel -+ vmov.u8 d1[7], r2 @ Restore final pel -+ vdup.u8 d31, d30[0] @ d31[3] = d30[0] -+ -+10: -+ vst1.8 {q2 }, [r1] @ Up -+ vst1.8 {d31[7]}, [r12] @ Up-left -+ vst1.8 {q0 }, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_8_neon_16( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set ur_size, sp_base + 16 -+.set dl_size, sp_base + 20 -+.set pw_s, 1 -+.set pw, (1 << pw_s) -+.set log2_s, 3 -+.set p_size, (1 << log2_s) @ size in pels -+ -+function ff_hevc_rpi_intra_filter_8_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]" -+ -+ it cs -+ vldmcs r6, {d4, d5} -+ ldr 
r12, [sp, #ur_size] -+ bpl 1f -+ cmp r12, #4 -+ vldm r5, {d6, d7} -+ bgt 1f -+ vdup.16 d7, d6[3] -+1: -+ lsls r12, r7, #AVAIL_S_L_N_DL_C -+ vdup.16 q1, d0[0] -+ bpl 1f -+ vld1.16 {d0[0]}, [r10], r9 -+ vld1.16 {d0[1]}, [r3], r9 -+ vld1.16 {d0[2]}, [r10], r9 -+ vld1.16 {d0[3]}, [r3], r9 -+ vld1.16 {d1[0]}, [r10], r9 -+ vld1.16 {d1[1]}, [r3], r9 -+ vld1.16 {d1[2]}, [r10] -+ vld1.16 {d1[3]}, [r3] -+1: -+ bcc 1f -+ ldr r12, [sp, #dl_size] -+ vld1.16 {d2[1]}, [r4], r9 -+ cmp r12, #p_size -+ vld1.16 {d2[2]}, [r8], r9 -+ vld1.16 {d2[3]}, [r4], r9 -+ blt 2f -+ vld1.16 {d3[0]}, [r8], r9 -+ vld1.16 {d3[1]}, [r4], r9 -+ vld1.16 {d3[2]}, [r8] -+ vld1.16 {d3[3]}, [r4] -+ b 1f -+2: -+ vdup.16 d3, d2[3] -+1: -+ tst r2, #FILTER_LIGHT -+ add r12, r0, #-pw -+ beq 10f -+ -+ @ Luma light filter -+ vext.16 q9, q2, q3, #7 -+ vext.16 q8, q15, q2, #7 -+ vext.16 q13, q0, q1, #7 -+ vext.16 q12, q15, q0, #7 -+ vadd.u16 q9, q3 -+ vadd.u16 q8, q2 -+ vadd.u16 q13, q1 -+ vadd.u16 q12, q0 -+ vmov.u16 r3, d7[3] @ Save final pel -+ vmov.u16 r2, d3[3] @ Save final pel -+ -+ vext.16 q2, q8, q9, #1 -+ vext.16 q3, q9, q9, #1 -+ vext.16 q0, q12, q13, #1 -+ vext.16 q1, q13, q13, #1 -+ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] -+ vadd.u16 q2, q8 -+ vadd.u16 q3, q9 -+ vadd.u16 q0, q12 -+ vadd.u16 q1, q13 -+ -+ vrshr.u16 q2, #2 -+ vrshr.u16 q3, #2 -+ vrshr.u16 q0, #2 -+ vrshr.u16 q1, #2 -+ vrshr.u16 d30, #2 -+ vmov.u16 d7[3], r3 @ Restore final pel -+ vmov.u16 d3[3], r2 @ Restore final pel -+ vdup.u16 d31, d30[0] @ d31[3] = d30[0] -+ -+10: -+ vst1.16 {q2, q3}, [r1] @ Up -+ vst1.16 {d31[3]}, [r12] @ Up-left -+ vst1.16 {q0, q1}, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+@ int ff_hevc_rpi_intra_filter_16_neon_16( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] 
(pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set ur_size, sp_base + 16 -+.set dl_size, sp_base + 20 -+.set pw_s, 1 -+.set pw, (1 << pw_s) -+.set log2_s, 4 -+.set p_size, (1 << log2_s) @ size in pels -+ -+function ff_hevc_rpi_intra_filter_16_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]" -+ -+ vdup.16 q9, d16[0] -+ vdup.16 q11, d20[0] -+ -+ it cs -+ vldmcs r6, {d16-d19} -+ ldr r12, [sp, #ur_size] -+ bpl 1f -+ cmp r12, #12 -+ @ Given chroma frame layout, if UR exists then it is always legit to -+ @ load all of it even if most of it is outside the frame. -+ vldm r5, {d20-d23} -+ bgt 1f -+ bge 4f -+ cmp r12, #8 -+ bge 3f -+ vdup.16 d21, d20[3] -+3: vdup.16 d22, d21[3] -+4: vdup.16 d23, d22[3] -+ -+1: -+ lsls r7, #AVAIL_S_L_N_DL_C -+ ldr r12, [sp, #dl_size] -+ vdup.16 q1, d0[0] -+ vdup.16 q2, d0[0] -+ vdup.16 q3, d0[0] -+ bpl 1f -+ vld1.16 {d0[0]}, [r10], r9 -+ vld1.16 {d0[1]}, [r3], r9 -+ vld1.16 {d0[2]}, [r10], r9 -+ vld1.16 {d0[3]}, [r3], r9 -+ vld1.16 {d1[0]}, [r10], r9 -+ vld1.16 {d1[1]}, [r3], r9 -+ vld1.16 {d1[2]}, [r10], r9 -+ vld1.16 {d1[3]}, [r3], r9 -+ vld1.16 {d2[0]}, [r10], r9 -+ vld1.16 {d2[1]}, [r3], r9 -+ vld1.16 {d2[2]}, [r10], r9 -+ vld1.16 {d2[3]}, [r3], r9 -+ vld1.16 {d3[0]}, [r10], r9 -+ vld1.16 {d3[1]}, [r3], r9 -+ vld1.16 {d3[2]}, [r10] -+ vld1.16 {d3[3]}, [r3] -+1: -+ bcc 1f -+ vld1.16 {d4[1]}, [r4], r9 -+ cmp r12, #4 -+ vld1.16 {d4[2]}, [r8], r9 -+ vld1.16 {d4[3]}, [r4], r9 -+ ble 2f -+ vld1.16 {d5[0]}, [r8], r9 -+ vld1.16 {d5[1]}, [r4], r9 -+ cmp r12, #12 -+ vld1.16 {d5[2]}, [r8], r9 -+ vld1.16 {d5[3]}, [r4], r9 -+ blt 3f -+ vld1.16 {d6[0]}, [r8], r9 -+ vld1.16 {d6[1]}, [r4], r9 -+ vld1.16 {d6[2]}, [r8], r9 -+ vld1.16 {d6[3]}, [r4], r9 -+ ble 4f -+ vld1.16 {d7[0]}, [r8], r9 -+ vld1.16 {d7[1]}, [r4], r9 -+ vld1.16 {d7[2]}, [r8] -+ vld1.16 {d7[3]}, [r4] -+ b 1f -+2: vdup.16 d5, 
d4[3] -+3: vdup.16 d6, d5[3] -+4: vdup.16 d7, d6[3] -+1: -+ tst r2, #FILTER_LIGHT -+ add r12, r0, #-pw -+ beq 10f -+ -+ vpush {q5} -+ @ Luma light filter -+ @ Left -+ vext.16 q5, q2, q3, #7 -+ vext.16 q14, q1, q2, #7 -+ vext.16 q13, q0, q1, #7 -+ vext.16 q12, q15, q0, #7 -+ -+ vadd.u16 q5, q3 -+ vadd.u16 q14, q2 -+ vadd.u16 q13, q1 -+ vadd.u16 q12, q0 -+ vmov.u16 r2, d7[3] @ Save final pel -+ -+ vext.16 q0, q12, q13, #1 -+ vext.16 q1, q13, q14, #1 -+ vext.16 q2, q14, q5, #1 -+ vext.16 q3, q5, q5, #1 -+ -+ vmov d30, d24 @ d30[0] = l[0] + ul -+ vadd.u16 q0, q12 -+ vadd.u16 q1, q13 -+ vadd.u16 q2, q14 -+ vadd.u16 q3, q5 -+ -+ vrshr.u16 q0, #2 -+ vrshr.u16 q1, #2 -+ vrshr.u16 q2, #2 -+ vrshr.u16 q3, #2 -+ -+ @ Up -+ vext.16 q5, q10, q11, #7 -+ vext.16 q14, q9, q10, #7 -+ vext.16 q13, q8, q9, #7 -+ vext.16 q12, q15, q8, #7 -+ -+ vadd.u16 q5, q11 -+ vadd.u16 q14, q10 -+ vadd.u16 q13, q9 -+ vadd.u16 q12, q8 -+ vmov.u16 r3, d23[3] @ Save final pel -+ -+ vext.16 q8, q12, q13, #1 -+ vext.16 q9, q13, q14, #1 -+ vext.16 q10, q14, q5, #1 -+ vext.16 q11, q5, q5, #1 -+ -+ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0] -+ vadd.u16 q8, q12 -+ vadd.u16 q9, q13 -+ vadd.u16 q10, q14 -+ vadd.u16 q11, q5 -+ -+ vrshr.u16 q8, #2 -+ vrshr.u16 q9, #2 -+ vrshr.u16 q10, #2 -+ vrshr.u16 q11, #2 -+ -+ @ Misc -+ vrshr.u16 d30, #2 -+ vmov.u16 d7[3], r2 @ Restore final pel -+ vmov.u16 d23[3], r3 @ Restore final pel -+ vdup.u16 d31, d30[0] @ d31[3] = d30[0] -+ vpop {q5} -+ -+10: -+ vstm r1, {d16-d23} @ Up -+ vst1.16 {d31[3]}, [r12] @ Up-left -+ vstm r0, { d0-d7 } @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+@ int ff_hevc_rpi_intra_filter_4_neon_32( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const 
unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set pw_s, 2 -+.set pw, (1 << pw_s) -+.set log2_s, 2 -+ -+function ff_hevc_rpi_intra_filter_4_neon_32, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]" -+ -+ it cs -+ vldmcs r6, {d4, d5} -+ it mi -+ vldmmi r5, {d6, d7} -+ lsls r7, #AVAIL_S_L_N_DL_C -+ vdup.32 q1, d0[0] -+ add r12, r0, #-pw -+ bpl 1f -+ vld1.32 {d0[0]}, [r10], r9 -+ vld1.32 {d0[1]}, [r3], r9 -+ vld1.32 {d1[0]}, [r10] -+ vld1.32 {d1[1]}, [r3] -+1: -+ bcc 1f -+ vld1.32 {d2[1]}, [r4], r9 -+ vld1.32 {d3[0]}, [r8] -+ vld1.32 {d3[1]}, [r4] -+1: -+ vst1.32 {q2, q3 }, [r1] @ Up -+ vst1.32 {d31[1]}, [r12] -+ vst1.32 {q0, q1 }, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_8_neon_32( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set ur_size, sp_base + 16 -+.set dl_size, sp_base + 20 -+.set pw_s, 2 -+.set pw, (1 << pw_s) -+.set log2_s, 3 -+.set p_size, (1 << log2_s) @ size in pels -+ -+function ff_hevc_rpi_intra_filter_8_neon_32, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]" -+ -+ vdup.32 q9, d16[0] -+ vdup.32 q11, d20[0] -+ -+ it cs -+ vldmcs r6, {q8, q9 } -+ ldr r12, [sp, #ur_size] -+ bpl 1f -+ cmp r12, #p_size -+ vldm r5, {q10, q11} -+ bge 1f -+ vdup.32 q11, d21[1] -+1: -+ lsls r7, #AVAIL_S_L_N_DL_C -+ vdup.32 q1, d0[0] -+ vdup.32 q2, d0[0] -+ vdup.32 q3, d0[0] -+ bpl 1f -+ vld1.32 {d0[0]}, [r10], r9 -+ vld1.32 {d0[1]}, [r3], r9 -+ vld1.32 {d1[0]}, [r10], r9 -+ vld1.32 {d1[1]}, [r3], r9 -+ vld1.32 
{d2[0]}, [r10], r9 -+ vld1.32 {d2[1]}, [r3], r9 -+ vld1.32 {d3[0]}, [r10] -+ vld1.32 {d3[1]}, [r3] -+1: -+ bcc 1f -+ ldr r12, [sp, #dl_size] -+ vld1.32 {d4[1]}, [r4], r9 -+ cmp r12, #p_size -+ vld1.32 {d5[0]}, [r8], r9 -+ vld1.32 {d5[1]}, [r4], r9 -+ blt 2f -+ vld1.32 {d6[0]}, [r8], r9 -+ vld1.32 {d6[1]}, [r4], r9 -+ vld1.32 {d7[0]}, [r8] -+ vld1.32 {d7[1]}, [r4] -+ b 1f -+2: -+ vdup.32 q3, d5[1] -+1: -+ add r12, r0, #-pw -+ vstm r1, { q8-q11} @ Up -+ vst1.32 {d31[1]}, [r12] -+ vstm r0, { q0-q3 } @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_16_neon_32( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set ur_size, sp_base + 16 -+.set dl_size, sp_base + 20 -+.set pw_s, 2 -+.set pw, (1 << pw_s) -+.set log2_s, 4 -+.set p_size, (1 << log2_s) @ size in pels -+ -+function ff_hevc_rpi_intra_filter_16_neon_32, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1] -+ -+ @ Once we get this big we have run out of neon regs to store -+ @ everything at once so do in pieces -+ -+ @ Up (have) -+ it cs -+ vldmcs r6, { q0-q3 } -+ ldr r12, [sp, #ur_size] -+ it mi -+ vldmmi r5, { q8-q11} -+ it cs -+ vstmcs r1, { q0-q3 } -+ bpl 1f -+ cmp r12, #12 -+ add lr, r1, #(pw << log2_s) -+ bgt 2f -+ cmp r12, #8 -+ bge 3f -+ vdup.16 q9, d17[1] -+4: vdup.16 d10, d19[1] -+3: vdup.16 q11, d21[1] -+2: vstm lr, { q8-q11} -+1: -+ -+ @ Left (have) -+ add lr, r0, #-pw -+ lsls r12, r7, #AVAIL_S_L_N_DL_C -+ vst1.32 {d30[1]}, [lr] @ UL -+ bpl 1f -+ vld1.32 { d0[0]}, [r10], r9 -+ vld1.32 { d0[1]}, [r3], r9 -+ vld1.32 { d1[0]}, [r10], r9 -+ vld1.32 { d1[1]}, [r3], r9 
-+ vld1.32 { d2[0]}, [r10], r9 -+ vld1.32 { d2[1]}, [r3], r9 -+ vld1.32 { d3[0]}, [r10], r9 -+ vld1.32 { d3[1]}, [r3], r9 -+ vld1.32 { d4[0]}, [r10], r9 -+ vld1.32 { d4[1]}, [r3], r9 -+ vld1.32 { d5[0]}, [r10], r9 -+ vld1.32 { d5[1]}, [r3], r9 -+ vld1.32 { d6[0]}, [r10], r9 -+ vld1.32 { d6[1]}, [r3], r9 -+ vld1.32 { d7[0]}, [r10] -+ vld1.32 { d7[1]}, [r3] -+ vstm r0, { q0-q3 } -+1: -+ bcc 1f -+ ldr r12, [sp, #dl_size] -+ vdup.32 d16, d30[0] @ d16[0] = d30[0] -+ add lr, r0, #(pw << log2_s) -+ vld1.32 {d16[1]}, [r4], r9 -+ cmp r12, #4 -+ vld1.32 {d17[0]}, [r8], r9 -+ vld1.32 {d17[1]}, [r4], r9 -+ ble 2f -+ vld1.32 {d18[0]}, [r8], r9 -+ vld1.32 {d18[1]}, [r4], r9 -+ cmp r12, #12 -+ vld1.32 {d19[0]}, [r8], r9 -+ vld1.32 {d19[1]}, [r4], r9 -+ blt 3f -+ vld1.32 {d20[0]}, [r8], r9 -+ vld1.32 {d20[1]}, [r4], r9 -+ vld1.32 {d21[0]}, [r8], r9 -+ vld1.32 {d21[1]}, [r4], r9 -+ ble 4f -+ vld1.32 {d22[0]}, [r8], r9 -+ vld1.32 {d22[1]}, [r4], r9 -+ vld1.32 {d23[0]}, [r8] -+ vld1.32 {d23[1]}, [r4] -+ b 5f -+2: vdup.32 q9, d17[1] -+3: vdup.32 q10, d19[1] -+4: vdup.32 q11, d21[1] -+5: vstm lr, { q8-q11} -+1: -+ eors r7, r2 -+ beq 99f -+ -+ lsls r12, r7, #AVAIL_S_UR_N_U_C -+ vdup.32 q0, d31[0] -+ vdup.32 q1, d31[0] -+ vdup.32 q2, d31[0] -+ vdup.32 q3, d31[0] -+ add lr, r1, #(pw << log2_s) -+ vdup.32 q8, d31[1] -+ vdup.32 q9, d31[1] -+ vdup.32 q10, d31[1] -+ vdup.32 q11, d31[1] -+ it cs -+ vstmcs r1, { q0-q3 } -+ it mi -+ vstmmi lr, { q8-q11} -+ -+ lsls r7, #AVAIL_S_L_N_DL_C -+ vdup.32 q0, d30[0] -+ vdup.32 q1, d30[0] -+ vdup.32 q2, d30[0] -+ vdup.32 q3, d30[0] -+ add lr, r0, #(pw << log2_s) -+ it mi -+ vstmmi r0, { q0-q3 } -+ it cs -+ vstmcs lr, { q0-q3 } -+ -+99: -+ pop {r4-r10, pc} -+endfunc -+ -+ -+ -+ -diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S -new file mode 100644 -index 0000000000..56819ae439 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S -@@ -0,0 +1,920 @@ -+/* -+Copyright (c) 2018 Raspberry Pi 
(Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox, Ben Avison -+*/ -+ -+/* -+ * Horizontal & Vertical special cases of angular intra pred -+ * -+ * Split out because: -+ * Vertical, at least, is relatively common -+ * Much simpler code than the general angular case -+ * Luma with size < 32 has extra filtering that doesn't happen anywhere else -+ * -+ * *** Currently luma filtering is mandatory where it occurs, but there are -+ * cases where it should be turned off (rdpcm & an extension sps flag). 
-+ * These don't occur in the standard conformance suite for Main Profile -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+@ ff_hevc_rpi_pred_vertical_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_4_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.32 {d0[0]}, [r2 :32] @ Left -+ add r2, r0, r3 -+ vld1.8 {d1[]}, [r1] -+ lsl r3, #1 -+ vdup.8 d4, ip -+ vmov.i8 d2, #128 -+ vhsub.u8 d4, d0, d4 -+ veor d1, d2 -+ vld1.32 {d0[0]}, [r1 :32] @ Top -+ vqadd.s8 d1, d4 -+ vmov.i64 d3, #0xff -+ vmov d4, d0 -+ veor d5, d1, d2 -+ veor d1, d1, d2 -+ vbit d0, d1, d3 -+ vshr.u64 d5, #8 -+ vst1.32 {d0[0]}, [r0], r3 -+ vshr.u64 d1, #16 -+ vbit d4, d5, d3 -+ vshr.u64 d5, #16 -+ vst1.32 {d4[0]}, [r2], r3 -+ vbit d0, d1, d3 -+ vst1.32 {d0[0]}, [r0] -+ vbit d4, d5, d3 -+ vst1.32 {d4[0]}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_8_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.8 {d0}, [r2 :64] @ Left -+ vmov.i8 d1, #128 -+ vld1.8 {d2[]}, [r1] -+ vld1.8 {d3}, [r1 :64] @ Top -+ vdup.8 d4, ip -+ vhsub.u8 d4, d0, d4 -+ veor d2, d1 -+ vmov.i64 d0, #0xff -+ mov r1, #8 -+ vqadd.s8 d2, d4, d2 -+ veor d1, d2, d1 -+1: -+ vbit d3, d1, d0 -+ vshr.u64 d1, #8 -+ vst1.8 {d3}, [r0 :64], r3 -+ subs r1, #2 -+ vbit d3, d1, d0 -+ vshr.u64 d1, #8 -+ vst1.8 {d3}, [r0 :64], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_16_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.8 {q0}, [r2 :128] @ Left -+ vdup.8 q1, ip -+ vld1.8 {d4[],d5[]}, [r1] -+ vhsub.u8 q0, q1 -+ vmov.i8 q1, #128 -+ veor q2, q1 
-+ vmov.i64 d16, #0xff -+ vqadd.s8 q0, q2 -+ vld1.8 {q3}, [r1 :128] @ Top -+ mov r1, #16 -+ veor q0, q1 -+ vmov q1, q3 -+ vext.8 q2, q0, q0, #1 -+1: -+ vbit d2, d0, d16 -+ vbit d6, d4, d16 -+ vext.8 q0, q0, q0, #2 -+ subs r1, #2 -+ vst1.8 {q1}, [r0 :128], r3 -+ vext.8 q2, q2, q2, #2 -+ vst1.8 {q3}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vert_32_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_32_neon_8, export=1 -+ vld1.8 {q0, q1 }, [r1 :128] @ Up -+ add r2, r0, r3 -+ lsl r3, #1 -+ mov r1, #16 -+1: -+ vst1.8 {q0, q1 }, [r0 :128], r3 -+ subs r1, #1 -+ vst1.8 {q0, q1 }, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1 -+ vld1.16 {d0 }, [r1 :64] @ Up -+ add r2, r0, r3, lsl #1 -+ lsl r3, #2 -+ -+ vst1.16 {d0 }, [r0 :64], r3 -+ vst1.16 {d0 }, [r2 :64], r3 -+ vst1.16 {d0 }, [r0 :64] -+ vst1.16 {d0 }, [r2 :64] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1 -+ vld1.16 {q0 }, [r1 :128] @ Up -+ add r2, r0, r3, lsl #1 -+ lsl r3, #2 -+ mov r1, #4 -+1: -+ vst1.16 {q0 }, [r0 :128], r3 -+ subs r1, #2 -+ vst1.16 {q0 }, [r2 :128], r3 -+ vst1.16 {q0 }, [r0 :128], r3 -+ vst1.16 {q0 }, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1 -+ vld1.16 {q0, q1 }, [r1 :128] @ Up -+ add r2, r0, r3, lsl #1 -+ lsl r3, #2 -+ mov r1, #8 
-+1: -+ vst1.16 {q0, q1 }, [r0 :128], r3 -+ subs r1, #1 -+ vst1.16 {q0, q1 }, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontalal_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+@ ? Might be faster as simple arm -+ -+function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.32 {d0[0]}, [r1 :32] @ Top -+ add r1, r2, #3 -+ vld1.8 {d1[]}, [r2]! -+ vdup.8 d2, ip -+ vmov.i8 d3, #128 -+ vhsub.u8 d0, d2 -+ veor d1, d3 -+ vld1.8 {d2[]}, [r2]! -+ add ip, r0, r3 -+ vqadd.s8 d0, d0, d1 -+ lsl r3, #1 -+ vld1.8 {d1[]}, [r2] -+ vld1.8 {d4[]}, [r1] -+ veor d0, d3 -+ vst1.32 {d0[0]}, [r0 :32], r3 -+ vst1.32 {d2[0]}, [ip :32], r3 -+ vst1.32 {d1[0]}, [r0 :32] -+ vst1.32 {d4[0]}, [ip :32] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.8 {d0}, [r1 :64] @ Top -+ vmov.i8 d1, #128 -+ vld1.8 {d2[]}, [r2]! -+ mov r1, #8-2 -+ vdup.8 d3, ip -+ vhsub.u8 d0, d3 -+ veor d2, d1 -+ vqadd.s8 d0, d2 -+ vld1.8 {d2[]}, [r2]! -+ veor d0, d1 -+ vst1.8 {d0}, [r0], r3 -+1: -+ vld1.8 {d0[]}, [r2]! -+ subs r1, #2 -+ vst1.8 {d2}, [r0 :64], r3 -+ vld1.8 {d2[]}, [r2]! -+ vst1.8 {d0}, [r0 :64], r3 -+ bne 1b -+ -+ vst1.8 {d2}, [r0 :64] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.8 {q0}, [r1 :64] @ Top -+ mov r1, #16-2 -+ vld1.8 {d4[],d5[]}, [r2]! -+ vdup.8 q3, ip -+ vhsub.u8 q0, q3 -+ vmov.i8 q1, #128 -+ veor q2, q1 -+ vqadd.s8 q0, q2 -+ vld1.8 {d4[],d5[]}, [r2]! 
-+ veor q0, q1 -+ vst1.8 {q0}, [r0], r3 -+1: -+ vld1.8 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.8 {q2}, [r0 :64], r3 -+ vld1.8 {d4[],d5[]}, [r2]! -+ vst1.8 {q0}, [r0 :64], r3 -+ bne 1b -+ -+ vst1.8 {q2}, [r0 :64] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_32_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1 -+ vld1.8 {d0[],d1[]}, [r2]! -+ add ip, r0, #16 -+ mov r1, #32-2 -+ vld1.8 {d2[],d3[]}, [r2]! -+ vst1.8 {q0}, [r0 :128], r3 -+ vst1.8 {q0}, [ip :128], r3 -+1: -+ vld1.8 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.8 {q1}, [r0 :128], r3 -+ vst1.8 {q1}, [ip :128], r3 -+ vld1.8 {d2[],d3[]}, [r2]! -+ vst1.8 {q0}, [r0 :128], r3 -+ vst1.8 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.8 {q1}, [r0 :128] -+ vst1.8 {q1}, [ip :128] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1 -+ add r1, r2, #2 -+ vld1.16 {d0[]}, [r2] -+ add r2, #4 -+ vld1.16 {d1[]}, [r1] -+ add r1, #4 -+ vld1.16 {d2[]}, [r2] -+A add r2, r0, r3, lsl #1 -+T lsl r3, #1 -+T add r2, r0, r3 -+ vld1.16 {d3[]}, [r1] -+A lsl r3, #2 -+T lsl r3, #1 -+ vst1.16 {d0}, [r0 :64], r3 -+ vst1.16 {d1}, [r2 :64], r3 -+ vst1.16 {d2}, [r0 :64] -+ vst1.16 {d3}, [r2 :64] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1 -+ vld1.16 {d0[],d1[]}, [r2]! -+ lsl r3, #1 -+ vld1.16 {d2[],d3[]}, [r2]! -+ mov r1, #8-2 -+ vst1.16 {q0}, [r0 :64], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q1}, [r0 :64], r3 -+ vld1.16 {d2[],d3[]}, [r2]! 
-+ vst1.16 {q0}, [r0 :64], r3 -+ bne 1b -+ -+ vst1.16 {q1}, [r0 :64] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1 -+ vld1.16 {d0[],d1[]}, [r2]! -+ lsl r3, #1 -+ add ip, r0, #16 -+ mov r1, #16-2 -+ vld1.16 {d2[],d3[]}, [r2]! -+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q1}, [r0 :128], r3 -+ vst1.16 {q1}, [ip :128], r3 -+ vld1.16 {d2[],d3[]}, [r2]! -+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.16 {q1}, [r0 :128] -+ vst1.16 {q1}, [ip :128] -+ bx lr -+endfunc -+ -+ -+@------------------------------------------------------------------------------ -+@ -+@ 10 Bit -+@ Has clipping constants so 10-bit only but could easily be macroed up to -+@ 14-bit before we run out of bits -+ -+ -+@ ff_hevc_rpi_pred_vertical_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_4_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {d0}, [r2 :64] @ Left -+ vmov.i16 d2, #0 -+ vld1.16 {d1[]}, [r1] -+T lsl r3, #1 -+ vdup.16 d4, ip -+ vmov.i16 d3, #0x3ff -+ vld1.16 {d5}, [r1 :64] @ Top -+ vhsub.u16 d4, d0, d4 -+ vmov.i64 d0, #0xffff -+A add r2, r0, r3, lsl #1 -+T add r2, r0, r3 -+ vadd.i16 d1, d1, d4 -+ vmov d6, d5 -+ vmax.s16 d1, d1, d2 -+ vmin.s16 d2, d1, d3 -+ vmin.s16 d1, d1, d3 -+ vbit d5, d1, d0 -+A lsl r3, #2 -+T lsl r3, #1 -+ vshr.u64 d2, #16 -+ vshr.u64 d1, #32 -+ vbit d6, d2, d0 -+ vst1.16 {d5}, [r0], r3 -+ vshr.u64 d2, #32 -+ vst1.16 {d6}, [r2], r3 -+ vbit d5, d1, d0 -+ vst1.16 {d5}, [r0] -+ vbit d6, d2, d0 -+ vst1.16 {d6}, [r2] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const 
uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_8_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {q0}, [r2 :128] @ Left -+ lsl r3, #1 -+ vdup.16 q1, ip -+ vld1.16 {d4[],d5[]}, [r1] -+ vhsub.u16 q0, q0, q1 -+ vmov.i16 q1, #0 -+ vadd.i16 q0, q2 -+ vmov.i16 q2, #0x3ff -+ vld1.16 {q3}, [r1 :128] @ Top -+ mov r1, #8 -+ vmax.s16 q0, q1 -+ vmov q1, q3 -+ vmin.s16 q0, q2 -+ vmov.i64 d16, #0xffff -+ vext.16 q2, q0, q0, #1 -+1: -+ vbit d2, d0, d16 -+ vbit d6, d4, d16 -+ vext.16 q0, q0, q0, #2 -+ subs r1, #2 -+ vst1.16 {q1}, [r0 :128], r3 -+ vext.16 q2, q2, q2, #2 -+ vst1.16 {q3}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_16_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {q0-q1}, [r2 :128] @ Left -+T lsl r3, #1 -+ vdup.16 q2, ip -+A add r2, r0, r3, lsl #1 -+T add r2, r0, r3 -+ vld1.16 {d6[],d7[]}, [r1] -+A lsl r3, #2 -+T lsl r3, #1 -+ vhsub.u16 q0, q2 -+ vhsub.u16 q1, q2 -+ vadd.i16 q0, q3 -+ vadd.i16 q1, q3 -+ vmov.i16 q2, #0 -+ vld1.16 {q8-q9}, [r1 :128] @ Top -+ mov r1, #0 -+ vmov.i16 q3, #0x3ff -+ vmax.s16 q0, q2 -+ vmax.s16 q1, q2 -+ vmin.s16 q0, q3 -+ vmin.s16 q1, q3 -+ vmov q10, q8 -+ vmov q11, q9 -+ vext.16 q2, q0, q1, #1 -+ vext.16 q3, q1, q1, #1 -+ vmov.i64 d24, #0xffff -+1: -+ vbit d16, d0, d24 -+ vbit d20, d4, d24 -+ vext.16 q0, q0, q0, #2 -+ subs r1, #1<<30 -+ vst1.16 {q8-q9}, [r0 :128], r3 -+ vext.16 q2, q2, q2, #2 -+ vst1.16 {q10-q11}, [r2 :128], r3 -+ bne 1b -+1: -+ vbit d16, d2, d24 -+ vbit d20, d6, d24 -+ vext.16 q1, q1, q1, #2 -+ subs r1, #1<<30 -+ vst1.16 {q8-q9}, [r0 :128], r3 -+ vext.16 q3, q3, q3, #2 -+ vst1.16 {q10-q11}, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t 
*_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_32_neon_10, export=1 -+ vldm r1, { q0-q3 } @ Up -+ lsl r3, #1 -+ mov r1, #32 -+ add r2, r0, #32 -+1: -+ vst1.16 {q0-q1}, [r0 :128], r3 -+ subs r1, #1 -+ vst1.16 {q2-q3}, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1 -+ vld1.16 {q0 }, [r1 :128] @ Up -+ add r2, r0, r3, lsl #2 -+ lsl r3, #3 -+ -+ vst1.16 {q0 }, [r0 :128], r3 -+ vst1.16 {q0 }, [r2 :128], r3 -+ vst1.16 {q0 }, [r0 :128] -+ vst1.16 {q0 }, [r2 :128] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1 -+ vld1.16 {q0, q1 }, [r1 :128] @ Up -+ add r2, r0, r3, lsl #2 -+ lsl r3, #3 -+ mov r1, #4 -+1: -+ vst1.16 {q0, q1 }, [r0 :128], r3 -+ subs r1, #1 -+ vst1.16 {q0, q1 }, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1 -+ vldm r1, { q0-q3 } @ Up -+ lsl r3, #2 -+ mov r1, #16 -+ add r2, r0, #32 -+1: -+ vst1.16 {q0-q1}, [r0 :128], r3 -+ subs r1, #1 -+ vst1.16 {q2-q3}, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+@ ff_hevc_rpi_pred_horizontal_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {d0}, [r1 :64] @ Top -+ vmov.i16 d1, #0 -+ vld1.16 {d2[]}, [r2]! 
-+T lsl r3, #1 -+ vdup.16 d3, ip -+ vmov.i16 d4, #0x3ff -+ vhsub.u16 d0, d3 -+A add ip, r0, r3, lsl #1 -+T add ip, r0, r3 -+ vld1.16 {d3[]}, [r2]! -+A lsl r3, #2 -+T lsl r3, #1 -+ vadd.i16 d0, d2 -+ vld1.16 {d2[]}, [r2]! -+ vmax.s16 d0, d1 -+ vld1.16 {d1[]}, [r2] -+ vmin.s16 d0, d4 -+ vst1.16 {d0}, [r0 :64], r3 -+ vst1.16 {d3}, [ip :64], r3 -+ vst1.16 {d2}, [r0 :64] -+ vst1.16 {d1}, [ip :64] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {q0}, [r1 :128] @ Top -+ lsl r3, #1 -+ vdup.16 q1, ip -+ mov r1, #8-2 -+ vhsub.u16 q0, q1 -+ vld1.16 {d2[],d3[]}, [r2]! -+ vmov.i16 q2, #0 -+ vadd.i16 q0, q1 -+ vmov.i16 q1, #0x3ff -+ vmax.s16 q0, q2 -+ vld1.16 {d4[],d5[]}, [r2]! -+ vmin.s16 q0, q1 -+ vst1.16 {q0}, [r0 :128], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q2}, [r0 :128], r3 -+ vld1.16 {d4[],d5[]}, [r2]! -+ vst1.16 {q0}, [r0 :128], r3 -+ bne 1b -+ -+ vst1.16 {q2}, [r0 :128] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontalal_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {q0-q1}, [r1 :128] @ Top -+ lsl r3, #1 -+ vdup.16 q2, ip -+ add ip, r0, r3 -+ vhsub.u16 q0, q2 -+ add ip, #16 -+ vhsub.u16 q1, q2 -+ mov r1, #16-2 -+ vld1.16 {d4[],d5[]}, [r2]! -+ vmov.i16 q3, #0 -+ vadd.u16 q0, q2 -+ vadd.i16 q1, q2 -+ vmov.i16 q2, #0x3ff -+ vmax.s16 q0, q3 -+ vmax.s16 q1, q3 -+ vld1.16 {d6[],d7[]}, [r2]! -+ vmin.s16 q0, q2 -+ vmin.s16 q1, q2 -+ vst1.16 {q0-q1}, [r0 :128], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q3}, [r0 :128], r3 -+ vst1.16 {q3}, [ip :128], r3 -+ vld1.16 {d6[],d7[]}, [r2]! 
-+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.16 {q3}, [r0 :128] -+ vst1.16 {q3}, [ip :128] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1 -+ vld1.16 {d0[],d1[]}, [r2]! -+ add ip, r0, #16 -+ push {lr} -+ mov lr, #32 -+ vld1.16 {d2[],d3[]}, [r2]! -+ lsl r3, #1 -+ vst1.16 {q0}, [r0 :128], lr -+ sub r3, #32 -+ vst1.16 {q0}, [ip :128], lr -+ mov r1, #32-2 -+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q1}, [r0 :128], lr -+ vst1.16 {q1}, [ip :128], lr -+ vst1.16 {q1}, [r0 :128], r3 -+ vst1.16 {q1}, [ip :128], r3 -+ vld1.16 {d2[],d3[]}, [r2]! -+ vst1.16 {q0}, [r0 :128], lr -+ vst1.16 {q0}, [ip :128], lr -+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.16 {q1}, [r0 :128], lr -+ vst1.16 {q1}, [ip :128], lr -+ vst1.16 {q1}, [r0 :128] -+ vst1.16 {q1}, [ip :128] -+ pop {pc} -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1 -+ add r1, r2, #4 -+ vld1.32 {d0[],d1[]}, [r2] -+ add r2, #8 -+ vld1.32 {d2[],d3[]}, [r1] -+ add r1, #8 -+ vld1.32 {d4[],d5[]}, [r2] -+A add r2, r0, r3, lsl #2 -+T lsl r3, #2 -+T add r2, r0, r3 -+ vld1.32 {d6[],d7[]}, [r1] -+A lsl r3, #3 -+T lsl r3, #1 -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q1}, [r2 :128], r3 -+ vst1.32 {q2}, [r0 :128] -+ vst1.32 {q3}, [r2 :128] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1 -+ vld1.32 {d0[],d1[]}, 
[r2]! -+ lsl r3, #2 -+ add ip, r0, #16 -+ mov r1, #8-2 -+ vld1.32 {d2[],d3[]}, [r2]! -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q0}, [ip :128], r3 -+1: -+ vld1.32 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.32 {q1}, [r0 :128], r3 -+ vst1.32 {q1}, [ip :128], r3 -+ vld1.32 {d2[],d3[]}, [r2]! -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.32 {q1}, [r0 :128] -+ vst1.32 {q1}, [ip :128] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1 -+ vld1.32 {d0[],d1[]}, [r2]! -+ add ip, r0, #16 -+ push {lr} -+ mov lr, #32 -+ vld1.32 {d2[],d3[]}, [r2]! -+ lsl r3, #2 -+ vst1.32 {q0}, [r0 :128], lr -+ sub r3, #32 -+ vst1.32 {q0}, [ip :128], lr -+ mov r1, #16-2 -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q0}, [ip :128], r3 -+1: -+ vld1.32 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.32 {q1}, [r0 :128], lr -+ vst1.32 {q1}, [ip :128], lr -+ vst1.32 {q1}, [r0 :128], r3 -+ vst1.32 {q1}, [ip :128], r3 -+ vld1.32 {d2[],d3[]}, [r2]! -+ vst1.32 {q0}, [r0 :128], lr -+ vst1.32 {q0}, [ip :128], lr -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.32 {q1}, [r0 :128], lr -+ vst1.32 {q1}, [ip :128], lr -+ vst1.32 {q1}, [r0 :128] -+ vst1.32 {q1}, [ip :128] -+ pop {pc} -+endfunc -+ -+ -+ -diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S -new file mode 100644 -index 0000000000..af8c4c03f0 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S -@@ -0,0 +1,1043 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. 
-+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: John Cox, Ben Avison -+*/ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+@ Planar intra pred (8.4.4.2.4) -+@ -+@ predSamples[ x ][ y ] = -+@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] + -+@ ( x + 1 ) * p[ nTbS ][ -1 ] + -+@ ( nTbS - 1 - y ) * p[ x ][ -1 ] + -+@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 ) -+ -+@ All 10-bit functions would work with 9 -+ -+ -+@ ff_hevc_rpi_pred_planar_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_4_neon_8, export=1 -+ -+ vld1.8 {d0}, [r1] @ Top -+ adr ip, nb_3_0_1_4 -+ vld1.8 {d1}, [r2] @ Left -+ vmov.i64 d2, #0xffffffff -+ vldr d3, [ip, #8] @ {1,2,3,4,1,2,3,4} -+ add r1, r0, r3 -+ vdup.32 d4, d0[0] @ {t0,t1,t2,t3,t0,t1,t2,t3} -+ vdup.8 d0, d0[4] @ {t4,t4,t4,t4,t4,t4,t4,t4} -+ vdup.8 d5, d1[4] @ {l4,l4,l4,l4,l4,l4,l4,l4} -+ vdup.8 d6, d1[0] @ {l0,l0,l0,l0,l0,l0,l0,l0} -+ vshll.u8 q8, d4, #2 -+ lsl r3, #1 -+ vsubl.u8 q2, d5, d4 -+ vmlal.u8 q8, d0, d3 -+ vld1.8 {d0}, [ip] @ {3,2,1,0,3,2,1,0} -+ vdup.8 d7, d1[1] @ {l1,l1,l1,l1,l1,l1,l1,l1} -+ vshl.s16 q9, q2, #1 -+ vbif d6, d7, d2 @ {l0,l0,l0,l0,l1,l1,l1,l1} -+ vadd.i16 d16, d4 -+ vdup.8 d7, d1[2] @ {l2,l2,l2,l2,l2,l2,l2,l2} -+ vadd.i16 d17, d18 -+ vdup.8 d1, d1[3] @ {l3,l3,l3,l3,l3,l3,l3,l3} -+ vadd.i16 q2, q8, q9 -+ vmlal.u8 q8, d0, d6 -+ vbif d7, d1, d2 @ {l2,l2,l2,l2,l3,l3,l3,l3} -+ vmlal.u8 q2, d0, d7 -+ vrshrn.i16 d0, q8, #3 -+ vst1.32 d0[0], [r0 :32], r3 -+ vst1.32 d0[1], [r1 :32], r3 -+ vrshrn.i16 d0, q2, #3 -+ vst1.32 d0[0], [r0 :32] -+ vst1.32 d0[1], [r1 :32] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_4_neon_10, export=1 -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ vld1.16 {q0}, [r1 :64] @ Top 
-+ adr ip, nbh_3_0_1_4 -+ vldr d2, [r2, #8] @ Left (lower) -+ vldr d3, [ip, #8] @ {1,2,3,4} -+T lsl r3, #1 -+ vshl.s16 d4, d0, #2 -+ vdup.16 d1, d1[0] @ {t4,t4,t4,t4} -+ vldr d5, [r2] @ Left (upper) -+ vdup.16 d2, d2[0] @ {l4,l4,l4,l4} -+ vldr d6, [ip] @ {3,2,1,0} -+ vmla.i16 d4, d3, d1 @ Acc set up -+ vsub.i16 d0, d2, d0 @ Add set up -+ vmov d7, d6 -+ vdup.16 d2, d5[0] -+ vdup.16 d3, d5[1] -+ vdup.16 d16, d5[2] -+ vadd.i16 d18, d0, d4 -+ vshl.s16 d0, #1 @ x2 -+ vadd.i16 d19, d0, d4 -+ vdup.16 d17, d5[3] -+ vadd.i16 d4, d0, d18 -+A add r1, r0, r3, lsl #1 -+T add r1, r0, r3 -+ vadd.i16 d5, d0, d19 -+A lsl r3, #2 -+T lsl r3, #1 -+ vmla.i16 q9, q1, q3 -+ vmla.i16 q2, q8, q3 -+ vrshr.u16 q0, q9, #3 -+ vst1.16 {d0}, [r0], r3 -+ vrshr.u16 d2, d4, #3 -+ vst1.16 {d1}, [r1], r3 -+ vrshr.u16 d3, d5, #3 -+ vst1.16 {d2}, [r0] -+ vst1.16 {d3}, [r1] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_8_neon_8, export=1 -+ -+ vld1.8 {q0}, [r1] @ Top -+ adr ip, nb_7_0_1_8 -+ vldr d2, [r2, #8] @ Left (lower) -+ mov r1, #8 -+ vldr d3, [ip, #8] @ {1,2,3,4,5,6,7,8} -+ vshll.u8 q2, d0, #3 -+ vdup.8 d1, d1[0] @ {t8,t8,t8,t8,t8,t8,t8,t8} -+ vdup.8 d2, d2[0] @ {l8,l8,l8,l8,l8,l8,l8,l8} -+ vldr d6, [r2] @ Left (upper) -+ vmlal.u8 q2, d3, d1 -+ vsubl.u8 q0, d2, d0 -+ vldr d7, [ip] @ {7,6,5,4,3,2,1,0} -+ -+@ u8 7..0 [1] d7 -+@ u8 left[y] [1] d6 -+@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1] -+ -+ vdup.8 d2, d6[0] -+ vadd.i16 q2, q0 -+ vdup.8 d3, d6[1] -+ vadd.i16 q8, q2, q0 -+1: -+ vmlal.u8 q2, d7, d2 -+ subs r1, #2 -+ vadd.i16 q9, q8, q0 -+ vmlal.u8 q8, d7, d3 -+ vdup.8 d2, d6[2] -+ vdup.8 d3, d6[3] -+ vrshrn.i16 d20, q2, #4 -+ vshr.u64 d6, #16 -+ vmov q2, q9 -+ vst1.8 {d20}, [r0], r3 -+ vrshrn.i16 d20, q8, #4 -+ vadd.i16 q8, q2, q0 -+ vst1.8 
{d20}, [r0], r3 -+ bne 1b -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_8_neon_10, export=1 -+ -+ adr ip, nb_7_0_1_8 -+ vld1.16 {q0}, [r1 :128]! @ Top (left) -+ lsl r3, #1 -+ vld1.16 {q1}, [ip :128] @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8} -+ add ip, r2, #16 -+ vld1.16 {d4[],d5[]}, [r1] @ Top (right) -+ mov r1, #8-2 -+ vshl.s16 q3, q0, #3 -+ vmovl.u8 q8, d3 @ {1,2,3,4,5,6,7,8} -+ vld1.16 {d18[],d19[]}, [ip] @ Left (lower) -+ vmla.i16 q3, q8, q2 @ Acc set up -+ vsub.i16 q0, q9, q0 @ Add set up -+ vmovl.u8 q1, d2 @ {7,6,5,4,3,2,1,0} -+ vadd.i16 q2, q3, q0 -+ -+@ u16 7..0 [1] q1 -+@ u32 left[y] [1] [r2] -+@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1] -+ -+ vld1.16 {d6[],d7[]}, [r2]! -+ vadd.i16 q8, q2, q0 -+ vld1.16 {d18[],d19[]}, [r2]! -+ vmla.i16 q2, q1, q3 -+ vadd.i16 q3, q8, q0 -+ vmla.i16 q8, q1, q9 -+1: -+ vrshr.u16 q9, q2, #4 -+ subs r1, #2 -+ vmov q2, q3 -+ vrshr.u16 q10, q8, #4 -+ vld1.16 {d6[],d7[]}, [r2]! -+ vst1.16 {q9}, [r0 :128], r3 -+ vadd.i16 q8, q2, q0 -+ vld1.16 {d18[],d19[]}, [r2]! 
-+ vmla.i16 q2, q1, q3 -+ vadd.i16 q3, q8, q0 -+ vmla.i16 q8, q1, q9 -+ vst1.16 {q10}, [r0 :128], r3 -+ bne 1b -+ -+ vrshr.u16 q9, q2, #4 -+ add r3, r0 -+ vrshr.u16 q10, q8, #4 -+ vst1.16 {q9}, [r0 :128] -+ vst1.16 {q10}, [r3 :128] -+ -+ bx lr -+endfunc -+ -+ -+@------------------------------------------------------------------------------ -+@ -+@ Data - has to be in two lumps to ensure we can always reach using adr -+ -+ .balign 64 -+ -+nb_31_0_1_32: -+ .byte 31, 30, 29, 28, 27, 26, 25, 24 -+ .byte 23, 22, 21, 20, 19, 18, 17, 16 -+nb_15_0_1_16: -+ .byte 15, 14, 13, 12, 11, 10, 9, 8 -+ .byte 7, 6, 5, 4, 3, 2, 1, 0 -+ .byte 1, 2, 3, 4, 5, 6, 7, 8 -+ .byte 9, 10, 11, 12, 13, 14, 15, 16 -+ .byte 17, 18, 19, 20, 21, 22, 23, 24 -+ .byte 25, 26, 27, 28, 29, 30, 31, 32 -+ -+ @ should be back on a 64-byte boundary here -+ -+ @ These could be extracted from the above array, but separate out -+ @ out for better (16 byte) alignment -+nb_3_0_1_4: -+ .byte 3, 2, 1, 0, 3, 2, 1, 0 -+ .byte 1, 2, 3, 4, 1, 2, 3, 4 -+nb_7_0_1_8: -+ .byte 7, 6, 5, 4, 3, 2, 1, 0 -+ .byte 1, 2, 3, 4, 5, 6, 7, 8 -+nbh_3_0_1_4: -+ .short 3, 2, 1, 0, 1, 2, 3, 4 -+ -+@------------------------------------------------------------------------------ -+ -+ -+@ ff_hevc_rpi_pred_planar_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_16_neon_8, export=1 -+ -+ adr ip, nb_15_0_1_16 + 16 -+ vld1.8 {q0}, [r1 :128]! 
@ Top (left) -+ add r2, #16 -+ vld1.8 {q1}, [ip: 128] @ {1,2,3...16} -+ vld1.8 {d4[]}, [r1] @ Top (right) -+ sub ip, #16 -+ vshll.u8 q3, d0, #4 -+ mov r1, #16 -+ vshll.u8 q8, d1, #4 -+ vld1.8 {d5[]}, [r2] @ Left (lower) -+ sub r2, #16 -+ vmlal.u8 q3, d2, d4 -+ vmlal.u8 q8, d3, d4 @ Acc set up -+ vsubl.u8 q1, d5, d0 -+ vsubl.u8 q0, d5, d1 @ Add set up -+ vld1.8 {q2}, [ip :128] @ {15,14,13...0} -+ -+@ u8 15..0 [1] q2 -+@ u8 left[y] [1] [r2] -+@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] -+ -+ vadd.i16 q3, q1 -+ vadd.i16 q8, q0 -+1: -+ vadd.i16 q10, q3, q1 -+ subs r1, #2 -+ vld1.8 {d18[]}, [r2]! -+ vadd.i16 q11, q8, q0 -+ vld1.8 {d19[]}, [r2]! -+ vmlal.u8 q3, d4, d18 -+ vmlal.u8 q8, d5, d18 -+ vadd.i16 q12, q10, q1 -+ vmlal.u8 q10, d4, d19 -+ vadd.i16 q13, q11, q0 -+ vmlal.u8 q11, d5, d19 -+ vrshrn.u16 d18, q3, #5 -+ vrshrn.u16 d19, q8, #5 -+ vmov q3, q12 -+ vst1.8 {q9}, [r0 :128], r3 -+ vrshrn.u16 d18, q10, #5 -+ vrshrn.u16 d19, q11, #5 -+ vmov q8, q13 -+ vst1.8 {q9}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_16_neon_10, export=1 -+ -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ adr ip, nb_15_0_1_16 + 16 -+ vld1.16 {q0-q1}, [r1 :128]! 
@ Top (left) -+ add r2, #32 -+ vld1.8 {q2}, [ip :128] @ {1,2,3...16} -+ lsl r3, #1 -+ vld1.16 {d6[],d7[]}, [r1] @ Top (right) -+ sub ip, #16 -+ vmovl.u8 q8, d4 -+ mov r1, #16 -+ vshl.i16 q9, q0, #4 -+ vmovl.u8 q2, d5 -+ vshl.i16 q10, q1, #4 -+ vld1.16 {d22[],d23[]}, [r2] @ Left (lower) -+ sub r2, #32 -+ vld1.8 {q12}, [ip] @ {15,14,13...0} -+ vmla.i16 q9, q8, q3 -+ vmla.i16 q10, q2, q3 @ Acc set up -+ vsub.i16 q0, q11, q0 -+ vsub.i16 q1, q11, q1 @ Add set up -+ vadd.i16 q2, q9, q0 -+ vadd.i16 q3, q10, q1 -+ vmovl.u8 q8, d24 -+ vmovl.u8 q9, d25 -+ -+@ u16 15..0 [2] q8,q9 -+@ u32 left[y] [2] [r2] -+@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] -+ -+1: -+ vadd.i16 q10, q2, q0 -+ subs r1, #2 -+ vld1.16 {d24[],d25[]}, [r2]! -+ vadd.i16 q11, q3, q1 -+ vld1.16 {d28[],d29[]}, [r2]! -+ vmla.i16 q2, q8, q12 -+ vmla.i16 q3, q9, q12 -+ vadd.i16 q12, q10, q0 -+ vmla.i16 q10, q8, q14 -+ vadd.i16 q13, q11, q1 -+ vmla.i16 q11, q9, q14 -+ vrshr.u16 q14, q2, #5 -+ vrshr.u16 q15, q3, #5 -+ vmov q2, q12 -+ vst1.16 {q14-q15}, [r0 :128], r3 -+ vrshr.u16 q14, q10, #5 -+ vrshr.u16 q15, q11, #5 -+ vmov q3, q13 -+ vst1.16 {q14-q15}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_32_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_32_neon_8, export=1 -+ -+ vld1.8 {q0-q1}, [r1 :128]! 
@ Top (left) -+ adr ip, nb_31_0_1_32 + 32 -+ vpush {d8-d12} -+ vld1.8 {q2-q3}, [ip :128] @ {1,2,3...32} -+ add r2, #32 -+ vld1.8 {d8[]}, [r1] @ Top (right) -+ sub ip, #32 -+ vshll.u8 q8, d0, #5 -+ mov r1, #32 -+ vld1.8 {d9[]}, [r2] @ Left (lower) -+ sub r2, #32 -+ vshll.u8 q9, d1, #5 -+ vshll.u8 q10, d2, #5 -+ vshll.u8 q11, d3, #5 -+ vmlal.u8 q8, d4, d8 -+ vsubl.u8 q12, d9, d0 -+ vmlal.u8 q9, d5, d8 -+ vsubl.u8 q13, d9, d1 -+ vmlal.u8 q10, d6, d8 -+ vsubl.u8 q14, d9, d2 -+ vmlal.u8 q11, d7, d8 @ Acc set up -+ vsubl.u8 q15, d9, d3 @ Add set up -+ vadd.i16 q8, q12 -+ vadd.i16 q9, q13 -+ vadd.i16 q10, q14 -+ vadd.i16 q11, q15 -+ vld1.8 {q4-q5}, [ip :128] @ {31,30,29...0} -+ -+@ u8 31..0 [2] q4,q5 -+@ u8 left[y] [2] [r2] -+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] -+ -+ vld1.8 {d12[]}, [r2]! -+ vadd.i16 q0, q8, q12 -+ b 2f -+1: -+ vld1.8 {d12[]}, [r2]! -+ vrshrn.u16 d3, q1, #6 -+ vrshrn.u16 d2, q0, #6 -+ vadd.i16 q0, q8, q12 -+ vrshrn.u16 d4, q2, #6 -+ vrshrn.u16 d5, q3, #6 -+ vst1.8 {q1-q2}, [r0 :128], r3 -+2: vadd.i16 q1, q9, q13 -+ subs r1, #2 -+ vadd.i16 q2, q10, q14 -+ vadd.i16 q3, q11, q15 -+ vmlal.u8 q8, d8, d12 -+ vmlal.u8 q9, d9, d12 -+ vmlal.u8 q10, d10, d12 -+ vmlal.u8 q11, d11, d12 -+ vld1.8 {d12[]}, [r2]! 
-+ vrshrn.u16 d19, q9, #6 -+ vrshrn.u16 d18, q8, #6 -+ vadd.i16 q8, q0, q12 -+ vrshrn.u16 d20, q10, #6 -+ vrshrn.u16 d21, q11, #6 -+ vst1.8 {q9-q10}, [r0 :128], r3 -+ vadd.i16 q9, q1, q13 -+ vadd.i16 q10, q2, q14 -+ vadd.i16 q11, q3, q15 -+ vmlal.u8 q0, d8, d12 -+ vmlal.u8 q1, d9, d12 -+ vmlal.u8 q2, d10, d12 -+ vmlal.u8 q3, d11, d12 -+ -+ bne 1b -+ -+ vpop {d8-d12} -+ -+ vrshrn.u16 d3, q1, #6 -+ vrshrn.u16 d2, q0, #6 -+ vrshrn.u16 d4, q2, #6 -+ vrshrn.u16 d5, q3, #6 -+ vst1.8 {q1-q2}, [r0 :128] -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_32_neon_10, export=1 -+ -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) -+ adr ip, nb_31_0_1_32 + 32 -+ vpush {q4-q7} -+ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre) -+ add r2, #64 -+ vld1.8 {q14-q15}, [ip :128] @ {1,2,3...32} -+T lsl r3, #1 -+ vld1.16 {d8[],d9[]}, [r1] @ Top (right) -+ sub ip, #32 -+ vmovl.u8 q12, d28 -+ mov r1, #32 -+ vmovl.u8 q13, d29 -+ vld1.8 {q6-q7}, [ip :128] @ {31,30,29...0} -+ vmovl.u8 q14, d30 -+ vmovl.u8 q15, d31 -+ vld1.16 {d10[],d11[]}, [r2] @ Left (lower) -+ sub r2, #64 -+ vshl.i16 q8, q0, #5 -+ vshl.i16 q9, q1, #5 -+ vshl.i16 q10, q2, #5 -+ vshl.i16 q11, q3, #5 -+ vmla.i16 q8, q12, q4 -+ vsub.i16 q0, q5, q0 -+ vmla.i16 q9, q13, q4 -+ vsub.i16 q1, q5, q1 -+ vmla.i16 q10, q14, q4 -+ vmov.u16 ip, d0[0] -+ vsub.i16 q2, q5, q2 -+ vmla.i16 q11, q15, q4 @ Acc set up -+ vsub.i16 q3, q5, q3 @ Add set up -+ vadd.i16 q8, q0 -+ vadd.i16 q9, q1 -+ vadd.i16 q10, q2 -+ vadd.i16 q11, q3 -+ vmovl.u8 q4, d12 -+ vmovl.u8 q5, d13 -+ vmovl.u8 q6, d14 -+ vmovl.u8 q7, d15 -+ -+@ u16 31..0 [4] q4-q7 -+@ u16 left[y] [4] [r2] -+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1] -+ -+ 
vadd.i16 q12, q8, q0 -+A sub r0, r0, r3, lsl #1 -+T sub r0, r3 -+1: -+ vld1.16 {d0[0]}, [r2]! -+A add r0, r0, r3, lsl #1 -+T add r0, r3 -+ vadd.i16 q13, q9, q1 -+ subs r1, #2 -+ vadd.i16 q14, q10, q2 -+ vadd.i16 q15, q11, q3 -+ vmla.i16 q8, q4, d0[0] -+ vmla.i16 q9, q5, d0[0] -+ vmla.i16 q10, q6, d0[0] -+ vmla.i16 q11, q7, d0[0] -+ vmov.16 d0[0], ip -+ vrshr.u16 q8, #6 -+ vrshr.u16 q9, #6 -+ vrshr.u16 q10, #6 -+ vrshr.u16 q11, #6 -+ vstm r0, {q8-q11} -+ vadd.i16 q8, q12, q0 -+A add r0, r0, r3, lsl #1 -+T add r0, r3 -+ vld1.16 {d0[0]}, [r2]! -+ vadd.i16 q9, q13, q1 -+ vadd.i16 q10, q14, q2 -+ vadd.i16 q11, q15, q3 -+ vmla.i16 q12, q4, d0[0] -+ vmla.i16 q13, q5, d0[0] -+ vmla.i16 q14, q6, d0[0] -+ vmla.i16 q15, q7, d0[0] -+ vmov.16 d0[0], ip -+ vrshr.u16 q12, #6 -+ vrshr.u16 q13, #6 -+ vrshr.u16 q14, #6 -+ vrshr.u16 q15, #6 -+ vstm r0, {q12-q15} -+ vadd.i16 q12, q8, q0 -+ bne 1b -+ -+ vpop {q4-q7} -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1 -+ -+ vld1.8 {q0}, [r1] @ Top -+ adr ip, nbx2_3_0_1_4 -+ vldr d2, [r2, #8] @ Left (lower) -+ mov r1, #4 -+ vldr d3, [ip, #8] @ {1,1,2,2,3,3,4,4} -+ lsl r3, #1 -+ vshll.u8 q2, d0, #2 -+ vdup.16 d1, d1[0] @ {t4,t4,t4,t4,t4,t4,t4,t4} -+ vdup.16 d2, d2[0] @ {l4,l4,l4,l4,l4,l4,l4,l4} -+ vldr d6, [r2] @ Left (upper) -+ vmlal.u8 q2, d3, d1 -+ vsubl.u8 q0, d2, d0 -+ vldr d7, [ip] @ {3,3,2,2,1,1,0,0} -+ -+@ u8 3..0 [1] d7 -+@ u8 left[y] [1] d6 -+@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1] -+ -+ vdup.16 d2, d6[0] -+ vadd.i16 q2, q0 -+ vdup.16 d3, d6[1] -+ vadd.i16 q8, q2, q0 -+1: -+ vmlal.u8 q2, d7, d2 -+ subs r1, #2 -+ vadd.i16 q9, q8, q0 -+ vmlal.u8 q8, d7, d3 -+ vdup.16 d2, d6[2] -+ vdup.16 d3, d6[3] -+ vrshrn.i16 d20, q2, #3 -+ vmov q2, q9 -+ vst1.8 {d20}, 
[r0], r3 -+ vrshrn.i16 d20, q8, #3 -+ vadd.i16 q8, q2, q0 -+ vst1.8 {d20}, [r0], r3 -+ bne 1b -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1 -+ -+ adr ip, nbx2_3_0_1_4 -+ vld1.16 {q0}, [r1 :128]! @ Top (left) -+ lsl r3, #2 -+ vld1.16 {q1}, [ip :128] @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4} -+ add ip, r2, #16 -+ vld1.32 {d4[],d5[]}, [r1] @ Top (right) -+ vshl.s16 q3, q0, #2 -+ vmovl.u8 q8, d3 @ {1,1,2,2,3,3,4,4} -+ vld1.32 {d18[],d19[]}, [ip] @ Left (lower) -+ vmla.i16 q3, q8, q2 @ Acc set up -+ vsub.i16 q0, q9, q0 @ Add set up -+ vmovl.u8 q1, d2 @ {3,3,2,2,1,1,0,0} -+ vadd.i16 q2, q3, q0 -+ -+@ u16 3..0 [1] q1 -+@ u32 left[y] [1] [r2] -+@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1] -+ -+ vld1.32 {d6[],d7[]}, [r2]! -+ vadd.i16 q8, q2, q0 -+ vld1.32 {d18[],d19[]}, [r2]! -+ vmla.i16 q2, q1, q3 -+ vadd.i16 q3, q8, q0 -+ vmla.i16 q8, q1, q9 -+ -+ vrshr.u16 q9, q2, #3 -+ vmov q2, q3 -+ vrshr.u16 q10, q8, #3 -+ vld1.32 {d6[],d7[]}, [r2]! -+ vst1.16 {q9}, [r0 :128], r3 -+ vadd.i16 q8, q2, q0 -+ vld1.32 {d18[],d19[]}, [r2]! -+ vmla.i16 q2, q1, q3 -+ vadd.i16 q3, q8, q0 -+ vmla.i16 q8, q1, q9 -+ vst1.16 {q10}, [r0 :128], r3 -+ -+ vrshr.u16 q9, q2, #3 -+ add r3, r0 -+ vrshr.u16 q10, q8, #3 -+ vst1.16 {q9}, [r0 :128] -+ vst1.16 {q10}, [r3 :128] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1 -+ -+ adr ip, nbx2_7_0_1_8 + 16 -+ vld1.8 {q0}, [r1 :128]! 
@ Top (left) -+ add r2, #16 -+ vld1.8 {q1}, [ip: 128] @ {1,1,2,2,3,3...8,8} -+ lsl r3, #1 -+ vld1.16 {d4[]}, [r1] @ Top (right) -+ sub ip, #16 -+ vshll.u8 q3, d0, #3 -+ mov r1, #8 -+ vshll.u8 q8, d1, #3 -+ vld1.16 {d5[]}, [r2] @ Left (lower) -+ sub r2, #16 -+ vmlal.u8 q3, d2, d4 -+ vmlal.u8 q8, d3, d4 @ Acc set up -+ vsubl.u8 q1, d5, d0 -+ vsubl.u8 q0, d5, d1 @ Add set up -+ vld1.8 {q2}, [ip :128] @ {7,7,6,6,5,5...0,0} -+ -+@ u8 7..0 [1] q2 -+@ u8 left[y] [1] [r2] -+@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] -+ -+ vadd.i16 q3, q1 -+ vadd.i16 q8, q0 -+1: -+ vadd.i16 q10, q3, q1 -+ subs r1, #2 -+ vld1.16 {d18[]}, [r2]! -+ vadd.i16 q11, q8, q0 -+ vld1.16 {d19[]}, [r2]! -+ vmlal.u8 q3, d4, d18 -+ vmlal.u8 q8, d5, d18 -+ vadd.i16 q12, q10, q1 -+ vmlal.u8 q10, d4, d19 -+ vadd.i16 q13, q11, q0 -+ vmlal.u8 q11, d5, d19 -+ vrshrn.u16 d18, q3, #4 -+ vrshrn.u16 d19, q8, #4 -+ vmov q3, q12 -+ vst1.8 {q9}, [r0 :128], r3 -+ vrshrn.u16 d18, q10, #4 -+ vrshrn.u16 d19, q11, #4 -+ vmov q8, q13 -+ vst1.8 {q9}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+ -+endfunc -+ -+ -+@------------------------------------------------------------------------------ -+@ -+@ Data - has to be in two lumps to ensure we can always reach using adr -+ -+ .balign 64 -+ -+nbx2_15_0_1_16: -+ .byte 15, 15, 14, 14, 13, 13, 12, 12 -+ .byte 11, 11, 10, 10, 9, 9, 8, 8 -+nbx2_7_0_1_8: -+ .byte 7, 7, 6, 6, 5, 5, 4, 4 -+ .byte 3, 3, 2, 2, 1, 1, 0, 0 -+ .byte 1, 1, 2, 2, 3, 3, 4, 4 -+ .byte 5, 5, 6, 6, 7, 7, 8, 8 -+ .byte 9, 9, 10, 10, 11, 11, 12, 12 -+ .byte 13, 13, 14, 14, 15, 15, 16, 16 -+ -+ @ should be back on a 64-byte boundary here -+ -+nbx2_3_0_1_4: -+ .byte 3, 3, 2, 2, 1, 1, 0, 0 -+ .byte 1, 1, 2, 2, 3, 3, 4, 4 -+ -+@------------------------------------------------------------------------------ -+ -+ -+@ ff_hevc_rpi_pred_planar_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) 
[r3] -+ -+function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1 -+ -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ adr ip, nbx2_7_0_1_8 + 16 -+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) -+ add r2, #32 -+ vld1.8 {q2}, [ip :128] @ {1,1,2,2,3,3...8,8} -+ lsl r3, #2 -+ vld1.32 {d6[],d7[]}, [r1] @ Top (right) -+ sub ip, #16 -+ vmovl.u8 q8, d4 -+ mov r1, #8 -+ vshl.i16 q9, q0, #3 -+ vmovl.u8 q2, d5 -+ vshl.i16 q10, q1, #3 -+ vld1.32 {d22[],d23[]}, [r2] @ Left (lower) -+ sub r2, #32 -+ vld1.8 {q12}, [ip] @ {7,7,6,6,5,5...0,0} -+ vmla.i16 q9, q8, q3 -+ vmla.i16 q10, q2, q3 @ Acc set up -+ vsub.i16 q0, q11, q0 -+ vsub.i16 q1, q11, q1 @ Add set up -+ vadd.i16 q2, q9, q0 -+ vadd.i16 q3, q10, q1 -+ vmovl.u8 q8, d24 -+ vmovl.u8 q9, d25 -+ -+@ u16 7..0 [2] q8,q9 -+@ u32 left[y] [2] [r2] -+@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] -+ -+1: -+ vadd.i16 q10, q2, q0 -+ subs r1, #2 -+ vld1.32 {d24[],d25[]}, [r2]! -+ vadd.i16 q11, q3, q1 -+ vld1.32 {d28[],d29[]}, [r2]! -+ vmla.i16 q2, q8, q12 -+ vmla.i16 q3, q9, q12 -+ vadd.i16 q12, q10, q0 -+ vmla.i16 q10, q8, q14 -+ vadd.i16 q13, q11, q1 -+ vmla.i16 q11, q9, q14 -+ vrshr.u16 q14, q2, #4 -+ vrshr.u16 q15, q3, #4 -+ vmov q2, q12 -+ vst1.16 {q14-q15}, [r0 :128], r3 -+ vrshr.u16 q14, q10, #4 -+ vrshr.u16 q15, q11, #4 -+ vmov q3, q13 -+ vst1.16 {q14-q15}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1 -+ -+ vld1.8 {q0-q1}, [r1 :128]! 
@ Top (left) -+ adr ip, nbx2_15_0_1_16 + 32 -+ vpush {d8-d12} -+ vld1.8 {q2-q3}, [ip :128] @ {1,1,2,2,3,3...16,16} -+ add r2, #32 -+ vld1.16 {d8[]}, [r1] @ Top (right) -+ sub ip, #32 -+ vshll.u8 q8, d0, #4 -+ mov r1, #16 -+ vld1.16 {d9[]}, [r2] @ Left (lower) -+ sub r2, #32 -+ vshll.u8 q9, d1, #4 -+ lsl r3, #1 -+ vshll.u8 q10, d2, #4 -+ vshll.u8 q11, d3, #4 -+ vmlal.u8 q8, d4, d8 -+ vsubl.u8 q12, d9, d0 -+ vmlal.u8 q9, d5, d8 -+ vsubl.u8 q13, d9, d1 -+ vmlal.u8 q10, d6, d8 -+ vsubl.u8 q14, d9, d2 -+ vmlal.u8 q11, d7, d8 @ Acc set up -+ vsubl.u8 q15, d9, d3 @ Add set up -+ vadd.i16 q8, q12 -+ vadd.i16 q9, q13 -+ vadd.i16 q10, q14 -+ vadd.i16 q11, q15 -+ vld1.8 {q4-q5}, [ip :128] @ {15,15,14,14,13,13...0,0} -+ -+@ u8 15..0 [2] q4,q5 -+@ u8 left[y] [2] [r2] -+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] -+ -+ vld1.16 {d12[]}, [r2]! -+ vadd.i16 q0, q8, q12 -+ b 2f -+1: -+ vld1.16 {d12[]}, [r2]! -+ vrshrn.u16 d3, q1, #5 -+ vrshrn.u16 d2, q0, #5 -+ vadd.i16 q0, q8, q12 -+ vrshrn.u16 d4, q2, #5 -+ vrshrn.u16 d5, q3, #5 -+ vst1.8 {q1-q2}, [r0 :128], r3 -+2: vadd.i16 q1, q9, q13 -+ subs r1, #2 -+ vadd.i16 q2, q10, q14 -+ vadd.i16 q3, q11, q15 -+ vmlal.u8 q8, d8, d12 -+ vmlal.u8 q9, d9, d12 -+ vmlal.u8 q10, d10, d12 -+ vmlal.u8 q11, d11, d12 -+ vld1.16 {d12[]}, [r2]! 
-+ vrshrn.u16 d19, q9, #5 -+ vrshrn.u16 d18, q8, #5 -+ vadd.i16 q8, q0, q12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vst1.8 {q9-q10}, [r0 :128], r3 -+ vadd.i16 q9, q1, q13 -+ vadd.i16 q10, q2, q14 -+ vadd.i16 q11, q3, q15 -+ vmlal.u8 q0, d8, d12 -+ vmlal.u8 q1, d9, d12 -+ vmlal.u8 q2, d10, d12 -+ vmlal.u8 q3, d11, d12 -+ -+ bne 1b -+ -+ vpop {d8-d12} -+ -+ vrshrn.u16 d3, q1, #5 -+ vrshrn.u16 d2, q0, #5 -+ vrshrn.u16 d4, q2, #5 -+ vrshrn.u16 d5, q3, #5 -+ vst1.8 {q1-q2}, [r0 :128] -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1 -+ -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) -+ adr ip, nbx2_15_0_1_16 + 32 -+ vpush {q4-q7} -+ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre) -+ add r2, #64 -+ vld1.8 {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16} -+T lsl r3, #2 -+ vld1.32 {d8[],d9[]}, [r1] @ Top (right) -+ sub ip, #32 -+ vmovl.u8 q12, d28 -+ mov r1, #16 -+ vmovl.u8 q13, d29 -+ vld1.8 {q6-q7}, [ip :128] @ {15,15,14,14,13,13...0,0} -+ vmovl.u8 q14, d30 -+ vmovl.u8 q15, d31 -+ vld1.32 {d10[],d11[]}, [r2] @ Left (lower) -+ sub r2, #64 -+ vshl.i16 q8, q0, #4 -+ vshl.i16 q9, q1, #4 -+ vshl.i16 q10, q2, #4 -+ vshl.i16 q11, q3, #4 -+ vmla.i16 q8, q12, q4 -+ vsub.i16 q0, q5, q0 -+ vmla.i16 q9, q13, q4 -+ vpush {q0} -+ vsub.i16 q1, q5, q1 -+ vmla.i16 q10, q14, q4 -+ vsub.i16 q2, q5, q2 -+ vmla.i16 q11, q15, q4 @ Acc set up -+ vsub.i16 q3, q5, q3 @ Add set up -+ vadd.i16 q8, q0 -+ vadd.i16 q9, q1 -+ vadd.i16 q10, q2 -+ vadd.i16 q11, q3 -+ vmovl.u8 q4, d12 -+ vmovl.u8 q5, d13 -+ vmovl.u8 q6, d14 -+ vmovl.u8 q7, d15 -+ -+@ u16 31..0 [4] q4-q7 -+@ u16 left[y] [4] [r2] -+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [4] q0-q3 = p[-1][nTbs] - 
p[x][-1] -+ -+ vadd.i16 q12, q8, q0 -+A sub r0, r0, r3, lsl #2 -+T sub r0, r3 -+1: -+ vld1.32 {d0[],d1[]}, [r2]! -+A add r0, r0, r3, lsl #2 -+T add r0, r3 -+ vadd.i16 q13, q9, q1 -+ subs r1, #2 -+ vadd.i16 q14, q10, q2 -+ vadd.i16 q15, q11, q3 -+ vmla.i16 q8, q4, q0 -+ vmla.i16 q9, q5, q0 -+ vmla.i16 q10, q6, q0 -+ vmla.i16 q11, q7, q0 -+ vld1.16 {q0}, [sp] -+ vrshr.u16 q8, #5 -+ vrshr.u16 q9, #5 -+ vrshr.u16 q10, #5 -+ vrshr.u16 q11, #5 -+ vstm r0, {q8-q11} -+ vadd.i16 q8, q12, q0 -+A add r0, r0, r3, lsl #2 -+T add r0, r3 -+ vld1.32 {d0[],d1[]}, [r2]! -+ vadd.i16 q9, q13, q1 -+ vadd.i16 q10, q14, q2 -+ vadd.i16 q11, q15, q3 -+ vmla.i16 q12, q4, q0 -+ vmla.i16 q13, q5, q0 -+ vmla.i16 q14, q6, q0 -+ vmla.i16 q15, q7, q0 -+ vld1.16 {q0}, [sp] -+ vrshr.u16 q12, #5 -+ vrshr.u16 q13, #5 -+ vrshr.u16 q14, #5 -+ vrshr.u16 q15, #5 -+ vstm r0, {q12-q15} -+ vadd.i16 q12, q8, q0 -+ bne 1b -+ -+ vpop {q3-q7} -+ bx lr -+ -+endfunc -diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c -index 2cca784f5a..48cb816b70 100644 ---- a/libavcodec/arm/vc1dsp_init_neon.c -+++ b/libavcodec/arm/vc1dsp_init_neon.c -@@ -19,6 +19,7 @@ - #include - - #include "libavutil/attributes.h" -+#include "libavutil/intreadwrite.h" - #include "libavcodec/vc1dsp.h" - #include "vc1dsp.h" - -@@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc - void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); - void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); - -+void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq); -+void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq); -+void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq); -+void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq); -+void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq); -+void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq); -+ - void 
ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int rnd); - -@@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, - void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, - int h, int x, int y); - -+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); -+ -+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) -+{ -+ /* Dealing with starting and stopping, and removing escape bytes, are -+ * comparatively less time-sensitive, so are more clearly expressed using -+ * a C wrapper around the assembly inner loop. Note that we assume a -+ * little-endian machine that supports unaligned loads. */ -+ int dsize = 0; -+ while (size >= 4) -+ { -+ int found = 0; -+ while (!found && (((uintptr_t) dst) & 7) && size >= 4) -+ { -+ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; -+ if (!found) -+ { -+ *dst++ = *src++; -+ --size; -+ ++dsize; -+ } -+ } -+ if (!found) -+ { -+ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); -+ dst += skip; -+ src += skip; -+ size -= skip; -+ dsize += skip; -+ while (!found && size >= 4) -+ { -+ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; -+ if (!found) -+ { -+ *dst++ = *src++; -+ --size; -+ ++dsize; -+ } -+ } -+ } -+ if (found) -+ { -+ *dst++ = *src++; -+ *dst++ = *src++; -+ ++src; -+ size -= 3; -+ dsize += 2; -+ } -+ } -+ while (size > 0) -+ { -+ *dst++ = *src++; -+ --size; -+ ++dsize; -+ } -+ return dsize; -+} -+ - #define FN_ASSIGN(X, Y) \ - dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \ - dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon -@@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) - dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; - dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; - -+ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; -+ 
dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; -+ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; -+ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; -+ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; -+ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; -+ - dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon; - FN_ASSIGN(1, 0); - FN_ASSIGN(2, 0); -@@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) - dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; - dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; - dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; -+ -+ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; - } -diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S -index 93f043bf08..96014fbebc 100644 ---- a/libavcodec/arm/vc1dsp_neon.S -+++ b/libavcodec/arm/vc1dsp_neon.S -@@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1 - vst1.32 {d1[1]}, [r0,:32] - bx lr - endfunc -+ -+@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks -+@ On entry: -+@ r0 -> top-left pel of lower block -+@ r1 = row stride, bytes -+@ r2 = PQUANT bitstream parameter -+function ff_vc1_v_loop_filter4_neon, export=1 -+ sub r3, r0, r1, lsl #2 -+ vldr d0, .Lcoeffs -+ vld1.32 {d1[0]}, [r0], r1 @ P5 -+ vld1.32 {d2[0]}, [r3], r1 @ P1 -+ vld1.32 {d3[0]}, [r3], r1 @ P2 -+ vld1.32 {d4[0]}, [r0], r1 @ P6 -+ vld1.32 {d5[0]}, [r3], r1 @ P3 -+ vld1.32 {d6[0]}, [r0], r1 @ P7 -+ vld1.32 {d7[0]}, [r3] @ P4 -+ vld1.32 {d16[0]}, [r0] @ P8 -+ vshll.u8 q9, d1, #1 @ 2*P5 -+ vdup.16 d17, r2 @ pq -+ vshll.u8 q10, d2, #1 @ 2*P1 -+ vmovl.u8 q11, d3 @ P2 -+ vmovl.u8 q1, d4 @ P6 -+ vmovl.u8 q12, d5 @ P3 -+ vmls.i16 d20, d22, d0[1] @ 2*P1-5*P2 -+ vmovl.u8 q11, d6 @ P7 -+ vmls.i16 d18, d2, d0[1] @ 2*P5-5*P6 -+ vshll.u8 q2, d5, #1 @ 2*P3 -+ vmovl.u8 q3, d7 @ P4 -+ vmla.i16 d18, d22, d0[1] @ 2*P5-5*P6+5*P7 -+ vmovl.u8 
q11, d16 @ P8 -+ vmla.u16 d20, d24, d0[1] @ 2*P1-5*P2+5*P3 -+ vmovl.u8 q12, d1 @ P5 -+ vmls.u16 d4, d6, d0[1] @ 2*P3-5*P4 -+ vmls.u16 d18, d22, d0[0] @ 2*P5-5*P6+5*P7-2*P8 -+ vsub.i16 d1, d6, d24 @ P4-P5 -+ vmls.i16 d20, d6, d0[0] @ 2*P1-5*P2+5*P3-2*P4 -+ vmla.i16 d4, d24, d0[1] @ 2*P3-5*P4+5*P5 -+ vmls.i16 d4, d2, d0[0] @ 2*P3-5*P4+5*P5-2*P6 -+ vabs.s16 d2, d1 -+ vrshr.s16 d3, d18, #3 -+ vrshr.s16 d5, d20, #3 -+ vshr.s16 d2, d2, #1 @ clip -+ vrshr.s16 d4, d4, #3 -+ vabs.s16 d3, d3 @ a2 -+ vshr.s16 d1, d1, #8 @ clip_sign -+ vabs.s16 d5, d5 @ a1 -+ vceq.i16 d7, d2, #0 @ test clip == 0 -+ vabs.s16 d16, d4 @ a0 -+ vshr.s16 d4, d4, #8 @ a0_sign -+ vcge.s16 d18, d5, d3 @ test a1 >= a2 -+ vcge.s16 d17, d16, d17 @ test a0 >= pq -+ vbsl d18, d3, d5 @ a3 -+ vsub.i16 d1, d1, d4 @ clip_sign - a0_sign -+ vorr d3, d7, d17 @ test clip == 0 || a0 >= pq -+ vqsub.u16 d4, d16, d18 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ vcge.s16 d5, d18, d16 @ test a3 >= a0 -+ vmul.i16 d0, d4, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 -+ vorr d4, d3, d5 @ test clip == 0 || a0 >= pq || a3 >= a0 -+ vmov.32 r0, d4[1] @ move to gp reg -+ vshr.u16 d0, d0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 -+ vcge.s16 d4, d0, d2 -+ tst r0, #1 -+ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered -+ vbsl d4, d2, d0 @ FFMIN(d, clip) -+ vbic d0, d4, d3 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) -+ vmls.i16 d6, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 -+ vmla.i16 d24, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 -+ vqmovun.s16 d0, q3 -+ vqmovun.s16 d1, q12 -+ vst1.32 {d0[0]}, [r3], r1 -+ vst1.32 {d1[0]}, [r3] -+1: bx lr -+endfunc -+ -+@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks -+@ On entry: -+@ r0 -> top-left pel of right block -+@ r1 = row stride, bytes -+@ r2 = PQUANT bitstream parameter -+function ff_vc1_h_loop_filter4_neon, export=1 -+ sub r3, r0, #4 @ where to start reading -+ vldr d0, .Lcoeffs -+ vld1.32 {d2}, [r3], r1 -+ sub r0, r0, #1 @ where to start writing -+ vld1.32 {d4}, [r3], r1 -+ vld1.32 {d3}, [r3], r1 -+ vld1.32 {d5}, [r3] -+ vdup.16 d1, r2 @ pq -+ vtrn.8 q1, q2 -+ vtrn.16 d2, d3 @ P1, P5, P3, P7 -+ vtrn.16 d4, d5 @ P2, P6, P4, P8 -+ vshll.u8 q3, d2, #1 @ 2*P1, 2*P5 -+ vmovl.u8 q8, d4 @ P2, P6 -+ vmovl.u8 q9, d3 @ P3, P7 -+ vmovl.u8 q2, d5 @ P4, P8 -+ vmls.i16 q3, q8, d0[1] @ 2*P1-5*P2, 2*P5-5*P6 -+ vshll.u8 q10, d3, #1 @ 2*P3, 2*P7 -+ vmovl.u8 q1, d2 @ P1, P5 -+ vmla.i16 q3, q9, d0[1] @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 -+ vmls.i16 q3, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 -+ vmov d2, d3 @ needs to be in an even-numbered vector for when we come to narrow it later -+ vmls.i16 d20, d4, d0[1] @ 2*P3-5*P4 -+ vmla.i16 d20, d3, d0[1] @ 2*P3-5*P4+5*P5 -+ vsub.i16 d3, d4, d2 @ P4-P5 -+ vmls.i16 d20, d17, d0[0] @ 2*P3-5*P4+5*P5-2*P6 -+ vrshr.s16 q3, q3, #3 -+ vabs.s16 d5, d3 -+ vshr.s16 d3, d3, #8 @ clip_sign -+ vrshr.s16 d16, d20, #3 -+ vabs.s16 q3, 
q3 @ a1, a2 -+ vshr.s16 d5, d5, #1 @ clip -+ vabs.s16 d17, d16 @ a0 -+ vceq.i16 d18, d5, #0 @ test clip == 0 -+ vshr.s16 d16, d16, #8 @ a0_sign -+ vcge.s16 d19, d6, d7 @ test a1 >= a2 -+ vcge.s16 d1, d17, d1 @ test a0 >= pq -+ vsub.i16 d16, d3, d16 @ clip_sign - a0_sign -+ vbsl d19, d7, d6 @ a3 -+ vorr d1, d18, d1 @ test clip == 0 || a0 >= pq -+ vqsub.u16 d3, d17, d19 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ vcge.s16 d6, d19, d17 @ test a3 >= a0 @ -+ vmul.i16 d0, d3, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 -+ vorr d3, d1, d6 @ test clip == 0 || a0 >= pq || a3 >= a0 -+ vmov.32 r2, d3[1] @ move to gp reg -+ vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0 -+ vcge.s16 d3, d0, d5 -+ tst r2, #1 -+ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered -+ vbsl d3, d5, d0 @ FFMIN(d, clip) -+ vbic d0, d3, d1 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) -+ vmla.i16 d2, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 -+ vmls.i16 d4, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 -+ vqmovun.s16 d1, q1 -+ vqmovun.s16 d0, q2 -+ vst2.8 {d0[0], d1[0]}, [r0], r1 -+ vst2.8 {d0[1], d1[1]}, [r0], r1 -+ vst2.8 {d0[2], d1[2]}, [r0], r1 -+ vst2.8 {d0[3], d1[3]}, [r0] -+1: bx lr -+endfunc -+ -+@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks -+@ On entry: -+@ r0 -> top-left pel of lower block -+@ r1 = row stride, bytes -+@ r2 = PQUANT bitstream parameter -+function ff_vc1_v_loop_filter8_neon, export=1 -+ sub r3, r0, r1, lsl #2 -+ vldr d0, .Lcoeffs -+ vld1.32 {d1}, [r0 :64], r1 @ P5 -+ vld1.32 {d2}, [r3 :64], r1 @ P1 -+ vld1.32 {d3}, [r3 :64], r1 @ P2 -+ vld1.32 {d4}, [r0 :64], r1 @ P6 -+ vld1.32 {d5}, [r3 
:64], r1 @ P3 -+ vld1.32 {d6}, [r0 :64], r1 @ P7 -+ vshll.u8 q8, d1, #1 @ 2*P5 -+ vshll.u8 q9, d2, #1 @ 2*P1 -+ vld1.32 {d7}, [r3 :64] @ P4 -+ vmovl.u8 q1, d3 @ P2 -+ vld1.32 {d20}, [r0 :64] @ P8 -+ vmovl.u8 q11, d4 @ P6 -+ vdup.16 q12, r2 @ pq -+ vmovl.u8 q13, d5 @ P3 -+ vmls.i16 q9, q1, d0[1] @ 2*P1-5*P2 -+ vmovl.u8 q1, d6 @ P7 -+ vshll.u8 q2, d5, #1 @ 2*P3 -+ vmls.i16 q8, q11, d0[1] @ 2*P5-5*P6 -+ vmovl.u8 q3, d7 @ P4 -+ vmovl.u8 q10, d20 @ P8 -+ vmla.i16 q8, q1, d0[1] @ 2*P5-5*P6+5*P7 -+ vmovl.u8 q1, d1 @ P5 -+ vmla.i16 q9, q13, d0[1] @ 2*P1-5*P2+5*P3 -+ vsub.i16 q13, q3, q1 @ P4-P5 -+ vmls.i16 q2, q3, d0[1] @ 2*P3-5*P4 -+ vmls.i16 q8, q10, d0[0] @ 2*P5-5*P6+5*P7-2*P8 -+ vabs.s16 q10, q13 -+ vshr.s16 q13, q13, #8 @ clip_sign -+ vmls.i16 q9, q3, d0[0] @ 2*P1-5*P2+5*P3-2*P4 -+ vshr.s16 q10, q10, #1 @ clip -+ vmla.i16 q2, q1, d0[1] @ 2*P3-5*P4+5*P5 -+ vrshr.s16 q8, q8, #3 -+ vmls.i16 q2, q11, d0[0] @ 2*P3-5*P4+5*P5-2*P6 -+ vceq.i16 q11, q10, #0 @ test clip == 0 -+ vrshr.s16 q9, q9, #3 -+ vabs.s16 q8, q8 @ a2 -+ vabs.s16 q9, q9 @ a1 -+ vrshr.s16 q2, q2, #3 -+ vcge.s16 q14, q9, q8 @ test a1 >= a2 -+ vabs.s16 q15, q2 @ a0 -+ vshr.s16 q2, q2, #8 @ a0_sign -+ vbsl q14, q8, q9 @ a3 -+ vcge.s16 q8, q15, q12 @ test a0 >= pq -+ vsub.i16 q2, q13, q2 @ clip_sign - a0_sign -+ vqsub.u16 q9, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ vcge.s16 q12, q14, q15 @ test a3 >= a0 -+ vorr q8, q11, q8 @ test clip == 0 || a0 >= pq -+ vmul.i16 q0, q9, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 -+ vorr q9, q8, q12 @ test clip == 0 || a0 >= pq || a3 >= a0 -+ vshl.i64 q11, q9, #16 -+ vmov.32 r0, d18[1] @ move to gp reg -+ vshr.u16 q0, q0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 -+ vmov.32 r2, d19[1] -+ vshr.s64 q9, q11, #48 -+ vcge.s16 q11, q0, q10 -+ vorr q8, q8, q9 -+ and r0, r0, r2 -+ vbsl q11, q10, q0 @ FFMIN(d, clip) -+ tst r0, #1 -+ bne 1f @ none of the 8 pixel pairs should be updated in this case -+ vbic q0, q11, q8 @ set each d to zero if it should not be filtered -+ vmls.i16 q3, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 -+ vmla.i16 q1, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 -+ vqmovun.s16 d0, q3 -+ vqmovun.s16 d1, q1 -+ vst1.32 {d0}, [r3 :64], r1 -+ vst1.32 {d1}, [r3 :64] -+1: bx lr -+endfunc -+ -+.align 5 -+.Lcoeffs: -+.quad 0x00050002 -+ -+@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks -+@ On entry: -+@ r0 -> top-left pel of right block -+@ r1 = row stride, bytes -+@ r2 = PQUANT bitstream parameter -+function ff_vc1_h_loop_filter8_neon, export=1 -+ push {lr} -+ sub r3, r0, #4 @ where to start reading -+ vldr d0, .Lcoeffs -+ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... -+ sub r0, r0, #1 @ where to start writing -+ vld1.32 {d4}, [r3], r1 -+ add r12, r0, r1, lsl #2 -+ vld1.32 {d3}, [r3], r1 -+ vld1.32 {d5}, [r3], r1 -+ vld1.32 {d6}, [r3], r1 -+ vld1.32 {d16}, [r3], r1 -+ vld1.32 {d7}, [r3], r1 -+ vld1.32 {d17}, [r3] -+ vtrn.8 q1, q2 @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]... -+ vdup.16 q9, r2 @ pq -+ vtrn.16 d2, d3 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... -+ vtrn.16 d4, d5 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... -+ vtrn.8 q3, q8 @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]... -+ vtrn.16 d6, d7 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[5], P3[7], P7[4]... -+ vtrn.16 d16, d17 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... 
-+ vtrn.32 d2, d6 @ P1, P5 -+ vtrn.32 d4, d16 @ P2, P6 -+ vtrn.32 d3, d7 @ P3, P7 -+ vtrn.32 d5, d17 @ P4, P8 -+ vshll.u8 q10, d2, #1 @ 2*P1 -+ vshll.u8 q11, d6, #1 @ 2*P5 -+ vmovl.u8 q12, d4 @ P2 -+ vmovl.u8 q13, d16 @ P6 -+ vmovl.u8 q14, d3 @ P3 -+ vmls.i16 q10, q12, d0[1] @ 2*P1-5*P2 -+ vmovl.u8 q12, d7 @ P7 -+ vshll.u8 q1, d3, #1 @ 2*P3 -+ vmls.i16 q11, q13, d0[1] @ 2*P5-5*P6 -+ vmovl.u8 q2, d5 @ P4 -+ vmovl.u8 q8, d17 @ P8 -+ vmla.i16 q11, q12, d0[1] @ 2*P5-5*P6+5*P7 -+ vmovl.u8 q3, d6 @ P5 -+ vmla.i16 q10, q14, d0[1] @ 2*P1-5*P2+5*P3 -+ vsub.i16 q12, q2, q3 @ P4-P5 -+ vmls.i16 q1, q2, d0[1] @ 2*P3-5*P4 -+ vmls.i16 q11, q8, d0[0] @ 2*P5-5*P6+5*P7-2*P8 -+ vabs.s16 q8, q12 -+ vshr.s16 q12, q12, #8 @ clip_sign -+ vmls.i16 q10, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4 -+ vshr.s16 q8, q8, #1 @ clip -+ vmla.i16 q1, q3, d0[1] @ 2*P3-5*P4+5*P5 -+ vrshr.s16 q11, q11, #3 -+ vmls.i16 q1, q13, d0[0] @ 2*P3-5*P4+5*P5-2*P6 -+ vceq.i16 q13, q8, #0 @ test clip == 0 -+ vrshr.s16 q10, q10, #3 -+ vabs.s16 q11, q11 @ a2 -+ vabs.s16 q10, q10 @ a1 -+ vrshr.s16 q1, q1, #3 -+ vcge.s16 q14, q10, q11 @ test a1 >= a2 -+ vabs.s16 q15, q1 @ a0 -+ vshr.s16 q1, q1, #8 @ a0_sign -+ vbsl q14, q11, q10 @ a3 -+ vcge.s16 q9, q15, q9 @ test a0 >= pq -+ vsub.i16 q1, q12, q1 @ clip_sign - a0_sign -+ vqsub.u16 q10, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ vcge.s16 q11, q14, q15 @ test a3 >= a0 -+ vorr q9, q13, q9 @ test clip == 0 || a0 >= pq -+ vmul.i16 q0, q10, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 -+ vorr q10, q9, q11 @ test clip == 0 || a0 >= pq || a3 >= a0 -+ vmov.32 r2, d20[1] @ move to gp reg -+ vshr.u16 q0, q0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 -+ vmov.32 r3, d21[1] -+ vcge.s16 q10, q0, q8 -+ and r14, r2, r3 -+ vbsl q10, q8, q0 @ FFMIN(d, clip) -+ tst r14, #1 -+ bne 2f @ none of the 8 pixel pairs should be updated in this case -+ vbic q0, q10, q9 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) -+ vmla.i16 q3, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 -+ vmls.i16 q2, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 -+ vqmovun.s16 d1, q3 -+ vqmovun.s16 d0, q2 -+ tst r2, #1 -+ bne 1f @ none of the first 4 pixel pairs should be updated if so -+ vst2.8 {d0[0], d1[0]}, [r0], r1 -+ vst2.8 {d0[1], d1[1]}, [r0], r1 -+ vst2.8 {d0[2], d1[2]}, [r0], r1 -+ vst2.8 {d0[3], d1[3]}, [r0] -+1: tst r3, #1 -+ bne 2f @ none of the second 4 pixel pairs should be updated if so -+ vst2.8 {d0[4], d1[4]}, [r12], r1 -+ vst2.8 {d0[5], d1[5]}, [r12], r1 -+ vst2.8 {d0[6], d1[6]}, [r12], r1 -+ vst2.8 {d0[7], d1[7]}, [r12] -+2: pop {pc} -+endfunc -+ -+@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks -+@ On entry: -+@ r0 -> top-left pel of lower block -+@ r1 = row stride, bytes -+@ r2 = PQUANT bitstream parameter -+function ff_vc1_v_loop_filter16_neon, export=1 -+ vpush {d8-d15} -+ sub r3, r0, r1, lsl #2 -+ vldr d0, .Lcoeffs -+ vld1.64 {q1}, [r0 :128], r1 @ P5 -+ vld1.64 {q2}, [r3 :128], r1 @ P1 -+ vld1.64 {q3}, [r3 :128], r1 @ P2 -+ vld1.64 {q4}, [r0 :128], r1 @ P6 -+ vld1.64 {q5}, [r3 :128], r1 @ P3 -+ vld1.64 {q6}, [r0 :128], r1 @ P7 -+ vshll.u8 q7, d2, #1 @ 2*P5[0..7] -+ vshll.u8 q8, d4, #1 @ 2*P1[0..7] -+ vld1.64 {q9}, [r3 :128] @ P4 -+ vmovl.u8 q10, d6 @ P2[0..7] -+ vld1.64 {q11}, [r0 :128] @ P8 -+ vmovl.u8 q12, d8 @ P6[0..7] -+ vdup.16 q13, r2 @ pq -+ vshll.u8 q2, d5, #1 @ 2*P1[8..15] -+ vmls.i16 q8, q10, d0[1] @ 2*P1[0..7]-5*P2[0..7] -+ vshll.u8 q10, d3, #1 @ 2*P5[8..15] -+ 
vmovl.u8 q3, d7 @ P2[8..15] -+ vmls.i16 q7, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] -+ vmovl.u8 q4, d9 @ P6[8..15] -+ vmovl.u8 q14, d10 @ P3[0..7] -+ vmovl.u8 q15, d12 @ P7[0..7] -+ vmls.i16 q2, q3, d0[1] @ 2*P1[8..15]-5*P2[8..15] -+ vshll.u8 q3, d10, #1 @ 2*P3[0..7] -+ vmls.i16 q10, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15] -+ vmovl.u8 q6, d13 @ P7[8..15] -+ vmla.i16 q8, q14, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] -+ vmovl.u8 q14, d18 @ P4[0..7] -+ vmovl.u8 q9, d19 @ P4[8..15] -+ vmla.i16 q7, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] -+ vmovl.u8 q15, d11 @ P3[8..15] -+ vshll.u8 q5, d11, #1 @ 2*P3[8..15] -+ vmls.i16 q3, q14, d0[1] @ 2*P3[0..7]-5*P4[0..7] -+ vmla.i16 q2, q15, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] -+ vmovl.u8 q15, d22 @ P8[0..7] -+ vmovl.u8 q11, d23 @ P8[8..15] -+ vmla.i16 q10, q6, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] -+ vmovl.u8 q6, d2 @ P5[0..7] -+ vmovl.u8 q1, d3 @ P5[8..15] -+ vmls.i16 q5, q9, d0[1] @ 2*P3[8..15]-5*P4[8..15] -+ vmls.i16 q8, q14, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] -+ vmls.i16 q7, q15, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] -+ vsub.i16 q15, q14, q6 @ P4[0..7]-P5[0..7] -+ vmla.i16 q3, q6, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] -+ vrshr.s16 q8, q8, #3 -+ vmls.i16 q2, q9, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] -+ vrshr.s16 q7, q7, #3 -+ vmls.i16 q10, q11, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] -+ vabs.s16 q11, q15 -+ vabs.s16 q8, q8 @ a1[0..7] -+ vmla.i16 q5, q1, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] -+ vshr.s16 q15, q15, #8 @ clip_sign[0..7] -+ vrshr.s16 q2, q2, #3 -+ vmls.i16 q3, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] -+ vabs.s16 q7, q7 @ a2[0..7] -+ vrshr.s16 q10, q10, #3 -+ vsub.i16 q12, q9, q1 @ P4[8..15]-P5[8..15] -+ vshr.s16 q11, q11, #1 @ clip[0..7] -+ vmls.i16 q5, q4, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] -+ vcge.s16 q4, q8, q7 @ test a1[0..7] >= a2[0..7] -+ vabs.s16 q2, q2 @ a1[8..15] -+ vrshr.s16 q3, q3, #3 -+ 
vabs.s16 q10, q10 @ a2[8..15] -+ vbsl q4, q7, q8 @ a3[0..7] -+ vabs.s16 q7, q12 -+ vshr.s16 q8, q12, #8 @ clip_sign[8..15] -+ vrshr.s16 q5, q5, #3 -+ vcge.s16 q12, q2, q10 @ test a1[8..15] >= a2[8.15] -+ vshr.s16 q7, q7, #1 @ clip[8..15] -+ vbsl q12, q10, q2 @ a3[8..15] -+ vabs.s16 q2, q3 @ a0[0..7] -+ vceq.i16 q10, q11, #0 @ test clip[0..7] == 0 -+ vshr.s16 q3, q3, #8 @ a0_sign[0..7] -+ vsub.i16 q3, q15, q3 @ clip_sign[0..7] - a0_sign[0..7] -+ vcge.s16 q15, q2, q13 @ test a0[0..7] >= pq -+ vorr q10, q10, q15 @ test clip[0..7] == 0 || a0[0..7] >= pq -+ vqsub.u16 q15, q2, q4 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ vcge.s16 q2, q4, q2 @ test a3[0..7] >= a0[0..7] -+ vabs.s16 q4, q5 @ a0[8..15] -+ vshr.s16 q5, q5, #8 @ a0_sign[8..15] -+ vmul.i16 q15, q15, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 -+ vcge.s16 q13, q4, q13 @ test a0[8..15] >= pq -+ vorr q2, q10, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] -+ vsub.i16 q5, q8, q5 @ clip_sign[8..15] - a0_sign[8..15] -+ vceq.i16 q8, q7, #0 @ test clip[8..15] == 0 -+ vshr.u16 q15, q15, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 -+ vmov.32 r0, d4[1] @ move to gp reg -+ vorr q8, q8, q13 @ test clip[8..15] == 0 || a0[8..15] >= pq -+ vqsub.u16 q13, q4, q12 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ vmov.32 r2, d5[1] -+ vcge.s16 q4, q12, q4 @ test a3[8..15] >= a0[8..15] -+ vshl.i64 q2, q2, #16 -+ vcge.s16 q12, q15, q11 -+ vmul.i16 q0, q13, d0[1] @ a0[8..15] >= a3[8..15] ? 
5*(a0[8..15]-a3[8..15]) : 0 -+ vorr q4, q8, q4 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] -+ vshr.s64 q2, q2, #48 -+ and r0, r0, r2 -+ vbsl q12, q11, q15 @ FFMIN(d[0..7], clip[0..7]) -+ vshl.i64 q11, q4, #16 -+ vmov.32 r2, d8[1] -+ vshr.u16 q0, q0, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 -+ vorr q2, q10, q2 -+ vmov.32 r12, d9[1] -+ vshr.s64 q4, q11, #48 -+ vcge.s16 q10, q0, q7 -+ vbic q2, q12, q2 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) -+ vorr q4, q8, q4 -+ and r2, r2, r12 -+ vbsl q10, q7, q0 @ FFMIN(d[8..15], clip[8..15]) -+ vmls.i16 q14, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] -+ and r0, r0, r2 -+ vbic q0, q10, q4 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) -+ tst r0, #1 -+ bne 1f @ none of the 16 pixel pairs should be updated in this case -+ vmla.i16 q6, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] -+ vmls.i16 q9, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] -+ vqmovun.s16 d4, q14 -+ vmla.i16 q1, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] -+ vqmovun.s16 d0, q6 -+ vqmovun.s16 d5, q9 -+ vqmovun.s16 d1, q1 -+ vst1.64 {q2}, [r3 :128], r1 -+ vst1.64 {q0}, [r3 :128] -+1: vpop {d8-d15} -+ bx lr -+endfunc -+ -+@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks -+@ On entry: -+@ r0 -> top-left pel of right block -+@ r1 = row stride, bytes -+@ r2 = PQUANT bitstream parameter -+function ff_vc1_h_loop_filter16_neon, export=1 -+ push 
{r4-r6,lr} -+ vpush {d8-d15} -+ sub r3, r0, #4 @ where to start reading -+ vldr d0, .Lcoeffs -+ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... -+ sub r0, r0, #1 @ where to start writing -+ vld1.32 {d3}, [r3], r1 -+ add r4, r0, r1, lsl #2 -+ vld1.32 {d10}, [r3], r1 -+ vld1.32 {d11}, [r3], r1 -+ vld1.32 {d16}, [r3], r1 -+ vld1.32 {d4}, [r3], r1 -+ vld1.32 {d8}, [r3], r1 -+ vtrn.8 d2, d3 @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]... -+ vld1.32 {d14}, [r3], r1 -+ vld1.32 {d5}, [r3], r1 -+ vtrn.8 d10, d11 @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]... -+ vld1.32 {d6}, [r3], r1 -+ vld1.32 {d12}, [r3], r1 -+ vtrn.8 d16, d4 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]... -+ vld1.32 {d13}, [r3], r1 -+ vtrn.16 d2, d10 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... -+ vld1.32 {d1}, [r3], r1 -+ vtrn.8 d8, d14 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]... -+ vld1.32 {d7}, [r3], r1 -+ vtrn.16 d3, d11 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... -+ vld1.32 {d9}, [r3], r1 -+ vtrn.8 d5, d6 @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]... -+ vld1.32 {d15}, [r3] -+ vtrn.16 d16, d8 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]... -+ vtrn.16 d4, d14 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... -+ vtrn.8 d12, d13 @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]... -+ vdup.16 q9, r2 @ pq -+ vtrn.8 d1, d7 @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]... -+ vtrn.32 d2, d16 @ P1[0..7], P5[0..7] -+ vtrn.16 d5, d12 @ P1[8], P1[7], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]... -+ vtrn.16 d6, d13 @ P2[8], P2[7], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]... -+ vtrn.8 d9, d15 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]... 
-+ vtrn.32 d3, d4 @ P2[0..7], P6[0..7] -+ vshll.u8 q10, d2, #1 @ 2*P1[0..7] -+ vtrn.32 d10, d8 @ P3[0..7], P7[0..7] -+ vshll.u8 q11, d16, #1 @ 2*P5[0..7] -+ vtrn.32 d11, d14 @ P4[0..7], P8[0..7] -+ vtrn.16 d1, d9 @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]... -+ vtrn.16 d7, d15 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]... -+ vmovl.u8 q1, d3 @ P2[0..7] -+ vmovl.u8 q12, d4 @ P6[0..7] -+ vtrn.32 d5, d1 @ P1[8..15], P5[8..15] -+ vtrn.32 d6, d7 @ P2[8..15], P6[8..15] -+ vtrn.32 d12, d9 @ P3[8..15], P7[8..15] -+ vtrn.32 d13, d15 @ P4[8..15], P8[8..15] -+ vmls.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7] -+ vmovl.u8 q1, d10 @ P3[0..7] -+ vshll.u8 q2, d5, #1 @ 2*P1[8..15] -+ vshll.u8 q13, d1, #1 @ 2*P5[8..15] -+ vmls.i16 q11, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] -+ vmovl.u8 q14, d6 @ P2[8..15] -+ vmovl.u8 q3, d7 @ P6[8..15] -+ vmovl.u8 q15, d8 @ P7[0..7] -+ vmla.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] -+ vmovl.u8 q1, d12 @ P3[8..15] -+ vmls.i16 q2, q14, d0[1] @ 2*P1[8..15]-5*P2[8..15] -+ vmovl.u8 q4, d9 @ P7[8..15] -+ vshll.u8 q14, d10, #1 @ 2*P3[0..7] -+ vmls.i16 q13, q3, d0[1] @ 2*P5[8..15]-5*P6[8..15] -+ vmovl.u8 q5, d11 @ P4[0..7] -+ vmla.i16 q11, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] -+ vshll.u8 q15, d12, #1 @ 2*P3[8..15] -+ vmovl.u8 q6, d13 @ P4[8..15] -+ vmla.i16 q2, q1, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] -+ vmovl.u8 q1, d14 @ P8[0..7] -+ vmovl.u8 q7, d15 @ P8[8..15] -+ vmla.i16 q13, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] -+ vmovl.u8 q4, d16 @ P5[0..7] -+ vmovl.u8 q8, d1 @ P5[8..15] -+ vmls.i16 q14, q5, d0[1] @ 2*P3[0..7]-5*P4[0..7] -+ vmls.i16 q15, q6, d0[1] @ 2*P3[8..15]-5*P4[8..15] -+ vmls.i16 q10, q5, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] -+ vmls.i16 q11, q1, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] -+ vsub.i16 q1, q5, q4 @ P4[0..7]-P5[0..7] -+ vmls.i16 q2, q6, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] -+ 
vrshr.s16 q10, q10, #3 -+ vmls.i16 q13, q7, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] -+ vsub.i16 q7, q6, q8 @ P4[8..15]-P5[8..15] -+ vrshr.s16 q11, q11, #3 -+ vmla.s16 q14, q4, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] -+ vrshr.s16 q2, q2, #3 -+ vmla.i16 q15, q8, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] -+ vabs.s16 q10, q10 @ a1[0..7] -+ vrshr.s16 q13, q13, #3 -+ vmls.i16 q15, q3, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] -+ vabs.s16 q3, q11 @ a2[0..7] -+ vabs.s16 q2, q2 @ a1[8..15] -+ vmls.i16 q14, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] -+ vabs.s16 q11, q1 -+ vabs.s16 q12, q13 @ a2[8..15] -+ vcge.s16 q13, q10, q3 @ test a1[0..7] >= a2[0..7] -+ vshr.s16 q1, q1, #8 @ clip_sign[0..7] -+ vrshr.s16 q15, q15, #3 -+ vshr.s16 q11, q11, #1 @ clip[0..7] -+ vrshr.s16 q14, q14, #3 -+ vbsl q13, q3, q10 @ a3[0..7] -+ vcge.s16 q3, q2, q12 @ test a1[8..15] >= a2[8.15] -+ vabs.s16 q10, q15 @ a0[8..15] -+ vshr.s16 q15, q15, #8 @ a0_sign[8..15] -+ vbsl q3, q12, q2 @ a3[8..15] -+ vabs.s16 q2, q14 @ a0[0..7] -+ vabs.s16 q12, q7 -+ vshr.s16 q7, q7, #8 @ clip_sign[8..15] -+ vshr.s16 q14, q14, #8 @ a0_sign[0..7] -+ vshr.s16 q12, q12, #1 @ clip[8..15] -+ vsub.i16 q7, q7, q15 @ clip_sign[8..15] - a0_sign[8..15] -+ vqsub.u16 q15, q10, q3 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ vcge.s16 q3, q3, q10 @ test a3[8..15] >= a0[8..15] -+ vcge.s16 q10, q10, q9 @ test a0[8..15] >= pq -+ vcge.s16 q9, q2, q9 @ test a0[0..7] >= pq -+ vsub.i16 q1, q1, q14 @ clip_sign[0..7] - a0_sign[0..7] -+ vqsub.u16 q14, q2, q13 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) -+ vcge.s16 q2, q13, q2 @ test a3[0..7] >= a0[0..7] -+ vmul.i16 q13, q15, d0[1] @ a0[8..15] >= a3[8..15] ? 
5*(a0[8..15]-a3[8..15]) : 0 -+ vceq.i16 q15, q11, #0 @ test clip[0..7] == 0 -+ vmul.i16 q0, q14, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 -+ vorr q9, q15, q9 @ test clip[0..7] == 0 || a0[0..7] >= pq -+ vceq.i16 q14, q12, #0 @ test clip[8..15] == 0 -+ vshr.u16 q13, q13, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 -+ vorr q2, q9, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] -+ vshr.u16 q0, q0, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 -+ vorr q10, q14, q10 @ test clip[8..15] == 0 || a0[8..15] >= pq -+ vcge.s16 q14, q13, q12 -+ vmov.32 r2, d4[1] @ move to gp reg -+ vorr q3, q10, q3 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] -+ vmov.32 r3, d5[1] -+ vcge.s16 q2, q0, q11 -+ vbsl q14, q12, q13 @ FFMIN(d[8..15], clip[8..15]) -+ vbsl q2, q11, q0 @ FFMIN(d[0..7], clip[0..7]) -+ vmov.32 r5, d6[1] -+ vbic q0, q14, q10 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) -+ vmov.32 r6, d7[1] -+ and r12, r2, r3 -+ vbic q2, q2, q9 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) -+ vmls.i16 q6, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 -+ vmls.i16 q5, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 -+ and r14, r5, r6 -+ vmla.i16 q4, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 -+ and r12, r12, r14 -+ vqmovun.s16 d4, q6 -+ vmla.i16 q8, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 -+ tst r12, #1 -+ bne 4f @ none of the 16 pixel pairs should be updated in this case -+ vqmovun.s16 d2, q5 -+ 
vqmovun.s16 d3, q4 -+ vqmovun.s16 d5, q8 -+ tst r2, #1 -+ bne 1f -+ vst2.8 {d2[0], d3[0]}, [r0], r1 -+ vst2.8 {d2[1], d3[1]}, [r0], r1 -+ vst2.8 {d2[2], d3[2]}, [r0], r1 -+ vst2.8 {d2[3], d3[3]}, [r0] -+1: add r0, r4, r1, lsl #2 -+ tst r3, #1 -+ bne 2f -+ vst2.8 {d2[4], d3[4]}, [r4], r1 -+ vst2.8 {d2[5], d3[5]}, [r4], r1 -+ vst2.8 {d2[6], d3[6]}, [r4], r1 -+ vst2.8 {d2[7], d3[7]}, [r4] -+2: add r4, r0, r1, lsl #2 -+ tst r5, #1 -+ bne 3f -+ vst2.8 {d4[0], d5[0]}, [r0], r1 -+ vst2.8 {d4[1], d5[1]}, [r0], r1 -+ vst2.8 {d4[2], d5[2]}, [r0], r1 -+ vst2.8 {d4[3], d5[3]}, [r0] -+3: tst r6, #1 -+ bne 4f -+ vst2.8 {d4[4], d5[4]}, [r4], r1 -+ vst2.8 {d4[5], d5[5]}, [r4], r1 -+ vst2.8 {d4[6], d5[6]}, [r4], r1 -+ vst2.8 {d4[7], d5[7]}, [r4] -+4: vpop {d8-d15} -+ pop {r4-r6,pc} -+endfunc -+ -+@ Copy at most the specified number of bytes from source to destination buffer, -+@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence -+@ On entry: -+@ r0 -> source buffer -+@ r1 = max number of bytes to copy -+@ r2 -> destination buffer, optimally 8-byte aligned -+@ On exit: -+@ r0 = number of bytes not copied -+function ff_vc1_unescape_buffer_helper_neon, export=1 -+ @ Offset by 48 to screen out cases that are too short for us to handle, -+ @ and also make it easy to test for loop termination, or to determine -+ @ whether we need an odd number of half-iterations of the loop. -+ subs r1, r1, #48 -+ bmi 90f -+ -+ @ Set up useful constants -+ vmov.i32 q0, #0x3000000 -+ vmov.i32 q1, #0x30000 -+ -+ tst r1, #16 -+ bne 1f -+ -+ vld1.8 {q8, q9}, [r0]! -+ vbic q12, q8, q0 -+ vext.8 q13, q8, q9, #1 -+ vext.8 q14, q8, q9, #2 -+ vext.8 q15, q8, q9, #3 -+ veor q12, q12, q1 -+ vbic q13, q13, q0 -+ vbic q14, q14, q0 -+ vbic q15, q15, q0 -+ vceq.i32 q12, q12, #0 -+ veor q13, q13, q1 -+ veor q14, q14, q1 -+ veor q15, q15, q1 -+ vceq.i32 q13, q13, #0 -+ vceq.i32 q14, q14, #0 -+ vceq.i32 q15, q15, #0 -+ add r1, r1, #16 -+ b 3f -+ -+1: vld1.8 {q10, q11}, [r0]! 
-+ vbic q12, q10, q0 -+ vext.8 q13, q10, q11, #1 -+ vext.8 q14, q10, q11, #2 -+ vext.8 q15, q10, q11, #3 -+ veor q12, q12, q1 -+ vbic q13, q13, q0 -+ vbic q14, q14, q0 -+ vbic q15, q15, q0 -+ vceq.i32 q12, q12, #0 -+ veor q13, q13, q1 -+ veor q14, q14, q1 -+ veor q15, q15, q1 -+ vceq.i32 q13, q13, #0 -+ vceq.i32 q14, q14, #0 -+ vceq.i32 q15, q15, #0 -+ @ Drop through... -+2: vmov q8, q11 -+ vld1.8 {q9}, [r0]! -+ vorr q13, q12, q13 -+ vorr q15, q14, q15 -+ vbic q12, q8, q0 -+ vorr q3, q13, q15 -+ vext.8 q13, q8, q9, #1 -+ vext.8 q14, q8, q9, #2 -+ vext.8 q15, q8, q9, #3 -+ veor q12, q12, q1 -+ vorr d6, d6, d7 -+ vbic q13, q13, q0 -+ vbic q14, q14, q0 -+ vbic q15, q15, q0 -+ vceq.i32 q12, q12, #0 -+ vmov r3, r12, d6 -+ veor q13, q13, q1 -+ veor q14, q14, q1 -+ veor q15, q15, q1 -+ vceq.i32 q13, q13, #0 -+ vceq.i32 q14, q14, #0 -+ vceq.i32 q15, q15, #0 -+ orrs r3, r3, r12 -+ bne 90f -+ vst1.64 {q10}, [r2]! -+3: vmov q10, q9 -+ vld1.8 {q11}, [r0]! -+ vorr q13, q12, q13 -+ vorr q15, q14, q15 -+ vbic q12, q10, q0 -+ vorr q3, q13, q15 -+ vext.8 q13, q10, q11, #1 -+ vext.8 q14, q10, q11, #2 -+ vext.8 q15, q10, q11, #3 -+ veor q12, q12, q1 -+ vorr d6, d6, d7 -+ vbic q13, q13, q0 -+ vbic q14, q14, q0 -+ vbic q15, q15, q0 -+ vceq.i32 q12, q12, #0 -+ vmov r3, r12, d6 -+ veor q13, q13, q1 -+ veor q14, q14, q1 -+ veor q15, q15, q1 -+ vceq.i32 q13, q13, #0 -+ vceq.i32 q14, q14, #0 -+ vceq.i32 q15, q15, #0 -+ orrs r3, r3, r12 -+ bne 91f -+ vst1.64 {q8}, [r2]! -+ subs r1, r1, #32 -+ bpl 2b -+ -+90: add r0, r1, #48 -+ bx lr -+ -+91: sub r1, r1, #16 -+ b 90b -+endfunc -diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h -index 8a71c04230..53644506e5 100644 ---- a/libavcodec/avcodec.h -+++ b/libavcodec/avcodec.h -@@ -2595,6 +2595,17 @@ typedef struct AVHWAccel { - * that avctx->hwaccel_priv_data is invalid. 
- */ - int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); -+ -+ /** -+ * Called if parsing fails -+ * -+ * An error has occured, end_frame will not be called -+ * start_frame & decode_slice may or may not have been called -+ * Optional -+ * -+ * @param avctx the codec context -+ */ -+ void (*abort_frame)(AVCodecContext *avctx); - } AVHWAccel; - - /** -diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h -index 38d06b2842..bbf5d70560 100644 ---- a/libavcodec/cabac.h -+++ b/libavcodec/cabac.h -@@ -44,6 +44,10 @@ typedef struct CABACContext{ - const uint8_t *bytestream_start; - const uint8_t *bytestream; - const uint8_t *bytestream_end; -+ struct { -+ uint16_t bits; -+ uint16_t range; -+ } by22; - }CABACContext; - - int ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size); -diff --git a/libavcodec/codec.h b/libavcodec/codec.h -index 50a22f6e3c..5acf572ef4 100644 ---- a/libavcodec/codec.h -+++ b/libavcodec/codec.h -@@ -367,6 +367,17 @@ const AVCodec *av_codec_iterate(void **opaque); - */ - AVCodec *avcodec_find_decoder(enum AVCodecID id); - -+/** -+ * Find a registered decoder with a matching codec ID and pix_fmt. -+ * A decoder will pix_fmt set to NULL will match any fmt. -+ * A fmt of AV_PIX_FMT_NONE will only match a decoder will px_fmt NULL. -+ * -+ * @param id AVCodecID of the requested decoder -+ * @param fmt AVPixelForma that msut be supported by decoder -+ * @return A decoder if one was found, NULL otherwise. -+ */ -+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt); -+ - /** - * Find a registered decoder with the specified name. - * -diff --git a/libavcodec/hevc-ctrls-v1.h b/libavcodec/hevc-ctrls-v1.h -new file mode 100644 -index 0000000000..72cbba0953 ---- /dev/null -+++ b/libavcodec/hevc-ctrls-v1.h -@@ -0,0 +1,229 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * These are the HEVC state controls for use with stateless HEVC -+ * codec drivers. 
-+ * -+ * It turns out that these structs are not stable yet and will undergo -+ * more changes. So keep them private until they are stable and ready to -+ * become part of the official public API. -+ */ -+ -+#ifndef _HEVC_CTRLS_H_ -+#define _HEVC_CTRLS_H_ -+ -+#include -+ -+/* The pixel format isn't stable at the moment and will likely be renamed. */ -+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ -+ -+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008) -+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011) -+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015) -+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016) -+ -+/* enum v4l2_ctrl_type type values */ -+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 -+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 -+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 -+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 -+ -+enum v4l2_mpeg_video_hevc_decode_mode { -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, -+}; -+ -+enum v4l2_mpeg_video_hevc_start_code { -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, -+}; -+ -+#define V4L2_HEVC_SLICE_TYPE_B 0 -+#define V4L2_HEVC_SLICE_TYPE_P 1 -+#define V4L2_HEVC_SLICE_TYPE_I 2 -+ -+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) -+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) -+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) -+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) -+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) -+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) -+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) 
-+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) -+ -+/* The controls are not stable at the moment and will likely be reworked. */ -+struct v4l2_ctrl_hevc_sps { -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ -+ __u16 pic_width_in_luma_samples; -+ __u16 pic_height_in_luma_samples; -+ __u8 bit_depth_luma_minus8; -+ __u8 bit_depth_chroma_minus8; -+ __u8 log2_max_pic_order_cnt_lsb_minus4; -+ __u8 sps_max_dec_pic_buffering_minus1; -+ __u8 sps_max_num_reorder_pics; -+ __u8 sps_max_latency_increase_plus1; -+ __u8 log2_min_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_luma_coding_block_size; -+ __u8 log2_min_luma_transform_block_size_minus2; -+ __u8 log2_diff_max_min_luma_transform_block_size; -+ __u8 max_transform_hierarchy_depth_inter; -+ __u8 max_transform_hierarchy_depth_intra; -+ __u8 pcm_sample_bit_depth_luma_minus1; -+ __u8 pcm_sample_bit_depth_chroma_minus1; -+ __u8 log2_min_pcm_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_pcm_luma_coding_block_size; -+ __u8 num_short_term_ref_pic_sets; -+ __u8 num_long_term_ref_pics_sps; -+ __u8 chroma_format_idc; -+ __u8 sps_max_sub_layers_minus1; -+ -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0) -+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) -+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) -+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) -+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) -+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) -+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) -+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) -+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) -+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) -+#define 
V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) -+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) -+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) -+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) -+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) -+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) -+ -+struct v4l2_ctrl_hevc_pps { -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ -+ __u8 num_extra_slice_header_bits; -+ __s8 init_qp_minus26; -+ __u8 diff_cu_qp_delta_depth; -+ __s8 pps_cb_qp_offset; -+ __s8 pps_cr_qp_offset; -+ __u8 num_tile_columns_minus1; -+ __u8 num_tile_rows_minus1; -+ __u8 column_width_minus1[20]; -+ __u8 row_height_minus1[22]; -+ __s8 pps_beta_offset_div2; -+ __s8 pps_tc_offset_div2; -+ __u8 log2_parallel_merge_level_minus2; -+ -+ __u8 padding[4]; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 -+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 -+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 -+ -+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 -+ -+struct v4l2_hevc_dpb_entry { -+ __u64 timestamp; -+ __u8 rps; -+ __u8 field_pic; -+ __u16 pic_order_cnt[2]; -+ __u8 padding[2]; -+}; -+ -+struct v4l2_hevc_pred_weight_table { -+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __u8 padding[6]; -+ -+ __u8 luma_log2_weight_denom; -+ __s8 delta_chroma_log2_weight_denom; 
-+}; -+ -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) -+ -+struct v4l2_ctrl_hevc_slice_params { -+ __u32 bit_size; -+ __u32 data_bit_offset; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u32 slice_segment_addr; -+ __u32 num_entry_point_offsets; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ __u8 nal_unit_type; -+ __u8 nuh_temporal_id_plus1; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 slice_type; -+ __u8 colour_plane_id; -+ __u16 slice_pic_order_cnt; -+ __u8 num_ref_idx_l0_active_minus1; -+ __u8 num_ref_idx_l1_active_minus1; -+ __u8 collocated_ref_idx; -+ __u8 five_minus_max_num_merge_cand; -+ __s8 slice_qp_delta; -+ __s8 slice_cb_qp_offset; -+ __s8 slice_cr_qp_offset; -+ __s8 slice_act_y_qp_offset; -+ __s8 slice_act_cb_qp_offset; -+ __s8 slice_act_cr_qp_offset; -+ __s8 slice_beta_offset_div2; -+ __s8 slice_tc_offset_div2; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ __u8 pic_struct; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ -+ __u8 num_active_dpb_entries; -+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ -+ __u8 num_rps_poc_st_curr_before; -+ __u8 num_rps_poc_st_curr_after; -+ __u8 num_rps_poc_lt_curr; -+ -+ __u8 padding; -+ -+ __u32 entry_point_offset_minus1[256]; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ -+ struct v4l2_hevc_pred_weight_table pred_weight_table; -+ -+ __u64 flags; -+}; -+ -+struct v4l2_ctrl_hevc_scaling_matrix { -+ __u8 scaling_list_4x4[6][16]; -+ __u8 scaling_list_8x8[6][64]; -+ __u8 scaling_list_16x16[6][64]; -+ __u8 scaling_list_32x32[2][64]; -+ __u8 scaling_list_dc_coef_16x16[6]; -+ __u8 scaling_list_dc_coef_32x32[2]; -+}; -+ -+#endif -diff --git a/libavcodec/hevc-ctrls-v2.h b/libavcodec/hevc-ctrls-v2.h -new file mode 100644 -index 0000000000..7cbbbf055f ---- /dev/null -+++ b/libavcodec/hevc-ctrls-v2.h -@@ -0,0 +1,257 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * These are the HEVC state controls for use with stateless HEVC -+ * codec drivers. -+ * -+ * It turns out that these structs are not stable yet and will undergo -+ * more changes. So keep them private until they are stable and ready to -+ * become part of the official public API. -+ */ -+ -+#ifndef _HEVC_CTRLS_H_ -+#define _HEVC_CTRLS_H_ -+ -+#include -+ -+/* The pixel format isn't stable at the moment and will likely be renamed. 
*/ -+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ -+ -+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) -+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) -+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) -+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) -+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) -+ -+/* enum v4l2_ctrl_type type values */ -+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 -+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 -+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 -+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 -+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 -+ -+enum v4l2_mpeg_video_hevc_decode_mode { -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, -+}; -+ -+enum v4l2_mpeg_video_hevc_start_code { -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, -+}; -+ -+#define V4L2_HEVC_SLICE_TYPE_B 0 -+#define V4L2_HEVC_SLICE_TYPE_P 1 -+#define V4L2_HEVC_SLICE_TYPE_I 2 -+ -+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) -+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) -+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) -+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) -+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) -+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) -+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) -+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) -+ -+/* The controls are not stable at the moment and will likely be reworked. 
*/ -+struct v4l2_ctrl_hevc_sps { -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ -+ __u16 pic_width_in_luma_samples; -+ __u16 pic_height_in_luma_samples; -+ __u8 bit_depth_luma_minus8; -+ __u8 bit_depth_chroma_minus8; -+ __u8 log2_max_pic_order_cnt_lsb_minus4; -+ __u8 sps_max_dec_pic_buffering_minus1; -+ __u8 sps_max_num_reorder_pics; -+ __u8 sps_max_latency_increase_plus1; -+ __u8 log2_min_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_luma_coding_block_size; -+ __u8 log2_min_luma_transform_block_size_minus2; -+ __u8 log2_diff_max_min_luma_transform_block_size; -+ __u8 max_transform_hierarchy_depth_inter; -+ __u8 max_transform_hierarchy_depth_intra; -+ __u8 pcm_sample_bit_depth_luma_minus1; -+ __u8 pcm_sample_bit_depth_chroma_minus1; -+ __u8 log2_min_pcm_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_pcm_luma_coding_block_size; -+ __u8 num_short_term_ref_pic_sets; -+ __u8 num_long_term_ref_pics_sps; -+ __u8 chroma_format_idc; -+ __u8 sps_max_sub_layers_minus1; -+ -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) -+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) -+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) -+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) -+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) -+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) -+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) -+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) -+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) -+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) -+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) -+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) -+#define 
V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) -+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) -+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) -+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) -+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) -+ -+struct v4l2_ctrl_hevc_pps { -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ -+ __u8 num_extra_slice_header_bits; -+ __u8 num_ref_idx_l0_default_active_minus1; -+ __u8 num_ref_idx_l1_default_active_minus1; -+ __s8 init_qp_minus26; -+ __u8 diff_cu_qp_delta_depth; -+ __s8 pps_cb_qp_offset; -+ __s8 pps_cr_qp_offset; -+ __u8 num_tile_columns_minus1; -+ __u8 num_tile_rows_minus1; -+ __u8 column_width_minus1[20]; -+ __u8 row_height_minus1[22]; -+ __s8 pps_beta_offset_div2; -+ __s8 pps_tc_offset_div2; -+ __u8 log2_parallel_merge_level_minus2; -+ -+ __u8 padding[4]; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 -+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 -+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 -+ -+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 -+ -+struct v4l2_hevc_dpb_entry { -+ __u64 timestamp; -+ __u8 rps; -+ __u8 field_pic; -+ __u16 pic_order_cnt[2]; -+ __u8 padding[2]; -+}; -+ -+struct v4l2_hevc_pred_weight_table { -+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __u8 
padding[6]; -+ -+ __u8 luma_log2_weight_denom; -+ __s8 delta_chroma_log2_weight_denom; -+}; -+ -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) -+ -+struct v4l2_ctrl_hevc_slice_params { -+ __u32 bit_size; -+ __u32 data_bit_offset; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u32 slice_segment_addr; -+ __u32 num_entry_point_offsets; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ __u8 nal_unit_type; -+ __u8 nuh_temporal_id_plus1; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 slice_type; -+ __u8 colour_plane_id; -+ __u16 slice_pic_order_cnt; -+ __u8 num_ref_idx_l0_active_minus1; -+ __u8 num_ref_idx_l1_active_minus1; -+ __u8 collocated_ref_idx; -+ __u8 five_minus_max_num_merge_cand; -+ __s8 slice_qp_delta; -+ __s8 slice_cb_qp_offset; -+ __s8 slice_cr_qp_offset; -+ __s8 slice_act_y_qp_offset; -+ __s8 slice_act_cb_qp_offset; -+ __s8 slice_act_cr_qp_offset; -+ __s8 slice_beta_offset_div2; -+ __s8 slice_tc_offset_div2; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ __u8 pic_struct; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ -+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ -+ __u8 padding[5]; -+ -+ __u32 entry_point_offset_minus1[256]; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ -+ struct v4l2_hevc_pred_weight_table pred_weight_table; -+ -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 -+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 -+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 -+ -+struct v4l2_ctrl_hevc_decode_params { -+ __s32 pic_order_cnt_val; -+ __u8 num_active_dpb_entries; -+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 num_poc_st_curr_before; -+ __u8 num_poc_st_curr_after; -+ __u8 num_poc_lt_curr; -+ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u64 flags; -+}; -+ -+/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ -+#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) -+/* -+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - -+ * the number of data (in bits) to skip in the -+ * slice segment header. -+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" -+ * to before syntax element "slice_temporal_mvp_enabled_flag". -+ * If IDR, the skipped bits are just "pic_output_flag" -+ * (separate_colour_plane_flag is not supported). 
-+ */ -+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) -+ -+struct v4l2_ctrl_hevc_scaling_matrix { -+ __u8 scaling_list_4x4[6][16]; -+ __u8 scaling_list_8x8[6][64]; -+ __u8 scaling_list_16x16[6][64]; -+ __u8 scaling_list_32x32[2][64]; -+ __u8 scaling_list_dc_coef_16x16[6]; -+ __u8 scaling_list_dc_coef_32x32[2]; -+}; -+ -+#endif -diff --git a/libavcodec/hevc-ctrls-v3.h b/libavcodec/hevc-ctrls-v3.h -new file mode 100644 -index 0000000000..4e35bd583d ---- /dev/null -+++ b/libavcodec/hevc-ctrls-v3.h -@@ -0,0 +1,255 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * These are the HEVC state controls for use with stateless HEVC -+ * codec drivers. -+ * -+ * It turns out that these structs are not stable yet and will undergo -+ * more changes. So keep them private until they are stable and ready to -+ * become part of the official public API. -+ */ -+ -+#ifndef _HEVC_CTRLS_H_ -+#define _HEVC_CTRLS_H_ -+ -+#include -+ -+/* The pixel format isn't stable at the moment and will likely be renamed. 
*/ -+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ -+ -+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) -+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) -+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) -+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) -+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) -+ -+/* enum v4l2_ctrl_type type values */ -+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 -+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 -+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 -+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 -+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 -+ -+enum v4l2_mpeg_video_hevc_decode_mode { -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, -+}; -+ -+enum v4l2_mpeg_video_hevc_start_code { -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, -+}; -+ -+#define V4L2_HEVC_SLICE_TYPE_B 0 -+#define V4L2_HEVC_SLICE_TYPE_P 1 -+#define V4L2_HEVC_SLICE_TYPE_I 2 -+ -+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) -+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) -+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) -+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) -+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) -+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) -+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) -+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) -+ -+/* The controls are not stable at the moment and will likely be reworked. 
*/ -+struct v4l2_ctrl_hevc_sps { -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ -+ __u16 pic_width_in_luma_samples; -+ __u16 pic_height_in_luma_samples; -+ __u8 bit_depth_luma_minus8; -+ __u8 bit_depth_chroma_minus8; -+ __u8 log2_max_pic_order_cnt_lsb_minus4; -+ __u8 sps_max_dec_pic_buffering_minus1; -+ __u8 sps_max_num_reorder_pics; -+ __u8 sps_max_latency_increase_plus1; -+ __u8 log2_min_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_luma_coding_block_size; -+ __u8 log2_min_luma_transform_block_size_minus2; -+ __u8 log2_diff_max_min_luma_transform_block_size; -+ __u8 max_transform_hierarchy_depth_inter; -+ __u8 max_transform_hierarchy_depth_intra; -+ __u8 pcm_sample_bit_depth_luma_minus1; -+ __u8 pcm_sample_bit_depth_chroma_minus1; -+ __u8 log2_min_pcm_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_pcm_luma_coding_block_size; -+ __u8 num_short_term_ref_pic_sets; -+ __u8 num_long_term_ref_pics_sps; -+ __u8 chroma_format_idc; -+ __u8 sps_max_sub_layers_minus1; -+ -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) -+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) -+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) -+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) -+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) -+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) -+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) -+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) -+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) -+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) -+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) -+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) -+#define 
V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) -+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) -+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) -+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) -+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) -+ -+struct v4l2_ctrl_hevc_pps { -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ -+ __u8 num_extra_slice_header_bits; -+ __u8 num_ref_idx_l0_default_active_minus1; -+ __u8 num_ref_idx_l1_default_active_minus1; -+ __s8 init_qp_minus26; -+ __u8 diff_cu_qp_delta_depth; -+ __s8 pps_cb_qp_offset; -+ __s8 pps_cr_qp_offset; -+ __u8 num_tile_columns_minus1; -+ __u8 num_tile_rows_minus1; -+ __u8 column_width_minus1[20]; -+ __u8 row_height_minus1[22]; -+ __s8 pps_beta_offset_div2; -+ __s8 pps_tc_offset_div2; -+ __u8 log2_parallel_merge_level_minus2; -+ -+ __u8 padding[4]; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 -+ -+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 -+ -+struct v4l2_hevc_dpb_entry { -+ __u64 timestamp; -+ __u8 flags; -+ __u8 field_pic; -+ __u16 pic_order_cnt[2]; -+ __u8 padding[2]; -+}; -+ -+struct v4l2_hevc_pred_weight_table { -+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __u8 padding[6]; -+ -+ __u8 luma_log2_weight_denom; -+ __s8 delta_chroma_log2_weight_denom; -+}; -+ -+#define 
V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) -+ -+struct v4l2_ctrl_hevc_slice_params { -+ __u32 bit_size; -+ __u32 data_bit_offset; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u32 slice_segment_addr; -+ __u32 num_entry_point_offsets; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ __u8 nal_unit_type; -+ __u8 nuh_temporal_id_plus1; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 slice_type; -+ __u8 colour_plane_id; -+ __u16 slice_pic_order_cnt; -+ __u8 num_ref_idx_l0_active_minus1; -+ __u8 num_ref_idx_l1_active_minus1; -+ __u8 collocated_ref_idx; -+ __u8 five_minus_max_num_merge_cand; -+ __s8 slice_qp_delta; -+ __s8 slice_cb_qp_offset; -+ __s8 slice_cr_qp_offset; -+ __s8 slice_act_y_qp_offset; -+ __s8 slice_act_cb_qp_offset; -+ __s8 slice_act_cr_qp_offset; -+ __s8 slice_beta_offset_div2; -+ __s8 slice_tc_offset_div2; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ __u8 pic_struct; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ -+ __u8 padding[5]; -+ -+ __u32 entry_point_offset_minus1[256]; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ -+ struct v4l2_hevc_pred_weight_table pred_weight_table; -+ -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 -+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 -+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 -+ -+struct v4l2_ctrl_hevc_decode_params { -+ __s32 pic_order_cnt_val; -+ __u8 num_active_dpb_entries; -+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 num_poc_st_curr_before; -+ __u8 num_poc_st_curr_after; -+ __u8 num_poc_lt_curr; -+ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u64 flags; -+}; -+ -+struct v4l2_ctrl_hevc_scaling_matrix { -+ __u8 scaling_list_4x4[6][16]; -+ __u8 scaling_list_8x8[6][64]; -+ __u8 scaling_list_16x16[6][64]; -+ __u8 scaling_list_32x32[2][64]; -+ __u8 scaling_list_dc_coef_16x16[6]; -+ __u8 scaling_list_dc_coef_32x32[2]; -+}; -+ -+/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ -+#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) -+/* -+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - -+ * the number of data (in bits) to skip in the -+ * slice segment header. -+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" -+ * to before syntax element "slice_temporal_mvp_enabled_flag". -+ * If IDR, the skipped bits are just "pic_output_flag" -+ * (separate_colour_plane_flag is not supported). 
-+ */ -+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) -+ -+#endif -diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h -new file mode 100644 -index 0000000000..c02fdbe5a8 ---- /dev/null -+++ b/libavcodec/hevc-ctrls-v4.h -@@ -0,0 +1,524 @@ -+/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */ -+/* -+ * Video for Linux Two controls header file -+ * -+ * Copyright (C) 1999-2012 the contributors -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * Alternatively you can redistribute this file under the terms of the -+ * BSD license as stated below: -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in -+ * the documentation and/or other materials provided with the -+ * distribution. -+ * 3. The names of its contributors may not be used to endorse or promote -+ * products derived from this software without specific prior written -+ * permission. 
-+ * -+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ * -+ * The contents of this header was split off from videodev2.h. All control -+ * definitions should be added to this header, which is included by -+ * videodev2.h. -+ */ -+ -+#ifndef AVCODEC_HEVC_CTRLS_V4_H -+#define AVCODEC_HEVC_CTRLS_V4_H -+ -+#include -+#include -+ -+#ifndef V4L2_CTRL_CLASS_CODEC_STATELESS -+#define V4L2_CTRL_CLASS_CODEC_STATELESS 0x00a40000 /* Stateless codecs controls */ -+#endif -+#ifndef V4L2_CID_CODEC_STATELESS_BASE -+#define V4L2_CID_CODEC_STATELESS_BASE (V4L2_CTRL_CLASS_CODEC_STATELESS | 0x900) -+#endif -+ -+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ -+ -+#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) -+#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401) -+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402) -+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403) -+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404) -+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405) -+#define V4L2_CID_STATELESS_HEVC_START_CODE 
(V4L2_CID_CODEC_STATELESS_BASE + 406) -+#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407) -+ -+enum v4l2_stateless_hevc_decode_mode { -+ V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED, -+ V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED, -+}; -+ -+enum v4l2_stateless_hevc_start_code { -+ V4L2_STATELESS_HEVC_START_CODE_NONE, -+ V4L2_STATELESS_HEVC_START_CODE_ANNEX_B, -+}; -+ -+#define V4L2_HEVC_SLICE_TYPE_B 0 -+#define V4L2_HEVC_SLICE_TYPE_P 1 -+#define V4L2_HEVC_SLICE_TYPE_I 2 -+ -+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) -+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) -+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) -+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) -+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) -+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) -+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) -+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) -+ -+/** -+ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. 
H.265: Sequence parameter set -+ * -+ * @video_parameter_set_id: specifies the value of the -+ * vps_video_parameter_set_id of the active VPS -+ * @seq_parameter_set_id: provides an identifier for the SPS for -+ * reference by other syntax elements -+ * @pic_width_in_luma_samples: specifies the width of each decoded picture -+ * in units of luma samples -+ * @pic_height_in_luma_samples: specifies the height of each decoded picture -+ * in units of luma samples -+ * @bit_depth_luma_minus8: this value plus 8specifies the bit depth of the -+ * samples of the luma array -+ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the -+ * samples of the chroma arrays -+ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of -+ * the variable MaxPicOrderCntLsb -+ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum -+ * required size of the decoded picture -+ * buffer for the codec video sequence -+ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures -+ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the -+ * value of SpsMaxLatencyPictures array -+ * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum -+ * luma coding block size -+ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between -+ * the maximum and minimum luma -+ * coding block size -+ * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma -+ * transform block size -+ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between -+ * the maximum and minimum luma -+ * transform block size -+ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy -+ * depth for transform units of -+ * coding units coded in inter -+ * prediction mode -+ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy -+ * depth for transform units of -+ * coding units coded in intra -+ * prediction mode -+ * 
@pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of -+ * bits used to represent each of PCM sample -+ * values of the luma component -+ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number -+ * of bits used to represent each of PCM -+ * sample values of the chroma components -+ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the -+ * minimum size of coding blocks -+ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between -+ * the maximum and minimum size of -+ * coding blocks -+ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set() -+ * syntax structures included in the SPS -+ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term -+ * reference pictures that are specified in the SPS -+ * @chroma_format_idc: specifies the chroma sampling -+ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number -+ * of temporal sub-layers -+ * @reserved: padding field. Should be zeroed by applications. 
-+ * @flags: see V4L2_HEVC_SPS_FLAG_{} -+ */ -+struct v4l2_ctrl_hevc_sps { -+ __u8 video_parameter_set_id; -+ __u8 seq_parameter_set_id; -+ __u16 pic_width_in_luma_samples; -+ __u16 pic_height_in_luma_samples; -+ __u8 bit_depth_luma_minus8; -+ __u8 bit_depth_chroma_minus8; -+ __u8 log2_max_pic_order_cnt_lsb_minus4; -+ __u8 sps_max_dec_pic_buffering_minus1; -+ __u8 sps_max_num_reorder_pics; -+ __u8 sps_max_latency_increase_plus1; -+ __u8 log2_min_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_luma_coding_block_size; -+ __u8 log2_min_luma_transform_block_size_minus2; -+ __u8 log2_diff_max_min_luma_transform_block_size; -+ __u8 max_transform_hierarchy_depth_inter; -+ __u8 max_transform_hierarchy_depth_intra; -+ __u8 pcm_sample_bit_depth_luma_minus1; -+ __u8 pcm_sample_bit_depth_chroma_minus1; -+ __u8 log2_min_pcm_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_pcm_luma_coding_block_size; -+ __u8 num_short_term_ref_pic_sets; -+ __u8 num_long_term_ref_pics_sps; -+ __u8 chroma_format_idc; -+ __u8 sps_max_sub_layers_minus1; -+ -+ __u8 reserved[6]; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) -+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) -+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) -+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) -+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) -+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) -+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) -+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) -+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) -+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) -+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) -+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL 
<< 13) -+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) -+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) -+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) -+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) -+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) -+ -+/** -+ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set -+ * -+ * @pic_parameter_set_id: identifies the PPS for reference by other -+ * syntax elements -+ * @num_extra_slice_header_bits: specifies the number of extra slice header -+ * bits that are present in the slice header RBSP -+ * for coded pictures referring to the PPS. -+ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the -+ * inferred value of num_ref_idx_l0_active_minus1 -+ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the -+ * inferred value of num_ref_idx_l1_active_minus1 -+ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for -+ * each slice referring to the PPS -+ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding -+ * tree block size and the minimum luma coding block -+ * size of coding units that convey cu_qp_delta_abs -+ * and cu_qp_delta_sign_flag -+ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb -+ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr -+ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns -+ * partitioning the picture -+ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning -+ * the picture -+ * @column_width_minus1: this value plus 1 specifies the width of the each tile column in -+ * units of coding tree blocks -+ * 
@row_height_minus1: this value plus 1 specifies the height of the each tile row in -+ * units of coding tree blocks -+ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for -+ * beta divided by 2 -+ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC -+ * divided by 2 -+ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of -+ * the variable Log2ParMrgLevel -+ * @reserved: padding field. Should be zeroed by applications. -+ * @flags: see V4L2_HEVC_PPS_FLAG_{} -+ */ -+struct v4l2_ctrl_hevc_pps { -+ __u8 pic_parameter_set_id; -+ __u8 num_extra_slice_header_bits; -+ __u8 num_ref_idx_l0_default_active_minus1; -+ __u8 num_ref_idx_l1_default_active_minus1; -+ __s8 init_qp_minus26; -+ __u8 diff_cu_qp_delta_depth; -+ __s8 pps_cb_qp_offset; -+ __s8 pps_cr_qp_offset; -+ __u8 num_tile_columns_minus1; -+ __u8 num_tile_rows_minus1; -+ __u8 column_width_minus1[20]; -+ __u8 row_height_minus1[22]; -+ __s8 pps_beta_offset_div2; -+ __s8 pps_tc_offset_div2; -+ __u8 log2_parallel_merge_level_minus2; -+ __u8 reserved; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 -+ -+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0 -+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1 -+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2 -+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3 -+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4 -+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5 -+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6 -+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7 -+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8 -+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9 -+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10 -+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11 -+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12 -+ -+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 -+ -+/** -+ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry -+ * -+ 
* @timestamp: timestamp of the V4L2 capture buffer to use as reference. -+ * @flags: long term flag for the reference frame -+ * @field_pic: whether the reference is a field picture or a frame. -+ * @reserved: padding field. Should be zeroed by applications. -+ * @pic_order_cnt_val: the picture order count of the current picture. -+ */ -+struct v4l2_hevc_dpb_entry { -+ __u64 timestamp; -+ __u8 flags; -+ __u8 field_pic; -+ __u16 reserved; -+ __s32 pic_order_cnt_val; -+}; -+ -+/** -+ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters -+ * -+ * @delta_luma_weight_l0: the difference of the weighting factor applied -+ * to the luma prediction value for list 0 -+ * @luma_offset_l0: the additive offset applied to the luma prediction value -+ * for list 0 -+ * @delta_chroma_weight_l0: the difference of the weighting factor applied -+ * to the chroma prediction values for list 0 -+ * @chroma_offset_l0: the difference of the additive offset applied to -+ * the chroma prediction values for list 0 -+ * @delta_luma_weight_l1: the difference of the weighting factor applied -+ * to the luma prediction value for list 1 -+ * @luma_offset_l1: the additive offset applied to the luma prediction value -+ * for list 1 -+ * @delta_chroma_weight_l1: the difference of the weighting factor applied -+ * to the chroma prediction values for list 1 -+ * @chroma_offset_l1: the difference of the additive offset applied to -+ * the chroma prediction values for list 1 -+ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for -+ * all luma weighting factors -+ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm -+ * of the denominator for all chroma -+ * weighting factors -+ */ -+struct v4l2_hevc_pred_weight_table { -+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 
chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __u8 luma_log2_weight_denom; -+ __s8 delta_chroma_log2_weight_denom; -+}; -+ -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) -+ -+/** -+ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters -+ * -+ * This control is a dynamically sized 1-dimensional array, -+ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it. -+ * -+ * @bit_size: size (in bits) of the current slice data -+ * @data_byte_offset: offset (in bytes) to the video data in the current slice data -+ * @num_entry_point_offsets: specifies the number of entry point offset syntax -+ * elements in the slice header. 
-+ * @nal_unit_type: specifies the coding type of the slice (B, P or I) -+ * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit -+ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{} -+ * @colour_plane_id: specifies the colour plane associated with the current slice -+ * @slice_pic_order_cnt: specifies the picture order count -+ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum -+ * reference index for reference picture list 0 -+ * that may be used to decode the slice -+ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum -+ * reference index for reference picture list 1 -+ * that may be used to decode the slice -+ * @collocated_ref_idx: specifies the reference index of the collocated picture used -+ * for temporal motion vector prediction -+ * @five_minus_max_num_merge_cand: specifies the maximum number of merging -+ * motion vector prediction candidates supported in -+ * the slice subtracted from 5 -+ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding -+ * blocks in the slice -+ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset -+ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset -+ * @slice_act_y_qp_offset: screen content extension parameters -+ * @slice_act_cb_qp_offset: screen content extension parameters -+ * @slice_act_cr_qp_offset: screen content extension parameters -+ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2 -+ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2 -+ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or -+ * more fields -+ * @reserved0: padding field. Should be zeroed by applications. 
-+ * @slice_segment_addr: specifies the address of the first coding tree block in -+ * the slice segment -+ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB -+ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB -+ * @short_term_ref_pic_set_size: specifies the size of short-term reference -+ * pictures set included in the SPS -+ * @long_term_ref_pic_set_size: specifies the size of long-term reference -+ * pictures set include in the SPS -+ * @pred_weight_table: the prediction weight coefficients for inter-picture -+ * prediction -+ * @reserved1: padding field. Should be zeroed by applications. -+ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{} -+ */ -+struct v4l2_ctrl_hevc_slice_params { -+ __u32 bit_size; -+ __u32 data_byte_offset; -+ __u32 num_entry_point_offsets; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ __u8 nal_unit_type; -+ __u8 nuh_temporal_id_plus1; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 slice_type; -+ __u8 colour_plane_id; -+ __s32 slice_pic_order_cnt; -+ __u8 num_ref_idx_l0_active_minus1; -+ __u8 num_ref_idx_l1_active_minus1; -+ __u8 collocated_ref_idx; -+ __u8 five_minus_max_num_merge_cand; -+ __s8 slice_qp_delta; -+ __s8 slice_cb_qp_offset; -+ __s8 slice_cr_qp_offset; -+ __s8 slice_act_y_qp_offset; -+ __s8 slice_act_cb_qp_offset; -+ __s8 slice_act_cr_qp_offset; -+ __s8 slice_beta_offset_div2; -+ __s8 slice_tc_offset_div2; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ __u8 pic_struct; -+ -+ __u8 reserved0[3]; -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u32 slice_segment_addr; -+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u16 short_term_ref_pic_set_size; -+ __u16 long_term_ref_pic_set_size; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ -+ struct v4l2_hevc_pred_weight_table pred_weight_table; -+ -+ __u8 reserved1[2]; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 -+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 -+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 -+ -+/** -+ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters -+ * -+ * @pic_order_cnt_val: picture order count -+ * @short_term_ref_pic_set_size: specifies the size of short-term reference -+ * pictures set included in the SPS of the first slice -+ * @long_term_ref_pic_set_size: specifies the size of long-term reference -+ * pictures set include in the SPS of the first slice -+ * @num_active_dpb_entries: the number of entries in dpb -+ * @num_poc_st_curr_before: the number of reference pictures in the short-term -+ * set that come before the current frame -+ * @num_poc_st_curr_after: the number of reference pictures in the short-term -+ * set that come after the current frame -+ * @num_poc_lt_curr: the number of reference pictures in the long-term set -+ * @poc_st_curr_before: provides the index of the short term before references -+ * in DPB array -+ * @poc_st_curr_after: provides the index of the short term after references -+ * in DPB array -+ * @poc_lt_curr: provides the index of the long term references in DPB array -+ * @reserved: padding field. Should be zeroed by applications. 
-+ * @dpb: the decoded picture buffer, for meta-data about reference frames -+ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{} -+ */ -+struct v4l2_ctrl_hevc_decode_params { -+ __s32 pic_order_cnt_val; -+ __u16 short_term_ref_pic_set_size; -+ __u16 long_term_ref_pic_set_size; -+ __u8 num_active_dpb_entries; -+ __u8 num_poc_st_curr_before; -+ __u8 num_poc_st_curr_after; -+ __u8 num_poc_lt_curr; -+ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 reserved[4]; -+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u64 flags; -+}; -+ -+/** -+ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters -+ * -+ * @scaling_list_4x4: scaling list is used for the scaling process for -+ * transform coefficients. The values on each scaling -+ * list are expected in raster scan order -+ * @scaling_list_8x8: scaling list is used for the scaling process for -+ * transform coefficients. The values on each scaling -+ * list are expected in raster scan order -+ * @scaling_list_16x16: scaling list is used for the scaling process for -+ * transform coefficients. The values on each scaling -+ * list are expected in raster scan order -+ * @scaling_list_32x32: scaling list is used for the scaling process for -+ * transform coefficients. The values on each scaling -+ * list are expected in raster scan order -+ * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process -+ * for transform coefficients. The values on each -+ * scaling list are expected in raster scan order. -+ * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process -+ * for transform coefficients. The values on each -+ * scaling list are expected in raster scan order. 
-+ */ -+struct v4l2_ctrl_hevc_scaling_matrix { -+ __u8 scaling_list_4x4[6][16]; -+ __u8 scaling_list_8x8[6][64]; -+ __u8 scaling_list_16x16[6][64]; -+ __u8 scaling_list_32x32[2][64]; -+ __u8 scaling_list_dc_coef_16x16[6]; -+ __u8 scaling_list_dc_coef_32x32[2]; -+}; -+ -+#endif -diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c -index 463d352055..7feff43c28 100644 ---- a/libavcodec/hevc_parser.c -+++ b/libavcodec/hevc_parser.c -@@ -98,6 +98,19 @@ static int hevc_parse_slice_header(AVCodecParserContext *s, H2645NAL *nal, - avctx->profile = ps->sps->ptl.general_ptl.profile_idc; - avctx->level = ps->sps->ptl.general_ptl.level_idc; - -+ if (ps->sps->chroma_format_idc == 1) { -+ avctx->chroma_sample_location = ps->sps->vui.chroma_loc_info_present_flag ? -+ ps->sps->vui.chroma_sample_loc_type_top_field + 1 : -+ AVCHROMA_LOC_LEFT; -+ } -+ else if (ps->sps->chroma_format_idc == 2 || -+ ps->sps->chroma_format_idc == 3) { -+ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;; -+ } -+ else { -+ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED; -+ } -+ - if (ps->vps->vps_timing_info_present_flag) { - num = ps->vps->vps_num_units_in_tick; - den = ps->vps->vps_time_scale; -diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c -index 4f6d985ae6..eefae71275 100644 ---- a/libavcodec/hevc_refs.c -+++ b/libavcodec/hevc_refs.c -@@ -96,18 +96,22 @@ static HEVCFrame *alloc_frame(HEVCContext *s) - if (!frame->rpl_buf) - goto fail; - -- frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); -- if (!frame->tab_mvf_buf) -- goto fail; -- frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; -+ if (s->tab_mvf_pool) { -+ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); -+ if (!frame->tab_mvf_buf) -+ goto fail; -+ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; -+ } - -- frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); -- if (!frame->rpl_tab_buf) -- goto fail; -- frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; -- 
frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; -- for (j = 0; j < frame->ctb_count; j++) -- frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; -+ if (s->rpl_tab_pool) { -+ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); -+ if (!frame->rpl_tab_buf) -+ goto fail; -+ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; -+ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; -+ for (j = 0; j < frame->ctb_count; j++) -+ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; -+ } - - frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; - frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); -@@ -276,14 +280,17 @@ static int init_slice_rpl(HEVCContext *s) - int ctb_count = frame->ctb_count; - int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; - int i; -+ RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; - - if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab)) - return AVERROR_INVALIDDATA; - -- for (i = ctb_addr_ts; i < ctb_count; i++) -- frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; -+ if (frame->rpl_tab) { -+ for (i = ctb_addr_ts; i < ctb_count; i++) -+ frame->rpl_tab[i] = tab; -+ } - -- frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts]; -+ frame->refPicList = tab->refPicList; - - return 0; - } -diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c -index 2231aed259..7b05b41441 100644 ---- a/libavcodec/hevcdec.c -+++ b/libavcodec/hevcdec.c -@@ -333,6 +333,19 @@ static void export_stream_params(HEVCContext *s, const HEVCSPS *sps) - - ff_set_sar(avctx, sps->vui.sar); - -+ if (sps->chroma_format_idc == 1) { -+ avctx->chroma_sample_location = sps->vui.chroma_loc_info_present_flag ? 
-+ sps->vui.chroma_sample_loc_type_top_field + 1 : -+ AVCHROMA_LOC_LEFT; -+ } -+ else if (sps->chroma_format_idc == 2 || -+ sps->chroma_format_idc == 3) { -+ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;; -+ } -+ else { -+ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED; -+ } -+ - if (sps->vui.video_signal_type_present_flag) - avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG - : AVCOL_RANGE_MPEG; -@@ -392,14 +405,20 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ - CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ - CONFIG_HEVC_NVDEC_HWACCEL + \ -+ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \ - CONFIG_HEVC_VAAPI_HWACCEL + \ - CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ -+ CONFIG_HEVC_RPI4_8_HWACCEL + \ -+ CONFIG_HEVC_RPI4_10_HWACCEL + \ - CONFIG_HEVC_VDPAU_HWACCEL) - enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; - - switch (sps->pix_fmt) { - case AV_PIX_FMT_YUV420P: - case AV_PIX_FMT_YUVJ420P: -+#if CONFIG_HEVC_RPI4_8_HWACCEL -+ *fmt++ = AV_PIX_FMT_RPI4_8; -+#endif - #if CONFIG_HEVC_DXVA2_HWACCEL - *fmt++ = AV_PIX_FMT_DXVA2_VLD; - #endif -@@ -418,9 +437,15 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - #endif - #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL - *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; -+#endif -+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL -+ *fmt++ = AV_PIX_FMT_DRM_PRIME; - #endif - break; - case AV_PIX_FMT_YUV420P10: -+#if CONFIG_HEVC_RPI4_10_HWACCEL -+ *fmt++ = AV_PIX_FMT_RPI4_10; -+#endif - #if CONFIG_HEVC_DXVA2_HWACCEL - *fmt++ = AV_PIX_FMT_DXVA2_VLD; - #endif -@@ -439,6 +464,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - #endif - #if CONFIG_HEVC_NVDEC_HWACCEL - *fmt++ = AV_PIX_FMT_CUDA; -+#endif -+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL -+ *fmt++ = AV_PIX_FMT_DRM_PRIME; - #endif - break; - case AV_PIX_FMT_YUV444P: -@@ -485,6 +513,16 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, - if (!sps) - 
return 0; - -+ // If hwaccel then we don't need all the s/w decode helper arrays -+ if (s->avctx->hwaccel) { -+ export_stream_params(s, sps); -+ -+ s->avctx->pix_fmt = pix_fmt; -+ s->ps.sps = sps; -+ s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; -+ return 0; -+ } -+ - ret = pic_arrays_init(s, sps); - if (ret < 0) - goto fail; -@@ -2901,11 +2939,13 @@ static int hevc_frame_start(HEVCContext *s) - ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1); - int ret; - -- memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); -- memset(s->vertical_bs, 0, s->bs_width * s->bs_height); -- memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); -- memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); -- memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); -+ if (s->horizontal_bs) { -+ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); -+ memset(s->vertical_bs, 0, s->bs_width * s->bs_height); -+ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); -+ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); -+ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); -+ } - - s->is_decoded = 0; - s->first_nal_type = s->nal_unit_type; -@@ -3327,7 +3367,14 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output, - s->ref = NULL; - ret = decode_nal_units(s, avpkt->data, avpkt->size); - if (ret < 0) -+ { -+ // Ensure that hwaccel knows this frame is over -+ if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) { -+ s->avctx->hwaccel->abort_frame(s->avctx); -+ } -+ - return ret; -+ } - - if (avctx->hwaccel) { - if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) { -@@ -3370,15 +3417,19 @@ static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src) - if (ret < 0) - return ret; - -- dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); -- if 
(!dst->tab_mvf_buf) -- goto fail; -- dst->tab_mvf = src->tab_mvf; -+ if (src->tab_mvf_buf) { -+ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); -+ if (!dst->tab_mvf_buf) -+ goto fail; -+ dst->tab_mvf = src->tab_mvf; -+ } - -- dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); -- if (!dst->rpl_tab_buf) -- goto fail; -- dst->rpl_tab = src->rpl_tab; -+ if (src->rpl_tab_buf) { -+ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); -+ if (!dst->rpl_tab_buf) -+ goto fail; -+ dst->rpl_tab = src->rpl_tab; -+ } - - dst->rpl_buf = av_buffer_ref(src->rpl_buf); - if (!dst->rpl_buf) -@@ -3697,6 +3748,15 @@ AVCodec ff_hevc_decoder = { - #if CONFIG_HEVC_NVDEC_HWACCEL - HWACCEL_NVDEC(hevc), - #endif -+#if CONFIG_HEVC_RPI4_8_HWACCEL -+ HWACCEL_RPI4_8(hevc), -+#endif -+#if CONFIG_HEVC_RPI4_10_HWACCEL -+ HWACCEL_RPI4_10(hevc), -+#endif -+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL -+ HWACCEL_V4L2REQUEST(hevc), -+#endif - #if CONFIG_HEVC_VAAPI_HWACCEL - HWACCEL_VAAPI(hevc), - #endif -diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h -index 8e54cf73f9..2277aadf75 100644 ---- a/libavcodec/hwaccels.h -+++ b/libavcodec/hwaccels.h -@@ -39,6 +39,9 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel; - extern const AVHWAccel ff_hevc_d3d11va2_hwaccel; - extern const AVHWAccel ff_hevc_dxva2_hwaccel; - extern const AVHWAccel ff_hevc_nvdec_hwaccel; -+extern const AVHWAccel ff_hevc_rpi4_8_hwaccel; -+extern const AVHWAccel ff_hevc_rpi4_10_hwaccel; -+extern const AVHWAccel ff_hevc_v4l2request_hwaccel; - extern const AVHWAccel ff_hevc_vaapi_hwaccel; - extern const AVHWAccel ff_hevc_vdpau_hwaccel; - extern const AVHWAccel ff_hevc_videotoolbox_hwaccel; -diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h -index f421dc909f..f93283b893 100644 ---- a/libavcodec/hwconfig.h -+++ b/libavcodec/hwconfig.h -@@ -24,6 +24,7 @@ - - - #define HWACCEL_CAP_ASYNC_SAFE (1 << 0) -+#define HWACCEL_CAP_MT_SAFE (1 << 1) - - - typedef struct AVCodecHWConfigInternal { -@@ -70,6 +71,12 @@ typedef struct 
AVCodecHWConfigInternal { - HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel) - #define HWACCEL_NVDEC(codec) \ - HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel) -+#define HWACCEL_RPI4_8(codec) \ -+ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_8, NONE, ff_ ## codec ## _rpi4_8_hwaccel) -+#define HWACCEL_RPI4_10(codec) \ -+ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_10, NONE, ff_ ## codec ## _rpi4_10_hwaccel) -+#define HWACCEL_V4L2REQUEST(codec) \ -+ HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel) - #define HWACCEL_VAAPI(codec) \ - HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel) - #define HWACCEL_VDPAU(codec) \ -diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c -index cb15ac072a..f6261db962 100644 ---- a/libavcodec/mmaldec.c -+++ b/libavcodec/mmaldec.c -@@ -24,6 +24,9 @@ - * MMAL Video Decoder - */ - -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" - #include - #include - #include -@@ -31,6 +34,7 @@ - #include - #include - #include -+#pragma GCC diagnostic pop - #include - - #include "avcodec.h" -diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c -index 9176027f15..0b0ff03c18 100644 ---- a/libavcodec/pthread_frame.c -+++ b/libavcodec/pthread_frame.c -@@ -209,7 +209,8 @@ FF_ENABLE_DEPRECATION_WARNINGS - - /* if the previous thread uses hwaccel then we take the lock to ensure - * the threads don't run concurrently */ -- if (avctx->hwaccel) { -+ if (avctx->hwaccel && -+ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { - pthread_mutex_lock(&p->parent->hwaccel_mutex); - p->hwaccel_serializing = 1; - } -@@ -636,7 +637,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { - - if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; - -- if (avctx->hwaccel && !p->hwaccel_serializing) { -+ if (avctx->hwaccel && -+ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && 
-+ !p->hwaccel_serializing) { - pthread_mutex_lock(&p->parent->hwaccel_mutex); - p->hwaccel_serializing = 1; - } -diff --git a/libavcodec/raw.c b/libavcodec/raw.c -index 079d5c5d10..0781f28615 100644 ---- a/libavcodec/raw.c -+++ b/libavcodec/raw.c -@@ -294,6 +294,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { - { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ - { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ - -+ /* RPI (Might as well define for everything) */ -+ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, -+ { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') }, -+ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, -+ { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') }, -+ - { AV_PIX_FMT_NONE, 0 }, - }; - -diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c -index d181b74570..b943dd0379 100644 ---- a/libavcodec/rawenc.c -+++ b/libavcodec/rawenc.c -@@ -24,6 +24,7 @@ - * Raw Video Encoder - */ - -+#include "config.h" - #include "avcodec.h" - #include "raw.h" - #include "internal.h" -@@ -31,6 +32,10 @@ - #include "libavutil/intreadwrite.h" - #include "libavutil/imgutils.h" - #include "libavutil/internal.h" -+#include "libavutil/avassert.h" -+#if CONFIG_SAND -+#include "libavutil/rpi_sand_fns.h" -+#endif - - static av_cold int raw_encode_init(AVCodecContext *avctx) - { -@@ -49,22 +54,114 @@ FF_ENABLE_DEPRECATION_WARNINGS - return 0; - } - -+#if CONFIG_SAND -+static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, -+ const AVFrame *frame) -+{ -+ const int width = av_frame_cropped_width(frame); -+ const int height = av_frame_cropped_height(frame); -+ const int x0 = frame->crop_left; -+ const int y0 = frame->crop_top; -+ const int size = width * height * 3 / 2; -+ uint8_t * dst; -+ int ret; -+ -+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) -+ return ret; -+ -+ dst = pkt->data; -+ -+ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, 
width, height); -+ dst += width * height; -+ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, -+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); -+ return 0; -+} -+ -+static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, -+ const AVFrame *frame) -+{ -+ const int width = av_frame_cropped_width(frame); -+ const int height = av_frame_cropped_height(frame); -+ const int x0 = frame->crop_left; -+ const int y0 = frame->crop_top; -+ const int size = width * height * 3; -+ uint8_t * dst; -+ int ret; -+ -+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) -+ return ret; -+ -+ dst = pkt->data; -+ -+ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); -+ dst += width * height * 2; -+ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, -+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); -+ return 0; -+} -+ -+static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, -+ const AVFrame *frame) -+{ -+ const int width = av_frame_cropped_width(frame); -+ const int height = av_frame_cropped_height(frame); -+ const int x0 = frame->crop_left; -+ const int y0 = frame->crop_top; -+ const int size = width * height * 3; -+ uint8_t * dst; -+ int ret; -+ -+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) -+ return ret; -+ -+ dst = pkt->data; -+ -+ av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); -+ dst += width * height * 2; -+ av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width, -+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2); -+ return 0; -+} -+#endif -+ -+ - static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, -- const AVFrame *frame, int 
*got_packet) -+ const AVFrame *src_frame, int *got_packet) - { -- int ret = av_image_get_buffer_size(frame->format, -- frame->width, frame->height, 1); -+ int ret; -+ AVFrame * frame = NULL; - -- if (ret < 0) -+#if CONFIG_SAND -+ if (av_rpi_is_sand_frame(src_frame)) { -+ ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) : -+ av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) : -+ av_rpi_is_sand30_frame(src_frame) ? raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1; -+ *got_packet = (ret == 0); - return ret; -+ } -+#endif -+ -+ if ((frame = av_frame_clone(src_frame)) == NULL) { -+ ret = AVERROR(ENOMEM); -+ goto fail; -+ } -+ -+ if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0) -+ goto fail; -+ -+ ret = av_image_get_buffer_size(frame->format, -+ frame->width, frame->height, 1); -+ if (ret < 0) -+ goto fail; - - if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) -- return ret; -+ goto fail; - if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, - (const uint8_t **)frame->data, frame->linesize, - frame->format, - frame->width, frame->height, 1)) < 0) -- return ret; -+ goto fail; - - if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 && - frame->format == AV_PIX_FMT_YUYV422) { -@@ -81,8 +178,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, - } - } - pkt->flags |= AV_PKT_FLAG_KEY; -+ av_frame_free(&frame); - *got_packet = 1; - return 0; -+ -+fail: -+ av_frame_free(&frame); -+ *got_packet = 0; -+ return ret; - } - - AVCodec ff_rawvideo_encoder = { -diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c -new file mode 100644 -index 0000000000..58c094c5f8 ---- /dev/null -+++ b/libavcodec/rpi_hevc_cabac.c -@@ -0,0 +1,2257 @@ -+/* -+ * HEVC CABAC decoding -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2018 John Cox, Ben Avison, Peter 
de Rivaz for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#define UNCHECKED_BITSTREAM_READER 1 -+ -+#include "libavutil/attributes.h" -+#include "libavutil/common.h" -+ -+#include "cabac_functions.h" -+#include "rpi_hevc_data.h" -+#include "hevc.h" -+#include "rpi_hevcdec.h" -+#include "rpi_hevc_cabac_fns.h" -+ -+#include "libavutil/rpi_sand_fns.h" -+ -+// BY22 is probably faster than simple bypass if the processor has -+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction -+// x86 has fast int divide -+// Arm doesn't have divide or general fast 64 bit, but does have the multiply -+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used -+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86) -+// Use native divide if we have a fast one - otherwise use mpy 1/x -+// x86 has a fast integer divide - arm doesn't - unsure about other -+// architectures -+#define USE_BY22_DIV ARCH_X86 -+ -+// Special case blocks with a single significant ceoff -+// Decreases the complexity of the code for a common case but increases the -+// code size. 
-+#define USE_N_END_1 1 -+ -+#if !USE_BY22_DIV -+// * 1/x @ 32 bits gets us 22 bits of accuracy -+#define CABAC_BY22_PEEK_BITS 22 -+#else -+// A real 32-bit divide gets us another bit -+// If we have a 64 bit int & a unit time divider then we should get a lot -+// of bits (55) but that is untested and it is unclear if it would give -+// us a large advantage -+#define CABAC_BY22_PEEK_BITS 23 -+#endif -+ -+#define CABAC_MAX_BIN 31 -+ -+ -+#if USE_BY22 && !USE_BY22_DIV -+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL) -+ -+static const uint32_t cabac_by22_inv_range[256] = { -+ 0, I(257), I(258), I(259), -+ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269), -+ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279), -+ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289), -+ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299), -+ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309), -+ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319), -+ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329), -+ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339), -+ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349), -+ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359), -+ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369), -+ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379), -+ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389), -+ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399), -+ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409), -+ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419), -+ I(420), I(421), I(422), I(423), I(424), 
I(425), I(426), I(427), I(428), I(429), -+ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439), -+ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449), -+ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459), -+ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469), -+ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479), -+ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489), -+ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499), -+ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509), -+ I(510), I(511) -+}; -+#undef I -+#endif // USE_BY22 -+ -+#if ARCH_ARM -+#include "arm/rpi_hevc_cabac.h" -+#endif -+ -+/** -+ * number of bin by SyntaxElement. -+ */ -+static const int8_t num_bins_in_se[] = { -+ 1, // sao_merge_flag -+ 1, // sao_type_idx -+ 0, // sao_eo_class -+ 0, // sao_band_position -+ 0, // sao_offset_abs -+ 0, // sao_offset_sign -+ 0, // end_of_slice_flag -+ 3, // split_coding_unit_flag -+ 1, // cu_transquant_bypass_flag -+ 3, // skip_flag -+ 3, // cu_qp_delta -+ 1, // pred_mode -+ 4, // part_mode -+ 0, // pcm_flag -+ 1, // prev_intra_luma_pred_mode -+ 0, // mpm_idx -+ 0, // rem_intra_luma_pred_mode -+ 2, // intra_chroma_pred_mode -+ 1, // merge_flag -+ 1, // merge_idx -+ 5, // inter_pred_idc -+ 2, // ref_idx_l0 -+ 2, // ref_idx_l1 -+ 2, // abs_mvd_greater0_flag -+ 2, // abs_mvd_greater1_flag -+ 0, // abs_mvd_minus2 -+ 0, // mvd_sign_flag -+ 1, // mvp_lx_flag -+ 1, // no_residual_data_flag -+ 3, // split_transform_flag -+ 2, // cbf_luma -+ 4, // cbf_cb, cbf_cr -+ 2, // transform_skip_flag[][] -+ 2, // explicit_rdpcm_flag[][] -+ 2, // explicit_rdpcm_dir_flag[][] -+ 18, // last_significant_coeff_x_prefix -+ 18, // last_significant_coeff_y_prefix -+ 0, // last_significant_coeff_x_suffix -+ 0, // last_significant_coeff_y_suffix -+ 4, // 
significant_coeff_group_flag -+ 44, // significant_coeff_flag -+ 24, // coeff_abs_level_greater1_flag -+ 6, // coeff_abs_level_greater2_flag -+ 0, // coeff_abs_level_remaining -+ 0, // coeff_sign_flag -+ 8, // log2_res_scale_abs -+ 2, // res_scale_sign_flag -+ 1, // cu_chroma_qp_offset_flag -+ 1, // cu_chroma_qp_offset_idx -+}; -+ -+/** -+ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement. -+ */ -+static const int elem_offset[sizeof(num_bins_in_se)] = { -+ 0, // sao_merge_flag -+ 1, // sao_type_idx -+ 2, // sao_eo_class -+ 2, // sao_band_position -+ 2, // sao_offset_abs -+ 2, // sao_offset_sign -+ 2, // end_of_slice_flag -+ 2, // split_coding_unit_flag -+ 5, // cu_transquant_bypass_flag -+ 6, // skip_flag -+ 9, // cu_qp_delta -+ 12, // pred_mode -+ 13, // part_mode -+ 17, // pcm_flag -+ 17, // prev_intra_luma_pred_mode -+ 18, // mpm_idx -+ 18, // rem_intra_luma_pred_mode -+ 18, // intra_chroma_pred_mode -+ 20, // merge_flag -+ 21, // merge_idx -+ 22, // inter_pred_idc -+ 27, // ref_idx_l0 -+ 29, // ref_idx_l1 -+ 31, // abs_mvd_greater0_flag -+ 33, // abs_mvd_greater1_flag -+ 35, // abs_mvd_minus2 -+ 35, // mvd_sign_flag -+ 35, // mvp_lx_flag -+ 36, // no_residual_data_flag -+ 37, // split_transform_flag -+ 40, // cbf_luma -+ 42, // cbf_cb, cbf_cr -+ 46, // transform_skip_flag[][] -+ 48, // explicit_rdpcm_flag[][] -+ 50, // explicit_rdpcm_dir_flag[][] -+ 52, // last_significant_coeff_x_prefix -+ 70, // last_significant_coeff_y_prefix -+ 88, // last_significant_coeff_x_suffix -+ 88, // last_significant_coeff_y_suffix -+ 88, // significant_coeff_group_flag -+ 92, // significant_coeff_flag -+ 136, // coeff_abs_level_greater1_flag -+ 160, // coeff_abs_level_greater2_flag -+ 166, // coeff_abs_level_remaining -+ 166, // coeff_sign_flag -+ 166, // log2_res_scale_abs -+ 174, // res_scale_sign_flag -+ 176, // cu_chroma_qp_offset_flag -+ 177, // cu_chroma_qp_offset_idx -+}; -+ -+#define CNU 154 -+/** -+ * Indexed by init_type -+ */ -+static const 
uint8_t init_values[3][HEVC_CONTEXTS] = { -+ { // sao_merge_flag -+ 153, -+ // sao_type_idx -+ 200, -+ // split_coding_unit_flag -+ 139, 141, 157, -+ // cu_transquant_bypass_flag -+ 154, -+ // skip_flag -+ CNU, CNU, CNU, -+ // cu_qp_delta -+ 154, 154, 154, -+ // pred_mode -+ CNU, -+ // part_mode -+ 184, CNU, CNU, CNU, -+ // prev_intra_luma_pred_mode -+ 184, -+ // intra_chroma_pred_mode -+ 63, 139, -+ // merge_flag -+ CNU, -+ // merge_idx -+ CNU, -+ // inter_pred_idc -+ CNU, CNU, CNU, CNU, CNU, -+ // ref_idx_l0 -+ CNU, CNU, -+ // ref_idx_l1 -+ CNU, CNU, -+ // abs_mvd_greater1_flag -+ CNU, CNU, -+ // abs_mvd_greater1_flag -+ CNU, CNU, -+ // mvp_lx_flag -+ CNU, -+ // no_residual_data_flag -+ CNU, -+ // split_transform_flag -+ 153, 138, 138, -+ // cbf_luma -+ 111, 141, -+ // cbf_cb, cbf_cr -+ 94, 138, 182, 154, -+ // transform_skip_flag -+ 139, 139, -+ // explicit_rdpcm_flag -+ 139, 139, -+ // explicit_rdpcm_dir_flag -+ 139, 139, -+ // last_significant_coeff_x_prefix -+ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, -+ 79, 108, 123, 63, -+ // last_significant_coeff_y_prefix -+ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, -+ 79, 108, 123, 63, -+ // significant_coeff_group_flag -+ 91, 171, 134, 141, -+ // significant_coeff_flag -+ 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153, -+ 125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140, -+ 139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111, -+ 141, 111, -+ // coeff_abs_level_greater1_flag -+ 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92, 139, 107, -+ 122, 152, 140, 179, 166, 182, 140, 227, 122, 197, -+ // coeff_abs_level_greater2_flag -+ 138, 153, 136, 167, 152, 152, -+ // log2_res_scale_abs -+ 154, 154, 154, 154, 154, 154, 154, 154, -+ // res_scale_sign_flag -+ 154, 154, -+ // cu_chroma_qp_offset_flag -+ 154, -+ // cu_chroma_qp_offset_idx -+ 154, -+ }, -+ { // sao_merge_flag -+ 153, -+ // sao_type_idx -+ 185, -+ 
// split_coding_unit_flag -+ 107, 139, 126, -+ // cu_transquant_bypass_flag -+ 154, -+ // skip_flag -+ 197, 185, 201, -+ // cu_qp_delta -+ 154, 154, 154, -+ // pred_mode -+ 149, -+ // part_mode -+ 154, 139, 154, 154, -+ // prev_intra_luma_pred_mode -+ 154, -+ // intra_chroma_pred_mode -+ 152, 139, -+ // merge_flag -+ 110, -+ // merge_idx -+ 122, -+ // inter_pred_idc -+ 95, 79, 63, 31, 31, -+ // ref_idx_l0 -+ 153, 153, -+ // ref_idx_l1 -+ 153, 153, -+ // abs_mvd_greater1_flag -+ 140, 198, -+ // abs_mvd_greater1_flag -+ 140, 198, -+ // mvp_lx_flag -+ 168, -+ // no_residual_data_flag -+ 79, -+ // split_transform_flag -+ 124, 138, 94, -+ // cbf_luma -+ 153, 111, -+ // cbf_cb, cbf_cr -+ 149, 107, 167, 154, -+ // transform_skip_flag -+ 139, 139, -+ // explicit_rdpcm_flag -+ 139, 139, -+ // explicit_rdpcm_dir_flag -+ 139, 139, -+ // last_significant_coeff_x_prefix -+ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, -+ 94, 108, 123, 108, -+ // last_significant_coeff_y_prefix -+ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, -+ 94, 108, 123, 108, -+ // significant_coeff_group_flag -+ 121, 140, 61, 154, -+ // significant_coeff_flag -+ 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153, -+ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, -+ 153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140, -+ 140, 140, -+ // coeff_abs_level_greater1_flag -+ 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, -+ 136, 137, 169, 194, 166, 167, 154, 167, 137, 182, -+ // coeff_abs_level_greater2_flag -+ 107, 167, 91, 122, 107, 167, -+ // log2_res_scale_abs -+ 154, 154, 154, 154, 154, 154, 154, 154, -+ // res_scale_sign_flag -+ 154, 154, -+ // cu_chroma_qp_offset_flag -+ 154, -+ // cu_chroma_qp_offset_idx -+ 154, -+ }, -+ { // sao_merge_flag -+ 153, -+ // sao_type_idx -+ 160, -+ // split_coding_unit_flag -+ 107, 139, 126, -+ // cu_transquant_bypass_flag -+ 154, -+ // skip_flag -+ 197, 185, 
201, -+ // cu_qp_delta -+ 154, 154, 154, -+ // pred_mode -+ 134, -+ // part_mode -+ 154, 139, 154, 154, -+ // prev_intra_luma_pred_mode -+ 183, -+ // intra_chroma_pred_mode -+ 152, 139, -+ // merge_flag -+ 154, -+ // merge_idx -+ 137, -+ // inter_pred_idc -+ 95, 79, 63, 31, 31, -+ // ref_idx_l0 -+ 153, 153, -+ // ref_idx_l1 -+ 153, 153, -+ // abs_mvd_greater1_flag -+ 169, 198, -+ // abs_mvd_greater1_flag -+ 169, 198, -+ // mvp_lx_flag -+ 168, -+ // no_residual_data_flag -+ 79, -+ // split_transform_flag -+ 224, 167, 122, -+ // cbf_luma -+ 153, 111, -+ // cbf_cb, cbf_cr -+ 149, 92, 167, 154, -+ // transform_skip_flag -+ 139, 139, -+ // explicit_rdpcm_flag -+ 139, 139, -+ // explicit_rdpcm_dir_flag -+ 139, 139, -+ // last_significant_coeff_x_prefix -+ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, -+ 79, 108, 123, 93, -+ // last_significant_coeff_y_prefix -+ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, -+ 79, 108, 123, 93, -+ // significant_coeff_group_flag -+ 121, 140, 61, 154, -+ // significant_coeff_flag -+ 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153, -+ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, -+ 153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140, -+ 140, 140, -+ // coeff_abs_level_greater1_flag -+ 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, -+ 136, 122, 169, 208, 166, 167, 154, 152, 167, 182, -+ // coeff_abs_level_greater2_flag -+ 107, 167, 91, 107, 107, 167, -+ // log2_res_scale_abs -+ 154, 154, 154, 154, 154, 154, 154, 154, -+ // res_scale_sign_flag -+ 154, 154, -+ // cu_chroma_qp_offset_flag -+ 154, -+ // cu_chroma_qp_offset_idx -+ 154, -+ }, -+}; -+ -+static const uint8_t scan_1x1[1] = { -+ 0, -+}; -+ -+static const uint8_t horiz_scan2x2_x[4] = { -+ 0, 1, 0, 1, -+}; -+ -+static const uint8_t horiz_scan2x2_y[4] = { -+ 0, 0, 1, 1 -+}; -+ -+static const uint8_t horiz_scan4x4_x[16] = { -+ 0, 1, 2, 3, -+ 0, 1, 2, 3, -+ 0, 1, 
2, 3, -+ 0, 1, 2, 3, -+}; -+ -+static const uint8_t horiz_scan4x4_y[16] = { -+ 0, 0, 0, 0, -+ 1, 1, 1, 1, -+ 2, 2, 2, 2, -+ 3, 3, 3, 3, -+}; -+ -+static const uint8_t horiz_scan8x8_inv[8][8] = { -+ { 0, 1, 2, 3, 16, 17, 18, 19, }, -+ { 4, 5, 6, 7, 20, 21, 22, 23, }, -+ { 8, 9, 10, 11, 24, 25, 26, 27, }, -+ { 12, 13, 14, 15, 28, 29, 30, 31, }, -+ { 32, 33, 34, 35, 48, 49, 50, 51, }, -+ { 36, 37, 38, 39, 52, 53, 54, 55, }, -+ { 40, 41, 42, 43, 56, 57, 58, 59, }, -+ { 44, 45, 46, 47, 60, 61, 62, 63, }, -+}; -+ -+static const uint8_t diag_scan2x2_x[4] = { -+ 0, 0, 1, 1, -+}; -+ -+static const uint8_t diag_scan2x2_y[4] = { -+ 0, 1, 0, 1, -+}; -+ -+static const uint8_t diag_scan2x2_inv[2][2] = { -+ { 0, 2, }, -+ { 1, 3, }, -+}; -+ -+static const uint8_t diag_scan4x4_inv[4][4] = { -+ { 0, 2, 5, 9, }, -+ { 1, 4, 8, 12, }, -+ { 3, 7, 11, 14, }, -+ { 6, 10, 13, 15, }, -+}; -+ -+static const uint8_t diag_scan8x8_inv[8][8] = { -+ { 0, 2, 5, 9, 14, 20, 27, 35, }, -+ { 1, 4, 8, 13, 19, 26, 34, 42, }, -+ { 3, 7, 12, 18, 25, 33, 41, 48, }, -+ { 6, 11, 17, 24, 32, 40, 47, 53, }, -+ { 10, 16, 23, 31, 39, 46, 52, 57, }, -+ { 15, 22, 30, 38, 45, 51, 56, 60, }, -+ { 21, 29, 37, 44, 50, 55, 59, 62, }, -+ { 28, 36, 43, 49, 54, 58, 61, 63, }, -+}; -+ -+ -+typedef struct -+{ -+ uint16_t coeff; -+ uint16_t scale; -+} xy_off_t; -+ -+#define XYT_C(x,y,t) ((x) + ((y) << (t))) -+#define SCALE_TRAFO(t) ((t) > 3 ? 
3 : (t)) -+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t)) -+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t))) -+ -+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)} -+ -+#define OFF_DIAG(t) {\ -+ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\ -+ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\ -+ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\ -+ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\ -+} -+ -+#define OFF_HORIZ(t) {\ -+ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\ -+ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\ -+ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\ -+ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\ -+} -+ -+#define OFF_VERT(t) {\ -+ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\ -+ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\ -+ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\ -+ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\ -+} -+ -+static const xy_off_t off_xys[3][4][16] = -+{ -+ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)}, -+ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)}, -+ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)} -+}; -+ -+ -+// Helper fns -+#ifndef hevc_mem_bits32 -+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset) -+{ -+ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7); -+} -+#endif -+ -+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32) -+#define hevc_clz32 hevc_clz32_builtin -+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x) -+{ -+ // __builtin_clz says it works on ints - so adjust if int is >32 bits long -+ return __builtin_clz(x) - (sizeof(int) * 8 - 32); -+} -+#endif -+ -+// It is unlikely that we will ever need this but include for completeness -+#ifndef hevc_clz32 -+static inline unsigned int hevc_clz32(unsigned int x) -+{ -+ unsigned int n = 1; -+ if ((x & 0xffff0000) == 0) { -+ n += 16; -+ x <<= 16; -+ } -+ if ((x & 0xff000000) == 0) { -+ n += 8; -+ 
x <<= 8; -+ } -+ if ((x & 0xf0000000) == 0) { -+ n += 4; -+ x <<= 4; -+ } -+ if ((x & 0xc0000000) == 0) { -+ n += 2; -+ x <<= 2; -+ } -+ return n - ((x >> 31) & 1); -+} -+#endif -+ -+static inline int cabac_overflow(const CABACContext * const cc) -+{ -+ av_assert0(cc->bytestream >= cc->bytestream_start); -+ return cc->bytestream >= cc->bytestream_end + 4; -+} -+ -+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc) -+{ -+ return cabac_overflow(&lc->cc); -+} -+ -+#if !USE_BY22 -+// If no by22 then _by22 functions will revert to normal and so _peek/_flush -+// will no longer be called but the setup calls will still exist and we want -+// to null them out -+#define bypass_start(s) -+#define bypass_finish(s) -+#else -+// Use BY22 for residual bypass block -+ -+#define bypass_start(cc) get_cabac_by22_start(cc) -+#define bypass_finish(cc) get_cabac_by22_finish(cc) -+ -+// BY22 notes that bypass is simply a divide into the bitstream and so we -+// can peek out large quantities of bits at once and treat the result as if -+// it was VLC. In many cases this will lead to O(1) processing rather than -+// O(n) though the setup and teardown is sufficiently expensive that it is -+// only worth using if we expect to be dealing with more than a few bits -+// The definition of "a few bits" will vary from platform to platform but -+// tests on ARM show that it probably isn't worth it for a single coded -+// residual, but is for >1 - it also seems likely that if there are -+// more residuals then they are likely to be bigger and this will make the -+// O(1) nature of the code more worthwhile. -+ -+ -+// Bypass block start -+// Must be called before _by22_peek is used as it sets the CABAC environment -+// into the correct state. _by22_finish must be called to return to 'normal' -+// (i.e. 
non-bypass) cabac decoding -+#ifndef get_cabac_by22_start -+static inline void get_cabac_by22_start(CABACContext * const c) -+{ -+ const unsigned int bits = __builtin_ctz(c->low); -+ const uint32_t m = hevc_mem_bits32(c->bytestream, 0); -+ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits)); -+#if !USE_BY22_DIV -+ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff]; -+#endif -+ -+ c->bytestream -= (CABAC_BITS / 8); -+ c->by22.bits = bits; -+#if !USE_BY22_DIV -+ c->by22.range = c->range; -+ c->range = inv; -+#endif -+ c->low = x; -+} -+#endif -+ -+// Bypass block finish -+// Must be called at the end of the bypass block to return to normal operation -+static inline void get_cabac_by22_finish(CABACContext * const c) -+{ -+ unsigned int used = c->by22.bits; -+ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8); -+ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7); -+ -+ c->bytestream += bytes_used + (CABAC_BITS / 8); -+ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used; -+#if !USE_BY22_DIV -+ c->range = c->by22.range; -+#endif -+} -+ -+// Peek bypass bits -+// _by22_start must be called before _by22_peek is called and _by22_flush -+// must be called afterwards to flush any used bits -+// The actual number of valid bits returned is -+// min(, CABAC_BY22_PEEK_BITS). 
CABAC_BY22_PEEK_BITS -+// will be at least 22 which should be long enough for any prefix or suffix -+// though probably not long enough for the worst case combination -+#ifndef get_cabac_by22_peek -+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c) -+{ -+#if USE_BY22_DIV -+ return ((unsigned int)c->low / (unsigned int)c->range) << 9; -+#else -+ uint32_t x = c->low & ~1U; -+ const uint32_t inv = c->range; -+ -+ if (inv != 0) -+ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32); -+ -+ return x << 1; -+#endif -+} -+#endif -+ -+// Flush bypass bits peeked by _by22_peek -+// Flush n bypass bits. n must be >= 1 to guarantee correct operation -+// val is an unmodified copy of whatever _by22_peek returned -+#ifndef get_cabac_by22_flush -+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val) -+{ -+ // Subtract the bits used & reshift up to the top of the word -+#if USE_BY22_DIV -+ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23)); -+#else -+ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23)); -+#endif -+ -+ // and refill lower bits -+ // We will probably OR over some existing bits but that doesn't matter -+ c->by22.bits += n; -+ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9); -+} -+#endif -+ -+#endif // USE_BY22 -+ -+ -+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc) -+{ -+ memcpy(s->cabac_save->rice, lc->stat_coeff, 4); -+ memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS); -+} -+ -+static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ memcpy(lc->stat_coeff, s->cabac_save->rice, 4); -+ memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS); -+} -+ -+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc) -+{ -+ GetBitContext * const gb = &lc->gb; -+ skip_bits(gb, 1); -+ 
align_get_bits(gb); -+ return ff_init_cabac_decoder(&lc->cc, -+ gb->buffer + get_bits_count(gb) / 8, -+ (get_bits_left(gb) + 7) / 8); -+} -+ -+static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ int init_type = 2 - s->sh.slice_type; -+ int i; -+ -+ if (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) -+ init_type ^= 3; -+ -+ for (i = 0; i < HEVC_CONTEXTS; i++) { -+ int init_value = init_values[init_type][i]; -+ int m = (init_value >> 4) * 5 - 45; -+ int n = ((init_value & 15) << 3) - 16; -+ int pre = 2 * (((m * av_clip(s->sh.slice_qp, 0, 51)) >> 4) + n) - 127; -+ -+ pre ^= pre >> 31; -+ if (pre > 124) -+ pre = 124 + (pre & 1); -+ lc->cabac_state[i] = pre; -+ } -+ -+ for (i = 0; i < 4; i++) -+ lc->stat_coeff[i] = 0; -+} -+ -+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags) -+{ -+ if (lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0) -+ { -+ lc->qPy_pred = s->sh.slice_qp; -+ cabac_init_state(s, lc); -+ } -+ else if ((ctb_flags & CTB_TS_FLAGS_CLOAD) != 0) -+ { -+ lc->qPy_pred = s->sh.slice_qp; -+ load_states(s, lc); -+ } -+ lc->cabac_init_req = 0; -+} -+ -+#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx)) -+ -+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state) -+{ -+ return get_cabac_inline(c, state); -+} -+ -+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c) -+{ -+ return get_cabac_terminate(c); -+} -+ -+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc) -+{ -+ if (!GET_CABAC_LC(elem_offset[SAO_TYPE_IDX])) -+ return 0; -+ -+ if (!get_cabac_bypass(&lc->cc)) -+ return SAO_BAND; -+ return SAO_EDGE; -+} -+ -+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc) -+{ -+ int i; -+ int value = get_cabac_bypass(&lc->cc); -+ -+ for (i = 0; i < 4; i++) -+ value = (value << 1) | get_cabac_bypass(&lc->cc); -+ return value; -+} -+ -+int 
ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ int i = 0; -+ int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1; -+ -+ while (i < length && get_cabac_bypass(&lc->cc)) -+ i++; -+ return i; -+} -+ -+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc) -+{ -+ return get_cabac_bypass(&lc->cc); -+} -+ -+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc) -+{ -+ int ret = get_cabac_bypass(&lc->cc) << 1; -+ ret |= get_cabac_bypass(&lc->cc); -+ return ret; -+} -+ -+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc) -+{ -+ int val = 1; -+ -+ if (get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA) == 0) -+ return 0; -+ -+ while (val < 5 && -+ get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA + 1) != 0) -+ val++; -+ -+ if (val >= 5) { -+ unsigned int k = 0; -+ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { -+ val += 1 << k; -+ k++; -+ } -+// if (k == CABAC_MAX_BIN) -+// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); -+ -+ while (k--) -+ val += get_cabac_bypass(&lc->cc) << k; -+ } -+ return get_cabac_bypass(&lc->cc) ? 
-val : val; -+} -+ -+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ int c_max= FFMAX(5, s->ps.pps->chroma_qp_offset_list_len_minus1); -+ int i = 0; -+ -+ while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX])) -+ i++; -+ -+ return i; -+} -+ -+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size) -+{ -+ if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1 -+ return PART_2Nx2N; -+ if (log2_cb_size == s->ps.sps->log2_min_cb_size) { -+ if (lc->cu.pred_mode == MODE_INTRA) // 0 -+ return PART_NxN; -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 -+ return PART_2NxN; -+ if (log2_cb_size == 3) // 00 -+ return PART_Nx2N; -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001 -+ return PART_Nx2N; -+ return PART_NxN; // 000 -+ } -+ -+ if (!s->ps.sps->amp_enabled_flag) { -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 -+ return PART_2NxN; -+ return PART_Nx2N; -+ } -+ -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011 -+ return PART_2NxN; -+ if (get_cabac_bypass(&lc->cc)) // 0101 -+ return PART_2NxnD; -+ return PART_2NxnU; // 0100 -+ } -+ -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001 -+ return PART_Nx2N; -+ if (get_cabac_bypass(&lc->cc)) // 0001 -+ return PART_nRx2N; -+ return PART_nLx2N; // 0000 -+} -+ -+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc) -+{ -+ int i = 0; -+ while (i < 2 && get_cabac_bypass(&lc->cc)) -+ i++; -+ return i; -+} -+ -+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc) -+{ -+ int i; -+ int value = get_cabac_bypass(&lc->cc); -+ -+ for (i = 0; i < 4; i++) -+ value = (value << 1) | get_cabac_bypass(&lc->cc); -+ return value; -+} -+ -+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc) -+{ -+ int ret; -+ if 
(!GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE])) -+ return 4; -+ -+ ret = get_cabac_bypass(&lc->cc) << 1; -+ ret |= get_cabac_bypass(&lc->cc); -+ return ret; -+} -+ -+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ int i = GET_CABAC_LC(elem_offset[MERGE_IDX]); -+ -+ if (i != 0) { -+ while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc)) -+ i++; -+ } -+ return i; -+} -+ -+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH) -+{ -+ if (nPbW + nPbH == 12) -+ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); -+ if (GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + lc->ct_depth)) -+ return PRED_BI; -+ -+ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); -+} -+ -+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx) -+{ -+ int i = 0; -+ int max = num_ref_idx_lx - 1; -+ int max_ctx = FFMIN(max, 2); -+ -+ while (i < max_ctx && GET_CABAC_LC(elem_offset[REF_IDX_L0] + i)) -+ i++; -+ if (i == 2) { -+ while (i < max && get_cabac_bypass(&lc->cc)) -+ i++; -+ } -+ -+ return i; -+} -+ -+static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]); -+} -+ -+static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1); -+} -+ -+#if !USE_BY22 -+static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc) -+{ -+ int ret = 2; -+ int k = 1; -+ -+ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { -+ ret += 1U << k; -+ k++; -+ } -+ if (k == CABAC_MAX_BIN) { -+ av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); -+ return 0; -+ } -+ -+ while (k--) -+ ret += get_cabac_bypass(&lc->cc) << k; -+ return get_cabac_bypass_sign(&lc->cc, -ret); -+} -+#endif -+ -+static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ 
return get_cabac_bypass_sign(&lc->cc, -1); -+} -+ -+static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) -+{ -+ return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz); -+} -+ -+static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) -+{ -+ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz); -+} -+ -+static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) -+{ -+ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz); -+} -+ -+ -+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) { -+ int i =0; -+ -+ while (i < 4 && GET_CABAC_LC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i)) -+ i++; -+ -+ return i; -+} -+ -+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, -+ int log2_size, int *last_scx_prefix, int *last_scy_prefix) -+{ -+ int i = 0; -+ int max = (log2_size << 1) - 1; -+ int ctx_offset, ctx_shift; -+ -+ if (!c_idx_nz) { -+ ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); -+ ctx_shift = (log2_size + 1) >> 2; -+ } else { -+ ctx_offset = 15; -+ ctx_shift = log2_size - 2; -+ } -+ while (i < max && -+ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset)) -+ i++; -+ *last_scx_prefix = i; -+ -+ i = 0; -+ while (i < max && -+ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset)) -+ i++; -+ *last_scy_prefix = i; -+} -+ -+static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc, -+ int last_significant_coeff_prefix) -+{ -+ int i; -+ int length = (last_significant_coeff_prefix >> 1) - 1; -+ int value = get_cabac_bypass(&lc->cc); -+ -+ for (i = 1; i < length; i++) -+ value = (value << 1) | get_cabac_bypass(&lc->cc); -+ return value; -+} -+ -+static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * 
const lc, int c_idx_nz, int ctx_cg) -+{ -+ int inc; -+ -+ inc = (ctx_cg != 0) + (c_idx_nz << 1); -+ -+ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc); -+} -+ -+static av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset) -+{ -+ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); -+} -+ -+#if !USE_BY22 -+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r) -+#endif -+ -+ -+#ifndef coeff_abs_level_remaining_decode_bypass -+static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param) -+{ -+ uint32_t y; -+ unsigned int prefix; -+ unsigned int last_coeff_abs_level_remaining; -+ unsigned int n; -+ -+ y = get_cabac_by22_peek(c); -+ prefix = hevc_clz32(~y); -+ // y << prefix will always have top bit 0 -+ -+ if (prefix < 3) { -+ const unsigned int suffix = (y << prefix) >> (31 - rice_param); -+ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix; -+ n = prefix + 1 + rice_param; -+ } -+ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2) -+ { -+ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param)); -+ -+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; -+ n = prefix * 2 + rice_param - 2; -+ } -+ else { -+ unsigned int suffix; -+ -+ get_cabac_by22_flush(c, prefix, y); -+ y = get_cabac_by22_peek(c); -+ -+ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param)); -+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; -+ n = prefix + rice_param - 2; -+ } -+ -+ get_cabac_by22_flush(c, n, y); -+ -+ return last_coeff_abs_level_remaining; -+} -+#endif -+ -+static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param) -+{ -+ int prefix = 0; -+ int suffix = 0; -+ int last_coeff_abs_level_remaining; -+ int i; -+ -+ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c)) -+ prefix++; -+ if (prefix == CABAC_MAX_BIN) { -+// 
av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix); -+ return 0; -+ } -+ -+ if (prefix < 3) { -+ for (i = 0; i < rc_rice_param; i++) -+ suffix = (suffix << 1) | get_cabac_bypass(c); -+ last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix; -+ } else { -+ int prefix_minus3 = prefix - 3; -+ for (i = 0; i < prefix_minus3 + rc_rice_param; i++) -+ suffix = (suffix << 1) | get_cabac_bypass(c); -+ last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1) -+ << rc_rice_param) + suffix; -+ } -+ -+ return last_coeff_abs_level_remaining; -+} -+ -+#if !USE_BY22 -+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode -+static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb) -+{ -+ unsigned int i; -+ uint32_t ret = 0; -+ -+ for (i = 0; i < nb; i++) -+ ret = (ret << 1) | get_cabac_bypass(c); -+ -+ return ret << (32 - nb); -+} -+#endif -+ -+#ifndef coeff_sign_flag_decode_bypass -+static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb) -+{ -+ uint32_t y; -+ y = get_cabac_by22_peek(c); -+ get_cabac_by22_flush(c, nb, y); -+ return y & ~(0xffffffffU >> nb); -+} -+#endif -+ -+ -+#ifndef get_cabac_greater1_bits -+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n, -+ uint8_t * const state0) -+{ -+ unsigned int i; -+ unsigned int rv = 0; -+ for (i = 0; i != n; ++i) { -+ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3; -+ const unsigned int b = get_cabac(c, state0 + idx); -+ rv = (rv << 1) | b; -+ } -+ return rv; -+} -+#endif -+ -+ -+// N.B. levels returned are the values assuming coeff_abs_level_remaining -+// is uncoded, so 1 must be added if it is coded. sum_abs also reflects -+// this version of events. 
-+static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels, -+ int * const pprev_subset_coded, int * const psum, -+ const unsigned int idx0_gt1, const unsigned int idx_gt2) -+{ -+ CABACContext * const c = &lc->cc; -+ uint8_t * const state0 = lc->cabac_state + idx0_gt1; -+ uint8_t * const state_gt2 = lc->cabac_state + idx_gt2; -+ unsigned int rv; -+ unsigned int i; -+ const unsigned int n = FFMIN(n_end, 8); -+ -+ // Really this is i != n but the simple unconditional loop is cheaper -+ // and faster -+ for (i = 0; i != 8; ++i) -+ levels[i] = 1; -+ -+ rv = get_cabac_greater1_bits(c, n, state0); -+ -+ *pprev_subset_coded = 0; -+ *psum = n; -+ -+ rv <<= (32 - n); -+ if (rv != 0) -+ { -+ *pprev_subset_coded = 1; -+ *psum = n + 1; -+ i = hevc_clz32(rv); -+ levels[i] = 2; -+ if (get_cabac(c, state_gt2) == 0) -+ { -+ // Unset first coded bit -+ rv &= ~(0x80000000U >> i); -+ } -+ } -+ -+ if (n_end > 8) { -+ const unsigned int g8 = n_end - 8; -+ rv |= ((1 << g8) - 1) << (24 - g8); -+ for (i = 0; i != g8; ++i) { -+ levels[i + 8] = 0; -+ } -+ } -+ -+ return rv; -+} -+ -+// extended_precision_processing_flag must be false given we are -+// putting the result into a 16-bit array -+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining) -+// scale_m is uint8_t -+// -+// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12) -+// or it can be 2 (if we have transquant_bypass) -+// shift is set to one less than we really want but would normally be -+// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5? 
-+// however the scale shift is substracted from shift to a min 0 so scale_m worst = 45 << 6 -+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient) -+// to achieve it -+ -+#ifndef trans_scale_sat -+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) -+{ -+ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1); -+} -+#endif -+ -+ -+#ifndef update_rice -+static inline void update_rice(uint8_t * const stat_coeff, -+ const unsigned int last_coeff_abs_level_remaining, -+ const unsigned int c_rice_param) -+{ -+ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param; -+ if (x >= 6) -+ (*stat_coeff)++; -+ else if (x == 0 && *stat_coeff > 0) -+ (*stat_coeff)--; -+} -+#endif -+ -+ -+// n must be > 0 on entry -+#ifndef get_cabac_sig_coeff_flag_idxs -+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, -+ unsigned int n, -+ const uint8_t const * ctx_map, -+ uint8_t * p) -+{ -+ do { -+ if (get_cabac(c, state0 + ctx_map[n])) -+ *p++ = n; -+ } while (--n != 0); -+ return p; -+} -+#endif -+ -+ -+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, -+ unsigned int n, -+ const uint8_t * ctx_map, // const ptr here but not in asm -+ uint8_t * const flag_idx) -+{ -+ int rv; -+ -+ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx; -+ -+ return rv; -+} -+ -+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ -+ x0, x1, x2, x3,\ -+ x4, x5, x6, x7,\ -+ x8, x9, x10, x11,\ -+ x12, x13, x14, x15} -+ -+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ -+ x0, x4, x8, x12,\ -+ x1, x5, x9, x13,\ -+ x2, x6, x10, x14,\ -+ x3, x7, x11, x15} -+ -+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ -+ x0, x4, x1, x8,\ 
-+ x5, x2, x12, x9,\ -+ x6, x3, x13, x10,\ -+ x7, x14, x11, x15} -+ -+ -+static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz, -+ uint8_t * const significant_coeff_group_flag, -+ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg, -+ int * const pPrev_sig) -+{ -+ while (--i >= 0) { -+ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag; -+ const unsigned int x_cg = scan_x_cg[i]; -+ -+ // For the flag decode we only care about Z/NZ but -+ // we use the full Right * 2 + Down when calculating -+ // significant coeff flags so we obtain it here. -+ // -+ // The group flag array is one longer than it needs to -+ // be so we don't need to check for y_cg limits -+ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); -+ -+ if (i == 0 || -+ significant_coeff_group_flag_decode(lc, c_idx_nz, prev_sig)) -+ { -+ gf_y[0] |= (1 << x_cg); -+ *pPrev_sig = prev_sig; -+ break; -+ } -+ } -+ -+ return i; -+} -+ -+static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb, -+ const unsigned int log2_trafo_size, const unsigned int c_idx, -+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) -+{ -+ const AVFrame * const frame = s->frame; -+ const unsigned int stride = frame_stride1(s->frame, c_idx); -+ const unsigned int x = x0 >> ctx_hshift(s, c_idx); -+ const unsigned int y = y0 >> ctx_vshift(s, c_idx); -+ const int is_sliced = 1; // av_rpi_is_sand_frame(frame); -+ uint8_t * const dst = !is_sliced ? -+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : -+ c_idx == 0 ? 
-+ av_rpi_sand_frame_pos_y(frame, x, y) : -+ av_rpi_sand_frame_pos_c(frame, x, y); -+ -+ const unsigned int i = jb->intra.n; -+ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; -+ -+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && -+ pc->ta.dst == dst) -+ { -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->ta.stride == stride); -+ -+ pc->type = RPI_PRED_ADD_RESIDUAL_C; -+ } -+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && -+ pc->dc.dst == dst) -+ { -+ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->dc.stride == stride); -+ -+ // Rewrite as add residual - must rewrite all fields as different union member -+ pc->type = RPI_PRED_ADD_RESIDUAL_V; -+ pc->ta.buf = coeffs; -+ pc->ta.dst = dst; -+ pc->ta.stride = stride; -+ pc->ta.dc = dc; -+ } -+ else -+ { -+ HEVCPredCmd * const cmd = pc + 1; -+ jb->intra.n = i + 1; -+ -+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); -+ cmd->size = log2_trafo_size; -+ cmd->ta.buf = coeffs; -+ cmd->ta.dst = dst; -+ cmd->ta.stride = stride; -+ cmd->ta.dc = 0; -+ } -+} -+ -+ -+static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const unsigned int log2_trafo_size, const unsigned int c_idx, -+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) -+{ -+ const AVFrame * const frame = s->frame; -+ const unsigned int stride = frame_stride1(s->frame, c_idx); -+ const unsigned int x = x0 >> ctx_hshift(s, c_idx); -+ const unsigned int y = y0 >> ctx_vshift(s, c_idx); -+ const int is_sliced = 1; -+ uint8_t * const dst = !is_sliced ? -+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : -+ c_idx == 0 ? 
-+ av_rpi_sand_frame_pos_y(frame, x, y) : -+ av_rpi_sand_frame_pos_c(frame, x, y); -+ -+ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); -+ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); -+ -+ const unsigned int i = jb->intra.n; -+ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; -+ -+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && -+ pc->ta.dst == dst) -+ { -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->ta.stride == stride); -+ -+ pc->ta.dc = (int16_t)coeff; -+ } -+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && -+ pc->dc.dst == dst) -+ { -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->dc.stride == stride && -+ (pc->dc.dc & ~0xffff) == 0); -+ -+ pc->dc.dc |= (coeff << 16); -+ } -+ else -+ { -+ HEVCPredCmd * const cmd = pc + 1; -+ jb->intra.n = i + 1; -+ -+ cmd->type = RPI_PRED_ADD_DC + c_idx; -+ cmd->size = log2_trafo_size; -+ cmd->dc.dst = dst; -+ cmd->dc.stride = stride; -+ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? 
coeff << 16 : coeff & 0xffff; -+ } -+} -+ -+ -+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x0, const int y0, -+ const int log2_trafo_size, const enum ScanType scan_idx, -+ const int c_idx) -+{ -+ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag; -+ -+ int last_significant_coeff_x, last_significant_coeff_y; -+ int num_coeff = 0; -+ int prev_subset_coded = 0; -+ -+ int num_last_subset; -+ int x_cg_last_sig, y_cg_last_sig; -+ -+ const uint8_t *scan_x_cg, *scan_y_cg; -+ const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; -+ -+ int use_vpu; -+#if RPI_COMPRESS_COEFFS -+ int num_nonzero = 0; -+ int use_compress = 0; -+ int *coeffs32; -+#endif -+ int use_dc = 0; -+ int16_t *coeffs; -+ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero -+ int explicit_rdpcm_flag = 0; -+ int explicit_rdpcm_dir_flag; -+ -+ int i; -+ int shift,scale; -+ const uint8_t *scale_matrix = NULL; -+ uint8_t dc_scale; -+ const int c_idx_nz = (c_idx != 0); -+ const int pred_mode_intra = c_idx_nz ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; -+ int prev_sig = 0; -+ int may_hide_sign; -+ -+ int16_t dummy_coeffs[16]; -+ -+ // Derive QP for dequant -+ if (!lc->cu.cu_transquant_bypass_flag) { -+ may_hide_sign = s->ps.pps->sign_data_hiding_flag; -+ -+ if (s->ps.pps->transform_skip_enabled_flag && -+ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) { -+ int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz); -+ if (transform_skip_flag) { -+ trans_skip_or_bypass = 1; -+ if (lc->cu.pred_mode == MODE_INTRA && -+ s->ps.sps->implicit_rdpcm_enabled_flag && -+ (pred_mode_intra == 10 || pred_mode_intra == 26)) { -+ may_hide_sign = 0; -+ } -+ } -+ } -+ -+ { -+ static const uint8_t level_scale[8] = { -+ 40, 45, 51, 57, 64, 72, 0, 0 // Pad to 8 -+ }; -+ const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y]; -+ -+ // Shift is set to one less than will actually occur as the scale -+ // and saturate step adds 1 and then shifts right again -+ scale = level_scale[qp6 & 7]; -+// shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3); -+ shift = log2_trafo_size - (qp6 >> 3); -+ -+ if (shift < 0) { -+ scale <<= -shift; -+ shift = 0; -+ } -+ } -+ -+ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) { -+ const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ? -+ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; -+ const unsigned int matrix_id = -+ lc->cu.pred_mode != MODE_INTRA ? 
3 + c_idx : c_idx; -+ -+ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id]; -+ dc_scale = scale_matrix[0]; -+ if (log2_trafo_size >= 4) -+ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id]; -+ } -+ else -+ { -+ static const uint8_t sixteen_scale[64] = { -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16 -+ }; -+ scale_matrix = sixteen_scale; -+ dc_scale = 16; -+ } -+ } else { -+ static const uint8_t unit_scale[64] = { -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ }; -+ scale_matrix = unit_scale; -+ shift = 0; -+ scale = 2; // We will shift right to kill this -+ dc_scale = 1; -+ -+ may_hide_sign = 0; -+ } -+ -+ -+ -+ -+ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && -+ trans_skip_or_bypass) { -+ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz); -+ if (explicit_rdpcm_flag) { -+ may_hide_sign = 0; -+ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz); -+ } -+ } -+ -+ last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size, -+ &last_significant_coeff_x, &last_significant_coeff_y); -+ -+ if (last_significant_coeff_x > 3) { -+ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x); -+ last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) * -+ (2 + (last_significant_coeff_x & 1)) + -+ suffix; -+ } -+ -+ if (last_significant_coeff_y > 3) { -+ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y); -+ last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) * -+ (2 + (last_significant_coeff_y & 1)) + -+ suffix; 
-+ } -+ -+ if (scan_idx == SCAN_VERT) -+ FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y); -+ -+ x_cg_last_sig = last_significant_coeff_x >> 2; -+ y_cg_last_sig = last_significant_coeff_y >> 2; -+ -+ switch (scan_idx) { -+ case SCAN_DIAG: { -+ int last_x_c = last_significant_coeff_x & 3; -+ int last_y_c = last_significant_coeff_y & 3; -+ -+ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c]; -+ -+ switch (log2_trafo_size) { -+ case 2: -+ scan_x_cg = scan_1x1; -+ scan_y_cg = scan_1x1; -+ break; -+ case 3: -+ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4; -+ scan_x_cg = diag_scan2x2_x; -+ scan_y_cg = diag_scan2x2_y; -+ break; -+ case 4: -+ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4; -+ scan_x_cg = ff_hevc_rpi_diag_scan4x4_x; -+ scan_y_cg = ff_hevc_rpi_diag_scan4x4_y; -+ break; -+ case 5: -+ default: -+ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4; -+ scan_x_cg = ff_hevc_rpi_diag_scan8x8_x; -+ scan_y_cg = ff_hevc_rpi_diag_scan8x8_y; -+ break; -+ } -+ break; -+ } -+ case SCAN_HORIZ: -+ scan_x_cg = horiz_scan2x2_x; -+ scan_y_cg = horiz_scan2x2_y; -+ num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x]; -+ break; -+ default: //SCAN_VERT -+ scan_x_cg = horiz_scan2x2_y; -+ scan_y_cg = horiz_scan2x2_x; -+ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y]; -+ break; -+ } -+ num_coeff++; -+ num_last_subset = (num_coeff - 1) >> 4; -+ -+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant -+ -+ { -+ const unsigned int ccount = 1 << (log2_trafo_size * 2); -+ const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */; // These need special processing -+ use_vpu = 0; -+ use_dc = (num_coeff == 1) && !special && -+ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2); -+ -+ if (use_dc) { -+ // Just need a little empty space -+ coeffs = dummy_coeffs; -+ // No need to clear -+ } 
-+ else -+ { -+ use_vpu = !special && log2_trafo_size >= 4; -+#if RPI_COMPRESS_COEFFS -+ use_compress = use_vpu && lc->jb0->coeffs.s[log2_trafo_size - 2].packed; -+#endif -+ coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount); -+#if RPI_COMPRESS_COEFFS -+ coeffs32 = (int*)coeffs; -+ if (!use_compress) -+#endif -+#if HAVE_NEON -+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); -+#else -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+#endif -+ } -+ } -+ -+ i = num_last_subset; -+ do { -+ int implicit_non_zero_coeff = 0; -+ int n_end; -+ -+ uint8_t significant_coeff_flag_idx[16]; -+ unsigned int nb_significant_coeff_flag = 0; -+ -+ if (i == num_last_subset) { -+ // First time through -+ int last_scan_pos = num_coeff - (i << 4) - 1; -+ n_end = last_scan_pos - 1; -+ significant_coeff_flag_idx[0] = last_scan_pos; -+ nb_significant_coeff_flag = 1; -+ } else { -+ n_end = 15; -+ implicit_non_zero_coeff = (i != 0); -+ } -+ -+ if (n_end >= 0) { -+ static const uint8_t ctx_idx_maps_ts2[3][16] = { -+ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 -+ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 -+ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 -+ }; -+ // N.B. 
prev_sig = Right * 2 + Down -+ static const uint8_t ctx_idx_maps[3][4][16] = { -+ { -+ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 -+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 -+ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default -+ }, -+ { -+ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 -+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 -+ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default -+ }, -+ { -+ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 -+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 -+ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default -+ } -+ }; -+ const uint8_t *ctx_idx_map_p; -+ int scf_offset = 0; -+ -+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { -+ ctx_idx_map_p = ctx_idx_maps[0][3]; -+ scf_offset = 40 + c_idx_nz; -+ } else { -+ if (c_idx_nz != 0) -+ scf_offset = 27; -+ -+ if (log2_trafo_size == 2) { -+ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx]; -+ } else { -+ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig]; -+ if (!c_idx_nz) { -+ if (i != 0) -+ scf_offset += 3; -+ -+ if (log2_trafo_size == 3) { -+ scf_offset += (scan_idx == SCAN_DIAG) ? 
9 : 15; -+ } else { -+ scf_offset += 21; -+ } -+ } else { -+ if (log2_trafo_size == 3) -+ scf_offset += 9; -+ else -+ scf_offset += 12; -+ } -+ } -+ } -+ -+ if (n_end > 0) { -+ int cnt = get_sig_coeff_flag_idxs(&lc->cc, -+ lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset, -+ n_end, ctx_idx_map_p, -+ significant_coeff_flag_idx + nb_significant_coeff_flag); -+ -+ nb_significant_coeff_flag += cnt; -+ if (cnt != 0) { -+ implicit_non_zero_coeff = 0; -+ } -+ } -+ -+ if (implicit_non_zero_coeff == 0) { -+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { -+ scf_offset = 42 + c_idx_nz; -+ } else { -+ if (i == 0) { -+ scf_offset = c_idx_nz ? 27 : 0; -+ } else { -+ scf_offset = 2 + scf_offset; -+ } -+ } -+ if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) { -+ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; -+ nb_significant_coeff_flag++; -+ } -+ } else { -+ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; -+ nb_significant_coeff_flag++; -+ } -+ } -+#if RPI_COMPRESS_COEFFS -+ if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full! -+ int16_t temp[32*32]; -+ const unsigned int ccount = 1 << (log2_trafo_size * 2); -+ lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0; -+ lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer -+ memcpy(temp, coeffs, sizeof(int)*num_nonzero); -+ coeffs32 = (int *)temp; -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+ num_nonzero--; -+ while (num_nonzero >= 0) { -+ const unsigned int res = coeffs32[num_nonzero]; -+ const unsigned int offset = res & 0xffff; -+ coeffs[ offset ] = res >> 16; -+ num_nonzero--; -+ } -+ use_compress = 0; -+ } -+#endif -+ -+ if (nb_significant_coeff_flag != 0) { -+ const unsigned int gt1_idx_delta = (c_idx_nz << 2) | -+ ((i != 0 && !c_idx_nz) ? 
2 : 0) | -+ prev_subset_coded; -+ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] + -+ (gt1_idx_delta << 2); -+ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + -+ gt1_idx_delta; -+ -+ const unsigned int x_cg = scan_x_cg[i]; -+ const unsigned int y_cg = scan_y_cg[i]; -+ int16_t * const blk_coeffs = coeffs + -+ ((x_cg + (y_cg << log2_trafo_size)) << 2); -+ // This calculation is 'wrong' for log2_traffo_size == 2 -+ // but that doesn't matter as in this case x_cg & y_cg -+ // are always 0 so result is correct (0) anyway -+ const uint8_t * const blk_scale = scale_matrix + -+ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size))); -+ -+ // * The following code block doesn't deal with these flags: -+ // (nor did the one it replaces) -+ // -+ // cabac_bypass_alignment_enabled_flag -+ // This should be easy but I can't find a test case -+ // extended_precision_processing_flag -+ // This can extend the required precision past 16bits -+ // so is probably tricky - also no example found yet -+ -+#if USE_N_END_1 -+ if (nb_significant_coeff_flag == 1) { -+ // There is a small gain to be had from special casing the single -+ // transform coefficient case. The reduction in complexity -+ // makes up for the code duplicatioon. 
-+ -+ int trans_coeff_level = 1; -+ int coeff_sign_flag; -+ int coded_val = 0; -+ -+ // initialize first elem of coeff_bas_level_greater1_flag -+ prev_subset_coded = 0; -+ -+ if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) { -+ trans_coeff_level = 2; -+ prev_subset_coded = 1; -+ coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2); -+ } -+ -+ // Probably not worth the overhead of starting by22 for just one value -+ coeff_sign_flag = get_cabac_bypass(&lc->cc); -+ -+ if (coded_val) -+ { -+ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { -+ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0); -+ } else { -+ uint8_t * const stat_coeff = -+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); -+ const unsigned int c_rice_param = *stat_coeff >> 2; -+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param); -+ -+ trans_coeff_level = 3 + last_coeff_abs_level_remaining; -+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); -+ } -+ } -+ -+ { -+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; -+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; -+ const unsigned int scale_m = blk_scale[xy_off->scale]; -+ const int res = trans_scale_sat( -+ (trans_coeff_level ^ k) - k, // Apply sign -+ scale, -+ i == 0 && xy_off->coeff == 0 ? 
dc_scale : scale_m, -+ shift); -+#if RPI_COMPRESS_COEFFS -+ if (use_compress) -+ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); -+ else -+#endif -+ blk_coeffs[xy_off->coeff] = res; -+ } -+ } -+ else -+#endif -+ { -+ int sign_hidden = may_hide_sign; -+ int levels[16]; // Should be able to get away with int16_t but that fails some tests -+ uint32_t coeff_sign_flags; -+ uint32_t coded_vals = 0; -+ // Sum(abs(level[])) -+ // In fact we only need the bottom bit and in some future -+ // version that may be all we calculate -+ unsigned int sum_abs; -+ -+ coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels, -+ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2); -+ -+ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3) -+ sign_hidden = 0; -+ -+ // -- Start bypass block -+ -+ bypass_start(&lc->cc); -+ -+ coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden); -+ -+ if (coded_vals != 0) -+ { -+ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag; -+ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL : -+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); -+ int c_rice_param = !rice_adaptation_enabled ? 
0 : *stat_coeff >> 2; -+ int * level = levels - 1; -+ -+ do { -+ { -+ const unsigned int z = hevc_clz32(coded_vals) + 1; -+ level += z; -+ coded_vals <<= z; -+ } -+ -+ { -+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param); -+ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; -+ -+ sum_abs += last_coeff_abs_level_remaining + 1; -+ *level = trans_coeff_level; -+ -+ if (stat_coeff != NULL) -+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); -+ stat_coeff = NULL; -+ -+ if (trans_coeff_level > (3 << c_rice_param) && -+ (c_rice_param < 4 || rice_adaptation_enabled)) -+ ++c_rice_param; -+ } -+ } while (coded_vals != 0); -+ } -+ -+ // sign_hidden = 0 or 1 so we can combine the tests -+ if ((sign_hidden & sum_abs) != 0) { -+ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1]; -+ } -+ -+ bypass_finish(&lc->cc); -+ -+ // -- Finish bypass block -+ -+ // Scale loop -+ { -+ int m = nb_significant_coeff_flag - 1; -+ -+ // Deal with DC component (if any) first -+ if (i == 0 && significant_coeff_flag_idx[m] == 0) -+ { -+ const int k = (int32_t)(coeff_sign_flags << m) >> 31; -+ const int res = trans_scale_sat( -+ (levels[m] ^ k) - k, scale, dc_scale, shift); -+#if RPI_COMPRESS_COEFFS -+ if (use_compress) -+ { -+ coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs); -+ } -+ else -+#endif -+ { -+ blk_coeffs[0] = res; -+ } -+ --m; -+ } -+ -+#if !USE_N_END_1 -+ // If N_END_1 set then m was at least 1 initially -+ if (m >= 0) -+#endif -+ { -+ do { -+ const xy_off_t * const xy_off = scan_xy_off + -+ significant_coeff_flag_idx[m]; -+ const int k = (int32_t)(coeff_sign_flags << m) >> 31; -+ const int res = trans_scale_sat( -+ (levels[m] ^ k) - k, -+ scale, -+ blk_scale[xy_off->scale], -+ shift); -+#if RPI_COMPRESS_COEFFS -+ if (use_compress) { -+ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); -+ } else -+#endif -+ 
blk_coeffs[xy_off->coeff] = res; -+ } while (--m >= 0); -+ } -+ } -+ -+ } -+ } -+ } while ((i = next_subset(lc, i, c_idx_nz, -+ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 && -+ !cabac_overflow(&lc->cc)); -+ -+ if (lc->cu.cu_transquant_bypass_flag) { -+ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -+ (pred_mode_intra == 10 || pred_mode_intra == 26))) { -+ int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag; -+ -+ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); -+ } -+ } else { -+ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass -+ int rot = s->ps.sps->transform_skip_rotation_enabled_flag && -+ log2_trafo_size == 2 && -+ lc->cu.pred_mode == MODE_INTRA; -+ if (rot) { -+ for (i = 0; i < 8; i++) -+ FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); -+ } -+ -+ s->hevcdsp.dequant(coeffs, log2_trafo_size); -+ -+ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -+ lc->cu.pred_mode == MODE_INTRA && -+ (pred_mode_intra == 10 || pred_mode_intra == 26))) { -+ int mode = explicit_rdpcm_flag ? 
explicit_rdpcm_dir_flag : (pred_mode_intra == 26); -+ -+ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); -+ } -+ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { -+ s->hevcdsp.transform_4x4_luma(coeffs); -+ } -+ else if (!use_vpu) -+ { -+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); -+ if (max_xy == 0) -+ { -+ if (use_dc) -+ rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); -+ else -+ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); -+ } -+ else { -+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; -+ if (max_xy < 4) -+ col_limit = FFMIN(4, col_limit); -+ else if (max_xy < 8) -+ col_limit = FFMIN(8, col_limit); -+ else if (max_xy < 12) -+ col_limit = FFMIN(24, col_limit); -+ s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit); -+ } -+ } -+ } -+ -+#if 0 -+ // Mildly rotted - we support no mode where cross is valid -+ if (lc->tu.cross_pf) { -+ int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer; -+ const int ccount = 1 << (log2_trafo_size * 2); -+ -+ for (i = 0; i < ccount; i++) { -+ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); -+ } -+ } -+#endif -+ -+ if (!use_dc) { -+#if RPI_COMPRESS_COEFFS -+ if (use_compress) { -+ coeffs32[num_nonzero] = 0; -+ } -+#endif -+ rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); -+ } -+} -+ -+#if !USE_BY22 -+// Stores results to lc -+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) -+{ -+ int x = abs_mvd_greater0_flag_decode(lc); -+ int y = abs_mvd_greater0_flag_decode(lc); -+ -+ if (x) -+ x += abs_mvd_greater1_flag_decode(lc); -+ if (y) -+ y += abs_mvd_greater1_flag_decode(lc); -+ -+ switch (x) { -+ case 2: x = mvd_decode(lc); break; -+ case 1: x = mvd_sign_flag_decode(lc); break; -+ case 0: x = 0; break; -+ } -+ -+ switch (y) { -+ case 2: y = mvd_decode(lc); break; -+ case 1: y = mvd_sign_flag_decode(lc); break; -+ case 0: y = 0; break; -+ } -+ return 
MV_XY(x,y); -+} -+#else -+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) -+{ -+ int x = abs_mvd_greater0_flag_decode(lc); -+ int y = abs_mvd_greater0_flag_decode(lc); -+ -+ if ((x | y) == 0) -+ return 0; -+ -+ if (x != 0) -+ x += abs_mvd_greater1_flag_decode(lc); -+ if (y != 0) -+ y += abs_mvd_greater1_flag_decode(lc); -+ -+ if ((x | y) == 1) -+ { -+ // Not worth starting BY22 -+ if (x != 0) -+ x = mvd_sign_flag_decode(lc); -+ if (y != 0) -+ y = mvd_sign_flag_decode(lc); -+ } -+ else -+ { -+ CABACContext * const cc = &lc->cc; -+ uint32_t val; -+ uint32_t b; -+ unsigned int n = 0; -+ -+ bypass_start(cc); -+ b = val = get_cabac_by22_peek(cc); -+ -+ if (x == 1) { -+ x = ((int32_t)b >> 31) | 1; -+ n = 1; -+ b <<= 1; -+ } -+ else if (x == 2) { -+ // EG1 so we have (leading one bits + 1) of suffix -+ // This makes prefix & suffix lengths the same -+ const unsigned int k = hevc_clz32(~b) + 1; -+ int s; -+ -+ av_assert2(k <= 15); -+ -+ b <<= k; -+ n = 2 * k + 1; // Includes suffix & sign -+ -+ // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked -+ // if we are going to do this without a flush -+ if (k > CABAC_BY22_PEEK_BITS / 2 - 1) -+ { -+ // Need too many bits - flush -+ // n = k -+ get_cabac_by22_flush(cc, k, val); -+ b = val = get_cabac_by22_peek(cc); -+ n = k + 1; -+ } -+ -+ x = (b >> (32 - k)) + (1 << k); -+ b <<= k; -+ s = (int32_t)b >> 31; -+ x = (x ^ s) - s; -+ b <<= 1; -+ -+ // Max abs value of an mv is 2^15 - 1 (i.e. 
a prefix len of 15 bits) -+ if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15) -+ { -+ get_cabac_by22_flush(cc, n, val); -+ b = val = get_cabac_by22_peek(cc); -+ n = 0; -+ } -+ } -+ -+ if (y == 1) { -+ y = ((int32_t)b >> 31) | 1; -+ ++n; -+ // don't care about b anymore -+ } -+ else if (y == 2) { -+ const unsigned int k = hevc_clz32(~b) + 1; -+ int s; -+ -+ av_assert2(k <= 15); -+ -+ // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked -+ // if we are going to do this without a flush -+ b <<= k; -+ n += 2 * k + 1; -+ -+ if (n > CABAC_BY22_PEEK_BITS) -+ { -+ // Need too many bits - flush -+ get_cabac_by22_flush(cc, n - (k + 1), val); -+ b = val = get_cabac_by22_peek(cc); -+ n = k + 1; -+ } -+ -+ y = (b >> (32 - k)) + (1 << k); -+ s = (int32_t)(b << k) >> 31; -+ y = (y ^ s) - s; -+ // don't care about b anymore -+ } -+ -+ get_cabac_by22_flush(cc, n, val); -+ bypass_finish(cc); -+ } -+ -+ return MV_XY(x, y); -+} -+#endif -diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h -new file mode 100644 -index 0000000000..ca191f00d9 ---- /dev/null -+++ b/libavcodec/rpi_hevc_cabac_fns.h -@@ -0,0 +1,217 @@ -+/* -+ * HEVC CABAC decoding -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2018 John Cox -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+ -+#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H -+#define AVCODEC_RPI_HEVC_CABAC_FNS_H -+ -+#include "config.h" -+#include "rpi_hevcdec.h" -+ -+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc); -+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags); -+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size); -+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH); -+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx); -+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx); -+ -+//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, 
HEVCRpiLocalContext * const lc); -+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x0, const int y0, -+ const int log2_trafo_size, const enum ScanType scan_idx, -+ const int c_idx); -+ -+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc); -+ -+#define HEVC_BIN_SAO_MERGE_FLAG 0 -+#define HEVC_BIN_SAO_TYPE_IDX 1 -+#define HEVC_BIN_SAO_EO_CLASS 2 -+#define HEVC_BIN_SAO_BAND_POSITION 2 -+#define HEVC_BIN_SAO_OFFSET_ABS 2 -+#define HEVC_BIN_SAO_OFFSET_SIGN 2 -+#define HEVC_BIN_END_OF_SLICE_FLAG 2 -+#define HEVC_BIN_SPLIT_CODING_UNIT_FLAG 2 -+#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG 5 -+#define HEVC_BIN_SKIP_FLAG 6 -+#define HEVC_BIN_CU_QP_DELTA 9 -+#define HEVC_BIN_PRED_MODE 12 -+#define HEVC_BIN_PART_MODE 13 -+#define HEVC_BIN_PCM_FLAG 17 -+#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE 17 -+#define HEVC_BIN_MPM_IDX 18 -+#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE 18 -+#define HEVC_BIN_INTRA_CHROMA_PRED_MODE 18 -+#define HEVC_BIN_MERGE_FLAG 20 -+#define HEVC_BIN_MERGE_IDX 21 -+#define HEVC_BIN_INTER_PRED_IDC 22 -+#define HEVC_BIN_REF_IDX_L0 27 -+#define HEVC_BIN_REF_IDX_L1 29 -+#define HEVC_BIN_ABS_MVD_GREATER0_FLAG 31 -+#define HEVC_BIN_ABS_MVD_GREATER1_FLAG 33 -+#define HEVC_BIN_ABS_MVD_MINUS2 35 -+#define HEVC_BIN_MVD_SIGN_FLAG 35 -+#define HEVC_BIN_MVP_LX_FLAG 35 -+#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG 36 -+#define HEVC_BIN_SPLIT_TRANSFORM_FLAG 37 -+#define HEVC_BIN_CBF_LUMA 40 -+#define HEVC_BIN_CBF_CB_CR 42 -+#define HEVC_BIN_TRANSFORM_SKIP_FLAG 46 -+#define HEVC_BIN_EXPLICIT_RDPCM_FLAG 48 -+#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG 50 -+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX 52 -+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX 70 -+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX 88 -+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX 88 -+#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG 88 -+#define 
HEVC_BIN_SIGNIFICANT_COEFF_FLAG 92 -+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG 136 -+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG 160 -+#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING 166 -+#define HEVC_BIN_COEFF_SIGN_FLAG 166 -+#define HEVC_BIN_LOG2_RES_SCALE_ABS 166 -+#define HEVC_BIN_RES_SCALE_SIGN_FLAG 174 -+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG 176 -+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX 177 -+ -+ -+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state); -+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c); -+ -+static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) { -+ const uint8_t *ptr = c->bytestream; -+ -+ if (c->low & 0x1) -+ ptr--; -+#if CABAC_BITS == 16 -+ if (c->low & 0x1FF) -+ ptr--; -+#endif -+ if ((int) (c->bytestream_end - ptr) < n) -+ return NULL; -+ if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0) -+ return NULL; -+ -+ return ptr; -+} -+ -+static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SAO_MERGE_FLAG); -+} -+ -+static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG); -+} -+ -+static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG); -+} -+ -+static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int ct_depth, -+ const unsigned int x0, const unsigned int y0) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_CODING_UNIT_FLAG + -+ ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) + -+ ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth)); -+} -+ -+static inline int 
ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x0, const int y0, const int x_cb, const int y_cb) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG + -+ (s->cabac_stash_left[y0 >> 3] & 1) + -+ (s->cabac_stash_up[x0 >> 3] & 1)); -+} -+ -+static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PRED_MODE); -+} -+ -+static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac_terminate(&lc->cc); -+} -+ -+static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE); -+} -+ -+static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MERGE_FLAG); -+} -+ -+static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MVP_LX_FLAG); -+} -+ -+static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_NO_RESIDUAL_DATA_FLAG); -+} -+ -+static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_CB_CR + trafo_depth); -+} -+ -+static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_LUMA + !trafo_depth); -+} -+ -+static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + 
HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size); -+} -+ -+static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_RES_SCALE_SIGN_FLAG + idx); -+} -+ -+ -+ -+#endif -+ -diff --git a/libavcodec/rpi_hevc_data.c b/libavcodec/rpi_hevc_data.c -new file mode 100644 -index 0000000000..341bb77d9d ---- /dev/null -+++ b/libavcodec/rpi_hevc_data.c -@@ -0,0 +1,75 @@ -+/* -+ * HEVC shared tables -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include -+ -+#include "rpi_hevc_data.h" -+ -+const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = { -+ 0, 0, 1, 0, -+ 1, 2, 0, 1, -+ 2, 3, 1, 2, -+ 3, 2, 3, 3, -+}; -+ -+const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = { -+ 0, 1, 0, 2, -+ 1, 0, 3, 2, -+ 1, 0, 3, 2, -+ 1, 3, 2, 3, -+}; -+ -+const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = { -+ 0, 0, 1, 0, -+ 1, 2, 0, 1, -+ 2, 3, 0, 1, -+ 2, 3, 4, 0, -+ 1, 2, 3, 4, -+ 5, 0, 1, 2, -+ 3, 4, 5, 6, -+ 0, 1, 2, 3, -+ 4, 5, 6, 7, -+ 1, 2, 3, 4, -+ 5, 6, 7, 2, -+ 3, 4, 5, 6, -+ 7, 3, 4, 5, -+ 6, 7, 4, 5, -+ 6, 7, 5, 6, -+ 7, 6, 7, 7, -+}; -+ -+const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = { -+ 0, 1, 0, 2, -+ 1, 0, 3, 2, -+ 1, 0, 4, 3, -+ 2, 1, 0, 5, -+ 4, 3, 2, 1, -+ 0, 6, 5, 4, -+ 3, 2, 1, 0, -+ 7, 6, 5, 4, -+ 3, 2, 1, 0, -+ 7, 6, 5, 4, -+ 3, 2, 1, 7, -+ 6, 5, 4, 3, -+ 2, 7, 6, 5, -+ 4, 3, 7, 6, -+ 5, 4, 7, 6, -+ 5, 7, 6, 7, -+}; -diff --git a/libavcodec/rpi_hevc_data.h b/libavcodec/rpi_hevc_data.h -new file mode 100644 -index 0000000000..0aee673d8b ---- /dev/null -+++ b/libavcodec/rpi_hevc_data.h -@@ -0,0 +1,31 @@ -+/* -+ * HEVC shared data tables -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_RPI_HEVC_DATA_H -+#define AVCODEC_RPI_HEVC_DATA_H -+ -+#include -+ -+extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16]; -+extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16]; -+extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64]; -+extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64]; -+ -+#endif /* AVCODEC_RPI_HEVC_DATA_H */ -diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c -new file mode 100644 -index 0000000000..5125d1eb6b ---- /dev/null -+++ b/libavcodec/rpi_hevc_filter.c -@@ -0,0 +1,1210 @@ -+/* -+ * HEVC video decoder -+ * -+ * Originally by: -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2013 Seppo Tomperi -+ * Copyright (C) 2013 Wassim Hamidouche -+ * -+ * Substantially rewritten: -+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+//#define DISABLE_SAO -+//#define DISABLE_DEBLOCK -+//#define DISABLE_STRENGTHS -+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames) -+//#define DISABLE_DEBLOCK_NONREF -+ -+#include "libavutil/common.h" -+#include "libavutil/internal.h" -+ -+#include "rpi_hevcdec.h" -+ -+#include "bit_depth_template.c" -+ -+#include "rpi_qpu.h" -+#include "rpi_zc.h" -+#include "libavutil/rpi_sand_fns.h" -+ -+#define LUMA 0 -+#define CB 1 -+#define CR 2 -+ -+// tcoffset: -12,12; qp: 0,51; (bs-1)*2: 0,2 -+// so -12,75 overall -+static const uint8_t tctablex[] = { -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // QP 0...18 -+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // QP 19...37 -+ 5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24, // QP 38...53 -+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 // 54..75 -+}; -+#define tctable (tctablex + 12 + 6*8) -+ -+static const uint8_t betatablex[] = { -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, // QP 0...18 -+ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37 -+ 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, // QP 38...51 -+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 // 52..73 -+}; -+#define betatable (betatablex 
+ 12 + 6*8) -+ -+static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y, -+ const int c_idx, const int tc_offset) -+{ -+ return tctable[(int)s->ps.pps->qp_dblk_x[c_idx][qp_y] + tc_offset + 2]; -+} -+ -+static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int xBase, const unsigned int yBase) -+{ -+ const unsigned int ctb_size_mask = (1 << s->ps.sps->log2_ctb_size) - 1; -+ const unsigned int MinCuQpDeltaSizeMask = ~0U << s->ps.pps->log2_min_cu_qp_delta_size; -+ const unsigned int xQgBase = xBase & MinCuQpDeltaSizeMask; -+ const unsigned int yQgBase = yBase & MinCuQpDeltaSizeMask; -+ const unsigned int min_cb_width = s->ps.sps->min_cb_width; -+ const unsigned int x_cb = xQgBase >> s->ps.sps->log2_min_cb_size; -+ const unsigned int y_cb = yQgBase >> s->ps.sps->log2_min_cb_size; -+ const int qPy_pred = lc->qPy_pred; -+ -+ return (((xQgBase & ctb_size_mask) == 0 ? qPy_pred : -+ s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) + -+ ((yQgBase & ctb_size_mask) == 0 ? qPy_pred : -+ s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1; -+} -+ -+// * Only called from bitstream decode in foreground -+// so should be safe -+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase) -+{ -+ const int qp_y = get_qPy_pred(s, lc, xBase, yBase); -+ -+ if (lc->tu.cu_qp_delta != 0) { -+ // ?? I suspect that the -bd_offset here leads to us adding it elsewhere -+ int off = s->ps.sps->qp_bd_offset; -+ lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off, -+ 52 + off) - off; -+ } else -+ lc->qp_y = qp_y; -+} -+ -+static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx) -+{ -+ return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; -+} -+ -+// "DSP" these? 
-+static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) -+{ -+ switch (pixel_shift) -+ { -+ case 2: -+ *(uint32_t *)dst = *(uint32_t *)src; -+ break; -+ case 1: -+ *(uint16_t *)dst = *(uint16_t *)src; -+ break; -+ default: -+ *dst = *src; -+ break; -+ } -+} -+ -+static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src, -+ ptrdiff_t stride_src, int x, int y, int width, int height, -+ int c_idx, int x_ctb, int y_ctb) -+{ -+ const unsigned int sh = pixel_shift(s, c_idx); -+ const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx); -+ const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx); -+ -+ /* copy horizontal edges */ -+ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh), -+ src, width << sh); -+ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh), -+ src + stride_src * (height - 1), width << sh); -+ -+ /* copy vertical edges */ -+ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src); -+ -+ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src); -+} -+ -+// N.B. Src & dst are swapped as this is a restore! -+// x0 & y0 are in luma coords -+// Width & height are in Y/C pels as appropriate -+// * Clear scope for optimsation here but not used enough to be worth it -+static void restore_tqb_pixels(const HEVCRpiContext * const s, -+ uint8_t *src1, const uint8_t *dst1, -+ const ptrdiff_t stride_src, const ptrdiff_t stride_dst, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int width, const int height, -+ const int c_idx) -+{ -+ if (s->ps.pps->transquant_bypass_enable_flag || -+ s->ps.sps->pcm.loop_filter_disable_flag) -+ { -+ const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width; -+ int blks_y = height >> (c_idx == 0 ? 
3 : 2); -+ const unsigned int bwidth = 8 << s->ps.sps->pixel_shift; // Y & C have the same width in sand -+ const unsigned int bheight = (c_idx == 0) ? 8 : 4; -+ const unsigned int sh = ((x0 >> 3) & 7); -+ const unsigned int mask = (1 << (width >> (c_idx == 0 ? 3 : 2))) - 1; -+ -+ do { -+ unsigned int m = (*pcm >> sh) & mask; -+ uint8_t * bd = src1; -+ const uint8_t * bs = dst1; -+ while (m != 0) { -+ if ((m & 1) != 0) { -+ s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight); -+ } -+ m >>= 1; -+ bs += bwidth; -+ bd += bwidth; -+ } -+ src1 += stride_src * bheight; -+ dst1 += stride_dst * bheight; -+ pcm += s->ps.sps->pcm_width; -+ } while (--blks_y > 0); -+ } -+} -+ -+#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)]) -+ -+static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y) -+{ -+#if SAO_FILTER_N == 5 -+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; -+#elif SAO_FILTER_N == 6 -+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; -+#else -+#error Confused by size of sao fn array -+#endif -+ int c_idx; -+ int edges[4]; // 0 left 1 top 2 right 3 bottom -+ int x_ctb = x >> s->ps.sps->log2_ctb_size; -+ int y_ctb = y >> s->ps.sps->log2_ctb_size; -+ int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb; -+ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs]; -+ RpiSAOParams *sao = &CTB(s->sao, x_ctb, y_ctb); -+ // flags indicating unfilterable edges -+ uint8_t vert_edge[] = { 0, 0 }; -+ uint8_t horiz_edge[] = { 0, 0 }; -+ uint8_t diag_edge[] = { 0, 0, 0, 0 }; -+ uint8_t lfase = CTB(s->filter_slice_edges, x_ctb, y_ctb); -+ uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag && -+ !s->ps.pps->loop_filter_across_tiles_enabled_flag; -+ uint8_t restore = no_tile_filter || !lfase; -+ uint8_t left_tile_edge = 0; -+ uint8_t right_tile_edge = 0; -+ uint8_t up_tile_edge = 0; -+ 
uint8_t bottom_tile_edge = 0; -+ const int sliced = 1; -+ const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 3 : 1); -+ -+ edges[0] = x_ctb == 0; -+ edges[1] = y_ctb == 0; -+ edges[2] = x_ctb == s->ps.sps->ctb_width - 1; -+ edges[3] = y_ctb == s->ps.sps->ctb_height - 1; -+ -+#ifdef DISABLE_SAO -+ return; -+#endif -+ -+ if (restore) { -+ if (!edges[0]) { -+ left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -+ vert_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge; -+ } -+ if (!edges[2]) { -+ right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]]; -+ vert_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge; -+ } -+ if (!edges[1]) { -+ up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]]; -+ horiz_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge; -+ } -+ if (!edges[3]) { -+ bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]]; -+ horiz_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge; -+ } -+ if (!edges[0] && !edges[1]) { -+ diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge; -+ } -+ if (!edges[1] && !edges[2]) { -+ diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge; -+ } -+ if (!edges[2] && 
!edges[3]) { -+ diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge; -+ } -+ if (!edges[0] && !edges[3]) { -+ diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge; -+ } -+ } -+ -+ for (c_idx = 0; c_idx < plane_count; c_idx++) { -+ const unsigned int vshift = ctx_vshift(s, c_idx); -+ const unsigned int hshift = ctx_hshift(s, c_idx); -+ const int x0 = x >> hshift; -+ const int y0 = y >> vshift; -+ const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx); -+ const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift; -+ const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift; -+ const int width = FFMIN(ctb_size_h, (s->ps.sps->width >> hshift) - x0); -+ const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0); -+ int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; -+ ptrdiff_t stride_dst; -+ uint8_t *dst; -+ -+ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); -+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; -+ uint8_t * const src = !sliced ? -+ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : -+ c_idx == 0 ? -+ av_rpi_sand_frame_pos_y(s->frame, x0, y0) : -+ av_rpi_sand_frame_pos_c(s->frame, x0, y0); -+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : -+ !sliced ? src - (1 << sh) : -+ c_idx == 0 ? -+ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : -+ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); -+ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : -+ !sliced ? src + (width << sh) : -+ c_idx == 0 ? 
-+ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : -+ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); -+ -+ if (sliced && c_idx > 1) { -+ break; -+ } -+ -+// if (c_idx == 1) -+// printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr); -+ -+ switch (sao->type_idx[c_idx]) { -+ case SAO_BAND: -+ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, -+ x_ctb, y_ctb); -+ if (s->ps.pps->transquant_bypass_enable_flag || -+ s->ps.sps->pcm.loop_filter_disable_flag) -+ { -+ // Can't use the edge buffer here as it may be in use by the foreground -+ DECLARE_ALIGNED(64, uint8_t, dstbuf) -+ [2*MAX_PB_SIZE*MAX_PB_SIZE]; -+ dst = dstbuf; -+ stride_dst = 2*MAX_PB_SIZE; -+ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); -+ if (sliced && c_idx != 0) -+ { -+ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, -+ sao->offset_val[1], sao->band_position[1], -+ sao->offset_val[2], sao->band_position[2], -+ width, height); -+ } -+ else -+ { -+ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, -+ sao->offset_val[c_idx], sao->band_position[c_idx], -+ width, height); -+ } -+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, -+ x, y, width, height, c_idx); -+ } else { -+ if (sliced && c_idx != 0) -+ { -+ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, -+ sao->offset_val[1], sao->band_position[1], -+ sao->offset_val[2], sao->band_position[2], -+ width, height); -+ } -+ else -+ { -+ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, -+ sao->offset_val[c_idx], sao->band_position[c_idx], -+ width, height); -+ } -+ } -+ sao->type_idx[c_idx] = SAO_APPLIED; -+ break; -+ case SAO_EDGE: -+ { -+ const int w = s->ps.sps->width >> hshift; -+ const int h = s->ps.sps->height >> vshift; -+ int top_edge = edges[1]; -+ int bottom_edge = edges[3]; -+ // Can't use the edge buffer here as it may be in use by the foreground -+ DECLARE_ALIGNED(64, uint8_t, dstbuf) -+ 
[RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64]; -+ -+ stride_dst = RPI_HEVC_SAO_BUF_STRIDE; -+ dst = dstbuf + stride_dst + 32; -+ -+ if (!top_edge) { -+ uint8_t *dst1; -+ int src_idx; -+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); -+ -+ dst1 = dst - stride_dst; -+ -+ if (src_l != NULL) { -+ src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == -+ SAO_APPLIED); -+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); -+ } -+ -+ src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == -+ SAO_APPLIED); -+ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh); -+ -+ if (src_r != NULL) { -+ src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == -+ SAO_APPLIED); -+ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh); -+ } -+ } -+ if (!bottom_edge) { -+ uint8_t * const dst1 = dst + height * stride_dst; -+ int src_idx; -+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); -+ const unsigned int hoff = height * stride_src; -+ -+ if (src_l != NULL) { -+ src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == -+ SAO_APPLIED); -+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh); -+ } -+ -+ src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == -+ SAO_APPLIED); -+ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); -+ -+ if (src_r != NULL) { -+ src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == -+ SAO_APPLIED); -+ copy_pixel(dst1 + (width << sh), src_idx ? 
src_spb + (width << sh) : src_r + hoff, sh); -+ } -+ } -+ if (src_l != NULL) { -+ if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { -+ ff_hevc_rpi_copy_vert(dst - (1 << sh), -+ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), -+ sh, height, stride_dst, 1 << sh); -+ } else { -+ ff_hevc_rpi_copy_vert(dst - (1 << sh), -+ src_l, -+ sh, height, stride_dst, stride_src); -+ } -+ } -+ if (src_r != NULL) { -+ if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { -+ ff_hevc_rpi_copy_vert(dst + (width << sh), -+ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), -+ sh, height, stride_dst, 1 << sh); -+ } else { -+ ff_hevc_rpi_copy_vert(dst + (width << sh), -+ src_r, -+ sh, height, stride_dst, stride_src); -+ } -+ } -+ -+ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); -+ -+ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, -+ x_ctb, y_ctb); -+ if (sliced && c_idx != 0) -+ { -+ // Class always the same for both U & V (which is just as well :-)) -+ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, -+ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], -+ width, height); -+ s->hevcdsp.sao_edge_restore_c[restore](src, dst, -+ stride_src, stride_dst, -+ sao, -+ edges, width, -+ height, c_idx, -+ vert_edge, -+ horiz_edge, -+ diag_edge); -+ } -+ else -+ { -+ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], -+ sao->eo_class[c_idx], width, height); -+ s->hevcdsp.sao_edge_restore[restore](src, dst, -+ stride_src, stride_dst, -+ sao, -+ edges, width, -+ height, c_idx, -+ vert_edge, -+ horiz_edge, -+ diag_edge); -+ } -+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, -+ x, y, width, height, c_idx); -+ sao->type_idx[c_idx] = SAO_APPLIED; -+ break; -+ } -+ } -+ } -+ -+#if RPI_ZC_SAND_8_IN_10_BUF -+ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && -+ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 
255) == 0 || edges[2])) -+ { -+ const unsigned int stride1 = frame_stride1(s->frame, 1); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); -+ const unsigned int xoff = (x >> 8) * stride2 * stride1; -+ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); -+ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; -+ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; -+ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; -+ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; -+ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); -+ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y; -+ -+// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); -+ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); -+ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); -+ } -+#endif -+} -+ -+// When bits are delivered to deblock we want them -+//#define TL 1 -+//#define TR 2 -+//#define BL 4 -+//#define BR 8 -+ -+// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br -+// so we need to rearrange before passing on -+ -+static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) -+{ -+ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; -+ return (pcm[0] | -+ (pcm[1] << 8) | -+ (pcm[s->ps.sps->pcm_width] << 16) | -+ (pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7); -+} -+ -+static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) -+{ -+ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; -+ return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7); -+} -+ -+// We cast away const here as we 
want this to work for both get and set -+static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) -+{ -+ return (uint32_t *)(bs + -+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 -+#warning Unexpected masks -+ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes -+ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & -+ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) + -+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 -+#error Stride1 < return size -+#endif -+ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + -+ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); -+} -+ -+static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) -+{ -+ return (uint8_t *)(bs + -+ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & -+ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) + -+ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + -+ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); -+} -+ -+ -+// Get block strength -+// Given how we call we will always get within the 32bit boundries -+static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2, -+ unsigned int xl, unsigned int xr, const unsigned int y) -+{ -+ if (xr <= xl) { -+ return 0; -+ } -+ else -+ { -+#if HAVE_ARMV6T2_INLINE -+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 -+#error This case not yet handled in bs_get32 -+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 -+#error Stride1 < return size -+#endif -+ uint32_t tmp; -+ __asm__ ( -+ "lsr %[tmp], %[xl], %[xl_shift] \n\t" -+ "rsb %[xr], %[xl], %[xr] \n\t" -+ "mla %[stride2], %[stride2], %[tmp], %[bs] \n\t" -+ "add %[xr], %[xr], #7 \n\t" -+ "lsr %[bs], %[y], %[y_shift1] \n\t" -+ "bic %[xr], %[xr], #7 \n\t" -+ "ubfx %[xl], %[xl], #1, #5 \n\t" -+ "lsr %[xr], %[xr], #1 \n\t" -+ "cmp %[xr], #32 \n\t" -+ 
"mvn %[tmp], #0 \n\t" -+ "ldr %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t" -+ "lsl %[tmp], %[tmp], %[xr] \n\t" -+ "lsr %[xl], %[bs], %[xl] \n\t" -+ "it ne \n\t" -+ "bicne %[bs], %[xl], %[tmp] \n\t" -+ : // Outputs -+ [bs]"+r"(bs), -+ [stride2]"+r"(stride2), -+ [xl]"+r"(xl), -+ [xr]"+r"(xr), -+ [tmp]"=&r"(tmp) -+ : // Inputs -+ [y]"r"(y), -+ [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT), -+ [y_shift1]"M"(HEVC_RPI_BS_Y_SHR), -+ [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) -+ : // Clobbers -+ "cc" -+ ); -+ return (uint32_t) bs; -+#else -+ const uint32_t a = *bs_ptr32(bs, stride2, xl, y); -+ const unsigned int n = ((xr - xl + 7) & ~7) >> 1; -+ -+ return n == 32 ? a : -+ (a >> ((xl >> 1) & 31)) & ~(~0U << n); -+#endif -+ } -+} -+ -+static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) -+{ -+ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); -+ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y); -+} -+ -+static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) -+{ -+ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); -+ return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y); -+} -+ -+ -+static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) -+{ -+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; -+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; -+ const unsigned int ctb_size = (1 << log2_ctb_size); -+ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 1); -+ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; -+ const DBParams * cb_dbp = s->deblock + ctb_n; -+ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 
0 : 8); -+ -+ unsigned int cb_x; -+ -+ // Do in CTB-shaped blocks -+ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp) -+ { -+ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); -+ const unsigned int bv_l = FFMAX(cb_x, 8); -+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r - 8 : cb_x + ctb_size - 9; -+ const unsigned int bh_l = bv_l - 8; -+ unsigned int y; -+ -+ // Main body -+ for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8) -+ { -+ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y); -+ -+ const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp; -+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ -+ if (vbs != 0) -+ { -+ const uint8_t * const tcv = tctable + dbp->tc_offset; -+ const uint8_t * const betav = betatable + dbp->beta_offset; -+ unsigned int pcmfa = pcm2(s, bv_l - 1, y); -+ unsigned int x; -+ -+ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1) -+ { -+ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3) -+ { -+ const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), -+ frame_stride1(s->frame, LUMA), -+ betav[qp], -+ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) | -+ (((vbs & 0xc) == 0 ? 
0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16), -+ pcmfa & 3, -+ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); -+ } -+ } -+ } -+ -+ if (y != 0) -+ { -+ uint32_t hbs; -+ -+ // H left - mostly separated out so we only need a uint32_t hbs -+ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0) -+ { -+ const unsigned int x = bh_l; -+ const unsigned int pcmfa = pcm4(s, bh_l, y - 1); -+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ const DBParams * const dbph = dbp - 1; -+ const uint8_t * const tc = tctable + dbph->tc_offset + qp; -+ -+ av_assert2(cb_x - bh_l == 8); -+ -+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), -+ frame_stride1(s->frame, LUMA), -+ betatable[qp + dbph->beta_offset], -+ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | -+ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), -+ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); -+ } -+ -+ // H -+ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop -+ { -+ unsigned int x; -+ unsigned int pcmfa = pcm4(s, cb_x, y - 1); -+ -+ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1) -+ { -+ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0) -+ { -+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ const uint8_t * const tc = tctable + dbp->tc_offset + qp; -+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), -+ frame_stride1(s->frame, LUMA), -+ betatable[qp + dbp->beta_offset], -+ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | -+ (((hbs & 0xc) == 0 ? 
0 : tc[(hbs >> 2) & 2]) << 16), -+ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); -+ } -+ } -+ } -+ } -+ -+ } -+ } -+} -+ -+static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) -+{ -+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; -+ const int8_t * const qt = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ return (qt[(x - 1) >> log2_min_cb_size] + qt[x >> log2_min_cb_size] + 1) >> 1; -+} -+ -+static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) -+{ -+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; -+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; -+ const unsigned int ctb_size = (1 << log2_ctb_size); -+ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 8); -+ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; -+ const DBParams * dbp = s->deblock + ctb_n; -+ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 
0 : 8); -+ const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1]; -+ const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2]; -+ -+ unsigned int cb_x; -+ -+ av_assert1((bounds.x & (ctb_size - 1)) == 0); -+ av_assert1((bounds.y & (ctb_size - 1)) == 0); -+ av_assert1(bounds.h <= ctb_size); -+ -+ // Do in CTB-shaped blocks -+ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) { -+ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); -+ const unsigned int bv_l = FFMAX(cb_x, 16); -+ unsigned int y; -+ -+ // V above -+ if (bounds.y != 0) { -+ // Deblock V up 8 -+ // CTB above current -+ // Top-half only (tc4 & ~0xffff == 0) is special cased in asm -+ const unsigned int y = bounds.y - 8; -+ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U; -+ -+ if (vbs != 0) -+ { -+ unsigned int pcmfa = pcm2(s, bv_l - 1, y); -+ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset; -+ unsigned int x; -+ -+ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) -+ { -+ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0) -+ { -+ const int qp0 = q2h(s, x, y); -+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ frame_stride1(s->frame, 1), -+ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8), -+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), -+ pcmfa & 3); -+ } -+ } -+ } -+ } -+ -+ for (y = bounds.y; y < b_b; y += 16) -+ { -+ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) | -+ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4); -+ -+ // V -+ if (vbs != 0) -+ { -+ unsigned int x; -+ unsigned int pcmfa = -+ (y + 16 > b_b ? 
-+ pcm2(s, bv_l - 1, y) | 0xffff0000 : -+ pcm4(s, bv_l - 1, y)); -+ const uint8_t * const tc = tctable + 2 + dbp->tc_offset; -+ -+ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) -+ { -+ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) -+ { -+ const int qp0 = q2h(s, x, y); -+ const int qp1 = q2h(s, x, y + 8); -+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ frame_stride1(s->frame, 1), -+ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | -+ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), -+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), -+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); -+ } -+ } -+ } -+ -+ // H -+ if (y != 0) -+ { -+ uint32_t hbs; -+ const unsigned int bh_l = bv_l - 16; -+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16; -+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ -+ // H left - mostly separated out so we only need a uint32_t hbs -+ // Stub is width 8 to the left of bounds, but width 16 internally -+ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0) -+ { -+ unsigned int pcmfa = pcm4(s, bh_l, y - 1); -+ -+ // Chop off bits we don't want... -+ if (bh_l < bounds.x) { -+ pcmfa |= 0x10001; // TL|BL pre rearrangement -+ hbs &= ~3; // Make BS 0 -+ } -+ -+ // Double check we still want this -+ if (hbs != 0 && (~pcmfa & 0x30003) != 0) -+ { -+ const unsigned int x = bh_l; -+ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; -+ const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset; -+ -+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ frame_stride1(s->frame, 1), -+ ((hbs & 2) == 0 ? 
0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | -+ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), -+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); -+ } -+ } -+ -+ // H main -+ if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0) -+ { -+ unsigned int x; -+ unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it -+ -+ for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2) -+ { -+ if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) -+ { -+ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; -+ const uint8_t * const tc = tctable + 2 + dbp->tc_offset; -+ -+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ frame_stride1(s->frame, 1), -+ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | -+ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), -+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); -+ } -+ } -+ } -+ } -+ } -+ } -+} -+ -+static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n) -+{ -+ return x & ~(~0U << log2_n); -+} -+ -+static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) -+{ -+ av_assert2((y & 7) == 0); -+ -+ // This doesn't have the same simultainious update issues that bsf_stash -+ // does (other threads will have a different y) so we can do it the easy way -+ if ((bsf &= mask) != 0) -+ *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31); -+} -+ -+ -+static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) -+{ -+ // We arrange this in a slightly odd fashion but it lines up with -+ // how we are going to use it in the actual deblock code & it is easier -+ // to 
do the contortions here than there -+ // -+ // Arrange (LE) {x0y0, x0y4, x8y0, x8,y4}, {x16y0, x16y4, x24y0, x24y4},... -+ -+ av_assert2((x & 7) == 0); -+ -+ if ((bsf &= mask) != 0) -+ { -+ uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y); -+ const unsigned int sh = ((x & 8) | (y & 4)) >> 1; -+ -+ if (mask <= 0xf) -+ { -+ *p |= (bsf << sh); -+ } -+ else -+ { -+ do { -+ *p |= (bsf & 0xf) << sh; -+ p += HEVC_RPI_BS_STRIDE1_BYTES; -+ } while ((bsf >>= 4) != 0); -+ } -+ } -+} -+ -+static inline uint32_t bsf_mv(const HEVCRpiContext * const s, -+ const unsigned int rep, const unsigned int dup, -+ const unsigned int mvf_stride0, -+ const unsigned int mvf_stride1, -+ const RefPicList * const rpl_p, const RefPicList * const rpl_q, -+ const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q) -+{ -+ return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup, -+ mvf_p, mvf_q, -+ rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list, -+ sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1); -+} -+ -+ -+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, -+ const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_trafo_size, -+ const int is_coded_block) -+{ -+ const HEVCRpiMvField * const mvf_curr = mvf_stash_ptr(s, lc, x0, y0); -+ const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE; -+ const RefPicList * const rpl = s->refPicList; -+ // Rep count for bsf_mv when running with min_pu chuncks -+ const unsigned int log2_rep_min_pu = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size; -+ const unsigned int boundary_flags = s->sh.no_dblk_boundary_flags & lc->boundary_flags; -+ const unsigned int trafo_size = (1U << log2_trafo_size); -+ const uint32_t bsf_mask = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1; -+ const uint32_t bsf_cbf = (bsf_mask & 0x55555555); -+ -+ // Do we cover a pred split line? 
-+ const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split; -+ const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split; -+ -+ uint32_t bsf_h; -+ uint32_t bsf_v; -+ -+#ifdef DISABLE_STRENGTHS -+ return; -+#endif -+ -+ // We are always on a size boundary -+ av_assert2((x0 & (trafo_size - 1)) == 0); -+ av_assert2((y0 & (trafo_size - 1)) == 0); -+ // log2_trafo_size not really a transform size; we can have to deal -+ // with size 2^6 blocks -+ av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6); -+ -+ // Retrieve and update coded (b0), intra (b1) bs flags -+ // -+ // Store on min width (rather than uint32_t) to avoid possible issues -+ // with another thread on another core running wpp using the same -+ // memory (min CTB = 16 pels = 4 bsf els = 8 bits) -+ // -+ // In bsf BS=2 is represented by 3 as it is much easier to test & set -+ // and the actual deblock code tests for 0 and b1 set/not-set so 2 and -+ // 3 will work the same -+ { -+ // Given where we are called from is_cbf_luma & is_intra will be constant over the block -+ const uint32_t bsf0 = (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask : is_coded_block ? 
bsf_cbf : 0; -+ uint8_t *const p = s->bsf_stash_up + (x0 >> 4); -+ uint8_t *const q = s->bsf_stash_left + (y0 >> 4); -+ -+ switch (log2_trafo_size) -+ { -+ case 2: -+ case 3: -+ { -+ const unsigned int sh_h = (x0 >> 1) & 7; -+ const unsigned int sh_v = (y0 >> 1) & 7; -+ bsf_h = *p; -+ bsf_v = *q; -+ *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h); -+ *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v); -+ bsf_h >>= sh_h; -+ bsf_v >>= sh_v; -+ break; -+ } -+ case 4: -+ bsf_h = *p; -+ bsf_v = *q; -+ *p = bsf0; -+ *q = bsf0; -+ break; -+ case 5: -+ bsf_h = *(uint16_t *)p; -+ bsf_v = *(uint16_t *)q; -+ *(uint16_t *)p = bsf0; -+ *(uint16_t *)q = bsf0; -+ break; -+ case 6: -+ default: -+ bsf_h = *(uint32_t *)p; -+ bsf_v = *(uint32_t *)q; -+ *(uint32_t *)p = bsf0; -+ *(uint32_t *)q = bsf0; -+ break; -+ } -+ -+ bsf_h |= bsf0; -+ bsf_v |= bsf0; -+ } -+ -+ // Do Horizontal -+ if ((y0 & 7) == 0) -+ { -+ // Boundary upper -+ if (y0 != 0 && -+ (off_boundary(y0, s->ps.sps->log2_ctb_size) || -+ (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0)) -+ { -+ // Look at MVs (BS=1) if we don't already has a full set of bs bits -+ if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split)) -+ { -+ // If we aren't on the top boundary we must be in the middle -+ // and in that case we know where mvf can change -+ const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0; -+ const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ? 
-+ s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] : -+ rpl; -+ -+ bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), -+ trafo_size >> (log2_min_pu_size + log2_rep), -+ trafo_size >> (log2_min_pu_size + log2_rep), -+ rpl, rpl_top, -+ mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1)); -+ } -+ -+ // Finally put the results into bs -+ hbs_set(s, x0, y0, bsf_mask, bsf_h); -+ } -+ -+ // Max of 1 pu internal split - ignore if not on 8pel boundary -+ if (has_y_split && !off_boundary(lc->cu.y_split, 3)) -+ { -+ const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split); -+ // If we have the x split as well then it must be in the middle -+ const unsigned int log2_rep = has_x_split ? 1 : 0; -+ -+ hbs_set(s, x0, lc->cu.y_split, bsf_mask, -+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), -+ trafo_size >> (log2_min_pu_size + log2_rep), -+ trafo_size >> (log2_min_pu_size + log2_rep), -+ rpl, rpl, -+ mvf, mvf - MVF_STASH_WIDTH_PU)); -+ } -+ } -+ -+ // And again for vertical - same logic as horizontal just in the other direction -+ if ((x0 & 7) == 0) -+ { -+ // Boundary left -+ if (x0 != 0 && -+ (off_boundary(x0, s->ps.sps->log2_ctb_size) || -+ (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0)) -+ { -+ if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split)) -+ { -+ const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0; -+ const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ? 
-+ s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] : -+ rpl; -+ -+ bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), -+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), -+ (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep), -+ rpl, rpl_left, -+ mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0)); -+ } -+ -+ vbs_set(s, x0, y0, bsf_mask, bsf_v); -+ } -+ -+ if (has_x_split && !off_boundary(lc->cu.x_split, 3)) -+ { -+ const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0); -+ const unsigned int log2_rep = has_y_split ? 1 : 0; -+ -+ vbs_set(s, lc->cu.x_split, y0, bsf_mask, -+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), -+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), -+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), -+ rpl, rpl, -+ mvf, mvf - 1)); -+ } -+ } -+} -+ -+#undef LUMA -+#undef CB -+#undef CR -+ -+static inline unsigned int ussub(const unsigned int a, const unsigned int b) -+{ -+ return a < b ? 
0 : a - b; -+} -+ -+static inline int cache_boundry(const AVFrame * const frame, const unsigned int x) -+{ -+ return ((x >> av_rpi_sand_frame_xshl(frame)) & ~63) == 0; -+} -+ -+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot) -+{ -+ const int ctb_size = (1 << s->ps.sps->log2_ctb_size); -+ int x, y; -+ -+ const unsigned int br = bounds.x + bounds.w; -+ const unsigned int bb = bounds.y + bounds.h; -+ -+ const int x_end = (br >= s->ps.sps->width); -+ const int y_end = (bb >= s->ps.sps->height); -+ -+ // Deblock may not touch the edges of the bound as they are still needed -+ // for Intra pred -+ // -+ // Deblock is disabled with a per-slice flag -+ // Given that bounds may cover multiple slices & we dblock outside bounds -+ // anyway we can't avoid deblock using that flag - about the only thing we -+ // could do is have a "no deblock seen yet" flag but it doesn't really -+ // seem worth the effort -+ -+ deblock_y_blk(s, bounds, x_end, y_end); -+ deblock_uv_blk(s, bounds, x_end, y_end); -+ -+ // SAO needs -+ // (a) CTB alignment -+ // (b) Valid pixels all the way around the CTB in particular it needs the DR pixel -+ { -+ const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1)); -+ const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1)); -+ const unsigned int yt = ussub(bounds.y, yo); -+ const unsigned int yb = y_end ? bb : ussub(bb, yo); -+ const unsigned int xl = ussub(bounds.x, xo); -+ const unsigned int xr = x_end ? br : ussub(br, xo); -+ -+ if (s->ps.sps->sao_enabled) -+ { -+ for (y = yt; y < yb; y += ctb_size) { -+ for (x = xl; x < xr; x += ctb_size) { -+ sao_filter_CTB(s, x, y); -+ } -+ } -+ } -+ -+ // Cache invalidate -+ y = 0; -+ if (xr != 0 && yb != 0) -+ { -+ const unsigned int llen = -+ (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame)); -+ const unsigned int mask = ~(llen - 1); -+ const unsigned int il = (xl == 0) ? 
0 : (xl - 1) & mask; -+ const unsigned int ir = x_end || !cache_boundry(s->frame, br) ? br : (xr - 1) & mask; -+ const unsigned int it = ussub(yt, 1); -+ const unsigned int ib = y_end ? bb : yb - 1; -+ -+ if (il < ir) { -+ rpi_cache_buf_t cbuf; -+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf); -+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ il, it, ir - il, ib - it, -+ ctx_vshift(s, 1), 1, 1); -+ -+ // If we have to commit the right hand tile boundry due to -+ // cache boundry considerations then at EoTile we must commit -+ // that boundry to bottom of tile (bounds) -+ if (ib != bb && ir == br && eot) { -+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ br - 1, ib, 1, bb - ib, -+ ctx_vshift(s, 1), 1, 1); -+ } -+ -+ rpi_cache_flush_finish(rfe); -+ -+ if (x_end) -+ y = y_end ? INT_MAX : ib; -+ -+// printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1); -+ } -+ } -+ } -+ -+ return y; -+} -+ -diff --git a/libavcodec/rpi_hevc_mv.h b/libavcodec/rpi_hevc_mv.h -new file mode 100644 -index 0000000000..6b36f5e737 ---- /dev/null -+++ b/libavcodec/rpi_hevc_mv.h -@@ -0,0 +1,71 @@ -+#ifndef AVCODEC_RPI_HEVC_MV_H -+#define AVCODEC_RPI_HEVC_MV_H -+ -+#include "config.h" -+ -+typedef int32_t MvXY; -+ -+typedef struct HEVCRpiMvField { -+ MvXY xy[2]; -+ int8_t ref_idx[2]; -+ int8_t pred_flag; -+ int8_t dummy; // To 12 bytes -+} HEVCRpiMvField; -+ -+ -+#define MV_X(xy) (((xy) << 16) >> 16) -+#define MV_Y(xy) ((xy) >> 16) -+#define MV_XY(x, y) ((x & 0xffff) | ((y) << 16)) -+ -+#if ARCH_ARM -+#include "arm/rpi_hevc_mv_arm.h" -+#endif -+ -+#ifndef mvxy_add -+static inline MvXY mvxy_add(const MvXY a, const MvXY b) -+{ -+ return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b)); -+} -+#endif -+ -+ -+#ifndef mv_scale_xy -+static inline MvXY mv_scale_xy(const MvXY const src, int td, int tb) -+{ -+ int tx, scale_factor; -+ -+ td = td == 0 ? 
1 : av_clip_int8(td); -+ tb = av_clip_int8(tb); -+ tx = (0x4000 + (abs(td) >> 1)) / td; -+ scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12); -+ return MV_XY( -+ av_clip_int16((scale_factor * MV_X(src) + 127 + -+ (scale_factor * MV_X(src) < 0)) >> 8), -+ av_clip_int16((scale_factor * MV_Y(src) + 127 + -+ (scale_factor * MV_Y(src) < 0)) >> 8)); -+} -+#endif -+ -+// 8.3.1 states that the bitstream may not contain poc diffs that do not -+// fit in 16 bits, so given that we don't care about the high bits we only -+// store the low 16 + LT & Inter flags -+ -+#define COL_POC_INTRA 0 -+#define COL_POC_INTER (1 << 16) -+#define COL_POC_LT (1 << 17) -+#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y))) -+#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff)) -+#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0) -+ -+typedef struct ColMv_s { -+ int32_t poc; -+ int32_t xy; -+} ColMv; -+ -+typedef struct ColMvField_s { -+ ColMv L[2]; -+} ColMvField; -+ -+ -+ -+#endif // AVCODEC_RPI_HEVC_MV_H -diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c -new file mode 100644 -index 0000000000..27a9f69525 ---- /dev/null -+++ b/libavcodec/rpi_hevc_mvs.c -@@ -0,0 +1,487 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2013 Anand Meher Kotra -+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "hevc.h" -+#include "rpi_hevcdec.h" -+ -+static av_always_inline int -+is_eq_mer(const unsigned int plevel, -+ const unsigned int xN, const unsigned int yN, -+ const unsigned int xP, const unsigned int yP) -+{ -+ return (((xN ^ xP) | (yN ^ yP)) >> plevel) == 0; -+} -+ -+// check if the mv's and refidx are the same between A and B -+static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) -+{ -+ return a->pred_flag == b->pred_flag && -+ ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) && -+ ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1])); -+ return 0; -+} -+ -+/* -+ * 8.5.3.1.7 temporal luma motion vector prediction -+ */ -+static int temporal_luma_motion_vector(const HEVCRpiContext * const s, -+ const HEVCRpiLocalContext * const lc, const int x0, const int y0, -+ const int nPbW, const int nPbH, const int refIdxLx, -+ MvXY * const mvLXCol, const int X) -+{ -+ int x, y; -+ const ColMv * cmv = NULL; -+ -+ HEVCRpiFrame * const col_ref = s->ref->collocated_ref; -+ const RefPicList * const refPicList = s->refPicList + X; -+ const int cur_lt = refPicList->isLongTerm[refIdxLx]; -+ -+ *mvLXCol = 0; -+ // Unlikely but we might have a col_ref IDR frame! 
-+ if (col_ref->col_mvf == NULL) -+ return 0; -+ -+ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH); -+ -+ //bottom right collocated motion vector -+ x = x0 + nPbW; -+ y = y0 + nPbH; -+ -+ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && -+ y < s->ps.sps->height && -+ x < s->ps.sps->width) -+ { -+ const ColMvField * const col = col_ref->col_mvf + (x >> 4) + -+ (y >> 4) * s->col_mvf_stride; -+ -+ if (col->L[0].poc != COL_POC_INTRA && -+ (col->L[1].poc == COL_POC_INTRA || -+ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) -+ { -+ cmv = col->L + 0; -+ } -+ else if (col->L[1].poc != COL_POC_INTRA) -+ { -+ cmv = col->L + 1; -+ } -+ } -+ -+ // derive center collocated motion vector -+ if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt) -+ { -+ cmv = NULL; -+ x = x0 + (nPbW >> 1); -+ y = y0 + (nPbH >> 1); -+ -+ { -+ const ColMvField * const col = col_ref->col_mvf + (x >> 4) + -+ (y >> 4) * s->col_mvf_stride; -+ -+ if (col->L[0].poc != COL_POC_INTRA && -+ (col->L[1].poc == COL_POC_INTRA || -+ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) -+ { -+ cmv = col->L + 0; -+ } -+ else if (col->L[1].poc != COL_POC_INTRA) -+ { -+ cmv = col->L + 1; -+ } -+ } -+ } -+ -+ if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc)) -+ return 0; -+ -+ { -+ const int col_poc = col_ref->poc; -+ const int ref_poc = refPicList->list[refIdxLx]; -+ -+ *mvLXCol = (cur_lt || -+ cmv->poc == col_poc || -+ COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ? 
-+ cmv->xy : -+ mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc); -+ } -+ -+ return cmv != NULL; -+} -+ -+static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) -+{ -+ return b != NULL && compare_mv_ref_idx(a, b); -+} -+ -+ -+ -+/* -+ * 8.5.3.1.2 Derivation process for spatial merging candidates -+ */ -+static inline const HEVCRpiMvField * -+derive_spatial_merge_candidates( -+ const HEVCRpiContext * const s, -+ const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int nPbW, const unsigned int nPbH, -+ const unsigned int avail, -+ const unsigned int part_idx, -+ const unsigned int merge_idx, -+ HEVCRpiMvField * const mvf_t) -+{ -+ const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N); -+ const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD); -+ -+ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); -+ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); -+ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); -+ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; -+ const unsigned int plevel = s->ps.pps->log2_parallel_merge_level; -+ const unsigned int part_mode = lc->cu.part_mode; -+ -+ const HEVCRpiMvField * perm[4]; -+ unsigned int nb_merge_cand = 0; -+ -+ // singleMCLFlag => part_idx == 0 so no need to test for it -+ if ((avail & AVAIL_L) == 0 || -+ (part_idx == 1 && -+ ((parts_a1 >> part_mode) & 1) != 0 || -+ is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) || -+ mvf_a1->pred_flag == PF_INTRA) -+ { -+ mvf_a1 = NULL; -+ } -+ else -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_a1; -+ perm[nb_merge_cand++] = mvf_a1; -+ } -+ -+ if ((avail & AVAIL_U) == 0 || -+ (part_idx == 1 && -+ ((parts_b1 >> part_mode) & 1) != 0 || -+ is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) || -+ mvf_b1->pred_flag == PF_INTRA) -+ { -+ mvf_b1 = 
NULL; -+ } -+ else if (!mvf_eq(mvf_b1, mvf_a1)) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_b1; -+ perm[nb_merge_cand++] = mvf_b1; -+ } -+ -+ // above right spatial merge candidate -+ // Never need mvf_b0 again so don't bother zeroing if navail -+ if ((avail & AVAIL_UR) != 0 && -+ !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) && -+ mvf_b0->pred_flag != PF_INTRA && -+ !mvf_eq(mvf_b0, mvf_b1)) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_b0; -+ perm[nb_merge_cand++] = mvf_b0; -+ } -+ -+ // left bottom spatial merge candidate -+ // Never need mvf_a0 again so don't bother zeroing if navail -+ if ((avail & AVAIL_DL) != 0 && -+ !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) && -+ mvf_a0->pred_flag != PF_INTRA && -+ !mvf_eq(mvf_a0, mvf_a1)) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_a0; -+ perm[nb_merge_cand++] = mvf_a0; -+ } -+ -+ // above left spatial merge candidate -+ if (nb_merge_cand != 4 && -+ (avail & AVAIL_UL) != 0 && -+ !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0)) -+ { -+ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL -+ -+ if (mvf_b2->pred_flag != PF_INTRA && -+ !mvf_eq(mvf_b2, mvf_a1) && -+ !mvf_eq(mvf_b2, mvf_b1)) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_b2; -+ perm[nb_merge_cand++] = mvf_b2; -+ } -+ } -+ -+ // temporal motion vector candidate -+ if (s->sh.slice_temporal_mvp_enabled_flag) -+ { -+ static const HEVCRpiMvField mvf_z = {{0}}; -+ -+ *mvf_t = mvf_z; -+ -+ if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, -+ 0, mvf_t->xy + 0, 0)) -+ mvf_t->pred_flag = PF_L0; -+ -+ if (s->sh.slice_type == HEVC_SLICE_B && -+ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, -+ 0, mvf_t->xy + 1, 1)) -+ mvf_t->pred_flag |= PF_L1; -+ -+ if (mvf_t->pred_flag != 0) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_t; -+ perm[nb_merge_cand++] = mvf_t; -+ } -+ } -+ -+ // combined bi-predictive merge candidates (applies for B slices) -+ if (s->sh.slice_type == HEVC_SLICE_B && 
nb_merge_cand > 1) -+ { -+ unsigned int comb_idx = 0; -+ const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1); -+ const RefPicList * const refPicList = s->refPicList; -+ -+ for (comb_idx = 0; comb_idx < cand_count; comb_idx++) -+ { -+ static const uint8_t l0_l1_cand_idx[12][2] = { -+ { 0, 1, }, -+ { 1, 0, }, -+ { 0, 2, }, -+ { 2, 0, }, -+ { 1, 2, }, -+ { 2, 1, }, -+ { 0, 3, }, -+ { 3, 0, }, -+ { 1, 3, }, -+ { 3, 1, }, -+ { 2, 3, }, -+ { 3, 2, }, -+ }; -+ -+ const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0]; -+ const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1]; -+ const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx]; -+ const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx]; -+ -+ if ((mvf_c0->pred_flag & PF_L0) != 0 && -+ (mvf_c1->pred_flag & PF_L1) != 0 && -+ (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] || -+ mvf_c0->xy[0] != mvf_c1->xy[1])) -+ { -+ if (merge_idx == nb_merge_cand++) -+ { -+ // Need to be a bit careful as we will construct mvf_t and we -+ // may already be using that as one of our condidates -+ // so build & copy rather than build in place -+ const HEVCRpiMvField mvf_m = { -+ .xy = { -+ mvf_c0->xy[0], -+ mvf_c1->xy[1]}, -+ .ref_idx = { -+ mvf_c0->ref_idx[0], -+ mvf_c1->ref_idx[1]}, -+ .pred_flag = PF_BI -+ }; -+ *mvf_t = mvf_m; -+ return mvf_t; -+ } -+ } -+ } -+ } -+ -+ // "append" Zero motion vector candidates -+ { -+ const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ? -+ FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0]; -+ const unsigned int zero_idx = merge_idx - nb_merge_cand; -+ -+ const HEVCRpiMvField mvf_m = { -+ .xy = {0, 0}, -+ .ref_idx = { -+ zero_idx < nb_refs ? zero_idx : 0, -+ (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0}, -+ .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? 
PF_BI : PF_L0 -+ }; -+ -+ *mvf_t = mvf_m; -+ return mvf_t; -+ } -+} -+ -+ -+// 8.5.3.1.1 Derivation process of luma Mvs for merge mode -+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, -+ int nPbH, int log2_cb_size, int part_idx, -+ int merge_idx, HEVCRpiMvField * const mv) -+{ -+ const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ? -+ derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8, -+ ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8), -+ 0, merge_idx, mv) : -+ derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, -+ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH), -+ part_idx, merge_idx, mv); -+ -+ if (mvf_m != mv) -+ *mv = *mvf_m; -+ -+ if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12) -+ mv->pred_flag = PF_L0; -+} -+ -+ -+static av_always_inline const MvXY * -+mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf) -+{ -+ if (mvf != NULL) -+ { -+ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0) -+ return mvf->xy + pfi0; -+ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0) -+ return mvf->xy + pfi1; -+ } -+ return NULL; -+} -+ -+static av_always_inline const MvXY * -+mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, -+ const int islt0, const int poc0, const int poc_cur, -+ MvXY * const mv_t, const HEVCRpiMvField * const mvf) -+{ -+ if (mvf != NULL) -+ { -+ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0) -+ { -+ const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]]; -+ if (islt0 || poc1 == poc0) { -+ return mvf->xy + pfi0; -+ } -+ *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0); -+ return mv_t; -+ } -+ if (((mvf->pred_flag >> pfi1) & 1) != 0 
&& rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0) -+ { -+ const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]]; -+ if (islt0 || poc1 == poc0) { -+ return mvf->xy + pfi1; -+ } -+ *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0); -+ return mv_t; -+ } -+ } -+ return NULL; -+} -+ -+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int nPbW, const unsigned int nPbH, -+ const unsigned int avail, -+ HEVCRpiMvField * const mv, -+ const unsigned int mvp_lx_flag, const unsigned int LX) -+{ -+ const unsigned int pfi0 = LX; -+ const unsigned int pfi1 = LX == 0 ? 1 : 0; -+ const RefPicList * const rpl = s->refPicList; -+ const int poc0 = rpl[LX].list[mv->ref_idx[LX]]; -+ const int poc_cur = s->poc; -+ const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]]; -+ -+ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); -+ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); -+ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL -+ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); -+ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; -+ const MvXY * mva = NULL; -+ const MvXY * mvb; -+ MvXY * const mv_rv = mv->xy + LX; -+ MvXY mvt_a, mvt_b; -+ -+ *mv_rv = 0; -+ -+ if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA) -+ mvf_a0 = NULL; -+ else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0) -+ goto use_mva; -+ -+ if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA) -+ mvf_a1 = NULL; -+ -+ if (mva == NULL && -+ (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL && -+ (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL) -+ mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1); -+ -+ if (mvp_lx_flag == 0 && mva != NULL) -+ goto use_mva; -+ -+ if 
((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA) -+ mvf_b0 = NULL; -+ if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA) -+ mvf_b1 = NULL; -+ if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA) -+ mvf_b2 = NULL; -+ -+ if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL && -+ (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL) -+ mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2); -+ -+ if (mvf_a0 == NULL && mvf_a1 == NULL) { -+ mva = mvb; -+ if (mvp_lx_flag == 0 && mva != NULL) -+ goto use_mva; -+ -+ if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL && -+ (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL) -+ mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2); -+ } -+ -+ if (mva == NULL) { -+ mva = mvb; -+ mvb = NULL; -+ } -+ -+ if (mvb != NULL && *mva == *mvb) // If A == B then ignore B -+ mvb = NULL; -+ -+ if (mvp_lx_flag == 0 && mva != NULL) { -+ goto use_mva; -+ } -+ else if (mvp_lx_flag != 0 && mvb != NULL) { -+ *mv_rv = *mvb; -+ } -+ else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) { -+ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, -+ nPbH, mv->ref_idx[LX], -+ mv_rv, LX); -+ } -+ return; -+ -+use_mva: -+ *mv_rv = *mva; -+ return; -+} -+ -diff --git a/libavcodec/rpi_hevc_parse.c b/libavcodec/rpi_hevc_parse.c -new file mode 100644 -index 0000000000..e58a59ce5e ---- /dev/null -+++ b/libavcodec/rpi_hevc_parse.c -@@ -0,0 +1,143 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. 
-+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "bytestream.h" -+#include "h2645_parse.h" -+#include "hevc.h" -+#include "rpi_hevc_parse.h" -+ -+static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps, -+ HEVCSEIContext *sei, int is_nalff, int nal_length_size, -+ int err_recognition, int apply_defdispwin, void *logctx) -+{ -+ int i; -+ int ret = 0; -+ H2645Packet pkt = { 0 }; -+ -+ ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff, -+ nal_length_size, AV_CODEC_ID_HEVC, 1, 0); -+ if (ret < 0) { -+ goto done; -+ } -+ -+ for (i = 0; i < pkt.nb_nals; i++) { -+ H2645NAL *nal = &pkt.nals[i]; -+ -+ /* ignore everything except parameter sets and VCL NALUs */ -+ switch (nal->type) { -+ case HEVC_NAL_VPS: -+ ret = ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps); -+ if (ret < 0) -+ goto done; -+ break; -+ case HEVC_NAL_SPS: -+ ret = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin); -+ if (ret < 0) -+ goto done; -+ break; -+ case HEVC_NAL_PPS: -+ ret = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps); -+ if (ret < 0) -+ goto done; -+ break; -+ case HEVC_NAL_SEI_PREFIX: -+ case HEVC_NAL_SEI_SUFFIX: -+ ret = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type); -+ if (ret < 0) -+ goto done; -+ break; -+ default: -+ av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type); -+ break; -+ } -+ } -+ -+done: -+ ff_h2645_packet_uninit(&pkt); -+ if (err_recognition & AV_EF_EXPLODE) -+ return ret; -+ -+ return 0; -+} -+ -+int 
ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps, -+ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size, -+ int err_recognition, int apply_defdispwin, void *logctx) -+{ -+ int ret = 0; -+ GetByteContext gb; -+ -+ bytestream2_init(&gb, data, size); -+ -+ if (size > 3 && (data[0] || data[1] || data[2] > 1)) { -+ /* It seems the extradata is encoded as hvcC format. -+ * Temporarily, we support configurationVersion==0 until 14496-15 3rd -+ * is finalized. When finalized, configurationVersion will be 1 and we -+ * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */ -+ int i, j, num_arrays, nal_len_size; -+ -+ *is_nalff = 1; -+ -+ bytestream2_skip(&gb, 21); -+ nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1; -+ num_arrays = bytestream2_get_byte(&gb); -+ -+ /* nal units in the hvcC always have length coded with 2 bytes, -+ * so put a fake nal_length_size = 2 while parsing them */ -+ *nal_length_size = 2; -+ -+ /* Decode nal units from hvcC. 
*/ -+ for (i = 0; i < num_arrays; i++) { -+ int type = bytestream2_get_byte(&gb) & 0x3f; -+ int cnt = bytestream2_get_be16(&gb); -+ -+ for (j = 0; j < cnt; j++) { -+ // +2 for the nal size field -+ int nalsize = bytestream2_peek_be16(&gb) + 2; -+ if (bytestream2_get_bytes_left(&gb) < nalsize) { -+ av_log(logctx, AV_LOG_ERROR, -+ "Invalid NAL unit size in extradata.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff, -+ *nal_length_size, err_recognition, apply_defdispwin, -+ logctx); -+ if (ret < 0) { -+ av_log(logctx, AV_LOG_ERROR, -+ "Decoding nal unit %d %d from hvcC failed\n", -+ type, i); -+ return ret; -+ } -+ bytestream2_skip(&gb, nalsize); -+ } -+ } -+ -+ /* Now store right nal length size, that will be used to parse -+ * all other nals */ -+ *nal_length_size = nal_len_size; -+ } else { -+ *is_nalff = 0; -+ ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size, -+ err_recognition, apply_defdispwin, logctx); -+ if (ret < 0) -+ return ret; -+ } -+ -+ return ret; -+} -diff --git a/libavcodec/rpi_hevc_parse.h b/libavcodec/rpi_hevc_parse.h -new file mode 100644 -index 0000000000..4b4d032a16 ---- /dev/null -+++ b/libavcodec/rpi_hevc_parse.h -@@ -0,0 +1,36 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/** -+ * @file -+ * H.265 parser code -+ */ -+ -+#ifndef AVCODEC_RPI_HEVC_PARSE_H -+#define AVCODEC_RPI_HEVC_PARSE_H -+ -+#include -+ -+#include "rpi_hevc_ps.h" -+#include "rpi_hevc_sei.h" -+ -+int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps, -+ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size, -+ int err_recognition, int apply_defdispwin, void *logctx); -+ -+#endif /* AVCODEC_RPI_HEVC_PARSE_H */ -diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c -new file mode 100644 -index 0000000000..f4e31f7d1d ---- /dev/null -+++ b/libavcodec/rpi_hevc_ps.c -@@ -0,0 +1,1938 @@ -+/* -+ * HEVC Parameter Set decoding -+ * -+ * Copyright (C) 2012 - 2103 Guillaume Martres -+ * Copyright (C) 2012 - 2103 Mickael Raulet -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2013 Vittorio Giovara -+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/imgutils.h" -+#include "golomb.h" -+#include "rpi_hevc_data.h" -+#include "rpi_hevc_ps.h" -+#include "rpi_hevcdec.h" -+ -+static const uint8_t default_scaling_list_intra[] = { -+ 16, 16, 16, 16, 17, 18, 21, 24, -+ 16, 16, 16, 16, 17, 19, 22, 25, -+ 16, 16, 17, 18, 20, 22, 25, 29, -+ 16, 16, 18, 21, 24, 27, 31, 36, -+ 17, 17, 20, 24, 30, 35, 41, 47, -+ 18, 19, 22, 27, 35, 44, 54, 65, -+ 21, 22, 25, 31, 41, 54, 70, 88, -+ 24, 25, 29, 36, 47, 65, 88, 115 -+}; -+ -+static const uint8_t default_scaling_list_inter[] = { -+ 16, 16, 16, 16, 17, 18, 20, 24, -+ 16, 16, 16, 17, 18, 20, 24, 25, -+ 16, 16, 17, 18, 20, 24, 25, 28, -+ 16, 17, 18, 20, 24, 25, 28, 33, -+ 17, 18, 20, 24, 25, 28, 33, 41, -+ 18, 20, 24, 25, 28, 33, 41, 54, -+ 20, 24, 25, 28, 33, 41, 54, 71, -+ 24, 25, 28, 33, 41, 54, 71, 91 -+}; -+ -+static const AVRational vui_sar[] = { -+ { 0, 1 }, -+ { 1, 1 }, -+ { 12, 11 }, -+ { 10, 11 }, -+ { 16, 11 }, -+ { 40, 33 }, -+ { 24, 11 }, -+ { 20, 11 }, -+ { 32, 11 }, -+ { 80, 33 }, -+ { 18, 11 }, -+ { 15, 11 }, -+ { 64, 33 }, -+ { 160, 99 }, -+ { 4, 3 }, -+ { 3, 2 }, -+ { 2, 1 }, -+}; -+ -+ -+// pps_cb_qp_offset: -12,+12 -+// slice_cb_qp_offset: -12,+12 also -+// "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive." -+// cr_qp_offset_list[n]: -12,+12 -+// So worst case total offset: -24,+24 -+ -+#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6) -+#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 
51 : (n)) -+#define M(B,n) C(B,(-n)) -+ -+// Sizeof the QP_START_BLOCK -+#define QP_OFFSET_0 (8*6 + 12*2) -+#define QP_START(B) \ -+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ -+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ -+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ -+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ -+\ -+ M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\ -+ M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\ -+ M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\ -+ M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\ -+ M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\ -+ M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\ -+ M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\ -+ M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1) -+#define QP_END(B) \ -+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ -+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ -+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51) -+ -+#define T1(B)\ -+{\ -+ QP_START(B),\ -+ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ -+ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\ -+ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ -+ C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\ -+ C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\ -+ C(B,44), C(B,45),\ -+ C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\ -+ QP_END(B)\ -+} -+#define T0(B)\ -+{\ -+ QP_START(B),\ -+ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ -+ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\ -+ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ -+ C(B,30), C(B,31), C(B,32), C(B,33), 
C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), C(B,39),\ -+ C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\ -+ C(B,50), C(B,51),\ -+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ -+ QP_END(B)\ -+} -+ -+#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2) -+ -+static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)}; -+static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)}; -+ -+#undef T -+#undef C -+#undef QP_END -+ -+#define C(B,n) ((n)<0?0:(n)>51?51:(n)) -+// We do need a lot of -ve padding to cope with high bit depths that give -ve qps -+#define QP_DBLK_OFFSET_0 QP_OFFSET_0 -+#define QP_END(B)\ -+ 51, 51, 51, 51, 51, 51 -+ -+// These don't need all the padding we have here (12 top/bottom would be enough) -+static const uint8_t qp_c_dblk_0[] = T0(0); -+static const uint8_t qp_c_dblk_1[] = T1(0); -+ -+#undef T -+#undef M -+#undef C -+#undef QP_END -+#undef QP_START -+ -+ -+static void remove_pps(HEVCRpiParamSets * const s, const int id) -+{ -+ if (s->pps_list[id] && s->pps == (const HEVCRpiPPS*)s->pps_list[id]->data) -+ s->pps = NULL; -+ av_buffer_unref(&s->pps_list[id]); -+} -+ -+static void remove_sps(HEVCRpiParamSets * const s, const int id) -+{ -+ int i; -+ if (s->sps_list[id]) { -+ if (s->sps == (const HEVCRpiSPS*)s->sps_list[id]->data) -+ s->sps = NULL; -+ -+ /* drop all PPS that depend on this SPS */ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++) -+ if (s->pps_list[i] && ((HEVCRpiPPS*)s->pps_list[i]->data)->sps_id == id) -+ remove_pps(s, i); -+ -+ av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data)); -+ } -+ av_buffer_unref(&s->sps_list[id]); -+} -+ -+static void remove_vps(HEVCRpiParamSets * const s, const int id) -+{ -+ int i; -+ if (s->vps_list[id]) { -+ if (s->vps == (const HEVCRpiVPS*)s->vps_list[id]->data) -+ s->vps = NULL; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->sps_list); i++) -+ if 
(s->sps_list[i] && ((HEVCRpiSPS*)s->sps_list[i]->data)->vps_id == id) -+ remove_sps(s, i); -+ } -+ av_buffer_unref(&s->vps_list[id]); -+} -+ -+int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx, -+ ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header) -+{ -+ uint8_t rps_predict = 0; -+ int delta_poc; -+ int k0 = 0; -+ int k1 = 0; -+ int k = 0; -+ int i; -+ -+ if (rps != sps->st_rps && sps->nb_st_rps) -+ rps_predict = get_bits1(gb); -+ -+ if (rps_predict) { -+ const ShortTermRPS *rps_ridx; -+ int delta_rps; -+ unsigned abs_delta_rps; -+ uint8_t use_delta_flag = 0; -+ uint8_t delta_rps_sign; -+ -+ if (is_slice_header) { -+ unsigned int delta_idx = get_ue_golomb_long(gb) + 1; -+ if (delta_idx > sps->nb_st_rps) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid value of delta_idx in slice header RPS: %d > %d.\n", -+ delta_idx, sps->nb_st_rps); -+ return AVERROR_INVALIDDATA; -+ } -+ rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx]; -+ rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs; -+ } else -+ rps_ridx = &sps->st_rps[rps - sps->st_rps - 1]; -+ -+ delta_rps_sign = get_bits1(gb); -+ abs_delta_rps = get_ue_golomb_long(gb) + 1; -+ if (abs_delta_rps < 1 || abs_delta_rps > 32768) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid value of abs_delta_rps: %d\n", -+ abs_delta_rps); -+ return AVERROR_INVALIDDATA; -+ } -+ delta_rps = (1 - (delta_rps_sign << 1)) * abs_delta_rps; -+ for (i = 0; i <= rps_ridx->num_delta_pocs; i++) { -+ int used = rps->used[k] = get_bits1(gb); -+ -+ if (!used) -+ use_delta_flag = get_bits1(gb); -+ -+ if (used || use_delta_flag) { -+ if (i < rps_ridx->num_delta_pocs) -+ delta_poc = delta_rps + rps_ridx->delta_poc[i]; -+ else -+ delta_poc = delta_rps; -+ rps->delta_poc[k] = delta_poc; -+ if (delta_poc < 0) -+ k0++; -+ else -+ k1++; -+ k++; -+ } -+ } -+ -+ if (k >= FF_ARRAY_ELEMS(rps->used)) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid num_delta_pocs: %d\n", k); -+ return 
AVERROR_INVALIDDATA; -+ } -+ -+ rps->num_delta_pocs = k; -+ rps->num_negative_pics = k0; -+ // sort in increasing order (smallest first) -+ if (rps->num_delta_pocs != 0) { -+ int used, tmp; -+ for (i = 1; i < rps->num_delta_pocs; i++) { -+ delta_poc = rps->delta_poc[i]; -+ used = rps->used[i]; -+ for (k = i - 1; k >= 0; k--) { -+ tmp = rps->delta_poc[k]; -+ if (delta_poc < tmp) { -+ rps->delta_poc[k + 1] = tmp; -+ rps->used[k + 1] = rps->used[k]; -+ rps->delta_poc[k] = delta_poc; -+ rps->used[k] = used; -+ } -+ } -+ } -+ } -+ if ((rps->num_negative_pics >> 1) != 0) { -+ int used; -+ k = rps->num_negative_pics - 1; -+ // flip the negative values to largest first -+ for (i = 0; i < rps->num_negative_pics >> 1; i++) { -+ delta_poc = rps->delta_poc[i]; -+ used = rps->used[i]; -+ rps->delta_poc[i] = rps->delta_poc[k]; -+ rps->used[i] = rps->used[k]; -+ rps->delta_poc[k] = delta_poc; -+ rps->used[k] = used; -+ k--; -+ } -+ } -+ } else { -+ unsigned int prev, nb_positive_pics; -+ rps->num_negative_pics = get_ue_golomb_long(gb); -+ nb_positive_pics = get_ue_golomb_long(gb); -+ -+ if (rps->num_negative_pics >= HEVC_MAX_REFS || -+ nb_positive_pics >= HEVC_MAX_REFS) { -+ av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics; -+ if (rps->num_delta_pocs) { -+ prev = 0; -+ for (i = 0; i < rps->num_negative_pics; i++) { -+ delta_poc = get_ue_golomb_long(gb) + 1; -+ if (delta_poc < 1 || delta_poc > 32768) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid value of delta_poc: %d\n", -+ delta_poc); -+ return AVERROR_INVALIDDATA; -+ } -+ prev -= delta_poc; -+ rps->delta_poc[i] = prev; -+ rps->used[i] = get_bits1(gb); -+ } -+ prev = 0; -+ for (i = 0; i < nb_positive_pics; i++) { -+ delta_poc = get_ue_golomb_long(gb) + 1; -+ if (delta_poc < 1 || delta_poc > 32768) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid value of delta_poc: %d\n", -+ delta_poc); -+ return 
AVERROR_INVALIDDATA; -+ } -+ prev += delta_poc; -+ rps->delta_poc[rps->num_negative_pics + i] = prev; -+ rps->used[rps->num_negative_pics + i] = get_bits1(gb); -+ } -+ } -+ } -+ return 0; -+} -+ -+ -+static int decode_profile_tier_level(GetBitContext * const gb, AVCodecContext * const avctx, -+ PTLCommon * const ptl) -+{ -+ int i; -+ -+ if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12) -+ return -1; -+ -+ ptl->profile_space = get_bits(gb, 2); -+ ptl->tier_flag = get_bits1(gb); -+ ptl->profile_idc = get_bits(gb, 5); -+ if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN) -+ av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n"); -+ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_10) -+ av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n"); -+ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE) -+ av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n"); -+ else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT) -+ av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n"); -+ else -+ av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc); -+ -+ for (i = 0; i < 32; i++) { -+ ptl->profile_compatibility_flag[i] = get_bits1(gb); -+ -+ if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i]) -+ ptl->profile_idc = i; -+ } -+ ptl->progressive_source_flag = get_bits1(gb); -+ ptl->interlaced_source_flag = get_bits1(gb); -+ ptl->non_packed_constraint_flag = get_bits1(gb); -+ ptl->frame_only_constraint_flag = get_bits1(gb); -+ -+ skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15] -+ skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31] -+ skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43] -+ -+ return 0; -+} -+ -+static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx, -+ PTL * const ptl, const int max_num_sub_layers) -+{ -+ int i; -+ if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 || -+ get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) { -+ 
av_log(avctx, AV_LOG_ERROR, "PTL information too short\n"); -+ return -1; -+ } -+ -+ ptl->general_ptl.level_idc = get_bits(gb, 8); -+ -+ for (i = 0; i < max_num_sub_layers - 1; i++) { -+ ptl->sub_layer_profile_present_flag[i] = get_bits1(gb); -+ ptl->sub_layer_level_present_flag[i] = get_bits1(gb); -+ } -+ -+ if (max_num_sub_layers - 1> 0) -+ for (i = max_num_sub_layers - 1; i < 8; i++) -+ skip_bits(gb, 2); // reserved_zero_2bits[i] -+ for (i = 0; i < max_num_sub_layers - 1; i++) { -+ if (ptl->sub_layer_profile_present_flag[i] && -+ decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) { -+ av_log(avctx, AV_LOG_ERROR, -+ "PTL information for sublayer %i too short\n", i); -+ return -1; -+ } -+ if (ptl->sub_layer_level_present_flag[i]) { -+ if (get_bits_left(gb) < 8) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Not enough data for sublayer %i level_idc\n", i); -+ return -1; -+ } else -+ ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8); -+ } -+ } -+ -+ return 0; -+} -+ -+static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb, -+ const int subpic_params_present) -+{ -+ int i; -+ -+ for (i = 0; i < nb_cpb; i++) { -+ get_ue_golomb_long(gb); // bit_rate_value_minus1 -+ get_ue_golomb_long(gb); // cpb_size_value_minus1 -+ -+ if (subpic_params_present) { -+ get_ue_golomb_long(gb); // cpb_size_du_value_minus1 -+ get_ue_golomb_long(gb); // bit_rate_du_value_minus1 -+ } -+ skip_bits1(gb); // cbr_flag -+ } -+} -+ -+static int decode_hrd(GetBitContext * const gb, const int common_inf_present, -+ const int max_sublayers) -+{ -+ int nal_params_present = 0, vcl_params_present = 0; -+ int subpic_params_present = 0; -+ int i; -+ -+ if (common_inf_present) { -+ nal_params_present = get_bits1(gb); -+ vcl_params_present = get_bits1(gb); -+ -+ if (nal_params_present || vcl_params_present) { -+ subpic_params_present = get_bits1(gb); -+ -+ if (subpic_params_present) { -+ skip_bits(gb, 8); // tick_divisor_minus2 -+ skip_bits(gb, 5); // 
du_cpb_removal_delay_increment_length_minus1 -+ skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag -+ skip_bits(gb, 5); // dpb_output_delay_du_length_minus1 -+ } -+ -+ skip_bits(gb, 4); // bit_rate_scale -+ skip_bits(gb, 4); // cpb_size_scale -+ -+ if (subpic_params_present) -+ skip_bits(gb, 4); // cpb_size_du_scale -+ -+ skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1 -+ skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1 -+ skip_bits(gb, 5); // dpb_output_delay_length_minus1 -+ } -+ } -+ -+ for (i = 0; i < max_sublayers; i++) { -+ int low_delay = 0; -+ unsigned int nb_cpb = 1; -+ int fixed_rate = get_bits1(gb); -+ -+ if (!fixed_rate) -+ fixed_rate = get_bits1(gb); -+ -+ if (fixed_rate) -+ get_ue_golomb_long(gb); // elemental_duration_in_tc_minus1 -+ else -+ low_delay = get_bits1(gb); -+ -+ if (!low_delay) { -+ nb_cpb = get_ue_golomb_long(gb) + 1; -+ if (nb_cpb < 1 || nb_cpb > 32) { -+ av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ if (nal_params_present) -+ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); -+ if (vcl_params_present) -+ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); -+ } -+ return 0; -+} -+ -+int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx, -+ HEVCRpiParamSets * const ps) -+{ -+ int i,j; -+ int vps_id = 0; -+ ptrdiff_t nal_size; -+ HEVCRpiVPS *vps; -+ AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps)); -+ -+ if (!vps_buf) -+ return AVERROR(ENOMEM); -+ vps = (HEVCRpiVPS*)vps_buf->data; -+ -+ av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n"); -+ -+ nal_size = gb->buffer_end - gb->buffer; -+ if (nal_size > sizeof(vps->data)) { -+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS " -+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", -+ nal_size, sizeof(vps->data)); -+ vps->data_size = sizeof(vps->data); -+ } else { -+ vps->data_size = nal_size; -+ } -+ memcpy(vps->data, gb->buffer, vps->data_size); -+ 
-+ vps_id = get_bits(gb, 4); -+ if (vps_id >= HEVC_MAX_VPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id); -+ goto err; -+ } -+ -+ if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits -+ av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n"); -+ goto err; -+ } -+ -+ vps->vps_max_layers = get_bits(gb, 6) + 1; -+ vps->vps_max_sub_layers = get_bits(gb, 3) + 1; -+ vps->vps_temporal_id_nesting_flag = get_bits1(gb); -+ -+ if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits -+ av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n"); -+ goto err; -+ } -+ -+ if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) { -+ av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n", -+ vps->vps_max_sub_layers); -+ goto err; -+ } -+ -+ if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0) -+ goto err; -+ -+ vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb); -+ -+ i = vps->vps_sub_layer_ordering_info_present_flag ? 
0 : vps->vps_max_sub_layers - 1; -+ for (; i < vps->vps_max_sub_layers; i++) { -+ vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1; -+ vps->vps_num_reorder_pics[i] = get_ue_golomb_long(gb); -+ vps->vps_max_latency_increase[i] = get_ue_golomb_long(gb) - 1; -+ -+ if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) { -+ av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n", -+ vps->vps_max_dec_pic_buffering[i] - 1); -+ goto err; -+ } -+ if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) { -+ av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n", -+ vps->vps_num_reorder_pics[i]); -+ if (avctx->err_recognition & AV_EF_EXPLODE) -+ goto err; -+ } -+ } -+ -+ vps->vps_max_layer_id = get_bits(gb, 6); -+ vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1; -+ if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 || -+ (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) { -+ av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n"); -+ goto err; -+ } -+ -+ for (i = 1; i < vps->vps_num_layer_sets; i++) -+ for (j = 0; j <= vps->vps_max_layer_id; j++) -+ skip_bits(gb, 1); // layer_id_included_flag[i][j] -+ -+ vps->vps_timing_info_present_flag = get_bits1(gb); -+ if (vps->vps_timing_info_present_flag) { -+ vps->vps_num_units_in_tick = get_bits_long(gb, 32); -+ vps->vps_time_scale = get_bits_long(gb, 32); -+ vps->vps_poc_proportional_to_timing_flag = get_bits1(gb); -+ if (vps->vps_poc_proportional_to_timing_flag) -+ vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1; -+ vps->vps_num_hrd_parameters = get_ue_golomb_long(gb); -+ if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) { -+ av_log(avctx, AV_LOG_ERROR, -+ "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters); -+ goto err; -+ } -+ for (i = 0; i < vps->vps_num_hrd_parameters; i++) { -+ int 
common_inf_present = 1; -+ -+ get_ue_golomb_long(gb); // hrd_layer_set_idx -+ if (i) -+ common_inf_present = get_bits1(gb); -+ decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers); -+ } -+ } -+ get_bits1(gb); /* vps_extension_flag */ -+ -+ if (get_bits_left(gb) < 0) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Overread VPS by %d bits\n", -get_bits_left(gb)); -+ if (ps->vps_list[vps_id]) -+ goto err; -+ } -+ -+ if (ps->vps_list[vps_id] && -+ !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) { -+ av_buffer_unref(&vps_buf); -+ } else { -+ remove_vps(ps, vps_id); -+ ps->vps_list[vps_id] = vps_buf; -+ } -+ -+ return 0; -+ -+err: -+ av_buffer_unref(&vps_buf); -+ return AVERROR_INVALIDDATA; -+} -+ -+static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx, -+ const int apply_defdispwin, HEVCRpiSPS * const sps) -+{ -+ VUI backup_vui, * const vui = &sps->vui; -+ GetBitContext backup; -+ int sar_present, alt = 0; -+ -+ av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n"); -+ -+ sar_present = get_bits1(gb); -+ if (sar_present) { -+ uint8_t sar_idx = get_bits(gb, 8); -+ if (sar_idx < FF_ARRAY_ELEMS(vui_sar)) -+ vui->sar = vui_sar[sar_idx]; -+ else if (sar_idx == 255) { -+ vui->sar.num = get_bits(gb, 16); -+ vui->sar.den = get_bits(gb, 16); -+ } else -+ av_log(avctx, AV_LOG_WARNING, -+ "Unknown SAR index: %u.\n", sar_idx); -+ } -+ -+ vui->overscan_info_present_flag = get_bits1(gb); -+ if (vui->overscan_info_present_flag) -+ vui->overscan_appropriate_flag = get_bits1(gb); -+ -+ vui->video_signal_type_present_flag = get_bits1(gb); -+ if (vui->video_signal_type_present_flag) { -+ vui->video_format = get_bits(gb, 3); -+ vui->video_full_range_flag = get_bits1(gb); -+ vui->colour_description_present_flag = get_bits1(gb); -+ if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P) -+ sps->pix_fmt = AV_PIX_FMT_YUVJ420P; -+ if (vui->colour_description_present_flag) { -+ vui->colour_primaries = get_bits(gb, 8); -+ vui->transfer_characteristic = 
get_bits(gb, 8); -+ vui->matrix_coeffs = get_bits(gb, 8); -+ -+ // Set invalid values to "unspecified" -+ if (!av_color_primaries_name(vui->colour_primaries)) -+ vui->colour_primaries = AVCOL_PRI_UNSPECIFIED; -+ if (!av_color_transfer_name(vui->transfer_characteristic)) -+ vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED; -+ if (!av_color_space_name(vui->matrix_coeffs)) -+ vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED; -+ if (vui->matrix_coeffs == AVCOL_SPC_RGB) { -+ switch (sps->pix_fmt) { -+ case AV_PIX_FMT_YUV444P: -+ sps->pix_fmt = AV_PIX_FMT_GBRP; -+ break; -+ case AV_PIX_FMT_YUV444P10: -+ sps->pix_fmt = AV_PIX_FMT_GBRP10; -+ break; -+ case AV_PIX_FMT_YUV444P12: -+ sps->pix_fmt = AV_PIX_FMT_GBRP12; -+ break; -+ } -+ } -+ } -+ } -+ -+ vui->chroma_loc_info_present_flag = get_bits1(gb); -+ if (vui->chroma_loc_info_present_flag) { -+ vui->chroma_sample_loc_type_top_field = get_ue_golomb_long(gb); -+ vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb); -+ } -+ -+ vui->neutra_chroma_indication_flag = get_bits1(gb); -+ vui->field_seq_flag = get_bits1(gb); -+ vui->frame_field_info_present_flag = get_bits1(gb); -+ -+ // Backup context in case an alternate header is detected -+ memcpy(&backup, gb, sizeof(backup)); -+ memcpy(&backup_vui, vui, sizeof(backup_vui)); -+ if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) { -+ vui->default_display_window_flag = 0; -+ av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n"); -+ } else -+ vui->default_display_window_flag = get_bits1(gb); -+ -+ if (vui->default_display_window_flag) { -+ int vert_mult = 1 + (sps->chroma_format_idc < 2); -+ int horiz_mult = 1 + (sps->chroma_format_idc < 3); -+ vui->def_disp_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; -+ vui->def_disp_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; -+ vui->def_disp_win.top_offset = get_ue_golomb_long(gb) * vert_mult; -+ vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; -+ -+ if 
(apply_defdispwin && -+ avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { -+ av_log(avctx, AV_LOG_DEBUG, -+ "discarding vui default display window, " -+ "original values are l:%u r:%u t:%u b:%u\n", -+ vui->def_disp_win.left_offset, -+ vui->def_disp_win.right_offset, -+ vui->def_disp_win.top_offset, -+ vui->def_disp_win.bottom_offset); -+ -+ vui->def_disp_win.left_offset = -+ vui->def_disp_win.right_offset = -+ vui->def_disp_win.top_offset = -+ vui->def_disp_win.bottom_offset = 0; -+ } -+ } -+ -+timing_info: -+ vui->vui_timing_info_present_flag = get_bits1(gb); -+ -+ if (vui->vui_timing_info_present_flag) { -+ if( get_bits_left(gb) < 66 && !alt) { -+ // The alternate syntax seem to have timing info located -+ // at where def_disp_win is normally located -+ av_log(avctx, AV_LOG_WARNING, -+ "Strange VUI timing information, retrying...\n"); -+ memcpy(vui, &backup_vui, sizeof(backup_vui)); -+ memcpy(gb, &backup, sizeof(backup)); -+ alt = 1; -+ goto timing_info; -+ } -+ vui->vui_num_units_in_tick = get_bits_long(gb, 32); -+ vui->vui_time_scale = get_bits_long(gb, 32); -+ if (alt) { -+ av_log(avctx, AV_LOG_INFO, "Retry got %"PRIu32"/%"PRIu32" fps\n", -+ vui->vui_time_scale, vui->vui_num_units_in_tick); -+ } -+ vui->vui_poc_proportional_to_timing_flag = get_bits1(gb); -+ if (vui->vui_poc_proportional_to_timing_flag) -+ vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb); -+ vui->vui_hrd_parameters_present_flag = get_bits1(gb); -+ if (vui->vui_hrd_parameters_present_flag) -+ decode_hrd(gb, 1, sps->max_sub_layers); -+ } -+ -+ vui->bitstream_restriction_flag = get_bits1(gb); -+ if (vui->bitstream_restriction_flag) { -+ if (get_bits_left(gb) < 8 && !alt) { -+ av_log(avctx, AV_LOG_WARNING, -+ "Strange VUI bitstream restriction information, retrying" -+ " from timing information...\n"); -+ memcpy(vui, &backup_vui, sizeof(backup_vui)); -+ memcpy(gb, &backup, sizeof(backup)); -+ alt = 1; -+ goto timing_info; -+ } -+ vui->tiles_fixed_structure_flag = get_bits1(gb); -+ 
vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb); -+ vui->restricted_ref_pic_lists_flag = get_bits1(gb); -+ vui->min_spatial_segmentation_idc = get_ue_golomb_long(gb); -+ vui->max_bytes_per_pic_denom = get_ue_golomb_long(gb); -+ vui->max_bits_per_min_cu_denom = get_ue_golomb_long(gb); -+ vui->log2_max_mv_length_horizontal = get_ue_golomb_long(gb); -+ vui->log2_max_mv_length_vertical = get_ue_golomb_long(gb); -+ } -+ -+ if (get_bits_left(gb) < 1 && !alt) { -+ // XXX: Alternate syntax when sps_range_extension_flag != 0? -+ av_log(avctx, AV_LOG_WARNING, -+ "Overread in VUI, retrying from timing information...\n"); -+ memcpy(vui, &backup_vui, sizeof(backup_vui)); -+ memcpy(gb, &backup, sizeof(backup)); -+ alt = 1; -+ goto timing_info; -+ } -+} -+ -+static void set_default_scaling_list_data(ScalingList * const sl) -+{ -+ int matrixId; -+ -+ for (matrixId = 0; matrixId < 6; matrixId++) { -+ // 4x4 default is 16 -+ memset(sl->sl[0][matrixId], 16, 16); -+ sl->sl_dc[0][matrixId] = 16; // default for 16x16 -+ sl->sl_dc[1][matrixId] = 16; // default for 32x32 -+ } -+ -+ memcpy(sl->sl[1][0], default_scaling_list_intra, 64); -+ memcpy(sl->sl[1][1], default_scaling_list_intra, 64); -+ memcpy(sl->sl[1][2], default_scaling_list_intra, 64); -+ -+ memcpy(sl->sl[1][3], default_scaling_list_inter, 64); -+ memcpy(sl->sl[1][4], default_scaling_list_inter, 64); -+ memcpy(sl->sl[1][5], default_scaling_list_inter, 64); -+ -+ memcpy(sl->sl[2][0], default_scaling_list_intra, 64); -+ memcpy(sl->sl[2][1], default_scaling_list_intra, 64); -+ memcpy(sl->sl[2][2], default_scaling_list_intra, 64); -+ -+ memcpy(sl->sl[2][3], default_scaling_list_inter, 64); -+ memcpy(sl->sl[2][4], default_scaling_list_inter, 64); -+ memcpy(sl->sl[2][5], default_scaling_list_inter, 64); -+ -+ memcpy(sl->sl[3][0], default_scaling_list_intra, 64); -+ memcpy(sl->sl[3][1], default_scaling_list_intra, 64); -+ memcpy(sl->sl[3][2], default_scaling_list_intra, 64); -+ -+ memcpy(sl->sl[3][3], 
default_scaling_list_inter, 64); -+ memcpy(sl->sl[3][4], default_scaling_list_inter, 64); -+ memcpy(sl->sl[3][5], default_scaling_list_inter, 64); -+} -+ -+static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl, -+ const HEVCRpiSPS * const sps) -+{ -+ uint8_t scaling_list_pred_mode_flag; -+ int32_t scaling_list_dc_coef[2][6]; -+ int size_id, matrix_id, pos; -+ int i; -+ -+ for (size_id = 0; size_id < 4; size_id++) -+ for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) { -+ scaling_list_pred_mode_flag = get_bits1(gb); -+ if (!scaling_list_pred_mode_flag) { -+ unsigned int delta = get_ue_golomb_long(gb); -+ /* Only need to handle non-zero delta. Zero means default, -+ * which should already be in the arrays. */ -+ if (delta) { -+ // Copy from previous array. -+ delta *= (size_id == 3) ? 3 : 1; -+ if (matrix_id < delta) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid delta in scaling list data: %d.\n", delta); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ memcpy(sl->sl[size_id][matrix_id], -+ sl->sl[size_id][matrix_id - delta], -+ size_id > 0 ? 
64 : 16); -+ if (size_id > 1) -+ sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta]; -+ } -+ } else { -+ int next_coef, coef_num; -+ int32_t scaling_list_delta_coef; -+ -+ next_coef = 8; -+ coef_num = FFMIN(64, 1 << (4 + (size_id << 1))); -+ if (size_id > 1) { -+ scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8; -+ next_coef = scaling_list_dc_coef[size_id - 2][matrix_id]; -+ sl->sl_dc[size_id - 2][matrix_id] = next_coef; -+ } -+ for (i = 0; i < coef_num; i++) { -+ if (size_id == 0) -+ pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] + -+ ff_hevc_rpi_diag_scan4x4_x[i]; -+ else -+ pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] + -+ ff_hevc_rpi_diag_scan8x8_x[i]; -+ -+ scaling_list_delta_coef = get_se_golomb(gb); -+ next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256; -+ sl->sl[size_id][matrix_id][pos] = next_coef; -+ } -+ } -+ } -+ -+ if (sps->chroma_format_idc == 3) { -+ for (i = 0; i < 64; i++) { -+ sl->sl[3][1][i] = sl->sl[2][1][i]; -+ sl->sl[3][2][i] = sl->sl[2][2][i]; -+ sl->sl[3][4][i] = sl->sl[2][4][i]; -+ sl->sl[3][5][i] = sl->sl[2][5][i]; -+ } -+ sl->sl_dc[1][1] = sl->sl_dc[0][1]; -+ sl->sl_dc[1][2] = sl->sl_dc[0][2]; -+ sl->sl_dc[1][4] = sl->sl_dc[0][4]; -+ sl->sl_dc[1][5] = sl->sl_dc[0][5]; -+ } -+ -+ -+ return 0; -+} -+ -+static int map_pixel_format(HEVCRpiSPS * const sps) -+{ -+ const int cfmt = sps->chroma_format_idc; -+ -+ sps->pix_fmt = AV_PIX_FMT_NONE; -+ switch (sps->bit_depth) { -+ case 8: -+ if (cfmt == 1) -+ sps->pix_fmt = AV_PIX_FMT_SAND128; -+ break; -+ case 10: -+ if (cfmt == 1) -+ sps->pix_fmt = AV_PIX_FMT_SAND64_10; -+ break; -+ default: -+ break; -+ } -+ -+ sps->hshift[0] = sps->vshift[0] = 0; -+ sps->hshift[2] = sps->hshift[1] = cfmt > 2 ? 0 : 1; // 1 unless 4:4:4 -+ sps->vshift[2] = sps->vshift[1] = cfmt > 1 ? 0 : 1; // 1 unless 4:4:4 or 4:2:2 -+ -+ sps->pixel_shift = sps->bit_depth > 8 ? 
1 : 0; -+ -+ return 0; -+} -+ -+static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id, -+ const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx) -+{ -+ HEVCRpiWindow *ow; -+ int ret = 0; -+ int log2_diff_max_min_transform_block_size; -+ int bit_depth_chroma, start, vui_present, sublayer_ordering_info; -+ int i; -+ -+ // Coded parameters -+ -+ sps->vps_id = get_bits(gb, 4); -+ if (sps->vps_id >= HEVC_MAX_VPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (vps_list && !vps_list[sps->vps_id]) { -+ av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n", -+ sps->vps_id); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sps->max_sub_layers = get_bits(gb, 3) + 1; -+ if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) { -+ av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n", -+ sps->max_sub_layers); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sps->temporal_id_nesting_flag = get_bits(gb, 1); -+ -+ if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0) -+ return ret; -+ -+ *sps_id = get_ue_golomb_long(gb); -+ if (*sps_id >= HEVC_MAX_SPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sps->chroma_format_idc = get_ue_golomb_long(gb); -+ if (sps->chroma_format_idc > 3U) { -+ av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->chroma_format_idc == 3) -+ sps->separate_colour_plane_flag = get_bits1(gb); -+ -+ if (sps->separate_colour_plane_flag) -+ sps->chroma_format_idc = 0; -+ -+ sps->width = get_ue_golomb_long(gb); -+ sps->height = get_ue_golomb_long(gb); -+ if ((ret = av_image_check_size(sps->width, -+ sps->height, 0, avctx)) < 0) -+ return ret; -+ -+ if (get_bits1(gb)) { // pic_conformance_flag -+ int vert_mult = 1 + 
(sps->chroma_format_idc < 2); -+ int horiz_mult = 1 + (sps->chroma_format_idc < 3); -+ sps->pic_conf_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; -+ sps->pic_conf_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; -+ sps->pic_conf_win.top_offset = get_ue_golomb_long(gb) * vert_mult; -+ sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; -+ -+ if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { -+ av_log(avctx, AV_LOG_DEBUG, -+ "discarding sps conformance window, " -+ "original values are l:%u r:%u t:%u b:%u\n", -+ sps->pic_conf_win.left_offset, -+ sps->pic_conf_win.right_offset, -+ sps->pic_conf_win.top_offset, -+ sps->pic_conf_win.bottom_offset); -+ -+ sps->pic_conf_win.left_offset = -+ sps->pic_conf_win.right_offset = -+ sps->pic_conf_win.top_offset = -+ sps->pic_conf_win.bottom_offset = 0; -+ } -+ sps->output_window = sps->pic_conf_win; -+ } -+ -+ sps->bit_depth = get_ue_golomb_long(gb) + 8; -+ bit_depth_chroma = get_ue_golomb_long(gb) + 8; -+ if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Luma bit depth (%d) is different from chroma bit depth (%d), " -+ "this is unsupported.\n", -+ sps->bit_depth, bit_depth_chroma); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ ret = map_pixel_format(sps); -+ if (ret < 0) -+ return ret; -+ -+ sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4; -+ if (sps->log2_max_poc_lsb > 16) { -+ av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n", -+ sps->log2_max_poc_lsb - 4); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sublayer_ordering_info = get_bits1(gb); -+ start = sublayer_ordering_info ? 
0 : sps->max_sub_layers - 1; -+ for (i = start; i < sps->max_sub_layers; i++) { -+ sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1; -+ sps->temporal_layer[i].num_reorder_pics = get_ue_golomb_long(gb); -+ sps->temporal_layer[i].max_latency_increase = get_ue_golomb_long(gb) - 1; -+ if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) { -+ av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n", -+ sps->temporal_layer[i].max_dec_pic_buffering - 1U); -+ return AVERROR_INVALIDDATA; -+ } -+ if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) { -+ av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n", -+ sps->temporal_layer[i].num_reorder_pics); -+ if (avctx->err_recognition & AV_EF_EXPLODE || -+ sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) { -+ return AVERROR_INVALIDDATA; -+ } -+ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1; -+ } -+ } -+ -+ if (!sublayer_ordering_info) { -+ for (i = 0; i < start; i++) { -+ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering; -+ sps->temporal_layer[i].num_reorder_pics = sps->temporal_layer[start].num_reorder_pics; -+ sps->temporal_layer[i].max_latency_increase = sps->temporal_layer[start].max_latency_increase; -+ } -+ } -+ -+ sps->log2_min_cb_size = get_ue_golomb_long(gb) + 3; -+ sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb); -+ sps->log2_min_tb_size = get_ue_golomb_long(gb) + 2; -+ log2_diff_max_min_transform_block_size = get_ue_golomb_long(gb); -+ sps->log2_max_trafo_size = log2_diff_max_min_transform_block_size + -+ sps->log2_min_tb_size; -+ -+ if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size", sps->log2_min_cb_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if 
(sps->log2_diff_max_min_coding_block_size > 30) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size", sps->log2_diff_max_min_coding_block_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size", log2_diff_max_min_transform_block_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ { -+ const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size; -+ // Not a bitstream limitation, but all profiles -+ if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for CtbLog2SizeY", CtbLog2SizeY); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY", sps->log2_max_trafo_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ // Inferred parameters -+ sps->log2_ctb_size = CtbLog2SizeY; -+// sps->log2_min_pu_size = sps->log2_min_cb_size - 1; -+ } -+ -+ sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb); -+ sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb); -+ -+ sps->scaling_list_enable_flag = get_bits1(gb); -+ if (sps->scaling_list_enable_flag) { -+ set_default_scaling_list_data(&sps->scaling_list); -+ -+ if (get_bits1(gb)) { -+ ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps); -+ if (ret < 0) -+ return ret; -+ } -+ } -+ -+ sps->amp_enabled_flag = get_bits1(gb); -+ sps->sao_enabled = get_bits1(gb); -+ -+ // Set pcm defaults (0) so we don't have to test _enabled when we -+ // want to use them -+ memset(&sps->pcm, 0, 
sizeof(sps->pcm)); -+ -+ if (get_bits1(gb)) // pcm_enabled_flag -+ { -+ const unsigned int limit_max_pcm = FFMIN(5, -+ sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size); -+ sps->pcm.bit_depth = get_bits(gb, 4) + 1; -+ sps->pcm.bit_depth_chroma = get_bits(gb, 4) + 1; -+ sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3; -+ sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size + -+ get_ue_golomb_long(gb); -+ if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) { -+ av_log(avctx, AV_LOG_ERROR, -+ "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n", -+ sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth); -+ return AVERROR_INVALIDDATA; -+ } -+ if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size || -+ sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) { -+ av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)", -+ sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sps->pcm.loop_filter_disable_flag = get_bits1(gb); -+ } -+ -+ // Could be based on min_pcm_cb_size but much easier logic if we just stick -+ // with 8 (and costs us little) -+ sps->pcm_width = (sps->width + 63) >> 6; // 8 for min size, 8 bits per byte - round up -+ sps->pcm_height = (sps->height + 7) >> 3; -+ -+ sps->nb_st_rps = get_ue_golomb_long(gb); -+ if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) { -+ av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n", -+ sps->nb_st_rps); -+ return AVERROR_INVALIDDATA; -+ } -+ for (i = 0; i < sps->nb_st_rps; i++) { -+ if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i], -+ sps, 0)) < 0) -+ return ret; -+ } -+ -+ sps->long_term_ref_pics_present_flag = get_bits1(gb); -+ if (sps->long_term_ref_pics_present_flag) { -+ sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb); -+ if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) { -+ av_log(avctx, AV_LOG_ERROR, 
"num_long_term_ref_pics_sps %d is out of range.\n", -+ sps->num_long_term_ref_pics_sps); -+ return AVERROR_INVALIDDATA; -+ } -+ for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) { -+ sps->lt_ref_pic_poc_lsb_sps[i] = get_bits(gb, sps->log2_max_poc_lsb); -+ sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb); -+ } -+ } -+ -+ sps->sps_temporal_mvp_enabled_flag = get_bits1(gb); -+ sps->intra_filters_disable = get_bits1(gb) ? 0 : FILTER_STRONG; // sps->sps_strong_intra_smoothing_enable_flag -+ sps->vui.sar = (AVRational){0, 1}; -+ vui_present = get_bits1(gb); -+ if (vui_present) -+ decode_vui(gb, avctx, apply_defdispwin, sps); -+ -+ if (get_bits1(gb)) { // sps_extension_flag -+ int sps_extension_flag[1]; -+ for (i = 0; i < 1; i++) -+ sps_extension_flag[i] = get_bits1(gb); -+ skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); -+ if (sps_extension_flag[0]) { -+ int extended_precision_processing_flag; -+ int cabac_bypass_alignment_enabled_flag; -+ -+ sps->transform_skip_rotation_enabled_flag = get_bits1(gb); -+ sps->transform_skip_context_enabled_flag = get_bits1(gb); -+ sps->implicit_rdpcm_enabled_flag = get_bits1(gb); -+ -+ sps->explicit_rdpcm_enabled_flag = get_bits1(gb); -+ -+ extended_precision_processing_flag = get_bits1(gb); -+ if (extended_precision_processing_flag) -+ av_log(avctx, AV_LOG_WARNING, -+ "extended_precision_processing_flag not yet implemented\n"); -+ -+ if (get_bits1(gb)) // sps->intra_smoothing_disabled_flag -+ sps->intra_filters_disable |= FILTER_EITHER; -+ sps->high_precision_offsets_enabled_flag = get_bits1(gb); -+ sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); -+ -+ cabac_bypass_alignment_enabled_flag = get_bits1(gb); -+ if (cabac_bypass_alignment_enabled_flag) -+ av_log(avctx, AV_LOG_WARNING, -+ "cabac_bypass_alignment_enabled_flag not yet implemented\n"); -+ } -+ } -+ if (apply_defdispwin) { -+ sps->output_window.left_offset += sps->vui.def_disp_win.left_offset; -+ sps->output_window.right_offset += 
sps->vui.def_disp_win.right_offset; -+ sps->output_window.top_offset += sps->vui.def_disp_win.top_offset; -+ sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset; -+ } -+ -+ ow = &sps->output_window; -+ if (ow->left_offset >= INT_MAX - ow->right_offset || -+ ow->top_offset >= INT_MAX - ow->bottom_offset || -+ ow->left_offset + ow->right_offset >= sps->width || -+ ow->top_offset + ow->bottom_offset >= sps->height) { -+ av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n", -+ ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset); -+ if (avctx->err_recognition & AV_EF_EXPLODE) { -+ return AVERROR_INVALIDDATA; -+ } -+ av_log(avctx, AV_LOG_WARNING, -+ "Displaying the whole video surface.\n"); -+ memset(ow, 0, sizeof(*ow)); -+ memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win)); -+ } -+ -+ // Inferred parameters -+ -+ sps->ctb_width = (sps->width + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; -+ sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; -+ sps->ctb_size = sps->ctb_width * sps->ctb_height; -+ -+ sps->min_cb_width = sps->width >> sps->log2_min_cb_size; -+ sps->min_cb_height = sps->height >> sps->log2_min_cb_size; -+ sps->min_tb_width = sps->width >> sps->log2_min_tb_size; -+ sps->min_tb_height = sps->height >> sps->log2_min_tb_size; -+ sps->min_pu_width = sps->width >> LOG2_MIN_PU_SIZE; -+ sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE; -+ sps->tb_mask = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1; -+ -+ sps->qp_bd_offset = 6 * (sps->bit_depth - 8); -+ sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? 
sps->bit_depth - 1 : 7)); -+ -+ if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) || -+ av_mod_uintp2(sps->height, sps->log2_min_cb_size)) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) { -+ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n", -+ sps->max_transform_hierarchy_depth_inter); -+ return AVERROR_INVALIDDATA; -+ } -+ if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) { -+ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n", -+ sps->max_transform_hierarchy_depth_intra); -+ return AVERROR_INVALIDDATA; -+ } -+ if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) { -+ av_log(avctx, AV_LOG_ERROR, -+ "max transform block size out of range: %d\n", -+ sps->log2_max_trafo_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (get_bits_left(gb) < 0) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Overread SPS by %d bits\n", -get_bits_left(gb)); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ return 0; -+} -+ -+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, -+ HEVCRpiParamSets *ps, int apply_defdispwin) -+{ -+ HEVCRpiSPS *sps; -+ AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps)); -+ unsigned int sps_id; -+ int ret; -+ ptrdiff_t nal_size; -+ -+ if (!sps_buf) -+ return AVERROR(ENOMEM); -+ sps = (HEVCRpiSPS*)sps_buf->data; -+ -+ av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n"); -+ -+ nal_size = gb->buffer_end - gb->buffer; -+ if (nal_size > sizeof(sps->data)) { -+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS " -+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", -+ nal_size, sizeof(sps->data)); -+ sps->data_size = sizeof(sps->data); -+ } else { -+ sps->data_size = nal_size; -+ } -+ memcpy(sps->data, gb->buffer, sps->data_size); -+ -+ ret = ff_hevc_rpi_parse_sps(sps, gb, 
&sps_id, -+ apply_defdispwin, -+ ps->vps_list, avctx); -+ if (ret < 0) { -+ av_buffer_unref(&sps_buf); -+ return ret; -+ } -+ -+ if (avctx->debug & FF_DEBUG_BITSTREAM) { -+ av_log(avctx, AV_LOG_DEBUG, -+ "Parsed SPS: id %d; coded wxh: %dx%d; " -+ "cropped wxh: %dx%d; pix_fmt: %s.\n", -+ sps_id, sps->width, sps->height, -+ sps->width - (sps->output_window.left_offset + sps->output_window.right_offset), -+ sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset), -+ av_get_pix_fmt_name(sps->pix_fmt)); -+ } -+ -+ /* check if this is a repeat of an already parsed SPS, then keep the -+ * original one. -+ * otherwise drop all PPSes that depend on it */ -+ if (ps->sps_list[sps_id] && -+ !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size)) { -+ av_buffer_unref(&sps_buf); -+ } else { -+ remove_sps(ps, sps_id); -+ ps->sps_list[sps_id] = sps_buf; -+ } -+ -+ return 0; -+} -+ -+static void hevc_pps_free(void *opaque, uint8_t *data) -+{ -+ HEVCRpiPPS *pps = (HEVCRpiPPS*)data; -+ -+ av_freep(&pps->column_width); -+ av_freep(&pps->row_height); -+ av_freep(&pps->col_bd); -+ av_freep(&pps->row_bd); -+ av_freep(&pps->col_idxX); -+ av_freep(&pps->ctb_addr_rs_to_ts); -+ av_freep(&pps->ctb_addr_ts_to_rs); -+ av_freep(&pps->tile_pos_ts); -+ av_freep(&pps->tile_size); -+ av_freep(&pps->tile_id); -+ av_freep(&pps->ctb_ts_flags); -+ -+ av_freep(&pps); -+} -+ -+static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets) -+{ -+ do -+ { -+ const int offset = get_se_golomb_long(gb); -+ if (offset < -12 || offset > 12) { -+ av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", offset); -+ return AVERROR_INVALIDDATA; -+ } -+ *offsets++ = offset; -+ } while (n_minus_1-- != 0); -+ return 0; -+} -+ -+static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx, -+ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) -+{ -+ if 
(pps->transform_skip_enabled_flag) { -+ pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2; -+ } -+ pps->cross_component_prediction_enabled_flag = get_bits1(gb); -+ if (pps->cross_component_prediction_enabled_flag && -+ (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag)) -+ { -+ av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb); -+ if (pps->chroma_qp_offset_list_enabled_flag) { -+ int err; -+ -+ pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb); -+ pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb); -+ if (pps->chroma_qp_offset_list_len_minus1 > 5) { -+ av_log(avctx, AV_LOG_ERROR, -+ "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n"); -+ -+ if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 || -+ (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0) -+ return err; -+ } -+ -+ { -+ const unsigned int max_offset = sps->bit_depth > 10 ? 
sps->bit_depth - 10 : 0; -+ -+ pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb); -+ if (pps->log2_sao_offset_scale_luma > max_offset) { -+ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid"); -+ return AVERROR_INVALIDDATA; -+ } -+ pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb); -+ if (pps->log2_sao_offset_scale_chroma > max_offset) { -+ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid"); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ return(0); -+} -+ -+static inline int setup_pps(AVCodecContext * const avctx, -+ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) -+{ -+ int pic_area_in_ctbs; -+ int i, j, x, y, ctb_addr_rs, tile_id; -+ -+ // Inferred parameters -+ -+ // qp_y -> qp_u/qp_v tables -+ // The tables have at least -24,+24 overrun after adding offset here -+ // which should allow for clipless offseting -+ -+ pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0; // No offset for luma, but may be useful for general code -+ pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0; -+ -+ if (sps->chroma_format_idc == 1) { -+ pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; -+ pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; -+ pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; -+ pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; -+ } -+ else -+ { -+ pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; -+ pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; -+ pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; -+ pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; -+ } -+ -+ pps->col_bd = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd)); -+ pps->row_bd = av_malloc_array(pps->num_tile_rows + 1, sizeof(*pps->row_bd)); -+ pps->col_idxX = av_malloc_array(sps->ctb_width, 
sizeof(*pps->col_idxX)); -+ if (!pps->col_bd || !pps->row_bd || !pps->col_idxX) -+ return AVERROR(ENOMEM); -+ -+ if (pps->uniform_spacing_flag) { -+ if (!pps->column_width) { -+ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); -+ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); -+ } -+ if (!pps->column_width || !pps->row_height) -+ return AVERROR(ENOMEM); -+ -+ for (i = 0; i < pps->num_tile_columns; i++) { -+ pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns - -+ (i * sps->ctb_width) / pps->num_tile_columns; -+ } -+ -+ for (i = 0; i < pps->num_tile_rows; i++) { -+ pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows - -+ (i * sps->ctb_height) / pps->num_tile_rows; -+ } -+ } -+ -+ { -+ const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift); -+ pps->col_bd[0] = 0; -+ pps->tile_wpp_inter_disable = 0; -+ for (i = 0; i < pps->num_tile_columns; i++) -+ { -+ pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i]; -+ -+ // Avoid trying tile parallel if the columns don't fall on cache boundries -+ // (this causes too much pain syncing flushes with the QPU) -+ // Ignore the final (RHS of pic) tile boundry -+ if ((pps->col_bd[i] & td_mask) != 0) { -+ pps->tile_wpp_inter_disable = 1; -+ } -+ } -+ -+ // If we can start the next row before finishing the first line of -+ // this one then we must wait at the end of the tile -+ // * if this happens a lot then there are better but more complicated -+ // conditions that we could apply -+ if (pps->tile_wpp_inter_disable) { -+ for (i = 0; i < pps->num_tile_rows; i++) -+ { -+ if (pps->row_height[i] <= RPI_MAX_JOBS) { -+ pps->tile_wpp_inter_disable = 2; -+ break; -+ } -+ } -+ } -+ } -+ -+ pps->row_bd[0] = 0; -+ for (i = 0; i < pps->num_tile_rows; i++) -+ pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i]; -+ -+ for (i = 0, j = 0; i < sps->ctb_width; i++) { -+ if (i >= pps->col_bd[j + 1]) -+ j++; -+ 
pps->col_idxX[i] = j; -+ } -+ -+ /** -+ * 6.5 -+ */ -+ pic_area_in_ctbs = sps->ctb_size; -+ -+ pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts)); -+ pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs)); -+ pps->tile_id = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_id)); -+ pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size)); -+ pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts)); -+ pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags)); -+ if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs || -+ !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) { -+ return AVERROR(ENOMEM); -+ } -+ -+ memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags)); -+ -+ for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) { -+ int tb_x = ctb_addr_rs % sps->ctb_width; -+ int tb_y = ctb_addr_rs / sps->ctb_width; -+ int tile_x = 0; -+ int tile_y = 0; -+ int val = 0; -+ -+ for (i = 0; i < pps->num_tile_columns; i++) { -+ if (tb_x < pps->col_bd[i + 1]) { -+ tile_x = i; -+ break; -+ } -+ } -+ -+ for (i = 0; i < pps->num_tile_rows; i++) { -+ if (tb_y < pps->row_bd[i + 1]) { -+ tile_y = i; -+ break; -+ } -+ } -+ -+ for (i = 0; i < tile_x; i++) -+ val += pps->row_height[tile_y] * pps->column_width[i]; -+ for (i = 0; i < tile_y; i++) -+ val += sps->ctb_width * pps->row_height[i]; -+ -+ val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] + -+ tb_x - pps->col_bd[tile_x]; -+ -+ pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val; -+ pps->ctb_addr_ts_to_rs[val] = ctb_addr_rs; -+ } -+ -+ { -+ uint8_t * pflags = pps->ctb_ts_flags; -+ uint16_t * ptid = pps->tile_id; -+ -+ for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++) -+ { -+ for (i = 0; i < pps->num_tile_columns; i++, tile_id++) -+ { -+ const unsigned int tile_w = 
pps->column_width[i]; -+ -+ pflags[0] |= CTB_TS_FLAGS_CIREQ; -+ -+ for (x = 0; x != tile_w; ++x) { -+ pflags[x] |= CTB_TS_FLAGS_TOT; -+ } -+ -+ for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++) -+ { -+ pflags[0] |= CTB_TS_FLAGS_SOTL; -+ -+ if (pps->entropy_coding_sync_enabled_flag) -+ { -+ if (pps->column_width[i] != 1) -+ pflags[1] |= CTB_TS_FLAGS_CSAVE; -+ else -+ pflags[0] |= CTB_TS_FLAGS_CIREQ; -+ -+ if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0) -+ pflags[0] |= CTB_TS_FLAGS_CLOAD; -+ } -+ -+ for (x = 0; x != tile_w; ++x) -+ *ptid++ = tile_id; -+ -+ pflags += tile_w; -+ pflags[-1] |= CTB_TS_FLAGS_EOTL; -+ if (i + 1 == pps->num_tile_columns) -+ pflags[-1] |= CTB_TS_FLAGS_EOL; -+ } -+ -+ pflags[-1] |= CTB_TS_FLAGS_EOT; -+ } -+ } -+ } -+ -+ { -+ unsigned int ts = 0; -+ for (j = 0; j < pps->num_tile_rows; j++) -+ for (i = 0; i < pps->num_tile_columns; i++) -+ { -+ const unsigned int size = pps->column_width[i] * pps->row_height[j]; -+ pps->tile_size[j * pps->num_tile_columns + i] = size; -+ pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts; -+ ts += size; -+ } -+ } -+ -+ return 0; -+} -+ -+int ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx, -+ HEVCRpiParamSets * const ps) -+{ -+ const HEVCRpiSPS *sps = NULL; -+ int i, ret = 0; -+ unsigned int pps_id = 0; -+ ptrdiff_t nal_size; -+ unsigned log2_parallel_merge_level_minus2; -+ -+ AVBufferRef *pps_buf; -+ HEVCRpiPPS *pps = av_mallocz(sizeof(*pps)); -+ -+ if (!pps) -+ return AVERROR(ENOMEM); -+ -+ pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps), -+ hevc_pps_free, NULL, 0); -+ if (!pps_buf) { -+ av_freep(&pps); -+ return AVERROR(ENOMEM); -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n"); -+ -+ nal_size = gb->buffer_end - gb->buffer; -+ if (nal_size > sizeof(pps->data)) { -+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS " -+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", -+ nal_size, sizeof(pps->data)); -+ pps->data_size = sizeof(pps->data); -+ } else 
{ -+ pps->data_size = nal_size; -+ } -+ memcpy(pps->data, gb->buffer, pps->data_size); -+ -+ // Default values -+ pps->loop_filter_across_tiles_enabled_flag = 1; -+ pps->num_tile_columns = 1; -+ pps->num_tile_rows = 1; -+ pps->uniform_spacing_flag = 1; -+ pps->disable_dbf = 0; -+ pps->beta_offset = 0; -+ pps->tc_offset = 0; -+ pps->log2_max_transform_skip_block_size = 2; -+ -+ // Coded parameters -+ pps_id = get_ue_golomb_long(gb); -+ if (pps_id >= HEVC_MAX_PPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", pps_id); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->sps_id = get_ue_golomb_long(gb); -+ if (pps->sps_id >= HEVC_MAX_SPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", pps->sps_id); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ if (!ps->sps_list[pps->sps_id]) { -+ av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data; -+ -+ pps->dependent_slice_segments_enabled_flag = get_bits1(gb); -+ pps->output_flag_present_flag = get_bits1(gb); -+ pps->num_extra_slice_header_bits = get_bits(gb, 3); -+ -+ pps->sign_data_hiding_flag = get_bits1(gb); -+ -+ pps->cabac_init_present_flag = get_bits1(gb); -+ -+ pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1; -+ if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) { -+ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n"); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1; -+ if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) { -+ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n"); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ pps->pic_init_qp_minus26 = get_se_golomb(gb); -+ if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + 
sps->qp_bd_offset)) { -+ av_log(avctx, AV_LOG_ERROR, -+ "init_qp_minus26 %d is outside the valid range " -+ "[%d, %d].\n", -+ pps->pic_init_qp_minus26, -+ -(26 + sps->qp_bd_offset), 25); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ pps->constrained_intra_pred_flag = get_bits1(gb); -+ pps->transform_skip_enabled_flag = get_bits1(gb); -+ -+ pps->cu_qp_delta_enabled_flag = get_bits1(gb); -+ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size; -+ if (pps->cu_qp_delta_enabled_flag) -+ { -+ const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb); -+ -+ if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) { -+ av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %d is invalid\n", -+ diff_cu_qp_delta_depth); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth; -+ } -+ -+ pps->cb_qp_offset = get_se_golomb(gb); -+ if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) { -+ av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n", -+ pps->cb_qp_offset); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->cr_qp_offset = get_se_golomb(gb); -+ if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) { -+ av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n", -+ pps->cr_qp_offset); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb); -+ -+ pps->weighted_pred_flag = get_bits1(gb); -+ pps->weighted_bipred_flag = get_bits1(gb); -+ -+ pps->transquant_bypass_enable_flag = get_bits1(gb); -+ pps->tiles_enabled_flag = get_bits1(gb); -+ pps->entropy_coding_sync_enabled_flag = get_bits1(gb); -+ -+ if (pps->tiles_enabled_flag) { -+ pps->num_tile_columns = get_ue_golomb_long(gb) + 1; -+ pps->num_tile_rows = get_ue_golomb_long(gb) + 1; -+ if (pps->num_tile_columns <= 0 || -+ pps->num_tile_columns >= sps->width) { -+ av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n", 
-+ pps->num_tile_columns - 1); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ if (pps->num_tile_rows <= 0 || -+ pps->num_tile_rows >= sps->height) { -+ av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n", -+ pps->num_tile_rows - 1); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); -+ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); -+ if (!pps->column_width || !pps->row_height) { -+ ret = AVERROR(ENOMEM); -+ goto err; -+ } -+ -+ pps->uniform_spacing_flag = get_bits1(gb); -+ if (!pps->uniform_spacing_flag) { -+ uint64_t sum = 0; -+ for (i = 0; i < pps->num_tile_columns - 1; i++) { -+ pps->column_width[i] = get_ue_golomb_long(gb) + 1; -+ sum += pps->column_width[i]; -+ } -+ if (sum >= sps->ctb_width) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n"); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum; -+ -+ sum = 0; -+ for (i = 0; i < pps->num_tile_rows - 1; i++) { -+ pps->row_height[i] = get_ue_golomb_long(gb) + 1; -+ sum += pps->row_height[i]; -+ } -+ if (sum >= sps->ctb_height) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n"); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum; -+ } -+ pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb); -+ } -+ -+ pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb); -+ -+ pps->deblocking_filter_control_present_flag = get_bits1(gb); -+ if (pps->deblocking_filter_control_present_flag) { -+ pps->deblocking_filter_override_enabled_flag = get_bits1(gb); -+ pps->disable_dbf = get_bits1(gb); -+ if (!pps->disable_dbf) { -+ int beta_offset_div2 = get_se_golomb(gb); -+ int tc_offset_div2 = get_se_golomb(gb) ; -+ if (beta_offset_div2 < -6 || beta_offset_div2 > 6) { -+ av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of 
range: %d\n", -+ beta_offset_div2); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ if (tc_offset_div2 < -6 || tc_offset_div2 > 6) { -+ av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n", -+ tc_offset_div2); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->beta_offset = 2 * beta_offset_div2; -+ pps->tc_offset = 2 * tc_offset_div2; -+ } -+ } -+ -+ pps->scaling_list_data_present_flag = get_bits1(gb); -+ if (pps->scaling_list_data_present_flag) { -+ set_default_scaling_list_data(&pps->scaling_list); -+ ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps); -+ if (ret < 0) -+ goto err; -+ } -+ pps->lists_modification_present_flag = get_bits1(gb); -+ log2_parallel_merge_level_minus2 = get_ue_golomb_long(gb); -+ if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) { -+ av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %d\n", -+ log2_parallel_merge_level_minus2); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->log2_parallel_merge_level = log2_parallel_merge_level_minus2 + 2; -+ -+ pps->slice_header_extension_present_flag = get_bits1(gb); -+ -+ if (get_bits1(gb)) { // pps_extension_present_flag -+ int pps_range_extensions_flag = get_bits1(gb); -+ skip_bits(gb, 7); // pps_extension_7bits -+ if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) { -+ if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0) -+ goto err; -+ } -+ } -+ -+ ret = setup_pps(avctx, pps, sps); -+ if (ret < 0) -+ goto err; -+ -+ if (get_bits_left(gb) < 0) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Overread PPS by %d bits\n", -get_bits_left(gb)); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ remove_pps(ps, pps_id); -+ ps->pps_list[pps_id] = pps_buf; -+ -+ return 0; -+ -+err: -+ av_buffer_unref(&pps_buf); -+ return ret; -+} -+ -+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type) -+{ -+ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; -+ int 
prev_poc_lsb = pocTid0 % max_poc_lsb; -+ int prev_poc_msb = pocTid0 - prev_poc_lsb; -+ int poc_msb; -+ -+ if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2) -+ poc_msb = prev_poc_msb + max_poc_lsb; -+ else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2) -+ poc_msb = prev_poc_msb - max_poc_lsb; -+ else -+ poc_msb = prev_poc_msb; -+ -+ // For BLA picture types, POCmsb is set to 0. -+ if (nal_unit_type == HEVC_NAL_BLA_W_LP || -+ nal_unit_type == HEVC_NAL_BLA_W_RADL || -+ nal_unit_type == HEVC_NAL_BLA_N_LP) -+ poc_msb = 0; -+ -+ return poc_msb + poc_lsb; -+} -diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h -new file mode 100644 -index 0000000000..c725ebb9ca ---- /dev/null -+++ b/libavcodec/rpi_hevc_ps.h -@@ -0,0 +1,449 @@ -+/* -+ * HEVC parameter set parsing -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_RPI_HEVC_PS_H -+#define AVCODEC_RPI_HEVC_PS_H -+ -+#include -+ -+#include "libavutil/buffer.h" -+#include "libavutil/pixfmt.h" -+#include "libavutil/rational.h" -+ -+#include "avcodec.h" -+#include "get_bits.h" -+#include "hevc.h" -+ -+typedef struct ShortTermRPS { -+ unsigned int num_negative_pics; -+ int num_delta_pocs; -+ int rps_idx_num_delta_pocs; -+ int32_t delta_poc[32]; -+ uint8_t used[32]; -+} ShortTermRPS; -+ -+typedef struct LongTermRPS { -+ int poc[32]; -+ uint8_t used[32]; -+ uint8_t nb_refs; -+} LongTermRPS; -+ -+typedef struct RpiSliceHeader { -+ unsigned int pps_id; -+ -+ ///< address (in raster order) of the first block in the current slice segment -+ unsigned int slice_segment_addr; -+ ///< address (in raster order) of the first block in the current slice -+ unsigned int slice_addr; -+ -+ enum HEVCSliceType slice_type; -+ -+ int pic_order_cnt_lsb; -+ -+ uint8_t first_slice_in_pic_flag; -+ uint8_t dependent_slice_segment_flag; -+ uint8_t pic_output_flag; -+ uint8_t colour_plane_id; -+ -+ ///< RPS coded in the slice header itself is stored here -+ int short_term_ref_pic_set_sps_flag; -+ int short_term_ref_pic_set_size; -+ ShortTermRPS slice_rps; -+ const ShortTermRPS *short_term_rps; -+ int long_term_ref_pic_set_size; -+ LongTermRPS long_term_rps; -+ unsigned int list_entry_lx[2][32]; -+ -+ uint8_t rpl_modification_flag[2]; -+ uint8_t no_output_of_prior_pics_flag; -+ uint8_t slice_temporal_mvp_enabled_flag; -+ -+ unsigned int nb_refs[2]; -+ -+ uint8_t slice_sample_adaptive_offset_flag[3]; -+ uint8_t mvd_l1_zero_flag; -+ -+ uint8_t cabac_init_flag; -+ uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag -+ uint8_t slice_loop_filter_across_slices_enabled_flag; -+ 
uint8_t collocated_list; -+ -+ uint8_t no_dblk_boundary_flags; -+ -+ unsigned int collocated_ref_idx; -+ -+ int slice_qp_delta; -+ int slice_cb_qp_offset; // -12, +12 -+ int slice_cr_qp_offset; // -12, +12 -+ -+ uint8_t cu_chroma_qp_offset_enabled_flag; -+ -+ int beta_offset; ///< beta_offset_div2 * 2 -+ int tc_offset; ///< tc_offset_div2 * 2 -+ -+ unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand -+ -+ unsigned *entry_point_offset; -+ int * offset; -+ int * size; -+ int num_entry_point_offsets; -+ int offsets_allocated; -+ -+ uint8_t offload_wpp; -+ uint8_t offload_tiles; -+ -+ int8_t slice_qp; -+ -+ uint8_t luma_log2_weight_denom; -+ uint8_t chroma_log2_weight_denom; -+ -+ int16_t luma_weight_l0[16]; // -128, +255 -+ int16_t luma_offset_l0[16]; -+ int16_t chroma_weight_l0[16][2]; -+ int16_t chroma_offset_l0[16][2]; -+ -+ int16_t luma_weight_l1[16]; -+ int16_t luma_offset_l1[16]; -+ int16_t chroma_weight_l1[16][2]; -+ int16_t chroma_offset_l1[16][2]; -+ -+} RpiSliceHeader; -+ -+typedef struct HEVCRpiWindow { -+ uint16_t left_offset; -+ uint16_t right_offset; -+ uint16_t top_offset; -+ uint16_t bottom_offset; -+} HEVCRpiWindow; -+ -+typedef struct VUI { -+ AVRational sar; -+ -+ int overscan_info_present_flag; -+ int overscan_appropriate_flag; -+ -+ int video_signal_type_present_flag; -+ int video_format; -+ int video_full_range_flag; -+ int colour_description_present_flag; -+ uint8_t colour_primaries; -+ uint8_t transfer_characteristic; -+ uint8_t matrix_coeffs; -+ -+ int chroma_loc_info_present_flag; -+ int chroma_sample_loc_type_top_field; -+ int chroma_sample_loc_type_bottom_field; -+ int neutra_chroma_indication_flag; -+ -+ int field_seq_flag; -+ int frame_field_info_present_flag; -+ -+ int default_display_window_flag; -+ HEVCRpiWindow def_disp_win; -+ -+ int vui_timing_info_present_flag; -+ uint32_t vui_num_units_in_tick; -+ uint32_t vui_time_scale; -+ int vui_poc_proportional_to_timing_flag; -+ int vui_num_ticks_poc_diff_one_minus1; -+ int 
vui_hrd_parameters_present_flag; -+ -+ int bitstream_restriction_flag; -+ int tiles_fixed_structure_flag; -+ int motion_vectors_over_pic_boundaries_flag; -+ int restricted_ref_pic_lists_flag; -+ int min_spatial_segmentation_idc; -+ int max_bytes_per_pic_denom; -+ int max_bits_per_min_cu_denom; -+ int log2_max_mv_length_horizontal; -+ int log2_max_mv_length_vertical; -+} VUI; -+ -+typedef struct PTLCommon { -+ uint8_t profile_space; -+ uint8_t tier_flag; -+ uint8_t profile_idc; -+ uint8_t profile_compatibility_flag[32]; -+ uint8_t level_idc; -+ uint8_t progressive_source_flag; -+ uint8_t interlaced_source_flag; -+ uint8_t non_packed_constraint_flag; -+ uint8_t frame_only_constraint_flag; -+} PTLCommon; -+ -+typedef struct PTL { -+ PTLCommon general_ptl; -+ PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS]; -+ -+ uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS]; -+ uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS]; -+} PTL; -+ -+typedef struct HEVCRpiVPS { -+ uint8_t vps_temporal_id_nesting_flag; -+ int vps_max_layers; -+ int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1 -+ -+ PTL ptl; -+ int vps_sub_layer_ordering_info_present_flag; -+ unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS]; -+ unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS]; -+ unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS]; -+ int vps_max_layer_id; -+ int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1 -+ uint8_t vps_timing_info_present_flag; -+ uint32_t vps_num_units_in_tick; -+ uint32_t vps_time_scale; -+ uint8_t vps_poc_proportional_to_timing_flag; -+ int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1 -+ int vps_num_hrd_parameters; -+ -+ uint8_t data[4096]; -+ int data_size; -+} HEVCRpiVPS; -+ -+typedef struct ScalingList { -+ /* This is a little wasteful, since sizeID 0 only needs 8 coeffs, -+ * and size ID 3 only has 2 arrays, not 6. 
*/ -+ uint8_t sl[4][6][64]; -+ uint8_t sl_dc[2][6]; -+} ScalingList; -+ -+typedef struct HEVCRpiSPS { -+ unsigned vps_id; -+ uint8_t chroma_format_idc; -+ uint8_t separate_colour_plane_flag; -+ -+ HEVCRpiWindow output_window; -+ -+ HEVCRpiWindow pic_conf_win; -+ -+ uint16_t wp_offset_half_range; // WpOffsetHalfRange -+ -+ uint8_t bit_depth; -+ -+// int bit_depth_chroma; // We only support lum_bit_depth = chroma_bit_depth -+ uint8_t pixel_shift; -+ enum AVPixelFormat pix_fmt; -+ -+ unsigned int log2_max_poc_lsb; -+ -+ int max_sub_layers; -+ struct { -+ int max_dec_pic_buffering; -+ int num_reorder_pics; -+ int max_latency_increase; -+ } temporal_layer[HEVC_MAX_SUB_LAYERS]; -+ uint8_t temporal_id_nesting_flag; -+ -+ uint8_t scaling_list_enable_flag; -+ ScalingList scaling_list; -+ -+ unsigned int nb_st_rps; -+ ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_REF_PIC_SETS]; -+ -+ uint8_t amp_enabled_flag; -+ uint8_t sao_enabled; -+ -+ uint8_t long_term_ref_pics_present_flag; -+ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS]; -+ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS]; -+ uint8_t num_long_term_ref_pics_sps; -+ -+ struct { -+ uint8_t bit_depth; -+ uint8_t bit_depth_chroma; -+ uint8_t log2_min_pcm_cb_size; -+ uint8_t log2_max_pcm_cb_size; -+ uint8_t loop_filter_disable_flag; -+ } pcm; -+ char sps_temporal_mvp_enabled_flag; -+// char sps_strong_intra_smoothing_enable_flag; -> intra_filtes_disable -+ -+ uint8_t log2_min_cb_size; // 3..6 -+ uint8_t log2_diff_max_min_coding_block_size; -+ uint8_t log2_min_tb_size; // 2..5 -+ uint8_t log2_max_trafo_size; -+ uint8_t log2_ctb_size; // 4..6 -+// unsigned int log2_min_pu_size; // 2..5 (min_cb_size - 1) -+#define LOG2_MIN_PU_SIZE 2 -+#define LOG2_MIN_CU_SIZE 3 -+ -+ uint8_t max_transform_hierarchy_depth_inter; -+ uint8_t max_transform_hierarchy_depth_intra; -+ -+ char transform_skip_rotation_enabled_flag; -+ char transform_skip_context_enabled_flag; -+ char implicit_rdpcm_enabled_flag; -+ char 
explicit_rdpcm_enabled_flag; -+// char intra_smoothing_disabled_flag; -> intra_filtes_disable -+ char high_precision_offsets_enabled_flag; -+ char persistent_rice_adaptation_enabled_flag; -+ -+ uint8_t intra_filters_disable; -+ -+ ///< coded frame dimension in various units -+ int width; -+ int height; -+ int ctb_width; -+ int ctb_height; -+ int ctb_size; // Pic size in CTBs not size of a CTB -+ int min_cb_width; -+ int min_cb_height; -+ int min_tb_width; -+ int min_tb_height; -+ int min_pu_width; -+ int min_pu_height; -+ int pcm_width; -+ int pcm_height; -+ int tb_mask; -+ -+ int hshift[3]; -+ int vshift[3]; -+ -+ int qp_bd_offset; -+ -+ uint8_t data[4096]; -+ int data_size; -+ -+ VUI vui; -+ PTL ptl; -+} HEVCRpiSPS; -+ -+#define CTB_TS_FLAGS_SOTL (1U << 0) // X start of tile line -+#define CTB_TS_FLAGS_EOTL (1U << 1) // Last CTB of a tile line -+#define CTB_TS_FLAGS_EOL (1U << 2) // Last CTB of a complete line -+#define CTB_TS_FLAGS_EOT (1U << 3) // Last CTB of a tile -+#define CTB_TS_FLAGS_CSAVE (1U << 4) -+#define CTB_TS_FLAGS_CIREQ (1U << 5) // Cabac init request -+#define CTB_TS_FLAGS_TOT (1U << 6) // CTB on top row of a tile -+#define CTB_TS_FLAGS_CLOAD (1U << 7) -+ -+typedef struct HEVCRpiPPS { -+ unsigned int sps_id; ///< seq_parameter_set_id -+ -+ uint8_t sign_data_hiding_flag; -+ -+ uint8_t cabac_init_present_flag; -+ -+ int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1 -+ int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1 -+ int pic_init_qp_minus26; -+ -+ uint8_t constrained_intra_pred_flag; -+ uint8_t transform_skip_enabled_flag; -+ -+ uint8_t cu_qp_delta_enabled_flag; -+ uint8_t log2_min_cu_qp_delta_size; -+ int cb_qp_offset; // -12..12 -+ int cr_qp_offset; // -12..12 -+ const uint8_t * qp_dblk_x[3]; -+ const int8_t * qp_bd_x[3]; -+ -+ uint8_t pic_slice_level_chroma_qp_offsets_present_flag; -+ uint8_t weighted_pred_flag; -+ uint8_t weighted_bipred_flag; -+ uint8_t output_flag_present_flag; 
-+ uint8_t transquant_bypass_enable_flag; -+ -+ uint8_t dependent_slice_segments_enabled_flag; -+ uint8_t tiles_enabled_flag; -+ uint8_t entropy_coding_sync_enabled_flag; -+ -+ uint8_t tile_wpp_inter_disable; -+ int num_tile_columns; ///< num_tile_columns_minus1 + 1 -+ int num_tile_rows; ///< num_tile_rows_minus1 + 1 -+ uint8_t uniform_spacing_flag; -+ uint8_t loop_filter_across_tiles_enabled_flag; -+ -+ uint8_t seq_loop_filter_across_slices_enabled_flag; -+ -+ uint8_t deblocking_filter_control_present_flag; -+ uint8_t deblocking_filter_override_enabled_flag; -+ uint8_t disable_dbf; -+ int beta_offset; ///< beta_offset_div2 * 2 -+ int tc_offset; ///< tc_offset_div2 * 2 -+ -+ uint8_t scaling_list_data_present_flag; -+ ScalingList scaling_list; -+ -+ uint8_t lists_modification_present_flag; -+ int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2 -+ int num_extra_slice_header_bits; -+ uint8_t slice_header_extension_present_flag; -+ uint8_t log2_max_transform_skip_block_size; -+ uint8_t cross_component_prediction_enabled_flag; -+ uint8_t chroma_qp_offset_list_enabled_flag; -+ uint8_t diff_cu_chroma_qp_offset_depth; -+ uint8_t chroma_qp_offset_list_len_minus1; -+ int8_t cb_qp_offset_list[6]; -+ int8_t cr_qp_offset_list[6]; -+ uint8_t log2_sao_offset_scale_luma; -+ uint8_t log2_sao_offset_scale_chroma; -+ -+ // Inferred parameters -+ uint16_t *column_width; ///< ColumnWidth -+ uint16_t *row_height; ///< RowHeight -+ uint16_t *col_bd; ///< ColBd -+ uint16_t *row_bd; ///< RowBd -+ uint16_t *col_idxX; -+ -+ // We can limit these to uint16_t given our other size limits -+ uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS -+ uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS -+ uint16_t *tile_id; ///< TileId -+ uint16_t *tile_pos_ts; ///< TilePosRS -+ uint16_t *tile_size; ///< TileSize -+ uint8_t * ctb_ts_flags; -+ -+ uint8_t data[4096]; -+ int data_size; -+} HEVCRpiPPS; -+ -+typedef struct HEVCRpiParamSets { -+ /* currently active parameter sets */ -+ const 
HEVCRpiVPS *vps; -+ const HEVCRpiSPS *sps; -+ const HEVCRpiPPS *pps; -+ -+ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT]; -+ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT]; -+ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT]; -+} HEVCRpiParamSets; -+ -+int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx, -+ HEVCRpiParamSets *ps); -+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, -+ HEVCRpiParamSets *ps, int apply_defdispwin); -+int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx, -+ HEVCRpiParamSets *ps); -+ -+int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, -+ ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header); -+ -+int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id, -+ uint8_t *buf, int buf_size); -+ -+/** -+ * Compute POC of the current frame and return it. -+ */ -+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type); -+ -+#endif /* AVCODEC_RPI_HEVC_PS_H */ -diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c -new file mode 100644 -index 0000000000..8cc5796cf0 ---- /dev/null -+++ b/libavcodec/rpi_hevc_refs.c -@@ -0,0 +1,485 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/avassert.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/rpi_sand_fns.h" -+#include "internal.h" -+#include "thread.h" -+#include "hevc.h" -+#include "rpi_hevcdec.h" -+ -+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags) -+{ -+ /* frame->frame can be NULL if context init failed */ -+ if (!frame->frame || !frame->frame->buf[0]) -+ return; -+ -+ frame->flags &= ~flags; -+ if (!frame->flags) { -+ ff_thread_release_buffer(s->avctx, &frame->tf); -+ -+ av_buffer_unref(&frame->col_mvf_buf); // OK if already NULL -+ frame->col_mvf = NULL; -+ -+ frame->collocated_ref = NULL; -+ } -+} -+ -+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s) -+{ -+ int i; -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) -+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], -+ HEVC_FRAME_FLAG_SHORT_REF | -+ HEVC_FRAME_FLAG_LONG_REF); -+} -+ -+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s) -+{ -+ int i; -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) -+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); -+} -+ -+static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s) -+{ -+ int i, ret; -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame * const frame = &s->DPB[i]; -+ if (frame->frame->buf[0]) -+ continue; -+ -+ ret = ff_thread_get_buffer(s->avctx, &frame->tf, -+ AV_GET_BUFFER_FLAG_REF); -+ if (ret < 0) -+ return NULL; -+ -+ frame->col_mvf = NULL; -+ frame->col_mvf_buf = NULL; -+ if (s->used_for_ref && !s->is_irap) -+ { -+ frame->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool); -+ if (!frame->col_mvf_buf) -+ goto fail; -+ frame->col_mvf = (ColMvField *)frame->col_mvf_buf->data; -+ } -+ -+ frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; -+ 
frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); -+ -+ return frame; -+ -+fail: -+ ff_hevc_rpi_unref_frame(s, frame, ~0); -+ return NULL; -+ } -+ av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n"); -+ return NULL; -+} -+ -+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc) -+{ -+ HEVCRpiFrame *ref; -+ int i; -+ -+ /* check that this POC doesn't already exist */ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ -+ if (frame->frame->buf[0] && frame->sequence == s->seq_decode && -+ frame->poc == poc) { -+ av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n", -+ poc); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ ref = alloc_frame(s); -+ if (!ref) -+ return AVERROR(ENOMEM); -+ -+ *frame = ref->frame; -+ s->ref = ref; -+ -+ if (s->sh.pic_output_flag) -+ ref->flags = HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF; -+ else -+ ref->flags = HEVC_FRAME_FLAG_SHORT_REF; -+ -+ ref->poc = poc; -+ ref->sequence = s->seq_decode; -+ ref->frame->crop_left = s->ps.sps->output_window.left_offset; -+ ref->frame->crop_right = s->ps.sps->output_window.right_offset; -+ ref->frame->crop_top = s->ps.sps->output_window.top_offset; -+ ref->frame->crop_bottom = s->ps.sps->output_window.bottom_offset; -+ -+ return 0; -+} -+ -+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush) -+{ -+ do { -+ int nb_output = 0; -+ int min_poc = INT_MAX; -+ int i, min_idx, ret; -+ -+ if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) { -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc && -+ frame->sequence == s->seq_output) { -+ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); -+ } -+ } -+ } -+ -+ for (i = 0; i < 
FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) && -+ frame->sequence == s->seq_output) { -+ nb_output++; -+ if (frame->poc < min_poc || nb_output == 1) { -+ min_poc = frame->poc; -+ min_idx = i; -+ } -+ } -+ } -+ -+ /* wait for more frames before output */ -+ if (!flush && s->seq_output == s->seq_decode && s->ps.sps && -+ nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics) -+ return 0; -+ -+ if (nb_output) { -+ HEVCRpiFrame *frame = &s->DPB[min_idx]; -+ if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1) -+ return 0; -+ -+ ret = av_frame_ref(out, frame->frame); -+ if (frame->flags & HEVC_FRAME_FLAG_BUMPING) -+ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING); -+ else -+ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); -+ if (ret < 0) -+ return ret; -+ av_log(s->avctx, AV_LOG_DEBUG, -+ "Output frame with POC %d.\n", frame->poc); -+ return 1; -+ } -+ -+ if (s->seq_output != s->seq_decode) -+ s->seq_output = (s->seq_output + 1) & 0xff; -+ else -+ break; -+ } while (1); -+ -+ return 0; -+} -+ -+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s) -+{ -+ int dpb = 0; -+ int min_poc = INT_MAX; -+ int i; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ if ((frame->flags) && -+ frame->sequence == s->seq_output && -+ frame->poc != s->poc) { -+ dpb++; -+ } -+ } -+ -+ if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) { -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ if ((frame->flags) && -+ frame->sequence == s->seq_output && -+ frame->poc != s->poc) { -+ if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) { -+ min_poc = frame->poc; -+ } -+ } -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ if 
(frame->flags & HEVC_FRAME_FLAG_OUTPUT && -+ frame->sequence == s->seq_output && -+ frame->poc <= min_poc) { -+ frame->flags |= HEVC_FRAME_FLAG_BUMPING; -+ } -+ } -+ -+ dpb--; -+ } -+} -+ -+static int init_slice_rpl(HEVCRpiContext *s) -+{ -+ if (s->slice_idx >= s->rpl_tab_size) -+ return AVERROR_INVALIDDATA; -+ -+ s->refPicList = s->rpl_tab[s->slice_idx].refPicList + 0; -+ return 0; -+} -+ -+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s) -+{ -+ RpiSliceHeader *sh = &s->sh; -+ -+ uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1; -+ uint8_t list_idx; -+ int i, j, ret; -+ -+ ret = init_slice_rpl(s); -+ if (ret < 0) -+ return ret; -+ -+ if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs + -+ s->rps[LT_CURR].nb_refs)) { -+ av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ for (list_idx = 0; list_idx < nb_list; list_idx++) { -+ RefPicList rpl_tmp = { { 0 } }; -+ RefPicList *rpl = &s->refPicList[list_idx]; -+ -+ /* The order of the elements is -+ * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and -+ * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */ -+ int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF, -+ list_idx ? 
ST_CURR_BEF : ST_CURR_AFT, -+ LT_CURR }; -+ -+ /* concatenate the candidate lists for the current frame */ -+ while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) { -+ for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) { -+ RefPicList *rps = &s->rps[cand_lists[i]]; -+ for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) { -+ rpl_tmp.list[rpl_tmp.nb_refs] = rps->list[j]; -+ rpl_tmp.ref[rpl_tmp.nb_refs] = rps->ref[j]; -+ rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2; -+ rpl_tmp.nb_refs++; -+ } -+ } -+ } -+ -+ /* reorder the references if necessary */ -+ if (sh->rpl_modification_flag[list_idx]) { -+ for (i = 0; i < sh->nb_refs[list_idx]; i++) { -+ int idx = sh->list_entry_lx[list_idx][i]; -+ -+ if (idx >= rpl_tmp.nb_refs) { -+ av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ rpl->list[i] = rpl_tmp.list[idx]; -+ rpl->ref[i] = rpl_tmp.ref[idx]; -+ rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx]; -+ rpl->nb_refs++; -+ } -+ } else { -+ memcpy(rpl, &rpl_tmp, sizeof(*rpl)); -+ rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]); -+ } -+ -+ if (sh->collocated_list == list_idx && -+ sh->collocated_ref_idx < rpl->nb_refs) -+ s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx]; -+ } -+ -+ return 0; -+} -+ -+static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc) -+{ -+ int i; -+ int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *ref = &s->DPB[i]; -+ if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) { -+ if ((ref->poc & LtMask) == poc) -+ return ref; -+ } -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *ref = &s->DPB[i]; -+ if (ref->frame->buf[0] && ref->sequence == s->seq_decode) { -+ if (ref->poc == poc || (ref->poc & LtMask) == poc) -+ return ref; -+ } -+ } -+ -+ if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s)) -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Could not find ref with POC %d\n", 
poc); -+ return NULL; -+} -+ -+static void mark_ref(HEVCRpiFrame *frame, int flag) -+{ -+ frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF); -+ frame->flags |= flag; -+} -+ -+static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc) -+{ -+ HEVCRpiFrame *frame; -+ int i, x, y; -+ -+ frame = alloc_frame(s); -+ if (!frame) -+ return NULL; -+ -+ if (!s->ps.sps->pixel_shift) { -+ for (i = 0; frame->frame->buf[i]; i++) -+ memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1), -+ frame->frame->buf[i]->size); -+ } else { -+ for (i = 0; frame->frame->data[i]; i++) -+ for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++) -+ for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) { -+ AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x, -+ 1 << (s->ps.sps->bit_depth - 1)); -+ } -+ } -+ -+ frame->poc = poc; -+ frame->sequence = s->seq_decode; -+ frame->flags = 0; -+ -+ ff_hevc_rpi_progress_set_all_done(frame); -+ -+ return frame; -+} -+ -+/* add a reference with the given poc to the list and mark it as used in DPB */ -+static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list, -+ int poc, int ref_flag) -+{ -+ HEVCRpiFrame *ref = find_ref_idx(s, poc); -+ -+ if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS) -+ return AVERROR_INVALIDDATA; -+ -+ if (!ref) { -+ ref = generate_missing_ref(s, poc); -+ if (!ref) -+ return AVERROR(ENOMEM); -+ } -+ -+ list->list[list->nb_refs] = ref->poc; -+ list->ref[list->nb_refs] = ref; -+ list->nb_refs++; -+ -+ mark_ref(ref, ref_flag); -+ return 0; -+} -+ -+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s) -+{ -+ const ShortTermRPS *short_rps = s->sh.short_term_rps; -+ const LongTermRPS *long_rps = &s->sh.long_term_rps; -+ RefPicList *rps = s->rps; -+ int i, ret = 0; -+ -+ if (!short_rps) { -+ rps[0].nb_refs = rps[1].nb_refs = 0; -+ return 0; -+ } -+ -+ /* clear the reference flags on all frames except the current one */ -+ for (i = 0; i < 
FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ -+ if (frame == s->ref) -+ continue; -+ -+ mark_ref(frame, 0); -+ } -+ -+ for (i = 0; i < NB_RPS_TYPE; i++) -+ rps[i].nb_refs = 0; -+ -+ /* add the short refs */ -+ for (i = 0; i < short_rps->num_delta_pocs; i++) { -+ int poc = s->poc + short_rps->delta_poc[i]; -+ int list; -+ -+ if (!short_rps->used[i]) -+ list = ST_FOLL; -+ else if (i < short_rps->num_negative_pics) -+ list = ST_CURR_BEF; -+ else -+ list = ST_CURR_AFT; -+ -+ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF); -+ if (ret < 0) -+ goto fail; -+ } -+ -+ /* add the long refs */ -+ for (i = 0; i < long_rps->nb_refs; i++) { -+ int poc = long_rps->poc[i]; -+ int list = long_rps->used[i] ? LT_CURR : LT_FOLL; -+ -+ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF); -+ if (ret < 0) -+ goto fail; -+ } -+ -+fail: -+ /* release any frames that are now unused */ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) -+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0); -+ -+ return ret; -+} -+ -+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s) -+{ -+ int ret = 0; -+ int i; -+ const ShortTermRPS *rps = s->sh.short_term_rps; -+ LongTermRPS *long_rps = &s->sh.long_term_rps; -+ -+ if (rps) { -+ for (i = 0; i < rps->num_negative_pics; i++) -+ ret += !!rps->used[i]; -+ for (; i < rps->num_delta_pocs; i++) -+ ret += !!rps->used[i]; -+ } -+ -+ if (long_rps) { -+ for (i = 0; i < long_rps->nb_refs; i++) -+ ret += !!long_rps->used[i]; -+ } -+ return ret; -+} -diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c -new file mode 100644 -index 0000000000..cd8149d58e ---- /dev/null -+++ b/libavcodec/rpi_hevc_sei.c -@@ -0,0 +1,368 @@ -+/* -+ * HEVC Supplementary Enhancement Information messages -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2013 Vittorio Giovara -+ * -+ * This file is part of FFmpeg. 
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "golomb.h" -+#include "rpi_hevc_ps.h" -+#include "rpi_hevc_sei.h" -+ -+static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb) -+{ -+ int cIdx, i; -+ uint8_t hash_type; -+ //uint16_t picture_crc; -+ //uint32_t picture_checksum; -+ hash_type = get_bits(gb, 8); -+ -+ for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 
1 : 3)*/; cIdx++) { -+ if (hash_type == 0) { -+ s->is_md5 = 1; -+ for (i = 0; i < 16; i++) -+ s->md5[cIdx][i] = get_bits(gb, 8); -+ } else if (hash_type == 1) { -+ // picture_crc = get_bits(gb, 16); -+ skip_bits(gb, 16); -+ } else if (hash_type == 2) { -+ // picture_checksum = get_bits_long(gb, 32); -+ skip_bits(gb, 32); -+ } -+ } -+ return 0; -+} -+ -+static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb) -+{ -+ int i; -+ // Mastering primaries -+ for (i = 0; i < 3; i++) { -+ s->display_primaries[i][0] = get_bits(gb, 16); -+ s->display_primaries[i][1] = get_bits(gb, 16); -+ } -+ // White point (x, y) -+ s->white_point[0] = get_bits(gb, 16); -+ s->white_point[1] = get_bits(gb, 16); -+ -+ // Max and min luminance of mastering display -+ s->max_luminance = get_bits_long(gb, 32); -+ s->min_luminance = get_bits_long(gb, 32); -+ -+ // As this SEI message comes before the first frame that references it, -+ // initialize the flag to 2 and decrement on IRAP access unit so it -+ // persists for the coded video sequence (e.g., between two IRAPs) -+ s->present = 2; -+ return 0; -+} -+ -+static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb) -+{ -+ // Max and average light levels -+ s->max_content_light_level = get_bits_long(gb, 16); -+ s->max_pic_average_light_level = get_bits_long(gb, 16); -+ // As this SEI message comes before the first frame that references it, -+ // initialize the flag to 2 and decrement on IRAP access unit so it -+ // persists for the coded video sequence (e.g., between two IRAPs) -+ s->present = 2; -+ return 0; -+} -+ -+static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb) -+{ -+ get_ue_golomb_long(gb); // frame_packing_arrangement_id -+ s->present = !get_bits1(gb); -+ -+ if (s->present) { -+ s->arrangement_type = get_bits(gb, 7); -+ s->quincunx_subsampling = get_bits1(gb); -+ s->content_interpretation_type = get_bits(gb, 6); -+ -+ // 
spatial_flipping_flag, frame0_flipped_flag, field_views_flag -+ skip_bits(gb, 3); -+ s->current_frame_is_frame0_flag = get_bits1(gb); -+ // frame0_self_contained_flag, frame1_self_contained_flag -+ skip_bits(gb, 2); -+ -+ if (!s->quincunx_subsampling && s->arrangement_type != 5) -+ skip_bits(gb, 16); // frame[01]_grid_position_[xy] -+ skip_bits(gb, 8); // frame_packing_arrangement_reserved_byte -+ skip_bits1(gb); // frame_packing_arrangement_persistence_flag -+ } -+ skip_bits1(gb); // upsampled_aspect_ratio_flag -+ return 0; -+} -+ -+static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb) -+{ -+ s->present = !get_bits1(gb); -+ -+ if (s->present) { -+ s->hflip = get_bits1(gb); // hor_flip -+ s->vflip = get_bits1(gb); // ver_flip -+ -+ s->anticlockwise_rotation = get_bits(gb, 16); -+ skip_bits1(gb); // display_orientation_persistence_flag -+ } -+ -+ return 0; -+} -+ -+static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps, -+ void *logctx, int size) -+{ -+ HEVCSEIPictureTiming *h = &s->picture_timing; -+ HEVCRpiSPS *sps; -+ -+ if (!ps->sps_list[s->active_seq_parameter_set_id]) -+ return(AVERROR(ENOMEM)); -+ sps = (HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data; -+ -+ if (sps->vui.frame_field_info_present_flag) { -+ int pic_struct = get_bits(gb, 4); -+ h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN; -+ if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) { -+ av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n"); -+ h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD; -+ } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) { -+ av_log(logctx, AV_LOG_DEBUG, "TOP Field\n"); -+ h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD; -+ } -+ get_bits(gb, 2); // source_scan_type -+ get_bits(gb, 1); // duplicate_flag -+ skip_bits1(gb); -+ size--; -+ } -+ skip_bits_long(gb, 8 * size); -+ -+ return 0; -+} -+ -+static int 
decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb, -+ int size) -+{ -+ int flag; -+ int user_data_type_code; -+ int cc_count; -+ -+ if (size < 3) -+ return AVERROR(EINVAL); -+ -+ user_data_type_code = get_bits(gb, 8); -+ if (user_data_type_code == 0x3) { -+ skip_bits(gb, 1); // reserved -+ -+ flag = get_bits(gb, 1); // process_cc_data_flag -+ if (flag) { -+ skip_bits(gb, 1); -+ cc_count = get_bits(gb, 5); -+ skip_bits(gb, 8); // reserved -+ size -= 2; -+ -+ if (cc_count && size >= cc_count * 3) { -+ const uint64_t new_size = (s->a53_caption_size + cc_count -+ * UINT64_C(3)); -+ int i, ret; -+ -+ if (new_size > INT_MAX) -+ return AVERROR(EINVAL); -+ -+ /* Allow merging of the cc data from two fields. */ -+ ret = av_reallocp(&s->a53_caption, new_size); -+ if (ret < 0) -+ return ret; -+ -+ for (i = 0; i < cc_count; i++) { -+ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); -+ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); -+ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); -+ } -+ skip_bits(gb, 8); // marker_bits -+ } -+ } -+ } else { -+ int i; -+ for (i = 0; i < size - 1; i++) -+ skip_bits(gb, 8); -+ } -+ -+ return 0; -+} -+ -+static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb, -+ int size) -+{ -+ uint32_t country_code; -+ uint32_t user_identifier; -+ -+ if (size < 7) -+ return AVERROR(EINVAL); -+ size -= 7; -+ -+ country_code = get_bits(gb, 8); -+ if (country_code == 0xFF) { -+ skip_bits(gb, 8); -+ size--; -+ } -+ -+ skip_bits(gb, 8); -+ skip_bits(gb, 8); -+ -+ user_identifier = get_bits_long(gb, 32); -+ -+ switch (user_identifier) { -+ case MKBETAG('G', 'A', '9', '4'): -+ return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size); -+ default: -+ skip_bits_long(gb, size * 8); -+ break; -+ } -+ return 0; -+} -+ -+static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx) -+{ -+ int num_sps_ids_minus1; -+ int 
i; -+ unsigned active_seq_parameter_set_id; -+ -+ get_bits(gb, 4); // active_video_parameter_set_id -+ get_bits(gb, 1); // self_contained_cvs_flag -+ get_bits(gb, 1); // num_sps_ids_minus1 -+ num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1 -+ -+ if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) { -+ av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ active_seq_parameter_set_id = get_ue_golomb_long(gb); -+ if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) { -+ av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id); -+ return AVERROR_INVALIDDATA; -+ } -+ s->active_seq_parameter_set_id = active_seq_parameter_set_id; -+ -+ for (i = 1; i <= num_sps_ids_minus1; i++) -+ get_ue_golomb_long(gb); // active_seq_parameter_set_id[i] -+ -+ return 0; -+} -+ -+static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb) -+{ -+ s->present = 1; -+ s->preferred_transfer_characteristics = get_bits(gb, 8); -+ return 0; -+} -+ -+static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps, -+ int type, int size) -+{ -+ switch (type) { -+ case 256: // Mismatched value from HM 8.1 -+ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); -+ case HEVC_SEI_TYPE_FRAME_PACKING: -+ return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb); -+ case HEVC_SEI_TYPE_DISPLAY_ORIENTATION: -+ return decode_nal_sei_display_orientation(&s->display_orientation, gb); -+ case HEVC_SEI_TYPE_PICTURE_TIMING: -+ return decode_nal_sei_pic_timing(s, gb, ps, logctx, size); -+ case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO: -+ return decode_nal_sei_mastering_display_info(&s->mastering_display, gb); -+ case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO: -+ return decode_nal_sei_content_light_info(&s->content_light, gb); -+ case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS: -+ 
return decode_nal_sei_active_parameter_sets(s, gb, logctx); -+ case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35: -+ return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size); -+ case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS: -+ return decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb); -+ default: -+ av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type); -+ skip_bits_long(gb, 8 * size); -+ return 0; -+ } -+} -+ -+static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, -+ int type, int size) -+{ -+ switch (type) { -+ case HEVC_SEI_TYPE_DECODED_PICTURE_HASH: -+ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); -+ default: -+ av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type); -+ skip_bits_long(gb, 8 * size); -+ return 0; -+ } -+} -+ -+static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s, -+ const HEVCRpiParamSets * const ps, const int nal_unit_type) -+{ -+ int payload_type = 0; -+ int payload_size = 0; -+ int byte = 0xFF; -+ av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n"); -+ -+ while (byte == 0xFF) { -+ if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255) -+ return AVERROR_INVALIDDATA; -+ byte = get_bits(gb, 8); -+ payload_type += byte; -+ } -+ byte = 0xFF; -+ while (byte == 0xFF) { -+ if (get_bits_left(gb) < 8 + 8LL*payload_size) -+ return AVERROR_INVALIDDATA; -+ byte = get_bits(gb, 8); -+ payload_size += byte; -+ } -+ if (nal_unit_type == HEVC_NAL_SEI_PREFIX) { -+ return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size); -+ } else { /* nal_unit_type == NAL_SEI_SUFFIX */ -+ return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size); -+ } -+} -+ -+static int more_rbsp_data(GetBitContext *gb) -+{ -+ return get_bits_left(gb) > 0 && show_bits(gb, 8) != 0x80; -+} -+ -+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, -+ const HEVCRpiParamSets *ps, int 
type) -+{ -+ int ret; -+ -+ do { -+ ret = decode_nal_sei_message(gb, logctx, s, ps, type); -+ if (ret < 0) -+ return ret; -+ } while (more_rbsp_data(gb)); -+ return 1; -+} -+ -+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s) -+{ -+ s->a53_caption.a53_caption_size = 0; -+ av_freep(&s->a53_caption.a53_caption); -+} -diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h -new file mode 100644 -index 0000000000..d4ac348df9 ---- /dev/null -+++ b/libavcodec/rpi_hevc_sei.h -@@ -0,0 +1,135 @@ -+/* -+ * HEVC Supplementary Enhancement Information messages -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_RPI_HEVC_SEI_H -+#define AVCODEC_RPI_HEVC_SEI_H -+ -+#include -+ -+#include "libavutil/md5.h" -+ -+#include "get_bits.h" -+ -+/** -+ * SEI message types -+ */ -+typedef enum { -+ HEVC_SEI_TYPE_BUFFERING_PERIOD = 0, -+ HEVC_SEI_TYPE_PICTURE_TIMING = 1, -+ HEVC_SEI_TYPE_PAN_SCAN_RECT = 2, -+ HEVC_SEI_TYPE_FILLER_PAYLOAD = 3, -+ HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35 = 4, -+ HEVC_SEI_TYPE_USER_DATA_UNREGISTERED = 5, -+ HEVC_SEI_TYPE_RECOVERY_POINT = 6, -+ HEVC_SEI_TYPE_SCENE_INFO = 9, -+ HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT = 15, -+ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16, -+ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END = 17, -+ HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS = 19, -+ HEVC_SEI_TYPE_POST_FILTER_HINT = 22, -+ HEVC_SEI_TYPE_TONE_MAPPING_INFO = 23, -+ HEVC_SEI_TYPE_FRAME_PACKING = 45, -+ HEVC_SEI_TYPE_DISPLAY_ORIENTATION = 47, -+ HEVC_SEI_TYPE_SOP_DESCRIPTION = 128, -+ HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS = 129, -+ HEVC_SEI_TYPE_DECODING_UNIT_INFO = 130, -+ HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX = 131, -+ HEVC_SEI_TYPE_DECODED_PICTURE_HASH = 132, -+ HEVC_SEI_TYPE_SCALABLE_NESTING = 133, -+ HEVC_SEI_TYPE_REGION_REFRESH_INFO = 134, -+ HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO = 137, -+ HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO = 144, -+ HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147, -+} HEVC_SEI_Type; -+ -+typedef struct HEVCSEIPictureHash { -+ uint8_t md5[3][16]; -+ uint8_t is_md5; -+} HEVCSEIPictureHash; -+ -+typedef struct HEVCSEIFramePacking { -+ int present; -+ int arrangement_type; -+ int content_interpretation_type; -+ int quincunx_subsampling; -+ int current_frame_is_frame0_flag; -+} HEVCSEIFramePacking; -+ -+typedef struct HEVCSEIDisplayOrientation { -+ int present; -+ int 
anticlockwise_rotation; -+ int hflip, vflip; -+} HEVCSEIDisplayOrientation; -+ -+typedef struct HEVCSEIPictureTiming { -+ int picture_struct; -+} HEVCSEIPictureTiming; -+ -+typedef struct HEVCSEIA53Caption { -+ int a53_caption_size; -+ uint8_t *a53_caption; -+} HEVCSEIA53Caption; -+ -+typedef struct HEVCSEIMasteringDisplay { -+ int present; -+ uint16_t display_primaries[3][2]; -+ uint16_t white_point[2]; -+ uint32_t max_luminance; -+ uint32_t min_luminance; -+} HEVCSEIMasteringDisplay; -+ -+typedef struct HEVCSEIContentLight { -+ int present; -+ uint16_t max_content_light_level; -+ uint16_t max_pic_average_light_level; -+} HEVCSEIContentLight; -+ -+typedef struct HEVCSEIAlternativeTransfer { -+ int present; -+ int preferred_transfer_characteristics; -+} HEVCSEIAlternativeTransfer; -+ -+typedef struct HEVCSEIContext { -+ HEVCSEIPictureHash picture_hash; -+ HEVCSEIFramePacking frame_packing; -+ HEVCSEIDisplayOrientation display_orientation; -+ HEVCSEIPictureTiming picture_timing; -+ HEVCSEIA53Caption a53_caption; -+ HEVCSEIMasteringDisplay mastering_display; -+ HEVCSEIContentLight content_light; -+ int active_seq_parameter_set_id; -+ HEVCSEIAlternativeTransfer alternative_transfer; -+} HEVCSEIContext; -+ -+struct HEVCRpiParamSets; -+ -+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, -+ const struct HEVCRpiParamSets *ps, int type); -+ -+/** -+ * Reset SEI values that are stored on the Context. -+ * e.g. Caption data that was extracted during NAL -+ * parsing. -+ * -+ * @param s HEVCRpiContext. 
-+ */ -+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s); -+ -+#endif /* AVCODEC_RPI_HEVC_SEI_H */ -diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c -new file mode 100644 -index 0000000000..23b49a99ae ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader.c -@@ -0,0 +1,1537 @@ -+#include "rpi_hevc_shader.h" -+ -+#ifdef _MSC_VER -+ #include -+ /* cast through uintptr_t to avoid warnings */ -+ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X)) -+#else -+ #define POINTER_TO_UINT(X) ((unsigned int)(X)) -+#endif -+ -+#ifdef __cplusplus -+extern "C" { /* the types are probably wrong... */ -+#endif -+#ifdef __cplusplus -+} -+#endif -+ -+#ifdef _MSC_VER -+__declspec(align(8)) -+#elif defined(__GNUC__) -+__attribute__((aligned(8))) -+#endif -+unsigned int ff_hevc_rpi_shader[] = { -+// ::mc_setup_c_q0 -+// ::mc_start -+/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_setup_c_qn -+/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif -+/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif -+/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 -+/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift -+/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 -+/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 -+/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask -+/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif -+/* [0x00000068] */ 
0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch -+/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num -+/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 -+/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num -+/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift -+/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x -+/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a -+/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif -+/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 -+/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000110] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000118] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift -+/* [0x00000148] */ 
0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a -+/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif -+/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD -+/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 -+/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y -+// :1 -+/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 -+/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 -+// ::mc_filter_c_p -+/* [0x00000218] */ 0x9581cff6, 
0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3 -+/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* [0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif -+/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val -+/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, 
ra_kmul_add -+/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 -+/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif -+// :1 -+/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 -+/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next -+/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next -+/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 -+/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch -+/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask -+/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 -+/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 -+/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 -+/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b -+/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 
; mov rb6, ra7 -+/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a -+/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height -+/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b -+/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_c_p_l1 -+/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00000428] */ 0x8c8021f6, 
0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3 -+/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif -+/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val -+/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add -+/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 -+/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif -+// :1 -+/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; 
mov rb4, ra5 ; ldtmu1 -+/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next -+/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x000004e8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next -+/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 -+/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch -+/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax -+/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 -+/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 -+/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 -+/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b -+/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 -+/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a -+/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x00000580] */ 
0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height -+/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b -+/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_c_b -+/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 -+/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+/* [0x00000628] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif -+/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+/* [0x00000638] */ 
0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif -+/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif -+/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height -+/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 -+/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif -+/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif -+/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a -+/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b -+/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif -+/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif -+/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif -+/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y -+/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add -+/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif -+/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4 -+/* [0x000006e8] */ 
0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif -+/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val -+/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0 -+/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1 -+/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1 -+/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d -+/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif -+/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d -+// :1 -+/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0 -+/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next -+/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next -+/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y -+/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 -+/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 -+/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask -+/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0 -+/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1 -+/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000790] */ 
0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1 -+/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6 -+/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 -+/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 -+/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax -+/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0 -+/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1 -+/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a -+/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b -+/* 
[0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b -+/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4 -+/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7 -+/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11 -+/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0 -+/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6 -+/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 -+/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add -+/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height -+/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7 -+/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b -+/* [0x000008f8] */ 
0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_sync_q0 -+/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q1 -+/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q2 -+/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q3 -+/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000009e0] 
*/ 0x009e7000, 0x100009e7, // nop -+// ::mc_sync_q4 -+/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q5 -+/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q6 -+/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q7 -+/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_sync_q8 -+/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* 
[0x00000ad0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q9 -+/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q10 -+/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q11 -+/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_exit_c_qn -+// ::mc_exit_y_qn -+/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 -+// :1 -+/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 -+/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 -+/* [0x00000bb8] */ 
0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_exit_c_q0 -+// ::mc_exit_y_q0 -+/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 -+// :1 -+/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 -+/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 -+/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 -+/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_setup_y_q0 -+/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_setup_y_qn -+/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif -+/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif -+/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif -+/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 -+/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask -+/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+/* [0x00000c88] */ 0x050b0a00, 
0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 -+/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 -+/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 -+/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif -+/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1 -+/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 -+/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif -+/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch -+/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 -+/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch -+/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 -+/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* 
[0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0 -+/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a -+/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a -+// :1 -+/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 -+/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000df0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 -+/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 -+/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 
-+/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 -+// :per_block_setup_8 -+/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif -+/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a -+/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif -+/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 -+/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a -+/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif -+/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif -+/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init -+/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul -+/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 -+/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height -+/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) -+/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add -+/* 
[0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val -+/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif -+/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif -+/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 -+/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 -+/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d -+/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c -+/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d -+/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c -+/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif -+/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 -+/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 -+/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif -+/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 -+/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d -+/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c -+/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d -+/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c -+/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d -+/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 -+/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000fc0] */ 0x3a281100, 
0xe0020867, // mov r1,0x3a281100 -+/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif -+/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 -+// ::mc_filter_y_pxx -+/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 -+/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 -+/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+// :1 -+/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 -+/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 -+/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 -+/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 -+/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 -+/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 -+/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* 
[0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00001108] */ 
0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 -+/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 -+/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height -+/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next -+/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next -+/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b -+/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add 
ra_dma0, ra_dma0, r1 -+/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y_bxx -+/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 -+/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 -+/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 -+/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+// :1 -+/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 -+/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 -+/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 -+/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 -+/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 -+/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 -+/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* 
[0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00001308] */ 
0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 -+/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 -+/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 -+/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off -+/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 -+/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add -+/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next -+/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 -+/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height -+/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch -+/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 -+/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, 
r:1b -+/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y_p00 -+/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num -+/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 -+/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif -+/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a -+/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif -+/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif -+/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif -+/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init -+/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift -+/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height -+/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 -+/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 -+/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif -+/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, 
r0, rb_dma0_base -+// :1 -+/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 -+/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 -+/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask -+/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 -+/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 -+/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b -+/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y_b00 -+/* [0x00001538] 
*/ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 -+/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1 -+/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 -+/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 -+/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 -+// :1 -+/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 -+/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next -+/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax -+/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 -+/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 -+/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 -+/* [0x000015e8] */ 0x915c83f6, 
0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 -+/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b -+/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_setup_c10_q0 -+/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_setup_c10_qn -+/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif -+/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif -+/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 -+/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift -+/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 -+/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 -+/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask -+/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << 
v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif -+/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch -+/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num -+/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 -+/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num -+/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0 -+/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift -+/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x -+/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a -+/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 -+/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif -+/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 -+/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 -+/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 -+/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 -+/* [0x00001770] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, 
h16p(0, 0)) -+/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) -+/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 -+/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift -+/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a -+/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif -+/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD -+/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 -+/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y -+// :1 -+/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 -+/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00001850] 
*/ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 -+// ::mc_filter_c10_p -+/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif -+/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val -+/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, 
i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add -+/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 -+/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif -+// :1 -+/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 -+/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next -+/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next -+/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 -+/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch -+/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask -+/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 -+/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 -+/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 -+/* [0x000019a0] */ 0x550c6ffe, 
0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b -+/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 -+/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a -+/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height -+/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b -+/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_c10_p_l1 -+/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* 
[0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif -+/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val -+/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add -+/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 -+/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif -+// :1 -+/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; 
ldtmu1 -+/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next -+/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next -+/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 -+/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch -+/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax -+/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 -+/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 -+/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 -+/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b -+/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 -+/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a -+/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, 
// asr r1, r1, 6 ; mov r3, ra_blk_height -+/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b -+/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_c10_b -+/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 -+/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif -+/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // 
min r0, r0, rb_max_x ; mov ra0, unif -+/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif -+/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height -+/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 -+/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif -+/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif -+/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a -+/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b -+/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif -+/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif -+/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif -+/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y -+/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add -+/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif -+/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4 -+/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif -+/* [0x00001d18] */ 
0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val -+/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0 -+/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1 -+/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1 -+/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d -+/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif -+/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d -+// :1 -+/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0 -+/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next -+/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next -+/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y -+/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 -+/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 -+/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask -+/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0 -+/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1 -+/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 
4 @ "mul_used", 0 -+/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1 -+/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6 -+/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 -+/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 -+/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax -+/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0 -+/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1 -+/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a -+/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b -+/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b -+/* 
[0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4 -+/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7 -+/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11 -+/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0 -+/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6 -+/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 -+/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add -+/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height -+/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7 -+/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b -+/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add 
ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_sync10_q0 -+/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) -+/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q1 -+/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q2 -+/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q3 -+/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_sync10_q4 -+/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, 
vw_wait -+/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q5 -+/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q6 -+/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002088] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q7 -+/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_sync10_q8 -+/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov 
dst, sacq(i) -+/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q9 -+/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q10 -+/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q11 -+/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_exit_c10_q0 -+// ::mc_exit_y10_q0 -+/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 -+// :1 -+/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 -+/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 -+/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov dst, 
sacq(i) -+/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 -+/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_exit_c10_qn -+// ::mc_exit_y10_qn -+/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 -+// :1 -+/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 -+/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 -+/* [0x00002228] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_setup_y10_q0 -+/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_setup_y10_qn -+/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif -+/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif -+/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif -+/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 -+/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask -+/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 -+/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 -+/* [0x000022b8] */ 0x0a0b0500, 
0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 -+/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif -+/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 -+/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift -+/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 -+/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif -+/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch -+/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 -+/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch -+/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 -+/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* 
[0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0 -+/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a -+/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a -+// :1 -+/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 -+/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 -+/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 -+/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 -+/* [0x00002428] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) -+/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) -+/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 -+/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 -+/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 -+/* 
[0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 -+/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 -+// :per_block_setup_10 -+/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif -+/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a -+/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif -+/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 -+/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a -+/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif -+/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif -+/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init -+/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002518] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul -+/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 -+/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; 
v8min r0, r0, ra_blk_height -+/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) -+/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add -+/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val -+/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif -+/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif -+/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 -+/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 -+/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d -+/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c -+/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d -+/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c -+/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif -+/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 -+/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 -+/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif -+/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 -+/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d -+/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c -+/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d -+/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c -+/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror 
ra3.8c, r1, ra8.8d -+/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 -+/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif -+/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 -+// ::mc_filter_y10_pxx -+/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 -+/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 -+/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+// :1 -+/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+/* [0x00002660] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 -+/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 -+/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 -+/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 -+/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 -+/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+/* 
[0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 -+/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+/* 
[0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 -+/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 -+/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height -+/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next -+/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next -+/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, 
i_shift23 -+/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b -+/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y10_p00 -+/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num -+/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 -+/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif -+/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a -+/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif -+/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif -+/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif -+/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init -+/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift -+/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height -+/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 -+/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 -+/* [0x000028b0] */ 
0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif -+/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base -+// :1 -+/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 -+/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 -+/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask -+/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 -+/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 -+/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b -+/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* 
[0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y10_bxx -+/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 -+/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 -+/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 -+/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+// :1 -+/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 -+/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 -+/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 -+/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 -+/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 -+/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 -+/* [0x00002a10] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00002a18] */ 0x5501fff0, 
0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, 
r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 -+/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 -+/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 -+/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off -+/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 -+/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add -+/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next -+/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 -+/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height -+/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch -+/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 -+/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b -+/* [0x00002b70] */ 
0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y10_b00 -+/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 -+/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1 -+/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 -+/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 -+/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 -+// :1 -+/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 -+/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next -+/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax -+/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, 
r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 -+/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 -+/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 -+/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 -+/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b -+/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_end -+}; -+#ifdef __HIGHC__ -+#pragma Align_to(8, ff_hevc_rpi_shader) -+#endif -diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h -new file mode 100644 -index 0000000000..79651c9b6c ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader.h -@@ -0,0 +1,63 @@ -+#ifndef rpi_hevc_shader_H -+#define rpi_hevc_shader_H -+ -+extern unsigned int ff_hevc_rpi_shader[]; -+ -+#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0) -+#define mc_start (ff_hevc_rpi_shader + 0) -+#define mc_setup_c_qn (ff_hevc_rpi_shader + 2) -+#define 
mc_filter_c_p (ff_hevc_rpi_shader + 134) -+#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260) -+#define mc_filter_c_b (ff_hevc_rpi_shader + 386) -+#define mc_sync_q0 (ff_hevc_rpi_shader + 580) -+#define mc_sync_q1 (ff_hevc_rpi_shader + 598) -+#define mc_sync_q2 (ff_hevc_rpi_shader + 610) -+#define mc_sync_q3 (ff_hevc_rpi_shader + 622) -+#define mc_sync_q4 (ff_hevc_rpi_shader + 634) -+#define mc_sync_q5 (ff_hevc_rpi_shader + 652) -+#define mc_sync_q6 (ff_hevc_rpi_shader + 664) -+#define mc_sync_q7 (ff_hevc_rpi_shader + 676) -+#define mc_sync_q8 (ff_hevc_rpi_shader + 688) -+#define mc_sync_q9 (ff_hevc_rpi_shader + 706) -+#define mc_sync_q10 (ff_hevc_rpi_shader + 718) -+#define mc_sync_q11 (ff_hevc_rpi_shader + 730) -+#define mc_exit_c_qn (ff_hevc_rpi_shader + 742) -+#define mc_exit_y_qn (ff_hevc_rpi_shader + 742) -+#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760) -+#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760) -+#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780) -+#define mc_setup_y_qn (ff_hevc_rpi_shader + 782) -+#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014) -+#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140) -+#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272) -+#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358) -+#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432) -+#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434) -+#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562) -+#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684) -+#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806) -+#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996) -+#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014) -+#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026) -+#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038) -+#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050) -+#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068) -+#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080) -+#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092) -+#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104) -+#define mc_sync10_q9 
(ff_hevc_rpi_shader + 2122) -+#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134) -+#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146) -+#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158) -+#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158) -+#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178) -+#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178) -+#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196) -+#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198) -+#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440) -+#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2566) -+#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654) -+#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786) -+#define mc_end (ff_hevc_rpi_shader + 2860) -+ -+#endif -diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm -new file mode 100644 -index 0000000000..af5b59e181 ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader.qasm -@@ -0,0 +1,1850 @@ -+# Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+# All rights reserved. -+# -+# Redistribution and use in source and binary forms, with or without -+# modification, are permitted provided that the following conditions are met: -+# * Redistributions of source code must retain the above copyright -+# notice, this list of conditions and the following disclaimer. -+# * Redistributions in binary form must reproduce the above copyright -+# notice, this list of conditions and the following disclaimer in the -+# documentation and/or other materials provided with the distribution. -+# * Neither the name of the copyright holder nor the -+# names of its contributors may be used to endorse or promote products -+# derived from this software without specific prior written permission. -+# -+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+# -+# Written by Peter de Rivaz, John Cox -+ -+ -+ -+# Inter pred asm -+# -+# Logic here should be good to 14 bits without modification -+# but only 8 & 10 are currently instantiated & tested -+# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow -+# in _p00 & _b00 -+ -+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress -+# the warning that we are using rotation & ra/rb registers. r0..3 can be -+# rotated through all 16 elems ra regs can only be rotated through their -+# local 4. As it happens this is what is wanted here as we do not want the -+# constants from the other half of the calc. -+ -+# Number limits in P/B calculation -+# -+# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier -+# we offset our intermediates s.t. they always end up +ve before the next -+# multiply (may be -ve whilst summing but that doesn't matter). 
-+# -+# Range calc for up to 14 bits (Y-B pred): -+# -+# denom: [0, 7] -+# bmax = (1 << bits) - 1 -+# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1] -+# -+# wt_mul: [-128, 255] -+# wt_off = off * 2 + 1: [-bmax, bmax] -+# -+# pel: [0, bmax] -+# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff] -+# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e] -+# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6] -+# mul_t - (V_l0 + V_l1)* 128: [-0xc28e00, 0x18396e4] -+# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2): -+# [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000] -+# -+# This all looks good and is mostly bit depth independant - and as we manage -+# to do unsigned multiplies everywhere (now) this should be good for any bit -+# depth up to 14 (we could probably do 16 - but that requires a few tweaks -+# to the shifts we don't currently have logic for) -+ -+# PREREAD is the number of requests that we have sitting in the TMU request -+# queue. -+# -+# There are 8 slots availible in the TMU request Q for tm0s requests, but -+# only 4 output FIFO entries and overflow is bad (corruption or crash) -+# (If threaded then only 2 out FIFO entries, but we aren't.) -+# In s/w we are effectively limited to the min vertical read which is >= 4 -+# so output FIFO is the limit. 
-+# -+# As the test for read-next is is the main part of the Luma loop (rather than -+# the preload FIFO part) we are limited to min_luma_height - 1 -+# Min_luma_height is 4 so we can only have a preload of 3 -+# Beware that min_chroma_height (and_width) is 2 so we can't do the same trick -+# in chroma without abandoning preload pretty much entirely (which would be bad) -+# -+# Timing tests vs preload of 4 suggests this doesn't hurt us much -+# Could have preread 4 for Chroma but when tested it didn't help -+ -+.set PREREAD, 3 -+ -+# Offset added (effectively) at the exit of the H FIR filter -+# This is enough to force the result +ve -+# Is good if it is a power of 2 as that allows for >> without loss -+# -+# Worst case for a single Y FIR is *-22 so we need an offset of 256*22 -+# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00 -+# Round up to next power of 2 -+ -+.set FIR_OFFSET, 0x4000 -+ -+# Block heights - 8 & 16 are the only numbers we currently support -+ -+.set C_BLK_HEIGHT_8, 16 -+.set C_BLK_HEIGHT_16, 8 -+.set Y_BLK_HEIGHT_8, 16 -+.set Y_BLK_HEIGHT_16, 8 -+ -+# QPU counts - depend on block size -+# If we have a 2-byte format & block_size > 8 then can only afford -+# 8 QPUs -+# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h -+ -+.set N_QPU_8, 12 -+.set N_QPU_16, 12 -+ -+# Value to add to the weight multiplier to convert it into an unsigned value -+# Should be power of two for convienience -+ -+.set LOG2_MUL_ADD, 14 -+.set MUL_ADD, (1 << LOG2_MUL_ADD) -+ -+# Fixed denom (max that it can be set to) -+.set DENOM, 7 -+ -+# register allocation -+# -+ -+# ra0-3 -+# Used as temp and may be loop filter coeffs (split into .8s) -+# or temp in loop. Check usage on an individual basis. 
-+ -+# ra4-11 -+# V FIFO / temp / free -+ -+# -- free -- ra12 -+ -+# -- free -- ra13 -+ -+# -- free -- ra14 -+ -+# -- free -- ra15 -+ -+# uniform: width:height -+.set ra_width_height, ra16 -+.set ra_width, ra16.16b -+.set ra_height, ra16.16a -+ -+# y:y2 same layout as y_y2_next so we can update both together -+.set ra_y_y2, ra17 -+.set ra_y2, ra17.16a -+.set ra_y, ra17.16b -+ -+# uniform: L1 weight (U on left, V on right) -+# Only used in Y B -+.set ra_wt_off_mul_l1, ra18 -+.set ra_wt_off_l1, ra18.16b -+.set ra_wt_mul_l1, ra18.16a -+ -+# y_next:y2_next same layout as y_y2 so we can update both together -+.set ra_y_y2_next, ra19 -+.set ra_y_next, ra19.16b -+.set ra_y2_next, ra19.16a -+ -+# Setup: consts - subdivide a single register -+.set ra_kff800100, ra20 -+.set ra_k256, ra20.16a -+.set ra_k0, ra20.8a -+.set ra_k1, ra20.8b -+.set ra_k128, ra20.8c -+.set ra_k255, ra20.8d -+ -+# Loop: xshifts -+.set ra_xshift, ra21.16a -+.set ra_xshift_next, ra21.16b -+ -+# Loop var: L0 weight (U on left, V on right) -+# _off_ is not used in loop as we want to modify it before use -+.set ra_wt_off_mul_l0, ra22 -+.set ra_wt_mul_l0, ra22.16a -+.set ra_wt_off_l0, ra22.16b -+ -+# Max pel value (for 8 bit we can get away with sat ops but not 9+) -+# * Could merge with rb_pmask. 
For 10 bit Logically pmask needs 0xff in the -+# 2nd byte but as the source should never be > 3 there 0x3ff should do -+.set ra_blk_height_pmax, ra23 -+.set ra_pmax, ra23.16a -+.set ra_blk_height, ra23.8c -+# --free -- ra23.8d -+ -+# Loop: src frame base (L0) -+.set ra_base, ra24 -+ -+# Misc offsets -+.set ra_fir_off_val_wt_den_p7, ra25 -+.set ra_wt_den_p7, ra25.8a -+# -- free -- ra25.8b -+.set ra_fir_off_val, ra25.16b -+ -+# As it happens these constants are the same -+.if FIR_OFFSET == MUL_ADD -+# Weight multiplier unsigned add -+.set ra_kmul_add, ra_fir_off_val -+.else -+.error "FIR_OFFSET != MUL_ADD: Need new register & init" -+.endif -+ -+# Loop: next src frame base (L0) -+.set ra_base_next, ra26 -+ -+# Loop: height<<23 + width<<16 + vdw_setup_0 -+.set ra_dma0, ra27 -+ -+# Loop: destination address -+.set ra_dest, ra28 -+ -+# Setup: Dup of rb_ef -+# Lo bits are used as Y coeff 0 as that lefts us combine test & coeff mul -+# (top bits are ignored by mul24) -+.set ra_ef, ra29 -+ -+# Use an even numbered register as a link register to avoid corrupting flags -+.set ra_link, ra30 -+ -+# -- free -- ra31 -+ -+.set rb_xshift2, rb0 -+.set rb_xshift2_next, rb1 -+ -+# C: (elem & 1) == 0 ? 
elem * 2 : (elem + 4) * 2 -+.set rb_elem_x, rb2 -+ -+# El Flags -+# After adding to self we to have el even/odd on nc/c and lo/hi on nn/n -+# Duped into ra_ef as sometimes that is easier to use -+.set rb_ef, rb3 -+ -+# rb4-11 -+# Loop: V filter FIFO or V filter coeff -+ -+# Loop var: offset to add before shift (round + weighting offsets) -+# Exact value varies by loop -+.set rb_wt_off, rb12 -+ -+# -- free -- rb13 -+ -+# -- free -- rb14 -+ -+# Loop: src frame base (L1) -+.set rb_base2, rb15 -+ -+# Line pitch (128 for sand128) -+.set rb_pitch, rb16 -+ -+# Loop count - 2 (set up TMU for next xfer) -+.set rb_i_tmu, rb17 -+ -+# Loop count for min(height, 16) -+# Y will reset & loop again if height > 16 -+.set rb_lcount, rb18 -+ -+# frame_base2_next -+.set rb_base2_next, rb19 -+ -+# Setup: Height of Y+C in sand, (x&mask)*xpitch will give -+# offset to the slice -+.set rb_xpitch, rb20 -+ -+# These 3 consts each save 1 instruction in Y loop setup -+# so whilst they are worthwhile they should be the 1st to die if we need -+# another b reg -+.set rb_y_coeffs_2, rb21 # 0x050b0a00 -+.set rb_y_coeffs_3, rb22 # 0x11283a40 -+.set rb_y_coeffs_5, rb23 # 0x0a0b0500 -+ -+# Setup: 0xff (8-bit) / 0xffff (9+ bit) -+.set rb_pmask, rb24 -+ -+# vdw_setup_1(dst_pitch) -+.set rb_dma1_base, rb25 -+ -+# Setup: pic width - 1 -+# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc. -+.set rb_max_x, rb26 -+ -+# vdw_setup_0 (depends on QPU number) -+.set rb_dma0_base, rb27 -+ -+# Setup: vw_setup value to reset VPM write pointer -+.set rb_vpm_init, rb28 -+ -+# Loop: vdw_setup_1(dst_pitch-width) = stride -+.set rb_dma1, rb29 -+ -+# Setup: pic_height - 1 -+.set rb_max_y, rb30 -+ -+# Setup: FIR H offset -+.set rb_fir_off_h, rb31 -+ -+ -+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. 
-+.set i_shift16, -16 -+.set i_shift21, -11 -+.set i_shift23, -9 -+.set i_shift30, -2 -+ -+# Much of the setup code is common between Y & C -+# Macros that express this - obviously these can't be overlapped -+# so are probably unsuitable for loop code -+ -+.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma -+ mov r2, qpu_num -+.if v_bit_depth <= 8 -+ # 8 bit version -+ asr r1, r2, 2 -+ shl r1, r1, 6 -+ and r0, r2, 3 -+ or r0, r0, r1 -+ -+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit -+ add r_vpm, r0, r1 # VPM 8bit storage -+ -+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later -+ shl r0, r0, 5 -+ -+.else -+ # 16 bit version -+ # Limited to 8 QPUs if blk height > 8 -+ asr r1, r2, 1 -+.if v_blk_height <= 8 -+ shl r1, r1, 4 -+.else -+ shl r1, r1, 5 -+.endif -+ and r0, r2, 1 -+ or r0, r0, r1 -+ -+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR -+ add r_vpm, r0, r1 -+ -+ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into -+ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) -+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later -+ shl r0, r0, 6 -+.endif -+ add r_dma, r0, r1 # DMA out -+.endm -+ -+ -+.macro m_setup_q0 -+ srel -, 12 -+.endm -+ -+# Code start label -+::mc_start -+ -+################################################################################ -+# mc_setup_c -+# -+# typedef struct qpu_mc_pred_c_s_s { -+# int16_t y; -+# int16_t x; -+# uint32_t base; -+# uint32_t pic_cw; // C Width (== Y width / 2) -+# uint32_t pic_ch; // C Height (== Y Height / 2) -+# uint32_t stride2; -+# uint32_t stride1; -+# uint32_t wdenom; -+# int16_t y2; -+# int16_t x2; -+# uint32_t base2; -+# uint32_t next_fn; -+# } qpu_mc_pred_c_s_t; -+ -+.macro m_setup_c, v_bit_depth -+ -+# Cannot use mul24 on x as x might be -ve, so must use shift -+.if v_bit_depth <= 8 -+.set v_x_shift, 1 -+.set v_pmask, 0xff -+.set 
v_blk_height, C_BLK_HEIGHT_8 -+.else -+.set v_x_shift, 2 -+.set v_pmask, 0xffff -+.set v_blk_height, C_BLK_HEIGHT_16 -+.endif -+ -+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y -+ -+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+ shl rb_ef, r0, i_shift30 ; mov ra_base, unif # ; ref_c_base -+ -+# Read image dimensions -+ sub r0, unif, 1 # pic c width -+ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes -+ sub rb_max_y, unif, 1 # pic c height -+ -+# load constants -+ mov ra_kff800100, 0xff800100 -+ mov rb_pmask, v_pmask -+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+ -+# get source pitch -+ mov ra_ef, rb_ef ; mov rb_xpitch, unif # ; stride2 -+ mov rb_pitch, unif # stride1 -+ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly -+ add rb_dma1_base, r1, rb_pitch # vdw_setup_1 -+ -+ and r0, 1, elem_num -+ nop ; mul24 r0, r0, 5 -+.if v_bit_depth <= 8 -+ add rb_elem_x, r0, elem_num -+.else -+ add r0, r0, elem_num -+ add rb_elem_x, r0, r0 -+.endif -+ -+# Compute base address for first and second access -+# ra_base ends up with t0s base -+# ra_base2 ends up with t1s base -+ -+ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] -+ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice -+ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y -+ min r0, r0, rb_max_x -+ -+# Get shift -+# Shift will always calculate as 0 for 9+ bit -+# Ideally we can optimize the shift out of the code in these cases but for now -+# it is tidier to leave it in -+.if v_bit_depth <= 8 -+ shl ra_xshift_next, r0, 3 -+.else -+ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 -+.endif -+ -+# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to -+ -+.if v_bit_depth <= 8 -+ and r0, r0, -4 -+.endif -+ sub r1, ra_k0, rb_pitch -+ and r1, r0, r1 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add 
r0, r0, r1 ; mov ra0, unif # ; next_x2_y2 -+ add ra_base, ra_base, r0 -+ -+# Compute part of VPM to use for DMA output -+# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? -+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base -+ -+# And again for L1, but only worrying about frame2 stuff -+ -+# Compute base address for first and second access -+# ra_base ends up with t0s base -+# rb_base2 ends up with t1s base -+ -+ shl r0, ra0.16b, v_x_shift -+ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset -+ max r0, r0, 0 ; mov rb_base2, unif # ref_c_base2 -+ min r0, r0, rb_max_x -+ -+# Get shift (already zero if 9+ bit so ignore) -+.if v_bit_depth <= 8 -+ shl rb_xshift2_next, r0, 3 -+.endif -+ -+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs -+ -+.if v_bit_depth <= 8 -+ and r0, r0, -4 -+.endif -+ sub r1, ra_k0, rb_pitch -+ and r1, r0, r1 ; mov r3, PREREAD -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r2, ra_y2 -+ add rb_base2, rb_base2, r0 ; mov r0, ra_y -+ -+# Do preloads -+# r0 = ra_y, r2 = ra_y2, r3 = PREREAD -+ -+:1 -+ sub.setf r3, r3, 1 -+ max r1, r0, 0 -+ min r1, r1, rb_max_y -+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t0s, ra_base, r1 ; mov ra_y, r0 -+ -+ max r1, r2, 0 -+ brr.anynz -, r:1b -+ min r1, r1, rb_max_y -+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t1s, rb_base2, r1 ; mov ra_y2, r2 -+# >>> .anynz 1b -+ -+ mov ra_link, unif # link -+# touch registers to keep simulator happy (and fills in delay slots) -+ mov ra4, 0 ; mov rb4, 0 -+ bra -, ra_link -+ mov ra5, 0 ; mov rb5, 0 -+ mov ra6, 0 ; mov rb6, 0 -+ mov ra7, 0 ; mov rb7, 0 -+# >>> ra_link -+.endm -+ -+::mc_setup_c_q0 -+ m_setup_q0 -+::mc_setup_c_qn -+ m_setup_c 8 -+ -+################################################################################ -+# -+# mc_filter_c_p -+# -+# typedef struct qpu_mc_pred_c_p_s { -+# int16_t y; -+# int16_t x; -+# uint32_t base; -+# uint16_t h; -+# uint16_t w; -+# 
uint32_t coeffs_x; -+# uint32_t coeffs_y; -+# uint32_t wo_u; -+# uint32_t wo_v; -+# uint32_t dst_addr_c; -+# uint32_t next_fn; -+# } qpu_mc_pred_c_p_t; -+ -+.macro m_filter_c_p, v_tmu, v_bit_depth -+ -+.if v_bit_depth <= 8 -+.set v_x_shift, 1 -+.set v_x_mul, 2 -+.set v_v_shift, 8 -+# Shifts to get width & height in the right place in rb_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 2 -+.set v_x_mul, 4 -+.set v_v_shift, i_shift16 -+# Shifts to get width & height in the right place in rb_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif -+ -+.if v_tmu == 0 -+.set vrx_xshift, rb_xshift2 # b side more convienient -+.set vrx_xshift_next, ra_xshift_next -+.set vra_y_next, ra_y_next -+.set vrx_base_next, ra_base_next -+.set vra_y, ra_y -+.set vra_base, ra_base -+.set vr_txs, t0s -+.else -+.set vrx_xshift, ra_xshift # a side more convienient -+.set vrx_xshift_next, rb_xshift2_next -+.set vra_y_next, ra_y2_next -+.set vrx_base_next, rb_base2_next -+.set vra_y, ra_y2 -+.set vra_base, rb_base2 -+.set vr_txs, t1s -+.endif -+ -+# denom shift values -+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) -+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) -+ -+# per-channel shifts were calculated on the *previous* invocation -+# get base addresses and per-channel shifts for *next* invocation -+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y -+ -+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base -+ -+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 -+ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height -+ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs -+ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+ -+.if v_bit_depth <= 8 -+ shl vrx_xshift_next, r0, 3 -+ and r0, r0, -4 -+.endif -+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced! 
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs -+ add vrx_base_next, r3, r0 ; mov r1, ra_height -+ -+# set up VPM write -+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight -+ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight -+ -+# Misc final setup... -+ -+ shl r0, r1, v_dma_h_shift ; mov ra_dest, unif # ; dst_addr -+ add r0, r0, r2 ; mov r2, ra_fir_off_val # Combine width and height of destination area (r0=h<<8, r2=w*2) -+ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register -+ add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight -+ shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 -+ sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add -+ add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 # ; loop counter (V FIFO fill = 4) -+ mov rb11, ra3.8d ; mov ra_link, unif # ; Link -+ -+# r5 = -4 (loop counter) -+# ra_wt_mul_l0 = weight L0 + 128 (now unsigned) -+# rb_wt_off = (offset * 2 + 1) << (wt_den + 5) -+# rb31 = FIR value offset -+ -+# FIFO: rb4, ra5, rb6, ra7 -+# Coeffs in ra3.8a, ra3.8b, rb10, rb11 -+ -+# We want (r0r1) -+# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ... -+# We fetch (after shift) -+# C0 : C3 : C1 : C4 : C2 : C5 : ... 
-+ -+:1 -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+.if v_tmu == 0 -+ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 -+ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next -+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next -+.else -+ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 -+ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next -+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next # [r1 << delay] -+.endif -+ -+ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+ min r3, r3, rb_max_y ; mov.ifnc r0, r2 -+ -+ and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch -+.if v_tmu == 0 -+ add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask # ; mask bytes -+.else -+ add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax # ; mask bytes -+.endif -+ -+# apply horizontal filter -+# The filter coeffs for the two halves of this are the same (unlike in the -+# Y case) so it doesn't matter which ra0 we get them from -+# Also as the two halves are locked together we don't need to separate the 1st -+# r0 mul or the last r1 mul as they are valid for all QPUs -+ -+ add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 -+ sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 -+ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+ add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+ -+# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift) -+# We would like to save the r5->r4 shift but we need a delay slot -+# for both r7 & r6 which we can't find anything to put in if we have -+# already multiplied r4 & r5! 
-+ brr.anyn -, r:1b -+ add r2, r2, r3 ; mul24 r0, ra7, rb10 # r6 post -+ mov ra5, rb6 ; mul24 r1, rb6, ra3.8b # r5 post -+ asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 -+# >>> .anyn 1b -+ -+ add r1, r1, r0 ; mul24 r0, rb4, ra3.8a # [ra7 delay] -+ sub r1, r1, r0 ; mul24 r0, ra7, rb11 -+ sub r1, r1, r0 -+ -+ asr r1, r1, 6 ; mov r3, ra_blk_height # ; NxtLoop -+ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop -+ brr.anyn -, r:1b -+ asr r1, r1, i_wt_den_p6 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop -+# >>> .anyn 1b -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_c_p -+ m_filter_c_p 0, 8 -+ -+::mc_filter_c_p_l1 -+ m_filter_c_p 1, 8 -+ -+################################################################################ -+# -+# mc_filter_c_b -+# -+# typedef struct qpu_mc_pred_c_b_s { -+# int16_t y; -+# int16_t x; -+# uint32_t base; -+# uint16_t h; -+# uint16_t w; -+# uint32_t coeffs_x1; -+# uint32_t coeffs_y1; -+# int16_t weight_u1; -+# int16_t weight_v1; -+# int16_t y2; -+# int16_t x2; 
-+# uint32_t base2; -+# uint32_t coeffs_x2; -+# uint32_t coeffs_y2; -+# uint32_t wo_u2; -+# uint32_t wo_v2; -+# uint32_t dst_addr_c; -+# uint32_t next_fn; -+# } qpu_mc_pred_c_b_t; -+ -+.macro m_filter_c_b, v_bit_depth -+ -+.if v_bit_depth <= 8 -+.set v_x_shift, 1 -+.set v_v_shift, 8 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 2 -+.set v_v_shift, i_shift16 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif -+.set v_x_mul, (1 << v_x_shift) -+ -+# denom shift values -+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) -+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) -+ -+# per-channel shifts were calculated on the *previous* invocation -+ -+# get base addresses and per-channel shifts for *next* invocation -+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y -+ -+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base -+ -+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 -+ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height -+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+ min r0, r0, rb_max_x ; mov ra0, unif # ; L0 H filter coeffs -+ -+.if v_bit_depth <= 8 -+ shl ra_xshift_next, r0, 3 -+.endif -+ -+ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs -+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height -+ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B -+ -+# set up VPM write -+ -+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight -+ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 
# ; V weight -+ -+ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 -+ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base -+ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register -+ add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b # r0=x -+ -+# L1 - uniform layout could possibly be optimized -+ -+ shl r0, r0, v_x_shift ; mov ra1, unif # r0=x<>> .anyn 1b -+ -+ sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b # L1 ; L0 -+ sub.setf -, r5, rb_lcount ; mov r0, ra4 -+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+ add r1, r1, r0 ; mul24 r0, ra7, rb7 -+ -+ sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c # L1 -+ add r2, r2, r0 ; mul24 r0, ra11, rb11 # L1 -+ sub r2, r2, r0 -+ -+ shr r1, r1, 6 -+ shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 -+ add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 -+ add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add -+ sub r1, r1, r2 ; mov r3, ra_blk_height # ; NxtLoop -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 # ; NxtLoop -+ -+ brr.anyn -, r:1b -+ asr r1, r1, ra_wt_den_p7 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop -+# >>> .anyn 1b -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write 
pointer -+# >>> 1b -+.endm -+ -+::mc_filter_c_b -+ m_filter_c_b 8 -+ -+################################################################################ -+# Exit code used by both Luma & Chroma so place between them to avoid I-cache -+# conflicts -+ -+.macro m_exit_drain -+.if PREREAD == 2 -+# Special case 2 as loop is wasteful -+ nop ; nop ; ldtmu0 -+ nop ; nop ; ldtmu1 -+ nop ; nop ; ldtmu0 -+ mov -, vw_wait ; nop ; ldtmu1 -+.else -+ mov.setf r3, PREREAD - 1 -+:1 -+ brr.anynz -, r:1b -+ nop ; nop ; ldtmu0 -+ nop ; nop ; ldtmu1 -+ sub.setf r3, r3, 1 -+ # >>> -+ mov -, vw_wait -+.endif -+.endm -+ -+# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) -+# All qpus start at the beginning and after that (group - 1) must have finished -+# before (group) can start -+# -+# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain -+# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - -+# lockup otherwise) -+# -+# There is some, currently ill defined, potential lockup if we have the VDM active -+# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ?? -+# -+# The code stalled when I had many waiters on a single sem so we have a -+# "ripple" of srels to restart. Unsure why, may have been bug, but this works -+# and we currently have both the memory & sems to support it. 
-+.macro m_sync_q, n_qpu, n_quads -+# Do not generate code for qpu >= quads * 4 - fns should never be called -+.if n_qpu < n_quads * 4 -+ mov ra_link, unif # Can only branch to an a reg (not r0) -+ mov -, vw_wait # [ra_link delay] -+ -+.set n_sem_sync, n_qpu - (n_qpu % 4) -+.set n_sem_in, n_qpu -+.set n_sem_out, n_qpu + 1 -+ -+.if n_qpu % 4 == 0 -+ -+.set n_sem_quad_in, 12 + n_qpu / 4 -+.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) -+ -+ sacq -, n_sem_sync -+ sacq -, n_sem_sync -+ sacq -, n_sem_sync -+ bra -, ra_link -+ sacq -, n_sem_quad_in -+ srel -, n_sem_out -+ srel -, n_sem_quad_out -+ -+.else -+ bra -, ra_link -+ srel -, n_sem_sync -+ sacq -, n_sem_in -+.if n_sem_out % 4 != 0 -+ srel -, n_sem_out -+.else -+ nop -+.endif -+.endif -+.endif -+.endm -+ -+.set v_quads8, N_QPU_8 / 4 -+ -+::mc_sync_q0 -+ m_sync_q 0, v_quads8 -+::mc_sync_q1 -+ m_sync_q 1, v_quads8 -+::mc_sync_q2 -+ m_sync_q 2, v_quads8 -+::mc_sync_q3 -+ m_sync_q 3, v_quads8 -+::mc_sync_q4 -+ m_sync_q 4, v_quads8 -+::mc_sync_q5 -+ m_sync_q 5, v_quads8 -+::mc_sync_q6 -+ m_sync_q 6, v_quads8 -+::mc_sync_q7 -+ m_sync_q 7, v_quads8 -+::mc_sync_q8 -+ m_sync_q 8, v_quads8 -+::mc_sync_q9 -+ m_sync_q 9, v_quads8 -+::mc_sync_q10 -+ m_sync_q 10, v_quads8 -+::mc_sync_q11 -+ m_sync_q 11, v_quads8 -+ -+# mc_exit() -+# Chroma & Luma the same now -+ -+.macro m_exit_qn -+ m_exit_drain -+ nop ; nop ; thrend -+ nop -+ nop -+# >>> thrend <<< -+.endm -+ -+::mc_exit_c_qn -+::mc_exit_y_qn -+ m_exit_qn -+ -+ -+ -+# mc_interrupt_exit12() -+ -+.macro m_exit_q0 -+ m_exit_drain -+ sacq -, 12 -+ nop ; nop ; thrend -+ mov interrupt, 1 -+ nop -+# >>> thrend <<< -+.endm -+ -+::mc_exit_c_q0 -+::mc_exit_y_q0 -+ m_exit_q0 -+ -+# LUMA CODE -+ -+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. 
-+# For P frames we make the second x,y coordinates offset by +8 -+ -+ -+################################################################################ -+# mc_setup -+# -+# typedef struct qpu_mc_pred_y_s_s { -+# qpu_mc_src_t next_src1; -+# qpu_mc_src_t next_src2; -+# uint16_t pic_h; -+# uint16_t pic_w; -+# uint32_t stride2; -+# uint32_t stride1; -+# uint32_t wdenom; -+# uint32_t next_fn; -+# } qpu_mc_pred_y_s_t; -+ -+.macro m_setup_y, v_bit_depth -+ -+# Cannot use mul24 on x as x might be -ve, so must use shift -+.if v_bit_depth <= 8 -+.set v_x_shift, 0 -+.set v_pmask, 0xff -+.set v_blk_height, Y_BLK_HEIGHT_8 -+.else -+.set v_x_shift, 1 -+.set v_pmask, 0xffff -+.set v_blk_height, Y_BLK_HEIGHT_16 -+.endif -+ -+ -+ # Need to save these because we need to know the frame dimensions before computing texture coordinates -+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y -+ mov ra9, unif # ref_y_base -+ mov ra1, unif # x2_y2 -+ -+ -+# load constants -+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+ shl rb_ef, r0, i_shift30 ; mov ra11, unif # ; ref_y2_base -+ -+ mov ra_kff800100, 0xff800100 -+ mov rb_pmask, v_pmask -+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+ mov rb_y_coeffs_2, 0x050b0a00 -+ mov rb_y_coeffs_3, 0x11283a40 -+ mov rb_y_coeffs_5, 0x0a0b0500 -+ -+# Compute part of VPM to use -+ -+# Read image dimensions -+ mov ra3, unif # width_height -+ mov ra_ef, rb_ef ; mov rb_xpitch, unif # [ra3 delay] ; stride2 -+.if v_x_shift == 0 -+ sub rb_max_x, ra3.16b, 1 -+.else -+ sub r0, ra3.16b, 1 -+ shl rb_max_x, r0, v_x_shift -+.endif -+ sub rb_max_y, ra3.16a, 1 -+ mov r3, elem_num ; mov rb_pitch, unif # stride1 -+ -+# get destination pitch -+ mov r1, vdw_setup_1(0) # [rb_pitch delay] -+ or rb_dma1_base, r1, rb_pitch -+ -+# Compute base address for first and second access -+ add r0, ra0.16b, r3 # Load x + 
elem_num -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, 0 -+ min r0, r0, rb_max_x -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ -+# X is byte offset - we can only load words - mask -+ -+ and r0, r0, -4 ; v8subs r2, r2, r2 -+ sub r2, r2, rb_pitch -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add ra_base, ra9, r0 -+ -+ # r3 still contains elem_num -+ add r0, ra1.16b, r3 # Load x -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, 0 -+ min r0, r0, rb_max_x -+ shl rb_xshift2_next, r0, 3 # Compute shifts -+ -+ # r2 still contains mask -+ and r0, r0, -4 -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add rb_base2, ra11, r0 -+ -+# Do preloads -+ nop ; mov r0, ra0.16a # ; r0 = y -+ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 -+ -+:1 -+ sub.setf r3, r3, 1 -+ max r1, r0, 0 -+ min r1, r1, rb_max_y -+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t0s, ra_base, r1 ; mov ra_y, r0 -+ -+ max r1, r2, 0 -+ brr.anynz -, r:1b -+ min r1, r1, rb_max_y -+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t1s, rb_base2, r1 ; mov ra_y2, r2 -+# >>> .anynz 1b -+ -+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base -+ -+ mov ra_link, unif # Next fn -+ -+# touch vertical context to keep simulator happy -+ mov ra8, 0 ; mov rb8, 0 # [ra_link delay] -+ bra -, ra_link -+ mov ra9, 0 ; mov rb9, 0 -+ mov ra10, 0 ; mov rb10, 0 -+ mov ra11, 0 ; mov rb11, 0 -+# >>> ra_link -+.endm -+ -+::mc_setup_y_q0 -+ m_setup_q0 -+::mc_setup_y_qn -+ m_setup_y 8 -+ -+################################################################################ -+# -+# Start of per-block setup code -+# P and B blocks share the same setup code to save on Icache space -+ -+# get base addresses and per-channel shifts for *next* invocation -+# per-channel shifts were calculated on the *previous* invocation -+ -+# 1st 3 instructions of per_block-setup in 
branch delay -+# -+# typedef struct qpu_mc_pred_y_p_s { -+# qpu_mc_src_t next_src1; -+# qpu_mc_src_t next_src2; -+# uint16_t h; -+# uint16_t w; -+# uint32_t mymx21; -+# uint32_t wo1; -+# uint32_t wo2; -+# uint32_t dst_addr; -+# uint32_t next_fn; -+# } qpu_mc_pred_y_p_t; -+# -+ -+.macro m_luma_setup, v_bit_depth -+# Hack - QASM may well have have label pasting but I have no idea how... -+.if v_bit_depth == 8 -+ brr ra_link, r:per_block_setup_8 -+.elif v_bit_depth == 10 -+ brr ra_link, r:per_block_setup_10 -+.endif -+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? -+ add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 -+ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+.endm -+ -+.macro m_per_block_setup, v_bit_depth -+ -+.if v_bit_depth <= 8 -+.set v_x_shift, 0 -+.set v_x_mul, 1 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 1 -+.set v_x_mul, 2 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif -+ -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+ min r0, r0, rb_max_x -+ -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 -+ sub r2, r5, rb_pitch ; mov ra_base_next, unif # ; src1.base -+ and r1, r0, r2 ; mov ra_y_next, ra0.16a -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y -+ add ra_base_next, ra_base_next, r0 # [ra1 delay] -+ -+ add r0, ra1.16b, r3 # Load x2 -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, r5 ; mov ra_y2_next, ra1.16a -+ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base -+ shl rb_xshift2_next, r0, 3 # Compute shifts -+ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height -+ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write -+ xor r0, r0, r1 ; mul24 
r1, r1, rb_xpitch -+ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes -+ add rb_base2_next, rb_base2_next, r0 -+ -+# get width,height of block (unif load above), r1 = width * pel_size -+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) -+ add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height -+ add rb_lcount, r0, (7-8) -+ shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add # ; r3 return val -+ add r0, r0, r1 # Combine width and height of destination area -+ shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val -+ add ra_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets -+ -+# get filter coefficients and discard unused B frame values -+ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight -+ shl ra8, r0, 3 ; mov rb5, ra_k255 -+ -+# Coeffs are all abs values here as that means mul24 works (no sign extend from .8) -+ -+# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val -+# but I can't see a way of doing that that is cheap enough to be worth it -+ -+# Picked out in a slightly random order to space out uniform loads -+ -+ # 1 -+ mov r1, 0x01040400 # [ra8 delay] -+ ror ra2.8b, r1, ra8.8d -+ ror ra0.8b, r1, ra8.8c -+ # 2 -+ ror ra2.8c, rb_y_coeffs_2, ra8.8d -+ ror ra0.8c, rb_y_coeffs_2, ra8.8c -+ # 0 -+ mov r1,0x00010100 # -ve [ra8 delay] -+ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif # ; L1 Wt/Offset -+ ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 -+ # 7 -+ shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000 -+ ror r0, r1, ra8.8d ; mov ra_dest, unif # ; Destination address -+ ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 -+ # 3 -+ ror ra2.8d, rb_y_coeffs_3, ra8.8d -+ ror ra0.8d, rb_y_coeffs_3, ra8.8c -+ # 5 -+ ror ra3.8b, rb_y_coeffs_5, ra8.8d -+ ror ra1.8b, rb_y_coeffs_5, ra8.8c -+ # 6 -+ mov r1,0x04040100 -+ ror ra3.8c, 
r1, ra8.8d -+ ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 # ; r5 return val -+ -+ bra -, ra_link -+ # 4 -+ mov r1,0x3a281100 -+ ror r0, r1, ra8.8d ; mov ra_link, unif # ; link - load after we've used its previous val -+ ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 -+# >>> branch ra_link -+ -+# r5 = -8 -+# r2 = fir_off_val -+# r3 = 128 -+.endm -+ -+:per_block_setup_8 -+ m_per_block_setup 8 -+ -+ -+ -+################################################################################ -+# -+# mc_filter_y_pxx -+# -+# Setup (& therefore uniform struct) shared with _bxx -+# Struct in m_luma_setup -+# -+# We can have 2 separate P reqs here as long as they mate to generate a -+# rectangular output block (i.e. h0 = h1, w0 = 8) -+# -+# At this point we have already issued PREREAD pairs of texture requests for the current block -+ -+.macro m_filter_y_pxx, v_bit_depth -+ -+# denom shift values -+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) -+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) -+ -+ m_luma_setup v_bit_depth -+ -+ shl r1, ra_wt_off_l0, i_wt_den_p5 -+ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul -+ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+ -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+# This loop is identical to the B loop from here ---> -+:1 -+ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+ -+ max r2, ra_y, 0 ; mov r1, 0 -+ min r2, r2, rb_max_y ; mov r3, ra_k1 -+ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+ add t0s, ra_base, r2 ; mov rb5, rb6 -+ shr r0, r4, ra_xshift ; mov rb6, rb7 -+ -+ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes -+ shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+ add t1s, rb_base2, r2 ; mov ra8, ra9 -+ -+# apply horizontal filter -+ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 
0 -+ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ -+ brr.anyn -, r:1b -+ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+ # >>> .anyn 1b (r5 + r5) -+ -+ # apply vertical filter and write to VPM -+ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 -+ -+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb8 -+ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+ add r1, r1, r0 ; mul24 r0, ra11, rb11 -+# <--- to here -+ sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height -+ sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next -+ sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next -+ -+ asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next -+ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) -+ -+ brr.anyn -, r:1b -+ asr r1, r1, i_wt_den_p6 -+ min r1, r1, ra_pmax ; mov 
-, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop -+# >>> branch.anyn 1b (r5 - rb_lcount) -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_y_pxx -+ m_filter_y_pxx 8 -+ -+ -+################################################################################ -+ -+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) -+# -+# Setup (& therefore uniform struct) shared with _pxx -+# Struct in m_luma_setup -+# -+# l0 calc in els 0-7, L1 in 8-15 -+# Only els 0-7 write data that is stored back to ram (els 8-15 may write tosh) -+# -+# At this point we have already issued PREREAD pairs of texture requests for the current block -+ -+.macro m_filter_y_bxx, v_bit_depth -+ -+# denom shift values -+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) -+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) -+ -+ m_luma_setup v_bit_depth -+ -+ shl r1, ra_wt_off_l0, i_wt_den_p6 -+ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 -+ sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 -+ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+ -+# This loop is identical to the P loop from here ---> -+:1 -+ add.setf -, 
ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+ -+ max r2, ra_y, 0 ; mov r1, 0 -+ min r2, r2, rb_max_y ; mov r3, ra_k1 -+ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+ add t0s, ra_base, r2 ; mov rb5, rb6 -+ shr r0, r4, ra_xshift ; mov rb6, rb7 -+ -+ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes -+ shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+ add t1s, rb_base2, r2 ; mov ra8, ra9 -+ -+# apply horizontal filter -+ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ -+ brr.anyn -, r:1b -+ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+ # >>> .anyn 1b (r5 + r5) -+ -+ # apply vertical filter and write to VPM -+ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 -+ -+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb8 -+ add 
r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+ add r1, r1, r0 ; mul24 r0, ra11, rb11 -+# <--- to here -+ sub r1, r1, ra4 -+ sub r1, r1, r0 ; mov r2, rb_wt_off -+ -+ asr r1, r1, 6 -+ sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 -+ mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add -+ sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next -+ sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next -+ add r1, r1, r2 ; mov r0, r1 << 8 -+ add r1, r1, r0 ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height -+ -+ brr.anyn -, r:1b -+ asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch # ; NxtLoop -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, 0 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) -+# >>> branch.anyn 1b (r5 - rb_lcount) -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed block_height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link (ra_height - remaining height) -+ -+# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_y_bxx -+ m_filter_y_bxx 8 -+ -+################################################################################ -+# -+# typedef struct qpu_mc_pred_y_p00_s { -+# qpu_mc_src_t next_src1; -+# uint16_t h; -+# uint16_t w; -+# uint32_t wo1; -+# uint32_t dst_addr; -+# uint32_t next_fn; -+# } 
qpu_mc_pred_y_p00_t; -+ -+.macro m_filter_y_p00, v_bit_depth -+ -+.if v_bit_depth <= 8 -+.set v_x_shift, 0 -+.set v_x_mul, 1 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 1 -+.set v_x_mul, 2 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif -+ -+ mov ra0, unif ; mov r0, elem_num # y_x -+ mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 # [ra0 delay] ; r5 = 0 -+ add r0, ra0.16b, r0 ; mov ra_base_next, unif # ; src1.base -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ -+ max r0, r0, r5 ; mov ra_y_next, ra0.16a # ; width_height -+ min r0, r0, rb_max_x ; mov ra_width_height, unif -+ -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 -+ sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif # ; weight_offset -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dest addr -+ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write -+ -+# get width,height of block (unif load above) -+# Compute vdw_setup1(dst_pitch-width) -+ shl r1, ra_width, v_x_shift -+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height -+ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 -+ add r0, r0, r1 # Combine width and height of destination area -+ shl rb_wt_off, ra_wt_off_l0, DENOM + 7 -+ shl r0, r0, v_dma_wh_shift ; mov ra_link, unif # Shift into bits 16 upwards of the vdw_setup0 register ; link -+ add ra_dma0, r0, rb_dma0_base -+ -+:1 -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 -+ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov r3, rb_pitch -+ -+ max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask 
-+ -+ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 -+ shl r1, r1, 8 ; mov r3, ra_blk_height -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+ -+ brr.anyn -, r:1b -+ asr r1, r1, DENOM + 8 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+# >>> branch.anyn 1b -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_y_p00 -+ m_filter_y_p00 8 -+ -+################################################################################ -+ -+.macro m_filter_y_b00, v_bit_depth -+# luma setup does a fair bit more than we need calculating filter coeffs -+# that we will never use but it saves I-cache to use it (also simple!) 
-+ m_luma_setup v_bit_depth -+ -+# Fix up vals that were expecting a filter (somewhat icky) -+ mov r2, 1 -+ add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 # Need in rX rather than raX for <<8 to do what we want -+ shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero -+ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 -+ -+:1 -+ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov r3, rb_pitch -+ -+ max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next -+ -+ max r2, ra_y2, 0 -+ min r2, r2, rb_max_y -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax # v8subs masks out all but bottom byte -+ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 -+ -+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 -+ add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 -+ -+ shl r1, r1, 8 ; mov r3, ra_blk_height -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+ -+ brr.anyn -, r:1b -+ asr r1, r1, (DENOM + 9) - 32 # -32 to get valid shift immediate -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+# >>> branch.anyn 1b -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of 
output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_y_b00 -+ m_filter_y_b00 8 -+ -+################################################################################ -+################################################################################ -+# 10 BIT -+ -+::mc_setup_c10_q0 -+ m_setup_q0 -+::mc_setup_c10_qn -+ m_setup_c 10 -+ -+::mc_filter_c10_p -+ m_filter_c_p 0, 10 -+ -+::mc_filter_c10_p_l1 -+ m_filter_c_p 1, 10 -+ -+ -+::mc_filter_c10_b -+ m_filter_c_b 10 -+ -+# Even if these fns are the same as for other bit depths we want our own copy -+# to keep the code we are using in a single lump to avoid (direct map) cache -+# thrashing -+.set v_quads10, N_QPU_16 / 4 -+ -+::mc_sync10_q0 -+ m_sync_q 0, v_quads10 -+::mc_sync10_q1 -+ m_sync_q 1, v_quads10 -+::mc_sync10_q2 -+ m_sync_q 2, v_quads10 -+::mc_sync10_q3 -+ m_sync_q 3, v_quads10 -+::mc_sync10_q4 -+ m_sync_q 4, v_quads10 -+::mc_sync10_q5 -+ m_sync_q 5, v_quads10 -+::mc_sync10_q6 -+ m_sync_q 6, v_quads10 -+::mc_sync10_q7 -+ m_sync_q 7, v_quads10 -+::mc_sync10_q8 -+ m_sync_q 8, v_quads10 -+::mc_sync10_q9 -+ m_sync_q 9, v_quads10 -+::mc_sync10_q10 -+ m_sync_q 10, v_quads10 -+::mc_sync10_q11 -+ m_sync_q 11, v_quads10 -+ -+::mc_exit_y10_q0 -+::mc_exit_c10_q0 -+ m_exit_q0 -+ -+::mc_exit_y10_qn -+::mc_exit_c10_qn -+ m_exit_qn -+ -+::mc_setup_y10_q0 -+ m_setup_q0 -+::mc_setup_y10_qn -+ m_setup_y 10 -+ -+:per_block_setup_10 -+ m_per_block_setup 10 -+ -+::mc_filter_y10_pxx -+ m_filter_y_pxx 10 -+ -+::mc_filter_y10_p00 -+ m_filter_y_p00 10 -+ -+::mc_filter_y10_bxx -+ m_filter_y_bxx 10 -+ -+::mc_filter_y10_b00 -+ m_filter_y_b00 10 -+ -+ -+ -+::mc_end -+# Do not add code here because mc_end must appear after all other code. 
-diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h -new file mode 100644 -index 0000000000..89711d776b ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader_cmd.h -@@ -0,0 +1,165 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*/ -+ -+#ifndef RPI_SHADER_CMD_H -+#define RPI_SHADER_CMD_H -+ -+#pragma pack(push, 4) -+ -+#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y -+// If mixed then we are just confused and get a lot of warnings.... 
-+typedef const uint8_t * qpu_mc_src_addr_t; -+typedef uint8_t * qpu_mc_dst_addr_t; -+#else -+typedef uint32_t qpu_mc_src_addr_t; -+typedef uint32_t qpu_mc_dst_addr_t; -+#endif -+ -+typedef struct qpu_mc_src_s -+{ -+ int16_t y; -+ int16_t x; -+ qpu_mc_src_addr_t base; -+} qpu_mc_src_t; -+ -+ -+typedef struct qpu_mc_pred_c_p_s { -+ qpu_mc_src_t next_src; -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ qpu_mc_dst_addr_t dst_addr_c; -+ uint32_t next_fn; -+} qpu_mc_pred_c_p_t; -+ -+typedef struct qpu_mc_pred_c_b_s { -+ qpu_mc_src_t next_src1; -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x1; -+ uint32_t coeffs_y1; -+ int16_t weight_u1; -+ int16_t weight_v1; -+ qpu_mc_src_t next_src2; -+ uint32_t coeffs_x2; -+ uint32_t coeffs_y2; -+ uint32_t wo_u2; -+ uint32_t wo_v2; -+ qpu_mc_dst_addr_t dst_addr_c; -+ uint32_t next_fn; -+} qpu_mc_pred_c_b_t; -+ -+typedef struct qpu_mc_pred_c_s_s { -+ qpu_mc_src_t next_src1; -+ uint32_t pic_cw; // C Width (== Y width / 2) -+ uint32_t pic_ch; // C Height (== Y Height / 2) -+ uint32_t stride2; -+ uint32_t stride1; -+ qpu_mc_src_t next_src2; -+ uint32_t next_fn; -+} qpu_mc_pred_c_s_t; -+ -+typedef struct qpu_mc_pred_c_s { -+ union { -+ qpu_mc_pred_c_p_t p; -+ qpu_mc_pred_c_b_t b; -+ qpu_mc_pred_c_s_t s; -+ }; -+} qpu_mc_pred_c_t; -+ -+ -+typedef struct qpu_mc_pred_y_p_s { -+ qpu_mc_src_t next_src1; -+ qpu_mc_src_t next_src2; -+ uint16_t h; -+ uint16_t w; -+ uint32_t mymx21; -+ uint32_t wo1; -+ uint32_t wo2; -+ qpu_mc_dst_addr_t dst_addr; -+ uint32_t next_fn; -+} qpu_mc_pred_y_p_t; -+ -+typedef struct qpu_mc_pred_y_p00_s { -+ qpu_mc_src_t next_src1; -+ uint16_t h; -+ uint16_t w; -+ uint32_t wo1; -+ qpu_mc_dst_addr_t dst_addr; -+ uint32_t next_fn; -+} qpu_mc_pred_y_p00_t; -+ -+typedef struct qpu_mc_pred_y_s_s { -+ qpu_mc_src_t next_src1; -+ qpu_mc_src_t next_src2; -+ uint16_t pic_h; -+ uint16_t pic_w; -+ uint32_t stride2; -+ uint32_t stride1; -+ uint32_t next_fn; -+} 
qpu_mc_pred_y_s_t; -+ -+typedef struct qpu_mc_pred_sync_s { -+ uint32_t next_fn; -+} qpu_mc_pred_sync_t; -+ -+// Only a useful structure in that it allows us to return something other than a void * -+typedef struct qpu_mc_pred_y_s { -+ union { -+ qpu_mc_pred_y_p_t p; -+ qpu_mc_pred_y_p00_t p00; -+ qpu_mc_pred_y_s_t s; -+ }; -+} qpu_mc_pred_y_t; -+ -+typedef union qpu_mc_pred_cmd_u { -+ qpu_mc_pred_y_t y; -+ qpu_mc_pred_c_t c; -+ qpu_mc_pred_sync_t sync; -+} qpu_mc_pred_cmd_t; -+ -+static void inline qpu_mc_link_set(qpu_mc_pred_cmd_t * const cmd, const uint32_t fn) -+{ -+ // Link is last el of previous cmd -+ ((uint32_t *)cmd)[-1] = fn; -+} -+ -+#define QPU_MC_PRED_N_Y8 12 -+#define QPU_MC_PRED_N_C8 12 -+ -+#define QPU_MC_PRED_N_Y10 12 -+#define QPU_MC_PRED_N_C10 12 -+ -+#define QPU_MC_DENOM 7 -+ -+#pragma pack(pop) -+ -+#endif -+ -diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c -new file mode 100644 -index 0000000000..77d8366eb8 ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader_template.c -@@ -0,0 +1,88 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. 
-+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*/ -+ -+#include "hevc.h" -+#include "rpi_hevcdec.h" -+#include "libavutil/rpi_sand_fns.h" -+#include "rpi_hevc_shader_cmd.h" -+#include "rpi_hevc_shader_template.h" -+ -+typedef struct shader_track_s -+{ -+ const union qpu_mc_pred_cmd_u *qpu_mc_curr; -+ const struct qpu_mc_src_s *last_l0; -+ const struct qpu_mc_src_s *last_l1; -+ uint32_t width; // pic_width * PW -+ uint32_t height; -+ uint32_t stride2; -+ uint32_t stride1; -+} shader_track_t; -+ -+static int wtoidx(const unsigned int w) -+{ -+ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; -+ return pel_weight[w]; -+} -+ -+static const int fctom(uint32_t x) -+{ -+ int rv; -+ // As it happens we can take the 2nd filter term & divide it by 8 -+ // (dropping fractions) to get the fractional move -+ rv = 8 - ((x >> 11) & 0xf); -+ av_assert2(rv >= 0 && rv <= 7); -+ return rv; -+} -+ -+static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) -+{ -+ return (x << shl) >> shr; -+} -+ -+static inline int woff_p(HEVCRpiContext *const s, int32_t x) -+{ -+ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); -+} -+ -+static inline int woff_b(HEVCRpiContext 
*const s, int32_t x) -+{ -+ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); -+} -+ -+static inline int wweight(int32_t x) -+{ -+ return ext(x, 16, 16); -+} -+ -+ -+#define PW 1 -+#include "rpi_hevc_shader_template_fn.h" -+ -+#undef PW -+#define PW 2 -+#include "rpi_hevc_shader_template_fn.h" -+ -diff --git a/libavcodec/rpi_hevc_shader_template.h b/libavcodec/rpi_hevc_shader_template.h -new file mode 100644 -index 0000000000..0fc5a45e9f ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader_template.h -@@ -0,0 +1,49 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*/ -+ -+#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H -+#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H -+ -+struct HEVCRpiContext; -+struct HEVCRpiInterPredEnv; -+ -+void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s, -+ const struct HEVCRpiInterPredEnv *const ipe_y, -+ const struct HEVCRpiInterPredEnv *const ipe_c); -+ -+void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s, -+ const struct HEVCRpiInterPredEnv *const ipe_y, -+ const struct HEVCRpiInterPredEnv *const ipe_c); -+ -+void rpi_sand_dump8(const char * const name, -+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); -+ -+void rpi_sand_dump16(const char * const name, -+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); -+ -+#endif -+ -diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h -new file mode 100644 -index 0000000000..10c163a4b9 ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader_template_fn.h -@@ -0,0 +1,502 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. 
-+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*/ -+ -+#define STRCAT(x,y) x##y -+ -+#if PW == 1 -+#define pixel uint8_t -+#define FUNC(f) STRCAT(f, 8) -+#elif PW == 2 -+#define pixel uint16_t -+#define FUNC(f) STRCAT(f, 16) -+#else -+#error Unexpected PW -+#endif -+ -+#define PATCH_STRIDE (16 * PW) -+ -+static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) -+{ -+ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { -+ const pixel s = *(const pixel *)src; -+ pixel * d = (pixel *)dst; -+ for (unsigned int j = 0; j < w; j += PW) { -+ *d++ = s; -+ } -+ } -+} -+ -+static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) -+{ -+ for (unsigned int i = 0; i != h; ++i, dst += stride) { -+ memcpy(dst, src, w); -+ } -+} -+ -+static void FUNC(get_patch_y)(const shader_track_t * const st, -+ uint8_t * dst, const unsigned int dst_stride, -+ const qpu_mc_src_t *src, -+ unsigned int _w, unsigned int _h) -+{ -+ int x = src->x * PW; -+ int y = src->y; -+ int w = _w * PW; -+ int h = _h; -+ int dl = 0; -+ int dr = 0; -+ int dt = 0; -+ int db = 0; -+ -+ if (x < 0) { -+ if (-x >= w) -+ x = PW - w; -+ dl = -x; -+ w += x; -+ x = 0; -+ } -+ if (x + w > st->width) { -+ if (x >= st->width) -+ x = st->width - PW; -+ dr = (x + w) - st->width; -+ w = st->width - x; -+ } -+ -+ // Y -+ if (y < 0) { -+ if (-y >= h) -+ y = 1 - h; -+ dt = -y; -+ h += y; -+ y = 0; -+ } -+ if (y + h > st->height) { -+ if (y >= st->height) -+ y = st->height - 1; -+ db = (y + h) - st->height; -+ h = st->height - y; -+ } -+ -+ dst += dl + dt * dst_stride; -+ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); -+ -+ // Edge dup -+ if (dl != 0) -+ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); -+ if (dr != 0) -+ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); -+ w += dl + dr; -+ dst -= dl; -+ -+ if (dt != 0) -+ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, 
dst_stride); -+ if (db != 0) -+ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); -+} -+ -+ -+ -+static void FUNC(get_patch_c)(const shader_track_t * const st, -+ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, -+ const qpu_mc_src_t *src, -+ unsigned int _w, unsigned int _h) -+{ -+ int x = src->x * PW; -+ int y = src->y; -+ int w = _w * PW; -+ int h = _h; -+ int dl = 0; -+ int dr = 0; -+ int dt = 0; -+ int db = 0; -+ const int width = st->width; -+ const int height = st->height; -+ -+ if (x < 0) { -+ if (-x >= w) -+ x = PW - w; -+ dl = -x; -+ w += x; -+ x = 0; -+ } -+ if (x + w > width) { -+ if (x >= width) -+ x = width - PW; -+ dr = (x + w) - width; -+ w = width - x; -+ } -+ -+ // Y -+ if (y < 0) { -+ if (-y >= h) -+ y = 1 - h; -+ dt = -y; -+ h += y; -+ y = 0; -+ } -+ if (y + h > height) { -+ if (y >= height) -+ y = height - 1; -+ db = (y + h) - height; -+ h = height - y; -+ } -+ -+ dst_u += dl + dt * dst_stride; -+ dst_v += dl + dt * dst_stride; -+ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); -+ -+ // Edge dup -+ if (dl != 0) -+ { -+ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); -+ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); -+ } -+ if (dr != 0) -+ { -+ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); -+ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); -+ } -+ w += dl + dr; -+ dst_u -= dl; -+ dst_v -= dl; -+ -+ if (dt != 0) -+ { -+ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); -+ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); -+ } -+ if (db != 0) -+ { -+ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); -+ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); -+ } -+} -+ -+// w, y, w, h in pixels -+// stride1, stride2 in bytes -+void FUNC(rpi_sand_dump)(const char * const name, -+ 
const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c) -+{ -+ const int mask = stride2 == 0 ? ~0 : stride1 - 1; -+ -+ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h); -+ -+ if (is_c) { -+ x *= 2; -+ w *= 2; -+ } -+ -+ for (int i = y; i != y + h; ++i) { -+ for (int j = x; j != x + w; ++j) { -+ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; -+ char sep = is_c && (j & 1) == 0 ? ':' : ' '; -+#if PW == 1 -+ if (j < 0 || i < 0) -+ printf("..%c", sep); -+ else -+ printf("%02x%c", *(const pixel*)p, sep); -+#else -+ if (j < 0 || i < 0) -+ printf("...%c", sep); -+ else -+ printf("%03x%c", *(const pixel*)p, sep); -+#endif -+ } -+ printf("\n"); -+ } -+} -+ -+ -+void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s, -+ const HEVCRpiInterPredEnv *const ipe_y, -+ const HEVCRpiInterPredEnv *const ipe_c) -+{ -+ for (int c_idx = 0; c_idx < 2; ++c_idx) -+ { -+ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c; -+ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; -+ unsigned int exit_n = 0; -+ -+ if (ipe == NULL || !ipe->used) { -+ continue; -+ } -+ -+ do { -+ for (unsigned int i = 0; i != ipe->n; ++i) { -+ const HEVCRpiInterPredQ * const q = ipe->q + i; -+ shader_track_t * const st = tracka + i; -+ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; -+ -+ for (;;) { -+ const uint32_t link = (cmd == q->qpu_mc_base) ? 
q->code_setup : ((uint32_t *)cmd)[-1]; -+ -+ if (link == q->code_setup) { -+ if (c_idx == 0) { -+ // Luma -+ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; -+ -+ st->height = c->pic_h; -+ st->width = c->pic_w * PW; -+ st->stride1 = c->stride1; -+ st->stride2 = c->stride2; -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else { -+ // Chroma -+ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; -+ -+ st->height = c->pic_ch; -+ st->width = c->pic_cw * PW; -+ st->stride1 = c->stride1; -+ st->stride2 = c->stride2; -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ } -+ else if (link == s->qpu.y_pxx) { -+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; -+ const int w1 = FFMIN(c->w, 8); -+ const int w2 = c->w - w1; -+ -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h + 7); -+ if (w2 > 0) { -+ FUNC(get_patch_y)(st, -+ patch_y2, PATCH_STRIDE, -+ st->last_l1, -+ 16, c->h + 7); -+ } -+ -+ // wo[offset] = offset*2+1 -+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); -+ if (w2 > 0) { -+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( -+ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); -+ } -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ 
else if (link == s->qpu.y_bxx) { -+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; -+ -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; -+ -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h + 7); -+ FUNC(get_patch_y)(st, -+ patch_y2, PATCH_STRIDE, -+ st->last_l1, -+ 16, c->h + 7); -+ -+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( -+ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); -+ -+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, -+ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), -+ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.y_p00) { -+ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; -+ -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h + 7); -+ -+ // wo[offset] = offset*2+1 -+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); -+ -+ st->last_l0 = &c->next_src1; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.y_b00) { -+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; -+ -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ int16_t 
patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; -+ -+ av_assert0(c->w <= 16 && c->h <= 64); -+ -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h); -+ FUNC(get_patch_y)(st, -+ patch_y2, PATCH_STRIDE, -+ st->last_l1, -+ 16, c->h); -+ -+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( -+ patch_y3, patch_y1, PATCH_STRIDE, -+ c->h, 0, 0, c->w); -+ -+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, -+ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), -+ 0, woff_b(s, c->wo2), 0, 0, c->w); -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.c_pxx) { -+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; -+ const int mx = fctom(c->coeffs_x); -+ const int my = fctom(c->coeffs_y); -+ -+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_u3[8 * 16 * PW]; -+ uint8_t patch_v3[8 * 16 * PW]; -+ -+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); -+ -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); -+ -+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); -+ -+ st->last_l0 = &c->next_src; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.c_pxx_l1) { -+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; -+ const int mx = fctom(c->coeffs_x); -+ const int my = fctom(c->coeffs_y); -+ -+ 
uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_u3[8 * 16 * PW]; -+ uint8_t patch_v3[8 * 16 * PW]; -+ -+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); -+ -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); -+ -+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); -+ -+ st->last_l1 = &c->next_src; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.c_bxx) { -+ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; -+ const int mx1 = fctom(c->coeffs_x1); -+ const int my1 = fctom(c->coeffs_y1); -+ const int mx2 = fctom(c->coeffs_x2); -+ const int my2 = fctom(c->coeffs_y2); -+ -+ uint8_t patch_u1[PATCH_STRIDE * 72]; -+ uint8_t patch_v1[PATCH_STRIDE * 72]; -+ uint8_t patch_u2[PATCH_STRIDE * 72]; -+ uint8_t patch_v2[PATCH_STRIDE * 72]; -+ uint8_t patch_u3[8 * 16 * PW]; -+ uint8_t patch_v3[8 * 16 * PW]; -+ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; -+ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; -+ -+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); -+ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); -+ -+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( -+ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, mx1, my1, c->w); -+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( -+ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, mx1, 
my1, c->w); -+ -+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( -+ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, -+ c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2), -+ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); -+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( -+ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, -+ c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2), -+ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); -+ -+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); -+ -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == q->code_sync) { -+ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); -+ break; -+ } -+ else if (link == q->code_exit) { -+ // We expect exit to occur without other sync -+ av_assert0(i == exit_n); -+ ++exit_n; -+ break; -+ } -+ else { -+ av_assert0(0); -+ } -+ } -+ -+ st->qpu_mc_curr = cmd; -+ } -+ } while (exit_n == 0); -+ } -+} -+ -+#undef FUNC -+#undef pixel -+ -diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s -new file mode 100644 -index 0000000000..3caef20137 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform.s -@@ -0,0 +1,444 @@ -+# ****************************************************************************** -+# Argon Design Ltd. -+# (c) Copyright 2015 Argon Design Ltd. All rights reserved. 
-+# -+# Module : HEVC -+# Author : Peter de Rivaz -+# ****************************************************************************** -+ -+# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack) -+# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions) -+.set USE_STACK, 0 -+ -+# Lines that fail to assemble start with #: -+# The script insert_magic_opcodes.sh inserts the machine code directly for these. -+# HEVC VPU Transform -+# -+# Transform matrix can be thought of as -+# output row vector = input row vector * transMatrix2 -+# -+# The even rows of the matrix are symmetric -+# The odd rows of the matrix are antisymmetric -+# -+# So only need to compute the first half of the results, then can compute the remainder with a butterfly -+# -+# EXAMPLE -+# (a b c d) (1 2 2 1) -+# (3 4 -4 -3) -+# (5 6 6 5) -+# (7 8 -8 -7) -+# -+# x=(a c)(1 2) = 1a+5c 2a+6c -+# (5 6) -+# -+# y=(b d)(3 4) = 3b+7d 4b+8d -+# (7 8) -+# -+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d -+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d -+# -+# Final results are (u , v[::-1]) -+# -+# -+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0) -+# Apply the even matrix first and stop before rounding -+# Then apply the odd matrix in a full manner: -+# -+# First step is to compute partial products with the first input (16 cycles) -+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output -+# 2a 4b 6c 8d -+# 2a -4b 6c -8d -+# 1a -3b 5c -7d -+# -+# Second step is to sum partial products into final position (8 cycles) -+# 1a+3b+5c+7d -+# 2a+4b+6c+8d -+# 2a-4b+6c-8d -+# 1a-3b+5c-7d -+# -+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format) -+# -+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use 
of the trick - saves on the adds) -+# -+# For 8x8 we could compute two in parallel. -+# -+# -+ -+# Columns are transformed first -+# -+# Store top left half of transMatrix2 in -+# Store bottom left half of transMatrix2 in HX(32,32) -+# -+# For 16x16 -+# HX(0:15,0) contains input data before transform -+# HY(0:15,0) contains 32bit output data after transform -+# HX(32,0) contains even rows of left half of transMatrix2 -+# HX(32,32) contains odd rows of left half of transMatrix2 -+# HY(48,0) contains partial products ready for summing -+# -+ -+ -+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# coeffs32 -+# num32: number of 32x32 transforms -+# command 0 for transform, 1 for memclear16(int16_t *dst,num16) -+# -+ -+.equ TRANS_SHIFT, 20 - BIT_DEPTH -+.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) -+.equ TRANS_ASL2, 16 - TRANS_SHIFT -+ -+ -+hevc_trans_16x16: -+ push r6-r15, lr # TODO cut down number of used registers -+ mov r14,r3 # coeffs32 -+ mov r15,r4 # num32 -+ mov r3, 16*2 # Stride of transMatrix2 in bytes -+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix -+ -+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix -+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ # Now use r0 to describe which matrix we are working on. -+ # Allows us to prefetch the next block of coefficients for efficiency. 
-+ mov r0,0 # This describes the location where we read our coefficients from -+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) -+ mov r7,16*16*2 # Total block size -+ mov r8,64*16 # Value used to swap from current to next VRF location -+ mov r4,64 # Constant used for rounding first pass -+ mov r5,TRANS_RND2 # Constant used for rounding second pass -+ -+ sub sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack -+ -+ add r11,sp,64 # Space for 32 bytes before, and rounding -+ lsr r11,5 -+ lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32 -+ -+ lsr r10, r2, 16 # Number of compressed blocks stored in top short -+ extu r2,16 -+ # At start of block r0,r1 point to the current block (that has already been loaded) -+ # r0 VRF location of current block -+ # r1 address of current block -+ # r2 number of 16*16 transforms to do -+ # r3 Stride of coefficients (==32) -+ # r4 TRANS_RND1 (64) -+ # r5 TRANS_RND2 -+ # r6 temporary used inside col_trans16 -+ # r7 16*16*2 total bytes in block -+ # r8 64*16 VRF switch locations -+ # r9 temporary in unpack_coeff for index -+ # r10 number of 16x16 transforms using compression -+ # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer) -+ # r12 temporary counter in unpack_coeff -+ # r13 -+ # r14 Save information for 32 bit transform (coeffs location) -+ # r15 Save information for 32 bit transform (number of transforms) -+ cmp r2,0 -+ beq done16x16s -+block_loop: -+ # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests -+ cmp r10,0 -+ mov r6, r1 -+ beq not_compressed -+ sub r10, 1 -+ bl unpack16x16 -+not_compressed: -+ #mov r6,r1 # DEBUG without compress -+ vldh HX(0++,0)+r0,(r6 += r3) REP 16 -+ #eor r0,r8 -+ #add r1,r7 -+ # Prefetch the next block -+ #bl unpack16x16 -+ #vldh HX(0++,0)+r0,(r6 += r3) REP 16 -+ #vmov HX(0++,0)+r0,0 REP 16 # DEBUG -+ #eor r0,r8 -+ #sub r1,r7 -+ -+ # Transform the current 
block -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? -+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position -+ -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) -+ -+ # Save results - note there has been a transposition during the processing so we save columns -+ vsth VX(0,32++)+r0, (r1 += r3) REP 16 -+ -+ # Move onto next block -+ eor r0,r8 -+ add r1,r7 -+ -+ addcmpbgt r2,-1,0,block_loop -+done16x16s: -+ -+ add sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack -+ # Now go and do any 32x32 transforms -+ b hevc_trans_32x32 -+ -+ pop r6-r15, pc -+# This returns a value in r6 that says where to load the data from. -+# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it. -+unpack16x16: -+# Clear out destination -+ vmov HX(0,0)+r0,0 -+ mov r6, r11 -+ vsth HX(0,0)+r0,(r6 += r3) REP 16 -+ mov r5, r1 # Moving pointer to input coefficients -+unpack_outer_loop: -+ # Loop until we find the end -+ vldh HX(0,0)+r0,(r5) # TODO would prefetch help here while unpacking previous? 
-+ sub r6,r11,32 -+ #add r6,pc,packed_data-$ # Packed data -+ vsth HX(0,0)+r0,(r6) # Store into packed data -+ mov r12,0 -+unpack_loop: -+ ld r4,(r6) -+ add r6,r6,4 -+ lsr r9,r4,16 # r9 is destination value -+ cmp r4,0 # {value,index} -+ extu r4,8 -+ beq done_unpack -+ sth r9,(r11, r4) -+ addcmpblt r12,1,8,unpack_loop -+# # Read next 16 -+ add r5,32 -+ b unpack_outer_loop -+done_unpack: -+# # Set new load location -+ mov r6, r11 -+ #add r6,pc,unpacked_data-$ -+# # Restore constants -+ mov r4,64 -+ mov r5,TRANS_RND2 -+# pop r6-r15, pc -+ b lr -+ -+# r1,r2,r3 r7,r8 should be preserved -+# HX(0++,0)+r0 is the block to be transformed -+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients -+# Use HY(48,0) for intermediate results -+# r0 can be used, but should be returned to its original value at the end -+col_trans_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+col_trans_odd_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_odd_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_odd_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+# r1/r10 input pointer -+# r0,r4,r5,r6 free -+# r8/r9 output storage -+# -+# Store packed coefficients at r9-32 -+# Store unpacked at r9+32*32 (because transform works on even/odd rows on input, but writes all rows) -+unpack32x32: -+# Clear out destination -+ vmov HX(0,0),0 -+ add r0, r9, 32*32*2 # Unpacked buffer -+ mov r4, 32 -+ vsth HX(0,0),(r0 += r4) REP 64 
-+unpack_outer_loop32: -+ # Loop until we find the end -+ vldh HX(0,0),(r1) # TODO would prefetch help here while unpacking previous? -+ sub r6,r9,32 -+ #add r6,pc,packed_data-$ # Packed data -+ vsth HX(0,0),(r6) # Store into packed data -+ mov r8,0 -+unpack_loop32: -+ ld r4,(r6) -+ add r6,r6,4 -+ lsr r5,r4,16 # r5 is destination value -+ cmp r4,0 # {value,index} -+ extu r4,10 -+ beq done_unpack -+ sth r5,(r0, r4) -+ addcmpblt r8,1,8,unpack_loop32 -+# # Read next 16 -+ add r1,32 -+ b unpack_outer_loop32 -+done_unpack32: -+ b lr -+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done in low 16, number of packed in high 16 -+# -+# Note that the 32x32 transforms are stored in reverse order, this means that the unpacked ones appear first! 
-+hevc_trans_32x32: -+ mov r1,r14 # coeffs -+ mov r2,r15 # num -+ lsr r15,r15,16 # Number that are packed -+ extu r2,16 # Total number -+ -+ # Fetch odd transform matrix -+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients) -+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix -+ #add r0, 16*16*2 -+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer -+ mov r7, 16*16*2 # Total block size -+ -+.if USE_STACK -+ # Stack base allocation -+ sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking -+ # set r8 to 32byte aligned stack pointer with 32 bytes of space before it -+ add r8,sp,63 -+ lsr r8,5 -+ lsl r8,5 -+.else -+#:version r8 -+ .half 0x00e8 #AUTOINSERTED -+ btst r8,16 -+#:add r8,pc,intermediate_results-$ -+ .half 0xbfe8 -+ .half intermediate_results-($-2) -+ beq on_vpu1 -+ add r8,r8,32*32*2*2+16*2 # Move to secondary storage -+on_vpu1: -+.endif -+ mov r9,r8 # Backup of the temporary storage -+ mov r10,r1 # Backup of the coefficient buffer -+ -+ cmp r2,0 -+ beq done32x32s -+block_loop32: -+ -+ # Transform the first 16 columns -+ mov r1,r10 # Input Coefficient buffer -+ mov r8,r9 # Output temporary storage -+ # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed) -+ cmp r2,r15 -+ bgt not_compressed_32 -+ bl unpack32x32 -+ add r1,r9,32*32*2 # Uncompressed into temporary storage -+ mov r8,r9 # Transform into here -+not_compressed_32: -+ # COLUMN TRANSFORM -+ mov r4, 64 # Constant used for rounding first pass -+ mov r5, 9 # left shift used for rounding first pass -+ -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ # ROW TRANSFORM -+ mov r4, TRANS_RND2 # Constant used for rounding second pass -+ mov r5, TRANS_ASL2 # left shift used for 
rounding second pass -+ -+ mov r1,r9 # Input temporary storage -+ mov r8,r10 # Output Coefficient buffer -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ add r10, 32*32*2 # move onto next block of coefficients -+ addcmpbgt r2,-1,0,block_loop32 -+done32x32s: -+ -+.if USE_STACK -+ add sp,sp,32*32*4+64# Restore stack -+.endif -+ -+ pop r6-r15, pc -+ -+trans32: -+ push lr -+ # We can no longer afford the VRF space to do prefetching when doing 32x32 -+ # Fetch the even rows -+ vldh HX(0++,0),(r1 += r3) REP 16 -+ # Fetch the odd rows -+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 -+ -+ # Transform the even rows using even matrix -+ mov r0, 0 # Even rows -+ bl col_trans_16 -+ -+ # Now transform the odd rows using odd matrix -+ mov r0, 64*16 # Odd rows -+ bl col_trans_odd_16 -+ -+ # Now apply butterfly to compute the first 16 results -+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ # 16bit results now in HX(48,32) -+ mov r0,r8 -+ mov r6,32*2 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ -+ # Now apply butterfly to compute the second 16 results (in reverse order) -+ vsub HY(63,0),HY(0 ,0),HY(16,0) -+ vsub HY(62,0),HY(1 ,0),HY(17,0) -+ vsub HY(61,0),HY(2 ,0),HY(18,0) -+ vsub HY(60,0),HY(3 ,0),HY(19,0) -+ vsub HY(59,0),HY(4 ,0),HY(20,0) -+ vsub HY(58,0),HY(5 ,0),HY(21,0) -+ vsub HY(57,0),HY(6 ,0),HY(22,0) -+ vsub HY(56,0),HY(7 ,0),HY(23,0) -+ vsub HY(55,0),HY(8 ,0),HY(24,0) -+ vsub HY(54,0),HY(9 ,0),HY(25,0) -+ vsub HY(53,0),HY(10,0),HY(26,0) -+ vsub HY(52,0),HY(11,0),HY(27,0) -+ vsub HY(51,0),HY(12,0),HY(28,0) -+ vsub HY(50,0),HY(13,0),HY(29,0) -+ vsub HY(49,0),HY(14,0),HY(30,0) -+ vsub HY(48,0),HY(15,0),HY(31,0) -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ add r0,r8,32 -+ vsth 
VX(48,32++),(r0+=r6) REP 16 -+ pop pc -+ -+.if USE_STACK == 0 -+ .balign 32 -+ -+# .space directives generate 0's in the bin so avoid unnecessary padding by -+# just setting to appropriate value -+.equ intermediate_results, $+16*2 -+ -+# Layout goes: -+# -+#packed_buffer: -+# .space 16*2 -+#intermediate_results: -+# .space 32*32*2 -+#unpacked_buffer: -+# .space 32*32*2 -+# -+#packed_buffer2: -+# .space 16*2 -+#intermediate_results2: -+# .space 32*32*2 -+#unpacked_buffer2: -+# .space 32*32*2 -+.endif -+ -+ -diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h -new file mode 100644 -index 0000000000..1c364492d0 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform10.h -@@ -0,0 +1,94 @@ -+static const unsigned char rpi_hevc_transform10 [] = { -+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 -+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 -+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 -+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 -+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 -+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 -+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030 -+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 -+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 -+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 -+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 -+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 -+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 -+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 -+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 -+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 -+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 -+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 -+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090 -+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 -+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 
-+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 -+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 -+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 -+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 -+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 -+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 -+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 -+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 -+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 -+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 -+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 -+0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 -+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 -+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 -+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 -+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 -+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 -+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 -+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 -+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 -+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 -+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 -+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 -+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 -+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 -+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 -+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 -+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 -+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 -+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 -+0x06, 0x1e, 0xff, 0x9f, 
0xc5, 0xff, 0x21, 0xb5, // 01b8 -+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 -+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 -+0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8 -+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 -+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 -+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 -+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 -+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 -+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 -+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 -+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 -+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 -+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 -+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 -+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 -+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 -+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 -+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 -+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 -+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 -+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 -+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 -+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 -+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 -+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 -+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 -+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 -+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 -+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 -+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 -+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 -+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 
02c8 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 -+}; -diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h -new file mode 100644 -index 0000000000..1128a2c054 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform8.h -@@ -0,0 +1,94 @@ -+static const unsigned char rpi_hevc_transform8 [] = { -+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 -+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 -+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 -+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 -+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 -+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 -+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030 -+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 -+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 -+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 -+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 -+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 -+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 -+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 -+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 -+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 -+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 -+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 -+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090 -+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 -+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 -+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 -+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 -+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 -+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 -+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 -+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 -+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 
0x08, // 00d8 -+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 -+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 -+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 -+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 -+0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 -+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 -+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 -+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 -+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 -+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 -+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 -+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 -+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 -+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 -+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 -+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 -+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 -+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 -+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 -+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 -+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 -+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 -+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 -+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 -+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 -+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 -+0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8 -+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 -+0x4a, 0xb0, 
0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 -+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 -+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 -+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 -+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 -+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 -+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 -+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 -+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 -+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 -+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 -+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 -+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 -+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 -+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 -+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 -+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 -+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 -+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 -+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 -+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 -+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 -+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 -+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 -+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 -+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 -+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 -+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 -+}; -diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c -new file mode 100644 -index 0000000000..e651e5c565 ---- /dev/null -+++ b/libavcodec/rpi_hevcdec.c -@@ -0,0 +1,6134 @@ -+/* -+ * HEVC video Decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres 
-+ * Copyright (C) 2012 - 2013 Mickael Raulet -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2012 - 2013 Wassim Hamidouche -+ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/attributes.h" -+#include "libavutil/common.h" -+#include "libavutil/display.h" -+#include "libavutil/internal.h" -+#include "libavutil/mastering_display_metadata.h" -+#include "libavutil/md5.h" -+#include "libavutil/opt.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/stereo3d.h" -+ -+#include "decode.h" -+#include "bswapdsp.h" -+#include "bytestream.h" -+#include "golomb.h" -+#include "hevc.h" -+#include "rpi_hevc_data.h" -+#include "rpi_hevc_parse.h" -+#include "rpi_hevcdec.h" -+#include "rpi_hevc_cabac_fns.h" -+#include "profiles.h" -+#include "hwconfig.h" -+ -+#include "rpi_zc_frames.h" -+#include "rpi_qpu.h" -+#include "rpi_hevc_shader.h" -+#include "rpi_hevc_shader_cmd.h" -+#include "rpi_hevc_shader_template.h" -+#include "rpi_zc.h" -+#include "libavutil/rpi_sand_fns.h" -+ -+#include "pthread.h" -+#include -+ -+#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards -+ -+#define PACK2(hi,lo) (((hi) << 16) | ((lo) 
& 0xffff)) -+ -+#ifndef av_mod_uintp2 -+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) -+{ -+ return a & ((1 << p) - 1); -+} -+# define av_mod_uintp2 av_mod_uintp2_c -+#endif -+ -+const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; -+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first); -+ -+#define MC_DUMMY_X (-32) -+#define MC_DUMMY_Y (-32) -+ -+// UV & Y both have min 4x4 pred (no 2x2 chroma) -+// Allow for even spread +1 for setup, +1 for rounding -+// As we have load sharing this can (in theory) be exceeded so we have to -+// check after each CTU, but it is a good base size -+ -+// Worst case (all 4x4) commands per CTU -+#define QPU_Y_CMD_PER_CTU_MAX (16 * 16) -+#define QPU_C_CMD_PER_CTU_MAX (8 * 8) -+ -+#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64) -+ -+#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP) -+#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS) -+ -+#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2) -+#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2) -+ -+// Total cmds to allocate - allow for slack & setup -+#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX) -+#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX) -+ -+#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2)) -+#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2)) -+ -+// The QPU code for UV blocks only works up to a block width of 8 -+#define RPI_CHROMA_BLOCK_WIDTH 8 -+ -+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) -+ -+ -+// Actual filter goes -ve, +ve, +ve, -ve using these values -+static const uint32_t rpi_filter_coefs[8] = { -+ ENCODE_COEFFS( 0, 64, 0, 0), -+ ENCODE_COEFFS( 2, 
58, 10, 2), -+ ENCODE_COEFFS( 4, 54, 16, 2), -+ ENCODE_COEFFS( 6, 46, 28, 4), -+ ENCODE_COEFFS( 4, 36, 36, 4), -+ ENCODE_COEFFS( 4, 28, 46, 6), -+ ENCODE_COEFFS( 2, 16, 54, 4), -+ ENCODE_COEFFS( 2, 10, 58, 2) -+}; -+ -+// Function arrays by QPU -+ -+static const int * const inter_pred_setup_c_qpu[12] = { -+ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, -+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, -+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn -+}; -+ -+static const int * const inter_pred_setup_c10_qpu[12] = { -+ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, -+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, -+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn -+}; -+ -+static const int * const inter_pred_setup_y_qpu[12] = { -+ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, -+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, -+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn -+}; -+ -+static const int * const inter_pred_setup_y10_qpu[12] = { -+ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, -+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, -+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn -+}; -+ -+static const int * const inter_pred_sync_qpu[12] = { -+ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, -+ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, -+ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 -+}; -+ -+static const int * const inter_pred_sync10_qpu[12] = { -+ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, -+ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, -+ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 -+}; -+ -+static const int * const inter_pred_exit_c_qpu[12] = { -+ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, -+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, -+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, 
mc_exit_c_qn -+}; -+ -+static const int * const inter_pred_exit_c10_qpu[12] = { -+ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, -+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, -+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn -+}; -+ -+static const int * const inter_pred_exit_y_qpu[12] = { -+ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, -+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, -+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn -+}; -+ -+static const int * const inter_pred_exit_y10_qpu[12] = { -+ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, -+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, -+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn -+}; -+ -+typedef struct ipe_chan_info_s -+{ -+ const uint8_t bit_depth; -+ const uint8_t n; -+ const int * const * setup_fns; -+ const int * const * sync_fns; -+ const int * const * exit_fns; -+} ipe_chan_info_t; -+ -+typedef struct ipe_init_info_s -+{ -+ ipe_chan_info_t luma; -+ ipe_chan_info_t chroma; -+} ipe_init_info_t; -+ -+static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a) -+{ -+ switch (ln) -+ { -+ default: // normally 0 -+ *b = a; -+ break; -+ case 1: -+ a |= a << 8; -+ *(uint16_t *)b = a; -+ b += stride; -+ *(uint16_t *)b = a; -+ break; -+ case 2: -+ a |= a << 8; -+ a |= a << 16; -+ *(uint32_t *)b = a; -+ b += stride; -+ *(uint32_t *)b = a; -+ b += stride; -+ *(uint32_t *)b = a; -+ b += stride; -+ *(uint32_t *)b = a; -+ break; -+ case 3: -+ { -+ unsigned int i; -+ uint64_t d; -+ a |= a << 8; -+ a |= a << 16; -+ d = ((uint64_t)a << 32) | a; -+ for (i = 0; i != 8; ++i, b += stride) -+ *(uint64_t *)b = d; -+ break; -+ } -+ case 4: -+ { -+ unsigned int i; -+ uint64_t d; -+ a |= a << 8; -+ a |= a << 16; -+ d = ((uint64_t)a << 32) | a; -+ for (i = 0; i != 16; ++i, b += stride) -+ { -+ *(uint64_t *)b = d; -+ *(uint64_t *)(b + 8) = d; -+ } -+ break; 
-+ } -+ } -+} -+ -+// We expect this to be called with ln = (log2_cb_size - 3) so range = -1..3 -+// (4 not required) -+static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a) -+{ -+ switch (ln) -+ { -+ default: // 0 or -1 -+ *b_u = a; -+ *b_l = a; -+ break; -+ case 1: -+ a |= a << 8; -+ *(uint16_t *)b_u = a; -+ *(uint16_t *)b_l = a; -+ break; -+ case 2: -+ a |= a << 8; -+ a |= a << 16; -+ *(uint32_t *)b_u = a; -+ *(uint32_t *)b_l = a; -+ break; -+ case 3: -+ a |= a << 8; -+ a |= a << 16; -+ *(uint32_t *)b_u = a; -+ *(uint32_t *)(b_u + 4) = a; -+ *(uint32_t *)b_l = a; -+ *(uint32_t *)(b_l + 4) = a; -+ break; -+ case 4: -+ a |= a << 8; -+ a |= a << 16; -+ *(uint32_t *)b_u = a; -+ *(uint32_t *)(b_u + 4) = a; -+ *(uint32_t *)(b_u + 8) = a; -+ *(uint32_t *)(b_u + 12) = a; -+ *(uint32_t *)b_l = a; -+ *(uint32_t *)(b_l + 4) = a; -+ *(uint32_t *)(b_l + 8) = a; -+ *(uint32_t *)(b_l + 12) = a; -+ break; -+ } -+} -+ -+static void zap_cabac_stash(uint8_t * b, const int ln) -+{ -+ switch (ln) -+ { -+ default: // 0 -+ *b = 0; -+ break; -+ case 1: -+ *(uint16_t *)b = 0; -+ break; -+ case 2: -+ *(uint32_t *)b = 0; -+ break; -+ case 3: -+ *(uint32_t *)b = 0; -+ *(uint32_t *)(b + 4) = 0; -+ break; -+ } -+} -+ -+ -+ -+// Set a small square block of bits in a bitmap -+// Bits must be aligned on their size boundry (which will be true of all split CBs) -+static void set_bits(uint8_t * f, const unsigned int x, const unsigned int stride, const unsigned int ln) -+{ -+ unsigned int n; -+ const unsigned int sh = (x & 7); -+ -+ f += (x >> 3); -+ -+ av_assert2(ln <= 3); -+ av_assert2((x & ((1 << ln) - 1)) == 0); -+ -+ switch (ln) -+ { -+ default: // 1 -+ f[0] |= 1 << sh; -+ break; -+ case 1: // 3 * 2 -+ n = 3 << sh; -+ f[0] |= n; -+ f[stride] |= n; -+ break; -+ case 2: // 0xf * 4 -+ n = 0xf << sh; -+ f[0] |= n; -+ f[stride] |= n; -+ f[stride * 2] |= n; -+ f[stride * 3] |= n; -+ break; -+ case 3: // 0xff * 8 -+ for (n = 0; n != 8; ++n, f += stride) -+ *f = 0xff; -+ 
break; -+ } -+} -+ -+static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 -+ { // 8 -+ .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, -+ .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} -+ }, -+ { // 9 -+ .luma = {0}, -+ .chroma = {0} -+ }, -+ { // 10 -+ .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu}, -+ .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} -+ } -+ -+}; -+ -+static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici) -+{ -+ const unsigned int n = ici->n; -+ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word -+ -+ ipe->n = n; -+ ipe->max_fill = q1_size - ipe->min_gap; -+ for(unsigned int i = 0; i < n; i++) { -+ HEVCRpiInterPredQ * const q = ipe->q + i; -+ q->qpu_mc_curr = q->qpu_mc_base = -+ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); -+ q->code_setup = qpu_fn(ici->setup_fns[i]); -+ q->code_sync = qpu_fn(ici->sync_fns[i]); -+ q->code_exit = qpu_fn(ici->exit_fns[i]); -+ } -+} -+ -+static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth) -+{ -+ av_assert0(bit_depth >= 8 && bit_depth <= 16); -+ -+ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth); -+} -+ -+// Unsigned Trivial MOD -+static inline unsigned int utmod(const unsigned int x, const unsigned int n) -+{ -+ return x >= n ? 
x - n : x; -+} -+ -+// returns pq->job_n++ -+static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq) -+{ -+ unsigned int const x2 = pq->job_n; -+ pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS); -+ return x2; -+} -+ -+static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n) -+{ -+ pq->terminate = 0; -+ pq->job_n = 0; -+ pq->context = s; -+ pq->worker = worker; -+ pq->psem_out = psem_out; -+ pq->pass_n = n; -+ pq->started = 0; -+ sem_init(&pq->sem_in, 0, 0); -+} -+ -+static void pass_queue_kill(HEVCRpiPassQueue * const pq) -+{ -+ sem_destroy(&pq->sem_in); -+} -+ -+static inline void rpi_sem_wait(sem_t * const sem) -+{ -+ while (sem_wait(sem) != 0) { -+ av_assert0(errno == EINTR); -+ } -+} -+ -+static void pass_queue_submit_job(HEVCRpiPassQueue * const pq) -+{ -+ sem_post(&pq->sem_in); -+} -+ -+static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ // Do the various passes - common with the worker code -+ for (unsigned int i = 0; i != RPI_PASSES; ++i) { -+ s->passq[i].worker(s, jb); -+ } -+} -+ -+ -+#if 0 -+static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func) -+{ -+ int x; -+ sem_getvalue((sem_t *)&jbc->sem_out, &x); -+ printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x); -+} -+#endif -+ -+ -+static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc) -+{ -+ HEVCRpiJob * jb; -+ HEVCRpiJobGlobal * const jbg = jbc->jbg; -+ -+ pthread_mutex_lock(&jbg->lock); -+ // Check local 1st -+ if ((jb = jbc->jb1) != NULL) -+ { -+ // Only 1 - very easy :-) -+ jbc->jb1 = NULL; -+ } -+ else -+ { -+ // Now look for global free chain -+ if ((jb = jbg->free1) != NULL) -+ { -+ // Found one - unlink it -+ jbg->free1 = jb->next; -+ jb->next = NULL; -+ } -+ else -+ { -+ // Out of places to look - wait for one to become free - add to Qs -+ -+ // Global -+ // If 
"good" lc then add after the last "good" el in the chain -+ // otherwise add to the tail -+ if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good) -+ { -+ // Add to end as we had to wait last time or wait Q empty -+ if ((lc->jw_prev = jbg->wait_tail) == NULL) -+ jbg->wait_head = lc; -+ else -+ lc->jw_prev->jw_next = lc; -+ lc->jw_next = NULL; -+ jbg->wait_tail = lc; -+ } -+ else -+ { -+ // This is a "good" lc that we need to poke into the middle -+ // of the Q -+ // We know that the Q isn't empty and there is at least one -+ // !last_progess_good el in it from the previous test -+ -+ HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after -+ -+ if (p == NULL) -+ { -+ // No current good els - add to head -+ lc->jw_next = jbg->wait_head; -+ jbg->wait_head = lc; -+ } -+ else -+ { -+ lc->jw_next = p->jw_next; -+ p->jw_next = lc; -+ } -+ -+ lc->jw_next->jw_prev = lc; -+ lc->jw_prev = p; -+ } -+ -+ // If "good" then we are now the last good waiting el -+ if (lc->last_progress_good) -+ jbg->wait_good = lc; -+ -+ // Local -+ if ((lc->ljw_prev = jbc->lcw_tail) == NULL) -+ jbc->lcw_head = lc; -+ else -+ lc->ljw_prev->ljw_next = lc; -+ lc->ljw_next = NULL; -+ jbc->lcw_tail = lc; -+ } -+ } -+ -+ pthread_mutex_unlock(&jbg->lock); -+ -+ if (jb == NULL) // Need to wait -+ { -+ rpi_sem_wait(&lc->jw_sem); -+ jb = lc->jw_job; // Set by free code -+ } -+ -+ return jb; -+} -+ -+ -+static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb) -+{ -+ HEVCRpiJobGlobal * const jbg = jbc0->jbg; // This jbc only used to find jbg so we can get the lock -+ HEVCRpiJobCtl * jbc = jb->jbc_local; -+ HEVCRpiLocalContext * lc = NULL; -+ -+ pthread_mutex_lock(&jbg->lock); -+ -+ if (jbc != NULL) -+ { -+ av_assert1(jbc->jb1 == NULL); -+ -+ // Release to Local if nothing waiting there -+ if ((lc = jbc->lcw_head) == NULL) -+ jbc->jb1 = jb; -+ } -+ else -+ { -+ // Release to global if nothing waiting there -+ if ((lc = jbg->wait_head) == NULL) -+ { 
-+ jb->next = jbg->free1; -+ jbg->free1 = jb; -+ } -+ else -+ { -+ // ? seems somehow mildy ugly... -+ jbc = lc->context->jbc; -+ } -+ } -+ -+ if (lc != NULL) -+ { -+ // Something was waiting -+ -+ // Unlink -+ // Global -+ if (lc->jw_next == NULL) -+ jbg->wait_tail = lc->jw_prev; -+ else -+ lc->jw_next->jw_prev = lc->jw_prev; -+ -+ if (lc->jw_prev == NULL) -+ jbg->wait_head = lc->jw_next; -+ else -+ lc->jw_prev->jw_next = lc->jw_next; -+ -+ // Local -+ if (lc->ljw_next == NULL) -+ jbc->lcw_tail = lc->ljw_prev; -+ else -+ lc->ljw_next->ljw_prev = lc->ljw_prev; -+ -+ if (lc->ljw_prev == NULL) -+ jbc->lcw_head = lc->ljw_next; -+ else -+ lc->ljw_prev->ljw_next = lc->ljw_next; -+ -+ // Update good if required -+ if (jbg->wait_good == lc) -+ jbg->wait_good = lc->jw_prev; -+ -+ // Prod -+ lc->jw_job = jb; -+ sem_post(&lc->jw_sem); -+ } -+ -+ pthread_mutex_unlock(&jbg->lock); -+} -+ -+static void job_lc_kill(HEVCRpiLocalContext * const lc) -+{ -+ sem_destroy(&lc->jw_sem); -+} -+ -+static void job_lc_init(HEVCRpiLocalContext * const lc) -+{ -+ lc->jw_next = NULL; -+ lc->jw_prev = NULL; -+ lc->ljw_next = NULL; -+ lc->ljw_prev = NULL; -+ lc->jw_job = NULL; -+ sem_init(&lc->jw_sem, 0, 0); -+} -+ -+// Returns: -+// 0 if we have waited for MV or expect to wait for recon -+// 1 if we haven't waited for MV & do not need to wait for recon -+static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb) -+{ -+ if (jb->waited) // reset by rpi_begin -+ return 0; -+ for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) -+ { -+ if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL && -+ ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i]) -+ return 0; -+ } -+ return 1; -+} -+ -+// Submit job if it is full (indicated by having ctu_ts_last set >= 0) -+static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc) -+{ -+ HEVCRpiJobCtl *const jbc = s->jbc; -+ HEVCRpiJob * const jb = lc->jb0; -+ 
-+ av_assert1(jb != NULL); -+ -+ if (jb->ctu_ts_last < 0) { -+ return; -+ } -+ -+ lc->last_progress_good = progress_good(s, jb); -+ jb->waited = !lc->last_progress_good; -+ lc->jb0 = NULL; -+ -+ if (s->offload_recon) -+ { -+ pthread_mutex_lock(&jbc->in_lock); -+ jbc->offloadq[jbc->offload_in] = jb; -+ jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS); -+ pthread_mutex_unlock(&jbc->in_lock); -+ -+ pass_queue_submit_job(s->passq + 0); // Consumes job eventually -+ } -+ else -+ { -+ pass_queue_do_all(s, jb); // Consumes job before return -+ } -+} -+ -+ -+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes -+// available to receive the next job. -+// -+// Now safe against multiple callers - needed for tiles -+// "normal" and WPP will only call here one at a time -+static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ HEVCRpiJobCtl * const jbc = s->jbc; -+ -+ // It is legit for us to already have a job allocated - do nothing in this case -+ if (lc->jb0 != NULL) -+ return; -+ -+ if (s->offload_recon) -+ rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much -+ -+ lc->jb0 = job_alloc(jbc, lc); -+ -+ rpi_begin(s, lc->jb0, lc->ts); -+} -+ -+// Free up a job without submission -+static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ HEVCRpiJobCtl * const jbc = s->jbc; -+ HEVCRpiJob * const jb = lc->jb0; -+ -+ if (jb == NULL) { -+ return; -+ } -+ -+ lc->jb0 = NULL; -+ -+ job_free(jbc, jb); -+ -+ // If offload then poke sem_out too -+ if (s->offload_recon) { -+ sem_post(&jbc->sem_out); -+ } -+} -+ -+ -+// Call this to wait for all jobs to have completed at the end of a frame -+// Slightly icky as there is no clean way to wait for a sem to count up -+// Not reentrant - call on main thread only -+static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ HEVCRpiJobCtl * const jbc = s->jbc; -+ int i 
= 0; -+ -+ // We shouldn't reach here with an unsubmitted job -+ av_assert1(lc->jb0 == NULL); -+ -+ // If no offload then there can't be anything to wait for -+ if (!s->offload_recon) { -+ return; -+ } -+ -+ if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS) -+ { -+ for (i = 0; i != RPI_MAX_JOBS; ++i) { -+ rpi_sem_wait(&jbc->sem_out); -+ } -+ for (i = 0; i != RPI_MAX_JOBS; ++i) { -+ sem_post(&jbc->sem_out); -+ } -+ } -+} -+ -+static void * pass_worker(void *arg) -+{ -+ HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg; -+ HEVCRpiContext *const s = pq->context; -+ -+ for (;;) -+ { -+ rpi_sem_wait(&pq->sem_in); -+ -+ if (pq->terminate) -+ break; -+ -+ pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]); -+ // * should really set jb->passes_done here -+ -+ sem_post(pq->psem_out); -+ } -+ return NULL; -+} -+ -+static void pass_queues_start_all(HEVCRpiContext *const s) -+{ -+ unsigned int i; -+ HEVCRpiPassQueue * const pqs = s->passq; -+ -+ for (i = 0; i != RPI_PASSES; ++i) -+ { -+ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0); -+ pqs[i].started = 1; -+ } -+} -+ -+static void pass_queues_term_all(HEVCRpiContext *const s) -+{ -+ unsigned int i; -+ HEVCRpiPassQueue * const pqs = s->passq; -+ -+ for (i = 0; i != RPI_PASSES; ++i) -+ pqs[i].terminate = 1; -+ for (i = 0; i != RPI_PASSES; ++i) -+ { -+ if (pqs[i].started) -+ sem_post(&pqs[i].sem_in); -+ } -+ for (i = 0; i != RPI_PASSES; ++i) -+ { -+ if (pqs[i].started) { -+ pthread_join(pqs[i].thread, NULL); -+ pqs[i].started = 0; -+ } -+ } -+} -+ -+static void pass_queues_kill_all(HEVCRpiContext *const s) -+{ -+ unsigned int i; -+ HEVCRpiPassQueue * const pqs = s->passq; -+ -+ for (i = 0; i != RPI_PASSES; ++i) -+ pass_queue_kill(pqs + i); -+} -+ -+ -+static void worker_pic_free_one(HEVCRpiJob * const jb) -+{ -+ // Free coeff stuff - allocation not the same for all buffers -+ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; -+ -+ if (cf->s[0].buf != NULL) -+ av_freep(&cf->mptr); -+ if 
(cf->s[2].buf != NULL) -+ gpu_free(&cf->gptr); -+ memset(cf, 0, sizeof(*cf)); -+} -+ -+static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count) -+{ -+ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; -+ -+ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0) -+ goto fail; -+ cf->s[2].buf = (int16_t *)cf->gptr.arm; -+ cf->s[3].buf = cf->s[2].buf + coeff_count; -+ -+ // Must be 64 byte aligned for our zero zapping code so over-allocate & -+ // round -+ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL) -+ goto fail; -+ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63); -+ return 0; -+ -+fail: -+ av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__); -+ worker_pic_free_one(jb); -+ return -1; -+} -+ -+static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf) -+{ -+ unsigned int i; -+ for (i = 0; i != 4; ++i) { -+ cf->s[i].n = 0; -+#if RPI_COMPRESS_COEFFS -+ cf->s[i].packed = 1; -+ cf->s[i].packed_n = 0; -+#endif -+ } -+} -+ -+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n) -+{ -+ HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no; -+ int16_t * const coeffs = (buf_no != 3) ? 
cfe->buf + cfe->n : cfe->buf - (cfe->n + n); -+ cfe->n += n; -+ return coeffs; -+} -+ -+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCRpiFrame * const ref, const int val, const int field) -+{ -+ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { -+ HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data; -+ HEVCRpiFrameProgressState * const pstate = fs->progress_states + field; -+ sem_t * sem = NULL; -+ -+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); -+ if (((volatile int *)ref->tf.progress->data)[field] < val) { -+ HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait; -+ -+ av_assert1(pwait->req == -1 && pwait->next == NULL); -+ jb->waited = 1; // Remember that we had to wait for later scheduling -+ -+ pwait->req = val; -+ pwait->next = NULL; -+ if (pstate->first == NULL) -+ pstate->first = pwait; -+ else -+ pstate->last->next = pwait; -+ pstate->last = pwait; -+ sem = &pwait->sem; -+ } -+ pthread_mutex_unlock(&pstate->lock); -+ -+ if (sem != NULL) { -+ rpi_sem_wait(sem); -+ } -+ } -+} -+ -+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field) -+{ -+ HEVCRpiFrameProgressState *const pstate = s->progress_states + field; -+ -+ ((int *)s->ref->tf.progress->data)[field] = val; -+ -+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); -+ { -+ HEVCRpiFrameProgressWait ** ppwait = &pstate->first; -+ HEVCRpiFrameProgressWait * pwait; -+ -+ while ((pwait = *ppwait) != NULL) { -+ if (pwait->req > val) -+ { -+ ppwait = &pwait->next; -+ pstate->last = pwait; -+ } -+ else -+ { -+ *ppwait = pwait->next; -+ pwait->req = -1; -+ pwait->next = NULL; -+ sem_post(&pwait->sem); -+ } -+ } -+ } -+ pthread_mutex_unlock(&pstate->lock); -+} -+ -+static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate) -+{ -+ pstate->first = NULL; -+ pstate->last = NULL; -+ pthread_mutex_init(&pstate->lock, NULL); -+} -+ -+static 
void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait) -+{ -+ pwait->req = -1; -+ pwait->next = NULL; -+ sem_init(&pwait->sem, 0, 0); -+} -+ -+static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate) -+{ -+ av_assert1(pstate->first == NULL); -+ pthread_mutex_destroy(&pstate->lock); -+} -+ -+static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait) -+{ -+ sem_destroy(&pwait->sem); -+} -+ -+ -+/** -+ * NOTE: Each function hls_foo correspond to the function foo in the -+ * specification (HLS stands for High Level Syntax). -+ */ -+ -+/** -+ * Section 5.7 -+ */ -+ -+// Realloc the entry point arrays -+static int alloc_entry_points(RpiSliceHeader * const sh, const int n) -+{ -+ if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0) -+ { -+ // Round up alloc to multiple of 32 -+ int a = (n + 31) & ~31; -+ -+ // We don't care about the previous contents so probably fastest to simply discard -+ av_freep(&sh->entry_point_offset); -+ av_freep(&sh->offset); -+ av_freep(&sh->size); -+ -+ if (a != 0) -+ { -+ sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned)); -+ sh->offset = av_malloc_array(a, sizeof(int)); -+ sh->size = av_malloc_array(a, sizeof(int)); -+ -+ if (!sh->entry_point_offset || !sh->offset || !sh->size) { -+ sh->num_entry_point_offsets = 0; -+ sh->offsets_allocated = 0; -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ sh->offsets_allocated = a; -+ } -+ -+ return 0; -+} -+ -+/* free everything allocated by pic_arrays_init() */ -+static void pic_arrays_free(HEVCRpiContext *s) -+{ -+ av_freep(&s->sao); -+ av_freep(&s->deblock); -+ -+ av_freep(&s->cabac_stash_up); -+ s->cabac_stash_left = NULL; // freed with _up -+ -+ av_freep(&s->mvf_up); -+ av_freep(&s->mvf_left); -+ -+ av_freep(&s->is_pcm); -+ av_freep(&s->is_intra_store); -+ s->is_intra = NULL; -+ av_freep(&s->rpl_tab); -+ s->rpl_tab_size = 0; -+ -+ av_freep(&s->qp_y_tab); -+ av_freep(&s->tab_slice_address); -+ 
av_freep(&s->filter_slice_edges); -+ -+ av_freep(&s->bs_horizontal); -+ s->bs_vertical = NULL; // freed with H -+ av_freep(&s->bsf_stash_left); -+ av_freep(&s->bsf_stash_up); -+ -+ av_freep(&s->rpl_up); -+ av_freep(&s->rpl_left); -+ -+ alloc_entry_points(&s->sh, 0); -+ -+ av_buffer_pool_uninit(&s->col_mvf_pool); -+} -+ -+/* allocate arrays that depend on frame dimensions */ -+static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps) -+{ -+ const unsigned int log2_min_cb_size = sps->log2_min_cb_size; -+ const unsigned int width = sps->width; -+ const unsigned int height = sps->height; -+ const unsigned int pic_size_in_cb = ((width >> log2_min_cb_size) + 1) * -+ ((height >> log2_min_cb_size) + 1); -+ const unsigned int ctb_count = sps->ctb_size; -+ -+ { -+ unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK); -+ unsigned int h = ((height + 15) & ~15); -+ -+ s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size -+ s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols -+ } -+ -+ s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly -+ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock)); -+ if (!s->sao || !s->deblock) -+ goto fail; -+ -+ s->cabac_stash_up = av_malloc((((width + 63) & ~63) >> 3) + (((height + 63) & ~63) >> 3)); -+ s->cabac_stash_left = s->cabac_stash_up + (((width + 63) & ~63) >> 3); -+ if (s->cabac_stash_up == NULL) -+ goto fail; -+ -+ // Round width up to max ctb size -+ s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); -+ // * Only needed if we have H tiles -+ s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); -+ -+ // We can overread by 1 line & one byte in deblock so alloc & zero -+ // We don't need to zero the extra @ start of frame as it will never be -+ // written -+ s->is_pcm = av_mallocz(sps->pcm_width * 
(sps->pcm_height + 1) + 1); -+ s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); -+ if (s->is_pcm == NULL || s->is_intra_store == NULL) -+ goto fail; -+ -+ s->filter_slice_edges = av_mallocz(ctb_count); -+ s->tab_slice_address = av_malloc_array(ctb_count, -+ sizeof(*s->tab_slice_address)); -+ s->qp_y_tab = av_malloc_array(pic_size_in_cb, -+ sizeof(*s->qp_y_tab)); -+ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address) -+ goto fail; -+ -+ s->bs_horizontal = av_mallocz(s->bs_size * 2); -+ s->bs_vertical = s->bs_horizontal + s->bs_size; -+ if (s->bs_horizontal == NULL) -+ goto fail; -+ -+ s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up)); -+ s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left)); -+ if (s->rpl_left == NULL || s->rpl_up == NULL) -+ goto fail; -+ -+ if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL || -+ (s->bsf_stash_up = av_mallocz(((width + 63) & ~63) >> 4)) == NULL) -+ goto fail; -+ -+ s->col_mvf_stride = (width + 15) >> 4; -+ s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField), -+ av_buffer_allocz); -+ if (s->col_mvf_pool == NULL) -+ goto fail; -+ -+ return 0; -+ -+fail: -+ pic_arrays_free(s); -+ return AVERROR(ENOMEM); -+} -+ -+static void default_pred_weight_table(HEVCRpiContext * const s) -+{ -+ unsigned int i; -+ const unsigned int wt = 1 << QPU_MC_DENOM; -+ s->sh.luma_log2_weight_denom = 0; -+ s->sh.chroma_log2_weight_denom = 0; -+ for (i = 0; i < s->sh.nb_refs[L0]; i++) { -+ s->sh.luma_weight_l0[i] = wt; -+ s->sh.luma_offset_l0[i] = 0; -+ s->sh.chroma_weight_l0[i][0] = wt; -+ s->sh.chroma_weight_l0[i][1] = wt; -+ s->sh.chroma_offset_l0[i][0] = 0; -+ s->sh.chroma_offset_l0[i][1] = 0; -+ } -+ for (i = 0; i < s->sh.nb_refs[L1]; i++) { -+ s->sh.luma_weight_l1[i] = wt; -+ s->sh.luma_offset_l1[i] = 0; -+ s->sh.chroma_weight_l1[i][0] = wt; -+ s->sh.chroma_weight_l1[i][1] = wt; -+ s->sh.chroma_offset_l1[i][0] = 0; -+ 
s->sh.chroma_offset_l1[i][1] = 0; -+ } -+} -+ -+static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb, -+ const unsigned int refs, -+ int16_t * luma_weight, int16_t * luma_offset, -+ int16_t * chroma_weight, int16_t * chroma_offset) -+{ -+ unsigned int luma_flags; -+ unsigned int chroma_flags; -+ unsigned int i; -+ const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8); -+ const int wp_offset_half_range = s->ps.sps->wp_offset_half_range; -+ const unsigned int luma_weight_base = 1 << QPU_MC_DENOM; -+ const unsigned int chroma_weight_base = 1 << QPU_MC_DENOM; -+ const unsigned int luma_weight_shift = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom); -+ const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom); -+ -+ if (refs == 0) -+ return 0; -+ -+ luma_flags = get_bits(gb, refs); -+ chroma_flags = ctx_cfmt(s) == 0 ? 0 : get_bits(gb, refs); -+ i = 1 << (refs - 1); -+ -+ do -+ { -+ if ((luma_flags & i) != 0) -+ { -+ const int delta_weight = get_se_golomb(gb); -+ const int offset = get_se_golomb(gb); -+ if (delta_weight < -128 || delta_weight > 127 || -+ offset < -wp_offset_half_range || offset >= wp_offset_half_range) -+ { -+ return AVERROR_INVALIDDATA; -+ } -+ *luma_weight++ = luma_weight_base + (delta_weight << luma_weight_shift); -+ *luma_offset++ = offset << wp_offset_bd_shift; -+ } -+ else -+ { -+ *luma_weight++ = luma_weight_base; -+ *luma_offset++ = 0; -+ } -+ -+ if ((chroma_flags & i) != 0) -+ { -+ unsigned int j; -+ for (j = 0; j != 2; ++j) -+ { -+ const int delta_weight = get_se_golomb(gb); -+ const int delta_offset = get_se_golomb(gb); -+ -+ if (delta_weight < -128 || delta_weight > 127 || -+ delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range) -+ { -+ return AVERROR_INVALIDDATA; -+ } -+ -+ *chroma_weight++ = chroma_weight_base + (delta_weight << chroma_weight_shift); -+ *chroma_offset++ = av_clip( -+ 
wp_offset_half_range + delta_offset - -+ ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom), -+ -wp_offset_half_range, wp_offset_half_range - 1) << wp_offset_bd_shift; -+ } -+ } -+ else -+ { -+ *chroma_weight++ = chroma_weight_base; -+ *chroma_weight++ = chroma_weight_base; -+ *chroma_offset++ = 0; -+ *chroma_offset++ = 0; -+ } -+ } while ((i >>= 1) != 0); -+ -+ return 0; -+} -+ -+static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb) -+{ -+ int err; -+ const unsigned int luma_log2_weight_denom = get_ue_golomb_long(gb); -+ const unsigned int chroma_log2_weight_denom = (ctx_cfmt(s) == 0) ? 0 : luma_log2_weight_denom + get_se_golomb(gb); -+ -+ if (luma_log2_weight_denom > 7 || -+ chroma_log2_weight_denom > 7) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n", -+ luma_log2_weight_denom, chroma_log2_weight_denom); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ s->sh.luma_log2_weight_denom = luma_log2_weight_denom; -+ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom; -+ -+ if ((err = get_weights(s, gb, s->sh.nb_refs[L0], -+ s->sh.luma_weight_l0, s->sh.luma_offset_l0, -+ s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0])) != 0 || -+ (err = get_weights(s, gb, s->sh.nb_refs[L1], -+ s->sh.luma_weight_l1, s->sh.luma_offset_l1, -+ s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0])) != 0) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n"); -+ return err; -+ } -+ -+ return 0; -+} -+ -+static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb) -+{ -+ const HEVCRpiSPS *sps = s->ps.sps; -+ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; -+ int prev_delta_msb = 0; -+ unsigned int nb_sps = 0, nb_sh; -+ int i; -+ -+ rps->nb_refs = 0; -+ if (!sps->long_term_ref_pics_present_flag) -+ return 0; -+ -+ if (sps->num_long_term_ref_pics_sps > 0) -+ nb_sps = get_ue_golomb_long(gb); -+ nb_sh = 
get_ue_golomb_long(gb); -+ -+ if (nb_sps > sps->num_long_term_ref_pics_sps) -+ return AVERROR_INVALIDDATA; -+ if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc)) -+ return AVERROR_INVALIDDATA; -+ -+ rps->nb_refs = nb_sh + nb_sps; -+ -+ for (i = 0; i < rps->nb_refs; i++) { -+ uint8_t delta_poc_msb_present; -+ -+ if (i < nb_sps) { -+ uint8_t lt_idx_sps = 0; -+ -+ if (sps->num_long_term_ref_pics_sps > 1) -+ lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps)); -+ -+ rps->poc[i] = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps]; -+ rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps]; -+ } else { -+ rps->poc[i] = get_bits(gb, sps->log2_max_poc_lsb); -+ rps->used[i] = get_bits1(gb); -+ } -+ -+ delta_poc_msb_present = get_bits1(gb); -+ if (delta_poc_msb_present) { -+ int64_t delta = get_ue_golomb_long(gb); -+ int64_t poc; -+ -+ if (i && i != nb_sps) -+ delta += prev_delta_msb; -+ -+ poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb; -+ if (poc != (int32_t)poc) -+ return AVERROR_INVALIDDATA; -+ rps->poc[i] = poc; -+ prev_delta_msb = delta; -+ } -+ } -+ -+ return 0; -+} -+ -+static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps, -+ const HEVCRpiSPS *sps) -+{ -+ const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data; -+ const HEVCRpiWindow *ow = &sps->output_window; -+ unsigned int num = 0, den = 0; -+ -+ avctx->pix_fmt = sps->pix_fmt; -+ avctx->coded_width = sps->width; -+ avctx->coded_height = sps->height; -+ avctx->width = sps->width - ow->left_offset - ow->right_offset; -+ avctx->height = sps->height - ow->top_offset - ow->bottom_offset; -+ avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics; -+ avctx->profile = sps->ptl.general_ptl.profile_idc; -+ avctx->level = sps->ptl.general_ptl.level_idc; -+ -+ ff_set_sar(avctx, sps->vui.sar); -+ -+ if (sps->vui.video_signal_type_present_flag) -+ avctx->color_range = sps->vui.video_full_range_flag ? 
AVCOL_RANGE_JPEG -+ : AVCOL_RANGE_MPEG; -+ else -+ avctx->color_range = AVCOL_RANGE_MPEG; -+ -+ if (sps->vui.colour_description_present_flag) { -+ avctx->color_primaries = sps->vui.colour_primaries; -+ avctx->color_trc = sps->vui.transfer_characteristic; -+ avctx->colorspace = sps->vui.matrix_coeffs; -+ } else { -+ avctx->color_primaries = AVCOL_PRI_UNSPECIFIED; -+ avctx->color_trc = AVCOL_TRC_UNSPECIFIED; -+ avctx->colorspace = AVCOL_SPC_UNSPECIFIED; -+ } -+ -+ if (vps->vps_timing_info_present_flag) { -+ num = vps->vps_num_units_in_tick; -+ den = vps->vps_time_scale; -+ } else if (sps->vui.vui_timing_info_present_flag) { -+ num = sps->vui.vui_num_units_in_tick; -+ den = sps->vui.vui_time_scale; -+ } -+ -+ if (num != 0 && den != 0) -+ av_reduce(&avctx->framerate.den, &avctx->framerate.num, -+ num, den, 1 << 30); -+} -+ -+static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps) -+{ -+ enum AVPixelFormat pix_fmts[4], *fmt = pix_fmts; -+ -+ // Admit to no h/w formats -+ -+ *fmt++ = sps->pix_fmt; -+ *fmt = AV_PIX_FMT_NONE; -+ -+ return pix_fmts[0] == AV_PIX_FMT_NONE ? 
AV_PIX_FMT_NONE: ff_thread_get_format(s->avctx, pix_fmts); -+} -+ -+static int is_sps_supported(const HEVCRpiSPS * const sps) -+{ -+ return av_rpi_is_sand_format(sps->pix_fmt) && -+ sps->width <= HEVC_RPI_MAX_WIDTH && -+ sps->height <= HEVC_RPI_MAX_HEIGHT; -+} -+ -+static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps, -+ const enum AVPixelFormat pix_fmt) -+{ -+ int ret; -+ -+ pic_arrays_free(s); -+ s->ps.sps = NULL; -+ s->ps.vps = NULL; -+ -+ if (sps == NULL) -+ return 0; -+ -+ if (!is_sps_supported(sps)) -+ return AVERROR_DECODER_NOT_FOUND; -+ -+ ret = pic_arrays_init(s, sps); -+ if (ret < 0) -+ goto fail; -+ -+ export_stream_params(s->avctx, &s->ps, sps); -+ -+ s->avctx->pix_fmt = pix_fmt; -+ -+ ff_hevc_rpi_pred_init(&s->hpc, sps->bit_depth); -+ ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth); -+ -+ // * We don't support cross_component_prediction_enabled_flag but as that -+ // must be 0 unless we have 4:4:4 there is no point testing for it as we -+ // only deal with sand which is never 4:4:4 -+ // [support wouldn't be hard] -+ -+ rpi_hevc_qpu_set_fns(s, sps->bit_depth); -+ -+ av_freep(&s->sao_pixel_buffer_h[0]); -+ av_freep(&s->sao_pixel_buffer_v[0]); -+ -+ if (sps->sao_enabled) -+ { -+ const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1; -+ unsigned int c_idx; -+ size_t vsize[3] = {0}; -+ size_t hsize[3] = {0}; -+ -+ for(c_idx = 0; c_idx < c_count; c_idx++) { -+ int w = sps->width >> ctx_hshift(s, c_idx); -+ int h = sps->height >> ctx_vshift(s, c_idx); -+ // ctb height & width are a min of 8 so this must a multiple of 16 -+ // so no point rounding up! 
-+ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; -+ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; -+ } -+ -+ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] -+ // when we have plaited chroma -+ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); -+ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); -+ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; -+ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; -+ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; -+ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; -+ } -+ -+ s->ps.sps = sps; -+ s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; -+ -+ return 0; -+ -+fail: -+ pic_arrays_free(s); -+ s->ps.sps = NULL; -+ return ret; -+} -+ -+static inline int qp_offset_valid(const int qp_offset) -+{ -+ return qp_offset >= -12 && qp_offset <= 12; -+} -+ -+static int hls_slice_header(HEVCRpiContext * const s) -+{ -+ GetBitContext * const gb = &s->HEVClc->gb; -+ RpiSliceHeader * const sh = &s->sh; -+ int i, ret; -+ -+ // Coded parameters -+ sh->first_slice_in_pic_flag = get_bits1(gb); -+ if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) { -+ s->seq_decode = (s->seq_decode + 1) & 0xff; -+ s->max_ra = INT_MAX; -+ if (IS_IDR(s)) -+ ff_hevc_rpi_clear_refs(s); -+ } -+ sh->no_output_of_prior_pics_flag = 0; -+ if (IS_IRAP(s)) -+ sh->no_output_of_prior_pics_flag = get_bits1(gb); -+ -+ sh->pps_id = get_ue_golomb_long(gb); -+ if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) { -+ av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id); -+ return AVERROR_INVALIDDATA; -+ } -+ if (!sh->first_slice_in_pic_flag && -+ s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) { -+ av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ s->ps.pps = 
(HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data; -+ if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1) -+ sh->no_output_of_prior_pics_flag = 1; -+ -+ if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) { -+ const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data; -+ const HEVCRpiSPS *last_sps = s->ps.sps; -+ enum AVPixelFormat pix_fmt; -+ -+ if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) { -+ if (sps->width != last_sps->width || sps->height != last_sps->height || -+ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering != -+ last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering) -+ sh->no_output_of_prior_pics_flag = 0; -+ } -+ ff_hevc_rpi_clear_refs(s); -+ -+ ret = set_sps(s, sps, sps->pix_fmt); -+ if (ret < 0) -+ return ret; -+ -+ pix_fmt = get_format(s, sps); -+ if (pix_fmt < 0) -+ return pix_fmt; -+ -+// ret = set_sps(s, sps, pix_fmt); -+// if (ret < 0) -+// return ret; -+ -+ s->avctx->pix_fmt = pix_fmt; -+ -+ s->seq_decode = (s->seq_decode + 1) & 0xff; -+ s->max_ra = INT_MAX; -+ } -+ -+ sh->dependent_slice_segment_flag = 0; -+ if (!sh->first_slice_in_pic_flag) { -+ int slice_address_length; -+ -+ if (s->ps.pps->dependent_slice_segments_enabled_flag) -+ sh->dependent_slice_segment_flag = get_bits1(gb); -+ -+ slice_address_length = av_ceil_log2(s->ps.sps->ctb_size); -+ sh->slice_segment_addr = get_bitsz(gb, slice_address_length); -+ if (sh->slice_segment_addr >= s->ps.sps->ctb_size) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Invalid slice segment address: %u.\n", -+ sh->slice_segment_addr); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (!sh->dependent_slice_segment_flag) { -+ sh->slice_addr = sh->slice_segment_addr; -+ s->slice_idx++; -+ } -+ } else { -+ sh->slice_segment_addr = sh->slice_addr = 0; -+ s->slice_idx = 0; -+ s->slice_initialized = 0; -+ } -+ -+ if (!sh->dependent_slice_segment_flag) { -+ s->slice_initialized = 0; -+ -+ for (i = 0; i < 
s->ps.pps->num_extra_slice_header_bits; i++) -+ skip_bits(gb, 1); // slice_reserved_undetermined_flag[] -+ -+ sh->slice_type = get_ue_golomb_long(gb); -+ if (!(sh->slice_type == HEVC_SLICE_I || -+ sh->slice_type == HEVC_SLICE_P || -+ sh->slice_type == HEVC_SLICE_B)) { -+ av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n", -+ sh->slice_type); -+ return AVERROR_INVALIDDATA; -+ } -+ if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) { -+ av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ // when flag is not present, picture is inferred to be output -+ sh->pic_output_flag = 1; -+ if (s->ps.pps->output_flag_present_flag) -+ sh->pic_output_flag = get_bits1(gb); -+ -+ if (s->ps.sps->separate_colour_plane_flag) -+ sh->colour_plane_id = get_bits(gb, 2); -+ -+ if (!IS_IDR(s)) { -+ int poc, pos; -+ -+ sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb); -+ poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type); -+ if (!sh->first_slice_in_pic_flag && poc != s->poc) { -+ av_log(s->avctx, AV_LOG_WARNING, -+ "Ignoring POC change between slices: %d -> %d\n", s->poc, poc); -+ if (s->avctx->err_recognition & AV_EF_EXPLODE) -+ return AVERROR_INVALIDDATA; -+ poc = s->poc; -+ } -+ s->poc = poc; -+ -+ sh->short_term_ref_pic_set_sps_flag = get_bits1(gb); -+ pos = get_bits_left(gb); -+ if (!sh->short_term_ref_pic_set_sps_flag) { -+ ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1); -+ if (ret < 0) -+ return ret; -+ -+ sh->short_term_rps = &sh->slice_rps; -+ } else { -+ int numbits, rps_idx; -+ -+ if (!s->ps.sps->nb_st_rps) { -+ av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ numbits = av_ceil_log2(s->ps.sps->nb_st_rps); -+ rps_idx = numbits > 0 ? 
get_bits(gb, numbits) : 0; -+ sh->short_term_rps = &s->ps.sps->st_rps[rps_idx]; -+ } -+ sh->short_term_ref_pic_set_size = pos - get_bits_left(gb); -+ -+ pos = get_bits_left(gb); -+ ret = decode_lt_rps(s, &sh->long_term_rps, gb); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n"); -+ if (s->avctx->err_recognition & AV_EF_EXPLODE) -+ return AVERROR_INVALIDDATA; -+ } -+ sh->long_term_ref_pic_set_size = pos - get_bits_left(gb); -+ -+ if (s->ps.sps->sps_temporal_mvp_enabled_flag) -+ sh->slice_temporal_mvp_enabled_flag = get_bits1(gb); -+ else -+ sh->slice_temporal_mvp_enabled_flag = 0; -+ } else { -+ s->sh.short_term_rps = NULL; -+ s->poc = 0; -+ } -+ -+ /* 8.3.1 */ -+ if (sh->first_slice_in_pic_flag && s->temporal_id == 0 && -+ s->nal_unit_type != HEVC_NAL_TRAIL_N && -+ s->nal_unit_type != HEVC_NAL_TSA_N && -+ s->nal_unit_type != HEVC_NAL_STSA_N && -+ s->nal_unit_type != HEVC_NAL_RADL_N && -+ s->nal_unit_type != HEVC_NAL_RADL_R && -+ s->nal_unit_type != HEVC_NAL_RASL_N && -+ s->nal_unit_type != HEVC_NAL_RASL_R) -+ s->pocTid0 = s->poc; -+ -+ if (s->ps.sps->sao_enabled) { -+ sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb); -+ if (ctx_cfmt(s) != 0) { -+ sh->slice_sample_adaptive_offset_flag[1] = -+ sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb); -+ } -+ } else { -+ sh->slice_sample_adaptive_offset_flag[0] = 0; -+ sh->slice_sample_adaptive_offset_flag[1] = 0; -+ sh->slice_sample_adaptive_offset_flag[2] = 0; -+ } -+ -+ sh->nb_refs[L0] = sh->nb_refs[L1] = 0; -+ if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) { -+ int nb_refs; -+ -+ sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active; -+ if (sh->slice_type == HEVC_SLICE_B) -+ sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active; -+ -+ if (get_bits1(gb)) { // num_ref_idx_active_override_flag -+ sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1; -+ if (sh->slice_type == HEVC_SLICE_B) -+ sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1; -+ } -+ if 
(sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) { -+ av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n", -+ sh->nb_refs[L0], sh->nb_refs[L1]); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sh->rpl_modification_flag[0] = 0; -+ sh->rpl_modification_flag[1] = 0; -+ nb_refs = ff_hevc_rpi_frame_nb_refs(s); -+ if (!nb_refs) { -+ av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) { -+ sh->rpl_modification_flag[0] = get_bits1(gb); -+ if (sh->rpl_modification_flag[0]) { -+ for (i = 0; i < sh->nb_refs[L0]; i++) -+ sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs)); -+ } -+ -+ if (sh->slice_type == HEVC_SLICE_B) { -+ sh->rpl_modification_flag[1] = get_bits1(gb); -+ if (sh->rpl_modification_flag[1] == 1) -+ for (i = 0; i < sh->nb_refs[L1]; i++) -+ sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs)); -+ } -+ } -+ -+ if (sh->slice_type == HEVC_SLICE_B) -+ sh->mvd_l1_zero_flag = get_bits1(gb); -+ -+ if (s->ps.pps->cabac_init_present_flag) -+ sh->cabac_init_flag = get_bits1(gb); -+ else -+ sh->cabac_init_flag = 0; -+ -+ sh->collocated_ref_idx = 0; -+ if (sh->slice_temporal_mvp_enabled_flag) { -+ sh->collocated_list = L0; -+ if (sh->slice_type == HEVC_SLICE_B) -+ sh->collocated_list = !get_bits1(gb); -+ -+ if (sh->nb_refs[sh->collocated_list] > 1) { -+ sh->collocated_ref_idx = get_ue_golomb_long(gb); -+ if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Invalid collocated_ref_idx: %d.\n", -+ sh->collocated_ref_idx); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ } -+ -+ if ((s->ps.pps->weighted_pred_flag && sh->slice_type == HEVC_SLICE_P) || -+ (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) -+ { -+ if ((ret = pred_weight_table(s, gb)) != 0) -+ return ret; -+ } -+ else -+ { -+ // Give us unit weights -+ default_pred_weight_table(s); -+ } 
-+ -+ sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); -+ if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Invalid number of merging MVP candidates: %d.\n", -+ sh->max_num_merge_cand); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ sh->slice_qp_delta = get_se_golomb(gb); -+ -+ if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) { -+ sh->slice_cb_qp_offset = get_se_golomb(gb); -+ sh->slice_cr_qp_offset = get_se_golomb(gb); -+ if (!qp_offset_valid(sh->slice_cb_qp_offset) || -+ !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) || -+ !qp_offset_valid(sh->slice_cr_qp_offset) || -+ !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset)) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps:%d/%d; slice=%d/%d\n", -+ sh->slice_cr_qp_offset, sh->slice_cr_qp_offset, -+ s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset); -+ return AVERROR_INVALIDDATA; -+ } -+ } else -+ { -+ sh->slice_cb_qp_offset = 0; -+ sh->slice_cr_qp_offset = 0; -+ } -+ -+ if (s->ps.pps->chroma_qp_offset_list_enabled_flag) -+ sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb); -+ else -+ sh->cu_chroma_qp_offset_enabled_flag = 0; -+ -+ if (s->ps.pps->deblocking_filter_control_present_flag) { -+ int deblocking_filter_override_flag = 0; -+ -+ if (s->ps.pps->deblocking_filter_override_enabled_flag) -+ deblocking_filter_override_flag = get_bits1(gb); -+ -+ if (deblocking_filter_override_flag) { -+ sh->disable_deblocking_filter_flag = get_bits1(gb); -+ if (!sh->disable_deblocking_filter_flag) { -+ int beta_offset_div2 = get_se_golomb(gb); -+ int tc_offset_div2 = get_se_golomb(gb) ; -+ if (beta_offset_div2 < -6 || beta_offset_div2 > 6 || -+ tc_offset_div2 < -6 || tc_offset_div2 > 6) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Invalid deblock filter offsets: %d, %d\n", -+ beta_offset_div2, tc_offset_div2); -+ return AVERROR_INVALIDDATA; -+ } -+ sh->beta_offset = beta_offset_div2 * 2; -+ sh->tc_offset = 
tc_offset_div2 * 2; -+ } -+ } else { -+ sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf; -+ sh->beta_offset = s->ps.pps->beta_offset; -+ sh->tc_offset = s->ps.pps->tc_offset; -+ } -+ } else { -+ sh->disable_deblocking_filter_flag = 0; -+ sh->beta_offset = 0; -+ sh->tc_offset = 0; -+ } -+ -+ if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag && -+ (sh->slice_sample_adaptive_offset_flag[0] || -+ sh->slice_sample_adaptive_offset_flag[1] || -+ !sh->disable_deblocking_filter_flag)) { -+ sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb); -+ } else { -+ sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag; -+ } -+ sh->no_dblk_boundary_flags = -+ (sh->slice_loop_filter_across_slices_enabled_flag ? 0 : -+ BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) | -+ (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 : -+ BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE); -+ -+ -+ } else if (!s->slice_initialized) { -+ av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sh->num_entry_point_offsets = 0; -+ sh->offload_wpp = 0; -+ sh->offload_tiles = 0; -+ -+ if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) { -+ unsigned num_entry_point_offsets = get_ue_golomb_long(gb); -+ // It would be possible to bound this tighter but this here is simpler -+ if (num_entry_point_offsets > get_bits_left(gb)) { -+ av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sh->num_entry_point_offsets = num_entry_point_offsets; -+ if (sh->num_entry_point_offsets > 0) { -+ int offset_len = get_ue_golomb_long(gb) + 1; -+ -+ if (offset_len < 1 || offset_len > 32) { -+ sh->num_entry_point_offsets = 0; -+ av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if ((ret = alloc_entry_points(sh, 
sh->num_entry_point_offsets)) < 0) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n"); -+ return ret; -+ } -+ -+ for (i = 0; i < sh->num_entry_point_offsets; i++) { -+ uint32_t val_minus1 = get_bits_long(gb, offset_len); -+ if (val_minus1 > (1 << 28)) -+ { -+ // We can declare offsets of > 2^28 bad without loss of generality -+ // Will check actual bounds wrt NAL later, but this keeps -+ // the values within bounds we can deal with easily -+ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1); -+ return AVERROR_INVALIDDATA; -+ } -+ sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size -+ } -+ -+ // Do we want to offload this -+ if (s->threads_type != 0) -+ { -+ sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) && -+ s->ps.pps->num_tile_columns > 1; -+ // * We only cope with WPP in a single column -+ // Probably want to deal with that case as tiles rather than WPP anyway -+ // ?? Not actually sure that the main code deals with WPP + multi-col correctly -+ sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag && -+ s->ps.pps->num_tile_columns == 1; -+ } -+ } -+ } -+ -+ if (s->ps.pps->slice_header_extension_present_flag) { -+ unsigned int length = get_ue_golomb_long(gb); -+ if (length*8LL > get_bits_left(gb)) { -+ av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ for (i = 0; i < length; i++) -+ skip_bits(gb, 8); // slice_header_extension_data_byte -+ } -+ -+ // Inferred parameters -+ sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta; -+ if (sh->slice_qp > 51 || -+ sh->slice_qp < -s->ps.sps->qp_bd_offset) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "The slice_qp %d is outside the valid range " -+ "[%d, 51].\n", -+ sh->slice_qp, -+ -s->ps.sps->qp_bd_offset); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (get_bits_left(gb) < 0) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Overread 
slice header by %d bits\n", -get_bits_left(gb)); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ s->slice_initialized = 1; -+ return 0; -+} -+ -+static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry) -+{ -+ RpiSAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width; -+ int c_idx, i; -+ -+ if (s->sh.slice_sample_adaptive_offset_flag[0] || -+ s->sh.slice_sample_adaptive_offset_flag[1]) { -+ if ((lc->ctb_avail & AVAIL_L) != 0) -+ { -+ const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); -+ if (sao_merge_left_flag) { -+ *sao = sao[-1]; -+ return; -+ } -+ } -+ if ((lc->ctb_avail & AVAIL_U) != 0) -+ { -+ const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); -+ if (sao_merge_up_flag) { -+ *sao = sao[-(int)s->ps.sps->ctb_width]; -+ return; -+ } -+ } -+ } -+ -+ for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) { -+ const unsigned int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma : -+ s->ps.pps->log2_sao_offset_scale_chroma; -+ int offset_abs[4]; -+ char offset_sign[4] = {0}; -+ -+ if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) { -+ sao->type_idx[c_idx] = SAO_NOT_APPLIED; -+ continue; -+ } -+ -+ if (c_idx == 2) { -+ sao->type_idx[2] = sao->type_idx[1]; -+ sao->eo_class[2] = sao->eo_class[1]; -+ } else { -+ sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc); -+ } -+ -+ // ** Could use BY22 here quite plausibly - this is all bypass stuff -+ // though only per CTB so not very timing critical -+ -+ if (sao->type_idx[c_idx] == SAO_NOT_APPLIED) -+ continue; -+ -+ for (i = 0; i < 4; i++) -+ offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc); -+ -+ if (sao->type_idx[c_idx] == SAO_BAND) { -+ for (i = 0; i < 4; i++) { -+ if (offset_abs[i] != 0) -+ offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc); -+ } -+ sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc); -+ } else if (c_idx != 2) { -+ sao->eo_class[c_idx] = 
ff_hevc_rpi_sao_eo_class_decode(lc); -+ } -+ -+ // Inferred parameters -+ sao->offset_val[c_idx][0] = 0; -+ for (i = 0; i < 4; i++) { -+ sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale; -+ if (sao->type_idx[c_idx] == SAO_EDGE) { -+ if (i > 1) -+ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; -+ } else if (offset_sign[i]) { -+ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; -+ } -+ } -+ } -+} -+ -+#if 0 -+static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) { -+ int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx); // 0..4 -+ -+ if (log2_res_scale_abs_plus1 != 0) { -+ int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx); -+ lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) * -+ (1 - 2 * res_scale_sign_flag); -+ } else { -+ lc->tu.res_scale_val = 0; -+ } -+ -+ -+ return 0; -+} -+#endif -+ -+static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb) -+{ -+ return jb->intra.cmds + jb->intra.n++; -+} -+ -+#define A0(x, y, U, L, UL, UR, DL) \ -+ [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? 
AVAIL_DL : 0)) -+ -+#define A1(x, y, U, L, UL, UR, DL) \ -+ A0((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A0((x) + 1, (y) + 0, (U), 1, (U), (UR), 0 ),\ -+ A0((x) + 0, (y) + 1, 1, (L), (L), 1, (DL)), A0((x) + 1, (y) + 1, 1, 1, 1, 0, 0 ) -+ -+#define A2(x, y, U, L, UL, UR, DL) \ -+ A1((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A1((x) + 2, (y) + 0, (U), 1, (U), (UR), 0 ),\ -+ A1((x) + 0, (y) + 2, 1, (L), (L), 1, (DL)), A1((x) + 2, (y) + 2, 1, 1, 1, 0, 0 ) -+ -+#define A3(x, y, U, L, UL, UR, DL) \ -+ A2((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A2((x) + 4, (y) + 0, (U), 1, (U), (UR), 0 ),\ -+ A2((x) + 0, (y) + 4, 1, (L), (L), 1, (DL)), A2((x) + 4, (y) + 4, 1, 1, 1, 0, 0 ) -+ -+#define A4(x, y, U, L, UL, UR, DL) \ -+ A3((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A3((x) + 8, (y) + 0, (U), 1, (U), (UR), 0 ),\ -+ A3((x) + 0, (y) + 8, 1, (L), (L), 1, (DL)), A3((x) + 8, (y) + 8, 1, 1, 1, 0, 0 ) -+ -+static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)}; -+ -+unsigned int ff_hevc_rpi_tb_avail_flags( -+ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h) -+{ -+ const unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size; -+ const unsigned int tb_x = x & ~ctb_mask; -+ const unsigned int tb_y = y & ~ctb_mask; -+ const unsigned int ctb_avail = lc->ctb_avail; -+ -+ const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16; -+ -+ unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL); -+ -+ // This deals with both the U & L edges -+ if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0) -+ f |= AVAIL_UL; -+ -+ if (x + w < lc->end_of_ctb_x) -+ f |= (tb_y == 0 ? 
ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR; -+ else if (tb_y == 0) -+ f |= (ctb_avail & AVAIL_UR); -+#if AVAIL_S_U - AVAIL_S_UR < 0 -+#error Shift problem -+#endif -+ -+ // Never any D if Y beyond eoctb -+ if (y + h < lc->end_of_ctb_y) -+ f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL; -+#if AVAIL_S_DL - AVAIL_S_L < 0 -+#error Shift problem -+#endif -+ -+// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h, -+// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16], -+// lc->end_of_ctb_x, lc->end_of_ctb_y); -+ -+ return f; -+} -+ -+#undef A0 -+#undef A1 -+#undef A2 -+#undef A3 -+#undef A4 -+ -+static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx, -+ unsigned int avail) -+{ -+ // If rpi_enabled then sand - U & V done on U call -+ if (c_idx <= 1) -+ { -+ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); -+ cmd->type = RPI_PRED_INTRA + c_idx; -+ cmd->size = log2_trafo_size; -+ cmd->avail = avail; -+ cmd->i_pred.x = x0; -+ cmd->i_pred.y = y0; -+ cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; -+ -+// printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail); -+ } -+} -+ -+#define CBF_CB0_S 0 -+#define CBF_CB1_S 1 // CB1 must be CB0 + 1 -+#define CBF_CR0_S 2 -+#define CBF_CR1_S 3 -+ -+#define CBF_CB0 (1 << CBF_CB0_S) -+#define CBF_CR0 (1 << CBF_CR0_S) -+#define CBF_CB1 (1 << CBF_CB1_S) -+#define CBF_CR1 (1 << CBF_CR1_S) -+ -+// * Only good for chroma_idx == 1 -+static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_cb_size, const unsigned int log2_trafo_size, -+ const unsigned int blk_idx, const int cbf_luma, -+ const unsigned int cbf_chroma) -+{ -+ const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1); -+ const unsigned int x0_c = x0 & ~7; -+ const unsigned int y0_c = y0 & ~7; -+ -+ enum ScanType scan_idx = SCAN_DIAG; -+ enum ScanType scan_idx_c = SCAN_DIAG; -+ -+ if (lc->cu.pred_mode == MODE_INTRA) -+ { -+ const unsigned int trafo_size = 1 << log2_trafo_size; -+ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size); -+ -+ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail); -+ -+ if (log2_trafo_size > 2) -+ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail); -+ else if (blk_idx == 3) -+ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, -+ ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8)); -+ -+ if (log2_trafo_size < 4) { -+ if (lc->tu.intra_pred_mode >= 6 && -+ lc->tu.intra_pred_mode <= 14) { -+ scan_idx = SCAN_VERT; -+ } else if (lc->tu.intra_pred_mode >= 22 && -+ lc->tu.intra_pred_mode <= 30) { -+ scan_idx = SCAN_HORIZ; -+ } -+ -+ if (lc->tu.intra_pred_mode_c >= 6 && -+ lc->tu.intra_pred_mode_c <= 14) { -+ scan_idx_c = SCAN_VERT; -+ } else if (lc->tu.intra_pred_mode_c >= 22 && -+ lc->tu.intra_pred_mode_c <= 30) { -+ scan_idx_c = SCAN_HORIZ; -+ } -+ } -+ } -+ -+ if (!cbf_luma && 
cbf_chroma == 0) -+ return 0; -+ -+ if (lc->tu.is_cu_qp_delta_wanted) -+ { -+ const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc); -+ const unsigned int cb_mask = ~0U << log2_cb_size; -+ -+ if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) || -+ qp_delta > (25 + (s->ps.sps->qp_bd_offset >> 1))) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "The cu_qp_delta %d is outside the valid range " -+ "[%d, %d].\n", -+ qp_delta, -+ -(26 + (s->ps.sps->qp_bd_offset >> 1)), -+ (25 + (s->ps.sps->qp_bd_offset >> 1))); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ lc->tu.is_cu_qp_delta_wanted = 0; -+ lc->tu.cu_qp_delta = qp_delta; -+ ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask); -+ } -+ -+ // * Not main profile & untested due to no conform streams -+ if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma && -+ !lc->cu.cu_transquant_bypass_flag) { -+ int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc); -+ if (cu_chroma_qp_offset_flag) { -+ int cu_chroma_qp_offset_idx = 0; -+ if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) { -+ cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc); -+ } -+ lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx]; -+ lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx]; -+ } -+ lc->tu.cu_chroma_qp_offset_wanted = 0; -+ } -+ -+ if (cbf_luma) -+ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0); -+ -+ if (log2_trafo_size > 2 || blk_idx == 3) -+ { -+ if ((cbf_chroma & CBF_CB0) != 0) -+ ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, -+ log2_trafo_size_c, scan_idx_c, 1); -+ if ((cbf_chroma & CBF_CR0) != 0) -+ ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, -+ log2_trafo_size_c, scan_idx_c, 2); -+ } -+ -+ return 0; -+} -+ -+static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size) -+{ -+ set_bits(s->is_pcm + (y0 >> 3) * s->ps.sps->pcm_width, x0 >> 3, s->ps.sps->pcm_width, 
log2_cb_size - 3); -+} -+ -+ -+static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_trafo_size, -+ const unsigned int trafo_depth, const unsigned int blk_idx, -+ const unsigned int cbf_c0) -+{ -+ // When trafo_size == 2 hls_transform_unit uses c0 so put in c1 -+ unsigned int cbf_c1 = cbf_c0; -+ int split_transform_flag; -+ int ret; -+ -+ if (lc->cu.intra_split_flag) { -+ if (trafo_depth == 1) { -+ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx]; -+ if (ctx_cfmt(s) == 3) { -+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx]; -+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx]; -+ } else { -+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; -+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; -+ } -+ } -+ } else { -+ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0]; -+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; -+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; -+ } -+ -+ if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size && -+ log2_trafo_size > s->ps.sps->log2_min_tb_size && -+ trafo_depth < lc->cu.max_trafo_depth && -+ !(lc->cu.intra_split_flag && trafo_depth == 0)) -+ { -+ split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size); -+ } else { -+ int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 && -+ lc->cu.pred_mode == MODE_INTER && -+ lc->cu.part_mode != PART_2Nx2N && -+ trafo_depth == 0; -+ -+ split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size || -+ (lc->cu.intra_split_flag && trafo_depth == 0) || -+ inter_split; -+ } -+ -+ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3) -+ { -+ const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3); -+ cbf_c1 = 0; -+ -+ if ((cbf_c0 & CBF_CB0) != 0) -+ { -+ cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S; -+ if (wants_c1) -+ cbf_c1 |= 
ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S; -+ } -+ -+ if ((cbf_c0 & CBF_CR0) != 0) -+ { -+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR0_S; -+ if (wants_c1) -+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S; -+ } -+ } -+ -+ if (split_transform_flag) { -+ const int trafo_size_split = 1 << (log2_trafo_size - 1); -+ const int x1 = x0 + trafo_size_split; -+ const int y1 = y0 + trafo_size_split; -+ -+#define SUBDIVIDE(x, y, idx) \ -+do { \ -+ ret = hls_transform_tree(s, lc, x, y, \ -+ log2_trafo_size - 1, trafo_depth + 1, idx, \ -+ cbf_c1); \ -+ if (ret < 0) \ -+ return ret; \ -+} while (0) -+ -+ SUBDIVIDE(x0, y0, 0); -+ SUBDIVIDE(x1, y0, 1); -+ SUBDIVIDE(x0, y1, 2); -+ SUBDIVIDE(x1, y1, 3); -+ -+#undef SUBDIVIDE -+ } else { -+ // If trafo_size == 2 then we should have cbf_c == 0 here but as we can't have -+ // trafo_size == 2 with depth == 0 the issue is moot -+ const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) || -+ ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth)); -+ -+ ret = hls_transform_unit(s, lc, x0, y0, -+ log2_trafo_size + trafo_depth, log2_trafo_size, -+ blk_idx, cbf_luma, cbf_c1); -+ if (ret < 0) -+ return ret; -+ -+ if (!s->sh.disable_deblocking_filter_flag) { -+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma); -+ } -+ } -+ return 0; -+} -+ -+ -+static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) -+{ -+ GetBitContext gb; -+ int ret; -+ -+ ret = init_get_bits(&gb, pcm, length); -+ if (ret < 0) -+ return ret; -+ -+ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), -+ frame_stride1(s->frame, 0), -+ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); -+ -+ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)), -+ s->frame->linesize[1], -+ cb_size >> ctx_hshift(s, 1), -+ cb_size >> 
ctx_vshift(s, 1), -+ &gb, s->ps.sps->pcm.bit_depth_chroma); -+ -+ return 0; -+} -+ -+ -+// x * 2^(y*2) -+static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) -+{ -+ return x << (y * 2); -+} -+ -+static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size) -+{ -+ // Length in bits -+ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + -+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) + -+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2)); -+ -+ const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, (length + 7) >> 3); -+ -+ if (!s->sh.disable_deblocking_filter_flag) -+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); -+ -+ // Copy coeffs -+ { -+ const int blen = (length + 7) >> 3; -+ // Round allocated bytes up to nearest 32 to avoid alignment confusion -+ // Allocation is in int16_t s -+ // As we are only using 1 byte per sample and the coeff buffer allows 2 per -+ // sample this rounding doesn't affect the total size we need to allocate for -+ // the coeff buffer -+ int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1); -+ memcpy(coeffs, pcm, blen); -+ -+ // Our coeff stash assumes that any partially allocated 64byte lump -+ // is zeroed so make that true. 
-+ { -+ uint8_t * const eopcm = (uint8_t *)coeffs + blen; -+ if ((-(intptr_t)eopcm & 63) != 0) -+ memset(eopcm, 0, -(intptr_t)eopcm & 63); -+ } -+ -+ // Add command -+ { -+ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); -+ cmd->type = RPI_PRED_I_PCM; -+ cmd->size = log2_cb_size; -+ cmd->i_pcm.src = coeffs; -+ cmd->i_pcm.x = x0; -+ cmd->i_pcm.y = y0; -+ cmd->i_pcm.src_len = length; -+ } -+ return 0; -+ } -+} -+ -+ -+static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref, -+ const MvXY xy, const int y0, const int height) -+{ -+ if (s->threads_type != 0) { -+ const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9); -+ -+ // Progress has to be attached to current job as the actual wait -+ // is in worker_core which can't use lc -+ int16_t *const pr = lc->jb0->progress_req + ref->dpb_no; -+ if (*pr < y) { -+ *pr = y; -+ } -+ } -+} -+ -+static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x0, const int y0, const int nPbW, -+ const int nPbH, -+ HEVCRpiMvField * const mv) -+{ -+ enum InterPredIdc inter_pred_idc = PRED_L0; -+ int mvp_flag; -+ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH); -+ -+ mv->pred_flag = 0; -+ if (s->sh.slice_type == HEVC_SLICE_B) -+ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH); -+ -+ if (inter_pred_idc != PRED_L1) { -+ MvXY mvd; -+ -+ if (s->sh.nb_refs[L0]) -+ mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]); -+ -+ mv->pred_flag = PF_L0; -+ mvd = ff_hevc_rpi_hls_mvd_coding(lc); -+ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); -+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, -+ mv, mvp_flag, 0); -+ mv->xy[0] = mvxy_add(mv->xy[0], mvd); -+ } -+ -+ if (inter_pred_idc != PRED_L0) { -+ MvXY mvd = 0; -+ -+ if (s->sh.nb_refs[L1]) -+ mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]); -+ -+ if 
(s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI) -+ mvd = ff_hevc_rpi_hls_mvd_coding(lc); -+ -+ mv->pred_flag += PF_L1; -+ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); -+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, -+ mv, mvp_flag, 1); -+ mv->xy[1] = mvxy_add(mv->xy[1], mvd); -+ } -+} -+ -+ -+static HEVCRpiInterPredQ * -+rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) -+{ -+ HEVCRpiInterPredQ * yp = NULL; -+ HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr; -+ const unsigned int max_fill = ipe->max_fill; -+ unsigned int load = UINT_MAX; -+ -+ for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) { -+ // We will always have enough room between the Qs but if we are -+ // running critically low due to poor scheduling then use fill size -+ // rather than load to determine QPU. This has obvious dire -+ // performance implications but (a) it is better than crashing -+ // and (b) it should (almost) never happen -+ const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base; -+ const unsigned int tload = tfill > max_fill ? 
tfill + 0x1000000 : ypt->load; -+ -+ if (tload < load) -+ { -+ yp = ypt; -+ load = tload; -+ } -+ } -+ -+ yp->load += load_val; -+ ipe->used_grp = 1; -+ qpu_mc_link_set(yp->qpu_mc_curr, fn); -+ -+ return yp; -+} -+ -+ -+static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) -+{ -+ for (unsigned int i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const q = ipe->q + i; -+ const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base; -+ -+ qpu_mc_link_set(q->qpu_mc_curr, q->code_sync); -+ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(&q->qpu_mc_curr->sync + 1); -+ q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage -+ } -+} -+ -+// Returns 0 on success -+// We no longer check for Q fullness as wew have emergncy code in ctu alloc -+// * However it might be an idea to have some means of spotting that we've used it -+static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) -+{ -+ if (!ipe->used_grp) -+ return 0; -+ -+ if ((ipe->curr += ipe->n_grp) >= ipe->n) -+ { -+ ipe->curr = 0; -+ rpi_inter_pred_sync(ipe); -+ } -+ ipe->used = 1; -+ ipe->used_grp = 0; -+ -+ return 0; -+} -+ -+static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) -+{ -+ unsigned int i; -+ -+ ipe->curr = 0; -+ ipe->used = 0; -+ ipe->used_grp = 0; -+ for (i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const q = ipe->q + i; -+ q->qpu_mc_curr = q->qpu_mc_base; -+ q->load = 0; -+ q->last_l0 = NULL; -+ q->last_l1 = NULL; -+ } -+} -+ -+static int rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, -+ const unsigned int n_max, const unsigned int n_grp, -+ const unsigned int total_size, const unsigned int min_gap) -+{ -+ int rv; -+ -+ memset(ipe, 0, sizeof(*ipe)); -+ if ((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) == NULL) -+ return AVERROR(ENOMEM); -+ -+ ipe->n_grp = n_grp; -+ ipe->min_gap = min_gap; -+ -+ if ((rv = gpu_malloc_cached(total_size, &ipe->gptr)) != 0) -+ av_freep(&ipe->q); -+ return rv; -+} -+ -+ -+#if 
RPI_QPU_EMU_Y -+#define get_mc_address_y(f) ((f)->data[0]) -+#else -+#define get_mc_address_y(f) get_vc_address_y(f) -+#endif -+#if RPI_QPU_EMU_C -+#define get_mc_address_u(f) ((f)->data[1]) -+#else -+#define get_mc_address_u(f) get_vc_address_u(f) -+#endif -+ -+static inline uint32_t pack_wo_p(const int off, const int mul) -+{ -+ return PACK2(off * 2 + 1, mul); -+} -+ -+static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul) -+{ -+ return PACK2(off0 + off1 + 1, mul); -+} -+ -+ -+static void -+rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb, -+ const int x0, const int y0, -+ const int nPbW, const int nPbH, -+ const MvXY mv_xy, -+ const int weight_mul, -+ const int weight_offset, -+ AVFrame *const src_frame) -+{ -+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); -+ const unsigned int mx = MV_X(mv_xy) & 3; -+ const unsigned int my = MV_Y(mv_xy) & 3; -+ const unsigned int my_mx = (my << 8) | mx; -+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; -+ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); -+ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; -+ const uint32_t wo = pack_wo_p(weight_offset, weight_mul); -+ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); -+ -+ if (my_mx == 0) -+ { -+ const int x1 = x0 + (MV_X(mv_xy) >> 2); -+ const int y1 = y0 + (MV_Y(mv_xy) >> 2); -+ const int bh = nPbH; -+ -+ for (int start_x = 0; start_x < nPbW; start_x += 16) -+ { -+ const int bw = FFMIN(nPbW - start_x, 16); -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; -+ -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; -+ ++ts->y_pred1_x0y0; -+ -+ if (nPbW > 8) -+ ++ts->y_pred1_wgt8; -+ else -+ ++ts->y_pred1_wle8; -+ -+ if (nPbH > 16) -+ ++ts->y_pred1_hgt16; -+ else -+ 
++ts->y_pred1_hle16; -+ } -+#endif -+ -+ src1->x = x1 + start_x; -+ src1->y = y1; -+ src1->base = src_vc_address_y; -+ cmd_y->w = bw; -+ cmd_y->h = bh; -+ cmd_y->wo1 = wo; -+ cmd_y->dst_addr = dst_addr + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ } -+ } -+ else -+ { -+ const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3; -+ const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3; -+ const unsigned int bh = nPbH; -+ int start_x = 0; -+ -+#if 1 -+ // As Y-pred operates on two independant 8-wide src blocks we can merge -+ // this pred with the previous one if it the previous one is 8 pel wide, -+ // the same height as the current block, immediately to the left of our -+ // current dest block and mono-pred. -+ -+ qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p; -+ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) -+ { -+ const int bw = FFMIN(nPbW, 8); -+ qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1; -+ -+ last_y8_src2->x = x1_m3; -+ last_y8_src2->y = y1_m3; -+ last_y8_src2->base = src_vc_address_y; -+ last_y8_p->w += bw; -+ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); -+ last_y8_p->wo2 = wo; -+ -+ jb->last_y8_p = NULL; -+ jb->last_y8_l1 = NULL; -+ start_x = bw; -+#if RPI_TSTATS -+ ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge; -+#endif -+ } -+#endif -+ -+ for (; start_x < nPbW; start_x += 16) -+ { -+ const int bw = FFMIN(nPbW - start_x, 16); -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; -+ if (mx == 0 && my == 0) -+ ++ts->y_pred1_x0y0; -+ else if (mx == 0) -+ ++ts->y_pred1_x0; -+ else if (my == 0) -+ ++ts->y_pred1_y0; -+ else -+ ++ts->y_pred1_xy; -+ -+ if (nPbW > 8) -+ ++ts->y_pred1_wgt8; -+ else -+ 
++ts->y_pred1_wle8; -+ -+ if (nPbH > 16) -+ ++ts->y_pred1_hgt16; -+ else -+ ++ts->y_pred1_hle16; -+ } -+#endif -+ src1->x = x1_m3 + start_x; -+ src1->y = y1_m3; -+ src1->base = src_vc_address_y; -+ if (bw <= 8) -+ { -+ src2->x = MC_DUMMY_X; -+ src2->y = MC_DUMMY_Y; -+#if RPI_QPU_EMU_Y -+ src2->base = s->qpu_dummy_frame_emu; -+#else -+ src2->base = s->qpu_dummy_frame_qpu; -+#endif -+ } -+ else -+ { -+ src2->x = x1_m3 + start_x + 8; -+ src2->y = y1_m3; -+ src2->base = src_vc_address_y; -+ } -+ cmd_y->w = bw; -+ cmd_y->h = bh; -+ cmd_y->mymx21 = my2_mx2_my_mx; -+ cmd_y->wo1 = wo; -+ cmd_y->wo2 = wo; -+ cmd_y->dst_addr = dst_addr + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ -+ if (bw == 8) { -+ jb->last_y8_l1 = src2; -+ jb->last_y8_p = cmd_y; -+ } -+ } -+ } -+} -+ -+static void -+rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const int x0, const int y0, -+ const int nPbW, const int nPbH, -+ const struct HEVCRpiMvField *const mv_field, -+ const AVFrame *const src_frame, -+ const AVFrame *const src_frame2) -+{ -+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); -+ const MvXY mv = mv_field->xy[0]; -+ const MvXY mv2 = mv_field->xy[1]; -+ -+ const unsigned int mx = MV_X(mv) & 3; -+ const unsigned int my = MV_Y(mv) & 3; -+ const unsigned int my_mx = (my<<8) | mx; -+ const unsigned int mx2 = MV_X(mv2) & 3; -+ const unsigned int my2 = MV_Y(mv2) & 3; -+ const unsigned int my2_mx2 = (my2<<8) | mx2; -+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; -+ const unsigned int ref_idx0 = mv_field->ref_idx[0]; -+ const unsigned int ref_idx1 = mv_field->ref_idx[1]; -+ const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]); -+ const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]); -+ -+ 
const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); -+ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; -+ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); -+ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); -+ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; -+ -+ if (my2_mx2_my_mx == 0) -+ { -+ const int x1 = x0 + (MV_X(mv) >> 2); -+ const int y1 = y0 + (MV_Y(mv) >> 2); -+ const int x2 = x0 + (MV_X(mv2) >> 2); -+ const int y2 = y0 + (MV_Y(mv2) >> 2); -+ const int bh = nPbH; -+ -+ // Can do chunks a full 16 wide if we don't want the H filter -+ for (int start_x=0; start_x < nPbW; start_x += 16) -+ { -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; -+ ++ts->y_pred2_x0y0; -+ -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } -+#endif -+ src1->x = x1 + start_x; -+ src1->y = y1; -+ src1->base = src1_base; -+ src2->x = x2 + start_x; -+ src2->y = y2; -+ src2->base = src2_base; -+ cmd_y->w = FFMIN(nPbW - start_x, 16); -+ cmd_y->h = bh; -+ cmd_y->mymx21 = 0; -+ cmd_y->wo1 = wo1; -+ cmd_y->wo2 = wo2; -+ cmd_y->dst_addr = dst + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ } -+ } -+ else -+ { -+ // Filter requires a run-up of 3 -+ const int x1 = x0 + (MV_X(mv) >> 2) - 3; -+ const int y1 = y0 + (MV_Y(mv) >> 2) - 3; -+ const int x2 = x0 + (MV_X(mv2) >> 2) - 3; -+ const int y2 = y0 + (MV_Y(mv2) >> 2) - 3; -+ const int bh = nPbH; -+ -+ for (int start_x=0; start_x < nPbW; start_x += 8) -+ { // B blocks work 8 at a time -+ // B weights aren't doubled as the QPU code does the same -+ // amount of work as it does for P -+ HEVCRpiInterPredQ *const yp = 
rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; -+ const unsigned int mmx = mx | mx2; -+ const unsigned int mmy = my | my2; -+ if (mmx == 0 && mmy == 0) -+ ++ts->y_pred2_x0y0; -+ else if (mmx == 0) -+ ++ts->y_pred2_x0; -+ else if (mmy == 0) -+ ++ts->y_pred2_y0; -+ else -+ ++ts->y_pred2_xy; -+ -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } -+#endif -+ src1->x = x1 + start_x; -+ src1->y = y1; -+ src1->base = src1_base; -+ src2->x = x2 + start_x; -+ src2->y = y2; -+ src2->base = src2_base; -+ cmd_y->w = FFMIN(nPbW - start_x, 8); -+ cmd_y->h = bh; -+ cmd_y->mymx21 = my2_mx2_my_mx; -+ cmd_y->wo1 = wo1; -+ cmd_y->wo2 = wo2; -+ cmd_y->dst_addr = dst + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ } -+ } -+} -+ -+// h/v shifts fixed at one as that is all the qasm copes with -+static void -+rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const unsigned int lx, const int x0_c, const int y0_c, -+ const int nPbW_c, const int nPbH_c, -+ const MvXY mv, -+ const int16_t * const c_weights, -+ const int16_t * const c_offsets, -+ AVFrame * const src_frame) -+{ -+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); -+ const int hshift = 1; // = s->ps.sps->hshift[1]; -+ const int vshift = 1; // = s->ps.sps->vshift[1]; -+ -+ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; -+ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); -+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)]; -+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - 
vshift)]; -+ const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]); -+ const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]); -+ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; -+ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; -+ const unsigned int bh = nPbH_c; -+ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; -+ -+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) -+ { -+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); -+ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; -+ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1; -+ qpu_mc_src_t * const last_lx = *plast_lx; -+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ last_lx->x = x1_c + start_x; -+ last_lx->y = y1_c; -+ last_lx->base = src_base_u; -+ cmd_c->h = bh; -+ cmd_c->w = bw; -+ cmd_c->coeffs_x = x_coeffs; -+ cmd_c->coeffs_y = y_coeffs; -+ cmd_c->wo_u = wo_u; -+ cmd_c->wo_v = wo_v; -+ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); -+ *plast_lx = &cmd_c->next_src; -+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); -+ } -+ return; -+} -+ -+// h/v shifts fixed at one as that is all the qasm copes with -+static void -+rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const int x0_c, const int y0_c, -+ const int nPbW_c, const int nPbH_c, -+ const struct HEVCRpiMvField * const mv_field, -+ const int16_t * const c_weights, -+ const int16_t * const c_offsets, -+ const int16_t * const c_weights2, -+ const int16_t * const c_offsets2, -+ AVFrame * const src_frame, -+ AVFrame * const src_frame2) -+{ -+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); -+ const int hshift = 1; // s->ps.sps->hshift[1]; -+ const int vshift = 1; // s->ps.sps->vshift[1]; -+ const MvXY mv = mv_field->xy[0]; -+ const MvXY mv2 = mv_field->xy[1]; -+ -+ const unsigned int mx = 
av_mod_uintp2(MV_X(mv), 2 + hshift); -+ const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift); -+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; -+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector -+ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; -+ -+ const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift); -+ const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift); -+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; -+ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector -+ -+ const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1; -+ const int y2_c = y0_c + (MV_Y(mv2) >> (2 + hshift)) - 1; -+ -+ const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]); -+ const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]); -+ -+ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; -+ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); -+ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); -+ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; -+ const unsigned int bh = nPbH_c; -+ -+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) -+ { -+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); -+ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; -+ qpu_mc_src_t * const src_l0 = cp->last_l0; -+ qpu_mc_src_t * const src_l1 = cp->last_l1; -+ -+ src_l0->x = x1_c + start_x; -+ src_l0->y = y1_c; -+ src_l0->base = src1_base; -+ src_l1->x = x2_c + start_x; -+ src_l1->y = y2_c; -+ src_l1->base = src2_base; -+ -+ u[0].h = bh; -+ u[0].w = bw; -+ u[0].coeffs_x1 = coefs0_x; -+ u[0].coeffs_y1 = 
coefs0_y; -+ u[0].weight_u1 = c_weights[0]; // Weight L0 U -+ u[0].weight_v1 = c_weights[1]; // Weight L0 V -+ u[0].coeffs_x2 = coefs1_x; -+ u[0].coeffs_y2 = coefs1_y; -+ u[0].wo_u2 = wo_u2; -+ u[0].wo_v2 = wo_v2; -+ u[0].dst_addr_c = dst_base_u + (start_x << xshl); -+ -+ cp->last_l0 = &u[0].next_src1; -+ cp->last_l1 = &u[0].next_src2; -+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); -+ } -+} -+ -+ -+static inline void -+col_stash(const HEVCRpiContext * const s, -+ const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0, -+ const HEVCRpiMvField * const mvf) -+{ -+ ColMvField * const col_mvf = s->ref->col_mvf; -+ const unsigned int x = (x0 + 15) >> 4; -+ const unsigned int y = (y0 + 15) >> 4; -+ const unsigned int w = ((x0 + 15 + w0) >> 4) - x; -+ const unsigned int h = ((y0 + 15 + h0) >> 4) - y; -+ -+ if (col_mvf != NULL && w != 0 && h != 0) -+ { -+ // Only record MV from the top left of the 16x16 block -+ -+ const RefPicList * const rpl = s->refPicList; -+ const ColMvField cmv = { -+ .L = { -+ { -+ .poc = (mvf->pred_flag & PF_L0) == 0 ? -+ COL_POC_INTRA : -+ COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]), -+ .xy = mvf->xy[0] -+ }, -+ { -+ .poc = (mvf->pred_flag & PF_L1) == 0 ? 
-+ COL_POC_INTRA : -+ COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]), -+ .xy = mvf->xy[1] -+ } -+ } -+ }; -+ -+ ColMvField * p = col_mvf + y * s->col_mvf_stride + x; -+ const unsigned int stride = s->col_mvf_stride - w; -+ unsigned int j = h; -+ -+ do -+ { -+ unsigned int k = w; -+ do -+ { -+ *p++ = cmv; -+ } while (--k != 0); -+ p += stride; -+ } while (--j != 0); -+ } -+} -+ -+static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int nPbW, const unsigned int nPbH, -+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) -+{ -+ HEVCRpiJob * const jb = lc->jb0; -+ -+ struct HEVCRpiMvField current_mv = {{0}}; -+ const RefPicList *const refPicList = s->refPicList; -+ const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL; -+ -+ if (lc->cu.pred_mode != MODE_SKIP) -+ lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc); -+ -+ if (lc->cu.pred_mode == MODE_SKIP || lc->pu.merge_flag) { -+ const unsigned int merge_idx = s->sh.max_num_merge_cand <= 1 ? 
0 : -+ ff_hevc_rpi_merge_idx_decode(s, lc); -+ -+ ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, -+ partIdx, merge_idx, ¤t_mv); -+ } else { -+ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, ¤t_mv); -+ } -+ -+ { -+ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); -+ unsigned int i, j; -+ -+ for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++) -+ { -+ for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++) -+ p[i] = current_mv; -+ p += MVF_STASH_WIDTH_PU; -+ } -+ } -+ -+ col_stash(s, x0, y0, nPbW, nPbH, ¤t_mv); -+ -+ if (current_mv.pred_flag & PF_L0) { -+ ref0 = refPicList[0].ref[current_mv.ref_idx[0]]; -+ if (!ref0) -+ return; -+ hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH); -+ } -+ if (current_mv.pred_flag & PF_L1) { -+ ref1 = refPicList[1].ref[current_mv.ref_idx[1]]; -+ if (!ref1) -+ return; -+ hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH); -+ } -+ -+ if (current_mv.pred_flag == PF_L0) { -+ const int x0_c = x0 >> ctx_hshift(s, 1); -+ const int y0_c = y0 >> ctx_vshift(s, 1); -+ const int nPbW_c = nPbW >> ctx_hshift(s, 1); -+ const int nPbH_c = nPbH >> ctx_vshift(s, 1); -+ -+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0], -+ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], -+ ref0->frame); -+ -+ if (ctx_cfmt(s) != 0) { -+ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0], -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], -+ ref0->frame); -+ return; -+ } -+ } else if (current_mv.pred_flag == PF_L1) { -+ const int x0_c = x0 >> ctx_hshift(s, 1); -+ const int y0_c = y0 >> ctx_vshift(s, 1); -+ const int nPbW_c = nPbW >> ctx_hshift(s, 1); -+ const int nPbH_c = nPbH >> ctx_vshift(s, 1); -+ -+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1], -+ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], -+ ref1->frame); -+ -+ if (ctx_cfmt(s) != 0) { -+ rpi_pred_c(s, jb, 1, x0_c, 
y0_c, nPbW_c, nPbH_c, current_mv.xy[1], -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], -+ ref1->frame); -+ return; -+ } -+ } else if (current_mv.pred_flag == PF_BI) { -+ const int x0_c = x0 >> ctx_hshift(s, 1); -+ const int y0_c = y0 >> ctx_vshift(s, 1); -+ const int nPbW_c = nPbW >> ctx_hshift(s, 1); -+ const int nPbH_c = nPbH >> ctx_vshift(s, 1); -+ -+ rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, ¤t_mv, ref0->frame, ref1->frame); -+ -+ if (ctx_cfmt(s) != 0) { -+ rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c, -+ ¤t_mv, -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], -+ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], -+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], -+ ref0->frame, -+ ref1->frame); -+ return; -+ } -+ } -+} -+ -+static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_cb_size, -+ const unsigned int ipm) -+{ -+ const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE; -+ const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE; -+ -+ { -+ const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE)); -+ set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm); -+ } -+ -+ // If IRAP then everything is Intra & we avoid ever looking at these -+ // stashes so don't bother setting them -+ if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA) -+ { -+ if (s->is_intra != NULL) -+ { -+ set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE); -+ } -+ -+ { -+ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); -+ const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1 -+ unsigned int n = size_in_pus; -+ -+ do -+ { -+ memset(p, 0, size_in_pus * sizeof(*p)); -+ p += 
MVF_STASH_WIDTH_PU; -+ } while (--n != 0); -+ } -+ -+ -+ if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0) -+ { -+ // Only record top left stuff -+ // Blocks should always be alinged on size boundries -+ // so cannot have overflow from a small block -+ -+ ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4); -+ const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4)); -+ const unsigned int stride = s->col_mvf_stride - size_in_col; -+ unsigned int j = size_in_col; -+ -+ do -+ { -+ unsigned int k = size_in_col; -+ do -+ { -+ p->L[0].poc = COL_POC_INTRA; -+ p->L[0].xy = 0; -+ p->L[1].poc = COL_POC_INTRA; -+ p->L[1].xy = 0; -+ ++p; -+ } while (--k != 0); -+ p += stride; -+ } while (--j != 0); -+ } -+ } -+} -+ -+static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_cb_size) -+{ -+ set_ipm(s, lc, x0, y0, log2_cb_size, INTRA_DC); -+} -+ -+ -+/** -+ * 8.4.1 -+ */ -+static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ int x0, int y0, int log2_pu_size, -+ int prev_intra_luma_pred_flag, -+ const unsigned int idx) -+{ -+ const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size); -+ const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE; -+ const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE; -+ -+ // Up does not cross boundries so as we always scan 1 slice-tile-line in an -+ // lc we can just keep 1 CTB lR stashes -+ // Left is reset to DC @ Start of Line/Tile/Slice in fill_job -+ const unsigned int cand_up = yb_pu == 0 ? 
INTRA_DC : lc->ipm_up[xb_pu]; -+ const unsigned int cand_left = lc->ipm_left[yb_pu]; -+ -+ unsigned int intra_pred_mode; -+ unsigned int a, b, c; -+ -+ if (cand_left == cand_up) { -+ if (cand_left < 2) { -+ a = INTRA_PLANAR; -+ b = INTRA_DC; -+ c = INTRA_ANGULAR_26; -+ } else { -+ a = cand_left; -+ b = 2 + ((cand_left - 2 - 1 + 32) & 31); -+ c = 2 + ((cand_left - 2 + 1) & 31); -+ } -+ } else { -+ a = cand_left; -+ b = cand_up; -+ c = (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR) ? -+ INTRA_PLANAR : -+ (cand_left != INTRA_DC && cand_up != INTRA_DC) ? -+ INTRA_DC : -+ INTRA_ANGULAR_26; -+ } -+ -+ if (prev_intra_luma_pred_flag) { -+ intra_pred_mode = idx == 0 ? a : idx == 1 ? b : c; -+ } else { -+ // Sort lowest 1st -+ if (a > b) -+ FFSWAP(int, a, b); -+ if (a > c) -+ FFSWAP(int, a, c); -+ if (b > c) -+ FFSWAP(int, b, c); -+ -+ intra_pred_mode = idx; -+ if (intra_pred_mode >= a) -+ intra_pred_mode++; -+ if (intra_pred_mode >= b) -+ intra_pred_mode++; -+ if (intra_pred_mode >= c) -+ intra_pred_mode++; -+ } -+ -+ /* write the intra prediction units into the mv array */ -+ set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode); -+ return intra_pred_mode; -+} -+ -+static const uint8_t tab_mode_idx[] = { -+ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20, -+ 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31}; -+ -+static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_cb_size) -+{ -+ static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 }; -+ uint8_t prev_intra_luma_pred_flag[4]; -+ int split = lc->cu.part_mode == PART_NxN; -+ const unsigned int split_size = (1 << (log2_cb_size - 1)); -+ int chroma_mode; -+ const unsigned int n = split ? 
4 : 1; -+ unsigned int i; -+ -+ for (i = 0; i != n; i++) -+ prev_intra_luma_pred_flag[i] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc); -+ -+ for (i = 0; i < n; i++) { -+ // depending on mode idx is mpm or luma_pred_mode -+ const unsigned int idx = prev_intra_luma_pred_flag[i] ? -+ ff_hevc_rpi_mpm_idx_decode(lc) : -+ ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc); -+ -+ lc->pu.intra_pred_mode[i] = -+ luma_intra_pred_mode(s, lc, -+ x0 + ((i & 1) == 0 ? 0 : split_size), -+ y0 + ((i & 2) == 0 ? 0 : split_size), -+ log2_cb_size - split, -+ prev_intra_luma_pred_flag[i], idx); -+ } -+ -+ if (ctx_cfmt(s) == 3) { -+ for (i = 0; i < n; i++) { -+ lc->pu.chroma_mode_c[i] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); -+ if (chroma_mode != 4) { -+ if (lc->pu.intra_pred_mode[i] == intra_chroma_table[chroma_mode]) -+ lc->pu.intra_pred_mode_c[i] = 34; -+ else -+ lc->pu.intra_pred_mode_c[i] = intra_chroma_table[chroma_mode]; -+ } else { -+ lc->pu.intra_pred_mode_c[i] = lc->pu.intra_pred_mode[i]; -+ } -+ } -+ } else if (ctx_cfmt(s) == 2) { -+ int mode_idx; -+ lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); -+ if (chroma_mode != 4) { -+ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) -+ mode_idx = 34; -+ else -+ mode_idx = intra_chroma_table[chroma_mode]; -+ } else { -+ mode_idx = lc->pu.intra_pred_mode[0]; -+ } -+ lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx]; -+ } else if (ctx_cfmt(s) != 0) { -+ chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); -+ if (chroma_mode != 4) { -+ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) -+ lc->pu.intra_pred_mode_c[0] = 34; -+ else -+ lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode]; -+ } else { -+ lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0]; -+ } -+ } -+} -+ -+static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, const 
unsigned int log2_cb_size) -+{ -+ const unsigned int cb_size = 1 << log2_cb_size; -+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; -+ const unsigned int min_cb_width = s->ps.sps->min_cb_width; -+ const unsigned int x_cb = x0 >> log2_min_cb_size; -+ const unsigned int y_cb = y0 >> log2_min_cb_size; -+ const unsigned int idx = log2_cb_size - 2; -+ const unsigned int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1; -+ int skip_flag = 0; -+ -+ lc->cu.x = x0; -+ lc->cu.y = y0; -+ lc->cu.x_split = x0; -+ lc->cu.y_split = y0; -+ -+ lc->cu.pred_mode = MODE_INTRA; -+ lc->cu.part_mode = PART_2Nx2N; -+ lc->cu.intra_split_flag = 0; -+ lc->cu.cu_transquant_bypass_flag = 0; -+ lc->pu.intra_pred_mode[0] = 1; -+ lc->pu.intra_pred_mode[1] = 1; -+ lc->pu.intra_pred_mode[2] = 1; -+ lc->pu.intra_pred_mode[3] = 1; -+ -+ if (s->ps.pps->transquant_bypass_enable_flag) { -+ lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc); -+ if (lc->cu.cu_transquant_bypass_flag) -+ set_deblocking_bypass(s, x0, y0, log2_cb_size); -+ } -+ -+ if (s->sh.slice_type != HEVC_SLICE_I) { -+ lc->cu.pred_mode = MODE_INTER; -+ skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb); -+ } -+ -+ if (skip_flag) { -+ lc->cu.pred_mode = MODE_SKIP; -+ -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); -+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); -+ -+ if (!s->sh.disable_deblocking_filter_flag) -+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); -+ } else { -+ int pcm_flag = 0; -+ -+ if (s->sh.slice_type != HEVC_SLICE_I) -+ lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc); -+ if (lc->cu.pred_mode != MODE_INTRA || -+ log2_cb_size == s->ps.sps->log2_min_cb_size) { -+ lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size); -+ lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN && -+ lc->cu.pred_mode == MODE_INTRA; -+ } -+ -+ if (lc->cu.pred_mode == 
MODE_INTRA) { -+ if (lc->cu.part_mode == PART_2Nx2N && -+ log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size && // 0 if not enabled -+ log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size && -+ ff_hevc_rpi_pcm_flag_decode(lc) != 0) -+ { -+ int ret; -+ pcm_flag = 1; -+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); -+ if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0) -+ return ret; -+ -+ if (s->ps.sps->pcm.loop_filter_disable_flag) -+ set_deblocking_bypass(s, x0, y0, log2_cb_size); -+ } else { -+ intra_prediction_unit(s, lc, x0, y0, log2_cb_size); -+ } -+ } else { -+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); -+ switch (lc->cu.part_mode) { -+ case PART_2Nx2N: -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); -+ break; -+ case PART_2NxN: -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx); -+ lc->cu.y_split = y0 + cb_size / 2; -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx); -+ break; -+ case PART_Nx2N: -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1); -+ lc->cu.x_split = x0 + cb_size / 2; -+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1); -+ break; -+ case PART_2NxnU: -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx); -+ lc->cu.y_split = y0 + cb_size / 4; -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx); -+ break; -+ case PART_2NxnD: -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx); -+ lc->cu.y_split = y0 + cb_size / 4 * 3; -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size / 4, log2_cb_size, 1, idx); -+ break; -+ case PART_nLx2N: -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2); -+ lc->cu.x_split = x0 + cb_size / 4; -+ hls_prediction_unit(s, lc, 
x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2); -+ break; -+ case PART_nRx2N: -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 4 * 3, cb_size, log2_cb_size, 0, idx - 2); -+ lc->cu.x_split = x0 + cb_size / 4 * 3; -+ hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2); -+ break; -+ case PART_NxN: -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1); -+ lc->cu.x_split = x0 + cb_size / 2; -+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1); -+ lc->cu.y_split = y0 + cb_size / 2; -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1); -+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1); -+ break; -+ } -+ } -+ -+ if (!pcm_flag) { -+ int rqt_root_cbf = 1; -+ -+ if (lc->cu.pred_mode != MODE_INTRA && -+ !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) { -+ rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc); -+ } -+ if (rqt_root_cbf) { -+ const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0); -+ int ret; -+ -+ lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ? 
-+ s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag : -+ s->ps.sps->max_transform_hierarchy_depth_inter; -+ // transform_tree does deblock_boundary_strengths -+ ret = hls_transform_tree(s, lc, x0, y0, -+ log2_cb_size, 0, 0, cbf_c); -+ if (ret < 0) -+ return ret; -+ } else { -+ if (!s->sh.disable_deblocking_filter_flag) -+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); -+ } -+ } -+ } -+ -+ // If the delta is still wanted then we haven't read the delta & therefore need to set qp here -+ if (lc->tu.is_cu_qp_delta_wanted) -+ ff_hevc_rpi_set_qPy(s, lc, x0, y0); -+ -+ if(((x0 + (1<qPy_pred = lc->qp_y; -+ } -+ -+ set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff); -+ -+ set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag); -+ -+ return 0; -+} -+ -+// Returns: -+// < 0 Error -+// 0 More data wanted -+// 1 EoSlice / EoPicture -+static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, -+ const int log2_cb_size, const unsigned int cb_depth) -+{ -+ const int cb_size = 1 << log2_cb_size; -+ int ret; -+ int split_cu; -+ -+ lc->ct_depth = cb_depth; -+ split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size); -+ if (x0 + cb_size <= s->ps.sps->width && -+ y0 + cb_size <= s->ps.sps->height && -+ split_cu) -+ { -+ split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0); -+ } -+ -+ // Qp delta (and offset) need to remain wanted if cb_size < min until -+ // a coded block is found so we still initial state at depth 0 (outside -+ // this fn) and only reset here -+ if (s->ps.pps->cu_qp_delta_enabled_flag && -+ log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size) -+ { -+ lc->tu.is_cu_qp_delta_wanted = 1; -+ lc->tu.cu_qp_delta = 0; -+ } -+ if (s->sh.cu_chroma_qp_offset_enabled_flag && -+ log2_cb_size >= 
s->ps.pps->log2_min_cu_qp_delta_size) -+ { -+ lc->tu.cu_chroma_qp_offset_wanted = 1; -+ } -+ -+ lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0]; -+ lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset; -+ lc->tu.qp_divmod6[2] = s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset; -+ -+ if (split_cu) { -+ int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1; -+ const int cb_size_split = cb_size >> 1; -+ const int x1 = x0 + cb_size_split; -+ const int y1 = y0 + cb_size_split; -+ -+ int more_data = 0; -+ -+ more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1); -+ if (more_data < 0) -+ return more_data; -+ -+ if (more_data && x1 < s->ps.sps->width) { -+ more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1); -+ if (more_data < 0) -+ return more_data; -+ } -+ if (more_data && y1 < s->ps.sps->height) { -+ more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1); -+ if (more_data < 0) -+ return more_data; -+ } -+ if (more_data && x1 < s->ps.sps->width && -+ y1 < s->ps.sps->height) { -+ more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1); -+ if (more_data < 0) -+ return more_data; -+ } -+ -+ if(((x0 + (1<qPy_pred = lc->qp_y; -+ -+ if (more_data) -+ return ((x1 + cb_size_split) < s->ps.sps->width || -+ (y1 + cb_size_split) < s->ps.sps->height); -+ else -+ return 0; -+ } else { -+ ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size); -+ if (ret < 0) -+ return ret; -+ if ((!((x0 + cb_size) % -+ (1 << (s->ps.sps->log2_ctb_size))) || -+ (x0 + cb_size >= s->ps.sps->width)) && -+ (!((y0 + cb_size) % -+ (1 << (s->ps.sps->log2_ctb_size))) || -+ (y0 + cb_size >= s->ps.sps->height))) { -+ int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc); -+ return !end_of_slice_flag; -+ } else { -+ return 1; -+ } -+ } -+ -+ return 0; // NEVER -+} -+ -+static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int 
x_ctb, const int y_ctb, const int ctb_addr_ts) -+{ -+ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; -+ const unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice -+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; -+ const unsigned int line_w = s->ps.sps->ctb_width; -+ -+ s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr; -+ -+ lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width); -+ lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height); -+ -+ lc->boundary_flags = 0; -+ -+ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0) -+ lc->boundary_flags |= BOUNDARY_LEFT_TILE; -+ if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1]) -+ lc->boundary_flags |= BOUNDARY_LEFT_SLICE; -+ if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0) -+ lc->boundary_flags |= BOUNDARY_UPPER_TILE; -+ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w]) -+ lc->boundary_flags |= BOUNDARY_UPPER_SLICE; -+ -+ // Use line width rather than tile width for addr_in_slice test as -+ // addr_in_slice is in raster units -+ -+ lc->ctb_avail = -+ ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) | -+ ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) | -+ ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && -+ (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) | -+ ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 && -+ (ctb_addr_rs_in_slice + 1 >= line_w) ? 
AVAIL_UR : 0); -+ // Down-left never avail at CTB level -+} -+ -+ -+static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ int y = ff_hevc_rpi_hls_filter_blk(s, jb->bounds, -+ (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0); -+ -+ // Signal -+ if (y > 0) { -+ // Cast away const as progress is held in s, but this really shouldn't confuse anything -+ ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1); -+ } -+ -+ // Job done now -+ // ? Move outside this fn -+ job_free(s->jbc, jb); -+} -+ -+// I-pred, transform_and_add for all blocks types done here -+// All ARM -+static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ unsigned int i; -+ HEVCRpiIntraPredEnv * const iap = &jb->intra; -+ const HEVCPredCmd *cmd = iap->cmds; -+ -+#if !RPI_WORKER_WAIT_PASS_0 -+ rpi_sem_wait(&jb->sem); -+ rpi_cache_flush_execute(jb->rfe); // Invalidate data set up in pass1 -+#endif -+ -+ for (i = iap->n; i > 0; i--, cmd++) -+ { -+ switch (cmd->type) -+ { -+ case RPI_PRED_INTRA: -+ s->hpc.intra_pred(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size); -+ break; -+ case RPI_PRED_INTRA_C: -+ s->hpc.intra_pred_c(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size); -+ break; -+ case RPI_PRED_ADD_RESIDUAL: -+ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); -+ break; -+ case RPI_PRED_ADD_DC: -+ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); -+ break; -+ case RPI_PRED_ADD_RESIDUAL_U: -+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); -+ break; -+ case RPI_PRED_ADD_RESIDUAL_V: -+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); -+ break; -+ case RPI_PRED_ADD_RESIDUAL_C: -+ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, 
cmd->ta.stride); -+ break; -+ case RPI_PRED_ADD_DC_U: -+ case RPI_PRED_ADD_DC_V: -+ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); -+ break; -+ -+ case RPI_PRED_I_PCM: -+ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); -+ break; -+ -+ default: -+ av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); -+ abort(); -+ } -+ } -+ -+ // Mark done -+ iap->n = 0; -+} -+ -+ -+// Set initial uniform job values & zero ctu_count -+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first) -+{ -+ unsigned int i; -+ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; -+ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; -+ const HEVCRpiSPS * const sps = s->ps.sps; -+ -+ const uint16_t pic_width_y = sps->width; -+ const uint16_t pic_height_y = sps->height; -+ -+ const uint16_t pic_width_c = sps->width >> ctx_hshift(s, 1); -+ const uint16_t pic_height_c = sps->height >> ctx_vshift(s, 1); -+ -+ // We expect the pointer to change if we use another sps -+ if (sps != jb->sps) -+ { -+ worker_pic_free_one(jb); -+ -+ set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma); -+ set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma); -+ -+ { -+ const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH; -+ const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1)); -+ worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma); -+ } -+ -+ jb->sps = sps; -+ } -+ -+ jb->waited = 0; -+ jb->ctu_ts_first = ctu_ts_first; -+ jb->ctu_ts_last = -1; -+ -+ rpi_inter_pred_reset(cipe); -+ for (i = 0; i < cipe->n; i++) { -+ HEVCRpiInterPredQ * const cp = cipe->q + i; -+ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; -+ -+ u->next_src1.x = 0; -+ u->next_src1.y = 0; -+ u->next_src1.base = 0; -+ u->pic_cw = pic_width_c; -+ u->pic_ch = pic_height_c; -+ u->stride2 = 
av_rpi_sand_frame_stride2(s->frame); -+ u->stride1 = av_rpi_sand_frame_stride1(s->frame); -+ cp->last_l0 = &u->next_src1; -+ -+ u->next_fn = 0; -+ u->next_src2.x = 0; -+ u->next_src2.y = 0; -+ u->next_src2.base = 0; -+ cp->last_l1 = &u->next_src2; -+ -+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); -+ } -+ -+ rpi_inter_pred_reset(yipe); -+ for (i = 0; i < yipe->n; i++) { -+ HEVCRpiInterPredQ * const yp = yipe->q + i; -+ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; -+ -+ y->next_src1.x = 0; -+ y->next_src1.y = 0; -+ y->next_src1.base = 0; -+ y->next_src2.x = 0; -+ y->next_src2.y = 0; -+ y->next_src2.base = 0; -+ y->pic_h = pic_height_y; -+ y->pic_w = pic_width_y; -+ y->stride2 = av_rpi_sand_frame_stride2(s->frame); -+ y->stride1 = av_rpi_sand_frame_stride1(s->frame); -+ y->next_fn = 0; -+ yp->last_l0 = &y->next_src1; -+ yp->last_l1 = &y->next_src2; -+ -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); -+ } -+ -+ jb->last_y8_p = NULL; -+ jb->last_y8_l1 = NULL; -+ -+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { -+ jb->progress_req[i] = -1; -+ } -+ -+ worker_pic_reset(&jb->coeffs); -+} -+ -+ -+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C -+static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s, -+ const vpu_qpu_job_h vqj, -+ rpi_cache_flush_env_t * const rfe, -+ HEVCRpiInterPredEnv * const ipe) -+{ -+ unsigned int i; -+ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; -+ unsigned int max_block = 0; -+ -+ if (!ipe->used) { -+ return 0; -+ } -+ -+ if (ipe->curr != 0) { -+ rpi_inter_pred_sync(ipe); -+ } -+ -+ // Add final commands to Q -+ for(i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const yp = ipe->q + i; -+ qpu_mc_src_t *const p0 = yp->last_l0; -+ qpu_mc_src_t *const p1 = yp->last_l1; -+ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; -+ -+ if (block_size > max_block) -+ max_block = block_size; -+ -+ qpu_mc_link_set(yp->qpu_mc_curr, yp->code_exit); -+ -+ // Need to set the srcs for L0 & L1 to 
something that can be (pointlessly) prefetched -+ p0->x = MC_DUMMY_X; -+ p0->y = MC_DUMMY_Y; -+ p0->base = s->qpu_dummy_frame_qpu; -+ p1->x = MC_DUMMY_X; -+ p1->y = MC_DUMMY_Y; -+ p1->base = s->qpu_dummy_frame_qpu; -+ -+ yp->last_l0 = NULL; -+ yp->last_l1 = NULL; -+ -+ // Add to mailbox list -+ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); -+ mail[i][1] = yp->code_setup; -+ } -+ -+ // We don't need invalidate here as the uniforms aren't changed by the QPU -+ // and leaving them in ARM cache avoids (pointless) pre-reads when writing -+ // new values which seems to give us a small performance advantage -+ // -+ // In most cases we will not have a completely packed set of uniforms and as -+ // we have a 2d invalidate we writeback all uniform Qs to the depth of the -+ // fullest -+ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, -+ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, -+ ipe->n, ipe->max_fill + ipe->min_gap); -+ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); -+ -+ return 1; -+} -+#endif -+ -+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C -+static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s, -+ const vpu_qpu_job_h vqj, -+ rpi_cache_flush_env_t * const rfe, -+ HEVCRpiInterPredEnv * const ipe) -+{ -+ unsigned int i; -+ if (!ipe->used) { -+ return 0; -+ } -+ -+ if (ipe->curr != 0) { -+ rpi_inter_pred_sync(ipe); -+ } -+ -+ // Add final commands to Q -+ for(i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const yp = ipe->q + i; -+ qpu_mc_src_t *const p0 = yp->last_l0; -+ qpu_mc_src_t *const p1 = yp->last_l1; -+ -+ yp->qpu_mc_curr->data[-1] = yp->code_exit; -+ -+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched -+ p0->x = MC_DUMMY_X; -+ p0->y = MC_DUMMY_Y; -+ p0->base = s->qpu_dummy_frame_emu; -+ p1->x = MC_DUMMY_X; -+ p1->y = MC_DUMMY_Y; -+ p1->base = s->qpu_dummy_frame_emu; -+ -+ yp->last_l0 = NULL; -+ yp->last_l1 = NULL; -+ } -+ -+ return 
1; -+} -+#endif -+ -+ -+#if RPI_QPU_EMU_Y -+#define mc_terminate_add_y mc_terminate_add_emu -+#else -+#define mc_terminate_add_y mc_terminate_add_qpu -+#endif -+#if RPI_QPU_EMU_C -+#define mc_terminate_add_c mc_terminate_add_emu -+#else -+#define mc_terminate_add_c mc_terminate_add_qpu -+#endif -+ -+ -+static void flush_frame(HEVCRpiContext *s,AVFrame *frame) -+{ -+ rpi_cache_buf_t cbuf; -+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); -+ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+ rpi_cache_flush_finish(rfe); -+} -+ -+static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first]; -+ const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last]; -+ const unsigned int ctb_width = s->ps.sps->ctb_width; -+ RpiBlk *const bounds = &jb->bounds; -+ av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last); -+ bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size; -+ bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size; -+ bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size; -+ bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size; -+ -+ bounds->w = FFMIN(bounds->w, s->ps.sps->width - bounds->x); -+ bounds->h = FFMIN(bounds->h, s->ps.sps->height - bounds->y); -+} -+ -+#if RPI_PASSES == 2 -+static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ // Perform intra prediction and residual reconstruction -+ rpi_execute_pred_cmds(s, jb); -+ -+ // Perform deblocking for CTBs in this row -+ rpi_execute_dblk_cmds(s, jb); -+} -+#endif -+ -+// Core execution tasks -+static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ int pred_y, pred_c; -+ vpu_qpu_job_env_t qvbuf; -+ const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf); -+#if RPI_WORKER_WAIT_PASS_0 -+ int do_wait; -+#endif -+ -+ { -+ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; -+ if 
(cf->s[3].n + cf->s[2].n != 0) -+ { -+ const unsigned int csize = sizeof(cf->s[3].buf[0]); -+ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; -+ unsigned int n16 = (cf->s[2].n >> 8); -+ unsigned int n32 = (cf->s[3].n >> 10); -+#if RPI_COMPRESS_COEFFS -+ if (cf->s[2].packed) { -+ n16 = n16 | (n16<<16); -+ } else { -+ const unsigned int npack16 = (cf->s[2].packed_n>>8); -+ n16 = n16 | (npack16<<16); -+ } -+ if (cf->s[3].packed) { -+ n32 = n32 | (n32<<16); -+ } else { -+ const unsigned int npack32 = (cf->s[3].packed_n>>10); -+ n32 = n32 | (npack32<<16); -+ } -+#endif -+ vpu_qpu_job_add_vpu(vqj, -+ vpu_get_fn(s->ps.sps->bit_depth), -+ vpu_get_constants(), -+ cf->gptr.vc, -+ n16, -+ cf->gptr.vc + offset32, -+ n32, -+ 0); -+ -+ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); -+ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); -+ } -+ } -+ -+ pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip); -+ -+// We could take a sync here and try to locally overlap QPU processing with ARM -+// but testing showed a slightly negative benefit with noticable extra complexity -+ -+ pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip); -+ -+ // Returns 0 if nothing to do, 1 if sync added -+#if RPI_WORKER_WAIT_PASS_0 -+ do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem); -+#else -+ if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0) -+ sem_post(&jb->sem); -+#endif -+ -+ rpi_cache_flush_execute(jb->rfe); -+ -+ // Await progress as required -+ // jb->waited will only be clear if we have already tested the progress values -+ // (in worker_submit_job) and found we don't have to wait -+ if (jb->waited) -+ { -+ unsigned int i; -+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { -+ if (jb->progress_req[i] >= 0) { -+ ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]); -+ } -+ } -+ } -+ -+ 
vpu_qpu_job_finish(vqj); -+ -+ // We always work on a rectangular block -+ if (pred_y || pred_c) -+ { -+ rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE, -+ jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h, -+ ctx_vshift(s, 1), pred_y, pred_c); -+ } -+ -+ // If we have emulated VPU ops - do it here -+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C -+ if (av_rpi_is_sand8_frame(s->frame)) -+ { -+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C -+ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip); -+#elif RPI_QPU_EMU_Y -+ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL); -+#else -+ ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip); -+#endif -+ } -+ else -+ { -+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C -+ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip); -+#elif RPI_QPU_EMU_Y -+ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL); -+#else -+ ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip); -+#endif -+ } -+#endif -+ -+#if RPI_WORKER_WAIT_PASS_0 -+ if (do_wait) -+ rpi_sem_wait(&jb->sem); -+ rpi_cache_flush_execute(jb->rfe); -+#endif -+} -+ -+ -+static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) -+{ -+ av_freep(&ipe->q); -+ gpu_free(&ipe->gptr); -+} -+ -+static HEVCRpiJob * job_new(void) -+{ -+ HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob)); -+ -+ if (jb == NULL) -+ return NULL; -+ -+ sem_init(&jb->sem, 0, 0); -+ jb->rfe = rpi_cache_flush_init(&jb->flush_buf); -+ ff_hevc_rpi_progress_init_wait(&jb->progress_wait); -+ -+ jb->intra.n = 0; -+ if ((jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS)) == NULL) -+ goto fail1; -+ -+ // * Sizeof the union structure might be overkill but at the moment it -+ // is correct (it certainly isn't going to be too small) -+ // Set max fill to slack/2 from the end of the Q -+ // If we exceed this in any Q then we will schedule by size (which should -+ // mean that we never use that Q again part from syncs) -+ // * Given how agressive the overflow resonse is we could maybe put the -+ // threshold 
even nearer the end, but I don't expect us to ever hit -+ // it on any real stream anyway. -+ -+ if (rpi_inter_pred_alloc(&jb->chroma_ip, -+ QPU_N_MAX, QPU_N_GRP, -+ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t), -+ QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2) != 0) -+ goto fail2; -+ if (rpi_inter_pred_alloc(&jb->luma_ip, -+ QPU_N_MAX, QPU_N_GRP, -+ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t), -+ QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2) != 0) -+ goto fail3; -+ -+ return jb; -+ -+fail3: -+ rpi_free_inter_pred(&jb->luma_ip); -+fail2: -+ av_freep(&jb->intra.cmds); -+fail1: -+ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); -+ rpi_cache_flush_finish(jb->rfe); -+ sem_destroy(&jb->sem); -+ return NULL; -+} -+ -+static void job_delete(HEVCRpiJob * const jb) -+{ -+ worker_pic_free_one(jb); -+ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); -+ rpi_free_inter_pred(&jb->chroma_ip); -+ rpi_free_inter_pred(&jb->luma_ip); -+ av_freep(&jb->intra.cmds); -+ rpi_cache_flush_finish(jb->rfe); // Not really needed - should do nothing -+ sem_destroy(&jb->sem); -+ av_free(jb); -+} -+ -+static void jbg_delete(HEVCRpiJobGlobal * const jbg) -+{ -+ HEVCRpiJob * jb; -+ -+ if (jbg == NULL) -+ return; -+ -+ jb = jbg->free1; -+ while (jb != NULL) -+ { -+ HEVCRpiJob * const jb2 = jb; -+ jb = jb2->next; -+ job_delete(jb2); -+ } -+ -+ pthread_mutex_destroy(&jbg->lock); -+ av_free(jbg); -+} -+ -+static HEVCRpiJobGlobal * jbg_new(unsigned int job_count) -+{ -+ HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal)); -+ if (jbg == NULL) -+ return NULL; -+ -+ pthread_mutex_init(&jbg->lock, NULL); -+ -+ while (job_count-- != 0) -+ { -+ HEVCRpiJob * const jb = job_new(); -+ if (jb == NULL) -+ goto fail; -+ -+ jb->next = jbg->free1; -+ jbg->free1 = jb; -+ } -+ -+ return jbg; -+ -+fail: -+ jbg_delete(jbg); -+ return NULL; -+} -+ -+static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc) -+{ -+ 
HEVCRpiJobGlobal * jbg; -+ -+ if (jbc == NULL) -+ return; -+ -+ jbg = jbc->jbg; -+ -+ if (jbc->jb1 != NULL) -+ job_delete(jbc->jb1); -+ -+ pthread_mutex_destroy(&jbc->in_lock); -+ sem_destroy(&jbc->sem_out); -+ av_free(jbc); -+ -+ // Deref the global job context -+ if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1) -+ jbg_delete(jbg); -+} -+ -+static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg) -+{ -+ HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl)); -+ -+ if (jbc == NULL) -+ return NULL; -+ -+ jbc->jbg = jbg; -+ atomic_fetch_add(&jbg->ref_count, 1); -+ -+ sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS); -+ pthread_mutex_init(&jbc->in_lock, NULL); -+ -+ if ((jbc->jb1 = job_new()) == NULL) -+ goto fail; -+ jbc->jb1->jbc_local = jbc; -+ -+ return jbc; -+ -+fail: -+ rpi_job_ctl_delete(jbc); -+ return NULL; -+} -+ -+ -+ -+static av_cold void hevc_init_worker(HEVCRpiContext * const s) -+{ -+#if RPI_PASSES == 2 -+ pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1); -+#elif RPI_PASSES == 3 -+ pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2); -+ pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1); -+#else -+#error Passes confused -+#endif -+ pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0); -+ -+ pass_queues_start_all(s); -+} -+ -+static av_cold void hevc_exit_worker(HEVCRpiContext *s) -+{ -+ pass_queues_term_all(s); -+ -+ pass_queues_kill_all(s); -+ -+ rpi_job_ctl_delete(s->jbc); -+ s->jbc = NULL; -+} -+ -+ -+static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc) -+{ -+ const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; -+ const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns; -+ const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts]; -+ -+ // Check for obvious disasters -+ if (ctb_addr_ts == 0 && s->sh.dependent_slice_segment_flag) { -+ av_log(s->avctx, 
AV_LOG_ERROR, "Impossible initial tile.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ // If dependant then ctb_addr_ts != 0 from previous check -+ if (s->sh.dependent_slice_segment_flag) { -+ int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1]; -+ if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) { -+ av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ if (!s->ps.pps->entropy_coding_sync_enabled_flag && -+ tile_id + s->sh.num_entry_point_offsets >= tiles) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ // Tiled stuff must start at start of tile if it has multiple entry points -+ if (!s->ps.pps->entropy_coding_sync_enabled_flag && -+ s->sh.num_entry_point_offsets != 0 && -+ ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id]) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ ff_hevc_rpi_cabac_init_decoder(lc); -+ -+ // Setup any required decode vars -+ lc->cabac_init_req = !s->sh.dependent_slice_segment_flag; -+ -+// printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot); -+ lc->qp_y = s->sh.slice_qp; -+ -+ // General setup -+ lc->bt_line_no = 0; -+ lc->ts = ctb_addr_ts; -+ return 0; -+} -+ -+static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal) -+{ -+ const GetBitContext * const gb = &s->HEVClc->gb; -+ RpiSliceHeader * const sh = &s->sh; -+ int i, j; -+ -+ const unsigned int length = nal->size; -+ unsigned int offset = ((gb->index) >> 3) + 1; // We have a bit & align still to come = +1 byte -+ unsigned int cmpt; -+ unsigned int startheader; -+ -+ if (sh->num_entry_point_offsets == 0) { -+ s->data = NULL; -+ return 0; -+ } -+ -+ // offset in slice header includes emulation prevention bytes. -+ // Unfortunately those have been removed by the time we get here so we -+ // have to compensate. 
The nal layer keeps a track of where they were. -+ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) { -+ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { -+ startheader--; -+ cmpt++; -+ } -+ } -+ -+ for (i = 1; i < sh->num_entry_point_offsets; i++) { -+ offset += (sh->entry_point_offset[i - 1] - cmpt); -+ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) { -+ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { -+ startheader--; -+ cmpt++; -+ } -+ } -+ if (sh->entry_point_offset[i] <= cmpt) { -+ av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ sh->size[i - 1] = sh->entry_point_offset[i] - cmpt; -+ sh->offset[i - 1] = offset; -+ } -+ -+ offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt; -+ if (length < offset) { -+ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ sh->size[sh->num_entry_point_offsets - 1] = length - offset; -+ sh->offset[sh->num_entry_point_offsets - 1] = offset; -+ -+ // Remember data start pointer as we won't have nal later -+ s->data = nal->data; -+ return 0; -+} -+ -+ -+// Return -+// < 0 Error -+// 0 OK -+// -+// jb->ctu_ts_last < 0 Job still filling -+// jb->ctu_ts_last >= 0 Job ready -+ -+static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks) -+{ -+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; -+ const unsigned int ctb_size = (1 << log2_ctb_size); -+ HEVCRpiJob * const jb = lc->jb0; -+ int more_data = 1; -+ unsigned int ctb_addr_ts = lc->ts; -+ unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size; -+ const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << 
log2_ctb_size; -+ -+ lc->unit_done = 0; -+ -+ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) -+ { -+ int q_full; -+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; -+ -+ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts); -+ -+ ff_hevc_rpi_cabac_init(s, lc, ctb_flags); -+ -+ hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size); -+ -+ s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset; -+ s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; -+ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; -+ -+ // Zap stashes if navail -+ if ((lc->ctb_avail & AVAIL_U) == 0) -+ zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3); -+ if ((lc->ctb_avail & AVAIL_L) == 0) -+ { -+ memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE); -+ zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3); -+ } -+#if MVF_STASH_WIDTH > 64 -+ // Restore left mvf stash at start of tile if not at start of line -+ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap) -+ { -+ unsigned int i; -+ HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0); -+ const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); -+ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) -+ { -+ *dst = *src++; -+ dst += MVF_STASH_WIDTH_PU; -+ } -+ } -+#endif -+ -+ // Set initial tu states -+ lc->tu.cu_qp_delta = 0; -+ lc->tu.is_cu_qp_delta_wanted = 0; -+ lc->tu.cu_chroma_qp_offset_wanted = 0; -+ -+ // Decode -+ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0); -+ -+ if (ff_hevc_rpi_cabac_overflow(lc)) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n "); -+ more_data = AVERROR_INVALIDDATA; -+ } -+ -+ if (more_data < 0) { -+ s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN; // Mark slice as broken -+ return more_data; -+ } -+ -+ if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 || -+ (s->ps.pps->entropy_coding_sync_enabled_flag 
&& (ctb_flags & CTB_TS_FLAGS_EOTL) != 0))) -+ { -+ if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 || -+ ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n "); -+ return -1; -+ } -+ } -+ -+ // --- Post CTB processing -+ -+ // Stash rpl top/left for deblock that needs to remember such things cross-slice -+ s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList; -+ s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList; -+ -+ if (!s->is_irap) -+ { -+ // Copy MVF up to up-left & stash to up -+ { -+ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1); -+ HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE); -+ -+ // printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst); -+ -+ lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE]; -+ memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE); -+ } -+ // Stash sideways if end of tile line but not end of line (no point) -+ // ** Could/should do this @ end of fn -+#if MVF_STASH_WIDTH > 64 -+ if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL) -+#endif -+ { -+ unsigned int i; -+ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0); -+ HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); -+ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) -+ { -+ *dst++ = *src; -+ src += MVF_STASH_WIDTH_PU; -+ } -+ } -+ } -+ -+ if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0) -+ ff_hevc_rpi_save_states(s, lc); -+ -+ // Report progress so we can use our MVs in other frames -+ if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0) -+ ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1); -+ -+ // End of line || End of tile line || End of tile -+ // (EoL covers end of frame for our purposes here) -+ q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0); -+ -+ // Allocate QPU chunks on fixed size 64 pel boundries rather than -+ // whatever ctb_size is today. 
-+ // * We might quite like to continue to 64 pel vertical too but that -+ // currently confuses WPP -+ if (((x_ctb + ctb_size) & 63) == 0 || q_full) -+ { -+ int overflow = 0; -+ if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0) -+ overflow = 1; -+ if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0) -+ overflow = 1; -+ if (overflow) -+ { -+ // * This is very annoying (and slow) to cope with in WPP so -+ // we treat it as an error there (no known stream triggers this -+ // with the current buffer sizes). Non-wpp should cope fine. -+ av_log(s->avctx, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__); -+ q_full = 1; -+ } -+ } -+ -+ // Inc TS to next. -+ ctb_addr_ts++; -+ ctb_addr_rs++; -+ x_ctb += ctb_size; -+ -+ if (q_full) -+ { -+ // Do job -+ // Prep for submission -+ jb->ctu_ts_last = ctb_addr_ts - 1; // Was pre-inced -+ job_gen_bounds(s, jb); -+ break; -+ } -+ -+ // If max_blocks started as 0 then this will never be true -+ if (--max_blocks == 0) -+ break; -+ } -+ -+ lc->unit_done = (more_data <= 0); -+ lc->ts = ctb_addr_ts; -+ return 0; -+} -+ -+static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n) -+{ -+ lc->context = s; -+ lc->jb0 = NULL; -+ lc->lc_n = n; -+ lc->bt_terminate = 0; -+ lc->bt_psem_out = NULL; -+ sem_init(&lc->bt_sem_in, 0, 0); -+} -+ -+#define TRACE_WPP 0 -+#if RPI_EXTRA_BIT_THREADS > 0 -+static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts) -+{ -+ unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts]; -+ return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]]; -+} -+ -+// Move local context parameters from an aux bit thread back to the main -+// thread at the end of a slice as processing is going to continue there. 
-+static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep) -+{ -+ if (src_lc == dst_lc) { -+ return; -+ } -+ -+ // Move the job -+ // We will still have an active job if the final line terminates early -+ // Dest should always be null by now -+ av_assert1(dst_lc->jb0 == NULL); -+ dst_lc->jb0 = src_lc->jb0; -+ src_lc->jb0 = NULL; -+ -+ // Always need to store where we are in the bitstream -+ dst_lc->ts = src_lc->ts; -+ dst_lc->gb = src_lc->gb; -+ // Cabac init request will be built at start of next slice -+ -+ // Need to store context if we might have a dependent seg -+ if (is_dep) -+ { -+ dst_lc->qPy_pred = src_lc->qPy_pred; -+ memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left)); -+ memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state)); -+ memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff)); -+ } -+} -+ -+static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc) -+{ -+ rpi_sem_wait(&lc->bt_sem_in); -+ return lc->bt_terminate; -+} -+ -+// Do one WPP line -+// Will not work correctly over horizontal tile boundries - vertical should be OK -+static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first) -+{ -+ const int is_tile = lc->bt_is_tile; -+ const unsigned int tile_id = s->ps.pps->tile_id[lc->ts]; -+ const unsigned int line = lc->bt_line_no; -+ const unsigned int line_inc = lc->bt_line_inc; -+ const int is_last = (line >= lc->bt_last_line); -+ -+ const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width); -+ const unsigned int ts_next = -+ line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ? -+ INT_MAX : -+ is_tile ? -+ s->ps.pps->tile_pos_ts[tile_id + line_inc] : -+ lc->ts + lc->bt_line_width * line_inc; -+ // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work) -+ const unsigned int partial_size = is_tile ? 
line_ts_width(s, lc->ts) : 2; -+ unsigned int ts_prev; -+ int loop_n = 0; -+ int err = 0; -+ -+ av_assert1(line <= s->sh.num_entry_point_offsets); -+ -+#if TRACE_WPP -+ printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__, -+ lc->lc_n, is_tile ? "Tile" : "WPP", tile_id, -+ line, lc->bt_last_line, s->sh.num_entry_point_offsets, -+ lc->ts, ts_eol, ts_next, partial_size, lc->jb0); -+#endif -+ if (line != 0) -+ { -+ const uint8_t * const data = s->data + s->sh.offset[line - 1]; -+ const unsigned int len = s->sh.size[line - 1]; -+ if ((err = init_get_bits8(&lc->gb, data, len)) < 0) -+ return err; -+ -+ ff_init_cabac_decoder(&lc->cc, data, len); -+ } -+ -+ // We should never be processing a dependent slice here so reset is good -+ // ?? These probably shouldn't be needed (as they should be set by later -+ // logic) but do seem to be required -+ lc->qp_y = s->sh.slice_qp; -+ -+ do -+ { -+ if (!is_last && loop_n > 1) { -+#if TRACE_WPP -+ printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out); -+#endif -+ sem_post(lc->bt_psem_out); -+ } -+ // The wait for loop_n == 0 has been done in bit_thread -+ if (!is_first && loop_n != 0) -+ { -+#if TRACE_WPP -+ printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? 
"" : "ERR: ", &lc->bt_sem_in); -+#endif -+ if (wait_bt_sem_in(lc) != 0) -+ return AVERROR_EXIT; -+ } -+ -+#if TRACE_WPP -+ { -+ int n; -+ sem_getvalue(&lc->bt_sem_in, &n); -+ printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in); -+ } -+#endif -+ -+ ts_prev = lc->ts; -+ -+ // If we have had an error - do no further decode but do continue -+ // moving signals around so the other threads continue to operate -+ // correctly (or at least as correctly as they can with this line missing) -+ // -+ // Errors in WPP/Tile are less fatal than normal as we have a good idea -+ // of how to restart on the next line so there is no need to give up totally -+ if (err != 0) -+ { -+ lc->unit_done = 0; -+ lc->ts += partial_size; -+ } -+ else -+ { -+ worker_pass0_ready(s, lc); -+ -+ if ((err = fill_job(s, lc, partial_size)) < 0 || -+ (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done))) -+ { -+ if (err == 0) { -+ av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n"); -+ err = AVERROR_INVALIDDATA; -+ } -+ worker_free(s, lc); -+ lc->ts = ts_prev + partial_size; // Pretend we did all that -+ lc->unit_done = 0; -+ } -+ else if (is_tile) -+ { -+ worker_submit_job(s, lc); -+ } -+ } -+ -+ ++loop_n; -+ } while (lc->ts < ts_eol && !lc->unit_done); -+ -+ // If we are on the last line & we didn't get a whole line we must wait for -+ // and sink the sem_posts from the line above / tile to the left. 
-+ while ((ts_prev += partial_size) < ts_eol) -+ { -+#if TRACE_WPP -+ printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in); -+#endif -+ if (wait_bt_sem_in(lc) != 0) -+ return AVERROR_EXIT; -+ } -+ -+ lc->bt_line_no += line_inc; -+ -+ if (!is_tile && err == 0) -+ worker_submit_job(s, lc); -+ -+ if (!is_last) { -+ lc->ts = ts_next; -+ -+#if TRACE_WPP -+ printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out); -+#endif -+ sem_post(lc->bt_psem_out); -+ if (loop_n > 1) { -+#if TRACE_WPP -+ printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out); -+#endif -+ sem_post(lc->bt_psem_out); -+ } -+ } -+ else -+ { -+ movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag); // * & not EoT -+#if MVF_STASH_WIDTH > 64 -+ // Horrid calculations to work out what we want but luckily this should almost never execute -+ // **** Move to movlc -+ if (!s->is_irap) -+ { -+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts]; -+ if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf -+ { -+ const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1; -+ unsigned int i; -+ const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)); -+ HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)); -+ -+ for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i) -+ { -+ *d_mvf = *s_mvf; -+ d_mvf += MVF_STASH_WIDTH_PU; -+ s_mvf += MVF_STASH_WIDTH_PU; -+ } -+ -+ } -+ } -+#endif -+ // When all done poke the thread 0 sem_in one final time -+#if TRACE_WPP -+ printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in); -+#endif -+ sem_post(&s->HEVClcList[0]->bt_sem_in); -+ } -+ -+#if TRACE_WPP -+ printf("%s[%d]: End. 
dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag); -+#endif -+ return err; -+} -+ -+static void wpp_setup_lcs(HEVCRpiContext * const s) -+{ -+ unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; -+ const unsigned int line_width = line_ts_width(s, ts); -+ -+ for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i) -+ { -+ HEVCRpiLocalContext * const lc = s->HEVClcList[i]; -+ lc->ts = ts; -+ lc->bt_is_tile = 0; -+ lc->bt_line_no = i; -+ lc->bt_line_width = line_width; -+ lc->bt_last_line = s->sh.num_entry_point_offsets; -+ lc->bt_line_inc = RPI_BIT_THREADS; -+ ts += line_width; -+ } -+} -+ -+ -+// Can only process tile single row at once -+static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row) -+{ -+ const HEVCRpiPPS * const pps = s->ps.pps; -+ const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; -+ const unsigned int tile0 = pps->tile_id[ts0]; -+ const unsigned int col0 = tile0 % pps->num_tile_columns; -+ -+ const unsigned int col = (slice_row == 0) ? 
col0 : 0; -+ unsigned int line = slice_row * pps->num_tile_columns - col0 + col; -+ const unsigned int last_line = FFMIN( -+ line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets); -+ -+ const unsigned int par = -+ FFMIN(RPI_BIT_THREADS, last_line + 1 - line); -+#if TRACE_WPP -+ printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row, -+ pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line); -+#endif -+ for (unsigned int i = 0; i != par; ++i, ++line) -+ { -+ HEVCRpiLocalContext * const lc = s->HEVClcList[i]; -+ const unsigned int tile = tile0 + line; -+ -+ lc->ts = pps->tile_pos_ts[tile]; -+ lc->bt_line_no = line; -+ lc->bt_is_tile = 1; -+ lc->bt_line_width = line_ts_width(s, lc->ts); -+ lc->bt_last_line = last_line; -+ lc->bt_line_inc = par; -+ } -+} -+ -+ -+static void * bit_thread(void * v) -+{ -+ HEVCRpiLocalContext * const lc = v; -+ HEVCRpiContext *const s = lc->context; -+ -+ while (wait_bt_sem_in(lc) == 0) -+ { -+ int err; -+ -+ if ((err = rpi_run_one_line(s, lc, 0)) < 0) { // Never first tile/wpp -+ if (lc->bt_terminate) { -+ av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__); -+ break; -+ } -+ av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err); -+ } -+ } -+ -+ return NULL; -+} -+ -+static int bit_threads_start(HEVCRpiContext * const s) -+{ -+ if (s->bt_started) -+ return 0; -+ -+ for (int i = 1; i < RPI_BIT_THREADS; ++i) -+ { -+ // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS] -+ if (s->HEVClcList[i] == NULL) { -+ if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL) -+ return -1; -+ } -+ -+ bt_lc_init(s, s->HEVClcList[i], i); -+ job_lc_init(s->HEVClcList[i]); -+ } -+ -+ // Link the sems in a circle -+ for (int i = 0; i < RPI_BIT_THREADS - 1; ++i) -+ s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in; -+ s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = 
&s->HEVClcList[0]->bt_sem_in; -+ -+ // Init all lc before starting any threads -+ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) -+ { -+ if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0) -+ return -1; -+ } -+ -+ s->bt_started = 1; -+ return 0; -+} -+ -+static int bit_threads_kill(HEVCRpiContext * const s) -+{ -+ if (!s->bt_started) -+ return 0; -+ s->bt_started = 0; -+ -+ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) -+ { -+ HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1]; -+ if (lc == NULL) -+ break; -+ -+ lc->bt_terminate = 1; -+ sem_post(&lc->bt_sem_in); -+ pthread_join(s->bit_threads[i], NULL); -+ -+ sem_destroy(&lc->bt_sem_in); -+ job_lc_kill(lc); -+ } -+ return 0; -+} -+#endif -+ -+ -+// If we are at EoT and the row is shorter than the number of jobs -+// we can Q we have to wait for it finish otherwise we risk cache/QPU -+// disasters -+static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n) -+{ -+ return -+ s->ps.pps->tile_wpp_inter_disable >= 2 && -+ s->sh.slice_type != HEVC_SLICE_I && -+ n >= 0 && -+ (s->ps.pps->ctb_ts_flags[n] & (CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOT; -+} -+ -+static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread) -+{ -+ HEVCRpiContext * const s = avctxt->priv_data; -+ HEVCRpiLocalContext * const lc = s->HEVClc; -+ int err; -+ -+ // Start of slice -+ if ((err = slice_start(s, lc)) != 0) -+ return err; -+ -+#if RPI_EXTRA_BIT_THREADS > 0 -+ -+ if (s->sh.offload_tiles) -+ { -+ unsigned int slice_row = 0; -+ -+#if TRACE_WPP -+ printf("%s: Do Tiles\n", __func__); -+#endif -+ // Generate & start extra bit threads if they aren't already running -+ bit_threads_start(s); -+ -+ do -+ { -+ // Reset lc lines etc. 
-+ tile_one_row_setup_lcs(s, slice_row); -+ -+#if TRACE_WPP -+ printf("%s: Row %d: Do 1st: line=%d/%d/%d\n", -+ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets); -+#endif -+ -+ rpi_run_one_line(s, lc, 1); // Kicks off the other threads -+#if TRACE_WPP -+ printf("%s: Row %d: Done 1st: line=%d/%d/%d\n", -+ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets); -+#endif -+ -+ while (lc->bt_line_no <= lc->bt_last_line) { -+ rpi_sem_wait(&lc->bt_sem_in); -+ rpi_run_one_line(s, lc, 0); -+ } -+#if TRACE_WPP -+ printf("%s: Done body\n", __func__); -+#endif -+ -+ // Wait for everything else to finish -+ rpi_sem_wait(&lc->bt_sem_in); -+ -+ ++slice_row; -+ } while (lc->bt_last_line < s->sh.num_entry_point_offsets); -+ -+ -+#if TRACE_WPP -+ printf("%s: Done wait: ts=%d\n", __func__, lc->ts); -+#endif -+ } -+ else if (s->sh.offload_wpp) -+ { -+#if TRACE_WPP -+ printf("%s: Do WPP\n", __func__); -+#endif -+ // Generate & start extra bit threads if they aren't already running -+ bit_threads_start(s); -+ -+ // Reset lc lines etc. 
-+ wpp_setup_lcs(s); -+ -+ rpi_run_one_line(s, lc, 1); // Kicks off the other threads -+#if TRACE_WPP -+ printf("%s: Done 1st\n", __func__); -+#endif -+ -+ while (lc->bt_line_no <= s->sh.num_entry_point_offsets) { -+ rpi_sem_wait(&lc->bt_sem_in); -+ rpi_run_one_line(s, lc, 0); -+ } -+#if TRACE_WPP -+ printf("%s: Done body\n", __func__); -+#endif -+ -+ // Wait for everything else to finish -+ rpi_sem_wait(&lc->bt_sem_in); -+ -+#if TRACE_WPP -+ printf("%s: Done wait: ts=%d\n", __func__, lc->ts); -+#endif -+ } -+ else -+#endif -+ { -+#if TRACE_WPP -+ printf("%s: Single start: ts=%d\n", __func__, lc->ts); -+#endif -+ // Single bit thread -+ do { -+ // Make sure we have space to prepare the next job -+ worker_pass0_ready(s, lc); -+ -+ if ((err = fill_job(s, lc, 0)) < 0) -+ goto fail; -+ -+ worker_submit_job(s, lc); -+ -+ if (tile_needs_wait(s, lc->ts - 1)) -+ worker_wait(s, lc); -+ -+ } while (!lc->unit_done); -+ -+#if TRACE_WPP -+ printf("%s: Single end: ts=%d\n", __func__, lc->ts); -+#endif -+ } -+ -+ // If we have reached the end of the frame or -+ // then wait for the worker to finish all its jobs -+ if (lc->ts >= s->ps.sps->ctb_size) -+ worker_wait(s, lc); -+ -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ -+ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n", -+ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0, -+ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge, -+ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0, -+ ts->y_pred2_hgt16, ts->y_pred2_hle16); -+ memset(ts, 0, sizeof(*ts)); -+ } -+#endif -+ -+ return lc->ts; -+ -+fail: -+ // Cleanup -+ av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err); -+ // Free our job & wait for temination -+ worker_free(s, lc); -+ worker_wait(s, lc); -+ return err; -+} -+ -+ -+static void set_no_backward_pred(HEVCRpiContext * const s) -+{ -+ int i, j; -+ 
const RefPicList *const refPicList = s->refPicList; -+ -+ s->no_backward_pred_flag = 0; -+ if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag) -+ return; -+ -+ for (j = 0; j < 2; j++) { -+ for (i = 0; i < refPicList[j].nb_refs; i++) { -+ if (refPicList[j].list[i] > s->poc) { -+ s->no_backward_pred_flag = 1; -+ return; -+ } -+ } -+ } -+} -+ -+static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal) -+{ -+ int err; -+ if ((err = gen_entry_points(s, nal)) < 0) -+ return err; -+ -+ set_no_backward_pred(s); -+ -+ return rpi_decode_entry(s->avctx, NULL); -+} -+ -+static int set_side_data(HEVCRpiContext *s) -+{ -+ AVFrame *out = s->ref->frame; -+ -+ if (s->sei.frame_packing.present && -+ s->sei.frame_packing.arrangement_type >= 3 && -+ s->sei.frame_packing.arrangement_type <= 5 && -+ s->sei.frame_packing.content_interpretation_type > 0 && -+ s->sei.frame_packing.content_interpretation_type < 3) { -+ AVStereo3D *stereo = av_stereo3d_create_side_data(out); -+ if (!stereo) -+ return AVERROR(ENOMEM); -+ -+ switch (s->sei.frame_packing.arrangement_type) { -+ case 3: -+ if (s->sei.frame_packing.quincunx_subsampling) -+ stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX; -+ else -+ stereo->type = AV_STEREO3D_SIDEBYSIDE; -+ break; -+ case 4: -+ stereo->type = AV_STEREO3D_TOPBOTTOM; -+ break; -+ case 5: -+ stereo->type = AV_STEREO3D_FRAMESEQUENCE; -+ break; -+ } -+ -+ if (s->sei.frame_packing.content_interpretation_type == 2) -+ stereo->flags = AV_STEREO3D_FLAG_INVERT; -+ -+ if (s->sei.frame_packing.arrangement_type == 5) { -+ if (s->sei.frame_packing.current_frame_is_frame0_flag) -+ stereo->view = AV_STEREO3D_VIEW_LEFT; -+ else -+ stereo->view = AV_STEREO3D_VIEW_RIGHT; -+ } -+ } -+ -+ if (s->sei.display_orientation.present && -+ (s->sei.display_orientation.anticlockwise_rotation || -+ s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) { -+ double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / 
(double) (1 << 16); -+ AVFrameSideData *rotation = av_frame_new_side_data(out, -+ AV_FRAME_DATA_DISPLAYMATRIX, -+ sizeof(int32_t) * 9); -+ if (!rotation) -+ return AVERROR(ENOMEM); -+ -+ av_display_rotation_set((int32_t *)rotation->data, angle); -+ av_display_matrix_flip((int32_t *)rotation->data, -+ s->sei.display_orientation.hflip, -+ s->sei.display_orientation.vflip); -+ } -+ -+ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1 -+ // so the side data persists for the entire coded video sequence. -+ if (s->sei.mastering_display.present > 0 && -+ IS_IRAP(s) && s->no_rasl_output_flag) { -+ s->sei.mastering_display.present--; -+ } -+ if (s->sei.mastering_display.present) { -+ // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b -+ const int mapping[3] = {2, 0, 1}; -+ const int chroma_den = 50000; -+ const int luma_den = 10000; -+ int i; -+ AVMasteringDisplayMetadata *metadata = -+ av_mastering_display_metadata_create_side_data(out); -+ if (!metadata) -+ return AVERROR(ENOMEM); -+ -+ for (i = 0; i < 3; i++) { -+ const int j = mapping[i]; -+ metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0]; -+ metadata->display_primaries[i][0].den = chroma_den; -+ metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1]; -+ metadata->display_primaries[i][1].den = chroma_den; -+ } -+ metadata->white_point[0].num = s->sei.mastering_display.white_point[0]; -+ metadata->white_point[0].den = chroma_den; -+ metadata->white_point[1].num = s->sei.mastering_display.white_point[1]; -+ metadata->white_point[1].den = chroma_den; -+ -+ metadata->max_luminance.num = s->sei.mastering_display.max_luminance; -+ metadata->max_luminance.den = luma_den; -+ metadata->min_luminance.num = s->sei.mastering_display.min_luminance; -+ metadata->min_luminance.den = luma_den; -+ metadata->has_luminance = 1; -+ metadata->has_primaries = 1; -+ -+ av_log(s->avctx, AV_LOG_DEBUG, "Mastering 
Display Metadata:\n"); -+ av_log(s->avctx, AV_LOG_DEBUG, -+ "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n", -+ av_q2d(metadata->display_primaries[0][0]), -+ av_q2d(metadata->display_primaries[0][1]), -+ av_q2d(metadata->display_primaries[1][0]), -+ av_q2d(metadata->display_primaries[1][1]), -+ av_q2d(metadata->display_primaries[2][0]), -+ av_q2d(metadata->display_primaries[2][1]), -+ av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1])); -+ av_log(s->avctx, AV_LOG_DEBUG, -+ "min_luminance=%f, max_luminance=%f\n", -+ av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance)); -+ } -+ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1 -+ // so the side data persists for the entire coded video sequence. -+ if (s->sei.content_light.present > 0 && -+ IS_IRAP(s) && s->no_rasl_output_flag) { -+ s->sei.content_light.present--; -+ } -+ if (s->sei.content_light.present) { -+ AVContentLightMetadata *metadata = -+ av_content_light_metadata_create_side_data(out); -+ if (!metadata) -+ return AVERROR(ENOMEM); -+ metadata->MaxCLL = s->sei.content_light.max_content_light_level; -+ metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level; -+ -+ av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n"); -+ av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n", -+ metadata->MaxCLL, metadata->MaxFALL); -+ } -+ -+ if (s->sei.a53_caption.a53_caption) { -+ AVFrameSideData* sd = av_frame_new_side_data(out, -+ AV_FRAME_DATA_A53_CC, -+ s->sei.a53_caption.a53_caption_size); -+ if (sd) -+ memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size); -+ av_freep(&s->sei.a53_caption.a53_caption); -+ s->sei.a53_caption.a53_caption_size = 0; -+ s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS; -+ } -+ -+ if (s->sei.alternative_transfer.present && -+ av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) && -+ 
s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) { -+ s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics; -+ } -+ -+ return 0; -+} -+ -+static int hevc_frame_start(HEVCRpiContext * const s) -+{ -+ int ret; -+ -+ memset(s->bs_horizontal, 0, s->bs_size * 2); // Does V too -+ memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); -+ memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address)); -+ -+ // Only need to remember intra for CIP -+ if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap) -+ s->is_intra = NULL; -+ else -+ { -+ s->is_intra = s->is_intra_store; -+ memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); -+ } -+ -+ s->is_decoded = 0; -+ s->first_nal_type = s->nal_unit_type; -+ -+ s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos); -+ -+ if (s->pkt.nb_nals > s->rpl_tab_size) -+ { -+ // In most cases it will be faster to free & realloc as that doesn't -+ // require (an unwanted) copy -+ av_freep(&s->rpl_tab); -+ s->rpl_tab_size = 0; -+ if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL) -+ goto fail; -+ s->rpl_tab_size = s->pkt.nb_nals; -+ } -+ memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab)); -+ -+ ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc); -+ if (ret < 0) -+ goto fail; -+ -+ // Resize rpl_tab to max that we might want -+ ret = ff_hevc_rpi_frame_rps(s); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n"); -+ goto fail; -+ } -+ -+ s->ref->frame->key_frame = IS_IRAP(s); -+ -+ ret = set_side_data(s); -+ if (ret < 0) -+ goto fail; -+ -+ s->frame->pict_type = 3 - s->sh.slice_type; -+ -+ if (!IS_IRAP(s)) -+ ff_hevc_rpi_bump_frame(s); -+ -+ av_frame_unref(s->output_frame); -+ ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0); -+ if (ret < 0) -+ goto fail; -+ -+ 
ff_thread_finish_setup(s->avctx); -+ -+ return 0; -+ -+fail: -+ if (s->ref) -+ ff_hevc_rpi_unref_frame(s, s->ref, ~0); -+ s->ref = NULL; -+ return ret; -+} -+ -+static inline int is_non_ref_unit_type(const unsigned int nal_unit_type) -+{ -+ // From Table 7-1 -+ return (nal_unit_type & ~0xe) == 0; // True for 0, 2, 4, 6, 8, 10, 12, 14 -+} -+ -+static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal) -+{ -+ GetBitContext * const gb = &s->HEVClc->gb; -+ int ctb_addr_ts, ret; -+ -+ *gb = nal->gb; -+ s->nal_unit_type = nal->type; -+ s->temporal_id = nal->temporal_id; -+ -+ switch (s->nal_unit_type) { -+ case HEVC_NAL_VPS: -+ ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps); -+ if (ret < 0) -+ goto fail; -+ break; -+ case HEVC_NAL_SPS: -+ ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps, -+ s->apply_defdispwin); -+ if (ret < 0) -+ goto fail; -+ break; -+ case HEVC_NAL_PPS: -+ ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps); -+ if (ret < 0) -+ goto fail; -+ break; -+ case HEVC_NAL_SEI_PREFIX: -+ case HEVC_NAL_SEI_SUFFIX: -+ ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type); -+ if (ret < 0) -+ goto fail; -+ break; -+ case HEVC_NAL_TRAIL_R: -+ case HEVC_NAL_TRAIL_N: -+ case HEVC_NAL_TSA_N: -+ case HEVC_NAL_TSA_R: -+ case HEVC_NAL_STSA_N: -+ case HEVC_NAL_STSA_R: -+ case HEVC_NAL_BLA_W_LP: -+ case HEVC_NAL_BLA_W_RADL: -+ case HEVC_NAL_BLA_N_LP: -+ case HEVC_NAL_IDR_W_RADL: -+ case HEVC_NAL_IDR_N_LP: -+ case HEVC_NAL_CRA_NUT: -+ case HEVC_NAL_RADL_N: -+ case HEVC_NAL_RADL_R: -+ case HEVC_NAL_RASL_N: -+ case HEVC_NAL_RASL_R: -+ ret = hls_slice_header(s); -+ if (ret < 0) -+ return ret; -+ -+ // The definition of _N unit types is "non-reference for other frames -+ // with the same temporal_id" so they may/will be ref frames for pics -+ // with a higher temporal_id. 
-+ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || -+ !is_non_ref_unit_type(s->nal_unit_type); -+ s->offload_recon = s->threads_type != 0 && s->used_for_ref; -+ s->is_irap = IS_IRAP(s); -+ -+#if DEBUG_DECODE_N -+ { -+ static int z = 0; -+ if (IS_IDR(s)) { -+ z = 1; -+ } -+ if (z != 0 && z++ > DEBUG_DECODE_N) { -+ s->is_decoded = 0; -+ break; -+ } -+ } -+#endif -+ if ( -+ (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) || -+ (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) || -+ (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) || -+ (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IRAP(s))) -+ { -+ s->is_decoded = 0; -+ break; -+ } -+ -+ if (s->sh.first_slice_in_pic_flag) { -+ if (s->max_ra == INT_MAX) { -+ if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { -+ s->max_ra = s->poc; -+ } else { -+ if (IS_IDR(s)) -+ s->max_ra = INT_MIN; -+ } -+ } -+ -+ if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) && -+ s->poc <= s->max_ra) { -+ s->is_decoded = 0; -+ break; -+ } else { -+ if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra) -+ s->max_ra = INT_MIN; -+ } -+ -+ ret = hevc_frame_start(s); -+ if (ret < 0) -+ return ret; -+ } else if (!s->ref) { -+ av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n"); -+ goto fail; -+ } -+ -+ if (s->nal_unit_type != s->first_nal_type) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Non-matching NAL types of the VCL NALUs: %d %d\n", -+ s->first_nal_type, s->nal_unit_type); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (!s->sh.dependent_slice_segment_flag && -+ s->sh.slice_type != HEVC_SLICE_I) { -+ ret = ff_hevc_rpi_slice_rpl(s); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_WARNING, -+ "Error constructing the reference lists for the current slice.\n"); -+ goto fail; -+ } -+ } -+ -+ ctb_addr_ts = hls_slice_data(s, nal); -+ if (ctb_addr_ts >= s->ps.sps->ctb_size) { -+ s->is_decoded = 1; -+ } -+ 
-+ if (ctb_addr_ts < 0) { -+ ret = ctb_addr_ts; -+ goto fail; -+ } -+ break; -+ case HEVC_NAL_EOS_NUT: -+ case HEVC_NAL_EOB_NUT: -+ s->seq_decode = (s->seq_decode + 1) & 0xff; -+ s->max_ra = INT_MAX; -+ break; -+ case HEVC_NAL_AUD: -+ case HEVC_NAL_FD_NUT: -+ break; -+ default: -+ av_log(s->avctx, AV_LOG_INFO, -+ "Skipping NAL unit %d\n", s->nal_unit_type); -+ } -+ -+ return 0; -+fail: -+ if (s->avctx->err_recognition & AV_EF_EXPLODE) -+ return ret; -+ return 0; -+} -+ -+static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length) -+{ -+ int i, ret = 0; -+ int eos_at_start = 1; -+ -+ s->ref = NULL; -+ s->last_eos = s->eos; -+ s->eos = 0; -+ -+ /* split the input packet into NAL units, so we know the upper bound on the -+ * number of slices in the frame */ -+ ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff, -+ s->nal_length_size, s->avctx->codec_id, 0, 0); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Error splitting the input into NAL units.\n"); -+ return ret; -+ } -+ -+ for (i = 0; i < s->pkt.nb_nals; i++) { -+ if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT || -+ s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) { -+ if (eos_at_start) { -+ s->last_eos = 1; -+ } else { -+ s->eos = 1; -+ } -+ } else { -+ eos_at_start = 0; -+ } -+ } -+ -+ /* decode the NAL units */ -+ for (i = 0; i < s->pkt.nb_nals; i++) { -+ ret = decode_nal_unit(s, &s->pkt.nals[i]); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_WARNING, -+ "Error parsing NAL unit #%d.\n", i); -+ goto fail; -+ } -+ } -+ -+fail: // Also success path -+ if (s->ref != NULL) { -+ if (s->used_for_ref && s->threads_type != 0) { -+ ff_hevc_rpi_progress_signal_all_done(s); -+ } -+ else { -+ // Flush frame to real memory as we expect to be able to pass -+ // it straight on to mmal -+ flush_frame(s, s->frame); -+ } -+ } -+ return ret; -+} -+ -+static void print_md5(void *log_ctx, int level, uint8_t md5[16]) -+{ -+ int i; -+ for (i = 0; i < 16; i++) -+ av_log(log_ctx, level, "%02"PRIx8, 
md5[i]); -+} -+ -+static int verify_md5(HEVCRpiContext *s, AVFrame *frame) -+{ -+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); -+ int pixel_shift; -+ int i, j; -+ -+ if (!desc) -+ return AVERROR(EINVAL); -+ -+ pixel_shift = desc->comp[0].depth > 8; -+ -+ av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ", -+ s->poc); -+ -+ /* the checksums are LE, so we have to byteswap for >8bpp formats -+ * on BE arches */ -+#if HAVE_BIGENDIAN -+ if (pixel_shift && !s->checksum_buf) { -+ av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size, -+ FFMAX3(frame->linesize[0], frame->linesize[1], -+ frame->linesize[2])); -+ if (!s->checksum_buf) -+ return AVERROR(ENOMEM); -+ } -+#endif -+ -+ for (i = 0; frame->data[i]; i++) { -+ int width = s->avctx->coded_width; -+ int height = s->avctx->coded_height; -+ int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width; -+ int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height; -+ uint8_t md5[16]; -+ -+ av_md5_init(s->md5_ctx); -+ for (j = 0; j < h; j++) { -+ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1); -+#if HAVE_BIGENDIAN -+ if (pixel_shift) { -+ s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf, -+ (const uint16_t *) src, w); -+ src = s->checksum_buf; -+ } -+#endif -+ av_md5_update(s->md5_ctx, src, w << pixel_shift); -+ } -+ av_md5_final(s->md5_ctx, md5); -+ -+ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) { -+ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i); -+ print_md5(s->avctx, AV_LOG_DEBUG, md5); -+ av_log (s->avctx, AV_LOG_DEBUG, "; "); -+ } else { -+ av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i); -+ print_md5(s->avctx, AV_LOG_ERROR, md5); -+ av_log (s->avctx, AV_LOG_ERROR, " != "); -+ print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]); -+ av_log (s->avctx, AV_LOG_ERROR, "\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ av_log(s->avctx, AV_LOG_DEBUG, "\n"); -+ -+ return 0; -+} -+ 
-+static int all_sps_supported(const HEVCRpiContext * const s) -+{ -+ for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { -+ if (s->ps.sps_list[i] != NULL) -+ { -+ const HEVCRpiSPS * const sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; -+ if (!is_sps_supported(sps)) -+ return 0; -+ } -+ } -+ return 1; -+} -+ -+static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first) -+{ -+ int ret, i; -+ -+ ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff, -+ &s->nal_length_size, s->avctx->err_recognition, -+ s->apply_defdispwin, s->avctx); -+ if (ret < 0) -+ return ret; -+ -+ /* export stream parameters from the first SPS */ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { -+ if (first && s->ps.sps_list[i]) { -+ const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; -+ export_stream_params(s->avctx, &s->ps, sps); -+ break; -+ } -+ } -+ -+ return 0; -+} -+ -+static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output, -+ AVPacket *avpkt) -+{ -+ int ret; -+ int new_extradata_size; -+ uint8_t *new_extradata; -+ HEVCRpiContext *s = avctx->priv_data; -+ -+ if (!avpkt->size) { -+ ret = ff_hevc_rpi_output_frame(s, data, 1); -+ if (ret < 0) -+ return ret; -+ -+ *got_output = ret; -+ return 0; -+ } -+ -+ new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, -+ &new_extradata_size); -+ if (new_extradata && new_extradata_size > 0) { -+ ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0); -+ if (ret < 0) -+ return ret; -+ } -+ -+ s->ref = NULL; -+ ret = decode_nal_units(s, avpkt->data, avpkt->size); -+ if (ret < 0) -+ return ret; -+ -+ /* verify the SEI checksum */ -+ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded && -+ s->sei.picture_hash.is_md5) { -+ ret = verify_md5(s, s->ref->frame); -+ if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) { -+ ff_hevc_rpi_unref_frame(s, s->ref, ~0); -+ return ret; -+ } -+ 
} -+ s->sei.picture_hash.is_md5 = 0; -+ -+ if (s->is_decoded) { -+ av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc); -+ s->is_decoded = 0; -+ } -+ -+ if (s->output_frame->buf[0]) { -+ av_frame_move_ref(data, s->output_frame); -+ *got_output = 1; -+ } -+ -+ return avpkt->size; -+} -+ -+static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src) -+{ -+ int ret; -+ -+ ret = ff_thread_ref_frame(&dst->tf, &src->tf); -+ if (ret < 0) -+ return ret; -+ -+ if (src->col_mvf_buf != NULL) -+ { -+ dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf); -+ if (!dst->col_mvf_buf) -+ goto fail; -+ } -+ dst->col_mvf = src->col_mvf; -+ -+ dst->poc = src->poc; -+ dst->flags = src->flags; -+ dst->sequence = src->sequence; -+ return 0; -+ -+fail: -+ ff_hevc_rpi_unref_frame(s, dst, ~0); -+ return AVERROR(ENOMEM); -+} -+ -+ -+static av_cold int hevc_decode_free(AVCodecContext *avctx) -+{ -+ HEVCRpiContext * const s = avctx->priv_data; -+ int i; -+ -+ pic_arrays_free(s); -+ -+ av_freep(&s->md5_ctx); -+ -+ av_freep(&s->cabac_save); -+ -+#if RPI_EXTRA_BIT_THREADS -+ bit_threads_kill(s); -+#endif -+ -+ hevc_exit_worker(s); -+ for (i = 0; i != 2; ++i) { -+ ff_hevc_rpi_progress_kill_state(s->progress_states + i); -+ } -+ job_lc_kill(s->HEVClc); -+ -+ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] -+ av_freep(&s->sao_pixel_buffer_v[0]); -+ av_frame_free(&s->output_frame); -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); -+ av_frame_free(&s->DPB[i].frame); -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) -+ av_buffer_unref(&s->ps.vps_list[i]); -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) -+ av_buffer_unref(&s->ps.sps_list[i]); -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) -+ av_buffer_unref(&s->ps.pps_list[i]); -+ s->ps.sps = NULL; -+ s->ps.pps = NULL; -+ s->ps.vps = NULL; -+ -+ // Free separately from sLists as used that way by RPI WPP -+ for (i = 0; 
i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) { -+ av_freep(s->HEVClcList + i); -+ } -+ s->HEVClc = NULL; // Allocated as part of HEVClcList -+ -+ ff_h2645_packet_uninit(&s->pkt); -+ -+ if (s->qpu_init_ok) -+ vpu_qpu_term(); -+ s->qpu_init_ok = 0; -+ -+ return 0; -+} -+ -+ -+static av_cold int hevc_init_context(AVCodecContext *avctx) -+{ -+ HEVCRpiContext *s = avctx->priv_data; -+ int i; -+ -+ s->avctx = avctx; -+ -+ s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext)); -+ if (!s->HEVClc) -+ goto fail; -+ s->HEVClcList[0] = s->HEVClc; -+ -+ if (vpu_qpu_init() != 0) -+ goto fail; -+ s->qpu_init_ok = 1; -+ -+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C -+ { -+ static const uint32_t dframe[1] = {0x80808080}; -+ s->qpu_dummy_frame_emu = (const uint8_t *)dframe; -+ } -+#endif -+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C -+ s->qpu_dummy_frame_qpu = qpu_dummy(); -+#endif -+ -+ bt_lc_init(s, s->HEVClc, 0); -+ job_lc_init(s->HEVClc); -+ -+ for (i = 0; i != 2; ++i) { -+ ff_hevc_rpi_progress_init_state(s->progress_states + i); -+ } -+ -+ if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL) -+ goto fail; -+ -+ if ((s->output_frame = av_frame_alloc()) == NULL) -+ goto fail; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ s->DPB[i].frame = av_frame_alloc(); -+ if (!s->DPB[i].frame) -+ goto fail; -+ s->DPB[i].tf.f = s->DPB[i].frame; -+ s->DPB[i].dpb_no = i; -+ } -+ -+ s->max_ra = INT_MAX; -+ -+ if ((s->md5_ctx = av_md5_alloc()) == NULL) -+ goto fail; -+ -+ s->context_initialized = 1; -+ s->eos = 0; -+ -+ ff_hevc_rpi_reset_sei(&s->sei); -+ -+ return 0; -+ -+fail: -+ av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__); -+ hevc_decode_free(avctx); -+ return AVERROR(ENOMEM); -+} -+ -+#if HAVE_THREADS -+static int hevc_update_thread_context(AVCodecContext *dst, -+ const AVCodecContext *src) -+{ -+ HEVCRpiContext *s = dst->priv_data; -+ HEVCRpiContext *s0 = src->priv_data; -+ int i, ret; -+ -+ av_assert0(s->context_initialized); -+ -+ // dst == src can happen according 
to the comments and in that case -+ // there is nothing to do here -+ if (dst == src) -+ return 0; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); -+ if (s0->DPB[i].frame->buf[0]) { -+ ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]); -+ if (ret < 0) -+ return ret; -+ } -+ } -+ -+ if (s->ps.sps != s0->ps.sps) -+ s->ps.sps = NULL; -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) { -+ av_buffer_unref(&s->ps.vps_list[i]); -+ if (s0->ps.vps_list[i]) { -+ s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]); -+ if (!s->ps.vps_list[i]) -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { -+ av_buffer_unref(&s->ps.sps_list[i]); -+ if (s0->ps.sps_list[i]) { -+ s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]); -+ if (!s->ps.sps_list[i]) -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) { -+ av_buffer_unref(&s->ps.pps_list[i]); -+ if (s0->ps.pps_list[i]) { -+ s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]); -+ if (!s->ps.pps_list[i]) -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ if (s->ps.sps != s0->ps.sps) -+ if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0) -+ return ret; -+ -+ s->seq_decode = s0->seq_decode; -+ s->seq_output = s0->seq_output; -+ s->pocTid0 = s0->pocTid0; -+ s->max_ra = s0->max_ra; -+ s->eos = s0->eos; -+ s->no_rasl_output_flag = s0->no_rasl_output_flag; -+ -+ s->is_nalff = s0->is_nalff; -+ s->nal_length_size = s0->nal_length_size; -+ -+ s->threads_type = s0->threads_type; -+ -+ if (s0->eos) { -+ s->seq_decode = (s->seq_decode + 1) & 0xff; -+ s->max_ra = INT_MAX; -+ } -+ -+ s->sei.frame_packing = s0->sei.frame_packing; -+ s->sei.display_orientation = s0->sei.display_orientation; -+ s->sei.mastering_display = s0->sei.mastering_display; -+ s->sei.content_light = s0->sei.content_light; -+ s->sei.alternative_transfer = s0->sei.alternative_transfer; -+ -+ // * We do this here as it allows us to easily 
locate our parents -+ // global job pool, but there really should be a less nasty way -+ if (s->jbc == NULL) -+ { -+ av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL); -+ hevc_init_worker(s); -+ } -+ -+ return 0; -+} -+#endif -+ -+#include -+static int qpu_ok(void) -+{ -+ static int is_pi3 = -1; -+ if (is_pi3 == -1) -+ { -+ struct stat sb; -+ is_pi3 = (stat("/dev/rpivid-intcmem", &sb) != 0); -+ } -+ return is_pi3; -+} -+ -+static av_cold int hevc_decode_init(AVCodecContext *avctx) -+{ -+ HEVCRpiContext *s = avctx->priv_data; -+ int ret; -+ -+ if (!qpu_ok()) -+ return AVERROR_DECODER_NOT_FOUND; -+ -+ if ((ret = hevc_init_context(avctx)) < 0) -+ return ret; -+ -+ // If we are a child context then stop now -+ // Everything after this point is either 1st decode setup or global alloc -+ // that must not be repeated -+ // Global info will be copied into children in update_thread_context (we -+ // can't do it here as we have no way of finding the parent context) -+ if (avctx->internal->is_copy) -+ return 0; -+ -+ // Job allocation requires VCSM alloc to work so ensure that we have it -+ // initialised by this point -+ { -+ HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5)); -+ if (jbg == NULL) { -+ av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__); -+ ret = AVERROR(ENOMEM); -+ goto fail; -+ } -+ -+ if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL) { -+ av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__); -+ ret = AVERROR(ENOMEM); -+ goto fail; -+ } -+ } -+ -+ hevc_init_worker(s); -+ -+ s->eos = 1; -+ -+ if (avctx->extradata_size > 0 && avctx->extradata) { -+ if ((ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1)) < 0) -+ goto fail; -+ -+ if (!all_sps_supported(s)) { -+ ret = AVERROR_DECODER_NOT_FOUND; -+ goto fail; -+ } -+ } -+ -+ if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) -+ s->threads_type = FF_THREAD_FRAME; -+ else -+ s->threads_type = 0; 
-+ -+ return 0; -+ -+fail: -+ hevc_decode_free(avctx); -+ return ret; -+} -+ -+static void hevc_decode_flush(AVCodecContext *avctx) -+{ -+ HEVCRpiContext *s = avctx->priv_data; -+ ff_hevc_rpi_flush_dpb(s); -+ s->max_ra = INT_MAX; -+ s->eos = 1; -+} -+ -+typedef struct hwaccel_rpi3_qpu_env_s { -+ const AVClass *av_class; -+ AVZcEnvPtr zc; -+} hwaccel_rpi3_qpu_env_t; -+ -+static int hwaccel_alloc_frame(AVCodecContext *s, AVFrame *frame) -+{ -+ hwaccel_rpi3_qpu_env_t * const r3 = s->internal->hwaccel_priv_data; -+ int rv; -+ -+ if (av_rpi_zc_in_use(s)) -+ { -+ rv = s->get_buffer2(s, frame, 0); -+ } -+ else -+ { -+ rv = av_rpi_zc_get_buffer(r3->zc, frame); -+ if (rv == 0) -+ rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID); // actually do the alloc -+ } -+ -+ if (rv == 0 && -+ (rv = ff_attach_decode_data(frame)) < 0) -+ { -+ av_frame_unref(frame); -+ } -+ -+ return rv; -+} -+ -+static int hwaccel_rpi3_qpu_free(AVCodecContext *avctx) -+{ -+ hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data; -+ av_rpi_zc_int_env_freep(&r3->zc); -+ return 0; -+} -+ -+static int hwaccel_rpi3_qpu_init(AVCodecContext *avctx) -+{ -+ hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data; -+ -+ if ((r3->zc = av_rpi_zc_int_env_alloc(avctx)) == NULL) -+ goto fail; -+ -+ return 0; -+ -+fail: -+ av_log(avctx, AV_LOG_ERROR, "Rpi3 QPU init failed\n"); -+ hwaccel_rpi3_qpu_free(avctx); -+ return AVERROR(ENOMEM); -+} -+ -+ -+#define OFFSET(x) offsetof(HEVCRpiContext, x) -+#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) -+ -+ -+static const AVOption options[] = { -+ { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin), -+ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR }, -+ { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin), -+ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR }, -+ { NULL }, -+}; -+ -+static const AVClass hevc_rpi_decoder_class = { -+ .class_name = "HEVC RPI 
decoder", -+ .item_name = av_default_item_name, -+ .option = options, -+ .version = LIBAVUTIL_VERSION_INT, -+}; -+ -+static const enum AVPixelFormat hevc_rpi_pix_fmts[] = { -+ AV_PIX_FMT_SAND128, -+ AV_PIX_FMT_SAND64_10, -+ AV_PIX_FMT_NONE -+}; -+ -+ -+static const AVHWAccel hwaccel_rpi3_qpu = { -+ .name = "Pi3 QPU Hwaccel", -+ .alloc_frame = hwaccel_alloc_frame, -+ .init = hwaccel_rpi3_qpu_init, -+ .uninit = hwaccel_rpi3_qpu_free, -+ .priv_data_size = sizeof(hwaccel_rpi3_qpu_env_t), -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, -+}; -+ -+static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand128 = -+{ -+ .public = { -+ .pix_fmt = AV_PIX_FMT_SAND128, -+ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC, -+ .device_type = AV_HWDEVICE_TYPE_NONE, -+ }, -+ .hwaccel = &hwaccel_rpi3_qpu -+}; -+static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand64_10 = -+{ -+ .public = { -+ .pix_fmt = AV_PIX_FMT_SAND64_10, -+ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC, -+ .device_type = AV_HWDEVICE_TYPE_NONE, -+ }, -+ .hwaccel = &hwaccel_rpi3_qpu -+}; -+ -+ -+static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = { -+ &hevc_rpi_hw_config_sand128, -+ &hevc_rpi_hw_config_sand64_10, -+ NULL -+}; -+ -+ -+AVCodec ff_hevc_rpi_decoder = { -+ .name = "hevc_rpi", -+ .long_name = NULL_IF_CONFIG_SMALL("HEVC (rpi)"), -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_HEVC, -+ .priv_data_size = sizeof(HEVCRpiContext), -+ .priv_class = &hevc_rpi_decoder_class, -+ .init = hevc_decode_init, -+ .close = hevc_decode_free, -+ .decode = hevc_rpi_decode_frame, -+ .flush = hevc_decode_flush, -+ .update_thread_context = ONLY_IF_THREADS_ENABLED(hevc_update_thread_context), -+ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | -+ AV_CODEC_CAP_HARDWARE | -+ AV_CODEC_CAP_AVOID_PROBING | -+#if 0 -+ // Debugging is often easier without threads getting in the way -+ 0, -+#warning H265 threading turned off -+#else -+ // We only have decent optimisation for frame - so only admit 
to that -+ AV_CODEC_CAP_FRAME_THREADS, -+#endif -+ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | -+ FF_CODEC_CAP_EXPORTS_CROPPING | -+ FF_CODEC_CAP_ALLOCATE_PROGRESS, -+ .pix_fmts = hevc_rpi_pix_fmts, -+ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), -+ .hw_configs = hevc_rpi_hw_configs, -+// .wrapper_name = "hevc_rpi", -+}; -+ -diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h -new file mode 100644 -index 0000000000..1f94d18673 ---- /dev/null -+++ b/libavcodec/rpi_hevcdec.h -@@ -0,0 +1,1091 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_RPI_HEVCDEC_H -+#define AVCODEC_RPI_HEVCDEC_H -+ -+#include "config.h" -+ -+#include -+ -+#include "libavutil/buffer.h" -+ -+#include "avcodec.h" -+#include "bswapdsp.h" -+#include "cabac.h" -+#include "get_bits.h" -+#include "rpi_hevcpred.h" -+#include "h2645_parse.h" -+#include "hevc.h" -+#include "rpi_hevc_mv.h" -+#include "rpi_hevc_ps.h" -+#include "rpi_hevc_sei.h" -+#include "rpi_hevcdsp.h" -+#include "internal.h" -+#include "thread.h" -+#include "videodsp.h" -+ -+#if ARCH_ARM -+#include "arm/rpi_hevc_misc_neon.h" -+#endif -+ -+#define MAX_NB_THREADS 16 -+#define SHIFT_CTB_WPP 2 -+ -+//TODO: check if this is really the maximum -+#define MAX_TRANSFORM_DEPTH 5 -+ -+#define MAX_TB_SIZE 32 -+#define MAX_QP 51 -+#define DEFAULT_INTRA_TC_OFFSET 2 -+ -+#define HEVC_CONTEXTS 199 -+ -+#define MRG_MAX_NUM_CANDS 5 -+ -+#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64 -+ -+// Size of DPB array -+#define HEVC_DPB_ELS 32 -+ -+#define L0 0 -+#define L1 1 -+ -+#define EPEL_EXTRA_BEFORE 1 -+#define EPEL_EXTRA_AFTER 2 -+#define EPEL_EXTRA 3 -+#define QPEL_EXTRA_BEFORE 3 -+#define QPEL_EXTRA_AFTER 4 -+#define QPEL_EXTRA 7 -+ -+#define EDGE_EMU_BUFFER_STRIDE 80 -+ -+#include -+#include "rpi_qpu.h" -+ -+// Max jobs per frame thread. Actual usage will be limited by the size -+// of the global job pool -+// ?? 
Limits -+#define RPI_MAX_JOBS 8 -+ -+// This is the number of _extra_ bit threads - we will have -+// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing -+// -+// 0 is legitimate and will disable our WPP processing -+//#define RPI_EXTRA_BIT_THREADS 0 -+#define RPI_EXTRA_BIT_THREADS 2 -+ -+// Number of separate threads/passes in worker -+// 2 and 3 are the currently valid numbers -+// At the moment 3 seems fractionally faster -+//#define RPI_PASSES 2 -+#define RPI_PASSES 3 -+ -+// Print out various usage stats -+#define RPI_TSTATS 0 -+ -+// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form -+#define RPI_COMPRESS_COEFFS 1 -+ -+// Wait for VPU/QPU to finish in worker pass 0 -+// If 0 then the wait is in pass 1 -+// -+// One might expect the better place to wait would be in pass 1 however -+// testing shows that pass 0 produces overall faster decode. -+// Interestingly it is QPU/VPU limited streams that seem to suffer -+// from pass 1 waits, CPU limited ones tend to show a very mild gain. -+// This define exists so it is easy to test this. -+#define RPI_WORKER_WAIT_PASS_0 1 -+ -+// Use ARM emulation of QPU pred -+// These are for debug only as the emulation makes only limited -+// effort to be fast -+#define RPI_QPU_EMU_Y 0 -+#define RPI_QPU_EMU_C 0 -+ -+// Max width & height we are prepared to consider -+// Sand frame shape calc becomes confused with large frames -+// Some buffer alloc also depends on this -+#define HEVC_RPI_MAX_WIDTH 2048 -+#define HEVC_RPI_MAX_HEIGHT 1088 -+ -+ -+// Min CTB size is 16 -+#define HEVC_RPI_MAX_CTBS ((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16) -+ -+/** -+ * Value of the luma sample at position (x, y) in the 2D array tab. 
-+ */ -+#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)]) -+#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)]) -+ -+#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP) -+#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \ -+ (s)->nal_unit_type == HEVC_NAL_BLA_N_LP) -+#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23) -+ -+enum RPSType { -+ ST_CURR_BEF = 0, -+ ST_CURR_AFT, -+ ST_FOLL, -+ LT_CURR, -+ LT_FOLL, -+ NB_RPS_TYPE, -+}; -+ -+enum SyntaxElement { -+ SAO_MERGE_FLAG = 0, -+ SAO_TYPE_IDX, -+ SAO_EO_CLASS, -+ SAO_BAND_POSITION, -+ SAO_OFFSET_ABS, -+ SAO_OFFSET_SIGN, -+ END_OF_SLICE_FLAG, -+ SPLIT_CODING_UNIT_FLAG, -+ CU_TRANSQUANT_BYPASS_FLAG, -+ SKIP_FLAG, -+ CU_QP_DELTA, -+ PRED_MODE_FLAG, -+ PART_MODE, -+ PCM_FLAG, -+ PREV_INTRA_LUMA_PRED_FLAG, -+ MPM_IDX, -+ REM_INTRA_LUMA_PRED_MODE, -+ INTRA_CHROMA_PRED_MODE, -+ MERGE_FLAG, -+ MERGE_IDX, -+ INTER_PRED_IDC, -+ REF_IDX_L0, -+ REF_IDX_L1, -+ ABS_MVD_GREATER0_FLAG, -+ ABS_MVD_GREATER1_FLAG, -+ ABS_MVD_MINUS2, -+ MVD_SIGN_FLAG, -+ MVP_LX_FLAG, -+ NO_RESIDUAL_DATA_FLAG, -+ SPLIT_TRANSFORM_FLAG, -+ CBF_LUMA, -+ CBF_CB_CR, -+ TRANSFORM_SKIP_FLAG, -+ EXPLICIT_RDPCM_FLAG, -+ EXPLICIT_RDPCM_DIR_FLAG, -+ LAST_SIGNIFICANT_COEFF_X_PREFIX, -+ LAST_SIGNIFICANT_COEFF_Y_PREFIX, -+ LAST_SIGNIFICANT_COEFF_X_SUFFIX, -+ LAST_SIGNIFICANT_COEFF_Y_SUFFIX, -+ SIGNIFICANT_COEFF_GROUP_FLAG, -+ SIGNIFICANT_COEFF_FLAG, -+ COEFF_ABS_LEVEL_GREATER1_FLAG, -+ COEFF_ABS_LEVEL_GREATER2_FLAG, -+ COEFF_ABS_LEVEL_REMAINING, -+ COEFF_SIGN_FLAG, -+ LOG2_RES_SCALE_ABS, -+ RES_SCALE_SIGN_FLAG, -+ CU_CHROMA_QP_OFFSET_FLAG, -+ CU_CHROMA_QP_OFFSET_IDX, -+}; -+ -+enum PartMode { -+ PART_2Nx2N = 0, -+ PART_2NxN = 1, -+ PART_Nx2N = 2, -+ PART_NxN = 3, -+ PART_2NxnU = 4, -+ PART_2NxnD = 5, -+ PART_nLx2N = 6, -+ PART_nRx2N = 7, -+}; -+ -+enum PredMode { -+ MODE_INTER = 0, -+ MODE_INTRA, -+ MODE_SKIP, 
-+}; -+ -+enum InterPredIdc { -+ PRED_L0 = 0, -+ PRED_L1, -+ PRED_BI, -+}; -+ -+enum PredFlag { -+ PF_INTRA = 0, -+ PF_L0, -+ PF_L1, -+ PF_BI, -+}; -+ -+enum SAOType { -+ SAO_NOT_APPLIED = 0, -+ SAO_BAND, -+ SAO_EDGE, -+ SAO_APPLIED -+}; -+ -+enum SAOEOClass { -+ SAO_EO_HORIZ = 0, -+ SAO_EO_VERT, -+ SAO_EO_135D, -+ SAO_EO_45D, -+}; -+ -+enum ScanType { -+ SCAN_DIAG = 0, -+ SCAN_HORIZ, -+ SCAN_VERT, -+}; -+ -+typedef struct RefPicList { -+ struct HEVCRpiFrame *ref[HEVC_MAX_REFS]; -+ int list[HEVC_MAX_REFS]; -+ uint8_t isLongTerm[HEVC_MAX_REFS]; -+ int nb_refs; -+} RefPicList; -+ -+typedef struct RefPicListTab { -+ RefPicList refPicList[2]; -+} RefPicListTab; -+ -+typedef struct RpiCodingUnit { -+ unsigned int x; // Passed to deblock -+ unsigned int y; -+ unsigned int x_split; -+ unsigned int y_split; -+ -+ enum PredMode pred_mode; ///< PredMode -+ enum PartMode part_mode; ///< PartMode -+ -+ // Inferred parameters -+ uint8_t intra_split_flag; ///< IntraSplitFlag -+ uint8_t max_trafo_depth; ///< MaxTrafoDepth -+ uint8_t cu_transquant_bypass_flag; -+} RpiCodingUnit; -+ -+typedef struct RpiPredictionUnit { -+ uint8_t intra_pred_mode[4]; -+ uint8_t intra_pred_mode_c[4]; -+ uint8_t chroma_mode_c[4]; -+ uint8_t merge_flag; -+} RpiPredictionUnit; -+ -+typedef struct HEVCRpiTransformUnit { -+ int8_t cu_qp_delta; -+ -+ // Inferred parameters; -+ uint8_t intra_pred_mode; -+ uint8_t intra_pred_mode_c; -+ uint8_t chroma_mode_c; -+ uint8_t is_cu_qp_delta_wanted; -+ uint8_t cu_chroma_qp_offset_wanted; -+ const int8_t * qp_divmod6[3]; -+} HEVCRpiTransformUnit; -+ -+typedef struct DBParams { -+ int8_t beta_offset; // -12 to +12 -+ int8_t tc_offset; // -12 to +12 -+} DBParams; -+ -+#define HEVC_FRAME_FLAG_OUTPUT (1 << 0) -+#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1) -+#define HEVC_FRAME_FLAG_LONG_REF (1 << 2) -+#define HEVC_FRAME_FLAG_BUMPING (1 << 3) -+ -+struct HEVCRpiJob; -+ -+typedef struct HEVCRpiFrame { -+ AVFrame *frame; -+ ThreadFrame tf; -+ ColMvField *col_mvf; -+ int poc; 
-+ struct HEVCRpiFrame *collocated_ref; -+ -+ AVBufferRef *col_mvf_buf; -+ -+ /** -+ * A sequence counter, so that old frames are output first -+ * after a POC reset -+ */ -+ uint16_t sequence; -+ -+ /** -+ * A combination of HEVC_FRAME_FLAG_* -+ */ -+ uint8_t flags; -+ -+ // Entry no in DPB - can be used as a small unique -+ // frame identifier (within the current thread) -+ uint8_t dpb_no; -+} HEVCRpiFrame; -+ -+typedef struct HEVCRpiLocalContext { -+ HEVCRpiTransformUnit tu; -+ -+ CABACContext cc; -+ -+ // Vars that allow us to locate everything from just an lc -+ struct HEVCRpiContext * context; // ??? make const ??? -+ unsigned int lc_n; // lc list el no -+ -+ // Job wait links -+ struct HEVCRpiLocalContext * jw_next; -+ struct HEVCRpiLocalContext * jw_prev; -+ struct HEVCRpiLocalContext * ljw_next; -+ struct HEVCRpiLocalContext * ljw_prev; -+ struct HEVCRpiJob * volatile jw_job; -+ sem_t jw_sem; -+ -+ // ?? Wrap in structure ?? -+ sem_t bt_sem_in; -+ sem_t * bt_psem_out; -+ volatile int bt_terminate; -+ unsigned int ts; -+ unsigned int bt_last_line; // Last line in this bit_thread chunk -+ unsigned int bt_line_no; -+ unsigned int bt_line_width; -+ unsigned int bt_line_inc; -+ -+ struct HEVCRpiJob * jb0; -+ char unit_done; // Set once we have dealt with this slice -+ char bt_is_tile; -+ char last_progress_good; -+ char cabac_init_req; -+ -+ uint8_t cabac_state[HEVC_CONTEXTS]; -+ uint8_t stat_coeff[4]; -+ GetBitContext gb; -+ -+ uint8_t ct_depth; -+ int8_t qp_y; -+ int8_t curr_qp_y; -+ int8_t qPy_pred; -+ -+// N.B. 
Used by asm (neon) - do not change -+#define AVAIL_S_UR 0 -+#define AVAIL_S_U 1 -+#define AVAIL_S_UL 2 -+#define AVAIL_S_L 3 -+#define AVAIL_S_DL 4 -+ -+#define AVAIL_U (1 << AVAIL_S_U) -+#define AVAIL_L (1 << AVAIL_S_L) -+#define AVAIL_UL (1 << AVAIL_S_UL) -+#define AVAIL_UR (1 << AVAIL_S_UR) -+#define AVAIL_DL (1 << AVAIL_S_DL) -+ -+// Intra filters - same number space as avail -+#define FILTER_LIGHT 0x40 -+#define FILTER_STRONG 0x80 -+#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG) -+ -+ uint8_t ctb_avail; -+ int end_of_ctb_x; -+ int end_of_ctb_y; -+ -+ RpiCodingUnit cu; -+ RpiPredictionUnit pu; -+ -+#define BOUNDARY_LEFT_SLICE (1 << 0) -+#define BOUNDARY_LEFT_TILE (1 << 1) -+#define BOUNDARY_UPPER_SLICE (1 << 2) -+#define BOUNDARY_UPPER_TILE (1 << 3) -+ /* properties of the boundary of the current CTB for the purposes -+ * of the deblocking filter */ -+ unsigned int boundary_flags; -+ -+#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE) -+ uint8_t ipm_left[IPM_TAB_SIZE]; -+ uint8_t ipm_up[IPM_TAB_SIZE]; -+ -+//#define MVF_STASH_WIDTH 128 -+#define MVF_STASH_WIDTH 64 -+#define MVF_STASH_HEIGHT 64 -+#define MVF_STASH_WIDTH_PU (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE) -+#define MVF_STASH_HEIGHT_PU (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE) -+ HEVCRpiMvField mvf_ul[1]; -+ HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU]; -+ -+ /* +7 is for subpixel interpolation, *2 for high bit depths */ -+// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; -+ /* The extended size between the new edge emu buffer is abused by SAO */ -+// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; -+// DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); -+ -+} HEVCRpiLocalContext; -+ -+// Each block can have an intra prediction and an add_residual command -+// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH -+ -+// Sand only has 2 planes 
(Y/C) -+#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4)) -+ -+// Command for intra prediction and transform_add of predictions to coefficients -+enum rpi_pred_cmd_e -+{ -+ RPI_PRED_ADD_RESIDUAL, -+ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx -+ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx -+ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V -+ RPI_PRED_ADD_DC, -+ RPI_PRED_ADD_DC_U, // Both U & V are effectively C -+ RPI_PRED_ADD_DC_V, -+ RPI_PRED_INTRA, -+ RPI_PRED_INTRA_C, -+ RPI_PRED_I_PCM, -+ RPI_PRED_CMD_MAX -+}; -+ -+typedef struct HEVCPredCmd { -+ uint8_t type; -+ uint8_t size; // log2 "size" used by all variants -+ uint8_t avail; // i_pred - but left here as they pack well -+ uint8_t dummy; -+ union { -+ struct { // TRANSFORM_ADD -+ uint8_t * dst; -+ const int16_t * buf; -+ uint16_t stride; // Should be good enough for all pic fmts we use -+ int16_t dc; -+ } ta; -+ struct { -+ uint8_t * dst; -+ uint32_t stride; -+ int dc; -+ } dc; -+ struct { // INTRA -+ uint16_t x; -+ uint16_t y; -+ enum IntraPredMode mode; -+ } i_pred; -+ struct { // I_PCM -+ uint16_t x; -+ uint16_t y; -+ const void * src; -+ uint32_t src_len; -+ } i_pcm; -+ }; -+} HEVCPredCmd; -+ -+union qpu_mc_pred_cmd_s; -+struct qpu_mc_pred_y_p_s; -+struct qpu_mc_src_s; -+ -+typedef struct HEVCRpiInterPredQ -+{ -+ union qpu_mc_pred_cmd_u *qpu_mc_base; -+ union qpu_mc_pred_cmd_u *qpu_mc_curr; -+ struct qpu_mc_src_s *last_l0; -+ struct qpu_mc_src_s *last_l1; -+ unsigned int load; -+ uint32_t code_setup; -+ uint32_t code_sync; -+ uint32_t code_exit; -+} HEVCRpiInterPredQ; -+ -+typedef struct HEVCRpiInterPredEnv -+{ -+ HEVCRpiInterPredQ * q; -+ uint8_t n; // Number of Qs -+ uint8_t n_grp; // Number of Q in a group -+ uint8_t curr; // Current Q number (0..n-1) -+ uint8_t used; // 0 if nothing in any Q, 1 otherwise -+ uint8_t used_grp; // 0 if nothing in any Q in the current group -+ unsigned int max_fill; -+ unsigned int min_gap; -+ GPU_MEM_PTR_T gptr; -+} 
HEVCRpiInterPredEnv; -+ -+typedef struct HEVCRpiIntraPredEnv { -+ unsigned int n; // Number of commands -+ HEVCPredCmd * cmds; -+} HEVCRpiIntraPredEnv; -+ -+typedef struct HEVCRpiCoeffEnv { -+ unsigned int n; -+#if RPI_COMPRESS_COEFFS -+ unsigned int packed; // Equal to 1 if coefficients should be being packed -+ unsigned int packed_n; // Value of n when packed was set equal to 0 (i.e. the amount that is sent compressed). Only valid if packed==0 -+#endif -+ int16_t * buf; -+} HEVCRpiCoeffEnv; -+ -+typedef struct HEVCRpiCoeffsEnv { -+ HEVCRpiCoeffEnv s[4]; -+ GPU_MEM_PTR_T gptr; -+ void * mptr; -+} HEVCRpiCoeffsEnv; -+ -+typedef struct HEVCRpiFrameProgressWait { -+ int req; -+ struct HEVCRpiFrameProgressWait * next; -+ sem_t sem; -+} HEVCRpiFrameProgressWait; -+ -+typedef struct HEVCRpiFrameProgressState { -+ struct HEVCRpiFrameProgressWait * first; -+ struct HEVCRpiFrameProgressWait * last; -+ pthread_mutex_t lock; -+} HEVCRpiFrameProgressState; -+ -+typedef struct RpiBlk -+{ -+ unsigned int x; -+ unsigned int y; -+ unsigned int w; -+ unsigned int h; -+} RpiBlk; -+ -+typedef struct HEVCRpiJob { -+ struct HEVCRpiJob * next; // Free chain -+ struct HEVCRpiJobCtl * jbc_local; -+ const HEVCRpiSPS * sps; // sps used to set up this job -+ -+ int waited; -+ int ctu_ts_first; -+ int ctu_ts_last; -+ RpiBlk bounds; // Bounding box of job -+ -+ struct qpu_mc_pred_y_p_s * last_y8_p; -+ struct qpu_mc_src_s * last_y8_l1; -+ rpi_cache_flush_env_t * rfe; -+ -+ HEVCRpiInterPredEnv chroma_ip; -+ HEVCRpiInterPredEnv luma_ip; -+ int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no -+ HEVCRpiIntraPredEnv intra; -+ HEVCRpiCoeffsEnv coeffs; -+ HEVCRpiFrameProgressWait progress_wait; -+ sem_t sem; -+ rpi_cache_buf_t flush_buf; -+} HEVCRpiJob; -+ -+struct HEVCRpiContext; -+ -+typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb); -+ -+typedef struct HEVCRpiPassQueue -+{ -+// int pending; -+ volatile int terminate; -+ sem_t sem_in; -+ sem_t * psem_out; 
-+ unsigned int job_n; -+ struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread -+ HEVCRpiWorkerFn * worker; -+ pthread_t thread; -+ uint8_t pass_n; // Pass number - debug -+ uint8_t started; -+} HEVCRpiPassQueue; -+ -+ -+struct HEVCRpiJobGlobal; -+ -+typedef struct HEVCRpiJobCtl -+{ -+ sem_t sem_out; -+ -+ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated -+ struct HEVCRpiJobGlobal * jbg; -+ -+ HEVCRpiLocalContext * lcw_head; -+ HEVCRpiLocalContext * lcw_tail; -+ -+ pthread_mutex_t in_lock; -+ int offload_in; -+ -+ HEVCRpiJob *offloadq[RPI_MAX_JOBS]; -+} HEVCRpiJobCtl; -+ -+ -+typedef struct HEVCRpiJobGlobal -+{ -+ intptr_t ref_count; -+ pthread_mutex_t lock; -+ HEVCRpiJob * free1; // Singly linked list of free jobs -+ HEVCRpiLocalContext * wait_head; // Double linked list of lcs waiting for a job -+ HEVCRpiLocalContext * wait_good; // Last good tail -+ HEVCRpiLocalContext * wait_tail; -+ -+} HEVCRpiJobGlobal; -+ -+#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1) -+ -+#if RPI_TSTATS -+typedef struct HEVCRpiStats { -+ int y_pred1_y8_merge; -+ int y_pred1_xy; -+ int y_pred1_x0; -+ int y_pred1_y0; -+ int y_pred1_x0y0; -+ int y_pred1_wle8; -+ int y_pred1_wgt8; -+ int y_pred1_hle16; -+ int y_pred1_hgt16; -+ int y_pred2_xy; -+ int y_pred2_x0; -+ int y_pred2_y0; -+ int y_pred2_x0y0; -+ int y_pred2_hle16; -+ int y_pred2_hgt16; -+} HEVCRpiStats; -+#endif -+ -+typedef struct HEVCRpiCabacState -+{ -+ uint8_t rice[4]; -+ uint8_t state[HEVC_CONTEXTS]; -+} HEVCRpiCabacState; -+ -+#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels -+#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT) -+#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1) -+#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte -+#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el -+#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + 
HEVC_RPI_BS_ELS_PER_BYTE_SHIFT) -+#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) -+#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) -+#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row -+#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) -+ -+typedef struct HEVCRpiContext { -+ const AVClass *c; // needed by private avoptions -+ AVCodecContext *avctx; -+ -+ uint8_t threads_type; -+ char qpu_init_ok; -+ -+ /** 1 if the independent slice segment header was successfully parsed */ -+ uint8_t slice_initialized; -+ char used_for_ref; // rpi -+ char is_irap; -+ char offload_recon; -+ uint8_t eos; ///< current packet contains an EOS/EOB NAL -+ uint8_t last_eos; ///< last packet contains an EOS/EOB NAL -+ uint8_t no_backward_pred_flag; -+ uint8_t is_decoded; -+ uint8_t no_rasl_output_flag; -+ -+ -+ /** -+ * Sequence counters for decoded and output frames, so that old -+ * frames are output first after a POC reset -+ */ -+ uint16_t seq_decode; -+ uint16_t seq_output; -+ -+ int width; -+ int height; -+ -+ HEVCRpiJobCtl * jbc; -+ // cabac stash -+ // b0 skip flag -+ // b1+ ct_depth -+ uint8_t * cabac_stash_left; -+ uint8_t * cabac_stash_up; -+ -+ // Function pointers -+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C -+ const uint8_t * qpu_dummy_frame_emu; -+#endif -+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C -+ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory -+#endif -+ HEVCRpiQpu qpu; -+ -+ HEVCRpiFrameProgressState progress_states[2]; -+ -+ HEVCRpiCabacState *cabac_save; -+ -+ AVFrame *frame; -+ AVFrame *output_frame; -+ uint8_t *sao_pixel_buffer_h[3]; -+ uint8_t *sao_pixel_buffer_v[3]; -+ -+ unsigned int col_mvf_stride; -+ AVBufferPool *col_mvf_pool; -+ -+ RpiSAOParams *sao; -+ DBParams *deblock; -+ enum HEVCNALUnitType nal_unit_type; -+ int temporal_id; ///< temporal_id_plus1 - 1 -+ HEVCRpiFrame *ref; -+ int poc; -+ int pocTid0; -+ int slice_idx; ///< 
number of the slice being currently decoded -+ int max_ra; -+ -+ int8_t *qp_y_tab; -+ -+ // Deblocking block strength bitmaps -+ unsigned int bs_stride2; -+ unsigned int bs_size; -+ uint8_t *bs_horizontal; -+ uint8_t *bs_vertical; -+ uint8_t *bsf_stash_up; -+ uint8_t *bsf_stash_left; -+ -+#if HEVC_RPI_MAX_CTBS >= 0xffff -+#define TAB_SLICE_ADDR_BROKEN ~(uint32_t)0 -+ uint32_t *tab_slice_address; -+#else -+#define TAB_SLICE_ADDR_BROKEN ~(uint16_t)0 -+ uint16_t *tab_slice_address; -+#endif -+ -+ // Bitfield 1 bit per 8 pels (min pcm size) -+ uint8_t *is_pcm; -+ // Bitfield 1 bit per 8 pels (min cb size) -+ // Only needed for CIP as CIP processing is async to the main thread -+ uint8_t *is_intra; -+ -+ // PU -+ HEVCRpiMvField *mvf_up; -+ HEVCRpiMvField *mvf_left; -+ -+ const RefPicList **rpl_up; -+ const RefPicList **rpl_left; -+ RefPicList * refPicList; -+ -+ // CTB-level flags affecting loop filter operation -+ uint8_t *filter_slice_edges; -+ -+ /** used on BE to byteswap the lines for checksumming */ -+ uint8_t *checksum_buf; -+ int checksum_buf_size; -+ -+ const uint8_t *data; -+ -+ H2645Packet pkt; -+ // type of the first VCL NAL of the current frame -+ enum HEVCNALUnitType first_nal_type; -+ -+ uint8_t context_initialized; -+ int is_nalff; ///< this flag is != 0 if bitstream is encapsulated -+ ///< as a format defined in 14496-15 -+ int apply_defdispwin; -+ -+ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4) -+ int nuh_layer_id; -+ -+ struct AVMD5 *md5_ctx; -+ -+ RefPicListTab * rpl_tab; -+ unsigned int rpl_tab_size; -+ -+ uint8_t *is_intra_store; -+ -+ RpiSliceHeader sh; -+ -+ HEVCRpiParamSets ps; -+ -+ HEVCRpiLocalContext *HEVClc; -+ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS]; -+ -+ HEVCRpiFrame DPB[HEVC_DPB_ELS]; -+ -+ ///< candidate references for the current frame -+ RefPicList rps[5]; -+ -+ HEVCRpiPredContext hpc; -+ HEVCDSPContext hevcdsp; -+ -+ HEVCSEIContext sei; -+ -+ // Put structures that allocate non-trivial storage at 
the end -+ // These are mostly used indirectly so position in the structure doesn't matter -+ HEVCRpiPassQueue passq[RPI_PASSES]; -+#if RPI_EXTRA_BIT_THREADS > 0 -+ int bt_started; -+ // This simply contains thread descriptors - task setup is held elsewhere -+ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS]; -+#endif -+#if RPI_TSTATS -+ HEVCRpiStats tstats; -+#endif -+} HEVCRpiContext; -+ -+/** -+ * Mark all frames in DPB as unused for reference. -+ */ -+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s); -+ -+/** -+ * Drop all frames currently in DPB. -+ */ -+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s); -+ -+/** -+ * Construct the reference picture sets for the current frame. -+ */ -+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s); -+ -+/** -+ * Construct the reference picture list(s) for the current slice. -+ */ -+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s); -+ -+ -+/** -+ * Get the number of candidate references for the current frame. -+ */ -+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s); -+ -+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc); -+ -+/** -+ * Find next frame in output order and put a reference to it in frame. 
-+ * @return 1 if a frame was output, 0 otherwise -+ */ -+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush); -+ -+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s); -+ -+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags); -+ -+unsigned int ff_hevc_rpi_tb_avail_flags( -+ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h); -+ -+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, -+ int nPbH, int log2_cb_size, int part_idx, -+ int merge_idx, HEVCRpiMvField * const mv); -+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int nPbW, const unsigned int nPbH, -+ const unsigned int avail, -+ HEVCRpiMvField * const mv, -+ const unsigned int mvp_lx_flag, const unsigned int LX); -+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase); -+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_trafo_size, const int is_coded_block); -+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot); -+ -+extern const uint8_t ff_hevc_rpi_qpel_extra_before[4]; -+extern const uint8_t ff_hevc_rpi_qpel_extra_after[4]; -+extern const uint8_t ff_hevc_rpi_qpel_extra[4]; -+ -+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n); -+ -+// arm/hevc_misc_neon.S -+// Neon coeff zap fn -+#if HAVE_NEON -+extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); -+#endif -+ -+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const 
HEVCRpiFrame * const ref, const int val, const int field); -+ -+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field); -+ -+// All of these expect that s->threads_type == FF_THREAD_FRAME -+ -+static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCRpiFrame * const ref, const int y) -+{ -+ if (s->threads_type != 0) -+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); -+} -+ -+static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y) -+{ -+ if (s->used_for_ref && s->threads_type != 0) -+ ff_hevc_rpi_progress_signal_field(s, y, 1); -+} -+ -+static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCRpiFrame * const ref, const int y) -+{ -+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); -+} -+ -+static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y) -+{ -+ if (s->used_for_ref && s->threads_type != 0) -+ { -+ ff_hevc_rpi_progress_signal_field(s, y, 0); -+ } -+} -+ -+static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s) -+{ -+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); -+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); -+} -+ -+ -+// Set all done - signal nothing (used in missing refs) -+// Works for both rpi & non-rpi -+static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref) -+{ -+ if (ref->tf.progress != NULL) -+ { -+ int * const p = (int *)ref->tf.progress->data; -+ p[0] = INT_MAX; -+ p[1] = INT_MAX; -+ } -+} -+ -+#define HEVC_RPI_420_ONLY 1 -+#define HEVC_RPI_SAND128_ONLY 1 -+ -+static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx) -+{ -+#if HEVC_RPI_420_ONLY -+ return cidx == 0 ? 
0 : 1; -+#else -+ return s->ps.sps->hshift[cidx]; -+#endif -+} -+ -+static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx) -+{ -+#if HEVC_RPI_420_ONLY -+ return cidx == 0 ? 0 : 1; -+#else -+ return s->ps.sps->vshift[cidx]; -+#endif -+} -+ -+static inline int ctx_cfmt(const HEVCRpiContext * const s) -+{ -+#if HEVC_RPI_420_ONLY -+ return 1; -+#else -+ return s->ps.sps->chroma_format_idc; -+#endif -+} -+ -+static inline int frame_stride1(const AVFrame * const frame, const int c_idx) -+{ -+#if HEVC_RPI_SAND128_ONLY -+ return 128; -+#else -+ return frame->linesize[c_idx]; -+#endif -+} -+ -+#if HEVC_RPI_SAND128_ONLY -+// Propagate this decision to later zc includes -+#define RPI_ZC_SAND128_ONLY 1 -+#endif -+ -+#ifndef ff_hevc_rpi_copy_vert -+static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src) -+{ -+ int i; -+ switch (pixel_shift) -+ { -+ case 2: -+ for (i = 0; i < height; i++) { -+ *(uint32_t *)dst = *(uint32_t *)src; -+ dst += stride_dst; -+ src += stride_src; -+ } -+ break; -+ case 1: -+ for (i = 0; i < height; i++) { -+ *(uint16_t *)dst = *(uint16_t *)src; -+ dst += stride_dst; -+ src += stride_src; -+ } -+ break; -+ default: -+ for (i = 0; i < height; i++) { -+ *dst = *src; -+ dst += stride_dst; -+ src += stride_src; -+ } -+ break; -+ } -+} -+#endif -+ -+ -+#if MVF_STASH_WIDTH == 64 -+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x, const unsigned int y) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)); -+} -+ -+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const 
unsigned int x, const unsigned int y) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ const unsigned int x0_ctb = x0 & mask_cs_hi; -+ const unsigned int y0_ctb = y0 & mask_cs_hi; -+ -+ return (HEVCRpiMvField *)((y < y0_ctb) ? -+ (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) : -+ (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) : -+ lc->mvf_stash + -+ ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + -+ ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE))); -+} -+ -+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, -+ const unsigned int x0, -+ const unsigned int x) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ const unsigned int x0_ctb = x0 & mask_cs_hi; -+ return x < x0_ctb ? 1 : MVF_STASH_WIDTH_PU; -+} -+ -+#else -+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x, const unsigned int y) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1))); -+} -+ -+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int x, const unsigned int y) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ -+ const unsigned int x0_ctb = x0 & mask_cs_hi; -+ const unsigned int y0_ctb = y0 & mask_cs_hi; -+ -+ // If not in the same CTB for Y assume up -+ if (y < y0_ctb) { -+ // If not in the same CTB for X too assume up-left -+ return (HEVCRpiMvField *)(x < x0_ctb ? 
lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)); -+ } -+ return mvf_stash_ptr(s, lc, x, y); -+} -+ -+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, -+ const unsigned int x0, -+ const unsigned int x) -+{ -+ return MVF_STASH_WIDTH_PU; -+} -+#endif -+ -+#endif /* AVCODEC_RPI_HEVCDEC_H */ -diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c -new file mode 100644 -index 0000000000..87f3cc9d14 ---- /dev/null -+++ b/libavcodec/rpi_hevcdsp.c -@@ -0,0 +1,450 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere -+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "rpi_hevcdsp.h" -+#include "rpi_hevc_mv.h" -+ -+static const int8_t transform[32][32] = { -+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, -+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, -+ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -+ -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 }, -+ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, -+ -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 }, -+ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, -+ 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 }, -+ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, -+ 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 }, -+ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, -+ -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 }, -+ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, -+ -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 }, -+ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, -+ 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 }, -+ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, -+ 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 }, -+ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, -+ -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 }, -+ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, -+ -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 }, -+ { 78, 
-4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, -+ 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 }, -+ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, -+ 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 }, -+ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -+ -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 }, -+ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, -+ -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 }, -+ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, -+ 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 }, -+ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, -+ 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 }, -+ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -+ -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 }, -+ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, -+ -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 }, -+ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, -+ 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 }, -+ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, -+ 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 }, -+ { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, -+ -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 }, -+ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, -+ -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 }, -+ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, -+ 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 }, -+ { 36, -83, 83, -36, -36, 83, 
-83, 36, 36, -83, 83, -36, -36, 83, -83, 36, -+ 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 }, -+ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, -+ -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 }, -+ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, -+ -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 }, -+ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, -+ 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 }, -+ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, -+ 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 }, -+ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, -+ -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 }, -+ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, -+ -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 }, -+ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, -+ 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 }, -+}; -+ -+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = { -+ { -2, 58, 10, -2}, -+ { -4, 54, 16, -2}, -+ { -6, 46, 28, -4}, -+ { -4, 36, 36, -4}, -+ { -4, 28, 46, -6}, -+ { -2, 16, 54, -4}, -+ { -2, 10, 58, -2}, -+}; -+ -+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = { -+ { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0}, -+ { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1}, -+ { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1} -+}; -+ -+#define BIT_DEPTH 8 -+#include "rpi_hevcdsp_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 9 -+#include "rpi_hevcdsp_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 10 -+#include "rpi_hevcdsp_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 12 -+#include "rpi_hevcdsp_template.c" 
-+#undef BIT_DEPTH -+ -+static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, -+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ int in_inc0, int in_inc1) -+{ -+ int shift = 32; -+ uint32_t bs = 0; -+ for (; pus > 0; pus--) { -+ int strength, out; -+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; -+ int curr_refL1 = curr_rpl1[curr->ref_idx[1]]; -+ int nr_idx0 = neigh->ref_idx[0]; -+ int nr_idx1 = neigh->ref_idx[1]; -+ int neigh_refL0 = neigh_rpl0[nr_idx0]; -+ int neigh_refL1 = neigh_rpl1[nr_idx1]; -+ -+ av_assert0(nr_idx0 >= 0 && nr_idx0 <=31); -+ av_assert0(nr_idx1 >= 0 && nr_idx1 <=31); -+ -+#if 1 // This more directly matches the original implementation -+ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { -+ // same L0 and L1 -+ if (curr_refL0 == neigh_refL0 && -+ curr_refL0 == curr_refL1 && -+ neigh_refL0 == neigh_refL1) { -+ if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || -+ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) && -+ (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || -+ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)) -+ strength = 1; -+ else -+ strength = 0; -+ } else if (neigh_refL0 == curr_refL0 && -+ neigh_refL1 == curr_refL1) { -+ if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || -+ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) -+ strength = 1; -+ else -+ strength = 0; -+ } else if (neigh_refL1 == curr_refL0 && -+ neigh_refL0 == curr_refL1) { -+ if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || -+ 
FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4) -+ strength = 1; -+ else -+ strength = 0; -+ } else { -+ strength = 1; -+ } -+ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV -+ MvXY curr_mv0, neigh_mv0; -+ -+ if (curr->pred_flag & 1) { -+ curr_mv0 = curr->xy[0]; -+ } else { -+ curr_mv0 = curr->xy[1]; -+ curr_refL0 = curr_refL1; -+ } -+ -+ if (neigh->pred_flag & 1) { -+ neigh_mv0 = neigh->xy[0]; -+ } else { -+ neigh_mv0 = neigh->xy[1]; -+ neigh_refL0 = neigh_refL1; -+ } -+ -+ if (curr_refL0 == neigh_refL0) { -+ if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4) -+ strength = 1; -+ else -+ strength = 0; -+ } else -+ strength = 1; -+ } else -+ strength = 1; -+#else // This has exactly the same effect, but is more suitable for vectorisation -+ MvXY curr_mv[2]; -+ MvXY neigh_mv[2]; -+ memcpy(curr_mv, curr->xy, sizeof curr_mv); -+ memcpy(neigh_mv, neigh->xy, sizeof neigh_mv); -+ -+ if (!(curr->pred_flag & 2)) { -+ curr_mv[1] = curr_mv[0]; -+ curr_refL1 = curr_refL0; -+ } -+ if (!(neigh->pred_flag & 2)) { -+ neigh_mv[1] = neigh_mv[0]; -+ neigh_refL1 = neigh_refL0; -+ } -+ if (!(curr->pred_flag & 1)) { -+ curr_mv[0] = curr_mv[1]; -+ curr_refL0 = curr_refL1; -+ } -+ if (!(neigh->pred_flag & 1)) { -+ neigh_mv[0] = neigh_mv[1]; -+ neigh_refL0 = neigh_refL1; -+ } -+ -+ strength = 1; -+ -+ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) | -+ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) | -+ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4); -+ -+ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) | -+ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) | -+ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - 
MV_Y(curr_mv[1])) >= 4); -+ -+ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2); -+#endif -+ -+ curr += in_inc0 / sizeof (HEVCRpiMvField); -+ neigh += in_inc1 / sizeof (HEVCRpiMvField); -+ -+ for (out = dup; out > 0; out--) -+ { -+ bs = (bs >> 2) | (strength << 30); -+ shift -= 2; -+ } -+ } -+ return bs >> shift; -+} -+ -+ -+static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height) -+{ -+ unsigned int i, j; -+ -+ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) { -+ for (i = 0; i < height; i++) { -+ for (j = 0; j < width; j+=8) -+ AV_COPY64U(dst+j, src+j); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } else { -+ for (i = 0; i < height; i++) { -+ for (j = 0; j < width; j+=16) -+ AV_COPY128(dst+j, src+j); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } -+} -+ -+ -+ -+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) -+{ -+#undef FUNC -+#define FUNC(a, depth) a ## _ ## depth -+ -+#undef PEL_FUNC -+#define PEL_FUNC(dst1, idx1, idx2, a, depth) \ -+ for(i = 0 ; i < 10 ; i++) \ -+{ \ -+ hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \ -+} -+ -+#undef EPEL_FUNCS -+#define EPEL_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \ -+ PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \ -+ PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth) -+ -+#undef EPEL_UNI_FUNCS -+#define EPEL_UNI_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \ -+ PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth); \ -+ PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \ -+ PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \ -+ 
PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth); \ -+ PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth) -+ -+#undef EPEL_BI_FUNCS -+#define EPEL_BI_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \ -+ PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \ -+ PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \ -+ PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \ -+ PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \ -+ PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth) -+ -+#undef QPEL_FUNCS -+#define QPEL_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \ -+ PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \ -+ PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth) -+ -+#undef QPEL_UNI_FUNCS -+#define QPEL_UNI_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth) -+ -+#undef QPEL_BI_FUNCS -+#define QPEL_BI_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \ 
-+ PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) -+ -+#define SLICED_ADD_RESIDUAL(depth)\ -+ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ -+ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ -+ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ -+ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ -+ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ -+ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ -+ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ -+ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ -+ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ -+ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ -+ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ -+ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ -+ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ -+ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ -+ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ -+ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ -+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) -+#define SLICED_LOOP_FILTERS(depth)\ -+ hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \ -+ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ -+ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ -+ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) -+#define SLICED_SAO(depth)\ -+ for (i = 0; i != SAO_FILTER_N; ++i) { \ -+ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ -+ hevcdsp->sao_edge_filter_c[i] = 
FUNC(sao_edge_filter_c, depth); \ -+ } \ -+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ -+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) -+ -+#define HEVC_DSP(depth) \ -+ hevcdsp->put_pcm = FUNC(put_pcm, depth); \ -+ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \ -+ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \ -+ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \ -+ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \ -+ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ -+ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ -+ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ -+ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ -+ SLICED_ADD_RESIDUAL(depth); \ -+ hevcdsp->dequant = FUNC(dequant, depth); \ -+ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ -+ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \ -+ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ -+ hevcdsp->idct[1] = FUNC(idct_8x8, depth); \ -+ hevcdsp->idct[2] = FUNC(idct_16x16, depth); \ -+ hevcdsp->idct[3] = FUNC(idct_32x32, depth); \ -+ \ -+ hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \ -+ hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \ -+ hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ -+ hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ -+ \ -+ for (i = 0; i != SAO_FILTER_N; ++i) { \ -+ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ -+ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ -+ } \ -+ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ -+ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ -+ SLICED_SAO(depth); \ -+ \ -+ QPEL_FUNCS(depth); \ -+ QPEL_UNI_FUNCS(depth); \ -+ QPEL_BI_FUNCS(depth); \ -+ EPEL_FUNCS(depth); \ -+ EPEL_UNI_FUNCS(depth); \ -+ EPEL_BI_FUNCS(depth); \ -+ \ -+ SLICED_LOOP_FILTERS(depth); \ -+ hevcdsp->hevc_h_loop_filter_luma = 
FUNC(hevc_h_loop_filter_luma, depth); \ -+ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ -+ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ -+ hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth); \ -+ hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth); \ -+ hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth); \ -+ hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \ -+ hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth) -+int i = 0; -+ -+ switch (bit_depth) { -+ case 9: -+ HEVC_DSP(9); -+ break; -+ case 10: -+ HEVC_DSP(10); -+ break; -+ case 12: -+ HEVC_DSP(12); -+ break; -+ default: -+ HEVC_DSP(8); -+ break; -+ } -+ -+ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths; -+ hevcdsp->cpy_blk = cpy_blk; -+ -+ if (ARCH_PPC) -+ ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth); -+ if (ARCH_X86) -+ ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth); -+ if (ARCH_ARM) -+ ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth); -+ if (ARCH_MIPS) -+ ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth); -+} -diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h -new file mode 100644 -index 0000000000..5a7cdeeb66 ---- /dev/null -+++ b/libavcodec/rpi_hevcdsp.h -@@ -0,0 +1,177 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere -+ * -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. 
-+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_RPI_HEVCDSP_H -+#define AVCODEC_RPI_HEVCDSP_H -+ -+#include "hevc.h" -+#include "get_bits.h" -+ -+struct HEVCRpiMvField; -+ -+#define MAX_PB_SIZE 64 -+ -+#define RPI_HEVC_SAO_BUF_STRIDE 160 -+ -+ -+typedef struct RpiSAOParams { -+ uint8_t band_position[3]; ///< sao_band_position (Y,U,V) -+ uint8_t eo_class[3]; ///< sao_eo_class (Y,U=V) -+ uint8_t type_idx[3]; ///< sao_type_idx (Y,U=V) -+ -+ int16_t offset_val[3][5]; ///> 16; -+ const int dc_u = (dc << 16) >> 16; -+ -+ stride /= sizeof(pixel); -+ -+ for (y = 0; y < size; y++) { -+ for (x = 0; x < size * 2; x += 2) { -+ dst[x] = av_clip_pixel(dst[x] + dc_u); -+ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); -+ } -+ dst += stride; -+ } -+} -+ -+ -+static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual)(_dst, res, stride, 4); -+} -+ -+static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual)(_dst, res, stride, 8); -+} -+ -+static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual)(_dst, res, stride, 16); -+} -+ -+static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual)(_dst, res, stride, 32); -+} -+ -+static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) -+{ -+ FUNC(add_residual_dc)(_dst, stride, dc, 4); -+} -+ -+static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int 
dc) -+{ -+ FUNC(add_residual_dc)(_dst, stride, dc, 8); -+} -+ -+static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) -+{ -+ FUNC(add_residual_dc)(_dst, stride, dc, 16); -+} -+ -+static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) -+{ -+ FUNC(add_residual_dc)(_dst, stride, dc, 32); -+} -+ -+// -- U -- (plaited) -+ -+static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_u) -+{ -+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4); -+} -+ -+static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_u) -+{ -+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8); -+} -+ -+static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_u) -+{ -+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16); -+} -+ -+static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_u) -+{ -+ // Should never occur for 420, which is all that sand supports -+ av_assert0(0); -+} -+ -+// -- V -- (plaited) -+ -+static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_v) -+{ -+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4); -+} -+ -+static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_v) -+{ -+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8); -+} -+ -+static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_v) -+{ -+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16); -+} -+ -+static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_v) -+{ -+ // Should never occur for 420, which is all that sand supports -+ av_assert0(0); -+} -+ -+// -- C -- (plaited - both U & V) -+ -+static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual_c)(_dst, res, 
stride, 4); -+} -+ -+static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual_c)(_dst, res, stride, 8); -+} -+ -+static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual_c)(_dst, res, stride, 16); -+} -+ -+static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ // Should never occur for 420, which is all that sand supports -+ av_assert0(0); -+} -+ -+static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ FUNC(add_residual_dc_c)(_dst, stride, dc, 4); -+} -+ -+static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ FUNC(add_residual_dc_c)(_dst, stride, dc, 8); -+} -+ -+static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ FUNC(add_residual_dc_c)(_dst, stride, dc, 16); -+} -+ -+static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ // Should never occur for 420, which is all that sand supports -+ av_assert0(0); -+} -+ -+ -+static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) -+{ -+ int16_t *coeffs = (int16_t *) _coeffs; -+ int x, y; -+ int size = 1 << log2_size; -+ -+ if (mode) { -+ coeffs += size; -+ for (y = 0; y < size - 1; y++) { -+ for (x = 0; x < size; x++) -+ coeffs[x] += coeffs[x - size]; -+ coeffs += size; -+ } -+ } else { -+ for (y = 0; y < size; y++) { -+ for (x = 1; x < size; x++) -+ coeffs[x] += coeffs[x - 1]; -+ coeffs += size; -+ } -+ } -+} -+ -+static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size) -+{ -+ int shift = 15 - BIT_DEPTH - log2_size; -+ int x, y; -+ int size = 1 << log2_size; -+ -+ if (shift > 0) { -+ int offset = 1 << (shift - 1); -+ for (y = 0; y < size; y++) { -+ for (x = 0; x < size; x++) { -+ *coeffs = (*coeffs + offset) >> shift; -+ coeffs++; -+ } -+ } -+ } else { -+ for (y = 0; y < size; 
y++) { -+ for (x = 0; x < size; x++) { -+ *coeffs = *coeffs << -shift; -+ coeffs++; -+ } -+ } -+ } -+} -+ -+#define SET(dst, x) (dst) = (x) -+#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift) -+ -+#define TR_4x4_LUMA(dst, src, step, assign) \ -+ do { \ -+ int c0 = src[0 * step] + src[2 * step]; \ -+ int c1 = src[2 * step] + src[3 * step]; \ -+ int c2 = src[0 * step] - src[3 * step]; \ -+ int c3 = 74 * src[1 * step]; \ -+ \ -+ assign(dst[2 * step], 74 * (src[0 * step] - \ -+ src[2 * step] + \ -+ src[3 * step])); \ -+ assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \ -+ assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \ -+ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \ -+ } while (0) -+ -+static void FUNC(transform_4x4_luma)(int16_t *coeffs) -+{ -+ int i; -+ int shift = 7; -+ int add = 1 << (shift - 1); -+ int16_t *src = coeffs; -+ -+ for (i = 0; i < 4; i++) { -+ TR_4x4_LUMA(src, src, 4, SCALE); -+ src++; -+ } -+ -+ shift = 20 - BIT_DEPTH; -+ add = 1 << (shift - 1); -+ for (i = 0; i < 4; i++) { -+ TR_4x4_LUMA(coeffs, coeffs, 1, SCALE); -+ coeffs += 4; -+ } -+} -+ -+#undef TR_4x4_LUMA -+ -+#define TR_4(dst, src, dstep, sstep, assign, end) \ -+ do { \ -+ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \ -+ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \ -+ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \ -+ const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \ -+ \ -+ assign(dst[0 * dstep], e0 + o0); \ -+ assign(dst[1 * dstep], e1 + o1); \ -+ assign(dst[2 * dstep], e1 - o1); \ -+ assign(dst[3 * dstep], e0 - o0); \ -+ } while (0) -+ -+#define TR_8(dst, src, dstep, sstep, assign, end) \ -+ do { \ -+ int i, j; \ -+ int e_8[4]; \ -+ int o_8[4] = { 0 }; \ -+ for (i = 0; i < 4; i++) \ -+ for (j = 1; j < end; j += 2) \ -+ o_8[i] += transform[4 * j][i] * src[j * sstep]; \ -+ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \ -+ \ -+ for (i = 0; i < 4; i++) { \ -+ assign(dst[i * dstep], e_8[i] + o_8[i]); \ -+ assign(dst[(7 - i) * 
dstep], e_8[i] - o_8[i]); \ -+ } \ -+ } while (0) -+ -+#define TR_16(dst, src, dstep, sstep, assign, end) \ -+ do { \ -+ int i, j; \ -+ int e_16[8]; \ -+ int o_16[8] = { 0 }; \ -+ for (i = 0; i < 8; i++) \ -+ for (j = 1; j < end; j += 2) \ -+ o_16[i] += transform[2 * j][i] * src[j * sstep]; \ -+ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \ -+ \ -+ for (i = 0; i < 8; i++) { \ -+ assign(dst[i * dstep], e_16[i] + o_16[i]); \ -+ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \ -+ } \ -+ } while (0) -+ -+#define TR_32(dst, src, dstep, sstep, assign, end) \ -+ do { \ -+ int i, j; \ -+ int e_32[16]; \ -+ int o_32[16] = { 0 }; \ -+ for (i = 0; i < 16; i++) \ -+ for (j = 1; j < end; j += 2) \ -+ o_32[i] += transform[j][i] * src[j * sstep]; \ -+ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \ -+ \ -+ for (i = 0; i < 16; i++) { \ -+ assign(dst[i * dstep], e_32[i] + o_32[i]); \ -+ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \ -+ } \ -+ } while (0) -+ -+#define IDCT_VAR4(H) \ -+ int limit2 = FFMIN(col_limit + 4, H) -+#define IDCT_VAR8(H) \ -+ int limit = FFMIN(col_limit, H); \ -+ int limit2 = FFMIN(col_limit + 4, H) -+#define IDCT_VAR16(H) IDCT_VAR8(H) -+#define IDCT_VAR32(H) IDCT_VAR8(H) -+ -+#define IDCT(H) \ -+static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \ -+ int col_limit) \ -+{ \ -+ int i; \ -+ int shift = 7; \ -+ int add = 1 << (shift - 1); \ -+ int16_t *src = coeffs; \ -+ IDCT_VAR ## H(H); \ -+ \ -+ for (i = 0; i < H; i++) { \ -+ TR_ ## H(src, src, H, H, SCALE, limit2); \ -+ if (limit2 < H && i%4 == 0 && !!i) \ -+ limit2 -= 4; \ -+ src++; \ -+ } \ -+ \ -+ shift = 20 - BIT_DEPTH; \ -+ add = 1 << (shift - 1); \ -+ for (i = 0; i < H; i++) { \ -+ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \ -+ coeffs += H; \ -+ } \ -+} -+ -+#define IDCT_DC(H) \ -+static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs) \ -+{ \ -+ int i, j; \ -+ int shift = 14 - BIT_DEPTH; \ -+ int add = 1 << (shift - 1); \ -+ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \ -+ 
\ -+ for (j = 0; j < H; j++) { \ -+ for (i = 0; i < H; i++) { \ -+ coeffs[i + j * H] = coeff; \ -+ } \ -+ } \ -+} -+ -+IDCT( 4) -+IDCT( 8) -+IDCT(16) -+IDCT(32) -+ -+IDCT_DC( 4) -+IDCT_DC( 8) -+IDCT_DC(16) -+IDCT_DC(32) -+ -+#undef TR_4 -+#undef TR_8 -+#undef TR_16 -+#undef TR_32 -+ -+#undef SET -+#undef SCALE -+ -+static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, -+ int width, int height) -+{ -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int offset_table[32] = { 0 }; -+ int k, y, x; -+ int shift = BIT_DEPTH - 5; -+ -+ stride_dst /= sizeof(pixel); -+ stride_src /= sizeof(pixel); -+ -+ for (k = 0; k < 4; k++) -+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); -+ dst += stride_dst; -+ src += stride_src; -+ } -+} -+ -+#define CMP(a, b) (((a) > (b)) - ((a) < (b))) -+ -+static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, -+ int eo, int width, int height) { -+ -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ int diff0 = CMP(src[x], src[x + a_stride]); -+ int diff1 = CMP(src[x], src[x + b_stride]); -+ int offset_val = edge_idx[2 + diff0 + 
diff1]; -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+} -+ -+ -+#if BIT_DEPTH == 10 -+// We need a 32 bit variation for the _c restores so hijack bit depth 10 -+#undef pixel -+#undef BIT_DEPTH -+#define pixel uint32_t -+#define BIT_DEPTH 32 -+// All 16 bit variations are the same -+#define sao_edge_restore_0_10 sao_edge_restore_0_9 -+#define sao_edge_restore_1_10 sao_edge_restore_1_9 -+#define sao_edge_restore_0_11 sao_edge_restore_0_9 -+#define sao_edge_restore_1_11 sao_edge_restore_1_9 -+#define sao_edge_restore_0_12 sao_edge_restore_0_9 -+#define sao_edge_restore_1_12 sao_edge_restore_1_9 -+#define sao_edge_restore_0_13 sao_edge_restore_0_9 -+#define sao_edge_restore_1_13 sao_edge_restore_1_9 -+#define sao_edge_restore_0_14 sao_edge_restore_0_9 -+#define sao_edge_restore_1_14 sao_edge_restore_1_9 -+#define sao_edge_restore_0_15 sao_edge_restore_0_9 -+#define sao_edge_restore_1_15 sao_edge_restore_1_9 -+#define sao_edge_restore_0_16 sao_edge_restore_0_9 -+#define sao_edge_restore_1_16 sao_edge_restore_1_9 -+#endif -+#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 -+static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ int x, y; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int sao_eo_class = sao->eo_class[c_idx]; -+ int init_x = 0, width = _width, height = _height; -+ -+ stride_dst /= sizeof(pixel); -+ stride_src /= sizeof(pixel); -+ -+ if (sao_eo_class != SAO_EO_VERT) { -+ if (borders[0]) { -+ for (y = 0; y < height; y++) { -+ dst[y * stride_dst] = src[y * stride_src]; -+ } -+ init_x = 1; -+ } -+ if (borders[2]) { -+ int offset = width - 1; -+ for (x = 0; x < height; x++) { -+ dst[x * stride_dst + offset] = src[x * stride_src + offset]; -+ } -+ width--; -+ } -+ } -+ if 
(sao_eo_class != SAO_EO_HORIZ) { -+ if (borders[1]) { -+ for (x = init_x; x < width; x++) -+ dst[x] = src[x]; -+ } -+ if (borders[3]) { -+ ptrdiff_t y_stride_dst = stride_dst * (height - 1); -+ ptrdiff_t y_stride_src = stride_src * (height - 1); -+ for (x = init_x; x < width; x++) -+ dst[x + y_stride_dst] = src[x + y_stride_src]; -+ height--; -+ } -+ } -+} -+ -+static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ int x, y; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int sao_eo_class = sao->eo_class[c_idx]; -+ int init_x = 0, init_y = 0, width = _width, height = _height; -+ -+ stride_dst /= sizeof(pixel); -+ stride_src /= sizeof(pixel); -+ -+ if (sao_eo_class != SAO_EO_VERT) { -+ if (borders[0]) { -+ for (y = 0; y < height; y++) { -+ dst[y * stride_dst] = src[y * stride_src]; -+ } -+ init_x = 1; -+ } -+ if (borders[2]) { -+ int offset = width - 1; -+ for (x = 0; x < height; x++) { -+ dst[x * stride_dst + offset] = src[x * stride_src + offset]; -+ } -+ width--; -+ } -+ } -+ if (sao_eo_class != SAO_EO_HORIZ) { -+ if (borders[1]) { -+ for (x = init_x; x < width; x++) -+ dst[x] = src[x]; -+ init_y = 1; -+ } -+ if (borders[3]) { -+ ptrdiff_t y_stride_dst = stride_dst * (height - 1); -+ ptrdiff_t y_stride_src = stride_src * (height - 1); -+ for (x = init_x; x < width; x++) -+ dst[x + y_stride_dst] = src[x + y_stride_src]; -+ height--; -+ } -+ } -+ -+ { -+ int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1]; -+ int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2]; -+ int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3]; -+ int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3]; 
-+ -+ // Restore pixels that can't be modified -+ if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) { -+ for(y = init_y+save_upper_left; y< height-save_lower_left; y++) -+ dst[y*stride_dst] = src[y*stride_src]; -+ } -+ if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) { -+ for(y = init_y+save_upper_right; y< height-save_lower_right; y++) -+ dst[y*stride_dst+width-1] = src[y*stride_src+width-1]; -+ } -+ -+ if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) { -+ for(x = init_x+save_upper_left; x < width-save_upper_right; x++) -+ dst[x] = src[x]; -+ } -+ if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) { -+ for(x = init_x+save_lower_left; x < width-save_lower_right; x++) -+ dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x]; -+ } -+ if(diag_edge[0] && sao_eo_class == SAO_EO_135D) -+ dst[0] = src[0]; -+ if(diag_edge[1] && sao_eo_class == SAO_EO_45D) -+ dst[width-1] = src[width-1]; -+ if(diag_edge[2] && sao_eo_class == SAO_EO_135D) -+ dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1]; -+ if(diag_edge[3] && sao_eo_class == SAO_EO_45D) -+ dst[stride_dst*(height-1)] = src[stride_src*(height-1)]; -+ -+ } -+} -+#endif -+#if BIT_DEPTH == 32 -+#undef BIT_DEPTH -+#undef pixel -+#define BIT_DEPTH 10 -+#define pixel uint16_t -+#endif -+ -+// --- Plaited chroma versions -+ -+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height) -+{ -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int offset_table_u[32] = { 0 }; -+ int offset_table_v[32] = { 0 }; -+ int k, y, x; -+ int shift = BIT_DEPTH - 5; -+ -+ stride_dst /= sizeof(pixel); -+ stride_src /= sizeof(pixel); -+ width *= 2; -+ -+ for (k = 0; k < 4; k++) -+ { -+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; -+ offset_table_v[(k + sao_left_class_v) & 31] = 
sao_offset_val_v[k + 1]; -+ } -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x += 2) -+ { -+// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); -+// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); -+ // *** & 31 shouldn't be wanted but just now we generate broken input that -+ // crashes us in 10-bit world -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); -+ } -+ dst += stride_dst; -+ src += stride_src; -+ } -+} -+ -+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, -+ int eo, int width, int height) { -+ -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); -+ -+ stride_dst /= sizeof(pixel); -+ width *= 2; -+ -+ av_assert0(width <= 64); -+ -+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x += 2) { -+ int diff0u = CMP(src[x], src[x + a_stride]); -+ int diff1u = CMP(src[x], src[x + b_stride]); -+ int offset_valu = edge_idx[2 + diff0u + diff1u]; -+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); -+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); -+ int offset_valv = edge_idx[2 + diff0v + diff1v]; -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); -+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } 
-+} -+ -+// Do once -+#if BIT_DEPTH == 8 -+// Any old 2 byte 'normal' restore will work for these -+#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 -+#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 -+// We need 32 bit for 9 bit+ -+#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 -+#endif -+ -+#undef CMP -+ -+//////////////////////////////////////////////////////////////////////////////// -+// -+//////////////////////////////////////////////////////////////////////////////// -+static void FUNC(put_hevc_pel_pixels)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = src[x] << (14 - BIT_DEPTH); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / 
sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ for (y = 0; y < height; y++) { -+ memcpy(dst, src, width * sizeof(pixel)); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src 
= (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1)); -+ } -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+//////////////////////////////////////////////////////////////////////////////// -+// -+//////////////////////////////////////////////////////////////////////////////// -+#define QPEL_FILTER(src, stride) \ -+ (filter[0] * src[x - 3 * stride] + \ -+ filter[1] * src[x - 2 * stride] + \ -+ filter[2] * src[x - stride] + \ -+ filter[3] * src[x ] + \ -+ filter[4] * src[x + stride] + \ -+ filter[5] * src[x + 2 * stride] + \ -+ filter[6] * src[x + 3 * stride] + \ -+ filter[7] * src[x + 4 * stride]) -+ -+static void FUNC(put_hevc_qpel_h)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_v)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); -+ src 
+= srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_hv)(int16_t *dst, -+ uint8_t *_src, -+ ptrdiff_t _srcstride, -+ int height, intptr_t mx, -+ intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; -+ tmp += MAX_PB_SIZE; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ int shift = 14 - BIT_DEPTH; -+ -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel 
*dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ int shift = 14 - BIT_DEPTH; -+ -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+ -+static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> 
shift); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 - BIT_DEPTH; -+ -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + 
QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, -+ intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) 
{ -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, -+ intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ src += srcstride; -+ dst += dststride; -+ src2 += 
MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, -+ intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 
0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+//////////////////////////////////////////////////////////////////////////////// -+// -+//////////////////////////////////////////////////////////////////////////////// -+#define EPEL_FILTER(src, stride) \ -+ (filter[0] * src[x - stride] + \ -+ filter[1] * src[x] + \ -+ filter[2] * src[x + stride] + \ -+ filter[3] * src[x + 2 * stride]) -+ -+static void FUNC(put_hevc_epel_h)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_v)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_hv)(int16_t *dst, -+ 
uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; -+ tmp += MAX_PB_SIZE; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int shift = 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = 
ff_hevc_rpi_epel_filters[mx - 1]; -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); -+ } -+ dst += dststride; -+ src += srcstride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ int shift = 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); -+ dst += dststride; -+ src += srcstride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void 
FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ 
tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); -+ } -+ dst += dststride; -+ src += srcstride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + -+ 
((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); -+ } -+ dst += dststride; -+ src += srcstride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t 
_srcstride, -+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += 
MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+// line zero -+#define P3 pix[-4 * xstride] -+#define P2 pix[-3 * xstride] -+#define P1 pix[-2 * xstride] -+#define P0 pix[-1 * xstride] -+#define Q0 pix[0 * xstride] -+#define Q1 pix[1 * xstride] -+#define Q2 pix[2 * xstride] -+#define Q3 pix[3 * xstride] -+ -+// line three. used only for deblocking decision -+#define TP3 pix[-4 * xstride + 3 * ystride] -+#define TP2 pix[-3 * xstride + 3 * ystride] -+#define TP1 pix[-2 * xstride + 3 * ystride] -+#define TP0 pix[-1 * xstride + 3 * ystride] -+#define TQ0 pix[0 * xstride + 3 * ystride] -+#define TQ1 pix[1 * xstride + 3 * ystride] -+#define TQ2 pix[2 * xstride + 3 * ystride] -+#define TQ3 pix[3 * xstride + 3 * ystride] -+ -+static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, -+ ptrdiff_t _xstride, ptrdiff_t _ystride, -+ int beta, int *_tc, -+ uint8_t *_no_p, uint8_t *_no_q) -+{ -+ int d, j; -+ pixel *pix = (pixel *)_pix; -+ ptrdiff_t xstride = _xstride / sizeof(pixel); -+ ptrdiff_t ystride = _ystride / sizeof(pixel); -+ -+ beta <<= BIT_DEPTH - 8; -+ -+ for (j = 0; j < 2; j++) { -+ const int dp0 = abs(P2 - 2 * P1 + P0); -+ const int dq0 = abs(Q2 - 2 * Q1 + Q0); -+ const int dp3 = abs(TP2 - 2 * TP1 + TP0); -+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); -+ const int d0 = dp0 + dq0; -+ const int d3 = dp3 + dq3; -+ const int tc = _tc[j] << (BIT_DEPTH - 8); -+ const int no_p = _no_p[j]; -+ const int no_q = _no_q[j]; -+ -+ if (d0 + d3 >= beta) { -+ pix += 4 * ystride; -+ continue; -+ } else { -+ const int beta_3 = beta >> 3; -+ const int 
beta_2 = beta >> 2; -+ const int tc25 = ((tc * 5 + 1) >> 1); -+ -+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && -+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && -+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { -+ // strong filtering -+ const int tc2 = tc << 1; -+ for (d = 0; d < 4; d++) { -+ const int p3 = P3; -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ const int q3 = Q3; -+ if (!no_p) { -+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); -+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); -+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); -+ } -+ if (!no_q) { -+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); -+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); -+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); -+ } -+ pix += ystride; -+ } -+ } else { // normal filtering -+ int nd_p = 1; -+ int nd_q = 1; -+ const int tc_2 = tc >> 1; -+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) -+ nd_p = 2; -+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) -+ nd_q = 2; -+ -+ for (d = 0; d < 4; d++) { -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; -+ if (abs(delta0) < 10 * tc) { -+ delta0 = av_clip(delta0, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ if (!no_p && nd_p > 1) { -+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); -+ P1 = av_clip_pixel(p1 + deltap1); -+ } -+ if (!no_q && nd_q > 1) { -+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); -+ Q1 = av_clip_pixel(q1 + deltaq1); -+ } -+ } -+ pix += 
ystride; -+ } -+ } -+ } -+ } -+} -+ -+static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride, -+ ptrdiff_t _ystride, int *_tc, -+ uint8_t *_no_p, uint8_t *_no_q) -+{ -+ int d, j, no_p, no_q; -+ pixel *pix = (pixel *)_pix; -+ ptrdiff_t xstride = _xstride / sizeof(pixel); -+ ptrdiff_t ystride = _ystride / sizeof(pixel); -+ -+ for (j = 0; j < 2; j++) { -+ const int tc = _tc[j] << (BIT_DEPTH - 8); -+ if (tc <= 0) { -+ pix += 4 * ystride; -+ continue; -+ } -+ no_p = _no_p[j]; -+ no_q = _no_q[j]; -+ -+ for (d = 0; d < 4; d++) { -+ int delta0; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ pix += ystride; -+ } -+ } -+} -+ -+static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, -+ int32_t *tc, uint8_t *no_p, -+ uint8_t *no_q) -+{ -+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q); -+} -+ -+static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, -+ int32_t *tc, uint8_t *no_p, -+ uint8_t *no_q) -+{ -+ FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q); -+} -+ -+static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, -+ int beta, int32_t *tc, uint8_t *no_p, -+ uint8_t *no_q) -+{ -+ FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel), -+ beta, tc, no_p, no_q); -+} -+ -+static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, -+ int beta, int32_t *tc, uint8_t *no_p, -+ uint8_t *no_q) -+{ -+ FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride, -+ beta, tc, no_p, no_q); -+} -+ -+#undef P3 -+#undef P2 -+#undef P1 -+#undef P0 -+#undef Q0 -+#undef Q1 -+#undef Q2 -+#undef Q3 -+ -+#undef TP3 -+#undef TP2 -+#undef TP1 -+#undef TP0 -+#undef TQ0 -+#undef TQ1 -+#undef TQ2 -+#undef TQ3 -+ -+// line zero -+#define P3 pix_l[0 * 
xstride] -+#define P2 pix_l[1 * xstride] -+#define P1 pix_l[2 * xstride] -+#define P0 pix_l[3 * xstride] -+#define Q0 pix_r[0 * xstride] -+#define Q1 pix_r[1 * xstride] -+#define Q2 pix_r[2 * xstride] -+#define Q3 pix_r[3 * xstride] -+ -+// line three. used only for deblocking decision -+#define TP3 pix_l[0 * xstride + 3 * ystride] -+#define TP2 pix_l[1 * xstride + 3 * ystride] -+#define TP1 pix_l[2 * xstride + 3 * ystride] -+#define TP0 pix_l[3 * xstride + 3 * ystride] -+#define TQ0 pix_r[0 * xstride + 3 * ystride] -+#define TQ1 pix_r[1 * xstride + 3 * ystride] -+#define TQ2 pix_r[2 * xstride + 3 * ystride] -+#define TQ3 pix_r[3 * xstride + 3 * ystride] -+ -+// This is identical to hevc_loop_filter_luma except that the P/Q -+// components are on separate pointers -+static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, -+ uint8_t * _pix_l) -+{ -+ int d, j; -+ pixel *pix_l = (pixel *)_pix_l; -+ pixel *pix_r = (pixel *)_pix_r; -+ const ptrdiff_t xstride = 1; -+ const ptrdiff_t ystride = _stride / sizeof(pixel); -+ -+ beta <<= BIT_DEPTH - 8; -+ -+ for (j = 0; j < 2; j++) { -+ const int dp0 = abs(P2 - 2 * P1 + P0); -+ const int dq0 = abs(Q2 - 2 * Q1 + Q0); -+ const int dp3 = abs(TP2 - 2 * TP1 + TP0); -+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); -+ const int d0 = dp0 + dq0; -+ const int d3 = dp3 + dq3; -+ const int tc = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8); -+ const int no_p = no_f & 1; -+ const int no_q = no_f & 2; -+ -+ if (d0 + d3 >= beta) { -+ pix_l += 4 * ystride; -+ pix_r += 4 * ystride; -+ continue; -+ } else { -+ const int beta_3 = beta >> 3; -+ const int beta_2 = beta >> 2; -+ const int tc25 = ((tc * 5 + 1) >> 1); -+ -+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && -+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && -+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { -+ // strong filtering -+ const int tc2 = tc << 1; -+ for (d = 
0; d < 4; d++) { -+ const int p3 = P3; -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ const int q3 = Q3; -+ if (!no_p) { -+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); -+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); -+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); -+ } -+ if (!no_q) { -+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); -+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); -+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); -+ } -+ pix_l += ystride; -+ pix_r += ystride; -+ } -+ } else { // normal filtering -+ int nd_p = 1; -+ int nd_q = 1; -+ const int tc_2 = tc >> 1; -+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) -+ nd_p = 2; -+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) -+ nd_q = 2; -+ -+ for (d = 0; d < 4; d++) { -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; -+ if (abs(delta0) < 10 * tc) { -+ delta0 = av_clip(delta0, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ if (!no_p && nd_p > 1) { -+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); -+ P1 = av_clip_pixel(p1 + deltap1); -+ } -+ if (!no_q && nd_q > 1) { -+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); -+ Q1 = av_clip_pixel(q1 + deltaq1); -+ } -+ } -+ pix_l += ystride; -+ pix_r += ystride; -+ } -+ } -+ } -+ } -+} -+ -+static void FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f) -+{ -+ // Just call the non-2 function having massaged the parameters -+ int32_t tc[2] = {tc2 
& 0xffff, tc2 >> 16}; -+ uint8_t no_p[2] = {no_f & 1, no_f & 1}; -+ uint8_t no_q[2] = {no_f & 2, no_f & 2}; -+ FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q); -+} -+ -+#undef TP3 -+#undef TP2 -+#undef TP1 -+#undef TP0 -+#undef TQ0 -+#undef TQ1 -+#undef TQ2 -+#undef TQ3 -+ -+#undef P3 -+#undef P2 -+#undef P1 -+#undef P0 -+#undef Q0 -+#undef Q1 -+#undef Q2 -+#undef Q3 -+ -+#define P1 pix_l[0 * xstride] -+#define P0 pix_l[1 * xstride] -+#define Q0 pix_r[0 * xstride] -+#define Q1 pix_r[1 * xstride] -+ -+static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, -+ ptrdiff_t _ystride, const int32_t *_tc, -+ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) -+{ -+ int d, j, no_p, no_q; -+ pixel *pix_l = (pixel *)_pix_l; -+ pixel *pix_r = (pixel *)_pix_r; -+ ptrdiff_t xstride = _xstride / sizeof(pixel); -+ ptrdiff_t ystride = _ystride / sizeof(pixel); -+ -+ for (j = 0; j < 2; j++) { -+ const int tc = _tc[j] << (BIT_DEPTH - 8); -+ if (tc <= 0) { -+ pix_l += 4 * ystride; -+ pix_r += 4 * ystride; -+ continue; -+ } -+ no_p = _no_p[j]; -+ no_q = _no_q[j]; -+ -+ for (d = 0; d < 4; d++) { -+ int delta0; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ pix_l += ystride; -+ pix_r += ystride; -+ } -+ } -+} -+ -+static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, -+ unsigned int no_f) -+{ -+ uint8_t no_p[2] = {no_f & 1, no_f & 2}; -+ uint8_t no_q[2] = {no_f & 4, no_f & 8}; -+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; -+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); -+ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); -+} -+ -+static void 
FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, -+ uint8_t * src_l, -+ unsigned int no_f) -+{ -+ uint8_t no_p[2] = {no_f & 1, no_f & 2}; -+ uint8_t no_q[2] = {no_f & 4, no_f & 8}; -+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; -+ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); -+ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); -+} -+ -+#undef P1 -+#undef P0 -+#undef Q0 -+#undef Q1 -+ -diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c -new file mode 100644 -index 0000000000..0aa8809a4b ---- /dev/null -+++ b/libavcodec/rpi_hevcpred.c -@@ -0,0 +1,161 @@ -+/* -+ * HEVC video Decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "rpi_hevcdec.h" -+ -+#include "rpi_hevcpred.h" -+#if (ARCH_ARM) -+#include "arm/rpi_hevcpred_arm.h" -+#endif -+ -+#define PRED_C 0 -+#define BIT_DEPTH 8 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 9 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 10 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 12 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+#undef PRED_C -+ -+#define PRED_C 1 -+#define BIT_DEPTH 8 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 9 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 10 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 12 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+#undef PRED_C -+ -+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth) -+{ -+#undef FUNC -+#define FUNC(a, depth) a ## _ ## depth -+ -+#undef FUNCC -+#define FUNCC(a, depth) a ## _ ## depth ## _c -+ -+#define HEVC_PRED_Y(depth) \ -+ hpc->intra_pred = FUNC(intra_pred, depth); \ -+ hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \ -+ hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \ -+ hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \ -+ hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \ -+ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \ -+ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \ -+ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \ -+ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \ -+ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \ -+ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \ -+ hpc->pred_dc[2] = FUNC(pred_dc_2, depth); \ -+ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \ -+ 
hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \ -+ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \ -+ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \ -+ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \ -+ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \ -+ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \ -+ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \ -+ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \ -+ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \ -+ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \ -+ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ -+ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \ -+ hpc->pred_dc0[0] = FUNC(pred_dc0_0, depth); \ -+ hpc->pred_dc0[1] = FUNC(pred_dc0_1, depth); \ -+ hpc->pred_dc0[2] = FUNC(pred_dc0_2, depth); \ -+ hpc->pred_dc0[3] = FUNC(pred_dc0_3, depth); -+ -+#define HEVC_PRED_C(depth) \ -+ hpc->intra_pred_c = FUNCC(intra_pred, depth); \ -+ hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \ -+ hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \ -+ hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \ -+ hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \ -+ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ -+ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ -+ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ -+ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ -+ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \ -+ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \ -+ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \ -+ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \ -+ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \ -+ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \ -+ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \ -+ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \ -+ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \ -+ hpc->pred_horizontal_c[1] = 
FUNCC(pred_angular_1, depth); \ -+ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \ -+ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \ -+ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ -+ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ -+ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ -+ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \ -+ hpc->pred_dc0_c[0] = FUNCC(pred_dc0_0, depth); \ -+ hpc->pred_dc0_c[1] = FUNCC(pred_dc0_1, depth); \ -+ hpc->pred_dc0_c[2] = FUNCC(pred_dc0_2, depth); \ -+ hpc->pred_dc0_c[3] = FUNCC(pred_dc0_3, depth); -+ -+#define HEVC_PRED(depth) \ -+ HEVC_PRED_Y(depth); \ -+ HEVC_PRED_C(depth); -+ -+ switch (bit_depth) { -+ case 9: -+ HEVC_PRED(9); -+ break; -+ case 10: -+ HEVC_PRED(10); -+ break; -+ case 12: -+ HEVC_PRED(12); -+ break; -+ default: -+ HEVC_PRED(8); -+ break; -+ } -+ -+#if (ARCH_ARM) -+ ff_hevc_rpi_pred_init_arm(hpc, bit_depth); -+#elif (ARCH_MIPS) -+ ff_hevc_rpi_pred_init_mips(hpc, bit_depth); -+#endif -+} -diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h -new file mode 100644 -index 0000000000..9f0edb8798 ---- /dev/null -+++ b/libavcodec/rpi_hevcpred.h -@@ -0,0 +1,123 @@ -+/* -+ * HEVC video Decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_RPI_HEVCPRED_H -+#define AVCODEC_RPI_HEVCPRED_H -+ -+#include -+#include -+#include "config.h" -+ -+struct HEVCRpiContext; -+struct HEVCRpiLocalContext; -+ -+enum IntraPredMode { -+ INTRA_PLANAR = 0, -+ INTRA_DC, -+ INTRA_ANGULAR_2, -+ INTRA_ANGULAR_3, -+ INTRA_ANGULAR_4, -+ INTRA_ANGULAR_5, -+ INTRA_ANGULAR_6, -+ INTRA_ANGULAR_7, -+ INTRA_ANGULAR_8, -+ INTRA_ANGULAR_9, -+ INTRA_ANGULAR_10, -+ INTRA_ANGULAR_11, -+ INTRA_ANGULAR_12, -+ INTRA_ANGULAR_13, -+ INTRA_ANGULAR_14, -+ INTRA_ANGULAR_15, -+ INTRA_ANGULAR_16, -+ INTRA_ANGULAR_17, -+ INTRA_ANGULAR_18, -+ INTRA_ANGULAR_19, -+ INTRA_ANGULAR_20, -+ INTRA_ANGULAR_21, -+ INTRA_ANGULAR_22, -+ INTRA_ANGULAR_23, -+ INTRA_ANGULAR_24, -+ INTRA_ANGULAR_25, -+ INTRA_ANGULAR_26, -+ INTRA_ANGULAR_27, -+ INTRA_ANGULAR_28, -+ INTRA_ANGULAR_29, -+ INTRA_ANGULAR_30, -+ INTRA_ANGULAR_31, -+ INTRA_ANGULAR_32, -+ INTRA_ANGULAR_33, -+ INTRA_ANGULAR_34, -+}; -+#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 -+#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26 -+ -+typedef void intra_filter_fn_t( -+ uint8_t * const left, uint8_t * const top, -+ const unsigned int req, const unsigned int avail, -+ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, -+ const unsigned int stride, -+ const unsigned int top_right_size, const unsigned int down_left_size); -+ -+typedef struct HEVCRpiPredContext { -+ void (*intra_pred)(const struct HEVCRpiContext * const s, -+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, -+ const unsigned int avail, const unsigned int log2_size); -+ -+ intra_filter_fn_t *intra_filter[4]; -+ void (*pred_planar[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride); -+ void 
(*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, -+ ptrdiff_t stride); -+ void (*pred_angular[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride, -+ int mode); -+ void (*pred_vertical[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride, -+ int mode); -+ void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride, -+ int mode); -+ void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride); -+ -+ void (*intra_pred_c)(const struct HEVCRpiContext * const s, -+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, -+ const unsigned int avail, const unsigned int log2_size); -+ intra_filter_fn_t *intra_filter_c[4]; -+ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride); -+ void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, -+ ptrdiff_t stride); -+ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride, -+ int mode); -+ void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride, -+ int mode); -+ void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride, -+ int mode); -+ void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride); -+} HEVCRpiPredContext; -+ -+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth); -+ -+#endif /* AVCODEC_RPI_HEVCPRED_H */ -diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c -new file mode 100644 -index 0000000000..f2ebcad332 ---- /dev/null -+++ b/libavcodec/rpi_hevcpred_template.c -@@ -0,0 +1,1407 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * -+ * This file is part of FFmpeg. 
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "config.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/rpi_sand_fns.h" -+#include "bit_depth_template.c" -+ -+#include "rpi_hevcdec.h" -+#include "rpi_hevcpred.h" -+ -+#define DUMP_PRED 0 -+ -+#define POS(x, y) src[(x) + stride * (y)] -+ -+// INCLUDED_ONCE defined at EOF -+#ifndef INCLUDED_ONCE -+typedef uint8_t (* c8_dst_ptr_t)[2]; -+typedef const uint8_t (* c8_src_ptr_t)[2]; -+typedef uint16_t (* c16_dst_ptr_t)[2]; -+typedef const uint16_t (* c16_src_ptr_t)[2]; -+ -+// *** On ARM make these NEON registers -+typedef struct pixel4_16 { -+ uint16_t x[4]; -+} pixel4_16; -+typedef struct pixel4_32 { -+ uint32_t x[4]; -+} pixel4_32; -+static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) -+{ -+ pixel4_16 t = {{x, x, x, x}}; -+ return t; -+} -+static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) -+{ -+ pixel4_32 t = {{x, x, x, x}}; -+ return t; -+} -+#endif -+ -+#if PRED_C -+// For chroma we double pixel size so we copy pairs -+#undef pixel -+#undef pixel2 -+#undef pixel4 -+#undef dctcoef -+#undef INIT_CLIP -+#undef no_rnd_avg_pixel4 -+#undef rnd_avg_pixel4 -+#undef AV_RN2P -+#undef AV_RN4P -+#undef AV_RN4PA -+#undef AV_WN2P -+#undef AV_WN4P -+#undef AV_WN4PA -+#undef CLIP -+#undef FUNC 
-+#undef FUNCC -+#undef av_clip_pixel -+#undef PIXEL_SPLAT_X4 -+ -+#if BIT_DEPTH == 8 -+#define pixel uint16_t -+#define pixel4 pixel4_16 -+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 -+#define cpel uint8_t -+#define c_src_ptr_t c8_src_ptr_t -+#define c_dst_ptr_t c8_dst_ptr_t -+#else -+#define pixel uint32_t -+#define pixel4 pixel4_32 -+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 -+#define cpel uint16_t -+#define c_src_ptr_t c16_dst_ptr_t -+#define c_dst_ptr_t c16_dst_ptr_t -+#endif -+#define AV_RN4P(p) (*(pixel4*)(p)) -+#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) -+#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) -+#endif -+ -+ -+// Get PW prior to horrid PRED_C trickery -+#if BIT_DEPTH == 8 -+#define PW 1 -+#else -+#define PW 2 -+#endif -+ -+ -+#if DUMP_PRED && !defined(INCLUDED_ONCE) -+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) -+{ -+ for (unsigned int y = 0; y != size; y++, data += stride * 2) { -+ for (unsigned int x = 0; x != size; x++) { -+ printf("%4d", data[x * 2]); -+ } -+ printf("\n"); -+ } -+ printf("\n"); -+} -+#endif -+ -+#ifndef INCLUDED_ONCE -+static inline void extend_8(void * ptr, const unsigned int v, unsigned int n) -+{ -+ if ((n >>= 2) != 0) { -+ uint32_t v4 = v | (v << 8); -+ uint32_t * p = (uint32_t *)ptr; -+ v4 = v4 | (v4 << 16); -+ do { -+ *p++ = v4; -+ } while (--n != 0); -+ } -+} -+ -+static inline void extend_16(void * ptr, const unsigned int v, unsigned int n) -+{ -+ if ((n >>= 2) != 0) { -+ uint32_t v2 = v | (v << 16); -+ uint32_t * p = (uint32_t *)ptr; -+ do { -+ *p++ = v2; -+ *p++ = v2; -+ } while (--n != 0); -+ } -+} -+ -+static inline void extend_32(void * ptr, const unsigned int v, unsigned int n) -+{ -+ if ((n >>= 2) != 0) { -+ uint32_t * p = (uint32_t *)ptr; -+ do { -+ *p++ = v; -+ *p++ = v; -+ *p++ = v; -+ *p++ = v; -+ } while (--n != 0); -+ } -+} -+ -+// Beware that this inverts the avail ordering -+// For CIP it seems easier this way round -+static unsigned int cip_avail_l(const uint8_t 
* is_intra, const int i_stride, const unsigned int i_mask, -+ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, -+ unsigned int s0, unsigned int odd_s) -+{ -+ const unsigned int n = 1 << log2_intra_bits; -+ unsigned int fa = 0; -+ unsigned int i; -+ -+ size >>= 2; // Now in 4-pel units -+ s0 >>= 2; -+ -+ if ((avail & AVAIL_DL) != 0) -+ fa |= ((1 << s0) - 1) << (size - s0); -+ if ((avail & AVAIL_L) != 0) -+ fa |= ((1 << size) - 1) << size; -+ if ((avail & AVAIL_UL) != 0) -+ fa |= 1 << (size << 1); -+ -+ if (odd_s) { -+ if ((fa & 1) != 0 && (*is_intra & i_mask) == 0) -+ fa &= ~1; -+ is_intra += i_stride; -+ } -+ -+ for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) { -+ const unsigned int m = ((1 << n) - 1) << i; -+ if ((fa & m) != 0 && (*is_intra & i_mask) == 0) -+ fa &= ~m; -+ } -+ -+ return fa; -+} -+ -+static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift, -+ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, -+ unsigned int s1, unsigned int odd_s) -+{ -+ if ((avail & (AVAIL_U | AVAIL_UR)) == 0) -+ { -+ return 0; -+ } -+ else -+ { -+ const unsigned int n = 1 << log2_intra_bits; -+ unsigned int fa = 0; -+ unsigned int i; -+ unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift; -+ -+ size >>= 2; // Now in 4-pel units -+ s1 >>= 2; -+ -+ if ((avail & AVAIL_U) != 0) -+ fa |= ((1 << size) - 1); -+ if ((avail & AVAIL_UR) != 0) -+ fa |= ((1 << s1) - 1) << size; -+ -+ if (odd_s) { -+ fa &= im | ~1; -+ im >>= 1; -+ } -+ -+ for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) { -+ const unsigned int m = ((1 << n) - 1) << i; -+ if ((im & 1) == 0) -+ fa &= ~m; -+ } -+ return fa; -+ } -+} -+ -+ -+ -+static inline unsigned int rmbd(unsigned int x) -+{ -+#if 1 -+ return __builtin_ctz(x); -+#else -+ unsigned int n = 0; -+ if ((x & 0xffff) == 0) { -+ x >>= 16; -+ n += 16; -+ } -+ if ((x & 0xff) == 0) { -+ x >>= 8; -+ n += 8; -+ } -+ if ((x & 0xf) == 0) { -+ x >>= 4; -+ n += 
4; -+ } -+ if ((x & 0x3) == 0) { -+ x >>= 2; -+ n += 2; -+ } -+ -+ return (x & 1) == 0 ? n + 1 : n; -+#endif -+} -+#endif -+ -+ -+static void FUNC(cip_fill)(pixel * const left, pixel * const top, -+ const unsigned int avail_l, const unsigned int avail_u, -+ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, -+ const unsigned int stride, -+ const unsigned int size) -+{ -+ pixel a; -+ unsigned int i; -+ -+ // 1st find DL value -+ if ((avail_l & 1) == 0) { -+ if (avail_l != 0) -+ a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride]; -+ else -+ { -+ // (avail_l | avail_u) != 0 so this must be good -+ const unsigned int n = rmbd(avail_u)*4; -+ a = (n >= size) ? src_ur[n - size] : src_u[n]; -+ } -+ } -+ -+ // L -+ { -+ pixel * d = left + size * 2 - 1; -+ const pixel * s = src_l + (size * 2 - 1) * stride; -+ unsigned int x = avail_l; -+ for (i = 0; i < size * 2; i += 4, x >>= 1) -+ { -+ if ((x & 1) != 0) { -+ // Avail -+ *d-- = *s; -+ s -= stride; -+ *d-- = *s; -+ s -= stride; -+ *d-- = *s; -+ s -= stride; -+ *d-- = a = *s; -+ s -= stride; -+ } -+ else -+ { -+ *d-- = a; -+ *d-- = a; -+ *d-- = a; -+ *d-- = a; -+ s -= stride * 4; -+ } -+ } -+ // UL -+ *d = a = (x & 1) != 0 ? 
*s : a; -+ } -+ -+ // U -+ { -+ pixel * d = top; -+ const pixel * s = src_u; -+ unsigned int x = avail_u; -+ -+ for (i = 0; i < size; i += 4, x >>= 1) -+ { -+ if ((x & 1) != 0) { -+ // Avail -+ *d++ = *s++; -+ *d++ = *s++; -+ *d++ = *s++; -+ *d++ = a = *s++; -+ } -+ else -+ { -+ *d++ = a; -+ *d++ = a; -+ *d++ = a; -+ *d++ = a; -+ s += 4; -+ } -+ } -+ -+ // UR -+ s = src_ur; -+ for (i = 0; i < size; i += 4, x >>= 1) -+ { -+ if ((x & 1) != 0) { -+ // Avail -+ *d++ = *s++; -+ *d++ = *s++; -+ *d++ = *s++; -+ *d++ = a = *s++; -+ } -+ else -+ { -+ *d++ = a; -+ *d++ = a; -+ *d++ = a; -+ *d++ = a; -+ s += 4; -+ } -+ } -+ } -+} -+ -+ -+#if !PRED_C && PW == 1 -+#define EXTEND(ptr, val, len) extend_8(ptr, val, len) -+#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1) -+#define EXTEND(ptr, val, len) extend_16(ptr, val, len) -+#else -+#define EXTEND(ptr, val, len) extend_32(ptr, val, len) -+#endif -+ -+// Reqs: -+// -+// Planar: DL[0], L, ul, U, UR[0] -+// DC: dl, L, ul, U, ur -+// A2-9: DL, L, ul, u, ur -+// A10: dl, L, ul, u, ur -+// A11-17 dl, L, UL, U, ur -+// A18-25 dl, L, Ul, U, ur -+// A26 dl, l, ul, U, ur -+// A27-34 dl, l, ul, U, UR -+ -+#ifndef INCLUDED_ONCE -+ -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; -+ -+static const uint8_t req_avail_c[35] = -+{ -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L, // 2 -+ AVAIL_DL | AVAIL_L, // 3 -+ AVAIL_DL | AVAIL_L, // 4 -+ AVAIL_DL | AVAIL_L, // 5 -+ AVAIL_DL | AVAIL_L, // 6 -+ AVAIL_DL | AVAIL_L, // 7 -+ AVAIL_DL | AVAIL_L, // 8 -+ AVAIL_DL | AVAIL_L, // 9 -+ AVAIL_L, // 10 (H) -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 -+ AVAIL_L 
| AVAIL_UL | AVAIL_U, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 -+ AVAIL_U, // 26 (V) -+ AVAIL_U | AVAIL_UR, // 27 -+ AVAIL_U | AVAIL_UR, // 28 -+ AVAIL_U | AVAIL_UR, // 29 -+ AVAIL_U | AVAIL_UR, // 30 -+ AVAIL_U | AVAIL_UR, // 31 -+ AVAIL_U | AVAIL_UR, // 32 -+ AVAIL_U | AVAIL_UR, // 33 -+ AVAIL_U | AVAIL_UR // 34 -+}; -+ -+static const uint8_t req_avail[4][35] = { -+{ -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L, // 2 -+ AVAIL_DL | AVAIL_L, // 3 -+ AVAIL_DL | AVAIL_L, // 4 -+ AVAIL_DL | AVAIL_L, // 5 -+ AVAIL_DL | AVAIL_L, // 6 -+ AVAIL_DL | AVAIL_L, // 7 -+ AVAIL_DL | AVAIL_L, // 8 -+ AVAIL_DL | AVAIL_L, // 9 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H) -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V) -+ AVAIL_U | AVAIL_UR, // 27 -+ AVAIL_U | AVAIL_UR, // 28 -+ AVAIL_U | AVAIL_UR, // 29 -+ AVAIL_U | AVAIL_UR, // 30 -+ AVAIL_U | AVAIL_UR, // 31 -+ AVAIL_U | AVAIL_UR, // 32 -+ AVAIL_U | AVAIL_UR, // 33 -+ AVAIL_U | AVAIL_UR // 34 -+}, -+{ // 3 -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only 
needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 -+ AVAIL_DL | AVAIL_L | 0, // 3 -+ AVAIL_DL | AVAIL_L | 0, // 4 -+ AVAIL_DL | AVAIL_L | 0, // 5 -+ AVAIL_DL | AVAIL_L | 0, // 6 -+ AVAIL_DL | AVAIL_L | 0, // 7 -+ AVAIL_DL | AVAIL_L | 0, // 8 -+ AVAIL_DL | AVAIL_L | 0, // 9 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) -+ AVAIL_U | AVAIL_UR | 0, // 27 -+ AVAIL_U | AVAIL_UR | 0, // 28 -+ AVAIL_U | AVAIL_UR | 0, // 29 -+ AVAIL_U | AVAIL_UR | 0, // 30 -+ AVAIL_U | AVAIL_UR | 0, // 31 -+ AVAIL_U | AVAIL_UR | 0, // 32 -+ AVAIL_U | AVAIL_UR | 0, // 33 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 -+}, -+{ // 4 -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8 -+ AVAIL_DL | AVAIL_L | 0, // 9 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 
FILTER_LIGHT, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 22 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 24 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) -+ AVAIL_U | AVAIL_UR | 0, // 27 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 -+}, -+{ // 5 -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9 -+ AVAIL_L | 0, // 10 (H) -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 
FILTER_EITHER, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25 -+ AVAIL_U | 0, // 26 (V) -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER // 34 -+} -+}; -+ -+ -+#endif -+ -+#define filter_light1 FUNC(filter_light1) -+static inline pixel filter_light1(pixel a, pixel b, pixel c) -+{ -+ return (a + b*2 + c + 2) >> 2; -+} -+ -+#define filter_light FUNC(filter_light) -+static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n) -+{ -+ pixel p0; -+ pixel p2 = *src; -+ // Allow for final pel - it is just clearer to to have the call take the actual number of output pels -+ unsigned int n_minus_1 = n - 1; -+ -+ do -+ { -+ src += sstride; -+ p0 = p1; -+ p1 = p2; -+ p2 = *src; -+ *dst++ = filter_light1(p0, p1, p2); -+ } while (--n_minus_1 != 0); -+ *dst = filter_light1(p1, p2, pn); -+} -+ -+#define filter_strong FUNC(filter_strong) -+static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n) -+{ -+ unsigned int a = 64 * p0 + 32; -+ const int v = p1 - p0; -+ -+ do -+ { -+ *dst++ = (a += v) >> 6; -+ } while (--n != 0); -+} -+ -+#define intra_filter FUNC(intra_filter) -+static av_always_inline void intra_filter( -+ pixel * const left, pixel * const top, -+ const unsigned int req, const unsigned int avail, -+ const pixel * 
const src_l, const pixel * const src_u, const pixel * const src_ur, -+ const unsigned int stride, -+ const unsigned int top_right_size, const unsigned int down_left_size, -+ const unsigned int log2_size) -+{ -+ const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5); -+ const unsigned int size = 1 << log2_size; -+ -+ // a_ is the first pel in a section working round dl -> ur -+ // b_ is the last -+ // Beware that top & left work out from UL so usage of a_ & b_ may -+ // swap between them. It is a bad naming scheme but I have found no -+ // better -+ const pixel * a_dl = src_l + (down_left_size + size - 1) * stride; -+ const pixel * b_dl = src_l + size * stride; -+ const pixel * a_l = src_l + (size - 1) * stride; -+ const pixel * b_l = src_l; -+ const pixel * ab_ul = src_l - stride; -+ const pixel * a_u = src_u; -+ const pixel * b_u = src_u + size - 1; -+ const pixel * a_ur = src_ur; -+ const pixel * b_ur = src_ur + top_right_size - 1; -+ -+ const unsigned int want = req & ~avail; -+ const unsigned int have = req & avail; -+ unsigned int i; -+ -+ if ((avail & AVAIL_DL) == 0) -+ { -+ a_dl = a_ur; -+ if ((avail & AVAIL_U) != 0) -+ a_dl = a_u; -+ if ((avail & AVAIL_UL) != 0) -+ a_dl = ab_ul; -+ if ((avail & AVAIL_L) != 0) -+ a_dl = a_l; -+ b_dl = a_dl; -+ } -+ -+ if ((avail & AVAIL_L) == 0) -+ { -+ a_l = b_dl; -+ b_l = b_dl; -+ } -+ if ((avail & AVAIL_UL) == 0) -+ { -+ ab_ul = b_l; -+ } -+ if ((avail & AVAIL_U) == 0) -+ { -+ a_u = ab_ul; -+ b_u = ab_ul; -+ } -+ if ((avail & AVAIL_UR) == 0) -+ { -+ a_ur = b_u; -+ b_ur = b_u; -+ } -+ -+ if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints -+ { -+ if ((req & AVAIL_UL) != 0) -+ left[-1] = *ab_ul; -+ -+ if ((want & AVAIL_L) != 0) -+ EXTEND(left, *a_l, size); -+ if ((want & AVAIL_DL) != 0) -+ EXTEND(left + size, *a_dl, size); -+ if ((want & AVAIL_U) != 0) -+ EXTEND(top, *a_u, size); -+ if ((want & AVAIL_UR) != 0) -+ EXTEND(top + size, *a_ur, size); -+ -+ if ((have & AVAIL_U) 
!= 0) -+ // Always good - even with sand -+ memcpy(top, a_u, size * sizeof(pixel)); -+ if ((have & AVAIL_UR) != 0) -+ { -+ memcpy(top + size, a_ur, top_right_size * sizeof(pixel)); -+ EXTEND(top + size + top_right_size, *b_ur, -+ size - top_right_size); -+ } -+ if ((have & AVAIL_L) != 0) -+ { -+ for (i = 0; i < size; i++) -+ left[i] = b_l[stride * i]; -+ } -+ if ((have & AVAIL_DL) != 0) -+ { -+ for (i = 0; i < down_left_size; i++) -+ left[i + size] = b_dl[stride * i]; -+ EXTEND(left + size + down_left_size, *a_dl, -+ size - down_left_size); -+ } -+ } -+ else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint -+ FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold && -+ FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold) -+ { -+ if ((req & (AVAIL_U | AVAIL_UR)) != 0) -+ filter_strong(top, *ab_ul, *b_ur, size * 2); -+ left[-1] = *ab_ul; -+ if ((req & (AVAIL_L | AVAIL_DL)) != 0) -+ filter_strong(left, *ab_ul, *a_dl, size*2); -+ } -+ else -+ { -+ // Same code for both have & want for UL -+ if ((req & AVAIL_UL) != 0) -+ { -+ left[-1] = filter_light1(*b_l, *ab_ul, *a_u); -+ } -+ -+ if ((want & AVAIL_L) != 0) -+ { -+ EXTEND(left, *a_l, size); -+ left[0] = (*a_l * 3 + *ab_ul + 2) >> 2; -+ } -+ if ((want & AVAIL_DL) != 0) -+ { -+ // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding -+ EXTEND(left + size, *a_l, size); -+ } -+ if ((want & AVAIL_U) != 0) -+ { -+ EXTEND(top, *a_u, size); -+ top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2; -+ } -+ if ((want & AVAIL_UR) != 0) -+ { -+ // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding -+ EXTEND(top + size, *a_ur, size); -+ } -+ -+ if ((have & AVAIL_U) != 0) -+ { -+ filter_light(top, *ab_ul, a_u, *a_ur, 1, size); -+ } -+ if ((have & AVAIL_UR) != 0) { -+ filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size); -+ top[size*2 - 1] = *b_ur; -+ EXTEND(top + size + top_right_size, *b_ur, size - top_right_size); -+ } -+ if ((have & AVAIL_L) != 
0) -+ { -+ filter_light(left, *ab_ul, b_l, *b_dl, stride, size); -+ } -+ if ((have & AVAIL_DL) != 0) -+ { -+ filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size); -+ left[size*2 - 1] = *a_dl; -+ EXTEND(left + size + down_left_size, *a_dl, size - down_left_size); -+ } -+ } -+} -+ -+#define INTRA_FILTER(log2_size) \ -+static void FUNC(intra_filter_ ## log2_size)( \ -+ uint8_t * const left, uint8_t * const top, \ -+ const unsigned int req, const unsigned int avail, \ -+ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \ -+ const unsigned int stride, \ -+ const unsigned int top_right_size, const unsigned int down_left_size) \ -+{ \ -+ intra_filter((pixel *)left, (pixel *)top, req, avail, \ -+ (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \ -+} -+ -+INTRA_FILTER(2) -+INTRA_FILTER(3) -+INTRA_FILTER(4) -+INTRA_FILTER(5) -+ -+#undef intra_filter -+#undef INTRA_FILTER -+ -+static void FUNC(intra_pred)(const HEVCRpiContext * const s, -+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail, -+ const unsigned int log2_size) -+{ -+ // c_idx will alaways be 1 for _c versions and 0 for y -+ const unsigned int c_idx = PRED_C; -+ const unsigned int hshift = ctx_hshift(s, c_idx); -+ const unsigned int vshift = ctx_vshift(s, c_idx); -+ const unsigned int size = (1 << log2_size); -+ const unsigned int x = x0 >> hshift; -+ const unsigned int y = y0 >> vshift; -+ -+ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel); -+ pixel *const src = c_idx == 0 ? 
-+ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : -+ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); -+ -+ // Align so we can do multiple loads in the asm -+ // Padded to 16 byte boundary so as not to confuse anything -+ DECLARE_ALIGNED(16, pixel, top[2 * MAX_TB_SIZE]); -+ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); -+ -+ pixel * const left = left_array + 16 / sizeof(pixel); -+ const pixel * top_pred = top; -+ -+ const pixel * src_l = src - 1; -+ const pixel * src_u = src - stride; -+ const pixel * src_ur = src_u + size; -+#if !PRED_C -+ const unsigned int req = req_avail[log2_size - 2][mode] & ~s->ps.sps->intra_filters_disable; -+#else -+ const unsigned int req = req_avail_c[mode]; -+#endif -+ -+ // If we have nothing to pred from then fill with grey -+ // This isn't a common case but dealing with it here means we don't have to -+ // test for it later -+ if (avail == 0) -+ { -+dc_only: -+#if !PRED_C -+ s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride); -+#else -+ s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride); -+#endif -+ return; -+ } -+ -+ { -+ // N.B. 
stride is in pixels (not bytes) or in the case of chroma pixel-pairs -+ const AVFrame * const frame = s->frame; -+ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 -+ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; -+ if ((x & mask) == 0) -+ src_l -= stripe_adj; -+ if (((x + size) & mask) == 0) -+ src_ur += stripe_adj; -+ } -+ -+ // Can deal with I-slices in 'normal' code even if CIP -+ // This also means that we don't need to generate (elsewhere) is_intra -+ // for IRAP frames -+ if (s->ps.pps->constrained_intra_pred_flag == 1 && -+ s->sh.slice_type != HEVC_SLICE_I) -+ { -+ // * If we ever actually care about CIP performance then we should -+ // special case out size 4 stuff (can be done by 'normal') and -+ // have 8-pel avail masks -+ unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)), -+ -(int)(s->ps.sps->pcm_width), -+ 1 << (((x - 1) >> (3 - hshift)) & 7), -+ 1 - hshift, -+ avail, -+ size, -+ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size), -+ vshift != 0 ? 0 : (y >> 2) & 1); -+ -+ unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)), -+ (x >> (3 - hshift)) & 7, -+ 1 - hshift, -+ avail, -+ size, -+ FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size), -+ hshift != 0 ? 0 : (x >> 2) & 1); -+ -+ // Anything left? 
-+ if ((avail_l | avail_u) == 0) -+ goto dc_only; -+ -+ FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size); -+ -+#if !PRED_C -+ if ((req & FILTER_LIGHT) != 0) -+ { -+ const unsigned threshold = 1 << (BIT_DEPTH - 5); -+ if ((req & FILTER_STRONG) != 0 && -+ (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold && -+ (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold) -+ { -+ filter_strong(top, left[-1], top[63], 64); -+ filter_strong(left, left[-1], left[63], 64); -+ } else -+ { -+ // LHS writes UL too so copy for top -+ const pixel p_ul = left[-1]; -+ filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size); -+ filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1); -+ } -+ } -+#endif -+ } -+ else -+ { -+ const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size); -+ if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 && -+ ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size)) -+ { -+ top_pred = src_u; -+ } -+ else -+ { -+#if !PRED_C -+ s->hpc.intra_filter[log2_size - 2] -+#else -+ s->hpc.intra_filter_c[log2_size - 2] -+#endif -+ ((uint8_t *)left, (uint8_t *)top, req, avail, -+ (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel), -+ ur_size, -+ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size)); -+ } -+ } -+ -+ -+#if !PRED_C -+ switch (mode) { -+ case INTRA_PLANAR: -+ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride); -+ break; -+ case INTRA_DC: -+ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride); -+ break; -+ case INTRA_ANGULAR_HORIZONTAL: -+ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ case INTRA_ANGULAR_VERTICAL: -+ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ default: 
-+ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ } -+#else -+ switch (mode) { -+ case INTRA_PLANAR: -+ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride); -+ break; -+ case INTRA_DC: -+ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride); -+ break; -+ case INTRA_ANGULAR_HORIZONTAL: -+ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ case INTRA_ANGULAR_VERTICAL: -+ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ default: -+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ } -+ -+#if DUMP_PRED -+ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); -+ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); -+ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); -+ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); -+#endif -+#endif -+} -+ -+#if !PRED_C -+static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, -+ const uint8_t *_left, ptrdiff_t stride, -+ int trafo_size) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ const pixel *top = (const pixel *)_top; -+ const pixel *left = (const pixel *)_left; -+ int size = 1 << trafo_size; -+ for (y = 0; y < size; y++) -+ for (x = 0; x < size; x++) -+ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + -+ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); -+} -+#else -+static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, -+ const uint8_t * _left, ptrdiff_t stride, -+ int trafo_size) -+{ -+ int x, y; -+ int size = 1 << trafo_size; -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ const c_src_ptr_t top = (c_src_ptr_t)_top; -+ const c_src_ptr_t 
left = (c_src_ptr_t)_left; -+ -+ for (y = 0; y < size; y++, src += stride) -+ { -+ for (x = 0; x < size; x++) -+ { -+ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + -+ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); -+ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + -+ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); -+ } -+ } -+} -+#endif -+ -+#define PRED_PLANAR(size)\ -+static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ -+ const uint8_t *left, ptrdiff_t stride) \ -+{ \ -+ FUNC(pred_planar)(src, top, left, stride, size + 2); \ -+} -+ -+PRED_PLANAR(0) -+PRED_PLANAR(1) -+PRED_PLANAR(2) -+PRED_PLANAR(3) -+ -+#undef PRED_PLANAR -+ -+#if !PRED_C -+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, int log2_size) -+{ -+ int i, j, x, y; -+ int size = (1 << log2_size); -+ pixel *src = (pixel *)_src; -+ const pixel *top = (const pixel *)_top; -+ const pixel *left = (const pixel *)_left; -+ int dc = size; -+ pixel4 a; -+ for (i = 0; i < size; i++) -+ dc += left[i] + top[i]; -+ -+ dc >>= log2_size + 1; -+ -+ a = PIXEL_SPLAT_X4(dc); -+ -+ for (i = 0; i < size; i++) -+ for (j = 0; j < size; j+=4) -+ AV_WN4P(&POS(j, i), a); -+ -+// if (c_idx == 0 && size < 32) -+// As we now have separate fns for y & c - no need to test that -+ if (size < 32) -+ { -+ POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2; -+ for (x = 1; x < size; x++) -+ POS(x, 0) = (top[x] + 3 * dc + 2) >> 2; -+ for (y = 1; y < size; y++) -+ POS(0, y) = (left[y] + 3 * dc + 2) >> 2; -+ } -+} -+#else -+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, int log2_size) -+{ -+ unsigned int i, j; -+ const unsigned int size = (1 << log2_size); -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ const c_src_ptr_t top = (c_src_ptr_t)_top; -+ const c_src_ptr_t left = (c_src_ptr_t)_left; -+ 
unsigned int dc0 = size; -+ unsigned int dc1 = size; -+ -+ for (i = 0; i < size; i++) -+ { -+ dc0 += left[i][0] + top[i][0]; -+ dc1 += left[i][1] + top[i][1]; -+ } -+ -+ dc0 >>= log2_size + 1; -+ dc1 >>= log2_size + 1; -+ -+ for (i = 0; i < size; i++, src += stride) -+ { -+ for (j = 0; j < size; ++j) -+ { -+ src[j][0] = dc0; -+ src[j][1] = dc1; -+ -+ } -+ } -+} -+#endif -+ -+#define PRED_DC(size)\ -+static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \ -+ const uint8_t *left, ptrdiff_t stride) \ -+{ \ -+ FUNC(pred_dc)(src, top, left, stride, size + 2); \ -+} -+ -+PRED_DC(0) -+PRED_DC(1) -+PRED_DC(2) -+PRED_DC(3) -+ -+#undef PRED_DC -+ -+ -+ -+ -+#if !PRED_C -+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) -+{ -+ int i, j; -+ int size = (1 << log2_size); -+ pixel *src = (pixel *)_src; -+ pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1)); -+ -+ for (i = 0; i < size; i++) -+ for (j = 0; j < size; j+=4) -+ AV_WN4P(&POS(j, i), a); -+} -+#else -+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) -+{ -+ unsigned int i, j; -+ const unsigned int size = (1 << log2_size); -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ const pixel a = (1 << (BIT_DEPTH - 1)); -+ -+ for (i = 0; i < size; i++, src += stride) -+ { -+ for (j = 0; j < size; ++j) -+ { -+ src[j][0] = a; -+ src[j][1] = a; -+ } -+ } -+} -+#endif -+ -+#define PRED_DC0(size)\ -+static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride) \ -+{ \ -+ FUNC(pred_dc0)(src, stride, size + 2); \ -+} -+ -+PRED_DC0(0) -+PRED_DC0(1) -+PRED_DC0(2) -+PRED_DC0(3) -+ -+#undef PRED_DC0 -+ -+ -+ -+ -+#ifndef ANGLE_CONSTS -+#define ANGLE_CONSTS -+static const int intra_pred_angle[] = { -+ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, -+ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 -+}; -+static const int inv_angle[] = { -+ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, -+ -630, -910, -1638, -4096 -+}; 
-+#endif -+ -+#if !PRED_C -+static av_always_inline void FUNC(pred_angular)(uint8_t *_src, -+ const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, -+ int mode, int size) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ const pixel *top = (const pixel *)_top; -+ const pixel *left = (const pixel *)_left; -+ -+ int angle = intra_pred_angle[mode - 2]; -+ pixel ref_array[3 * MAX_TB_SIZE + 4]; -+ pixel *ref_tmp = ref_array + size; -+ const pixel *ref; -+ int last = (size * angle) >> 5; -+ -+ if (mode >= 18) { -+ ref = top - 1; -+ -+ if (angle < 0) -+ { -+ memcpy(ref_tmp + 1, top, size * PW); -+ ref_tmp[0] = left[-1]; -+ -+ for (x = last; x <= -1; x++) -+ ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; -+ ref = ref_tmp; -+ } -+ -+ for (y = 0; y < size; y++) { -+ int idx = ((y + 1) * angle) >> 5; -+ int fact = ((y + 1) * angle) & 31; -+ if (fact) { -+ for (x = 0; x < size; x += 4) { -+ POS(x , y) = ((32 - fact) * ref[x + idx + 1] + -+ fact * ref[x + idx + 2] + 16) >> 5; -+ POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] + -+ fact * ref[x + 1 + idx + 2] + 16) >> 5; -+ POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] + -+ fact * ref[x + 2 + idx + 2] + 16) >> 5; -+ POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] + -+ fact * ref[x + 3 + idx + 2] + 16) >> 5; -+ } -+ } else { -+ for (x = 0; x < size; x += 4) -+ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1])); -+ } -+ } -+ if (mode == 26 && size < 32) { -+ for (y = 0; y < size; y++) -+ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1)); -+ } -+ -+ } else { -+ ref = left - 1; -+ if (angle < 0 && last < -1) { -+ for (x = 0; x <= size; x += 4) -+ AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1])); -+ // Inv angle <= -256 so top offset >= 0 -+ for (x = last; x <= -1; x++) -+ ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; -+ ref = ref_tmp; -+ } -+ -+ for (x = 0; x < size; x++) { -+ int idx = ((x + 1) * angle) >> 5; -+ int fact = ((x + 1) * angle) & 31; -+ if (fact) { 
-+ for (y = 0; y < size; y++) { -+ POS(x, y) = ((32 - fact) * ref[y + idx + 1] + -+ fact * ref[y + idx + 2] + 16) >> 5; -+ } -+ } else { -+ for (y = 0; y < size; y++) -+ POS(x, y) = ref[y + idx + 1]; -+ } -+ } -+ if (mode == 10 && size < 32) { -+ for (x = 0; x < size; x += 4) { -+ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - left[-1]) >> 1)); -+ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - left[-1]) >> 1)); -+ POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - left[-1]) >> 1)); -+ POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - left[-1]) >> 1)); -+ } -+ } -+ } -+} -+#else -+static av_always_inline void FUNC(pred_angular)(uint8_t *_src, -+ const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, -+ int mode, int size) -+{ -+ int x, y; -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ c_src_ptr_t top = (c_src_ptr_t)_top; -+ c_src_ptr_t left = (c_src_ptr_t)_left; -+ -+ const int angle = intra_pred_angle[mode - 2]; -+ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; -+ c_dst_ptr_t ref_tmp = ref_array + size; -+ c_src_ptr_t ref; -+ const int last = (size * angle) >> 5; -+ -+ if (mode >= 18) { -+ ref = top - 1; -+ if (angle < 0) { -+ memcpy(ref_tmp + 1, top, size * 2 * PW); -+ ref_tmp[0][0] = left[-1][0]; -+ ref_tmp[0][1] = left[-1][1]; -+ for (x = last; x <= -1; x++) -+ { -+ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; -+ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; -+ } -+ ref = (c_src_ptr_t)ref_tmp; -+ } -+ -+ for (y = 0; y < size; y++, src += stride) { -+ const int idx = ((y + 1) * angle) >> 5; -+ const int fact = ((y + 1) * angle) & 31; -+ if (fact) { -+ for (x = 0; x < size; ++x) { -+ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + -+ fact * ref[x + idx + 2][0] + 16) >> 5; -+ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + -+ fact * ref[x + idx + 2][1] + 16) >> 5; -+ } -+ } else { -+ memcpy(src, ref + idx + 1, size * 2 * PW); -+ } -+ } -+ } else { -+ ref = left - 1; -+ if (angle < 0 && last < 
-1) { -+ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); -+ for (x = last; x <= -1; x++) -+ { -+ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; -+ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; -+ } -+ ref = (c_src_ptr_t)ref_tmp; -+ } -+ -+ for (x = 0; x < size; x++, src++) { -+ const int idx = ((x + 1) * angle) >> 5; -+ const int fact = ((x + 1) * angle) & 31; -+ if (fact) { -+ for (y = 0; y < size; y++) { -+ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + -+ fact * ref[y + idx + 2][0] + 16) >> 5; -+ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + -+ fact * ref[y + idx + 2][1] + 16) >> 5; -+ } -+ } else { -+ for (y = 0; y < size; y++) -+ { -+ src[y * stride][0] = ref[y + idx + 1][0]; -+ src[y * stride][1] = ref[y + idx + 1][1]; -+ } -+ } -+ } -+ } -+} -+#endif -+ -+static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, -+ ptrdiff_t stride, int mode) -+{ -+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2); -+} -+ -+static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, -+ ptrdiff_t stride, int mode) -+{ -+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3); -+} -+ -+static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, -+ ptrdiff_t stride, int mode) -+{ -+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4); -+} -+ -+static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, -+ ptrdiff_t stride, int mode) -+{ -+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5); -+} -+ -+#undef cpel -+#undef c_src_ptr_t -+#undef c_dst_ptr_t -+ -+#undef EXTEND -+#undef POS -+#undef PW -+ -+#undef filter_light1 -+#undef filter_light -+#undef filter_strong -+#undef ref_gen -+ -+#ifndef INCLUDED_ONCE -+#define INCLUDED_ONCE -+#endif -+ -diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c -new file mode 100644 -index 0000000000..98a0b104b7 ---- 
/dev/null -+++ b/libavcodec/rpi_mailbox.c -@@ -0,0 +1,155 @@ -+/* -+Copyright (c) 2012, Broadcom Europe Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#define MAJOR_NUM 100 -+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *) -+#define DEVICE_FILE_NAME "/dev/vcio" -+ -+#include "rpi_mailbox.h" -+//#include -+ -+/* -+ * use ioctl to send mbox property message -+ */ -+ -+static int mbox_property(int file_desc, void *buf) -+{ -+ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf); -+ -+ if (ret_val < 0) { -+ printf("ioctl_set_msg failed:%d\n", ret_val); -+ } -+ -+#ifdef DEBUG -+ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf; -+ for (i=0; i -+#include -+#include -+#include -+ -+#include "config.h" -+ -+#include "libavutil/avassert.h" -+#include "libavutil/rpi_sand_fns.h" -+ -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" -+#include -+#include -+#include -+#pragma GCC diagnostic pop -+ -+#include "rpi_mem.h" -+#include "rpi_zc_frames.h" -+ -+ -+#define OPT_PREFER_CMA 0 -+ -+struct rpi_cache_flush_env_s { -+ struct vcsm_user_clean_invalid2_s v; -+}; -+ -+ -+// GPU memory alloc fns (internal) -+ -+static void gpu_free_internal(GPU_MEM_PTR_T * const p) -+{ -+ if (p->arm != NULL) -+ vcsm_unlock_ptr(p->arm); -+ if (p->vcsm_handle != 0) -+ vcsm_free(p->vcsm_handle); -+ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again -+} -+ -+ -+static int gpu_malloc_internal(GPU_MEM_PTR_T * const p, -+ const int numbytes, const unsigned int cache_type, const char * const name) -+{ -+ memset(p, 0, sizeof(*p)); -+ p->numbytes = (numbytes + 255) & ~255; // Round up -+ -+ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to alloc %d bytes from VCSM for %s\n", p->numbytes, name); -+ goto fail; -+ } -+ if ((p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to 
VC handle from VCSM for %s\n", name); -+ goto fail; -+ } -+ if ((p->arm = vcsm_lock(p->vcsm_handle)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to lock handle from VCSM for %s\n", name); -+ goto fail; -+ } -+ if ((p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to get VC addr from VCSM for %s\n", name); -+ goto fail; -+ } -+ -+ return 0; -+ -+fail: -+ gpu_free_internal(p); -+ return AVERROR(ENOMEM); -+} -+ -+// Public gpu fns -+ -+// Allocate memory on GPU -+// Fills in structure

containing ARM pointer, videocore handle, videocore memory address, numbytes -+// Returns 0 on success. -+// This allocates memory that will not be cached in ARM's data cache. -+// Therefore safe to use without data cache flushing. -+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) -+{ -+ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, "ffmpeg uncached"); -+} -+ -+// This allocates data that will be -+// Cached in ARM L2 -+// Uncached in VPU L2 -+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) -+{ -+ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, "ffmpeg cached"); -+} -+ -+void gpu_free(GPU_MEM_PTR_T * const p) { -+ gpu_free_internal(p); -+} -+ -+void rpi_mem_gpu_uninit(void) -+{ -+ vcsm_exit(); -+ bcm_host_deinit(); -+} -+ -+int rpi_mem_gpu_init(const unsigned int flags) -+{ -+ const int wants_cma = bcm_host_is_fkms_active(); -+ int use_cma; -+ -+ (void)flags; -+ -+ if (vcsm_init_ex(wants_cma ? 1 : 0, -1) == 0) -+ use_cma = 1; -+ else if (vcsm_init_ex(wants_cma ? 
0 : 1, -1) == 0) -+ use_cma = 0; -+ else -+ return AVERROR(EINVAL); -+ -+ bcm_host_init(); -+ -+ return use_cma + 1; -+} -+ -+// ---------------------------------------------------------------------------- -+// -+// Cache flush functions -+ -+#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s)) -+ -+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf) -+{ -+ rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf; -+ *rfe = (rpi_cache_flush_env_t){.v={.op_count = 0}}; -+ return rfe; -+} -+ -+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) -+{ -+ // Nothing needed -+} -+ -+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe) -+{ -+ int rc = 0; -+ if (rfe->v.op_count != 0) { -+ if (vcsm_clean_invalid2(&rfe->v) != 0) -+ { -+ const int err = errno; -+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", err); -+ rc = AVERROR(err); -+ } -+ rfe->v.op_count = 0; -+ } -+ return rc; -+} -+ -+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) -+{ -+ int rc = rpi_cache_flush_execute(rfe);; -+ -+ return rc; -+} -+ -+inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, -+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) -+{ -+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; -+ -+ av_assert1(rfe->v.op_count <= CACHE_EL_MAX); -+ -+ b->invalidate_mode = mode; -+ b->block_count = blocks; -+ b->start_address = gm->arm + offset0; -+ b->block_size = block_size; -+ b->inter_block_stride = block_stride; -+} -+ -+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, -+ const unsigned int offset, const unsigned int size) -+{ -+ // Deal with empty pointer trivially -+ 
if (gm == NULL || size == 0) -+ return; -+ -+ av_assert1(offset <= gm->numbytes); -+ av_assert1(size <= gm->numbytes); -+ av_assert1(offset + size <= gm->numbytes); -+ -+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); -+} -+ -+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) -+{ -+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); -+} -+ -+ -+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) -+{ -+#if !RPI_ONE_BUF -+#error Fixme! (NIF) -+#endif -+ if (gpu_is_buf1(frame)) { -+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode); -+ } -+ else -+ { -+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode); -+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode); -+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode); -+ } -+} -+ -+// Flush an area of a frame -+// Width, height, x0, y0 in luma pels -+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, -+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, -+ const unsigned int uv_shift, const int do_luma, const int do_chroma) -+{ -+ const unsigned int y_offset = frame->linesize[0] * y0; -+ const unsigned int y_size = frame->linesize[0] * height; -+ // Round UV up/down to get everything -+ const unsigned int uv_rnd = (1U << uv_shift) >> 1; -+ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); -+ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; -+ -+#if 0 -+ // *** frame->height is cropped height so not good -+ // As all unsigned they will also reject -ve -+ // Test individually as well as added to reject overflow -+ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped -+ av_assert0(n 
<= (unsigned int)frame->height); -+ av_assert0(start_line + n <= (unsigned int)frame->height); -+#endif -+ -+ if (!gpu_is_buf1(frame)) -+ { -+ if (do_luma) { -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); -+ } -+ if (do_chroma) { -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); -+ } -+ } -+ else if (!av_rpi_is_sand_frame(frame)) -+ { -+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); -+ if (do_luma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size); -+ } -+ if (do_chroma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size); -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size); -+ } -+ } -+ else -+ { -+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); -+ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); -+ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); -+ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C -+ av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); -+ -+ if (do_chroma) -+ { -+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; -+ b->invalidate_mode = mode; -+ b->block_count = block_count; -+ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); -+ b->block_size = uv_size; -+ b->inter_block_stride = stride1 * stride2; -+ } -+ if (do_luma) -+ { -+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; -+ b->invalidate_mode = mode; -+ b->block_count = block_count; -+ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); -+ b->block_size = y_size; -+ b->inter_block_stride = stride1 * stride2; -+ } -+ } 
-+} -+ -+// Call this to clean and invalidate a region of memory -+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode) -+{ -+ rpi_cache_buf_t cbuf; -+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); -+ rpi_cache_flush_add_gm_ptr(rfe, p, mode); -+ rpi_cache_flush_finish(rfe); -+} -+ -diff --git a/libavcodec/rpi_mem.h b/libavcodec/rpi_mem.h -new file mode 100644 -index 0000000000..a451079806 ---- /dev/null -+++ b/libavcodec/rpi_mem.h -@@ -0,0 +1,88 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox, Ben Avison -+*/ -+ -+#ifndef RPI_MEM_H -+#define RPI_MEM_H -+ -+typedef struct gpu_mem_ptr_s { -+ unsigned char *arm; // Pointer to memory mapped on ARM side -+ int vc_handle; // Videocore handle of relocatable memory -+ int vcsm_handle; // Handle for use by VCSM -+ int vc; // Address for use in GPU code -+ int numbytes; // Size of memory block -+} GPU_MEM_PTR_T; -+ -+// General GPU functions -+ -+#define GPU_INIT_GPU 1 -+#define GPU_INIT_CMA 2 -+ -+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p); -+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p); -+extern void gpu_free(GPU_MEM_PTR_T * const p); -+int rpi_mem_gpu_init(const unsigned int flags); -+void rpi_mem_gpu_uninit(void); -+ -+// Cache flush stuff -+ -+struct rpi_cache_flush_env_s; -+typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t; -+ -+typedef struct {uint32_t t[33];} rpi_cache_buf_t; -+ -+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf); -+// Free env without flushing -+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe); -+// Do the accumulated flush & clear but do not free the env -+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe); -+// Do the accumulated flush & free the env -+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe); -+ -+typedef enum -+{ -+ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1, -+ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2, -+ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3 
-+} rpi_cache_flush_mode_t; -+ -+struct AVFrame; -+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); -+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, -+ const unsigned int offset, const unsigned int size); -+void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, -+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); -+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode); -+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode, -+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, -+ const unsigned int uv_shift, const int do_luma, const int do_chroma); -+ -+// init, add, finish for one gm ptr -+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); -+ -+#endif -diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c -new file mode 100644 -index 0000000000..cb7b96119e ---- /dev/null -+++ b/libavcodec/rpi_qpu.c -@@ -0,0 +1,776 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. 
-+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox -+*/ -+ -+ -+#include -+#include -+#include -+#include -+#include -+#include "libavutil/avassert.h" -+ -+#include "config.h" -+ -+#include -+#include -+ -+#include -+ -+#include "rpi_mailbox.h" -+#include "rpi_mem.h" -+#include "rpi_qpu.h" -+#include "rpi_hevc_shader.h" -+#include "rpi_hevc_transform8.h" -+#include "rpi_hevc_transform10.h" -+#include "libavutil/rpi_sand_fns.h" -+ -+// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No) -+#define RPI_TRACE_TIME_VPU_QPU_WAIT 0 -+ -+// Add profile flags to all QPU requests - generates output in "vcdbg log msg" -+// Beware this is expensive and will probably throw off all other timing by >10% -+#define RPI_TRACE_QPU_PROFILE_ALL 0 -+ -+// QPU "noflush" flags -+// a mixture of flushing & profiling -+ -+#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed -+#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers -+#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results 
-+#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling -+#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) -+ -+#define vcos_verify_ge0(x) ((x)>=0) -+ -+// Size in 32bit words -+#define QPU_CODE_SIZE 4098 -+#define VPU_CODE_SIZE 16384 -+ -+static const short rpi_transMatrix2even[32][16] = { // Even rows first -+{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, -+{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90}, -+{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89}, -+{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87}, -+{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83}, -+{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80}, -+{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75}, -+{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70}, -+{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64}, -+{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57}, -+{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50}, -+{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43}, -+{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36}, -+{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25}, -+{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18}, -+{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9}, -+// Odd rows -+{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4}, -+{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13}, -+{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22}, -+{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31}, -+{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38}, -+{78, -4, -82, -73, 13, 85, 67, 
-22, -88, -61, 31, 90, 54, -38, -90, -46}, -+{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54}, -+{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61}, -+{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67}, -+{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73}, -+{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78}, -+{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82}, -+{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85}, -+{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88}, -+{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90}, -+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90} -+}; -+ -+// Code/constants on GPU -+struct GPU -+{ -+// unsigned int qpu_code[QPU_CODE_SIZE]; -+ unsigned int vpu_code8[VPU_CODE_SIZE]; -+ unsigned int vpu_code10[VPU_CODE_SIZE]; -+ short transMatrix2even[16*16*2]; -+}; -+ -+#define WAIT_COUNT_MAX 16 -+ -+typedef struct trace_time_one_s -+{ -+ int count; -+ int64_t start[WAIT_COUNT_MAX]; -+ int64_t total[WAIT_COUNT_MAX]; -+} trace_time_one_t; -+ -+typedef struct trace_time_wait_s -+{ -+ unsigned int jcount; -+ int64_t start0; -+ int64_t last_update; -+ trace_time_one_t active; -+ trace_time_one_t wait; -+} trace_time_wait_t; -+ -+typedef struct vq_wait_s -+{ -+ sem_t sem; -+ struct vq_wait_s * next; -+} vq_wait_t; -+ -+#define VQ_WAIT_POOL_SIZE 16 -+typedef struct vq_wait_pool_s -+{ -+ vq_wait_t * head; -+ vq_wait_t pool[VQ_WAIT_POOL_SIZE]; -+} vq_wait_pool_t; -+ -+static void vq_wait_pool_init(vq_wait_pool_t * const pool); -+static void vq_wait_pool_deinit(vq_wait_pool_t * const pool); -+ -+typedef struct gpu_env_s -+{ -+ int open_count; -+ int init_count; -+ int vpu_i_cache_flushed; -+ GPU_MEM_PTR_T qpu_code_gm_ptr; -+ GPU_MEM_PTR_T code_gm_ptr; -+ GPU_MEM_PTR_T dummy_gm_ptr; -+ vq_wait_pool_t wait_pool; -+#if 
RPI_TRACE_TIME_VPU_QPU_WAIT -+ trace_time_wait_t ttw; -+#endif -+} gpu_env_t; -+ -+// Stop more than one thread trying to allocate memory or use the processing resources at once -+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER; -+static gpu_env_t * gpu = NULL; -+ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ -+static int64_t ns_time(void) -+{ -+ struct timespec ts; -+ clock_gettime(CLOCK_MONOTONIC, &ts); -+ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec; -+} -+ -+ -+#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000 -+ -+#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U) -+#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000) -+#define T_ARG(t) T_SEC(t), T_MS(t) -+#define T_FMT "%u.%03u" -+ -+static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix) -+{ -+ // Update totals for levels that are still pending -+ for (int i = 0; i < tto->count; ++i) { -+ tto->total[i] += now - tto->start[i]; -+ tto->start[i] = now; -+ } -+ -+ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n", -+ prefix, -+ T_ARG(now - start0 - tto->total[0]), -+ T_ARG(tto->total[0]), -+ T_ARG(tto->total[1]), -+ T_ARG(tto->total[2]), -+ T_ARG(tto->total[3])); -+} -+ -+ -+static void tto_start(trace_time_one_t * const tto, const int64_t now) -+{ -+ av_assert0(tto->count < WAIT_COUNT_MAX); -+ tto->start[tto->count++] = now; -+} -+ -+static void tto_end(trace_time_one_t * const tto, const int64_t now) -+{ -+ const int n = --tto->count; -+ av_assert0(n >= 0); -+ tto->total[n] += now - tto->start[n]; -+} -+ -+static void ttw_print(trace_time_wait_t * const ttw, const int64_t now) -+{ -+ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0)); -+ tto_print(&ttw->active, now, ttw->start0, "Active"); -+ tto_print(&ttw->wait, now, ttw->start0, " Wait"); -+} -+ -+#endif -+ -+// GPU memory alloc fns (internal) -+ -+static void gpu_free_internal(GPU_MEM_PTR_T * const p) 
-+{ -+ if (p->arm != NULL) -+ vcsm_unlock_ptr(p->arm); -+ if (p->vcsm_handle != 0) -+ vcsm_free(p->vcsm_handle); -+ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again -+} -+ -+ -+static int gpu_malloc_internal(GPU_MEM_PTR_T * const p, -+ const int numbytes, const unsigned int cache_type, const char * const name) -+{ -+ memset(p, 0, sizeof(*p)); -+ p->numbytes = (numbytes + 255) & ~255; // Round up -+ -+ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 || -+ (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 || -+ (p->arm = vcsm_lock(p->vcsm_handle)) == NULL || -+ (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0) -+ { -+ gpu_free_internal(p); -+ return AVERROR(ENOMEM); -+ } -+ return 0; -+} -+ -+ -+// GPU init, free, lock, unlock -+ -+static void gpu_term(void) -+{ -+ gpu_env_t * const ge = gpu; -+ -+ // We have to hope that eveything has terminated... -+ gpu = NULL; -+ -+ vc_gpuserv_deinit(); -+ -+ gpu_free_internal(&ge->code_gm_ptr); -+ gpu_free_internal(&ge->qpu_code_gm_ptr); -+ gpu_free_internal(&ge->dummy_gm_ptr); -+ -+ vcsm_exit(); -+ -+ vq_wait_pool_deinit(&ge->wait_pool); -+ -+ free(ge); -+} -+ -+ -+// Connect to QPU, returns 0 on success. 
-+static int gpu_init(gpu_env_t ** const gpu) { -+ volatile struct GPU* ptr; -+ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t)); -+ int rv; -+ *gpu = NULL; -+ -+ if (ge == NULL) -+ return -1; -+ -+ vq_wait_pool_init(&ge->wait_pool); -+ -+ vcsm_init(); -+ -+ // Now copy over the QPU code into GPU memory -+ if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0) -+ return rv; -+ -+ { -+ int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader; -+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes); -+ memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes); -+ } -+ -+ // And the VPU code -+ if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0) -+ return rv; -+ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm; -+ -+ // Zero everything so we have zeros between the code bits -+ memset((void *)ptr, 0, sizeof(*ptr)); -+ { -+ int num_bytes = sizeof(rpi_hevc_transform8); -+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes); -+ } -+ { -+ int num_bytes = sizeof(rpi_hevc_transform10); -+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes); -+ } -+ // And the transform coefficients -+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); -+ -+ // Generate a dummy "frame" & fill with 0x80 -+ // * Could reset to 1 <dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0) -+ return rv; -+ memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000); -+ -+ *gpu = ge; -+ return 0; -+} -+ -+ -+ -+static void gpu_unlock(void) { -+ pthread_mutex_unlock(&gpu_mutex); -+} -+ -+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary. 
-+static gpu_env_t * gpu_lock(void) { -+ pthread_mutex_lock(&gpu_mutex); -+ -+ av_assert1(gpu != NULL); -+ return gpu; -+} -+ -+static gpu_env_t * gpu_lock_ref(void) -+{ -+ pthread_mutex_lock(&gpu_mutex); -+ -+ if (gpu == NULL) { -+ int rv = gpu_init(&gpu); -+ if (rv != 0) { -+ gpu_unlock(); -+ return NULL; -+ } -+ } -+ -+ ++gpu->open_count; -+ return gpu; -+} -+ -+static void gpu_unlock_unref(gpu_env_t * const ge) -+{ -+ if (--ge->open_count == 0) -+ gpu_term(); -+ -+ gpu_unlock(); -+} -+ -+static inline gpu_env_t * gpu_ptr(void) -+{ -+ av_assert1(gpu != NULL); -+ return gpu; -+} -+ -+unsigned int vpu_get_fn(const unsigned int bit_depth) { -+ uint32_t a = 0; -+ -+ // Make sure that the gpu is initialized -+ av_assert1(gpu != NULL); -+ switch (bit_depth){ -+ case 8: -+ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); -+ break; -+ case 10: -+ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); -+ break; -+ default: -+ av_assert0(0); -+ } -+ return a; -+} -+ -+unsigned int vpu_get_constants(void) { -+ av_assert1(gpu != NULL); -+ return (gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even)); -+} -+ -+void gpu_ref(void) -+{ -+ gpu_lock_ref(); -+ gpu_unlock(); -+} -+ -+void gpu_unref(void) -+{ -+ gpu_env_t * const ge = gpu_lock(); -+ gpu_unlock_unref(ge); -+} -+ -+// ---------------------------------------------------------------------------- -+ -+ -+// Wait abstractions - mostly so we can easily add profile code -+static void vq_wait_pool_init(vq_wait_pool_t * const wp) -+{ -+ unsigned int i; -+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { -+ sem_init(&wp->pool[i].sem, 0, 0); -+ wp->pool[i].next = wp->pool + i + 1; -+ } -+ wp->head = wp->pool + 0; -+ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL; -+} -+ -+static void vq_wait_pool_deinit(vq_wait_pool_t * const wp) -+{ -+ unsigned int i; -+ wp->head = NULL; -+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { -+ sem_destroy(&wp->pool[i].sem); -+ wp->pool[i].next = NULL; -+ } -+} -+ -+ -+// If sem_init 
actually takes time then maybe we want a pool... -+static vq_wait_t * vq_wait_new(void) -+{ -+ gpu_env_t * const ge = gpu_lock_ref(); -+ vq_wait_t * const wait = ge->wait_pool.head; -+ ge->wait_pool.head = wait->next; -+ wait->next = NULL; -+ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ tto_start(&ge->ttw.active, ns_time()); -+#endif -+ -+ gpu_unlock(); -+ return wait; -+} -+ -+static void vq_wait_delete(vq_wait_t * const wait) -+{ -+ gpu_env_t * const ge = gpu_lock(); -+ wait->next = ge->wait_pool.head; -+ ge->wait_pool.head = wait; -+ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ { -+ trace_time_wait_t * const ttw = &ge->ttw; -+ const int64_t now = ns_time(); -+ ++ttw->jcount; -+ tto_end(&ttw->wait, now); -+ -+ if (ttw->start0 == 0) -+ { -+ ttw->start0 = ttw->active.start[0]; -+ ttw->last_update = ttw->start0; -+ } -+ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD) -+ { -+ ttw->last_update += WAIT_TIME_PRINT_PERIOD; -+ ttw_print(ttw, now); -+ } -+ } -+#endif -+ gpu_unlock_unref(ge); -+} -+ -+static void vq_wait_wait(vq_wait_t * const wait) -+{ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ { -+ const int64_t now = ns_time(); -+ gpu_env_t * const ge = gpu_lock(); -+ tto_start(&ge->ttw.wait, now); -+ gpu_unlock(); -+ } -+#endif -+ -+ while (sem_wait(&wait->sem) == -1 && errno == EINTR) -+ /* loop */; -+} -+ -+static void vq_wait_post(vq_wait_t * const wait) -+{ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ { -+ gpu_env_t *const ge = gpu_lock(); -+ tto_end(&ge->ttw.active, ns_time()); -+ gpu_unlock(); -+ } -+#endif -+ -+ sem_post(&wait->sem); -+} -+ -+ -+ -+// Header comments were wrong for these two -+#define VPU_QPU_MASK_QPU 1 -+#define VPU_QPU_MASK_VPU 2 -+ -+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; -+ -+vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf) -+{ -+// vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t)); -+ vpu_qpu_job_env_t * vqj = buf; -+// memset(vqj, 0, sizeof(*vqj)); -+ vqj->n = 0; -+ vqj->mask = 0; -+ return vqj; -+} -+ -+void 
vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj) -+{ -+// memset(vqj, 0, sizeof(*vqj)); -+// free(vqj); -+} -+ -+static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj) -+{ -+ struct gpu_job_s * const j = vqj->j + vqj->n++; -+ av_assert1(vqj->n <= VPU_QPU_JOB_MAX); -+ return j; -+} -+ -+void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code, -+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5) -+{ -+ if (vpu_code != 0) { -+ struct gpu_job_s *const j = new_job(vqj); -+ vqj->mask |= VPU_QPU_MASK_VPU; -+ -+ j->command = EXECUTE_VPU; -+ j->callback.func = 0; -+ j->callback.cookie = NULL; -+ // The bottom two bits of the execute address contain no-flush flags -+ // b0 will flush the VPU I-cache if unset so we nearly always want that set -+ // as we never reload code -+ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed; -+ j->u.v.q[1] = r0; -+ j->u.v.q[2] = r1; -+ j->u.v.q[3] = r2; -+ j->u.v.q[4] = r3; -+ j->u.v.q[5] = r4; -+ j->u.v.q[6] = r5; -+ gpu->vpu_i_cache_flushed = 1; -+ } -+} -+ -+// flags are QPU_FLAGS_xxx -+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail) -+{ -+ if (n != 0) { -+ struct gpu_job_s *const j = new_job(vqj); -+ vqj->mask |= VPU_QPU_MASK_QPU; -+ -+ j->command = EXECUTE_QPU; -+ j->callback.func = 0; -+ j->callback.cookie = NULL; -+ -+ j->u.q.jobs = n; -+#if RPI_TRACE_QPU_PROFILE_ALL -+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS; -+#else -+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU; -+#endif -+ j->u.q.timeout = 5000; -+ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t)); -+ } -+} -+ -+// Convert callback to sem post -+static void vpu_qpu_job_callback_wait(void * v) -+{ -+ vq_wait_post(v); -+} -+ -+// Poke a user-supplied sem -+static void vpu_qpu_job_callback_sem(void * v) -+{ -+ sem_post((sem_t *)v); -+} -+ 
-+void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h) -+{ -+ vq_wait_t * wait; -+ -+ if (vqj->mask == 0) { -+ *wait_h = NULL; -+ return; -+ } -+ -+ // We are going to want a sync object -+ wait = vq_wait_new(); -+ -+ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync -+ // If we only posted one thing or only QPU jobs -+ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) -+ { -+ struct gpu_job_s * const j = vqj->j + (vqj->n - 1); -+ av_assert1(j->callback.func == 0); -+ -+ j->callback.func = vpu_qpu_job_callback_wait; -+ j->callback.cookie = wait; -+ } -+ else -+ { -+ struct gpu_job_s *const j = new_job(vqj); -+ -+ j->command = EXECUTE_SYNC; -+ j->u.s.mask = vqj->mask; -+ j->callback.func = vpu_qpu_job_callback_wait; -+ j->callback.cookie = wait; -+ } -+ -+ vqj->mask = 0; -+ *wait_h = wait; -+} -+ -+// Returns 0 if no sync added ('cos Q empty), 1 if sync added -+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem) -+{ -+ // If nothing on q then just return -+ if (vqj->mask == 0) -+ return 0; -+ -+ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync -+ // If we only posted one thing or only QPU jobs -+ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) -+ { -+ struct gpu_job_s * const j = vqj->j + (vqj->n - 1); -+ av_assert1(j->callback.func == 0); -+ -+ j->callback.func = vpu_qpu_job_callback_sem; -+ j->callback.cookie = sem; -+ } -+ else -+ { -+ struct gpu_job_s *const j = new_job(vqj); -+ -+ j->command = EXECUTE_SYNC; -+ j->u.s.mask = vqj->mask; -+ j->callback.func = vpu_qpu_job_callback_sem; -+ j->callback.cookie = sem; -+ } -+ -+ vqj->mask = 0; -+ return 1; -+} -+ -+ -+int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj) -+{ -+ if (vqj->n == 0) -+ return 0; -+ -+ return vc_gpuserv_execute_code(vqj->n, vqj->j); -+} -+ -+// Simple wrapper of start + delete -+int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj) -+{ -+ int rv; -+ rv = vpu_qpu_job_start(vqj); -+ vpu_qpu_job_delete(vqj); -+ 
return rv; -+} -+ -+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) -+{ -+ if (wait_h != NULL) -+ { -+ vq_wait_t * const wait = *wait_h; -+ if (wait != NULL) { -+ *wait_h = NULL; -+ vq_wait_wait(wait); -+ vq_wait_delete(wait); -+ } -+ } -+} -+ -+int vpu_qpu_init() -+{ -+ gpu_env_t * const ge = gpu_lock_ref(); -+ if (ge == NULL) -+ return -1; -+ -+ if (ge->init_count++ == 0) -+ { -+ vc_gpuserv_init(); -+ } -+ -+ gpu_unlock(); -+ return 0; -+} -+ -+void vpu_qpu_term() -+{ -+ gpu_env_t * const ge = gpu_lock(); -+ -+ if (--ge->init_count == 0) { -+ vc_gpuserv_deinit(); -+ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ ttw_print(&ge->ttw, ns_time()); -+#endif -+ } -+ -+ gpu_unlock_unref(ge); -+} -+ -+uint32_t qpu_fn(const int * const mc_fn) -+{ -+ return gpu->qpu_code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader); -+} -+ -+uint32_t qpu_dummy(void) -+{ -+ return gpu->dummy_gm_ptr.vc; -+} -+ -+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth) -+{ -+ // Dummy values we can catch with emulation -+ qf->y_pxx = ~1U; -+ qf->y_bxx = ~2U; -+ qf->y_p00 = ~3U; -+ qf->y_b00 = ~4U; -+ qf->c_pxx = ~5U; -+ qf->c_bxx = ~6U; -+ -+ switch (bit_depth) { -+ case 8: -+ qf->y_pxx = qpu_fn(mc_filter_y_pxx); -+ qf->y_pxx = qpu_fn(mc_filter_y_pxx); -+ qf->y_bxx = qpu_fn(mc_filter_y_bxx); -+ qf->y_p00 = qpu_fn(mc_filter_y_p00); -+ qf->y_b00 = qpu_fn(mc_filter_y_b00); -+ qf->c_pxx = qpu_fn(mc_filter_c_p); -+ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1); -+ qf->c_bxx = qpu_fn(mc_filter_c_b); -+ break; -+ case 10: -+ qf->c_pxx = qpu_fn(mc_filter_c10_p); -+ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1); -+ qf->c_bxx = qpu_fn(mc_filter_c10_b); -+ qf->y_pxx = qpu_fn(mc_filter_y10_pxx); -+ qf->y_bxx = qpu_fn(mc_filter_y10_bxx); -+ qf->y_p00 = qpu_fn(mc_filter_y10_p00); -+ qf->y_b00 = qpu_fn(mc_filter_y10_b00); -+ break; -+ default: -+ return -1; -+ } -+ return 0; -+} -+ -diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h -new file mode 100644 -index 
0000000000..8777687021 ---- /dev/null -+++ b/libavcodec/rpi_qpu.h -@@ -0,0 +1,103 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: John Cox, Ben Avison -+*/ -+ -+#ifndef RPI_QPU_H -+#define RPI_QPU_H -+ -+#include "rpi_mem.h" -+#include "rpi_zc_frames.h" -+ -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" -+#pragma GCC diagnostic ignored "-Wstrict-prototypes" -+#include "interface/vmcs_host/vc_vchi_gpuserv.h" // for gpu_job_s -+#pragma GCC diagnostic pop -+ -+// QPU specific functions -+ -+typedef struct HEVCRpiQpu { -+ uint32_t c_pxx; -+ uint32_t c_pxx_l1; -+ uint32_t c_bxx; -+ uint32_t y_pxx; -+ uint32_t y_bxx; -+ uint32_t y_p00; -+ uint32_t y_b00; -+} HEVCRpiQpu; -+ -+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); -+ -+uint32_t qpu_fn(const int * const mc_fn); -+uint32_t qpu_dummy(void); -+ -+#define QPU_N_GRP 4 -+#define QPU_N_MAX 12 -+ -+#define QPU_MAIL_EL_VALS 2 -+ -+struct vpu_qpu_wait_s; -+typedef struct vq_wait_s * vpu_qpu_wait_h; -+ -+// VPU specific functions -+ -+struct vpu_qpu_job_env_s; -+typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h; -+ -+#define VPU_QPU_JOB_MAX 4 -+struct vpu_qpu_job_env_s -+{ -+ unsigned int n; -+ unsigned int mask; -+ struct gpu_job_s j[VPU_QPU_JOB_MAX]; -+}; -+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; -+ -+vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf); -+void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); -+void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, -+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); -+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); -+void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); -+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem); -+int vpu_qpu_job_start(const vpu_qpu_job_h vqj); -+int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); -+ -+extern unsigned int 
vpu_get_fn(const unsigned int bit_depth); -+extern unsigned int vpu_get_constants(void); -+ -+// Waits for previous post_codee to complete and Will null out *wait_h after use -+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); -+int vpu_qpu_init(void); -+void vpu_qpu_term(void); -+ -+void gpu_ref(void); -+void gpu_unref(void); -+ -+#endif -diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c -new file mode 100644 -index 0000000000..37be9a0f49 ---- /dev/null -+++ b/libavcodec/rpi_zc.c -@@ -0,0 +1,1227 @@ -+#include "config.h" -+ -+#include "libavcodec/avcodec.h" -+#include "rpi_mem.h" -+#include "rpi_mailbox.h" -+#include "rpi_zc.h" -+#include "libavutil/avassert.h" -+#include -+ -+#include "libavutil/buffer_internal.h" -+ -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" -+#include -+#include -+#pragma GCC diagnostic pop -+ -+#define TRACE_ALLOC 0 -+#define DEBUG_ALWAYS_KEEP_LOCKED 0 -+ -+struct ZcPoolEnt; -+ -+typedef struct ZcPool -+{ -+ size_t numbytes; -+ struct ZcPoolEnt * head; -+ pthread_mutex_t lock; -+} ZcPool; -+ -+typedef struct ZcPoolEnt -+{ -+ size_t numbytes; -+ -+ unsigned int vcsm_handle; -+ unsigned int vc_handle; -+ void * map_arm; -+ unsigned int map_vc; -+ -+ struct ZcPoolEnt * next; -+ struct ZcPool * pool; -+} ZcPoolEnt; -+ -+typedef struct ZcOldCtxVals -+{ -+ int thread_safe_callbacks; -+ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags); -+ void * opaque; -+} ZcOldCtxVals; -+ -+typedef struct AVZcEnv -+{ -+ unsigned int refcount; -+ ZcOldCtxVals old; -+ -+ void * pool_env; -+ av_rpi_zc_alloc_buf_fn_t * alloc_buf; -+ av_rpi_zc_free_pool_fn_t * free_pool; -+ -+ unsigned int pool_size; -+} ZcEnv; -+ -+typedef struct ZcUserBufEnv { -+ void * v; -+ const av_rpi_zc_buf_fn_tab_t * fn; -+ size_t numbytes; -+ int offset; -+} ZcUserBufEnv; -+ -+#define ZC_BUF_INVALID 0 -+#define ZC_BUF_VALID 1 -+#define ZC_BUF_NEVER 2 -+ -+typedef struct 
ZcBufEnv { -+ GPU_MEM_PTR_T gmem; -+ AVZcEnvPtr zc; -+ int is_valid; -+ AVBufferRef * user; -+ AVRpiZcFrameGeometry geo; -+ size_t size_y; -+ size_t size_c; -+ size_t size_pic; -+ ssize_t offset; -+ pthread_mutex_t lock; -+ pthread_cond_t cond; -+} ZcBufEnv; -+ -+ -+ -+ -+ -+ -+#define ALLOC_PAD 0 -+#define ALLOC_ROUND 0x1000 -+#define STRIDE_ROUND 64 -+#define STRIDE_OR 0 -+ -+#define DEBUG_ZAP0_BUFFERS 0 -+ -+static inline int av_rpi_is_sand_format(const int format) -+{ -+ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16) || -+ (format == AV_PIX_FMT_RPI4_8 || format == AV_PIX_FMT_RPI4_10); -+} -+ -+static inline int av_rpi_is_sand_frame(const AVFrame * const frame) -+{ -+ return av_rpi_is_sand_format(frame->format); -+} -+ -+//---------------------------------------------------------------------------- -+// -+// Internal pool stuff -+ -+// Pool entry functions -+ -+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const size_t req_size) -+{ -+ ZcPoolEnt * const zp = av_mallocz(sizeof(ZcPoolEnt)); -+ -+ // Round up to 4k & add 4k -+ const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1); -+ -+ if (zp == NULL) { -+ av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n"); -+ goto fail0; -+ } -+ -+ // The 0x80 here maps all pages here rather than waiting for lazy mapping -+ // BEWARE that in GPU land a later unlock/lock pair will put us back into -+ // lazy mode - which will also break cache invalidate calls. 
-+ if ((zp->vcsm_handle = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST | 0x80, "ffmpeg_rpi_zc")) == 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size); -+ goto fail1; -+ } -+ -+#if TRACE_ALLOC -+ printf("%s: Alloc %#x bytes @ h=%d\n", __func__, alloc_size, zp->vcsm_handle); -+#endif -+ -+ zp->numbytes = alloc_size; -+ zp->pool = pool; -+ return zp; -+ -+fail1: -+ av_free(zp); -+fail0: -+ return NULL; -+} -+ -+static void zc_pool_ent_free(ZcPoolEnt * const zp) -+{ -+#if TRACE_ALLOC -+ printf("%s: Free %#x bytes @ h=%d\n", __func__, zp->numbytes, zp->vcsm_handle); -+#endif -+ -+ if (zp->vcsm_handle != 0) -+ { -+ // VC addr & handle need no dealloc -+ if (zp->map_arm != NULL) -+ vcsm_unlock_hdl(zp->vcsm_handle); -+ vcsm_free(zp->vcsm_handle); -+ } -+ av_free(zp); -+} -+ -+//---------------------------------------------------------------------------- -+// -+// Pool functions -+ -+static void zc_pool_free_ent_list(ZcPoolEnt * p) -+{ -+ while (p != NULL) -+ { -+ ZcPoolEnt * const zp = p; -+ p = p->next; -+ zc_pool_ent_free(zp); -+ } -+} -+ -+static void zc_pool_flush(ZcPool * const pool) -+{ -+ ZcPoolEnt * p = pool->head; -+ pool->head = NULL; -+ pool->numbytes = ~0U; -+ zc_pool_free_ent_list(p); -+} -+ -+static ZcPoolEnt * zc_pool_get_ent(ZcPool * const pool, const size_t req_bytes) -+{ -+ ZcPoolEnt * zp = NULL; -+ ZcPoolEnt * flush_list = NULL; -+ size_t numbytes; -+ -+ pthread_mutex_lock(&pool->lock); -+ -+ numbytes = pool->numbytes; -+ -+ // If size isn't close then dump the pool -+ // Close in this context means within 128k -+ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes) -+ { -+ flush_list = pool->head; -+ pool->head = NULL; -+ pool->numbytes = numbytes = req_bytes; -+ } -+ else if (pool->head != NULL) -+ { -+ zp = pool->head; -+ pool->head = zp->next; -+ } -+ -+ pthread_mutex_unlock(&pool->lock); -+ -+ zc_pool_free_ent_list(flush_list); -+ -+ if (zp == NULL) -+ zp = zc_pool_ent_alloc(pool, numbytes); -+ -+ 
return zp; -+} -+ -+static void zc_pool_put_ent(ZcPoolEnt * const zp) -+{ -+ ZcPool * const pool = zp == NULL ? NULL : zp->pool; -+ if (zp != NULL) -+ { -+ pthread_mutex_lock(&pool->lock); -+#if TRACE_ALLOC -+ printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->numbytes); -+#endif -+ -+ if (pool->numbytes == zp->numbytes) -+ { -+ zp->next = pool->head; -+ pool->head = zp; -+ pthread_mutex_unlock(&pool->lock); -+ } -+ else -+ { -+ pthread_mutex_unlock(&pool->lock); -+ zc_pool_ent_free(zp); -+ } -+ } -+} -+ -+static ZcPool * -+zc_pool_new(void) -+{ -+ ZcPool * const pool = av_mallocz(sizeof(*pool)); -+ if (pool == NULL) -+ return NULL; -+ -+ pool->numbytes = -1; -+ pool->head = NULL; -+ pthread_mutex_init(&pool->lock, NULL); -+ return pool; -+} -+ -+static void -+zc_pool_delete(ZcPool * const pool) -+{ -+ if (pool != NULL) -+ { -+ pool->numbytes = -1; -+ zc_pool_flush(pool); -+ pthread_mutex_destroy(&pool->lock); -+ av_free(pool); -+ } -+} -+ -+//============================================================================ -+// -+// ZC implementation using above pool implementation -+// -+// Fn table fns... 
-+ -+static void zc_pool_free_v(void * v) -+{ -+ zc_pool_put_ent(v); -+} -+ -+static unsigned int zc_pool_ent_vcsm_handle_v(void * v) -+{ -+ ZcPoolEnt * zp = v; -+ return zp->vcsm_handle; -+} -+ -+static unsigned int zc_pool_ent_vc_handle_v(void * v) -+{ -+ ZcPoolEnt * zp = v; -+ if (zp->vc_handle == 0) -+ { -+ if ((zp->vc_handle = vcsm_vc_hdl_from_hdl(zp->vcsm_handle)) == 0) -+ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC handle\n", -+ __func__, zp->vcsm_handle); -+ } -+ return zp->vc_handle; -+} -+ -+static void * zc_pool_ent_map_arm_v(void * v) -+{ -+ ZcPoolEnt * zp = v; -+ if (zp->map_arm == NULL) -+ { -+ if ((zp->map_arm = vcsm_lock(zp->vcsm_handle)) == NULL) -+ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to ARM address\n", -+ __func__, zp->vcsm_handle); -+ } -+ return zp->map_arm; -+} -+ -+static unsigned int zc_pool_ent_map_vc_v(void * v) -+{ -+ ZcPoolEnt * zp = v; -+ if (zp->map_vc == 0) -+ { -+ if ((zp->map_vc = vcsm_vc_addr_from_hdl(zp->vcsm_handle)) == 0) -+ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC address\n", -+ __func__, zp->vcsm_handle); -+ } -+ return zp->map_vc; -+} -+ -+static const av_rpi_zc_buf_fn_tab_t zc_pool_buf_fns = { -+ .free = zc_pool_free_v, -+ .vcsm_handle = zc_pool_ent_vcsm_handle_v, -+ .vc_handle = zc_pool_ent_vc_handle_v, -+ .map_arm = zc_pool_ent_map_arm_v, -+ .map_vc = zc_pool_ent_map_vc_v, -+}; -+ -+// ZC Env fns -+ -+// Delete pool -+// All buffers guaranteed freed by now -+static void -+zc_pool_delete_v(void * v) -+{ -+ zc_pool_delete((ZcPool *)v); -+ rpi_mem_gpu_uninit(); -+} -+ -+// Allocate a new ZC buffer -+static AVBufferRef * -+zc_pool_buf_alloc(void * v, size_t size, const AVRpiZcFrameGeometry * geo) -+{ -+ ZcPool * const pool = v; -+ ZcPoolEnt *const zp = zc_pool_get_ent(pool, size); -+ AVBufferRef * buf; -+ -+ (void)geo; // geo ignored here -+ -+ if (zp == NULL) { -+ av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size); -+ goto fail0; -+ } -+ -+ 
if ((buf = av_rpi_zc_buf(size, 0, zp, &zc_pool_buf_fns)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_buf() failed\n"); -+ goto fail2; -+ } -+ -+ return buf; -+ -+fail2: -+ zc_pool_put_ent(zp); -+fail0: -+ return NULL; -+} -+ -+// Init wrappers - the public fns -+ -+AVZcEnvPtr -+av_rpi_zc_int_env_alloc(void * logctx) -+{ -+ ZcEnv * zc; -+ ZcPool * pool_env; -+ -+ if (rpi_mem_gpu_init(0) < 0) -+ return NULL; -+ -+ if ((pool_env = zc_pool_new()) == NULL) -+ goto fail1; -+ -+ if ((zc = av_rpi_zc_env_alloc(logctx, pool_env, zc_pool_buf_alloc, zc_pool_delete_v)) == NULL) -+ goto fail2; -+ -+ return zc; -+ -+fail2: -+ zc_pool_delete(pool_env); -+fail1: -+ rpi_mem_gpu_uninit(); -+ return NULL; -+} -+ -+void -+av_rpi_zc_int_env_freep(AVZcEnvPtr * zcp) -+{ -+ const AVZcEnvPtr zc = *zcp; -+ *zcp = NULL; -+ if (zc != NULL) -+ av_rpi_zc_env_release(zc); -+} -+ -+//============================================================================ -+// -+// Geometry -+// -+// This is a separate chunck to the rest -+ -+// Get mailbox fd - should be in a lock when called -+// Rely on process close to close it -+static int mbox_fd(void) -+{ -+ static int fd = -1; -+ if (fd != -1) -+ return fd; -+ return (fd = mbox_open()); -+} -+ -+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( -+ const int format, const unsigned int video_width, const unsigned int video_height) -+{ -+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; -+ -+ AVRpiZcFrameGeometry geo = { -+ .format = format, -+ .video_width = video_width, -+ .video_height = video_height -+ }; -+ -+ switch (format) -+ { -+ case AV_PIX_FMT_YUV420P: -+ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; -+ geo.stride_c = geo.stride_y / 2; -+ geo.height_y = (video_height + 32 + 31) & ~31; -+ geo.height_c = geo.height_y / 2; -+ geo.planes_c = 2; -+ geo.stripes = 1; -+ geo.bytes_per_pel = 1; -+ geo.stripe_is_yc = 1; -+ break; -+ -+ case AV_PIX_FMT_YUV420P10: -+ geo.stride_y = 
((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; -+ geo.stride_c = geo.stride_y / 2; -+ geo.height_y = (video_height + 32 + 31) & ~31; -+ geo.height_c = geo.height_y / 2; -+ geo.planes_c = 2; -+ geo.stripes = 1; -+ geo.bytes_per_pel = 2; -+ geo.stripe_is_yc = 1; -+ break; -+ -+ case AV_PIX_FMT_SAND128: -+ case AV_PIX_FMT_RPI4_8: -+ { -+ const unsigned int stripe_w = 128; -+ -+ static VC_IMAGE_T img = {0}; -+ -+ // Given the overhead of calling the mailbox keep a stashed -+ // copy as we will almost certainly just want the same numbers again -+ // but that means we need a lock -+ pthread_mutex_lock(&sand_lock); -+ -+ if (img.width != video_width || img.height != video_height) -+ { -+ VC_IMAGE_T new_img = { -+ .type = VC_IMAGE_YUV_UV, -+ .width = video_width, -+ .height = video_height -+ }; -+ -+ mbox_get_image_params(mbox_fd(), &new_img); -+ img = new_img; -+ } -+ -+ geo.stride_y = stripe_w; -+ geo.stride_c = stripe_w; -+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; -+ geo.height_c = img.pitch / stripe_w - geo.height_y; -+ geo.stripe_is_yc = 1; -+ if (geo.height_y * stripe_w > img.pitch) -+ { -+ // "tall" sand - all C blocks now follow Y -+ geo.height_y = img.pitch / stripe_w; -+ geo.height_c = geo.height_y; -+ geo.stripe_is_yc = 0; -+ } -+ geo.planes_c = 1; -+ geo.stripes = (video_width + stripe_w - 1) / stripe_w; -+ geo.bytes_per_pel = 1; -+ -+ pthread_mutex_unlock(&sand_lock); -+#if 0 -+ printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n", -+ video_width, video_height, -+ geo.stride_y, geo.stride_c, -+ geo.height_y, geo.height_c, -+ geo.stripes, img.pitch); -+#endif -+ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); -+ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); -+ break; -+ } -+ -+ case AV_PIX_FMT_RPI4_10: -+ { -+ const unsigned int stripe_w = 128; // bytes -+ -+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; -+ 
static VC_IMAGE_T img = {0}; -+ -+ // Given the overhead of calling the mailbox keep a stashed -+ // copy as we will almost certainly just want the same numbers again -+ // but that means we need a lock -+ pthread_mutex_lock(&sand_lock); -+ -+ if (img.width != video_width || img.height != video_height) -+ { -+ VC_IMAGE_T new_img = { -+ .type = VC_IMAGE_YUV10COL, -+ .width = video_width, -+ .height = video_height -+ }; -+ -+ mbox_get_image_params(mbox_fd(), &new_img); -+ img = new_img; -+ } -+ -+ geo.stride_y = stripe_w; -+ geo.stride_c = stripe_w; -+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; -+ geo.height_c = img.pitch / stripe_w - geo.height_y; -+ geo.planes_c = 1; -+ geo.stripes = ((video_width * 4 + 2) / 3 + stripe_w - 1) / stripe_w; -+ geo.bytes_per_pel = 1; -+ geo.stripe_is_yc = 1; -+ -+ pthread_mutex_unlock(&sand_lock); -+ -+#if 0 -+ printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n", -+ video_width, video_height, -+ geo.stride_y, geo.stride_c, -+ geo.height_y, geo.height_c, -+ geo.stripes, img.pitch); -+#endif -+ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); -+ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); -+ break; -+ } -+ -+ case AV_PIX_FMT_SAND64_16: -+ case AV_PIX_FMT_SAND64_10: -+ { -+ const unsigned int stripe_w = 128; // bytes -+ -+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; -+ static VC_IMAGE_T img = {0}; -+ -+ // Given the overhead of calling the mailbox keep a stashed -+ // copy as we will almost certainly just want the same numbers again -+ // but that means we need a lock -+ pthread_mutex_lock(&sand_lock); -+ -+ if (img.width != video_width || img.height != video_height) -+ { -+ VC_IMAGE_T new_img = { -+ .type = VC_IMAGE_YUV_UV_16, -+ .width = video_width, -+ .height = video_height -+ }; -+ -+ mbox_get_image_params(mbox_fd(), &new_img); -+ img = new_img; -+ } -+ -+ geo.stride_y = stripe_w; -+ geo.stride_c = stripe_w; -+ 
geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; -+ geo.height_c = img.pitch / stripe_w - geo.height_y; -+ geo.planes_c = 1; -+ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w; -+ geo.bytes_per_pel = 2; -+ geo.stripe_is_yc = 1; -+ -+ pthread_mutex_unlock(&sand_lock); -+ break; -+ } -+ -+ default: -+ break; -+ } -+ return geo; -+} -+ -+//============================================================================ -+// -+// ZC Env fns -+// -+// Frame copy fns -+ -+static AVBufferRef * zc_copy(const AVZcEnvPtr zc, -+ const AVFrame * const src) -+{ -+ AVFrame dest_frame; -+ AVFrame * const dest = &dest_frame; -+ unsigned int i; -+ uint8_t * psrc, * pdest; -+ -+ dest->format = src->format; -+ dest->width = src->width; -+ dest->height = src->height; -+ -+ if (av_rpi_zc_get_buffer(zc, dest) != 0 || -+ av_rpi_zc_resolve_frame(dest, ZC_RESOLVE_ALLOC_VALID) != 0) -+ { -+ return NULL; -+ } -+ -+ for (i = 0, psrc = src->data[0], pdest = dest->data[0]; -+ i != dest->height; -+ ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) -+ { -+ memcpy(pdest, psrc, dest->width); -+ } -+ for (i = 0, psrc = src->data[1], pdest = dest->data[1]; -+ i != dest->height / 2; -+ ++i, psrc += src->linesize[1], pdest += dest->linesize[1]) -+ { -+ memcpy(pdest, psrc, dest->width / 2); -+ } -+ for (i = 0, psrc = src->data[2], pdest = dest->data[2]; -+ i != dest->height / 2; -+ ++i, psrc += src->linesize[2], pdest += dest->linesize[2]) -+ { -+ memcpy(pdest, psrc, dest->width / 2); -+ } -+ -+ return dest->buf[0]; -+} -+ -+ -+static AVBufferRef * zc_420p10_to_sand128(const AVZcEnvPtr zc, -+ const AVFrame * const src) -+{ -+ assert(0); -+ return NULL; -+} -+ -+ -+static AVBufferRef * zc_sand64_16_to_sand128(const AVZcEnvPtr zc, -+ const AVFrame * const src, const unsigned int src_bits) -+{ -+ assert(0); -+ return NULL; -+} -+ -+//---------------------------------------------------------------------------- -+// -+// Public info extraction calls -+ -+static 
void zc_buf_env_free_cb(void * opaque, uint8_t * data); -+ -+static inline ZcBufEnv * pic_zbe_ptr(AVBufferRef *const buf) -+{ -+ // Kludge where we check the free fn to check this is really -+ // one of our buffers - can't think of a better way -+ return buf == NULL || buf->buffer->free != zc_buf_env_free_cb ? NULL : -+ av_buffer_get_opaque(buf); -+} -+ -+static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf) -+{ -+ // As gmem is the first el NULL should be preserved -+ return &pic_zbe_ptr(buf)->gmem; -+} -+ -+unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref) -+{ -+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); -+ return p == NULL ? 0 : p->vcsm_handle; -+} -+ -+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref) -+{ -+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); -+ return p == NULL ? -1 : p->vc_handle; -+} -+ -+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref) -+{ -+ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); -+ return zbe == NULL ? 0 : zbe->offset; -+} -+ -+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref) -+{ -+ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); -+ return zbe == NULL ? 0 : zbe->size_pic; -+} -+ -+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref) -+{ -+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); -+ return p == NULL ? 0 : p->numbytes; -+} -+ -+const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref) -+{ -+ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); -+ return zbe == NULL ? 
NULL : &zbe->geo; -+} -+ -+AVRpiZcRefPtr av_rpi_zc_ref(void * const logctx, const AVZcEnvPtr zc, -+ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy) -+{ -+ av_assert0(!maycopy || zc != NULL); -+ -+ if (frame->format != AV_PIX_FMT_YUV420P && -+ frame->format != AV_PIX_FMT_YUV420P10 && -+ !av_rpi_is_sand_frame(frame)) -+ { -+ av_log(logctx, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); -+ return NULL; -+ } -+ -+ if (frame->buf[1] != NULL || frame->format != expected_format) -+ { -+#if RPI_ZC_SAND_8_IN_10_BUF -+ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL) -+ { -+// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__); -+ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]); -+ } -+#endif -+ -+ if (maycopy) -+ { -+ if (frame->buf[1] != NULL) -+ av_log(logctx, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); -+ else -+ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format); -+ -+ switch (frame->format) -+ { -+ case AV_PIX_FMT_YUV420P10: -+ return zc_420p10_to_sand128(zc, frame); -+ -+ case AV_PIX_FMT_SAND64_10: -+ return zc_sand64_16_to_sand128(zc, frame, 10); -+ -+ default: -+ return zc_copy(zc, frame); -+ } -+ } -+ else -+ { -+ if (frame->buf[1] != NULL) -+ av_log(logctx, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__); -+ else -+ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format); -+ return NULL; -+ } -+ } -+ -+ if (pic_gm_ptr(frame->buf[0]) == NULL) -+ { -+ if (maycopy) -+ { -+ av_log(logctx, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__); -+ return zc_copy(zc, frame); -+ } -+ else -+ { -+ av_log(logctx, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__); -+ return NULL; -+ 
} -+ } -+ -+ return av_buffer_ref(frame->buf[0]); -+} -+ -+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref) -+{ -+ if (fr_ref != NULL) -+ { -+ av_buffer_unref(&fr_ref); -+ } -+} -+ -+//---------------------------------------------------------------------------- -+ -+// Extract user environment from an AVBufferRef -+void * av_rpi_zc_buf_v(AVBufferRef * const buf) -+{ -+ ZcBufEnv * const zbe = pic_zbe_ptr(buf); -+ if (zbe != NULL && zbe->user != NULL) -+ { -+ const ZcUserBufEnv * const zub = (const ZcUserBufEnv *)zbe->user->data; -+ return zub == NULL ? NULL : zub->v; -+ } -+ return NULL; -+} -+ -+// AV buffer pre-free callback -+static void zc_user_buf_free_cb(void * opaque, uint8_t * data) -+{ -+ if (opaque != NULL) -+ { -+ ZcUserBufEnv * const zub = opaque; -+ -+ if (zub->fn->free) -+ zub->fn->free(zub->v); -+ -+ av_free(zub); -+ } -+} -+ -+static void zc_buf_env_free_cb(void * opaque, uint8_t * data) -+{ -+ if (opaque != NULL) -+ { -+ ZcBufEnv * const zbe = opaque; -+ -+ av_buffer_unref(&zbe->user); -+ -+ if (zbe->zc != NULL) -+ av_rpi_zc_env_release(zbe->zc); -+ -+ pthread_cond_destroy(&zbe->cond); -+ pthread_mutex_destroy(&zbe->lock); -+ av_free(zbe); -+ } -+} -+ -+ -+// Wrap the various ZC bits in an AV Buffer and resolve those things we want -+// resolved now. 
-+// Currently we resolve everything, but in future we might not -+AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab) -+{ -+ AVBufferRef *buf; -+ ZcUserBufEnv * zub; -+ -+ if ((zub = av_malloc(sizeof(ZcUserBufEnv))) == NULL) -+ return NULL; -+ -+ zub->fn = fn_tab; -+ zub->v = v; -+ zub->numbytes = numbytes; -+ zub->offset = addr_offset; -+ -+ if ((buf = av_buffer_create((uint8_t*)zub, sizeof(*zub), zc_user_buf_free_cb, zub, 0)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "ZC: Failed av_buffer_create\n"); -+ av_free(zub); -+ return NULL; -+ } -+ -+ return buf; -+} -+ -+int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int alloc_mode) -+{ -+ ZcBufEnv * const zbe = pic_zbe_ptr(buf); -+ -+ if (zbe == NULL) -+ return AVERROR(EINVAL); -+ -+ if (alloc_mode == ZC_RESOLVE_FAIL && !zbe->is_valid) -+ return AVERROR(EAGAIN); -+ -+ if (alloc_mode == ZC_RESOLVE_WAIT_VALID && !zbe->is_valid) -+ { -+ pthread_mutex_lock(&zbe->lock); -+ while (!zbe->is_valid) -+ pthread_cond_wait(&zbe->cond, &zbe->lock); -+ pthread_mutex_unlock(&zbe->lock); -+ } -+ -+ if (zbe->is_valid == ZC_BUF_NEVER) -+ return AVERROR(EINVAL); -+ -+ // Do alloc if we need it -+ if (zbe->user == NULL) -+ { -+ ZcEnv * const zc = zbe->zc; -+ const ZcUserBufEnv * zub; -+ -+ av_assert0(alloc_mode == ZC_RESOLVE_ALLOC || alloc_mode == ZC_RESOLVE_ALLOC_VALID); -+ -+ if ((zbe->user = zc->alloc_buf(zc->pool_env, zbe->size_pic, &zbe->geo)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); -+ goto fail; -+ } -+ zub = (const ZcUserBufEnv *)zbe->user->data; -+ -+ // Track -+ -+ zbe->offset = zub->offset; -+ zbe->gmem.numbytes = zub->numbytes; -+ if ((zbe->gmem.arm = zub->fn->map_arm(zub->v)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to lock vcsm_handle %u\n", zbe->gmem.vcsm_handle); -+ goto fail; -+ } -+ -+ if ((zbe->gmem.vcsm_handle = zub->fn->vcsm_handle(zub->v)) == 0) -+ { -+ av_log(NULL, 
AV_LOG_ERROR, "ZC: Failed to get vcsm_handle\n"); -+ goto fail; -+ } -+ -+ if ((zbe->gmem.vc_handle = zub->fn->vc_handle(zub->v)) == 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc handle from vcsm_handle %u\n", zbe->gmem.vcsm_handle); -+ goto fail; -+ } -+ if ((zbe->gmem.vc = zub->fn->map_vc(zub->v)) == 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc addr from vcsm_handle %u\n", zbe->gmem.vcsm_handle); -+ goto fail; -+ } -+ -+ buf->buffer->data = zbe->gmem.arm + zbe->offset; -+ buf->buffer->size = zbe->size_pic; -+ -+ // In this mode we shouldn't have anyone waiting for us -+ // so no need to signal -+ if (alloc_mode == ZC_RESOLVE_ALLOC_VALID) -+ zbe->is_valid = 1; -+ } -+ -+ // Just overwrite - no point in testing -+ buf->data = zbe->gmem.arm + zbe->offset; -+ buf->size = zbe->size_pic; -+ return 0; -+ -+fail: -+ av_buffer_unref(&zbe->user); -+ return AVERROR(ENOMEM); -+} -+ -+int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc) -+{ -+ int rv; -+ -+ // Do alloc if we need it -+ if ((rv = av_rpi_zc_resolve_buffer(frame->buf[0], may_alloc)) != 0) -+ return rv; -+ -+ // If we are a framebuf copy then the alloc can be done but we haven't -+ // imported its results yet -+ if (frame->data[0] == NULL) -+ { -+ const ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); -+ -+ frame->linesize[0] = zbe->geo.stride_y; -+ frame->linesize[1] = zbe->geo.stride_c; -+ frame->linesize[2] = zbe->geo.stride_c; -+ // abuse: linesize[3] = "stripe stride" -+ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). -+ // In a general case this makes the calculation an xor and multiply rather -+ // than a divide and multiply -+ if (zbe->geo.stripes > 1) -+ frame->linesize[3] = zbe->geo.stripe_is_yc ? zbe->geo.height_y + zbe->geo.height_c : zbe->geo.height_y; -+ -+ frame->data[0] = frame->buf[0]->data; -+ frame->data[1] = frame->data[0] + (zbe->geo.stripe_is_yc ? 
zbe->size_y : zbe->size_y * zbe->geo.stripes); -+ if (zbe->geo.planes_c > 1) -+ frame->data[2] = frame->data[1] + zbe->size_c; -+ -+ frame->extended_data = frame->data; -+ // Leave extended buf alone -+ } -+ -+ return 0; -+} -+ -+int av_rpi_zc_set_valid_frame(AVFrame * const frame) -+{ -+ ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); -+ -+ if (zbe == NULL) -+ return AVERROR(EINVAL); -+ -+ zbe->is_valid = ZC_BUF_VALID; -+ pthread_cond_broadcast(&zbe->cond); -+ -+ return 0; -+} -+ -+int av_rpi_zc_set_broken_frame(AVFrame * const frame) -+{ -+ ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); -+ -+ if (zbe == NULL) -+ return AVERROR(EINVAL); -+ -+ zbe->is_valid = ZC_BUF_NEVER; -+ pthread_cond_broadcast(&zbe->cond); -+ -+ return 0; -+} -+ -+void av_rpi_zc_set_decoder_pool_size(ZcEnv *const zc, const unsigned int pool_size) -+{ -+ zc->pool_size = pool_size; -+} -+ -+unsigned int av_rpi_zc_get_decoder_pool_size(ZcEnv *const zc) -+{ -+ return zc->pool_size; -+} -+ -+int av_rpi_zc_get_buffer(ZcEnv *const zc, AVFrame * const frame) -+{ -+#if 1 -+ ZcBufEnv * zbe = av_mallocz(sizeof(*zbe)); -+ -+ for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; i++) { -+ frame->buf[i] = NULL; -+ frame->data[i] = NULL; -+ frame->linesize[i] = 0; -+ } -+ -+ if (zbe == NULL) -+ return AVERROR(ENOMEM); -+ -+ if ((frame->buf[0] = av_buffer_create((uint8_t *)zbe, sizeof(*zbe), zc_buf_env_free_cb, zbe, 0)) == NULL) -+ { -+ av_free(zbe); -+ return AVERROR(ENOMEM); -+ } -+ -+ pthread_mutex_init(&zbe->lock, NULL); -+ pthread_cond_init(&zbe->cond, NULL); -+ zbe->zc = zc; -+ atomic_fetch_add(&zc->refcount, 1); -+ -+ zbe->geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); // Note geometry for later use -+ zbe->size_y = zbe->geo.stride_y * zbe->geo.height_y; -+ zbe->size_c = zbe->geo.stride_c * zbe->geo.height_c; -+ zbe->size_pic = (zbe->size_y + zbe->size_c * zbe->geo.planes_c) * zbe->geo.stripes; -+ -+#else -+ const AVRpiZcFrameGeometry geo = 
av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); -+ const unsigned int size_y = geo.stride_y * geo.height_y; -+ const unsigned int size_c = geo.stride_c * geo.height_c; -+ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes; -+ AVBufferRef * buf; -+ unsigned int i; -+ -+// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic); -+ -+ if ((buf = zc->alloc_buf(zc->pool_env, size_pic, &geo)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); -+ return AVERROR(ENOMEM); -+ } -+ -+ // Track -+ atomic_fetch_add(&zc->refcount, 1); -+ pic_zbe_ptr(buf)->zc = zc; -+ -+ for (i = 0; i < AV_NUM_DATA_POINTERS; i++) { -+ frame->buf[i] = NULL; -+ frame->data[i] = NULL; -+ frame->linesize[i] = 0; -+ } -+ -+ frame->buf[0] = buf; -+ -+ frame->linesize[0] = geo.stride_y; -+ frame->linesize[1] = geo.stride_c; -+ frame->linesize[2] = geo.stride_c; -+ // abuse: linesize[3] = "stripe stride" -+ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). -+ // In a general case this makes the calculation an xor and multiply rather -+ // than a divide and multiply -+ if (geo.stripes > 1) -+ frame->linesize[3] = geo.stripe_is_yc ? geo.height_y + geo.height_c : geo.height_y; -+ -+ frame->data[0] = buf->data; -+ frame->data[1] = frame->data[0] + (geo.stripe_is_yc ? 
size_y : size_y * geo.stripes); -+ if (geo.planes_c > 1) -+ frame->data[2] = frame->data[1] + size_c; -+ -+ frame->extended_data = frame->data; -+ // Leave extended buf alone -+ -+#if RPI_ZC_SAND_8_IN_10_BUF != 0 -+ // *** If we intend to use this for real we will want a 2nd buffer pool -+ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = zc_pool_buf_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge -+#endif -+#endif -+ -+ return 0; -+} -+ -+void av_rpi_zc_env_release(const AVZcEnvPtr zc) -+{ -+ const int n = atomic_fetch_add(&zc->refcount, -1); -+ if (n == 1) // was 1, now 0 -+ { -+ zc->free_pool(zc->pool_env); -+ av_free(zc); -+ } -+} -+ -+AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx, -+ void * pool_env, -+ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, -+ av_rpi_zc_free_pool_fn_t * free_pool_fn) -+{ -+ ZcEnv * zc; -+ -+ if ((zc = av_mallocz(sizeof(ZcEnv))) == NULL) -+ { -+ av_log(logctx, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n"); -+ return NULL; -+ } -+ -+ *zc = (ZcEnv){ -+ .refcount = ATOMIC_VAR_INIT(1), -+ .pool_env = pool_env, -+ .alloc_buf = alloc_buf_fn, -+ .free_pool = free_pool_fn, -+ .pool_size = 0 -+ }; -+ -+ return zc; -+} -+ -+//============================================================================ -+// -+// External ZC initialisation -+ -+#define RPI_GET_BUFFER2 1 -+ -+ -+static int zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags) -+{ -+#if !RPI_GET_BUFFER2 -+ return avcodec_default_get_buffer2(s, frame, flags); -+#else -+ int rv; -+ -+ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0) -+ { -+// printf("Do default alloc: format=%#x\n", frame->format); -+ rv = avcodec_default_get_buffer2(s, frame, flags); -+ } -+ else if (frame->format == AV_PIX_FMT_YUV420P || -+ av_rpi_is_sand_frame(frame)) -+ { -+ if ((rv = av_rpi_zc_get_buffer(s->opaque, frame)) == 0) -+ rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID); -+ } -+ else -+ { -+ rv = avcodec_default_get_buffer2(s, frame, flags); -+ } -+ -+#if 0 -+ 
printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__, -+ frame->format, frame->width, frame->height, -+ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3], -+ frame->data[0], frame->data[1], frame->data[2], -+ frame->buf[0], frame->buf[1], frame->buf[2], -+ av_buffer_get_opaque(frame->buf[0])); -+#endif -+ return rv; -+#endif -+} -+ -+int av_rpi_zc_in_use(const struct AVCodecContext * const s) -+{ -+ return s->get_buffer2 == zc_get_buffer2; -+} -+ -+int av_rpi_zc_init2(struct AVCodecContext * const s, -+ void * pool_env, -+ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, -+ av_rpi_zc_free_pool_fn_t * free_pool_fn) -+{ -+ ZcEnv * zc; -+ -+ av_assert0(!av_rpi_zc_in_use(s)); -+ -+ if ((zc = av_rpi_zc_env_alloc(s, pool_env, alloc_buf_fn, free_pool_fn)) == NULL) -+ return AVERROR(ENOMEM); -+ -+ zc->old = (ZcOldCtxVals){ -+ .opaque = s->opaque, -+ .get_buffer2 = s->get_buffer2, -+ .thread_safe_callbacks = s->thread_safe_callbacks -+ }; -+ -+ s->opaque = zc; -+ s->get_buffer2 = zc_get_buffer2; -+ s->thread_safe_callbacks = 1; -+ return 0; -+} -+ -+void av_rpi_zc_uninit2(struct AVCodecContext * const s) -+{ -+ ZcEnv * const zc = s->opaque; -+ -+ av_assert0(av_rpi_zc_in_use(s)); -+ -+ s->get_buffer2 = zc->old.get_buffer2; -+ s->opaque = zc->old.opaque; -+ s->thread_safe_callbacks = zc->old.thread_safe_callbacks; -+ -+ av_rpi_zc_env_release(zc); -+} -+ -diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h -new file mode 100644 -index 0000000000..f00a7c962c ---- /dev/null -+++ b/libavcodec/rpi_zc.h -@@ -0,0 +1,228 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. 
-+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox -+*/ -+ -+#ifndef LIBAVCODEC_RPI_ZC_H -+#define LIBAVCODEC_RPI_ZC_H -+ -+// Zero-Copy frame code for RPi -+// RPi needs Y/U/V planes to be contiguous for display. By default -+// ffmpeg will allocate separated planes so a memcpy is needed before -+// display. This code provides a method a making ffmpeg allocate a single -+// bit of memory for the frame when can then be reference counted until -+// display has finished with it. 
-+ -+// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame -+// 0 disables -+// *** This option still in development -+// Only works if SAO active -+// Allocates buffers that are twice the required size -+#define RPI_ZC_SAND_8_IN_10_BUF 0 -+ -+struct AVBufferRef; -+struct AVFrame; -+struct AVCodecContext; -+enum AVPixelFormat; -+ -+// "Opaque" pointer to whatever we are using as a buffer reference -+typedef struct AVBufferRef * AVRpiZcRefPtr; -+ -+struct AVZcEnv; -+typedef struct AVZcEnv * AVZcEnvPtr; -+ -+typedef struct AVRpiZcFrameGeometry -+{ -+ unsigned int stride_y; // Luma stride (bytes) -+ unsigned int height_y; // Luma height (lines) -+ unsigned int stride_c; // Chroma stride (bytes) -+ unsigned int height_c; // Chroma stride (lines) -+ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1) -+ unsigned int stripes; // Number of stripes (sand) -+ unsigned int bytes_per_pel; -+ int stripe_is_yc; // A single stripe is Y then C (false for tall sand) -+ -+ int format; // Requested format -+ unsigned int video_width; // Requested width -+ unsigned int video_height; // Requested height -+} AVRpiZcFrameGeometry; -+ -+// Get expected MMAL geometry for a given format, width & height -+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( -+ const int format, -+ const unsigned int video_width, const unsigned int video_height); -+ -+//---------------------------------------------------------------------------- -+// -+// Calls that extract info from a ZC frame whether internally or externally -+// allocated -+ -+// Generate a ZC reference to the buffer(s) in this frame -+// If the buffer doesn't appear to be one allocated by ZC -+// then the behaviour depends on maycopy: -+// If maycopy=0 then return NULL -+// If maycopy=1 && the src frame is in a form where we can easily copy -+// the data, then allocate a new buffer and copy the data into it -+// Otherwise return NULL -+// If maycopy == 0 then ZC may be NULL -+AVRpiZcRefPtr 
av_rpi_zc_ref(void * const logging_context, const AVZcEnvPtr zc, -+ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy); -+ -+// Unreference the buffer refed/allocated by _zc_ref -+// If fr_ref is NULL then this will NOP -+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref); -+ -+// Get the vc_handle from the frame ref -+// Returns -1 if ref doesn't look valid -+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref); -+// Get the vcsm_handle from the frame ref -+// Returns 0 if ref doesn't look valid -+unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref); -+// Get offset from the start of the memory referenced -+// by the vc_handle to valid data -+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref); -+// Length of buffer data -+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref); -+// Get the number of bytes allocated from the frame ref -+// Returns 0 if ref doesn't look valid -+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref); -+// Geometry this frame was allocated with -+const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref); -+ -+//---------------------------------------------------------------------------- -+// -+// Calls for external frame allocation -+ -+// Callbacks registered in av_rpi_zc_init2 -+ -+// Callback to allocate a buf for a frame -+// The frame itself is generated in the calling code -+// -+// Parameters: -+// pool_env value passed to av-rpi_zc_init2 -+// size size wanted -+// geo geometry of the frame to be allocated -+// Returns: -+// NULL Alloc failed -+// ptr AVBufferBuf* of allocated buffer -+// In most cases av_rpi_zc_buf will be called by this function -+// and this will be the buf returned by that. 
-+typedef AVBufferRef * av_rpi_zc_alloc_buf_fn_t(void * pool_env, size_t size, -+ const AVRpiZcFrameGeometry * geo); -+ -+// Callback once ffmpeg is completely done with this pool -+// Called once all allocated buffers have been derefed and ffmpegs ref to this -+// pool has been dropped -+typedef void av_rpi_zc_free_pool_fn_t(void * pool_env); -+ -+// Init ZC into a context -+// Sets opaque, get_buffer2, thread_safe_callbacks -+// Use if you want to allocate your own pools and/or create ZC buffers for -+// all decoders -+// RPI HEVC decoders will allocate appropriate VCSM buffers which can be taken -+// apart by av_rpi_zc_xxx calls without this -+int av_rpi_zc_init2(struct AVCodecContext * const s, -+ void * pool_env, av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, -+ av_rpi_zc_free_pool_fn_t * free_pool_fn); -+ -+// Free ZC from a context -+void av_rpi_zc_uninit2(struct AVCodecContext * const s); -+ -+// Get minimum pool size in frames - valid by the time the first alloc request -+// occurs. Takes into account thread requests and DPB sizes derived from SPS -+// rather than just adding a worst case DPB size. -+unsigned int av_rpi_zc_get_decoder_pool_size(const AVZcEnvPtr zc); -+ -+typedef struct av_rpi_zc_buf_fn_tab_s { -+ // This AVBuffer is being freed by ffmpeg - return memory -+ // to external pool. Memory may be, but need not be, unmapped. -+ // v is the ptr passed in av_rpi_zc_buf -+ void (* free)(void * v); -+ -+ // Return appropriate handles / mappings -+ // v is the ptr passed in av_rpi_zc_buf -+ unsigned int (* vcsm_handle)(void * v); -+ unsigned int (* vc_handle)(void * v); -+ void * (* map_arm)(void * v); -+ unsigned int (* map_vc)(void * v); -+} av_rpi_zc_buf_fn_tab_t; -+ -+// Allocate a ZC AVBufferRef and set its callback table -+// Doesn't take a buffer address directly - relies on callbacks to return -+// addresses as they are required. 
Mappings need not be generated until -+// the map callbacks are called but they should persist from then until -+// the buffer is freed. -+// -+// Parameters: -+// numbytes Size of the buffer -+// addr_offset Offset to first usable byte of buffer (for alignment) -+// normally 0 -+// v Pointer passed to callbacks -+// fn_tab Function table -+AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab); -+ -+// Get v ptr set in in av_rpi_zc_buf -+void * av_rpi_zc_buf_v(AVBufferRef * const buf); -+ -+//---------------------------------------------------------------------------- -+// -+// Mostly internal calls but might possibly be wanted by outside code -+ -+void av_rpi_zc_int_env_freep(AVZcEnvPtr * zc); -+AVZcEnvPtr av_rpi_zc_int_env_alloc(void * const logctx); -+void av_rpi_zc_set_decoder_pool_size(const AVZcEnvPtr zc, const unsigned int pool_size); -+ -+// Test to see if the context is using zc (checks get_buffer2) -+int av_rpi_zc_in_use(const struct AVCodecContext * const s); -+ -+// Get buffer generates placeholders for later alloc -+int av_rpi_zc_get_buffer(const AVZcEnvPtr zc, AVFrame * const frame); -+// Resolve actually does the alloc (noop if already alloced) -+// Set data pointers on a buffer/frame that was copied before the alloc -+// accured -+#define ZC_RESOLVE_FAIL 0 // return error on invalid -+#define ZC_RESOLVE_ALLOC 1 // alloc as invalid -+#define ZC_RESOLVE_WAIT_VALID 2 // wait for valid -+#define ZC_RESOLVE_ALLOC_VALID 3 // alloc as valid -+int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int may_alloc); -+int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc); -+ -+int av_rpi_zc_set_valid_frame(AVFrame * const frame); -+int av_rpi_zc_set_broken_frame(AVFrame * const frame); -+ -+ -+ -+ -+AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx, -+ void * pool_env, -+ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, -+ av_rpi_zc_free_pool_fn_t * free_pool_fn); -+void 
av_rpi_zc_env_release(const AVZcEnvPtr zc); -+ -+ -+#endif -+ -diff --git a/libavcodec/rpi_zc_frames.h b/libavcodec/rpi_zc_frames.h -new file mode 100644 -index 0000000000..9b7b6536a4 ---- /dev/null -+++ b/libavcodec/rpi_zc_frames.h -@@ -0,0 +1,142 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: John Cox, Ben Avison -+*/ -+ -+#ifndef RPI_ZC_FRAMES_H -+#define RPI_ZC_FRAMES_H -+ -+#define RPI_ONE_BUF 1 -+ -+#include "rpi_mem.h" // for GPU_MEM_PTR_T -+#include "libavutil/frame.h" -+ -+#if !RPI_ONE_BUF -+static inline uint32_t get_vc_address_y(const AVFrame * const frame) { -+ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[0]); -+ return p->vc; -+} -+ -+static inline uint32_t get_vc_address_u(const AVFrame * const frame) { -+ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[1]); -+ return p->vc; -+} -+ -+static inline uint32_t get_vc_address_v(const AVFrame * const frame) { -+ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[2]); -+ return p->vc; -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { -+ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[0]); -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) { -+ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[1]); -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { -+ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[2]); -+} -+ -+#else -+ -+static inline int gpu_is_buf1(const AVFrame * const frame) -+{ -+ return frame->buf[1] == NULL; -+} -+ -+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame) -+{ -+ return av_buffer_get_opaque(frame->buf[0]); -+} -+ -+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n) -+{ -+ return av_buffer_pool_buffer_get_opaque(frame->buf[n]); -+} -+ -+static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n) -+{ -+ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? 
gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n); -+ return gm->vc + (frame->data[n] - gm->arm); -+} -+ -+ -+static inline uint32_t get_vc_address_y(const AVFrame * const frame) { -+ return get_vc_address3(frame, 0); -+} -+ -+static inline uint32_t get_vc_address_u(const AVFrame * const frame) { -+ return get_vc_address3(frame, 1); -+} -+ -+static inline uint32_t get_vc_address_v(const AVFrame * const frame) { -+ return get_vc_address3(frame, 2); -+} -+ -+#if 0 -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { -+ if (gpu_is_buf1(frame)) -+ { -+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); -+ g.numbytes = frame->data[1] - frame->data[0]; -+ return g; -+ } -+ else -+ return *gpu_buf3_gmem(frame, 0); -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) { -+ if (gpu_is_buf1(frame)) -+ { -+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); -+ g.arm += frame->data[1] - frame->data[0]; -+ g.vc += frame->data[1] - frame->data[0]; -+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size -+ return g; -+ } -+ else -+ return *gpu_buf3_gmem(frame, 1); -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { -+ if (gpu_is_buf1(frame)) -+ { -+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); -+ g.arm += frame->data[2] - frame->data[0]; -+ g.vc += frame->data[2] - frame->data[0]; -+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size -+ return g; -+ } -+ else -+ return *gpu_buf3_gmem(frame, 2); -+} -+#endif -+#endif -+ -+#endif -diff --git a/libavcodec/rpivid_hevc.c b/libavcodec/rpivid_hevc.c -new file mode 100644 -index 0000000000..85c5b46d75 ---- /dev/null -+++ b/libavcodec/rpivid_hevc.c -@@ -0,0 +1,2128 @@ -+// FFMPEG HEVC decoder hardware accelerator -+// Andrew Holme, Argon Design Ltd -+// Copyright (c) June 2017 Raspberry Pi Ltd -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "fftools/ffmpeg.h" -+#include "libavutil/avassert.h" -+#include 
"libavutil/imgutils.h" -+#include "avcodec.h" -+#include "hwconfig.h" -+#include "decode.h" -+ -+#include "hevc.h" -+#include "hevcdec.h" -+#include "rpi_zc.h" -+#include "rpi_mem.h" -+#include "rpi_zc_frames.h" -+#include "rpi_mailbox.h" -+ -+ -+#define OPT_PHASE_TIMING 0 // Generate stats for phase usage -+ -+#define OPT_EMU 0 -+ -+#define TRACE_DEV 0 -+#define TRACE_ENTRY 0 -+ -+#define NUM_SCALING_FACTORS 4064 -+ -+#define AXI_BASE64 0 -+ -+#define PROB_BACKUP ((20<<12) + (20<<6) + (0<<0)) -+#define PROB_RELOAD ((20<<12) + (20<<0) + (0<<6)) -+ -+#define RPIVID_COL_PICS 17 // 16 ref & current -+ -+#define RPIVID_BITBUFS 2 // Bit + Cmd bufs (phase 0 & 1) -+#define RPIVID_BITBUF_SIZE (4 << 20) // Bit + Cmd buf size -+ -+#define RPIVID_COEFFBUFS 3 // PU + Coeff bufs (phase 1 & 2) -+#define RPIVID_COEFFBUF_SIZE (16 << 20) // PU + Coeff buf size -+ -+////////////////////////////////////////////////////////////////////////////// -+// -+// Register offsets -+ -+#define RPI_SPS0 0 -+#define RPI_SPS1 4 -+#define RPI_PPS 8 -+#define RPI_SLICE 12 -+#define RPI_TILESTART 16 -+#define RPI_TILEEND 20 -+#define RPI_SLICESTART 24 -+#define RPI_MODE 28 -+#define RPI_LEFT0 32 -+#define RPI_LEFT1 36 -+#define RPI_LEFT2 40 -+#define RPI_LEFT3 44 -+#define RPI_QP 48 -+#define RPI_CONTROL 52 -+#define RPI_STATUS 56 -+#define RPI_VERSION 60 -+#define RPI_BFBASE 64 -+#define RPI_BFNUM 68 -+#define RPI_BFCONTROL 72 -+#define RPI_BFSTATUS 76 -+#define RPI_PUWBASE 80 -+#define RPI_PUWSTRIDE 84 -+#define RPI_COEFFWBASE 88 -+#define RPI_COEFFWSTRIDE 92 -+#define RPI_SLICECMDS 96 -+#define RPI_BEGINTILEEND 100 -+#define RPI_TRANSFER 104 -+#define RPI_CFBASE 108 -+#define RPI_CFNUM 112 -+#define RPI_CFSTATUS 116 -+ -+#define RPI_PURBASE 0x8000 -+#define RPI_PURSTRIDE 0x8004 -+#define RPI_COEFFRBASE 0x8008 -+#define RPI_COEFFRSTRIDE 0x800C -+#define RPI_NUMROWS 0x8010 -+#define RPI_CONFIG2 0x8014 -+#define RPI_OUTYBASE 0x8018 -+#define RPI_OUTYSTRIDE 0x801C -+#define RPI_OUTCBASE 0x8020 
-+#define RPI_OUTCSTRIDE 0x8024 -+#define RPI_STATUS2 0x8028 -+#define RPI_FRAMESIZE 0x802C -+#define RPI_MVBASE 0x8030 -+#define RPI_MVSTRIDE 0x8034 -+#define RPI_COLBASE 0x8038 -+#define RPI_COLSTRIDE 0x803C -+#define RPI_CURRPOC 0x8040 -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+// Unused but left here to illustrate the diffrences between FFmpegs prob -+// structure and the rpivid one -+ -+struct FFM_PROB { -+ uint8_t sao_merge_flag [ 1]; -+ uint8_t sao_type_idx [ 1]; -+ uint8_t split_coding_unit_flag [ 3]; -+ uint8_t cu_transquant_bypass_flag [ 1]; -+ uint8_t skip_flag [ 3]; -+ uint8_t cu_qp_delta [ 3]; -+ uint8_t pred_mode_flag [ 1]; -+ uint8_t part_mode [ 4]; -+ uint8_t prev_intra_luma_pred_flag [ 1]; -+ uint8_t intra_chroma_pred_mode [ 2]; -+ uint8_t merge_flag [ 1]; -+ uint8_t merge_idx [ 1]; -+ uint8_t inter_pred_idc [ 5]; -+ uint8_t ref_idx_l0 [ 2]; -+ uint8_t ref_idx_l1 [ 2]; -+ uint8_t abs_mvd_greater0_flag [ 2]; -+ uint8_t abs_mvd_greater1_flag [ 2]; -+ uint8_t mvp_lx_flag [ 1]; -+ uint8_t no_residual_data_flag [ 1]; -+ uint8_t split_transform_flag [ 3]; -+ uint8_t cbf_luma [ 2]; -+ uint8_t cbf_cb_cr [ 4]; -+ uint8_t transform_skip_flag/*[][]*/ [ 2]; -+ uint8_t explicit_rdpcm_flag/*[][]*/ [ 2]; -+ uint8_t explicit_rdpcm_dir_flag/*[][]*/ [ 2]; -+ uint8_t last_significant_coeff_x_prefix [18]; -+ uint8_t last_significant_coeff_y_prefix [18]; -+ uint8_t significant_coeff_group_flag [ 4]; -+ uint8_t significant_coeff_flag [44]; -+ uint8_t coeff_abs_level_greater1_flag [24]; -+ uint8_t coeff_abs_level_greater2_flag [ 6]; -+ uint8_t log2_res_scale_abs [ 8]; -+ uint8_t res_scale_sign_flag [ 2]; -+ uint8_t cu_chroma_qp_offset_flag [ 1]; -+ uint8_t cu_chroma_qp_offset_idx [ 1]; -+} __attribute__((packed)); -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+struct RPI_PROB { -+ uint8_t SAO_MERGE_FLAG [ 1]; -+ uint8_t SAO_TYPE_IDX [ 1]; -+ uint8_t SPLIT_FLAG [ 3]; -+ uint8_t 
CU_SKIP_FLAG [ 3]; -+ uint8_t CU_TRANSQUANT_BYPASS_FLAG [ 1]; -+ uint8_t PRED_MODE [ 1]; -+ uint8_t PART_SIZE [ 4]; -+ uint8_t INTRA_PRED_MODE [ 1]; -+ uint8_t CHROMA_PRED_MODE [ 1]; -+ uint8_t MERGE_FLAG_EXT [ 1]; -+ uint8_t MERGE_IDX_EXT [ 1]; -+ uint8_t INTER_DIR [ 5]; -+ uint8_t REF_PIC [ 2]; -+ uint8_t MVP_IDX [ 1]; -+ uint8_t MVD [ 2]; -+ uint8_t QT_ROOT_CBF [ 1]; -+ uint8_t TRANS_SUBDIV_FLAG [ 3]; -+ uint8_t QT_CBF [ 6]; -+ uint8_t DQP [ 2]; -+ uint8_t ONE_FLAG [24]; -+ uint8_t LASTX [18]; -+ uint8_t LASTY [18]; -+ uint8_t SIG_CG_FLAG [ 4]; -+ uint8_t ABS_FLAG [ 6]; -+ uint8_t TRANSFORMSKIP_FLAG [ 2]; -+ uint8_t SIG_FLAG [42]; -+ uint8_t SIG_FLAG_unused [ 2]; -+} __attribute__((packed)); -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+struct RPI_CMD { -+ uint32_t addr; -+ uint32_t data; -+} __attribute__((packed)); -+ -+struct RPI_BIT { -+ int cmd; -+ const void *ptr; -+ int len; -+}; -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+struct RPI_T; -+ -+// Actual addressability is 38bits but we can only alloc in the bottom 32 -+// currently - when passed to rpivid h/w the address is always >> 6 so will -+// fit in 32 bit there -+// At some point we may weant to make this uint64_t -+typedef uint32_t vid_vc_addr_t; -+ -+typedef enum rpivid_decode_state_e { -+ RPIVID_DECODE_NEW = 0, -+ RPIVID_DECODE_START, -+ RPIVID_DECODE_SLICE, -+ RPIVID_DECODE_END, -+} rpivid_decode_state_t; -+ -+#define RPI_PROB_VALS 154U -+#define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3) -+ -+typedef struct dec_env_s { -+ const AVCodecContext * avctx; -+ -+ rpivid_decode_state_t state; -+ unsigned int decode_order; -+ -+ int phase_no; // Current phase (i.e. 
the last one we waited for) -+ struct dec_env_s * phase_wait_q_next; -+ sem_t phase_wait; -+ -+ struct RPI_BIT *bit_fifo; -+ struct RPI_CMD *cmd_fifo; -+ unsigned int bit_len, bit_max; -+ unsigned int cmd_len, cmd_max; -+ unsigned int num_slice_msgs; -+ unsigned int PicWidthInCtbsY; -+ unsigned int PicHeightInCtbsY; -+ unsigned int dpbno_col; -+ uint32_t reg_slicestart; -+ unsigned int wpp_entry_x; -+ unsigned int wpp_entry_y; -+ -+ const uint8_t * nal_buffer; -+ size_t nal_size; -+ -+ uint16_t slice_msgs[2*HEVC_MAX_REFS*8+3]; -+ uint8_t scaling_factors[NUM_SCALING_FACTORS]; -+// unsigned int RefPicList[2][HEVC_MAX_REFS]; -+} dec_env_t; -+ -+#define RPIVID_PHASES 3 -+#define RPIVID_PHASE_NEW (RPIVID_PHASES) // Phase before we have inced decode order -+#define RPIVID_PHASE_START (-1) // Phase after we have inced decode_order -+ -+#if OPT_PHASE_TIMING -+static const unsigned int time_thresholds[8] = { -+ 10, 15, 20, 30, 45, 60, 75, 90 -+}; -+#endif -+ -+typedef struct phase_wait_env_s { -+ unsigned int last_order; -+ dec_env_t * q; -+#if OPT_PHASE_TIMING -+ uint64_t phase_time; -+ uint64_t max_phase_time; -+ uint64_t time_in_phase; -+ uint64_t time_out_phase; -+ unsigned int max_time_decode_order; -+ unsigned int time_bins[9]; -+ unsigned int time_bins3[9]; -+ unsigned int time_bins5[9]; -+ uint64_t time_stash[16]; -+ unsigned int i3; -+#endif -+} phase_wait_env_t; // Single linked list of threads waiting for this phase -+ -+typedef struct RPI_T { -+ atomic_int ref_count; -+ sem_t ref_zero; -+ -+ dec_env_t ** dec_envs; -+ AVZcEnvPtr zc; -+ -+ pthread_mutex_t phase_lock; -+ phase_wait_env_t phase_reqs[RPIVID_PHASES]; -+ -+ volatile uint32_t * regs; -+ volatile uint32_t * ints; -+ -+ GPU_MEM_PTR_T gcolbuf; -+ unsigned int col_stride; -+ size_t col_picsize; -+ -+ unsigned int bitbuf_no; -+ sem_t bitbuf_sem; -+ GPU_MEM_PTR_T gbitbufs[RPIVID_BITBUFS]; -+ -+ unsigned int max_pu_msgs; -+ unsigned int coeffbuf_no; -+ sem_t coeffbuf_sem; -+ GPU_MEM_PTR_T 
gcoeffbufs[RPIVID_COEFFBUFS]; -+ -+ unsigned int decode_order; -+ int mbox_fd; -+ int gpu_init_type; -+} RPI_T; -+ -+#if OPT_PHASE_TIMING -+static uint64_t tus64(void) -+{ -+ struct timespec ts; -+ clock_gettime(CLOCK_MONOTONIC, &ts); -+ return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000; -+} -+#endif -+ -+static inline unsigned int rnd64(unsigned int x) -+{ -+ return (x + 63) & ~63; -+} -+ -+static inline int rpi_sem_wait(sem_t * const sem) -+{ -+ int rv; -+ while ((rv = sem_wait(sem)) != 0 && errno == EINTR) -+ /* Loop */; -+ return rv; -+} -+ -+//============================================================================ -+ -+#define REGS_NAME "/dev/rpivid-hevcmem" -+#define REGS_SIZE 0x10000 -+#define INTS_NAME "/dev/rpivid-intcmem" -+#define INTS_SIZE 0x10000 // 4 is probably enough but we are going to alloc a page anyway -+ -+static volatile uint32_t * map_dev(AVCodecContext * const avctx, const char * const dev_name, size_t size) -+{ -+ void *gpio_map; -+ int mem_fd; -+ -+ /* open /dev/mem */ -+ if ((mem_fd = open(dev_name, O_RDWR|O_SYNC) ) < 0) { -+ av_log(avctx, AV_LOG_WARNING, "can't open %s\n", dev_name); -+ return NULL; -+ } -+ -+ // Now map it -+ gpio_map = mmap( -+ NULL, -+ size, -+ PROT_READ|PROT_WRITE, -+ MAP_SHARED, -+ mem_fd, -+ 0 -+ ); -+ -+ close(mem_fd); // No longer need the FD -+ -+ if (gpio_map == MAP_FAILED) { -+ av_log(avctx, AV_LOG_WARNING, "GPIO mapping failed"); -+ return NULL; -+ } -+ -+ return (volatile uint32_t *)gpio_map; -+} -+ -+static void unmap_devp(volatile uint32_t ** const p_gpio_map, size_t size) -+{ -+ volatile uint32_t * const gpio_map = *p_gpio_map; -+ if (gpio_map != NULL) { -+ *p_gpio_map = NULL; -+ munmap((void *)gpio_map, size); -+ } -+} -+ -+#define MANGLE(x) ((x) &~0xc0000000) // ** If x is ever a 64 bit thing this will need fixing! 
-+#define MANGLE64(x) (uint32_t)(MANGLE(x)>>6) -+ -+static inline void apb_write_vc_addr(const RPI_T *const rpi, const uint32_t addr, const vid_vc_addr_t data) -+{ -+#if TRACE_DEV -+ printf("W %x %08x\n", addr, MANGLE64(data)); -+#endif -+ -+ rpi->regs[addr >> 2] = MANGLE64(data); -+} -+ -+static inline void apb_write_vc_len(const RPI_T *const rpi, const uint32_t addr, const unsigned int data) -+{ -+#if TRACE_DEV -+ printf("W %x %08x\n", addr, data >> 6); -+#endif -+ -+ rpi->regs[addr >> 2] = data >> 6; // ?? rnd64 - but not currently needed -+} -+ -+static inline void apb_write(const RPI_T * const rpi, const uint32_t addr, const uint32_t data) -+{ -+#if TRACE_DEV -+ printf("W %x %08x\n", addr, data); -+#endif -+ -+ rpi->regs[addr >> 2] = data; -+} -+ -+static inline uint32_t apb_read(const RPI_T * const rpi, const uint32_t addr) -+{ -+ const uint32_t v = rpi->regs[addr >> 2]; -+#if TRACE_DEV -+ printf("R %x (=%x)\n", addr, v); -+#endif -+ return v; -+} -+ -+#define ARG_IC_ICTRL_ACTIVE1_INT_SET 0x00000001 -+#define ARG_IC_ICTRL_ACTIVE1_EDGE_SET 0x00000002 -+#define ARG_IC_ICTRL_ACTIVE1_EN_SET 0x00000004 -+#define ARG_IC_ICTRL_ACTIVE1_STATUS_SET 0x00000008 -+#define ARG_IC_ICTRL_ACTIVE2_INT_SET 0x00000010 -+#define ARG_IC_ICTRL_ACTIVE2_EDGE_SET 0x00000020 -+#define ARG_IC_ICTRL_ACTIVE2_EN_SET 0x00000040 -+#define ARG_IC_ICTRL_ACTIVE2_STATUS_SET 0x00000080 -+ -+static inline void int_wait(const RPI_T * const rpi, const unsigned int phase) -+{ -+ const uint32_t mask_reset = phase == 1 ? ~ARG_IC_ICTRL_ACTIVE2_INT_SET : ~ARG_IC_ICTRL_ACTIVE1_INT_SET; -+ const uint32_t mask_done = phase == 1 ? 
ARG_IC_ICTRL_ACTIVE1_INT_SET : ARG_IC_ICTRL_ACTIVE2_INT_SET; -+ uint32_t ival; -+ while (((ival = rpi->ints[0]) & mask_done) == 0) { -+ usleep(1000); -+ } -+ rpi->ints[0] = ival & mask_reset; -+} -+ -+#if TRACE_DEV && 0 -+static void apb_dump_regs(const RPI_T * const rpi, uint16_t addr, int num) { -+ int i; -+ -+ for (i=0; iregs[(addr>>2)+i]); -+ -+ if ((i%4)==3 || i+1 == num) -+ printf("\n"); -+ else -+ printf(" "); -+ } -+} -+ -+static void axi_dump(const dec_env_t * const de, uint64_t addr, uint32_t size) { -+ int i; -+ -+ for (i=0; i>2; i++) -+ { -+ if ((i%4)==0) -+ printf("%08x: ", MANGLE(de->gbuf.vc) + (uint32_t)addr + 4*i); -+ -+ printf("%08x", ((uint32_t*)de->gbuf.arm)[(addr>>2)+i]); -+ -+ if ((i%4)==3 || i+1 == size>>2) -+ printf("\n"); -+ else -+ printf(" "); -+ } -+} -+#endif -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+static inline size_t round_up_size(const size_t x) -+{ -+ /* Admit no size < 256 */ -+ const unsigned int n = x < 256 ? 8 : av_log2(x) - 1; -+ -+ return x >= (3 << n) ? 
4 << n : (3 << n); -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Scaling factors -+ -+static void expand_scaling_list( -+ const unsigned int sizeID, -+ const unsigned int matrixID, -+ uint8_t * const dst0, -+ const uint8_t * const src0, -+ uint8_t dc) -+{ -+ switch (sizeID) { -+ case 0: -+ memcpy(dst0, src0, 16); -+ break; -+ case 1: -+ memcpy(dst0, src0, 64); -+ break; -+ case 2: -+ { -+ uint8_t * d = dst0; -+ for (unsigned int y=0; y != 16; y++) { -+ const uint8_t * s = src0 + (y >> 1) * 8; -+ for (unsigned int x = 0; x != 8; ++x) { -+ *d++ = *s; -+ *d++ = *s++; -+ } -+ } -+ dst0[0] = dc; -+ break; -+ } -+ default: -+ { -+ uint8_t * d = dst0; -+ for (unsigned int y=0; y != 32; y++) { -+ const uint8_t * s = src0 + (y >> 2) * 8; -+ for (unsigned int x = 0; x != 8; ++x) { -+ *d++ = *s; -+ *d++ = *s; -+ *d++ = *s; -+ *d++ = *s++; -+ } -+ } -+ dst0[0] = dc; -+ break; -+ } -+ } -+} -+ -+static void populate_scaling_factors(dec_env_t * const de, const HEVCContext * const s) { -+ // Array of constants for scaling factors -+ static const uint32_t scaling_factor_offsets[4][6] = { -+ // MID0 MID1 MID2 MID3 MID4 MID5 -+ {0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050}, // SID0 (4x4) -+ {0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0}, // SID1 (8x8) -+ {0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0}, // SID2 (16x16) -+ {0x07E0, 0, 0, 0x0BE0, 0, 0}}; // SID3 (32x32) -+ -+ // ffmpeg places SID3,MID1 where matrixID 3 normally is -+ const ScalingList * const sl = -+ s->ps.pps->scaling_list_data_present_flag ? 
&s->ps.pps->scaling_list -+ : &s->ps.sps->scaling_list; -+ unsigned int mid; -+ -+ for (mid=0; mid<6; mid++) -+ expand_scaling_list(0, mid, -+ de->scaling_factors + scaling_factor_offsets[0][mid], -+ sl->sl[0][mid], 0); -+ for (mid=0; mid<6; mid++) -+ expand_scaling_list(1, mid, -+ de->scaling_factors + scaling_factor_offsets[1][mid], -+ sl->sl[1][mid], 0); -+ for (mid=0; mid<6; mid++) -+ expand_scaling_list(2, mid, -+ de->scaling_factors + scaling_factor_offsets[2][mid], -+ sl->sl[2][mid], -+ sl->sl_dc[0][mid]); -+ // second scaling matrix for 32x32 is at matrixID 3 not 1 in ffmpeg -+ for (mid=0; mid<6; mid += 3) -+ expand_scaling_list(3, mid, -+ de->scaling_factors + scaling_factor_offsets[3][mid], -+ sl->sl[3][mid], -+ sl->sl_dc[1][mid]); -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Probabilities -+ -+static const uint8_t prob_init[3][156] = { -+ { -+ 153, 200, 139, 141, 157, 154, 154, 154, -+ 154, 154, 184, 154, 154, 154, 184, 63, -+ 154, 154, 154, 154, 154, 154, 154, 154, -+ 154, 154, 154, 154, 154, 153, 138, 138, -+ 111, 141, 94, 138, 182, 154, 154, 154, -+ 140, 92, 137, 138, 140, 152, 138, 139, -+ 153, 74, 149, 92, 139, 107, 122, 152, -+ 140, 179, 166, 182, 140, 227, 122, 197, -+ 110, 110, 124, 125, 140, 153, 125, 127, -+ 140, 109, 111, 143, 127, 111, 79, 108, -+ 123, 63, 110, 110, 124, 125, 140, 153, -+ 125, 127, 140, 109, 111, 143, 127, 111, -+ 79, 108, 123, 63, 91, 171, 134, 141, -+ 138, 153, 136, 167, 152, 152, 139, 139, -+ 111, 111, 125, 110, 110, 94, 124, 108, -+ 124, 107, 125, 141, 179, 153, 125, 107, -+ 125, 141, 179, 153, 125, 107, 125, 141, -+ 179, 153, 125, 140, 139, 182, 182, 152, -+ 136, 152, 136, 153, 136, 139, 111, 136, -+ 139, 111, 0, 0, }, -+ { -+ 153, 185, 107, 139, 126, 197, 185, 201, -+ 154, 149, 154, 139, 154, 154, 154, 152, -+ 110, 122, 95, 79, 63, 31, 31, 153, -+ 153, 168, 140, 198, 79, 124, 138, 94, -+ 153, 111, 149, 107, 167, 154, 154, 154, -+ 154, 196, 196, 167, 154, 152, 167, 182, -+ 
182, 134, 149, 136, 153, 121, 136, 137, -+ 169, 194, 166, 167, 154, 167, 137, 182, -+ 125, 110, 94, 110, 95, 79, 125, 111, -+ 110, 78, 110, 111, 111, 95, 94, 108, -+ 123, 108, 125, 110, 94, 110, 95, 79, -+ 125, 111, 110, 78, 110, 111, 111, 95, -+ 94, 108, 123, 108, 121, 140, 61, 154, -+ 107, 167, 91, 122, 107, 167, 139, 139, -+ 155, 154, 139, 153, 139, 123, 123, 63, -+ 153, 166, 183, 140, 136, 153, 154, 166, -+ 183, 140, 136, 153, 154, 166, 183, 140, -+ 136, 153, 154, 170, 153, 123, 123, 107, -+ 121, 107, 121, 167, 151, 183, 140, 151, -+ 183, 140, 0, 0, }, -+ { -+ 153, 160, 107, 139, 126, 197, 185, 201, -+ 154, 134, 154, 139, 154, 154, 183, 152, -+ 154, 137, 95, 79, 63, 31, 31, 153, -+ 153, 168, 169, 198, 79, 224, 167, 122, -+ 153, 111, 149, 92, 167, 154, 154, 154, -+ 154, 196, 167, 167, 154, 152, 167, 182, -+ 182, 134, 149, 136, 153, 121, 136, 122, -+ 169, 208, 166, 167, 154, 152, 167, 182, -+ 125, 110, 124, 110, 95, 94, 125, 111, -+ 111, 79, 125, 126, 111, 111, 79, 108, -+ 123, 93, 125, 110, 124, 110, 95, 94, -+ 125, 111, 111, 79, 125, 126, 111, 111, -+ 79, 108, 123, 93, 121, 140, 61, 154, -+ 107, 167, 91, 107, 107, 167, 139, 139, -+ 170, 154, 139, 153, 139, 123, 123, 63, -+ 124, 166, 183, 140, 136, 153, 154, 166, -+ 183, 140, 136, 153, 154, 166, 183, 140, -+ 136, 153, 154, 170, 153, 138, 138, 122, -+ 121, 122, 121, 167, 151, 183, 140, 151, -+ 183, 140, 0, 0, }, -+}; -+ -+ -+////////////////////////////////////////////////////////////////////////////// -+// Phase 1 command and bit FIFOs -+ -+// ???? 
uint16_t addr - put in uint32_t -+static int p1_apb_write(dec_env_t * const de, const uint16_t addr, const uint32_t data) { -+ if (de->cmd_len==de->cmd_max) -+ av_assert0(de->cmd_fifo = realloc(de->cmd_fifo, (de->cmd_max*=2)*sizeof(struct RPI_CMD))); -+ -+#if TRACE_DEV -+ printf("[%02x] %x %x\n", de->cmd_len, addr, data); -+#endif -+ -+ de->cmd_fifo[de->cmd_len].addr = addr; -+ de->cmd_fifo[de->cmd_len].data = data; -+ return de->cmd_len++; -+} -+ -+static void p1_axi_write(dec_env_t * const de, const uint32_t len, const void * const ptr, const int cmd_idx) { -+ if (de->bit_len==de->bit_max) -+ av_assert0(de->bit_fifo = realloc(de->bit_fifo, (de->bit_max*=2)*sizeof(struct RPI_BIT))); -+ de->bit_fifo[de->bit_len].cmd = cmd_idx; -+ de->bit_fifo[de->bit_len].ptr = ptr; -+ de->bit_fifo[de->bit_len].len = len; -+ de->bit_len++; -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Write probability and scaling factor memories -+ -+#if 0 -+static void WriteProb(dec_env_t * const de) { -+ int i; -+ const uint8_t *p = (uint8_t *) &de->probabilities; -+ for (i=0; ish.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ? 
-+ s->sh.slice_type + 1 : 2 - s->sh.slice_type; -+ const uint8_t * p = prob_init[init_type]; -+ const int q = av_clip(s->sh.slice_qp, 0, 51); -+ unsigned int i; -+ -+ for (i = 0; i < RPI_PROB_VALS; i++) { -+ int init_value = p[i]; -+ int m = (init_value >> 4) * 5 - 45; -+ int n = ((init_value & 15) << 3) - 16; -+ int pre = 2 * (((m * q) >> 4) + n) - 127; -+ -+ pre ^= pre >> 31; -+ if (pre > 124) -+ pre = 124 + (pre & 1); -+ dst[i] = pre; -+ } -+ for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i) { -+ dst[i] = 0; -+ } -+ -+ for (i=0; i < RPI_PROB_ARRAY_SIZE; i+=4) -+ p1_apb_write(de, 0x1000+i, dst[i] + (dst[i+1]<<8) + (dst[i+2]<<16) + (dst[i+3]<<24)); -+ -+} -+ -+ -+static void WriteScalingFactors(dec_env_t * const de) { -+ int i; -+ const uint8_t *p = (uint8_t *) de->scaling_factors; -+ for (i=0; i= bd[i]; i++); // bd[] has num+1 elements; bd[0]=0; see hevc_ps.c -+ return i-1; -+} -+ -+static int ctb_to_slice_w_h (unsigned int ctb, int ctb_size, int width, unsigned int *bd, int num) { -+ if (ctb < bd[num-1]) return ctb_size; -+ else if (width % ctb_size) return width % ctb_size; -+ else return ctb_size; -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Handle PU and COEFF stream overflow -+ -+ -+// Returns: -+// -2 Other error -+// -1 Out of coeff space -+// 0 OK -+// 1 Out of PU space -+ -+static int check_status(const RPI_T * const rpi, dec_env_t * const de) { -+ uint32_t status; -+ -+ // this is the definition of successful completion of phase 1 -+ // it assures that status register is zero and all blocks in each tile have completed -+ if (apb_read(rpi, RPI_CFSTATUS) == apb_read(rpi, RPI_CFNUM)) -+ return 0; -+ -+ status = apb_read(rpi, RPI_STATUS); -+ -+ if ((status & 8) != 0) -+ return -1; -+ -+ if ((status & 0x10) != 0) -+ return 1; -+ -+ return -2; -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Write STATUS register with expected end CTU address of previous slice -+ 
-+static void end_previous_slice(dec_env_t * const de, const HEVCContext * const s, const int ctb_addr_ts) { -+ const HEVCPPS * const pps = s->ps.pps; -+ int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY; -+ int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY; -+ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18)); -+} -+ -+static void wpp_pause(dec_env_t * const de, int ctb_row) { -+ p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + 0x25); -+ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); -+ p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1 ? 0x70000 : 0x30000); -+ p1_apb_write(de, RPI_CONTROL, (ctb_row<<16) + 2); -+} -+ -+static void wpp_end_previous_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) { -+ const HEVCPPS *pps = s->ps.pps; -+ int new_x = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY; -+ int new_y = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY; -+ int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY; -+ int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY; -+ if (de->wpp_entry_x<2 && (de->wpp_entry_y2) && de->PicWidthInCtbsY>2) -+ wpp_pause(de, last_y); -+ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18)); -+ if (new_x==2 || de->PicWidthInCtbsY==2 && de->wpp_entry_yps.sps; -+ const HEVCPPS *pps = s->ps.pps; -+ -+ p1_apb_write(de, RPI_SPS0, -+ (sps->log2_min_cb_size << 0) + -+ (sps->log2_ctb_size << 4) + -+ (sps->log2_min_tb_size << 8) + -+ (sps->log2_max_trafo_size << 12) + -+ (sps->bit_depth << 16) + -+ (sps->bit_depth << 20) + -+ (sps->max_transform_hierarchy_depth_intra << 24) + -+ (sps->max_transform_hierarchy_depth_inter << 28)); -+ -+ p1_apb_write(de, RPI_SPS1, -+ (sps->pcm.bit_depth << 0) + -+ (sps->pcm.bit_depth_chroma << 4) + -+ (sps->pcm.log2_min_pcm_cb_size << 8) + -+ (sps->pcm.log2_max_pcm_cb_size << 12) + -+ (sps->separate_colour_plane_flag? 
0:sps->chroma_format_idc << 16) + -+ (sps->amp_enabled_flag << 18) + -+ (sps->pcm_enabled_flag << 19) + -+ (sps->scaling_list_enable_flag << 20) + -+ (sps->sps_strong_intra_smoothing_enable_flag << 21)); -+ -+ p1_apb_write(de, RPI_PPS, -+ (sps->log2_ctb_size - pps->diff_cu_qp_delta_depth << 0) + -+ (pps->cu_qp_delta_enabled_flag << 4) + -+ (pps->transquant_bypass_enable_flag << 5) + -+ (pps->transform_skip_enabled_flag << 6) + -+ (pps->sign_data_hiding_flag << 7) + -+ (((pps->cb_qp_offset + s->sh.slice_cb_qp_offset)&255) << 8) + -+ (((pps->cr_qp_offset + s->sh.slice_cr_qp_offset)&255) << 16) + -+ (pps->constrained_intra_pred_flag << 24)); -+ -+ if (s->ps.sps->scaling_list_enable_flag) WriteScalingFactors(de); -+ -+ if (!s->sh.dependent_slice_segment_flag) { -+ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY; -+ int ctb_row = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY; -+ de->reg_slicestart = (ctb_col<<0) + (ctb_row<<16); -+ } -+ -+ p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart); -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+static void write_slice(dec_env_t * const de, const HEVCContext * const s, -+ const unsigned int slice_w, const unsigned int slice_h) { -+ uint32_t u32 = -+ (s->sh.slice_type << 12) -+ + (s->sh.slice_sample_adaptive_offset_flag[0] << 14) -+ + (s->sh.slice_sample_adaptive_offset_flag[1] << 15) -+ + (slice_w << 17) -+ + (slice_h << 24); -+ -+ if (s->sh.slice_type==HEVC_SLICE_B || s->sh.slice_type==HEVC_SLICE_P) u32 |= -+ (s->sh.max_num_merge_cand << 0) -+ + (s->sh.nb_refs[L0] << 4) -+ + (s->sh.nb_refs[L1] << 8); -+ -+ if (s->sh.slice_type==HEVC_SLICE_B) -+ u32 |= s->sh.mvd_l1_zero_flag<<16; -+ p1_apb_write(de, RPI_SLICE, u32); -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Wavefront mode -+ -+static void wpp_entry_point(dec_env_t * const de, const HEVCContext * const s, -+ const int do_bte, const int resetQPY, const int ctb_addr_ts) { 
-+ const HEVCSPS * const sps = s->ps.sps; -+ const HEVCPPS * const pps = s->ps.pps; -+ -+ int ctb_size = 1<log2_ctb_size; -+ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ -+ int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->PicWidthInCtbsY; -+ int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->PicWidthInCtbsY; -+ -+ int endx = de->PicWidthInCtbsY-1; -+ int endy = ctb_row; -+ -+ uint8_t slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, sps->width, pps->col_bd, pps->num_tile_columns); -+ uint8_t slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, sps->height, pps->row_bd, pps->num_tile_rows); -+ -+ p1_apb_write(de, RPI_TILESTART, 0); -+ p1_apb_write(de, RPI_TILEEND, endx + (endy<<16)); -+ -+ if (do_bte) -+ p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16)); -+ -+ write_slice(de, s, slice_w, ctb_row==de->PicHeightInCtbsY-1? slice_h : ctb_size); -+ -+ if (resetQPY) p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp); -+ -+ p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1? 
0x60001 : 0x20001); -+ p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16)); -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Tiles mode -+ -+static void new_entry_point(dec_env_t * const de, const HEVCContext * const s, -+ const int do_bte, const int resetQPY, const int ctb_addr_ts) { -+ const HEVCSPS * const sps = s->ps.sps; -+ const HEVCPPS * const pps = s->ps.pps; -+ -+ int ctb_col = pps->ctb_addr_ts_to_rs[ctb_addr_ts] % de->PicWidthInCtbsY; -+ int ctb_row = pps->ctb_addr_ts_to_rs[ctb_addr_ts] / de->PicWidthInCtbsY; -+ -+ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns); -+ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows); -+ -+ int endx = pps->col_bd[tile_x+1] - 1; -+ int endy = pps->row_bd[tile_y+1] - 1; -+ -+ uint8_t slice_w = ctb_to_slice_w_h(ctb_col, 1<log2_ctb_size, sps->width, pps->col_bd, pps->num_tile_columns); -+ uint8_t slice_h = ctb_to_slice_w_h(ctb_row, 1<log2_ctb_size, sps->height, pps->row_bd, pps->num_tile_rows); -+ -+ p1_apb_write(de, RPI_TILESTART, pps->col_bd[tile_x] + (pps->row_bd[tile_y]<<16)); -+ p1_apb_write(de, RPI_TILEEND, endx + (endy<<16)); -+ -+ if (do_bte) -+ p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16)); -+ -+ write_slice(de, s, slice_w, slice_h); -+ -+ if (resetQPY) -+ p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp); -+ -+ p1_apb_write(de, RPI_MODE, (0xFFFF << 0) -+ + (0x0 << 16) -+ + ((tile_x==pps->num_tile_columns-1) << 17) -+ + ((tile_y==pps->num_tile_rows-1) << 18)); -+ -+ p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16)); -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+// Doesn't attempt to remove from context as we should only do this at the end -+// of time or on create error -+static void -+dec_env_delete(dec_env_t * const de) -+{ -+// gpu_free(&de->gbuf); -+ -+ av_freep(&de->cmd_fifo); -+ av_freep(&de->bit_fifo); -+ -+ sem_destroy(&de->phase_wait); -+ 
av_free(de); -+} -+ -+static dec_env_t * -+dec_env_new(AVCodecContext * const avctx, RPI_T * const rpi) -+{ -+ dec_env_t * const de = av_mallocz(sizeof(*de)); -+ int i; -+ -+ if (de == NULL) -+ return NULL; -+ -+ de->avctx = avctx; -+ de->phase_no = RPIVID_PHASE_NEW; -+ -+ sem_init(&de->phase_wait, 0, 0); -+ -+ if ((de->cmd_fifo = malloc((de->cmd_max=1024)*sizeof(struct RPI_CMD))) == NULL) -+ goto fail; -+ -+ if ((de->bit_fifo = malloc((de->bit_max=1024)*sizeof(struct RPI_BIT))) == NULL) -+ goto fail; -+ -+ pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this -+ for (i = 0; i != avctx->thread_count; ++i) { -+ if (rpi->dec_envs[i] == NULL) -+ { -+ rpi->dec_envs[i] = de; -+ break; -+ } -+ } -+ pthread_mutex_unlock(&rpi->phase_lock); -+ -+ if (i == avctx->thread_count) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to find a slot for hw thread context\n"); -+ goto fail; -+ } -+ -+ return de; -+ -+fail: -+ dec_env_delete(de); -+ return NULL; -+} -+ -+ -+static dec_env_t * -+dec_env_get(AVCodecContext * const avctx, RPI_T * const rpi) -+{ -+ dec_env_t * de = NULL; -+ const int ref_count = atomic_fetch_add(&rpi->ref_count, 1); -+ -+ if (ref_count <= 0) { -+ // Already dead -+ av_log(avctx, AV_LOG_ERROR, "RPIVID called whilst dead\n");; -+ return NULL; -+ } -+ -+ for (int i = 0; i != avctx->thread_count; ++i) { -+ if (rpi->dec_envs[i] == NULL) -+ { -+ de = dec_env_new(avctx, rpi); -+ break; -+ } -+ if (rpi->dec_envs[i]->avctx == avctx) -+ { -+ de = rpi->dec_envs[i]; -+ break; -+ } -+ } -+ return de; -+} -+ -+// Call at end of fn -+// Used to ensure we aren't in a worker thead when killed -+static void -+dec_env_release(RPI_T * const rpi, dec_env_t * const de) -+{ -+ const int n = atomic_fetch_sub(&rpi->ref_count, 1); -+ if (n == 1) { -+ sem_post(&rpi->ref_zero); -+ } -+} -+ -+//---------------------------------------------------------------------------- -+ -+// Wait for a slot in the given phase -+// Any error return is probably fatal 
-+static int -+wait_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no) -+{ -+ int needs_wait = 0; -+ phase_wait_env_t *const p = rpi->phase_reqs + phase_no; -+ -+ pthread_mutex_lock(&rpi->phase_lock); -+ if (p->last_order + 1 != de->decode_order) { -+ de->phase_wait_q_next = p->q; -+ p->q = de; -+ needs_wait = 1; -+ } -+ pthread_mutex_unlock(&rpi->phase_lock); -+ -+ if (needs_wait) { -+ while (sem_wait(&de->phase_wait) == -1) -+ { -+ int err; -+ if ((err = errno) != EINTR) -+ return AVERROR(err); -+ } -+ } -+ -+ de->phase_no = phase_no; -+ return 0; -+} -+ -+static void -+post_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no) -+{ -+ dec_env_t * next_de = NULL; -+ phase_wait_env_t *const p = rpi->phase_reqs + phase_no; -+ dec_env_t ** q = &p->q; -+ -+ pthread_mutex_lock(&rpi->phase_lock); -+ -+ p->last_order = de->decode_order; -+ while (*q != NULL) { -+ dec_env_t * const t_de = *q; -+ -+ if (t_de->decode_order == p->last_order + 1) { -+ // This is us - remove from Q -+ *q = t_de->phase_wait_q_next; -+ t_de->phase_wait_q_next = NULL; // Tidy -+ next_de = t_de; -+ break; -+ } -+ q = &t_de->phase_wait_q_next; -+ } -+ -+ pthread_mutex_unlock(&rpi->phase_lock); -+ -+ if (next_de != NULL) -+ sem_post(&next_de->phase_wait); -+} -+ -+// Wait & signal stuff s.t. 
threads in other phases can continue -+static void -+abort_phases(RPI_T * const rpi, dec_env_t * const de) -+{ -+ for (int i = de->phase_no + 1; i < RPIVID_PHASE_NEW; ++i) { -+ wait_phase(rpi, de, i); -+ post_phase(rpi, de, i); -+ } -+ de->phase_no = RPIVID_PHASE_NEW; -+} -+ -+// Start timing for phase -+// Stats only - no actual effect -+static inline void tstart_phase(RPI_T * const rpi, const int phase_no) -+{ -+#if OPT_PHASE_TIMING -+ phase_wait_env_t *const p = rpi->phase_reqs + phase_no; -+ const int64_t now = tus64(); -+ if (p->phase_time != 0) -+ p->time_out_phase += now - p->phase_time; -+ p->phase_time = now; -+#endif -+} -+ -+#if OPT_PHASE_TIMING -+static unsigned int tavg_bin_phase(phase_wait_env_t *const p, const unsigned int avg_n) -+{ -+ uint64_t tsum = 0; -+ unsigned int i; -+ for (i = 0; i != avg_n; ++i) -+ tsum += p->time_stash[(p->i3 - i) & 15]; -+ for (i = 0; i != 9; ++i) { -+ if (time_thresholds[i] * 1000 * avg_n > tsum) -+ break; -+ } -+ return i; -+} -+#endif -+ -+// End timing for phase -+// Stats only - no actual effect -+static inline void tend_phase(RPI_T * const rpi, const int phase_no) -+{ -+#if OPT_PHASE_TIMING -+ phase_wait_env_t *const p = rpi->phase_reqs + phase_no; -+ const uint64_t now = tus64(); -+ const uint64_t in_time = now - p->phase_time; -+ -+ p->time_in_phase += in_time; -+ p->phase_time = now; -+ p->time_stash[p->i3] = in_time; -+ if (in_time > p->max_phase_time) { -+ p->max_phase_time = in_time; -+ p->max_time_decode_order = p->last_order; -+ } -+ ++p->time_bins[tavg_bin_phase(p, 1)]; -+ ++p->time_bins3[tavg_bin_phase(p, 3)]; -+ ++p->time_bins5[tavg_bin_phase(p, 5)]; -+ -+ p->i3 = (p->i3 + 1) & 15; -+#endif -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Start frame -+ -+static int rpi_hevc_start_frame( -+ AVCodecContext * avctx, -+ const uint8_t *buffer, -+ uint32_t size) { -+ -+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; -+ dec_env_t * const de = 
dec_env_get(avctx, rpi); -+ const HEVCContext * const s = avctx->priv_data; -+ const HEVCSPS * const sps = s->ps.sps; -+ const unsigned int CtbSizeY = 1U << sps->log2_ctb_size; -+ -+#if TRACE_ENTRY -+ printf("<<< %s[%p]\n", __func__, de); -+#endif -+ -+ if (de == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); -+ return -1; -+ } -+ -+ de->phase_no = RPIVID_PHASE_START; -+ de->decode_order = ++rpi->decode_order; // *** atomic? -+ -+ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame -+ -+ if (de->state != RPIVID_DECODE_NEW && de->state != RPIVID_DECODE_END) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state); -+ return -1; -+ } -+ de->state = RPIVID_DECODE_START; -+ -+ de->PicWidthInCtbsY = (sps->width + CtbSizeY - 1) / CtbSizeY; //7-15 -+ de->PicHeightInCtbsY = (sps->height + CtbSizeY - 1) / CtbSizeY; //7-17 -+ de->bit_len = 0; -+ de->cmd_len = 0; -+ -+#if TRACE_ENTRY -+ printf(">>> %s[%p]\n", __func__, de); -+#endif -+ -+ dec_env_release(rpi, de); -+ return 0; -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Slice messages -+ -+static void msg_slice(dec_env_t * const de, const uint16_t msg) { -+ de->slice_msgs[de->num_slice_msgs++] = msg; -+} -+ -+static void program_slicecmds(dec_env_t * const de, const int sliceid) { -+ int i; -+ p1_apb_write(de, RPI_SLICECMDS, de->num_slice_msgs+(sliceid<<8)); -+ for(i=0; i < de->num_slice_msgs; i++) { -+ p1_apb_write(de, 0x4000+4*i, de->slice_msgs[i] & 0xffff); -+ } -+} -+ -+static void pre_slice_decode(dec_env_t * const de, const HEVCContext * const s) { -+ const HEVCSPS * const sps = s->ps.sps; -+ const HEVCPPS * const pps = s->ps.pps; -+ const SliceHeader *sh = &s->sh; -+ -+ int weightedPredFlag, i, rIdx; -+ uint16_t cmd_slice; -+ unsigned int collocated_from_l0_flag; -+ -+ de->num_slice_msgs=0; -+ de->dpbno_col = 0; -+ cmd_slice = 0; -+ if 
(sh->slice_type==HEVC_SLICE_I) cmd_slice = 1; -+ if (sh->slice_type==HEVC_SLICE_P) cmd_slice = 2; -+ if (sh->slice_type==HEVC_SLICE_B) cmd_slice = 3; -+ -+ if (sh->slice_type!=HEVC_SLICE_I) { -+ cmd_slice += sh->nb_refs[L0]<<2; -+ cmd_slice += sh->nb_refs[L1]<<6; -+ } -+ -+ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) -+ cmd_slice |= sh->max_num_merge_cand<<11; -+ -+ collocated_from_l0_flag = -+ !sh->slice_temporal_mvp_enabled_flag ? -+ 0 : -+ sh->slice_type == HEVC_SLICE_B ? -+ (sh->collocated_list == L0) : -+ (sh->slice_type==HEVC_SLICE_P); -+ cmd_slice |= collocated_from_l0_flag<<14; -+ -+ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) { -+ -+ int NoBackwardPredFlag = 1; // Flag to say all reference pictures are from the past -+ for(i=L0; i<=L1; i++) { -+ for(rIdx=0; rIdx nb_refs[i]; rIdx++) { -+ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx]; -+ HEVCFrame *c = s->ref; // CurrentPicture -+ if (c->poc < f->poc) NoBackwardPredFlag = 0; -+ } -+ } -+ -+ if (sps->sps_temporal_mvp_enabled_flag) -+ { -+ const RefPicList *rpl = (sh->slice_type != HEVC_SLICE_B || collocated_from_l0_flag) ? -+ s->ref->refPicList + 0 : -+ s->ref->refPicList + 1; -+ de->dpbno_col = rpl->ref[sh->collocated_ref_idx] - s->DPB; -+ } -+ -+ cmd_slice += NoBackwardPredFlag<<10; -+ msg_slice(de, cmd_slice); -+ -+ // Write reference picture descriptions -+ weightedPredFlag = sh->slice_type==HEVC_SLICE_P? 
pps->weighted_pred_flag : pps->weighted_bipred_flag; -+ -+ for(i=L0; i<=L1; i++) -+ for(rIdx=0; rIdx nb_refs[i]; rIdx++) { -+ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx]; -+ HEVCFrame *c = s->ref; // CurrentPicture -+ int pic = f - s->DPB; -+ // Make sure pictures are in range 0 to 15 -+ int adjusted_pic = fref->refPicList[i].isLongTerm[rIdx]; -+ msg_slice(de, adjusted_pic+(lt<<4)+(weightedPredFlag<<5)+(weightedPredFlag<<6)); -+ msg_slice(de, f->poc); -+ if (weightedPredFlag) { -+ msg_slice(de, s->sh.luma_log2_weight_denom+(((i?s-> sh.luma_weight_l1: s->sh.luma_weight_l0)[rIdx] &0x1ff)<<3)); -+ msg_slice(de, (i?s-> sh.luma_offset_l1: s->sh.luma_offset_l0)[rIdx] & 0xff); -+ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][0]&0x1ff)<<3)); -+ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][0]& 0xff); -+ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][1]&0x1ff)<<3)); -+ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][1]& 0xff); -+ } -+ } -+ } -+ else -+ msg_slice(de, cmd_slice); -+ -+ msg_slice(de, ((sh->beta_offset/2)&15) -+ + (((sh->tc_offset/2)&15) << 4) -+ + (sh->disable_deblocking_filter_flag << 8) -+ + (sh->slice_loop_filter_across_slices_enabled_flag << 9) -+ + (pps->loop_filter_across_tiles_enabled_flag << 10)); // CMD_DEBLOCK -+ -+ msg_slice(de, ((sh->slice_cr_qp_offset&31)<<5) + (sh->slice_cb_qp_offset&31)); // CMD_QPOFF -+} -+ -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+static void rpi_hevc_abort_frame(AVCodecContext * const avctx) { -+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; -+ dec_env_t * const de = dec_env_get(avctx, rpi); -+ -+#if TRACE_ENTRY -+ printf("<<< %s[%p]\n", __func__, de); -+#endif -+ -+ if (de == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); -+ return; -+ } -+ -+ switch (de->state) 
{ -+ case RPIVID_DECODE_NEW: -+ case RPIVID_DECODE_END: -+ // Expected transition -+ break; -+ -+ case RPIVID_DECODE_SLICE: -+ // Error transition -+ av_log(avctx, AV_LOG_INFO, "Error in decode - aborting\n"); -+ break; -+ -+ case RPIVID_DECODE_START: -+ default: -+ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state); -+ break; -+ } -+ -+ abort_phases(rpi, de); -+ de->state = RPIVID_DECODE_NEW; -+ -+ dec_env_release(rpi, de); -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// End frame -+ -+static int rpi_hevc_end_frame(AVCodecContext * const avctx) { -+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; -+ const HEVCContext * const s = avctx->priv_data; -+ const HEVCPPS * const pps = s->ps.pps; -+ const HEVCSPS * const sps = s->ps.sps; -+ dec_env_t * const de = dec_env_get(avctx, rpi); -+ AVFrame * const f = s->ref->frame; -+ const unsigned int dpbno_cur = s->ref - s->DPB; -+ vid_vc_addr_t cmds_vc; -+ vid_vc_addr_t pu_base_vc; -+ unsigned int pu_stride; -+ vid_vc_addr_t coeff_base_vc; -+ unsigned int coeff_stride; -+ unsigned int i; -+ int rv = 0; -+ int status = 0; -+ int coeffbuf_sem_claimed = 0; -+ -+#if TRACE_ENTRY -+ fprintf("<<< %s[%p]\n", __func__, de); -+#endif -+ -+ if (de == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); -+ return AVERROR_BUG; // Should never happen -+ } -+ -+ if (de->state != RPIVID_DECODE_SLICE) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state); -+ rv = AVERROR_UNKNOWN; -+ goto fail; -+ } -+ de->state = RPIVID_DECODE_END; -+ -+ // End of command compilation -+ { -+ const unsigned int last_x = pps->col_bd[pps->num_tile_columns]-1; -+ const unsigned int last_y = pps->row_bd[pps->num_tile_rows]-1; -+ if (pps->entropy_coding_sync_enabled_flag) { -+ if (de->wpp_entry_x<2 && de->PicWidthInCtbsY>2) -+ wpp_pause(de, last_y); -+ } -+ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + 
(last_y<<18)); -+ } -+ -+ // Phase 0 --------------------------------------------------------------- -+ -+ wait_phase(rpi, de, 0); -+ rpi_sem_wait(&rpi->bitbuf_sem); -+ tstart_phase(rpi, 0); -+ -+ // Copy cmds & bits into gpu side buffer -+ // Layout: CMDS, BITS -+ { -+ uint8_t * const armbase = rpi->gbitbufs[rpi->bitbuf_no].arm; -+ vid_vc_addr_t vcbase = rpi->gbitbufs[rpi->bitbuf_no].vc; -+ unsigned int cmd_bytes = de->cmd_len * sizeof(struct RPI_CMD); -+ -+ uint8_t * p = armbase + rnd64(cmd_bytes); -+ uint8_t * const eobits = armbase + rpi->gbitbufs[rpi->bitbuf_no].numbytes; -+ -+ cmds_vc = vcbase; -+ -+ // Copy all the bits & update bitstream cmds to point at the right bits -+ for (i = 0; i < de->bit_len; ++i) -+ { -+ const unsigned int seg_len = de->bit_fifo[i].len; -+ -+ if (p + seg_len > eobits) { -+ status = -1; -+ break; -+ } -+ -+ memcpy(p, de->bit_fifo[i].ptr, seg_len); -+ de->cmd_fifo[de->bit_fifo[i].cmd].data = MANGLE64((p - armbase) + vcbase); -+ -+ p += rnd64(seg_len); -+ } -+ -+ memcpy(armbase, de->cmd_fifo, cmd_bytes); -+ } -+ -+ if (status == 0) -+ { -+ if (++rpi->bitbuf_no >= RPIVID_BITBUFS) -+ rpi->bitbuf_no = 0; -+ } -+ else -+ { -+ sem_post(&rpi->bitbuf_sem); -+ av_log(avctx, AV_LOG_ERROR, "Out of HEVC bit/cmd memory\n"); -+ rv = AVERROR_BUFFER_TOO_SMALL; -+ } -+ -+ tend_phase(rpi, 0); -+ post_phase(rpi, de, 0); -+ -+ if (status < 0) -+ goto fail; -+ -+ // Phase 1 --------------------------------------------------------------- -+ -+ wait_phase(rpi, de, 1); -+ rpi_sem_wait(&rpi->coeffbuf_sem); -+ coeffbuf_sem_claimed = 1; -+ tstart_phase(rpi, 1); -+ -+ status = 0; -+ for (;;) -+ { -+ // (Re-)allocate PU/COEFF stream space -+ const unsigned int total_size = rpi->gcoeffbufs[rpi->coeffbuf_no].numbytes; -+ unsigned int pu_size; -+ -+ pu_base_vc = rpi->gcoeffbufs[rpi->coeffbuf_no].vc; -+ pu_stride = rnd64(rpi->max_pu_msgs * 2 * de->PicWidthInCtbsY); -+ pu_size = pu_stride * de->PicHeightInCtbsY; -+ -+ if (pu_size >= total_size || status == -1) { -+ 
GPU_MEM_PTR_T newbuf; -+ -+ if (gpu_malloc_uncached(round_up_size(total_size + 1), &newbuf) != 0) -+ { -+ av_log(avctx, AV_LOG_ERROR, "Failed to reallocate coeffbuf\n"); -+ status = -1; -+ break; -+ } -+ gpu_free(rpi->gcoeffbufs + rpi->coeffbuf_no); -+ rpi->gcoeffbufs[rpi->coeffbuf_no] = newbuf; -+ status = 0; -+ continue; -+ } -+ -+ // Allocate all remaining space to coeff -+ coeff_base_vc = pu_base_vc + pu_size; -+ coeff_stride = ((total_size - pu_size) / de->PicHeightInCtbsY) & ~63; // Round down to multiple of 64 -+ -+ apb_write_vc_addr(rpi, RPI_PUWBASE, pu_base_vc); -+ apb_write_vc_len(rpi, RPI_PUWSTRIDE, pu_stride); -+ apb_write_vc_addr(rpi, RPI_COEFFWBASE, coeff_base_vc); -+ apb_write_vc_len(rpi, RPI_COEFFWSTRIDE, coeff_stride); -+ -+ // Trigger command FIFO -+ apb_write(rpi, RPI_CFNUM, de->cmd_len); -+#if TRACE_DEV && 0 -+ apb_dump_regs(rpi, 0x0, 32); -+ apb_dump_regs(rpi, 0x8000, 24); -+ axi_dump(de, ((uint64_t)a64)<<6, de->cmd_len * sizeof(struct RPI_CMD)); -+#endif -+ apb_write_vc_addr(rpi, RPI_CFBASE, cmds_vc); -+ -+ int_wait(rpi, 1); -+ -+ status = check_status(rpi, de); -+ -+ if (status == -1) -+ continue; -+ else if (status != 1) -+ break; -+ -+ // Status 1 means out of PU space so try again with more -+ // If we ran out of Coeff space then we are out of memory - we could possibly realloc? 
-+ rpi->max_pu_msgs += rpi->max_pu_msgs / 2; -+ } -+ -+ // Inc inside the phase 1 lock, but only inc if we succeeded otherwise we -+ // may reuse a live buffer when we kick the coeff sem -+ if (status == 0) -+ { -+ if (++rpi->coeffbuf_no >= RPIVID_COEFFBUFS) -+ rpi->coeffbuf_no = 0; -+ } -+ else -+ { -+ if (status == -1) -+ { -+ av_log(avctx, AV_LOG_ERROR, "Out of pu + coeff intermediate memory: pus=%d\n", rpi->max_pu_msgs); -+ rv = AVERROR_BUFFER_TOO_SMALL; -+ } -+ else -+ { -+ av_log(avctx, AV_LOG_WARNING, "Phase 1 decode error\n"); -+ rv = AVERROR_INVALIDDATA; -+ } -+ } -+ -+ tend_phase(rpi, 1); -+ sem_post(&rpi->bitbuf_sem); -+ post_phase(rpi, de, 1); -+ -+ if (status != 0) -+ goto fail; -+ -+ // Phase 2 --------------------------------------------------------------- -+ -+ wait_phase(rpi, de, 2); -+ -+ if ((rv = av_rpi_zc_resolve_frame(f, ZC_RESOLVE_ALLOC)) != 0) -+ { -+ // As we are in phase 2 already here we don't need to worry about -+ // ceoffbuf_no despite the early exit -+ post_phase(rpi, de, 2); -+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate output frame\n"); -+ goto fail; -+ } -+ -+ tstart_phase(rpi, 2); -+ -+ apb_write_vc_addr(rpi, RPI_PURBASE, pu_base_vc); -+ apb_write_vc_len(rpi, RPI_PURSTRIDE, pu_stride); -+ apb_write_vc_addr(rpi, RPI_COEFFRBASE, coeff_base_vc); -+ apb_write_vc_len(rpi, RPI_COEFFRSTRIDE, coeff_stride); -+ -+ apb_write_vc_addr(rpi, RPI_OUTYBASE, get_vc_address_y(f)); -+ apb_write_vc_addr(rpi, RPI_OUTCBASE, get_vc_address_u(f)); -+ apb_write_vc_len(rpi, RPI_OUTYSTRIDE, f->linesize[3] * 128); -+ apb_write_vc_len(rpi, RPI_OUTCSTRIDE, f->linesize[3] * 128); -+ -+ // Keep the last thing we resolved as fallback for any ref we fail to -+ // resolve. As a final fallback use our current frame. The pels might -+ // not be there yet but at least the memory is valid. 
-+ // -+ // Attempt to resolve the entire DPB - we could note what we have used -+ // in ref lists but probably simpler and more reliable to set the whole thing -+ { -+ AVFrame * fallback_frame = f; -+ for (i = 0; i != 16; ++i) { -+ // Avoid current frame -+ const HEVCFrame * hevc_fr = (s->DPB + i >= s->ref) ? s->DPB + i + 1 : s->DPB + i; -+ AVFrame * fr = hevc_fr->frame; -+ -+ if (fr != NULL && -+ av_rpi_zc_resolve_frame(fr, ZC_RESOLVE_FAIL) == 0) -+ { -+ fallback_frame = fr; -+ } -+ else -+ { -+ fr = fallback_frame; -+ } -+ -+ apb_write_vc_addr(rpi, 0x9000+16*i, get_vc_address_y(fr)); -+ apb_write(rpi, 0x9004+16*i, 0); -+ apb_write_vc_addr(rpi, 0x9008+16*i, get_vc_address_u(fr)); -+ apb_write(rpi, 0x900C+16*i, 0); -+ } -+ } -+ -+ apb_write(rpi, RPI_CONFIG2, -+ (sps->bit_depth << 0) // BitDepthY -+ + (sps->bit_depth << 4) // BitDepthC -+ + ((sps->bit_depth>8) << 8) // BitDepthY -+ + ((sps->bit_depth>8) << 9) // BitDepthC -+ + (sps->log2_ctb_size <<10) -+ + (pps->constrained_intra_pred_flag <<13) -+ + (sps->sps_strong_intra_smoothing_enable_flag<<14) -+ + (sps->sps_temporal_mvp_enabled_flag <<15) -+ + (pps->log2_parallel_merge_level <<16) -+ + (s->sh.slice_temporal_mvp_enabled_flag <<19) -+ + (sps->pcm.loop_filter_disable_flag <<20) -+ + ((pps->cb_qp_offset&31) <<21) -+ + ((pps->cr_qp_offset&31) <<26)); -+ -+ apb_write(rpi, RPI_FRAMESIZE, (sps->height<<16) + sps->width); -+ apb_write(rpi, RPI_CURRPOC, s->poc); -+ -+ // collocated reads/writes -+ if (sps->sps_temporal_mvp_enabled_flag) { -+ av_assert0(de->dpbno_col < RPIVID_COL_PICS); -+ av_assert0(dpbno_cur < RPIVID_COL_PICS); -+ -+ apb_write_vc_len(rpi, RPI_COLSTRIDE, rpi->col_stride); -+ apb_write_vc_len(rpi, RPI_MVSTRIDE, rpi->col_stride); -+ apb_write_vc_addr(rpi, RPI_MVBASE, rpi->gcolbuf.vc + dpbno_cur * rpi->col_picsize); -+ apb_write_vc_addr(rpi, RPI_COLBASE, rpi->gcolbuf.vc + de->dpbno_col * rpi->col_picsize); -+ } -+ -+#if TRACE_DEV && 0 -+ apb_dump_regs(rpi, 0x0, 32); -+ apb_dump_regs(rpi, 0x8000, 24); 
-+#endif -+ -+ apb_write(rpi, RPI_NUMROWS, de->PicHeightInCtbsY); -+ apb_read(rpi, RPI_NUMROWS); // Read back to confirm write has reached block -+ -+ int_wait(rpi, 2); -+ -+ tend_phase(rpi, 2); -+ coeffbuf_sem_claimed = 0; -+ sem_post(&rpi->coeffbuf_sem); -+ // Set valid here to avoid race in resolving in any pending phase 2 -+ av_rpi_zc_set_valid_frame(f); -+ -+ post_phase(rpi, de, 2); -+ -+ // Flush frame for CPU access -+ // Arguably the best place would be at the start of phase 2 but here -+ // will overlap with the wait -+ // -+ // * Even better would be to have better lock/unlock control in ZC for external access -+ if (rpi->gpu_init_type == GPU_INIT_GPU) // * CMA is currently always uncached -+ { -+ rpi_cache_buf_t cbuf; -+ rpi_cache_flush_env_t * const fe = rpi_cache_flush_init(&cbuf); -+ rpi_cache_flush_add_frame(fe, f, RPI_CACHE_FLUSH_MODE_INVALIDATE); -+ rpi_cache_flush_finish(fe); -+ } -+ -+#if TRACE_ENTRY -+ printf(">>> %s[%p] OK\n", __func__, de); -+#endif -+ -+ dec_env_release(rpi, de); -+ return 0; -+ -+fail: -+ av_rpi_zc_set_broken_frame(f); -+ if (coeffbuf_sem_claimed) -+ sem_post(&rpi->coeffbuf_sem); -+ abort_phases(rpi, de); // Dummy any unresolved phases -+ -+#if TRACE_ENTRY -+ printf(">>> %s[%p] FAIL\n", __func__, de); -+#endif -+ -+ dec_env_release(rpi, de); -+ return rv; -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+ -+#if TRACE_DEV -+static void dump_data(const uint8_t * p, size_t len) -+{ -+ size_t i; -+ for (i = 0; i < len; i += 16) { -+ size_t j; -+ printf("%04x", i); -+ for (j = 0; j != 16; ++j) { -+ printf("%c%02x", i == 8 ? 
'-' : ' ', p[i+j]); -+ } -+ printf("\n"); -+ } -+} -+#endif -+ -+#if OPT_EMU -+static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx) -+{ -+ unsigned int z = 0; -+ while (idx--) { -+ if (*b++ == 0) { -+ ++z; -+ if (z >= 2 && *b == 3) { -+ ++b; -+ z = 0; -+ } -+ } -+ else { -+ z = 0; -+ } -+ } -+ return b; -+} -+#endif -+ -+static void WriteBitstream(dec_env_t * const de, const HEVCContext * const s) { -+ const int rpi_use_emu = OPT_EMU; // FFmpeg removes emulation prevention bytes -+ const int offset = 0; // Always 64-byte aligned in sim, need not be on real hardware -+ const GetBitContext *gb = &s->HEVClc->gb; -+ -+#if OPT_EMU -+ const uint8_t *ptr = ptr_from_index(de->nal_buffer, gb->index/8 + 1); -+ const int len = de->nal_size - (ptr - de->nal_buffer); -+#else -+ const int len = 1 + gb->size_in_bits/8 - gb->index/8; -+ const void *ptr = &gb->buffer[gb->index/8]; -+#endif -+ -+#if TRACE_DEV -+ printf("Index=%d, /8=%#x\n", gb->index, gb->index/8); -+ dump_data(de->nal_buffer, 128); -+#endif -+ -+ p1_axi_write(de, len, ptr, p1_apb_write(de, RPI_BFBASE, 0)); // BFBASE set later -+ p1_apb_write(de, RPI_BFNUM, len); -+ p1_apb_write(de, RPI_BFCONTROL, offset + (1<<7)); // Stop -+ p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu<<6)); -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Wavefront mode -+ -+static void wpp_decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) -+{ -+ const HEVCPPS * const pps = s->ps.pps; -+ -+ int i, resetQPY=1; -+ int indep = !s->sh.dependent_slice_segment_flag; -+ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY; -+ -+ if (ctb_addr_ts) -+ wpp_end_previous_slice(de, s, ctb_addr_ts); -+ pre_slice_decode(de, s); -+ WriteBitstream(de, s); -+ if (ctb_addr_ts==0 || indep || de->PicWidthInCtbsY==1) -+ WriteProb(de, s); -+ else if (ctb_col==0) -+ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); -+ else -+ resetQPY=0; -+ program_slicecmds(de, 
s->slice_idx); -+ new_slice_segment(de, s); -+ wpp_entry_point(de, s, indep, resetQPY, ctb_addr_ts); -+ for (i=0; ish.num_entry_point_offsets; i++) { -+ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY; -+ int last_x = de->PicWidthInCtbsY-1; -+ if (de->PicWidthInCtbsY>2) -+ wpp_pause(de, ctb_row); -+ p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + (last_x<<5) + 2); -+ if (de->PicWidthInCtbsY==2) -+ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); -+ if (de->PicWidthInCtbsY==1) -+ WriteProb(de, s); -+ else -+ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); -+ ctb_addr_ts += pps->column_width[0]; -+ wpp_entry_point(de, s, 0, 1, ctb_addr_ts); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Tiles mode -+ -+static void decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) { -+ const HEVCPPS * const pps = s->ps.pps; -+ int i, resetQPY; -+ -+ if (ctb_addr_ts) end_previous_slice(de, s, ctb_addr_ts); -+ pre_slice_decode(de, s); -+ WriteBitstream(de, s); -+ resetQPY = ctb_addr_ts==0 -+ || pps->tile_id[ctb_addr_ts]!=pps->tile_id[ctb_addr_ts-1] -+ || !s->sh.dependent_slice_segment_flag; -+ if (resetQPY) WriteProb(de, s); -+ program_slicecmds(de, s->slice_idx); -+ new_slice_segment(de, s); -+ new_entry_point(de, s, !s->sh.dependent_slice_segment_flag, resetQPY, ctb_addr_ts); -+ for (i=0; ish.num_entry_point_offsets; i++) { -+ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ int ctb_col = ctb_addr_rs % de->PicWidthInCtbsY; -+ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY; -+ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns); -+ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows); -+ int last_x = pps->col_bd[tile_x+1]-1; -+ int last_y = pps->row_bd[tile_y+1]-1; -+ p1_apb_write(de, RPI_STATUS, 2 + (last_x<<5) + (last_y<<18)); -+ WriteProb(de, s); -+ ctb_addr_ts += pps->column_width[tile_x] * 
pps->row_height[tile_y]; -+ new_entry_point(de, s, 0, 1, ctb_addr_ts); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+static int cabac_start_align(HEVCContext *s) -+{ -+ GetBitContext *gb = &s->HEVClc->gb; -+ skip_bits(gb, 1); -+ align_get_bits(gb); -+ // Should look at getting rid of this -+ return ff_init_cabac_decoder(&s->HEVClc->cc, -+ gb->buffer + get_bits_count(gb) / 8, -+ (get_bits_left(gb) + 7) / 8); -+} -+ -+static int rpi_hevc_decode_slice( -+ AVCodecContext *avctx, -+ const uint8_t *buffer, -+ uint32_t size) -+{ -+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; -+ HEVCContext * const s = avctx->priv_data; -+ dec_env_t * const de = dec_env_get(avctx, rpi); -+ const HEVCPPS *pps = s->ps.pps; -+ int ctb_addr_ts = pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; -+ -+#if TRACE_ENTRY -+ printf("<<< %s[%p]\n", __func__, de); -+#endif -+ if (de == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); -+ return -1; -+ } -+ -+ if (de->state != RPIVID_DECODE_START && de->state != RPIVID_DECODE_SLICE) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state); -+ return -1; -+ } -+ de->state = RPIVID_DECODE_SLICE; -+ -+ de->nal_buffer = buffer; -+ de->nal_size = size; -+ -+#if !OPT_EMU -+// ff_hevc_cabac_init(s, ctb_addr_ts); -+ cabac_start_align(s); -+#endif -+ if (s->ps.sps->scaling_list_enable_flag) -+ populate_scaling_factors(de, s); -+ pps->entropy_coding_sync_enabled_flag? 
wpp_decode_slice(de, s, ctb_addr_ts) -+ : decode_slice(de, s, ctb_addr_ts); -+#if TRACE_ENTRY -+ printf(">>> %s[%p]\n", __func__, de); -+#endif -+ dec_env_release(rpi, de); -+ return 0; -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+static int rpivid_retrieve_data(void *logctx, AVFrame *frame) -+{ -+ int rv; -+ if ((rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_WAIT_VALID)) != 0) -+ av_log(logctx, AV_LOG_ERROR, "Unable to resolve output frame\n"); -+ return rv; -+} -+ -+static int rpivid_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) -+{ -+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; -+ HEVCContext * const s = avctx->priv_data; -+ // Frame buffering + 1 output. Would need thread_count extra but we now -+ // alloc at the start of phase 2 so that is the only thread we need the -+ // extra buffer for. -+ const unsigned int pool_req = s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering + 1; -+ int rv; -+ -+ if (av_rpi_zc_in_use(avctx)) -+ { -+ const AVZcEnvPtr zc = avctx->opaque; -+ av_rpi_zc_set_decoder_pool_size(zc, pool_req); -+ rv = av_rpi_zc_get_buffer(zc, frame); // get_buffer2 would alloc -+ } -+ else -+ { -+ if (rpi->zc == NULL) { -+ pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this -+ // Alloc inside lock to make sure we only ever alloc one -+ if (rpi->zc == NULL) { -+ rpi->zc = av_rpi_zc_int_env_alloc(s); -+ } -+ pthread_mutex_unlock(&rpi->phase_lock); -+ } -+ av_rpi_zc_set_decoder_pool_size(rpi->zc, pool_req); // Ignored by local allocator, but set anyway :-) -+ rv = (rpi->zc == NULL) ? 
AVERROR(ENOMEM) : -+ av_rpi_zc_get_buffer(rpi->zc, frame); -+ } -+ -+ if (rv == 0 && -+ (rv = ff_attach_decode_data(frame)) < 0) -+ { -+ av_frame_unref(frame); -+ } -+ -+ if (rv == 0) -+ { -+ FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data; -+ fdd->post_process = rpivid_retrieve_data; -+ } -+ -+ return rv; -+} -+ -+#if OPT_PHASE_TIMING -+static void log_bin_phase(AVCodecContext * const avctx, const unsigned int * const bins) -+{ -+ av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d %7d\n", -+ bins[0], bins[1], bins[2], bins[3], -+ bins[4], bins[5], bins[6], bins[7], bins[8]); -+} -+#endif -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+static int rpi_hevc_free(AVCodecContext *avctx) { -+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; -+ -+#if TRACE_ENTRY -+ printf("<<< %s\n", __func__); -+#endif -+ -+ dec_env_release(rpi, NULL); -+ -+ // Wait for everything else to stop -+ { -+ struct timespec tt; -+ clock_gettime(CLOCK_REALTIME, &tt); -+ tt.tv_sec += 2; -+ while (sem_timedwait(&rpi->ref_zero, &tt) == -1) { -+ const int err = errno; -+ if (err == ETIMEDOUT) { -+ av_log(avctx, AV_LOG_FATAL, "Rpivid worker threads still running\n"); -+ return -1; -+ } -+ if (err != EINTR) { -+ av_log(avctx, AV_LOG_ERROR, "Unexpected error %d waiting for work thread to stop\n", err); -+ break; -+ } -+ } -+ } -+ -+#if OPT_PHASE_TIMING -+ { -+ unsigned int i; -+ for (i = 0; i != RPIVID_PHASES; ++i) { -+ const phase_wait_env_t * const p = rpi->phase_reqs + i; -+ av_log(avctx, AV_LOG_INFO, "Phase %u: In %3u.%06u, Out %3u.%06u\n", i, -+ (unsigned int)(p->time_in_phase / 1000000), (unsigned int)(p->time_in_phase % 1000000), -+ (unsigned int)(p->time_out_phase / 1000000), (unsigned int)(p->time_out_phase % 1000000)); -+ av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d >\n", -+ time_thresholds[0], time_thresholds[1], time_thresholds[2], time_thresholds[3], -+ time_thresholds[4], time_thresholds[5], 
time_thresholds[6], time_thresholds[7]); -+ log_bin_phase(avctx, p->time_bins); -+ log_bin_phase(avctx, p->time_bins3); -+ log_bin_phase(avctx, p->time_bins5); -+ av_log(avctx, AV_LOG_INFO, "Longest duraction: %ums @ frame %u\n", -+ (unsigned int)(p->max_phase_time / 1000), -+ p->max_time_decode_order); -+ } -+ av_log(avctx, AV_LOG_INFO, "PU max=%d\n", rpi->max_pu_msgs); -+ } -+#endif -+ -+ if (rpi->dec_envs != NULL) -+ { -+ for (int i; i < avctx->thread_count && rpi->dec_envs[i] != NULL; ++i) { -+ dec_env_delete(rpi->dec_envs[i]); -+ } -+ av_freep(&rpi->dec_envs); -+ } -+ -+ av_rpi_zc_int_env_freep(&rpi->zc); -+ -+ gpu_free(&rpi->gcolbuf); -+ -+ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) { -+ gpu_free(rpi->gbitbufs + i); -+ } -+ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) { -+ gpu_free(rpi->gcoeffbufs + i); -+ } -+ -+ unmap_devp(&rpi->regs, REGS_SIZE); -+ unmap_devp(&rpi->ints, INTS_SIZE); -+ -+ if (rpi->gpu_init_type > 0) -+ rpi_mem_gpu_uninit(); -+ -+ if (rpi->mbox_fd >= 0) { -+ mbox_release_clock(rpi->mbox_fd); -+ mbox_close(rpi->mbox_fd); -+ } -+ -+ sem_destroy(&rpi->ref_zero); -+ sem_destroy(&rpi->coeffbuf_sem); -+ sem_destroy(&rpi->bitbuf_sem); -+ -+#if TRACE_ENTRY -+ printf(">>> %s\n", __func__); -+#endif -+ return 0; -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+static int rpi_hevc_init(AVCodecContext *avctx) { -+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; -+// const char *err; -+ -+#if TRACE_ENTRY -+ printf("<<< %s\n", __func__); -+#endif -+ -+ if (avctx->width>4096 || avctx->height>4096) { -+ av_log(NULL, AV_LOG_FATAL, "Picture size %dx%d exceeds 4096x4096 maximum for HWAccel\n", avctx->width, avctx->height); -+ return AVERROR(ENOTSUP); -+ } -+ -+ memset(rpi, 0, sizeof(*rpi)); -+ -+ rpi->mbox_fd = -1; -+ rpi->decode_order = 0; -+ -+ // Initial PU/COEFF stream buffer split chosen as worst case seen so far -+ rpi->max_pu_msgs = 768; // 7.2 says at most 1611 messages per CTU -+ -+ 
-+ atomic_store(&rpi->ref_count, 1); -+ sem_init(&rpi->ref_zero, 0, 0); -+ -+ sem_init(&rpi->bitbuf_sem, 0, RPIVID_BITBUFS); -+ sem_init(&rpi->coeffbuf_sem, 0, RPIVID_COEFFBUFS); -+ -+ pthread_mutex_init(&rpi->phase_lock, NULL); -+ -+ if ((rpi->mbox_fd = mbox_open()) < 0) -+ { -+ av_log(avctx, AV_LOG_ERROR, "Failed to open mailbox\n"); -+ goto fail; -+ } -+ mbox_request_clock(rpi->mbox_fd); -+ -+ if ((rpi->regs = map_dev(avctx, REGS_NAME, REGS_SIZE)) == NULL || -+ (rpi->ints = map_dev(avctx, INTS_NAME, INTS_SIZE)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to open rpivid devices\n"); -+ goto fail; -+ } -+ -+ if ((rpi->gpu_init_type = rpi_mem_gpu_init(0)) < 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to init GPU\n"); -+ goto fail; -+ } -+ -+ if ((rpi->dec_envs = av_mallocz(sizeof(dec_env_t *) * avctx->thread_count)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d dec envs\n", avctx->thread_count); -+ goto fail; -+ } -+ -+ rpi->col_stride = rnd64(avctx->width); -+ rpi->col_picsize = rpi->col_stride * (((avctx->height + 63) & ~63) >> 4); -+ if (gpu_malloc_uncached(rpi->col_picsize * RPIVID_COL_PICS, &rpi->gcolbuf) != 0) -+ { -+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate col mv buffer\n"); -+ goto fail; -+ } -+ -+ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) { -+ if (gpu_malloc_uncached(RPIVID_BITBUF_SIZE, rpi->gbitbufs + i) != 0) -+ { -+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate bitbuf %d\n", i); -+ goto fail; -+ } -+ } -+ -+ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) { -+ if (gpu_malloc_uncached(RPIVID_COEFFBUF_SIZE, rpi->gcoeffbufs + i) != 0) -+ { -+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate coeffbuf %d\n", i); -+ goto fail; -+ } -+ } -+ -+ av_log(avctx, AV_LOG_INFO, "RPI HEVC h/w accel init OK\n"); -+ -+ return 0; -+ -+fail: -+ rpi_hevc_free(avctx); -+ return AVERROR_EXTERNAL; -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+const AVHWAccel ff_hevc_rpi4_8_hwaccel = { 
-+ .name = "hevc_rpi4_8", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_HEVC, -+ .pix_fmt = AV_PIX_FMT_RPI4_8, -+ .alloc_frame = rpivid_hevc_alloc_frame, -+ .start_frame = rpi_hevc_start_frame, -+ .end_frame = rpi_hevc_end_frame, -+ .abort_frame = rpi_hevc_abort_frame, -+ .decode_slice = rpi_hevc_decode_slice, -+ .init = rpi_hevc_init, -+ .uninit = rpi_hevc_free, -+ .priv_data_size = sizeof(RPI_T), -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, -+}; -+ -+const AVHWAccel ff_hevc_rpi4_10_hwaccel = { -+ .name = "hevc_rpi4_10", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_HEVC, -+ .pix_fmt = AV_PIX_FMT_RPI4_10, -+ .alloc_frame = rpivid_hevc_alloc_frame, -+ .start_frame = rpi_hevc_start_frame, -+ .end_frame = rpi_hevc_end_frame, -+ .abort_frame = rpi_hevc_abort_frame, -+ .decode_slice = rpi_hevc_decode_slice, -+ .init = rpi_hevc_init, -+ .uninit = rpi_hevc_free, -+ .priv_data_size = sizeof(RPI_T), -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, -+}; -+ -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 4b2679eb38..8d80d19788 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -21,6 +21,7 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -+#include - #include - #include - #include -@@ -29,57 +30,88 @@ - #include - #include "libavcodec/avcodec.h" - #include "libavcodec/internal.h" -+#include "libavutil/avassert.h" - #include "libavutil/pixdesc.h" -+#include "libavutil/hwcontext.h" - #include "v4l2_context.h" - #include "v4l2_buffers.h" - #include "v4l2_m2m.h" -+#include "v4l2_req_dmabufs.h" -+#include "weak_link.h" - - #define USEC_PER_SEC 1000000 --static AVRational v4l2_timebase = { 1, USEC_PER_SEC }; -+static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; - --static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf) -+static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) - { -- return 
V4L2_TYPE_IS_OUTPUT(buf->context->type) ? -- container_of(buf->context, V4L2m2mContext, output) : -- container_of(buf->context, V4L2m2mContext, capture); -+ return V4L2_TYPE_IS_OUTPUT(ctx->type) ? -+ container_of(ctx, V4L2m2mContext, output) : -+ container_of(ctx, V4L2m2mContext, capture); - } - --static inline AVCodecContext *logger(V4L2Buffer *buf) -+static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf) - { -- return buf_to_m2mctx(buf)->avctx; -+ return ctx_to_m2mctx(buf->context); - } - --static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) -+static inline AVCodecContext *logger(const V4L2Buffer * const buf) - { -- V4L2m2mContext *s = buf_to_m2mctx(avbuf); -+ return buf_to_m2mctx(buf)->avctx; -+} - -- if (s->avctx->pkt_timebase.num) -- return s->avctx->pkt_timebase; -- return s->avctx->time_base; -+static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf) -+{ -+ const V4L2m2mContext *s = buf_to_m2mctx(avbuf); -+ const AVRational tb = s->avctx->pkt_timebase.num ? -+ s->avctx->pkt_timebase : -+ s->avctx->time_base; -+ return tb.num && tb.den ? tb : v4l2_timebase; - } - --static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts) -+static inline struct timeval tv_from_int(const int64_t t) - { -- int64_t v4l2_pts; -+ return (struct timeval){ -+ .tv_usec = t % USEC_PER_SEC, -+ .tv_sec = t / USEC_PER_SEC -+ }; -+} - -- if (pts == AV_NOPTS_VALUE) -- pts = 0; -+static inline int64_t int_from_tv(const struct timeval t) -+{ -+ return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec; -+} - -+static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts) -+{ - /* convert pts to v4l2 timebase */ -- v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); -- out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; -- out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; -+ const int64_t v4l2_pts = -+ pts == AV_NOPTS_VALUE ? 
0 : -+ av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); -+ out->buf.timestamp = tv_from_int(v4l2_pts); - } - --static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf) -+static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf) - { -- int64_t v4l2_pts; -- -+ const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp); -+ return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE; -+#if 0 - /* convert pts back to encoder timebase */ -- v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + -- avbuf->buf.timestamp.tv_usec; -+ return -+ avbuf->context->no_pts_rescale ? v4l2_pts : -+ v4l2_pts == 0 ? AV_NOPTS_VALUE : -+ av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); -+#endif -+} - -- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); -+static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) -+{ -+ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { -+ out->planes[plane].bytesused = bytesused; -+ out->planes[plane].length = length; -+ } else { -+ out->buf.bytesused = bytesused; -+ out->buf.length = length; -+ } - } - - static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) -@@ -116,6 +148,105 @@ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) - return AVCOL_PRI_UNSPECIFIED; - } - -+static void v4l2_set_color(V4L2Buffer *buf, -+ const enum AVColorPrimaries avcp, -+ const enum AVColorSpace avcs, -+ const enum AVColorTransferCharacteristic avxc) -+{ -+ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT; -+ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT; -+ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT; -+ -+ switch (avcp) { -+ case AVCOL_PRI_BT709: -+ cs = V4L2_COLORSPACE_REC709; -+ ycbcr = V4L2_YCBCR_ENC_709; -+ break; -+ case AVCOL_PRI_BT470M: -+ cs = V4L2_COLORSPACE_470_SYSTEM_M; -+ ycbcr = V4L2_YCBCR_ENC_601; -+ break; -+ case AVCOL_PRI_BT470BG: -+ cs = V4L2_COLORSPACE_470_SYSTEM_BG; -+ break; -+ case AVCOL_PRI_SMPTE170M: -+ cs 
= V4L2_COLORSPACE_SMPTE170M; -+ break; -+ case AVCOL_PRI_SMPTE240M: -+ cs = V4L2_COLORSPACE_SMPTE240M; -+ break; -+ case AVCOL_PRI_BT2020: -+ cs = V4L2_COLORSPACE_BT2020; -+ break; -+ case AVCOL_PRI_SMPTE428: -+ case AVCOL_PRI_SMPTE431: -+ case AVCOL_PRI_SMPTE432: -+ case AVCOL_PRI_EBU3213: -+ case AVCOL_PRI_RESERVED: -+ case AVCOL_PRI_FILM: -+ case AVCOL_PRI_UNSPECIFIED: -+ default: -+ break; -+ } -+ -+ switch (avcs) { -+ case AVCOL_SPC_RGB: -+ cs = V4L2_COLORSPACE_SRGB; -+ break; -+ case AVCOL_SPC_BT709: -+ cs = V4L2_COLORSPACE_REC709; -+ break; -+ case AVCOL_SPC_FCC: -+ cs = V4L2_COLORSPACE_470_SYSTEM_M; -+ break; -+ case AVCOL_SPC_BT470BG: -+ cs = V4L2_COLORSPACE_470_SYSTEM_BG; -+ break; -+ case AVCOL_SPC_SMPTE170M: -+ cs = V4L2_COLORSPACE_SMPTE170M; -+ break; -+ case AVCOL_SPC_SMPTE240M: -+ cs = V4L2_COLORSPACE_SMPTE240M; -+ break; -+ case AVCOL_SPC_BT2020_CL: -+ cs = V4L2_COLORSPACE_BT2020; -+ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM; -+ break; -+ case AVCOL_SPC_BT2020_NCL: -+ cs = V4L2_COLORSPACE_BT2020; -+ break; -+ default: -+ break; -+ } -+ -+ switch (xfer) { -+ case AVCOL_TRC_BT709: -+ xfer = V4L2_XFER_FUNC_709; -+ break; -+ case AVCOL_TRC_IEC61966_2_1: -+ xfer = V4L2_XFER_FUNC_SRGB; -+ break; -+ case AVCOL_TRC_SMPTE240M: -+ xfer = V4L2_XFER_FUNC_SMPTE240M; -+ break; -+ case AVCOL_TRC_SMPTE2084: -+ xfer = V4L2_XFER_FUNC_SMPTE2084; -+ break; -+ default: -+ break; -+ } -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { -+ buf->context->format.fmt.pix_mp.colorspace = cs; -+ buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr; -+ buf->context->format.fmt.pix_mp.xfer_func = xfer; -+ } else { -+ buf->context->format.fmt.pix.colorspace = cs; -+ buf->context->format.fmt.pix.ycbcr_enc = ycbcr; -+ buf->context->format.fmt.pix.xfer_func = xfer; -+ } -+} -+ - static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) - { - enum v4l2_quantization qt; -@@ -134,6 +265,20 @@ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) - return 
AVCOL_RANGE_UNSPECIFIED; - } - -+static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr) -+{ -+ const enum v4l2_quantization q = -+ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE : -+ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE : -+ V4L2_QUANTIZATION_DEFAULT; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { -+ buf->context->format.fmt.pix_mp.quantization = q; -+ } else { -+ buf->context->format.fmt.pix.quantization = q; -+ } -+} -+ - static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf) - { - enum v4l2_ycbcr_encoding ycbcr; -@@ -210,73 +355,178 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) - return AVCOL_TRC_UNSPECIFIED; - } - --static void v4l2_free_buffer(void *opaque, uint8_t *unused) -+static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf) - { -- V4L2Buffer* avbuf = opaque; -- V4L2m2mContext *s = buf_to_m2mctx(avbuf); -+ return V4L2_FIELD_IS_INTERLACED(buf->buf.field); -+} - -- if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) { -- atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel); -+static int v4l2_buf_is_top_first(const V4L2Buffer * const buf) -+{ -+ return buf->buf.field == V4L2_FIELD_INTERLACED_TB; -+} - -- if (s->reinit) { -- if (!atomic_load(&s->refcount)) -- sem_post(&s->refsync); -- } else { -- if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) { -- /* no need to queue more buffers to the driver */ -- avbuf->status = V4L2BUF_AVAILABLE; -- } -- else if (avbuf->context->streamon) -- ff_v4l2_buffer_enqueue(avbuf); -- } -+static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff) -+{ -+ buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE : -+ is_tff ? 
V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT; -+} - -- av_buffer_unref(&avbuf->context_ref); -+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) -+{ -+ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; -+ AVDRMLayerDescriptor *layer; -+ -+ /* fill the DRM frame descriptor */ -+ drm_desc->nb_objects = avbuf->num_planes; -+ drm_desc->nb_layers = 1; -+ -+ layer = &drm_desc->layers[0]; -+ layer->nb_planes = avbuf->num_planes; -+ -+ for (int i = 0; i < avbuf->num_planes; i++) { -+ layer->planes[i].object_index = i; -+ layer->planes[i].offset = avbuf->plane_info[i].offset; -+ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; - } -+ -+ switch (avbuf->context->av_pix_fmt) { -+ case AV_PIX_FMT_YUYV422: -+ -+ layer->format = DRM_FORMAT_YUYV; -+ layer->nb_planes = 1; -+ -+ break; -+ -+ case AV_PIX_FMT_NV12: -+ case AV_PIX_FMT_NV21: -+ -+ layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ? -+ DRM_FORMAT_NV12 : DRM_FORMAT_NV21; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 2; -+ -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height; -+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; -+ break; -+ -+ case AV_PIX_FMT_YUV420P: -+ -+ layer->format = DRM_FORMAT_YUV420; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 3; -+ -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height; -+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; -+ -+ layer->planes[2].object_index = 0; -+ layer->planes[2].offset = layer->planes[1].offset + -+ ((avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height) >> 2); -+ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; -+ break; -+ -+ default: -+ drm_desc->nb_layers = 0; -+ break; -+ } -+ -+ return (uint8_t *) drm_desc; - } - --static int 
v4l2_buf_increase_ref(V4L2Buffer *in) -+static void v4l2_free_bufref(void *opaque, uint8_t *data) - { -- V4L2m2mContext *s = buf_to_m2mctx(in); -+ AVBufferRef * bufref = (AVBufferRef *)data; -+ V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data; -+ struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl); - -- if (in->context_ref) -- atomic_fetch_add(&in->context_refcount, 1); -- else { -- in->context_ref = av_buffer_ref(s->self_ref); -- if (!in->context_ref) -- return AVERROR(ENOMEM); -+ if (ctx != NULL) { -+ // Buffer still attached to context -+ V4L2m2mContext *s = buf_to_m2mctx(avbuf); - -- in->context_refcount = 1; -- } -+ ff_mutex_lock(&ctx->lock); - -- in->status = V4L2BUF_RET_USER; -- atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed); -+ ff_v4l2_buffer_set_avail(avbuf); - -- return 0; -+ if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) { -+ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name); -+ /* no need to queue more buffers to the driver */ -+ } -+ else if (ctx->streamon) { -+ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name); -+ avbuf->buf.timestamp.tv_sec = 0; -+ avbuf->buf.timestamp.tv_usec = 0; -+ ff_v4l2_buffer_enqueue(avbuf); // will set to IN_DRIVER -+ } -+ else { -+ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name); -+ } -+ -+ ff_mutex_unlock(&ctx->lock); -+ } -+ -+ ff_weak_link_unlock(avbuf->context_wl); -+ av_buffer_unref(&bufref); - } - --static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) -+static inline uint32_t ff_v4l2_buf_len(const struct v4l2_buffer * b, unsigned int i) - { -- int ret; -+ return V4L2_TYPE_IS_MULTIPLANAR(b->type) ? 
b->m.planes[i].length : b->length; -+} - -- if (plane >= in->num_planes) -- return AVERROR(EINVAL); -+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) -+{ -+ int i, ret; -+ const V4L2m2mContext * const s = buf_to_m2mctx(avbuf); - -- /* even though most encoders return 0 in data_offset encoding vp8 does require this value */ -- *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset, -- in->plane_info[plane].length, v4l2_free_buffer, in, 0); -- if (!*buf) -- return AVERROR(ENOMEM); -+ for (i = 0; i < avbuf->num_planes; i++) { -+ int dma_fd = -1; -+ const uint32_t blen = ff_v4l2_buf_len(&avbuf->buf, i); -+ -+ if (s->db_ctl != NULL) { -+ if ((avbuf->dmabuf[i] = dmabuf_alloc(s->db_ctl, blen)) == NULL) -+ return AVERROR(ENOMEM); -+ dma_fd = dmabuf_fd(avbuf->dmabuf[i]); -+ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) -+ avbuf->buf.m.planes[i].m.fd = dma_fd; -+ else -+ avbuf->buf.m.fd = dma_fd; -+ } -+ else { -+ struct v4l2_exportbuffer expbuf; -+ memset(&expbuf, 0, sizeof(expbuf)); -+ -+ expbuf.index = avbuf->buf.index; -+ expbuf.type = avbuf->buf.type; -+ expbuf.plane = i; -+ -+ ret = ioctl(s->fd, VIDIOC_EXPBUF, &expbuf); -+ if (ret < 0) -+ return AVERROR(errno); -+ dma_fd = expbuf.fd; -+ } - -- ret = v4l2_buf_increase_ref(in); -- if (ret) -- av_buffer_unref(buf); -+ avbuf->drm_frame.objects[i].size = blen; -+ avbuf->drm_frame.objects[i].fd = dma_fd; -+ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ } - -- return ret; -+ return 0; - } - - static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset) - { - unsigned int bytesused, length; -+ int rv = 0; - - if (plane >= out->num_planes) - return AVERROR(EINVAL); -@@ -284,32 +534,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i - length = out->plane_info[plane].length; - bytesused = FFMIN(size+offset, length); - -- memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, 
FFMIN(size, length-offset)); -- -- if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { -- out->planes[plane].bytesused = bytesused; -- out->planes[plane].length = length; -- } else { -- out->buf.bytesused = bytesused; -- out->buf.length = length; -+ if (size > length - offset) { -+ size = length - offset; -+ rv = AVERROR(ENOMEM); - } - -- return 0; -+ memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size); -+ -+ set_buf_length(out, plane, bytesused, length); -+ -+ return rv; -+} -+ -+static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf) -+{ -+ AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]); -+ AVBufferRef * newbuf; -+ -+ if (!bufref) -+ return NULL; -+ -+ newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0); -+ if (newbuf == NULL) -+ av_buffer_unref(&bufref); -+ -+ avbuf->status = V4L2BUF_RET_USER; -+ return newbuf; - } - - static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) - { -- int i, ret; -+ int i; - - frame->format = avbuf->context->av_pix_fmt; - -- for (i = 0; i < avbuf->num_planes; i++) { -- ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); -- if (ret) -- return ret; -+ frame->buf[0] = wrap_avbuf(avbuf); -+ if (frame->buf[0] == NULL) -+ return AVERROR(ENOMEM); - -+ if (buf_to_m2mctx(avbuf)->output_drm) { -+ /* 1. get references to the actual data */ -+ frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); -+ frame->format = AV_PIX_FMT_DRM_PRIME; -+ frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref); -+ return 0; -+ } -+ -+ -+ /* 1. 
get references to the actual data */ -+ for (i = 0; i < avbuf->num_planes; i++) { -+ frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset; - frame->linesize[i] = avbuf->plane_info[i].bytesperline; -- frame->data[i] = frame->buf[i]->data; - } - - /* fixup special cases */ -@@ -318,17 +593,17 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) - case AV_PIX_FMT_NV21: - if (avbuf->num_planes > 1) - break; -- frame->linesize[1] = avbuf->plane_info[0].bytesperline; -- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; -+ frame->linesize[1] = frame->linesize[0]; -+ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); - break; - - case AV_PIX_FMT_YUV420P: - if (avbuf->num_planes > 1) - break; -- frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1; -- frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1; -- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; -- frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2); -+ frame->linesize[1] = frame->linesize[0] / 2; -+ frame->linesize[2] = frame->linesize[1]; -+ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); -+ frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2; - break; - - default: -@@ -338,68 +613,127 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) - return 0; - } - -+static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h) -+{ -+ if (dst_stride == src_stride && w + 32 >= dst_stride) { -+ memcpy(dst, src, dst_stride * h); -+ } -+ else { -+ while (--h >= 0) { -+ memcpy(dst, src, w); -+ dst += dst_stride; -+ src += 
src_stride; -+ } -+ } -+} -+ -+static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes) -+{ -+ return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); -+} -+ -+static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out) -+{ -+ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; -+ -+ if (frame->format != AV_PIX_FMT_DRM_PRIME || !src) -+ return AVERROR(EINVAL); -+ -+ av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF); -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { -+ // Only currently cope with single buffer types -+ if (out->buf.length != 1) -+ return AVERROR_PATCHWELCOME; -+ if (src->nb_objects != 1) -+ return AVERROR(EINVAL); -+ -+ out->planes[0].m.fd = src->objects[0].fd; -+ } -+ else { -+ if (src->nb_objects != 1) -+ return AVERROR(EINVAL); -+ -+ out->buf.m.fd = src->objects[0].fd; -+ } -+ -+ // No need to copy src AVDescriptor and if we did then we may confuse -+ // fd close on free -+ out->ref_buf = av_buffer_ref(frame->buf[0]); -+ -+ return 0; -+} -+ - static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - { -- int i, ret; -- struct v4l2_format fmt = out->context->format; -- int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? -- fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat; -- int height = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? 
-- fmt.fmt.pix_mp.height : fmt.fmt.pix.height; -- int is_planar_format = 0; -- -- switch (pixel_format) { -- case V4L2_PIX_FMT_YUV420M: -- case V4L2_PIX_FMT_YVU420M: --#ifdef V4L2_PIX_FMT_YUV422M -- case V4L2_PIX_FMT_YUV422M: --#endif --#ifdef V4L2_PIX_FMT_YVU422M -- case V4L2_PIX_FMT_YVU422M: --#endif --#ifdef V4L2_PIX_FMT_YUV444M -- case V4L2_PIX_FMT_YUV444M: --#endif --#ifdef V4L2_PIX_FMT_YVU444M -- case V4L2_PIX_FMT_YVU444M: --#endif -- case V4L2_PIX_FMT_NV12M: -- case V4L2_PIX_FMT_NV21M: -- case V4L2_PIX_FMT_NV12MT_16X16: -- case V4L2_PIX_FMT_NV12MT: -- case V4L2_PIX_FMT_NV16M: -- case V4L2_PIX_FMT_NV61M: -- is_planar_format = 1; -- } -- -- if (!is_planar_format) { -- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); -- int planes_nb = 0; -- int offset = 0; -- -- for (i = 0; i < desc->nb_components; i++) -- planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1); -- -- for (i = 0; i < planes_nb; i++) { -- int size, h = height; -- if (i == 1 || i == 2) { -+ int i; -+ int num_planes = 0; -+ int pel_strides[4] = {0}; -+ -+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); -+ -+ if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) { -+ av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__); -+ return -1; -+ } -+ -+ for (i = 0; i != desc->nb_components; ++i) { -+ if (desc->comp[i].plane >= num_planes) -+ num_planes = desc->comp[i].plane + 1; -+ pel_strides[desc->comp[i].plane] = desc->comp[i].step; -+ } -+ -+ if (out->num_planes > 1) { -+ if (num_planes != out->num_planes) { -+ av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes); -+ return -1; -+ } -+ for (i = 0; i != num_planes; ++i) { -+ int w = frame->width; -+ int h = frame->height; -+ if (is_chroma(desc, i, num_planes)) { -+ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); - h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); - } -- size = frame->linesize[i] * h; -- ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, 
offset); -- if (ret) -- return ret; -- offset += size; -+ -+ cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline, -+ frame->data[i], frame->linesize[i], -+ w * pel_strides[i], h); -+ set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length); - } -- return 0; - } -+ else -+ { -+ unsigned int offset = 0; -+ -+ for (i = 0; i != num_planes; ++i) { -+ int w = frame->width; -+ int h = frame->height; -+ int dst_stride = out->plane_info[0].bytesperline; -+ uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset; -+ -+ if (is_chroma(desc, i, num_planes)) { -+ // Is chroma -+ dst_stride >>= desc->log2_chroma_w; -+ offset += dst_stride * (out->context->height >> desc->log2_chroma_h); -+ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); -+ h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); -+ } -+ else { -+ // Is luma or alpha -+ offset += dst_stride * out->context->height; -+ } -+ if (offset > out->plane_info[0].length) { -+ av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length); -+ return -1; -+ } - -- for (i = 0; i < out->num_planes; i++) { -- ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0); -- if (ret) -- return ret; -+ cpy_2d(dst, dst_stride, -+ frame->data[i], frame->linesize[i], -+ w * pel_strides[i], h); -+ } -+ set_buf_length(out, 0, offset, out->plane_info[0].length); - } -- - return 0; - } - -@@ -409,16 +743,31 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) - * - ******************************************************************************/ - --int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) -+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts) - { -- v4l2_set_pts(out, frame->pts); -- -- return v4l2_buffer_swframe_to_buf(frame, out); -+ out->buf.flags = frame->key_frame ? 
-+ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : -+ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); -+ // Beware that colour info is held in format rather than the actual -+ // v4l2 buffer struct so this may not be as useful as you might hope -+ v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); -+ v4l2_set_color_range(out, frame->color_range); -+ // PTS & interlace are buffer vars -+ if (track_ts) -+ out->buf.timestamp = tv_from_int(track_ts); -+ else -+ v4l2_set_pts(out, frame->pts); -+ v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); -+ -+ return frame->format == AV_PIX_FMT_DRM_PRIME ? -+ v4l2_buffer_primeframe_to_buf(frame, out) : -+ v4l2_buffer_swframe_to_buf(frame, out); - } - - int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) - { - int ret; -+ V4L2Context * const ctx = avbuf->context; - - av_frame_unref(frame); - -@@ -429,17 +778,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) - - /* 2. get frame information */ - frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME); -+ frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I : -+ (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P : -+ (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? 
AV_PICTURE_TYPE_B : -+ AV_PICTURE_TYPE_NONE; - frame->color_primaries = v4l2_get_color_primaries(avbuf); - frame->colorspace = v4l2_get_color_space(avbuf); - frame->color_range = v4l2_get_color_range(avbuf); - frame->color_trc = v4l2_get_color_trc(avbuf); - frame->pts = v4l2_get_pts(avbuf); - frame->pkt_dts = AV_NOPTS_VALUE; -+ frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf); -+ frame->top_field_first = v4l2_buf_is_top_first(avbuf); - - /* these values are updated also during re-init in v4l2_process_driver_event */ -- frame->height = avbuf->context->height; -- frame->width = avbuf->context->width; -- frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio; -+ frame->height = ctx->height; -+ frame->width = ctx->width; -+ frame->sample_aspect_ratio = ctx->sample_aspect_ratio; -+ -+ if (ctx->selection.height && ctx->selection.width) { -+ frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0; -+ frame->crop_top = ctx->selection.top < frame->height ? ctx->selection.top : 0; -+ frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ? -+ frame->width - (ctx->selection.left + ctx->selection.width) : 0; -+ frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ? -+ frame->height - (ctx->selection.top + ctx->selection.height) : 0; -+ } - - /* 3. report errors upstream */ - if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) { -@@ -452,15 +816,15 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) - - int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) - { -- int ret; -- - av_packet_unref(pkt); -- ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf); -- if (ret) -- return ret; -+ -+ pkt->buf = wrap_avbuf(avbuf); -+ if (pkt->buf == NULL) -+ return AVERROR(ENOMEM); - - pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? 
avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; -- pkt->data = pkt->buf->data; -+ pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset; -+ pkt->flags = 0; - - if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) - pkt->flags |= AV_PKT_FLAG_KEY; -@@ -475,39 +839,107 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) - return 0; - } - --int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) -+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, -+ const void *extdata, size_t extlen, -+ const int64_t timestamp) - { - int ret; - -- ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0); -- if (ret) -+ if (extlen) { -+ ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0); -+ if (ret) -+ return ret; -+ } -+ -+ ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen); -+ if (ret && ret != AVERROR(ENOMEM)) - return ret; - -- v4l2_set_pts(out, pkt->pts); -+ if (timestamp) -+ out->buf.timestamp = tv_from_int(timestamp); -+ else -+ v4l2_set_pts(out, pkt->pts); - -- if (pkt->flags & AV_PKT_FLAG_KEY) -- out->flags = V4L2_BUF_FLAG_KEYFRAME; -+ out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ? 
-+ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : -+ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); - -- return 0; -+ return ret; -+} -+ -+int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) -+{ -+ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); - } - --int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) -+ -+static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) -+{ -+ V4L2Buffer * const avbuf = (V4L2Buffer *)data; -+ int i; -+ -+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) { -+ struct V4L2Plane_info *p = avbuf->plane_info + i; -+ if (p->mm_addr != NULL) -+ munmap(p->mm_addr, p->length); -+ } -+ -+ if (avbuf->dmabuf[0] == NULL) { -+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { -+ if (avbuf->drm_frame.objects[i].fd != -1) -+ close(avbuf->drm_frame.objects[i].fd); -+ } -+ } -+ else { -+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->dmabuf); ++i) { -+ dmabuf_free(avbuf->dmabuf[i]); -+ } -+ } -+ -+ av_buffer_unref(&avbuf->ref_buf); -+ -+ ff_weak_link_unref(&avbuf->context_wl); -+ -+ av_free(avbuf); -+} -+ -+ -+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem) - { -- V4L2Context *ctx = avbuf->context; - int ret, i; -+ V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); -+ AVBufferRef * bufref; -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); - -- avbuf->buf.memory = V4L2_MEMORY_MMAP; -+ *pbufref = NULL; -+ if (avbuf == NULL) -+ return AVERROR(ENOMEM); -+ -+ bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0); -+ if (bufref == NULL) { -+ av_free(avbuf); -+ return AVERROR(ENOMEM); -+ } -+ -+ avbuf->context = ctx; -+ avbuf->buf.memory = mem; - avbuf->buf.type = ctx->type; - avbuf->buf.index = index; - -+ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { -+ avbuf->drm_frame.objects[i].fd = -1; -+ } -+ -+ avbuf->context_wl = ff_weak_link_ref(ctx->wl_master); -+ - if 
(V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->buf.length = VIDEO_MAX_PLANES; - avbuf->buf.m.planes = avbuf->planes; - } - -- ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf); -+ ret = ioctl(s->fd, VIDIOC_QUERYBUF, &avbuf->buf); - if (ret < 0) -- return AVERROR(errno); -+ goto fail; - - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->num_planes = 0; -@@ -520,6 +952,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - avbuf->num_planes = 1; - - for (i = 0; i < avbuf->num_planes; i++) { -+ const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP && -+ (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm); - - avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? - ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline : -@@ -527,25 +961,31 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; -- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, -- PROT_READ | PROT_WRITE, MAP_SHARED, -- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); -+ avbuf->plane_info[i].offset = avbuf->buf.m.planes[i].data_offset; -+ -+ if (want_mmap) -+ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, -+ PROT_READ | PROT_WRITE, MAP_SHARED, -+ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); - } else { - avbuf->plane_info[i].length = avbuf->buf.length; -- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, -- PROT_READ | PROT_WRITE, MAP_SHARED, -- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); -+ avbuf->plane_info[i].offset = 0; -+ -+ if (want_mmap) -+ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, -+ PROT_READ | PROT_WRITE, MAP_SHARED, -+ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); - } - -- if (avbuf->plane_info[i].mm_addr == MAP_FAILED) -- return AVERROR(ENOMEM); -+ if (avbuf->plane_info[i].mm_addr == 
MAP_FAILED) { -+ avbuf->plane_info[i].mm_addr = NULL; -+ ret = AVERROR(ENOMEM); -+ goto fail; -+ } - } - - avbuf->status = V4L2BUF_AVAILABLE; - -- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) -- return 0; -- - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->buf.m.planes = avbuf->planes; - avbuf->buf.length = avbuf->num_planes; -@@ -555,20 +995,53 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - avbuf->buf.length = avbuf->planes[0].length; - } - -- return ff_v4l2_buffer_enqueue(avbuf); -+ if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { -+ if (s->output_drm) { -+ ret = v4l2_buffer_export_drm(avbuf); -+ if (ret) { -+ av_log(logger(avbuf), AV_LOG_ERROR, "Failed to get exported drm handles\n"); -+ goto fail; -+ } -+ } -+ } -+ -+ *pbufref = bufref; -+ return 0; -+ -+fail: -+ av_buffer_unref(&bufref); -+ return ret; - } - - int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) - { - int ret; -+ int qc; - -- avbuf->buf.flags = avbuf->flags; -+ if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) { -+ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", -+ avbuf->context->name, avbuf->buf.index, -+ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, -+ avbuf->context->q_count); -+ } - - ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf); -- if (ret < 0) -- return AVERROR(errno); -+ if (ret < 0) { -+ int err = errno; -+ av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n", -+ avbuf->context->name, avbuf->buf.index, -+ err, strerror(err)); -+ return AVERROR(err); -+ } - -+ // Lock not wanted - if called from buffer free then lock already obtained -+ qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1; - avbuf->status = V4L2BUF_IN_DRIVER; -+ pthread_cond_broadcast(&avbuf->context->cond); -+ -+ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", -+ avbuf->context->name, avbuf->buf.index, -+ avbuf->buf.timestamp.tv_sec, 
avbuf->buf.timestamp.tv_usec, qc); - - return 0; - } -diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h -index 8dbc7fc104..0bda4dd06b 100644 ---- a/libavcodec/v4l2_buffers.h -+++ b/libavcodec/v4l2_buffers.h -@@ -27,29 +27,44 @@ - #include - #include - -+#include "libavutil/hwcontext_drm.h" - #include "avcodec.h" - - enum V4L2Buffer_status { - V4L2BUF_AVAILABLE, - V4L2BUF_IN_DRIVER, -+ V4L2BUF_IN_USE, - V4L2BUF_RET_USER, - }; - - /** - * V4L2Buffer (wrapper for v4l2_buffer management) - */ -+struct V4L2Context; -+struct ff_weak_link_client; -+struct dmabuf_h; -+ - typedef struct V4L2Buffer { -- /* each buffer needs to have a reference to its context */ -+ /* each buffer needs to have a reference to its context -+ * The pointer is good enough for most operation but once the buffer has -+ * been passed to the user the buffer may become orphaned so for free ops -+ * the weak link must be used to ensure that the context is actually -+ * there -+ */ - struct V4L2Context *context; -+ struct ff_weak_link_client *context_wl; - -- /* This object is refcounted per-plane, so we need to keep track -- * of how many context-refs we are holding. 
*/ -- AVBufferRef *context_ref; -- atomic_uint context_refcount; -+ /* DRM descriptor */ -+ AVDRMFrameDescriptor drm_frame; -+ /* For DRM_PRIME encode - need to keep a ref to the source buffer till we -+ * are done -+ */ -+ AVBufferRef * ref_buf; - - /* keep track of the mmap address and mmap length */ - struct V4L2Plane_info { -- int bytesperline; -+ size_t bytesperline; -+ size_t offset; - void * mm_addr; - size_t length; - } plane_info[VIDEO_MAX_PLANES]; -@@ -60,9 +75,9 @@ typedef struct V4L2Buffer { - struct v4l2_buffer buf; - struct v4l2_plane planes[VIDEO_MAX_PLANES]; - -- int flags; - enum V4L2Buffer_status status; - -+ struct dmabuf_h * dmabuf[VIDEO_MAX_PLANES]; // If externally alloced dmabufs - stash other info here - } V4L2Buffer; - - /** -@@ -98,6 +113,10 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); - */ - int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); - -+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, -+ const void *extdata, size_t extlen, -+ const int64_t timestamp); -+ - /** - * Extracts the data from an AVFrame to a V4L2Buffer - * -@@ -106,7 +125,7 @@ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); - * - * @returns 0 in case of success, a negative AVERROR code otherwise - */ --int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); -+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts); - - /** - * Initializes a V4L2Buffer -@@ -116,7 +135,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); - * - * @returns 0 in case of success, a negative AVERROR code otherwise - */ --int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); -+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem); - - /** - * Enqueues a V4L2Buffer -@@ -127,5 +146,12 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); 
- */ - int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf); - -+static inline void -+ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf) -+{ -+ avbuf->status = V4L2BUF_AVAILABLE; -+ av_buffer_unref(&avbuf->ref_buf); -+} -+ - - #endif // AVCODEC_V4L2_BUFFERS_H -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index ff1ea8e57b..fcd5fdf359 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -27,11 +27,13 @@ - #include - #include - #include -+#include "libavutil/avassert.h" - #include "libavcodec/avcodec.h" - #include "libavcodec/internal.h" - #include "v4l2_buffers.h" - #include "v4l2_fmt.h" - #include "v4l2_m2m.h" -+#include "weak_link.h" - - struct v4l2_format_update { - uint32_t v4l2_fmt; -@@ -41,26 +43,168 @@ struct v4l2_format_update { - int update_avfmt; - }; - --static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx) -+ -+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) - { -- return V4L2_TYPE_IS_OUTPUT(ctx->type) ? -- container_of(ctx, V4L2m2mContext, output) : -- container_of(ctx, V4L2m2mContext, capture); -+ return (int64_t)n; - } - --static inline AVCodecContext *logger(V4L2Context *ctx) -+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) - { -- return ctx_to_m2mctx(ctx)->avctx; -+ return (unsigned int)pts; - } - --static inline unsigned int v4l2_get_width(struct v4l2_format *fmt) -+// FFmpeg requires us to propagate a number of vars from the coded pkt into -+// the decoded frame. The only thing that tracks like that in V4L2 stateful -+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no -+// guarantees about PTS being unique or specified for every frame so replace -+// the supplied PTS with a simple incrementing number and keep a circular -+// buffer of all the things we want preserved (including the original PTS) -+// indexed by the tracking no. 
-+static int64_t -+xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt) - { -- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; -+ int64_t track_pts; -+ -+ // Avoid 0 -+ if (++x->track_no == 0) -+ x->track_no = 1; -+ -+ track_pts = track_to_pts(avctx, x->track_no); -+ -+ av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); -+ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ -+ .discard = 0, -+ .pending = 1, -+ .pkt_size = avpkt->size, -+ .pts = avpkt->pts, -+ .dts = avpkt->dts, -+ .reordered_opaque = avctx->reordered_opaque, -+ .pkt_pos = avpkt->pos, -+ .pkt_duration = avpkt->duration, -+ .track_pts = track_pts -+ }; -+ return track_pts; -+} -+ -+static int64_t -+xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame) -+{ -+ int64_t track_pts; -+ -+ // Avoid 0 -+ if (++x->track_no == 0) -+ x->track_no = 1; -+ -+ track_pts = track_to_pts(avctx, x->track_no); -+ -+ av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no); -+ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ -+ .discard = 0, -+ .pending = 1, -+ .pkt_size = 0, -+ .pts = frame->pts, -+ .dts = AV_NOPTS_VALUE, -+ .reordered_opaque = frame->reordered_opaque, -+ .pkt_pos = frame->pkt_pos, -+ .pkt_duration = frame->pkt_duration, -+ .track_pts = track_pts -+ }; -+ return track_pts; - } - --static inline unsigned int v4l2_get_height(struct v4l2_format *fmt) -+ -+// Returns -1 if we should discard the frame -+static int -+xlat_pts_frame_out(AVCodecContext *const avctx, -+ xlat_track_t * const x, -+ AVFrame *const frame) - { -- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
fmt->fmt.pix_mp.height : fmt->fmt.pix.height; -+ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; -+ V4L2m2mTrackEl *const t = x->track_els + n; -+ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) -+ { -+ av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, -+ "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -+ frame->pts = AV_NOPTS_VALUE; -+ frame->pkt_dts = AV_NOPTS_VALUE; -+ frame->reordered_opaque = x->last_opaque; -+ frame->pkt_pos = -1; -+ frame->pkt_duration = 0; -+ frame->pkt_size = -1; -+ } -+ else if (!t->discard) -+ { -+ frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE; -+ frame->pkt_dts = t->dts; -+ frame->reordered_opaque = t->reordered_opaque; -+ frame->pkt_pos = t->pkt_pos; -+ frame->pkt_duration = t->pkt_duration; -+ frame->pkt_size = t->pkt_size; -+ -+ x->last_opaque = x->track_els[n].reordered_opaque; -+ if (frame->pts != AV_NOPTS_VALUE) -+ x->last_pts = frame->pts; -+ t->pending = 0; -+ } -+ else -+ { -+ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); -+ return -1; -+ } -+ -+ av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", -+ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); -+ return 0; -+} -+ -+// Returns -1 if we should discard the frame -+static int -+xlat_pts_pkt_out(AVCodecContext *const avctx, -+ xlat_track_t * const x, -+ AVPacket *const pkt) -+{ -+ unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE; -+ V4L2m2mTrackEl *const t = x->track_els + n; -+ if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts) -+ { -+ av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? 
AV_LOG_DEBUG : AV_LOG_WARNING, -+ "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); -+ pkt->pts = AV_NOPTS_VALUE; -+ } -+ else if (!t->discard) -+ { -+ pkt->pts = t->pending ? t->pts : AV_NOPTS_VALUE; -+ -+ x->last_opaque = x->track_els[n].reordered_opaque; -+ if (pkt->pts != AV_NOPTS_VALUE) -+ x->last_pts = pkt->pts; -+ t->pending = 0; -+ } -+ else -+ { -+ av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); -+ return -1; -+ } -+ -+ // * Would like something much better than this...xlat(offset + out_count)? -+ pkt->dts = pkt->pts; -+ av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n", -+ pkt->pts, t->track_pts, n); -+ return 0; -+} -+ -+ -+static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) -+{ -+ return V4L2_TYPE_IS_OUTPUT(ctx->type) ? -+ container_of(ctx, V4L2m2mContext, output) : -+ container_of(ctx, V4L2m2mContext, capture); -+} -+ -+static inline AVCodecContext *logger(const V4L2Context *ctx) -+{ -+ return ctx_to_m2mctx(ctx)->avctx; - } - - static AVRational v4l2_get_sar(V4L2Context *ctx) -@@ -81,21 +225,29 @@ static AVRational v4l2_get_sar(V4L2Context *ctx) - return sar; - } - --static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2) -+static inline int ctx_buffers_alloced(const V4L2Context * const ctx) - { -- struct v4l2_format *fmt1 = &ctx->format; -- int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? -- fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || -- fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height -- : -- fmt1->fmt.pix.width != fmt2->fmt.pix.width || -- fmt1->fmt.pix.height != fmt2->fmt.pix.height; -+ return ctx->bufrefs != NULL; -+} -+ -+// Width/Height changed or we don't have an alloc in the first place? 
-+static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) -+{ -+ const struct v4l2_format *fmt1 = &ctx->format; -+ int ret = !ctx_buffers_alloced(ctx) || -+ (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? -+ fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || -+ fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height -+ : -+ fmt1->fmt.pix.width != fmt2->fmt.pix.width || -+ fmt1->fmt.pix.height != fmt2->fmt.pix.height); - - if (ret) -- av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n", -+ av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n", - ctx->name, -- v4l2_get_width(fmt1), v4l2_get_height(fmt1), -- v4l2_get_width(fmt2), v4l2_get_height(fmt2)); -+ ctx_buffers_alloced(ctx), -+ ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1), -+ ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2)); - - return ret; - } -@@ -153,90 +305,110 @@ static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_upd - } - } - --/** -- * handle resolution change event and end of stream event -- * returns 1 if reinit was successful, negative if it failed -- * returns 0 if reinit was not executed -- */ --static int v4l2_handle_event(V4L2Context *ctx) -+static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r) - { -- V4L2m2mContext *s = ctx_to_m2mctx(ctx); -- struct v4l2_format cap_fmt = s->capture.format; -- struct v4l2_format out_fmt = s->output.format; -- struct v4l2_event evt = { 0 }; -- int full_reinit, reinit, ret; -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -+ struct v4l2_selection selection = { -+ .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, -+ .target = V4L2_SEL_TGT_COMPOSE -+ }; - -- ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt); -- if (ret < 0) { -- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name); -- return 0; -- } -+ memset(r, 0, sizeof(*r)); -+ if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection)) -+ return AVERROR(errno); - -- if (evt.type 
== V4L2_EVENT_EOS) { -- ctx->done = 1; -- return 0; -- } -+ *r = selection.r; -+ return 0; -+} - -- if (evt.type != V4L2_EVENT_SOURCE_CHANGE) -- return 0; -+static int do_source_change(V4L2m2mContext * const s) -+{ -+ AVCodecContext *const avctx = s->avctx; - -- ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt); -- if (ret) { -- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name); -- return 0; -- } -+ int ret; -+ int reinit; -+ struct v4l2_format cap_fmt = s->capture.format; -+ -+ s->capture.done = 0; - - ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); - if (ret) { -- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name); -+ av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name); - return 0; - } - -- full_reinit = v4l2_resolution_changed(&s->output, &out_fmt); -- if (full_reinit) { -- s->output.height = v4l2_get_height(&out_fmt); -- s->output.width = v4l2_get_width(&out_fmt); -- s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); -- } -+ get_default_selection(&s->capture, &s->capture.selection); - -- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); -+ reinit = ctx_resolution_changed(&s->capture, &cap_fmt); -+ if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0) -+ reinit = 1; -+ -+ s->capture.format = cap_fmt; - if (reinit) { -- s->capture.height = v4l2_get_height(&cap_fmt); -- s->capture.width = v4l2_get_width(&cap_fmt); -- s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); -+ s->capture.height = ff_v4l2_get_format_height(&cap_fmt); -+ s->capture.width = ff_v4l2_get_format_width(&cap_fmt); - } - -- if (full_reinit || reinit) -- s->reinit = 1; -- -- if (full_reinit) { -- ret = ff_v4l2_m2m_codec_full_reinit(s); -- if (ret) { -- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n"); -- return AVERROR(EINVAL); -- } -- goto reinit_run; -+ // If we don't support selection (or it is bust) and we obviously have HD then kludge -+ if ((s->capture.selection.width == 0 || s->capture.selection.height 
== 0) && -+ (s->capture.height == 1088 && s->capture.width == 1920)) { -+ s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080}; - } - -+ s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); -+ -+ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n", -+ s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, -+ s->capture.width, s->capture.height, -+ s->capture.selection.width, s->capture.selection.height, -+ s->capture.selection.left, s->capture.selection.top, reinit); -+ - if (reinit) { -- if (s->avctx) -- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); -+ if (avctx) -+ ret = ff_set_dimensions(s->avctx, -+ s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width, -+ s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height); - if (ret < 0) -- av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n"); -+ av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n"); - - ret = ff_v4l2_m2m_codec_reinit(s); - if (ret) { -- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n"); -+ av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) || -+ s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) { -+ av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n", -+ s->capture.width, s->capture.height, -+ ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format)); - return AVERROR(EINVAL); - } -+ -+ // Update pixel format - should only actually do something on initial change -+ s->capture.av_pix_fmt = -+ ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); -+ if (s->output_drm) { -+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; -+ avctx->sw_pix_fmt = 
s->capture.av_pix_fmt; -+ } -+ else -+ avctx->pix_fmt = s->capture.av_pix_fmt; -+ - goto reinit_run; - } - -- /* dummy event received */ -- return 0; -+ /* Buffers are OK so just stream off to ack */ -+ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__); -+ -+ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); -+ if (ret) -+ av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n"); -+ s->draining = 0; - - /* reinit executed */ - reinit_run: -+ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON); - return 1; - } - -@@ -280,171 +452,293 @@ static int v4l2_stop_encode(V4L2Context *ctx) - return 0; - } - --static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) -+// DQ a buffer -+// Amalgamates all the various ways there are of signalling EOS/Event to -+// generate a consistant EPIPE. -+// -+// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped) -+// -+// Returns: -+// 0 Success -+// AVERROR(EPIPE) Nothing more to read -+// AVERROR(ENOSPC) No buffers in Q to put result in -+// * AVERROR(..) 
-+ -+ static int -+dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf) - { -- struct v4l2_plane planes[VIDEO_MAX_PLANES]; -- struct v4l2_buffer buf = { 0 }; -- V4L2Buffer *avbuf; -- struct pollfd pfd = { -- .events = POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */ -- .fd = ctx_to_m2mctx(ctx)->fd, -+ V4L2m2mContext * const m = ctx_to_m2mctx(ctx); -+ AVCodecContext * const avctx = m->avctx; -+ V4L2Buffer * avbuf; -+ const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type); -+ -+ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; -+ -+ struct v4l2_buffer buf = { -+ .type = ctx->type, -+ .memory = V4L2_MEMORY_MMAP, - }; -- int i, ret; - -- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) { -- for (i = 0; i < ctx->num_buffers; i++) { -- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) -- break; -- } -- if (i == ctx->num_buffers) -- av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to " -- "userspace. Increase num_capture_buffers " -- "to prevent device deadlock or dropped " -- "packets/frames.\n"); -- } -- -- /* if we are draining and there are no more capture buffers queued in the driver we are done */ -- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) { -- for (i = 0; i < ctx->num_buffers; i++) { -- /* capture buffer initialization happens during decode hence -- * detection happens at runtime -- */ -- if (!ctx->buffers) -- break; -- -- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) -- goto start; -+ *ppavbuf = NULL; -+ -+ if (ctx->flag_last) -+ return AVERROR(EPIPE); -+ -+ if (is_mp) { -+ buf.length = VIDEO_MAX_PLANES; -+ buf.m.planes = planes; -+ } -+ -+ while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) { -+ const int err = errno; -+ av_assert0(AVERROR(err) < 0); -+ if (err != EINTR) { -+ av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", -+ ctx->name, av_err2str(AVERROR(err))); -+ -+ if (err == EPIPE) -+ ctx->flag_last = 1; -+ -+ return AVERROR(err); - } -- ctx->done = 1; -- 
return NULL; - } -+ atomic_fetch_sub(&ctx->q_count, 1); -+ -+ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; -+ ff_v4l2_buffer_set_avail(avbuf); -+ avbuf->buf = buf; -+ if (is_mp) { -+ memcpy(avbuf->planes, planes, sizeof(planes)); -+ avbuf->buf.m.planes = avbuf->planes; -+ } -+ // Done with any attached buffer -+ av_buffer_unref(&avbuf->ref_buf); - --start: -- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) -- pfd.events = POLLOUT | POLLWRNORM; -- else { -- /* no need to listen to requests for more input while draining */ -- if (ctx_to_m2mctx(ctx)->draining) -- pfd.events = POLLIN | POLLRDNORM | POLLPRI; -+ if (V4L2_TYPE_IS_CAPTURE(ctx->type)) { -+ // Zero length cap buffer return == EOS -+ if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) { -+ av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n"); -+ -+ // Must reQ so we don't leak -+ // May not matter if the next thing we do is release all the -+ // buffers but better to be tidy. -+ ff_v4l2_buffer_enqueue(avbuf); -+ -+ ctx->flag_last = 1; -+ return AVERROR(EPIPE); -+ } -+ -+#ifdef V4L2_BUF_FLAG_LAST -+ // If flag_last set then this contains data but is the last frame -+ // so remember that but return OK -+ if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0) -+ ctx->flag_last = 1; -+#endif - } - -- for (;;) { -- ret = poll(&pfd, 1, timeout); -- if (ret > 0) -- break; -- if (errno == EINTR) -+ *ppavbuf = avbuf; -+ return 0; -+} -+ -+/** -+ * handle resolution change event and end of stream event -+ * Expects to be called after the stream has stopped -+ * -+ * returns 1 if reinit was successful, negative if it failed -+ * returns 0 if reinit was not executed -+ */ -+static int -+get_event(V4L2m2mContext * const m) -+{ -+ AVCodecContext * const avctx = m->avctx; -+ struct v4l2_event evt = { 0 }; -+ -+ while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) { -+ const int rv = AVERROR(errno); -+ if (rv == AVERROR(EINTR)) - continue; -- return NULL; -+ if (rv == AVERROR(EAGAIN)) { -+ av_log(avctx, AV_LOG_WARNING, "V4L2 failed to 
get expected event - assume EOS\n"); -+ return AVERROR_EOF; -+ } -+ av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv)); -+ return rv; - } - -- /* 0. handle errors */ -- if (pfd.revents & POLLERR) { -- /* if we are trying to get free buffers but none have been queued yet -- no need to raise a warning */ -- if (timeout == 0) { -- for (i = 0; i < ctx->num_buffers; i++) { -- if (ctx->buffers[i].status != V4L2BUF_AVAILABLE) -- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); -- } -- } -- else -- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); -+ av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type); - -- return NULL; -+ if (evt.type == V4L2_EVENT_EOS) { -+ av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n"); -+ return AVERROR_EOF; - } - -- /* 1. handle resolution changes */ -- if (pfd.revents & POLLPRI) { -- ret = v4l2_handle_event(ctx); -- if (ret < 0) { -- /* if re-init failed, abort */ -- ctx->done = 1; -- return NULL; -- } -- if (ret) { -- /* if re-init was successful drop the buffer (if there was one) -- * since we had to reconfigure capture (unmap all buffers) -- */ -- return NULL; -+ if (evt.type == V4L2_EVENT_SOURCE_CHANGE) -+ return do_source_change(m); -+ -+ return 0; -+} -+ -+static inline int -+dq_ok(const V4L2Context * const c) -+{ -+ return c->streamon && atomic_load(&c->q_count) != 0; -+} -+ -+// Get a buffer -+// If output then just gets the buffer in the expected way -+// If capture then runs the capture state m/c to deal with res change etc. 
-+// If return value == 0 then *ppavbuf != NULL -+ -+static int -+get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout) -+{ -+ V4L2m2mContext * const m = ctx_to_m2mctx(ctx); -+ AVCodecContext * const avctx = m->avctx; -+ const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type); -+ -+ const unsigned int poll_cap = (POLLIN | POLLRDNORM); -+ const unsigned int poll_out = (POLLOUT | POLLWRNORM); -+ const unsigned int poll_event = POLLPRI; -+ -+ *ppavbuf = NULL; -+ -+ for (;;) { -+ struct pollfd pfd = { -+ .fd = m->fd, -+ // If capture && stream not started then assume we are waiting for the initial event -+ .events = !is_cap ? poll_out : -+ !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap : -+ poll_event, -+ }; -+ int ret; -+ -+ if (ctx->done) { -+ av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name); -+ return AVERROR_EOF; - } -- } - -- /* 2. dequeue the buffer */ -- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { -+ // If capture && timeout == -1 then also wait for rx buffer free -+ if (is_cap && timeout == -1 && dq_ok(&m->output) && !m->draining) -+ pfd.events |= poll_out; - -- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { -- /* there is a capture buffer ready */ -- if (pfd.revents & (POLLIN | POLLRDNORM)) -- goto dequeue; -+ // If nothing Qed all we will get is POLLERR - avoid that -+ if ((pfd.events == poll_out && !dq_ok(&m->output)) || -+ (pfd.events == poll_cap && !dq_ok(&m->capture)) || -+ (pfd.events == (poll_cap | poll_out) && !dq_ok(&m->capture) && !dq_ok(&m->output))) { -+ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); -+ return AVERROR(ENOSPC); -+ } - -- /* the driver is ready to accept more input; instead of waiting for the capture -- * buffer to complete we return NULL so input can proceed (we are single threaded) -- */ -- if (pfd.revents & (POLLOUT | POLLWRNORM)) -- return NULL; -+ // Timeout kludged s.t. 
"forever" eventually gives up & produces logging -+ // If waiting for an event when we have seen a last_frame then we expect -+ // it to be ready already so force a short timeout -+ ret = poll(&pfd, 1, -+ ff_v4l2_ctx_eos(ctx) ? 10 : -+ timeout == -1 ? 3000 : timeout); -+ if (ret < 0) { -+ ret = AVERROR(errno); // Remember errno before logging etc. -+ av_assert0(ret < 0); - } - --dequeue: -- memset(&buf, 0, sizeof(buf)); -- buf.memory = V4L2_MEMORY_MMAP; -- buf.type = ctx->type; -- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { -- memset(planes, 0, sizeof(planes)); -- buf.length = VIDEO_MAX_PLANES; -- buf.m.planes = planes; -+ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n", -+ ctx->name, ret, timeout, pfd.events, pfd.revents); -+ -+ if (ret < 0) { -+ if (ret == AVERROR(EINTR)) -+ continue; -+ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret)); -+ return ret; - } - -- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf); -- if (ret) { -- if (errno != EAGAIN) { -- ctx->done = 1; -- if (errno != EPIPE) -- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", -- ctx->name, av_err2str(AVERROR(errno))); -+ if (ret == 0) { -+ if (timeout == -1) -+ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events); -+ if (ff_v4l2_ctx_eos(ctx)) { -+ av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name); -+ ret = get_event(m); -+ if (ret < 0) { -+ ctx->done = 1; -+ return ret; -+ } - } -- return NULL; -+ return AVERROR(EAGAIN); -+ } -+ -+ if ((pfd.revents & POLLERR) != 0) { -+ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); -+ return AVERROR_UNKNOWN; - } - -- if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) { -- int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? 
-- buf.m.planes[0].bytesused : buf.bytesused; -- if (bytesused == 0) { -+ if ((pfd.revents & poll_event) != 0) { -+ ret = get_event(m); -+ if (ret < 0) { - ctx->done = 1; -- return NULL; -+ return ret; - } --#ifdef V4L2_BUF_FLAG_LAST -- if (buf.flags & V4L2_BUF_FLAG_LAST) -- ctx->done = 1; --#endif -+ continue; -+ } -+ -+ if ((pfd.revents & poll_cap) != 0) { -+ ret = dq_buf(ctx, ppavbuf); -+ if (ret == AVERROR(EPIPE)) -+ continue; -+ return ret; - } - -- avbuf = &ctx->buffers[buf.index]; -- avbuf->status = V4L2BUF_AVAILABLE; -- avbuf->buf = buf; -- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { -- memcpy(avbuf->planes, planes, sizeof(planes)); -- avbuf->buf.m.planes = avbuf->planes; -+ if ((pfd.revents & poll_out) != 0) { -+ if (is_cap) -+ return AVERROR(EAGAIN); -+ return dq_buf(ctx, ppavbuf); - } -- return avbuf; -+ -+ av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents); -+ return AVERROR_UNKNOWN; - } -+} - -- return NULL; -+// Clear out flags and timestamps that should should be set by the user -+// Returns the passed avbuf -+static V4L2Buffer * -+clean_v4l2_buffer(V4L2Buffer * const avbuf) -+{ -+ struct v4l2_buffer *const buf = &avbuf->buf; -+ -+ buf->flags = 0; -+ buf->field = V4L2_FIELD_ANY; -+ buf->timestamp = (struct timeval){0}; -+ buf->timecode = (struct v4l2_timecode){0}; -+ buf->sequence = 0; -+ -+ return avbuf; -+} -+ -+int -+ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1) -+{ -+ V4L2Buffer * avbuf; -+ if (timeout1 != 0) { -+ int rv = get_qbuf(ctx, &avbuf, timeout1); -+ if (rv != 0) -+ return rv; -+ } -+ do { -+ get_qbuf(ctx, &avbuf, 0); -+ } while (avbuf); -+ return 0; - } - - static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) - { -- int timeout = 0; /* return when no more buffers to dequeue */ - int i; - - /* get back as many output buffers as possible */ -- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { -- do { -- } while (v4l2_dequeue_v4l2buf(ctx, timeout)); -- } -+ if 
(V4L2_TYPE_IS_OUTPUT(ctx->type)) -+ ff_v4l2_dq_all(ctx, 0); - - for (i = 0; i < ctx->num_buffers; i++) { -- if (ctx->buffers[i].status == V4L2BUF_AVAILABLE) -- return &ctx->buffers[i]; -+ V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (avbuf->status == V4L2BUF_AVAILABLE) -+ return clean_v4l2_buffer(avbuf); - } - - return NULL; -@@ -452,25 +746,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) - - static int v4l2_release_buffers(V4L2Context* ctx) - { -- struct v4l2_requestbuffers req = { -- .memory = V4L2_MEMORY_MMAP, -- .type = ctx->type, -- .count = 0, /* 0 -> unmaps buffers from the driver */ -- }; -- int i, j; -+ int i; -+ int ret = 0; -+ const int fd = ctx_to_m2mctx(ctx)->fd; - -- for (i = 0; i < ctx->num_buffers; i++) { -- V4L2Buffer *buffer = &ctx->buffers[i]; -+ // Orphan any buffers in the wild -+ ff_weak_link_break(&ctx->wl_master); -+ -+ if (ctx->bufrefs) { -+ for (i = 0; i < ctx->num_buffers; i++) -+ av_buffer_unref(ctx->bufrefs + i); -+ } -+ -+ if (fd != -1) { -+ struct v4l2_requestbuffers req = { -+ .memory = V4L2_MEMORY_MMAP, -+ .type = ctx->type, -+ .count = 0, /* 0 -> unmap all buffers from the driver */ -+ }; -+ -+ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { -+ if (errno == EINTR) -+ continue; -+ -+ ret = AVERROR(errno); - -- for (j = 0; j < buffer->num_planes; j++) { -- struct V4L2Plane_info *p = &buffer->plane_info[j]; -- if (p->mm_addr && p->length) -- if (munmap(p->mm_addr, p->length) < 0) -- av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); -+ av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", -+ ctx->name, av_err2str(AVERROR(errno))); -+ -+ if (ctx_to_m2mctx(ctx)->output_drm) -+ av_log(logger(ctx), AV_LOG_ERROR, -+ "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n" -+ "for all buffers: \n" -+ " 1. drmModeRmFB(..)\n" -+ " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... 
)\n"); - } - } -+ atomic_store(&ctx->q_count, 0); - -- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); -+ return ret; - } - - static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) -@@ -499,6 +813,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm - - static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) - { -+ V4L2m2mContext* s = ctx_to_m2mctx(ctx); -+ V4L2m2mPriv *priv = s->avctx->priv_data; - enum AVPixelFormat pixfmt = ctx->av_pix_fmt; - struct v4l2_fmtdesc fdesc; - int ret; -@@ -512,21 +828,22 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) - return 0; - } - -- for (;;) { -+ for (;; ++fdesc.index) { - ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc); - if (ret) - return AVERROR(EINVAL); - -+ if (priv->pix_fmt != AV_PIX_FMT_NONE) { -+ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) -+ continue; -+ } -+ - pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); - ret = v4l2_try_raw_format(ctx, pixfmt); -- if (ret){ -- fdesc.index++; -- continue; -+ if (ret == 0) { -+ *p = pixfmt; -+ return 0; - } -- -- *p = pixfmt; -- -- return 0; - } - - return AVERROR(EINVAL); -@@ -569,30 +886,99 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) - * - *****************************************************************************/ - -+ -+static void flush_all_buffers_status(V4L2Context* const ctx) -+{ -+ int i; -+ -+ if (!ctx->bufrefs) -+ return; -+ -+ for (i = 0; i < ctx->num_buffers; ++i) { -+ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (buf->status == V4L2BUF_IN_DRIVER) -+ ff_v4l2_buffer_set_avail(buf); -+ } -+ atomic_store(&ctx->q_count, 0); -+} -+ -+static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) -+{ -+ int i; -+ int rv; -+ -+ if (!ctx->bufrefs) { -+ rv = ff_v4l2_context_init(ctx); -+ if (rv) { -+ av_log(avctx, 
AV_LOG_ERROR, "can't request capture buffers\n"); -+ return rv; -+ } -+ } -+ -+ for (i = 0; i < ctx->num_buffers; ++i) { -+ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; -+ if (buf->status == V4L2BUF_AVAILABLE) { -+ rv = ff_v4l2_buffer_enqueue(buf); -+ if (rv < 0) -+ return rv; -+ } -+ } -+ return 0; -+} -+ - int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) - { - int type = ctx->type; -- int ret; -+ int ret = 0; -+ AVCodecContext * const avctx = logger(ctx); - -- ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); -- if (ret < 0) -- return AVERROR(errno); -+ // Avoid doing anything if there is nothing we can do -+ if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon) -+ return 0; - -- ctx->streamon = (cmd == VIDIOC_STREAMON); -+ ff_mutex_lock(&ctx->lock); - -- return 0; -+ if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) -+ stuff_all_buffers(avctx, ctx); -+ -+ if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) { -+ const int err = errno; -+ av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name, -+ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err); -+ ret = AVERROR(err); -+ } -+ else -+ { -+ if (cmd == VIDIOC_STREAMOFF) -+ flush_all_buffers_status(ctx); -+ else -+ ctx->first_buf = 1; -+ -+ ctx->streamon = (cmd == VIDIOC_STREAMON); -+ av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, -+ cmd, (cmd == VIDIOC_STREAMON) ? 
"ON" : "OFF"); -+ } -+ -+ // Both stream off & on effectively clear flag_last -+ ctx->flag_last = 0; -+ -+ ff_mutex_unlock(&ctx->lock); -+ -+ return ret; - } - - int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) - { -- V4L2m2mContext *s = ctx_to_m2mctx(ctx); -+ V4L2m2mContext *const s = ctx_to_m2mctx(ctx); -+ AVCodecContext *const avctx = s->avctx; -+ int64_t track_ts; - V4L2Buffer* avbuf; - int ret; - - if (!frame) { - ret = v4l2_stop_encode(ctx); - if (ret) -- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name); -+ av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name); - s->draining= 1; - return 0; - } -@@ -601,23 +987,29 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) - if (!avbuf) - return AVERROR(EAGAIN); - -- ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf); -+ track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame); -+ -+ ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts); - if (ret) - return ret; - - return ff_v4l2_buffer_enqueue(avbuf); - } - --int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) -+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, -+ const void * extdata, size_t extlen) - { - V4L2m2mContext *s = ctx_to_m2mctx(ctx); -+ AVCodecContext *const avctx = s->avctx; - V4L2Buffer* avbuf; - int ret; -+ int64_t track_ts; - - if (!pkt->size) { - ret = v4l2_stop_decode(ctx); -+ // Log but otherwise ignore stop failure - if (ret) -- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name); -+ av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); - s->draining = 1; - return 0; - } -@@ -626,8 +1018,13 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) - if (!avbuf) - return AVERROR(EAGAIN); - -- ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf); -- if (ret) -+ track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt); -+ -+ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, 
track_ts); -+ if (ret == AVERROR(ENOMEM)) -+ av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", -+ __func__, pkt->size, avbuf->planes[0].length); -+ else if (ret) - return ret; - - return ff_v4l2_buffer_enqueue(avbuf); -@@ -635,42 +1032,36 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) - - int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) - { -+ V4L2m2mContext *s = ctx_to_m2mctx(ctx); -+ AVCodecContext *const avctx = s->avctx; - V4L2Buffer *avbuf; -+ int rv; - -- /* -- * timeout=-1 blocks until: -- * 1. decoded frame available -- * 2. an input buffer is ready to be dequeued -- */ -- avbuf = v4l2_dequeue_v4l2buf(ctx, timeout); -- if (!avbuf) { -- if (ctx->done) -- return AVERROR_EOF; -- -- return AVERROR(EAGAIN); -- } -+ do { -+ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) -+ return rv; -+ if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0) -+ return rv; -+ } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0); - -- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); -+ return 0; - } - --int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) -+int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout) - { -+ V4L2m2mContext *s = ctx_to_m2mctx(ctx); -+ AVCodecContext *const avctx = s->avctx; - V4L2Buffer *avbuf; -+ int rv; - -- /* -- * blocks until: -- * 1. encoded packet available -- * 2. an input buffer ready to be dequeued -- */ -- avbuf = v4l2_dequeue_v4l2buf(ctx, -1); -- if (!avbuf) { -- if (ctx->done) -- return AVERROR_EOF; -+ do { -+ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) -+ return rv == AVERROR(ENOSPC) ? 
AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC -+ if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0) -+ return rv; -+ } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0); - -- return AVERROR(EAGAIN); -- } -- -- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); -+ return 0; - } - - int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) -@@ -702,78 +1093,179 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) - - int ff_v4l2_context_set_format(V4L2Context* ctx) - { -- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); -+ int ret; -+ -+ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); -+ if (ret != 0) -+ return ret; -+ -+ // Check returned size against min size and if smaller have another go -+ // Only worry about plane[0] as this is meant to enforce limits for -+ // encoded streams where we might know a bit more about the shape -+ // than the driver -+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { -+ if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage) -+ return 0; -+ ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size; -+ } -+ else { -+ if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage) -+ return 0; -+ ctx->format.fmt.pix.sizeimage = ctx->min_buf_size; -+ } -+ -+ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); -+ return ret; - } - - void ff_v4l2_context_release(V4L2Context* ctx) - { - int ret; - -- if (!ctx->buffers) -+ if (!ctx->bufrefs) - return; - - ret = v4l2_release_buffers(ctx); - if (ret) - av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name); - -- av_freep(&ctx->buffers); -+ av_freep(&ctx->bufrefs); -+ av_buffer_unref(&ctx->frames_ref); -+ -+ ff_mutex_destroy(&ctx->lock); -+ pthread_cond_destroy(&ctx->cond); - } - --int ff_v4l2_context_init(V4L2Context* ctx) -+ -+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem) - { -- V4L2m2mContext *s = 
ctx_to_m2mctx(ctx); -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); - struct v4l2_requestbuffers req; -- int ret, i; -- -- if (!v4l2_type_supported(ctx)) { -- av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); -- return AVERROR_PATCHWELCOME; -- } -+ int ret; -+ int i; - -- ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); -- if (ret) -- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name); -+ av_assert0(ctx->bufrefs == NULL); - - memset(&req, 0, sizeof(req)); -- req.count = ctx->num_buffers; -- req.memory = V4L2_MEMORY_MMAP; -+ req.count = req_buffers; -+ req.memory = mem; - req.type = ctx->type; -- ret = ioctl(s->fd, VIDIOC_REQBUFS, &req); -- if (ret < 0) { -- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno)); -- return AVERROR(errno); -+ while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) { -+ if (errno != EINTR) { -+ ret = AVERROR(errno); -+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret)); -+ return ret; -+ } - } - - ctx->num_buffers = req.count; -- ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer)); -- if (!ctx->buffers) { -+ ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs)); -+ if (!ctx->bufrefs) { - av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name); -- return AVERROR(ENOMEM); -+ goto fail_release; - } - -- for (i = 0; i < req.count; i++) { -- ctx->buffers[i].context = ctx; -- ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i); -- if (ret < 0) { -+ ctx->wl_master = ff_weak_link_new(ctx); -+ if (!ctx->wl_master) { -+ ret = AVERROR(ENOMEM); -+ goto fail_release; -+ } -+ -+ for (i = 0; i < ctx->num_buffers; i++) { -+ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem); -+ if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret)); -- goto error; -+ goto fail_release; - } - } - - av_log(logger(ctx), 
AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name, - V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat), - req.count, -- v4l2_get_width(&ctx->format), -- v4l2_get_height(&ctx->format), -+ ff_v4l2_get_format_width(&ctx->format), -+ ff_v4l2_get_format_height(&ctx->format), - V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage, - V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline); - - return 0; - --error: -+fail_release: - v4l2_release_buffers(ctx); -+ av_freep(&ctx->bufrefs); -+ return ret; -+} -+ -+int ff_v4l2_context_init(V4L2Context* ctx) -+{ -+ struct v4l2_queryctrl qctrl; -+ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); -+ int ret; -+ -+ // It is not valid to reinit a context without a previous release -+ av_assert0(ctx->bufrefs == NULL); -+ -+ if (!v4l2_type_supported(ctx)) { -+ av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); -+ return AVERROR_PATCHWELCOME; -+ } - -- av_freep(&ctx->buffers); -+ ff_mutex_init(&ctx->lock, NULL); -+ pthread_cond_init(&ctx->cond, NULL); -+ atomic_init(&ctx->q_count, 0); -+ -+ if (s->output_drm) { -+ AVHWFramesContext *hwframes; -+ -+ ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref); -+ if (!ctx->frames_ref) { -+ ret = AVERROR(ENOMEM); -+ goto fail_unlock; -+ } -+ -+ hwframes = (AVHWFramesContext*)ctx->frames_ref->data; -+ hwframes->format = AV_PIX_FMT_DRM_PRIME; -+ hwframes->sw_format = ctx->av_pix_fmt; -+ hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width; -+ hwframes->height = ctx->height != 0 ? 
ctx->height : s->avctx->height; -+ ret = av_hwframe_ctx_init(ctx->frames_ref); -+ if (ret < 0) -+ goto fail_unref_hwframes; -+ } -+ -+ ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); -+ if (ret) { -+ ret = AVERROR(errno); -+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret)); -+ goto fail_unref_hwframes; -+ } -+ -+ memset(&qctrl, 0, sizeof(qctrl)); -+ qctrl.id = V4L2_CID_MIN_BUFFERS_FOR_OUTPUT; -+ if (ioctl(s->fd, VIDIOC_QUERYCTRL, &qctrl) != 0) { -+ ret = AVERROR(errno); -+ if (ret != AVERROR(EINVAL)) { -+ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_QUERCTRL failed: %s\n", ctx->name, av_err2str(ret)); -+ goto fail_unref_hwframes; -+ } -+ // Control unsupported - set default if wanted -+ if (ctx->num_buffers < 2) -+ ctx->num_buffers = 4; -+ } -+ else { -+ if (ctx->num_buffers < 2) -+ ctx->num_buffers = qctrl.minimum + 2; -+ ctx->num_buffers = av_clip(ctx->num_buffers, qctrl.minimum, qctrl.maximum); -+ } -+ -+ ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem); -+ if (ret < 0) -+ goto fail_unref_hwframes; -+ -+ return 0; - -+fail_unref_hwframes: -+ av_buffer_unref(&ctx->frames_ref); -+fail_unlock: -+ ff_mutex_destroy(&ctx->lock); - return ret; - } -diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 22a9532444..108fc05a6f 100644 ---- a/libavcodec/v4l2_context.h -+++ b/libavcodec/v4l2_context.h -@@ -31,6 +31,7 @@ - #include "libavutil/pixfmt.h" - #include "libavutil/frame.h" - #include "libavutil/buffer.h" -+#include "libavutil/thread.h" - #include "v4l2_buffers.h" - - typedef struct V4L2Context { -@@ -70,28 +71,57 @@ typedef struct V4L2Context { - */ - int width, height; - AVRational sample_aspect_ratio; -+ struct v4l2_rect selection; - - /** -- * Indexed array of V4L2Buffers -+ * If the default size of buffer is less than this then try to -+ * set to this. 
- */ -- V4L2Buffer *buffers; -+ uint32_t min_buf_size; -+ -+ /** -+ * Indexed array of pointers to V4L2Buffers -+ */ -+ AVBufferRef **bufrefs; - - /** - * Readonly after init. - */ - int num_buffers; - -+ /** -+ * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF -+ */ -+ enum v4l2_memory buf_mem; -+ - /** - * Whether the stream has been started (VIDIOC_STREAMON has been sent). - */ - int streamon; - -+ /* 1st buffer after stream on */ -+ int first_buf; -+ - /** - * Either no more buffers available or an unrecoverable error was notified - * by the V4L2 kernel driver: once set the context has to be exited. - */ - int done; - -+ int flag_last; -+ -+ /** -+ * If NZ then when Qing frame/pkt use this rather than the -+ * "real" PTS -+ */ -+ uint64_t track_ts; -+ -+ AVBufferRef *frames_ref; -+ atomic_int q_count; -+ struct ff_weak_link_master *wl_master; -+ -+ AVMutex lock; -+ pthread_cond_t cond; - } V4L2Context; - - /** -@@ -147,7 +177,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd); - * @param[inout] pkt The AVPacket to dequeue to. - * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. - */ --int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); -+int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout); - - /** - * Dequeues a buffer from a V4L2Context to an AVFrame. -@@ -156,7 +186,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); - * @param[in] ctx The V4L2Context to dequeue from. - * @param[inout] f The AVFrame to dequeue to. - * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) -+ * - * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. 
-+ * AVERROR(ENOSPC) if no buffer availible to put -+ * the frame in - */ - int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); - -@@ -170,7 +203,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); - * @param[in] pkt A pointer to an AVPacket. - * @return 0 in case of success, a negative error otherwise. - */ --int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); -+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size); - - /** - * Enqueues a buffer to a V4L2Context from an AVFrame -@@ -183,4 +216,28 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); - */ - int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f); - -+/** -+ * Dequeue all buffers on this queue -+ * -+ * Used to recycle output buffers -+ * -+ * @param[in] ctx The V4L2Context to dequeue from. -+ * @param[in] timeout1 A timeout on dequeuing the 1st buffer, -+ * all others have a timeout of zero -+ * @return AVERROR(EAGAIN) if timeout1 non-zero then the return -+ * of the first dequeue operation, 0 otherwise. 
-+ */ -+int ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1); -+ -+/** -+ * Returns the number of buffers currently queued -+ * -+ * @param[in] ctx The V4L2Context to evaluate -+ */ -+static inline int -+ff_v4l2_context_q_count(const V4L2Context* const ctx) -+{ -+ return atomic_load(&ctx->q_count); -+} -+ - #endif // AVCODEC_V4L2_CONTEXT_H -diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index cdfd579810..a919bdc030 100644 ---- a/libavcodec/v4l2_m2m.c -+++ b/libavcodec/v4l2_m2m.c -@@ -35,6 +35,15 @@ - #include "v4l2_context.h" - #include "v4l2_fmt.h" - #include "v4l2_m2m.h" -+#include "v4l2_req_dmabufs.h" -+ -+static void -+xlat_init(xlat_track_t * const x) -+{ -+ memset(x, 0, sizeof(*x)); -+ x->last_pts = AV_NOPTS_VALUE; -+} -+ - - static inline int v4l2_splane_video(struct v4l2_capability *cap) - { -@@ -68,7 +77,9 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) - - s->capture.done = s->output.done = 0; - s->capture.name = "capture"; -+ s->capture.buf_mem = s->db_ctl != NULL ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; - s->output.name = "output"; -+ s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; - atomic_init(&s->refcount, 0); - sem_init(&s->refsync, 0, 0); - -@@ -85,12 +96,14 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) - if (v4l2_mplane_video(&cap)) { - s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; - s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; -+ s->output.format.type = s->output.type; - return 0; - } - - if (v4l2_splane_video(&cap)) { - s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -+ s->output.format.type = s->output.type; - return 0; - } - -@@ -215,13 +228,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) - av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n"); - - /* 2. 
unmap the capture buffers (v4l2 and ffmpeg): -- * we must wait for all references to be released before being allowed -- * to queue new buffers. - */ -- av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n"); -- if (atomic_load(&s->refcount)) -- while(sem_wait(&s->refsync) == -1 && errno == EINTR); -- - ff_v4l2_context_release(&s->capture); - - /* 3. get the new capture format */ -@@ -240,7 +247,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) - - /* 5. complete reinit */ - s->draining = 0; -- s->reinit = 0; - - return 0; - } -@@ -274,7 +280,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *s) - - /* start again now that we know the stream dimensions */ - s->draining = 0; -- s->reinit = 0; - - ret = ff_v4l2_context_get_format(&s->output, 0); - if (ret) { -@@ -328,10 +333,14 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context) - ff_v4l2_context_release(&s->capture); - sem_destroy(&s->refsync); - -- close(s->fd); -+ if (s->fd != -1) -+ close(s->fd); - av_frame_unref(s->frame); - av_frame_free(&s->frame); - av_packet_unref(&s->buf_pkt); -+ av_freep(&s->extdata_data); -+ -+ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n"); - - av_free(s); - } -@@ -344,6 +353,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) - if (!s) - return 0; - -+ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n"); -+ -+ if (s->avctx && av_codec_is_decoder(s->avctx->codec)) -+ av_packet_unref(&s->buf_pkt); -+ - if (s->fd >= 0) { - ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); - if (ret) -@@ -356,7 +370,15 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) - - ff_v4l2_context_release(&s->output); - -+ dmabufs_ctl_unref(&s->db_ctl); -+ close(s->fd); -+ s->fd = -1; -+ - s->self_ref = NULL; -+ // This is only called on avctx close so after this point we don't have that -+ // Crash sooner if we find we are using it (can still log with avctx = NULL) -+ s->avctx = NULL; -+ priv->context = NULL; - av_buffer_unref(&priv->context_ref); 
- - return 0; -@@ -400,35 +422,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv) - return v4l2_configure_contexts(s); - } - --int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s) -+int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps) - { -- *s = av_mallocz(sizeof(V4L2m2mContext)); -- if (!*s) -+ V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext)); -+ -+ *pps = NULL; -+ if (!s) - return AVERROR(ENOMEM); - -- priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext), -+ priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s), - &v4l2_m2m_destroy_context, NULL, 0); - if (!priv->context_ref) { -- av_freep(s); -+ av_free(s); - return AVERROR(ENOMEM); - } - - /* assign the context */ -- priv->context = *s; -- (*s)->priv = priv; -+ priv->context = s; -+ s->priv = priv; - - /* populate it */ -- priv->context->capture.num_buffers = priv->num_capture_buffers; -- priv->context->output.num_buffers = priv->num_output_buffers; -- priv->context->self_ref = priv->context_ref; -- priv->context->fd = -1; -+ s->capture.num_buffers = priv->num_capture_buffers; -+ s->output.num_buffers = priv->num_output_buffers; -+ s->self_ref = priv->context_ref; -+ s->fd = -1; -+ xlat_init(&s->xlat); - - priv->context->frame = av_frame_alloc(); - if (!priv->context->frame) { - av_buffer_unref(&priv->context_ref); -- *s = NULL; /* freed when unreferencing context_ref */ - return AVERROR(ENOMEM); - } - -+ *pps = s; - return 0; - } -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index b67b216331..ded1478a49 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -30,6 +30,7 @@ - #include - - #include "libavcodec/avcodec.h" -+#include "libavutil/pixfmt.h" - #include "v4l2_context.h" - - #define container_of(ptr, type, member) ({ \ -@@ -38,7 +39,39 @@ - - #define V4L_M2M_DEFAULT_OPTS \ - { "num_output_buffers", "Number of buffers in the output context",\ -- OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 
= 16 }, 6, INT_MAX, FLAGS } -+ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS } -+ -+#define FF_V4L2_M2M_TRACK_SIZE 128 -+typedef struct V4L2m2mTrackEl { -+ int discard; // If we see this buffer its been flushed, so discard -+ int pending; -+ int pkt_size; -+ int64_t pts; -+ int64_t dts; -+ int64_t reordered_opaque; -+ int64_t pkt_pos; -+ int64_t pkt_duration; -+ int64_t track_pts; -+} V4L2m2mTrackEl; -+ -+typedef struct pts_stats_s -+{ -+ void * logctx; -+ const char * name; // For debug -+ unsigned int last_count; -+ unsigned int last_interval; -+ int64_t last_pts; -+ int64_t guess; -+} pts_stats_t; -+ -+typedef struct xlat_track_s { -+ unsigned int track_no; -+ int64_t last_pts; // Last valid PTS decoded -+ int64_t last_opaque; -+ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; -+} xlat_track_t; -+ -+struct dmabufs_ctl; - - typedef struct V4L2m2mContext { - char devname[PATH_MAX]; -@@ -52,10 +85,10 @@ typedef struct V4L2m2mContext { - AVCodecContext *avctx; - sem_t refsync; - atomic_uint refcount; -- int reinit; - - /* null frame/packet received */ - int draining; -+ int running; - AVPacket buf_pkt; - - /* Reference to a frame. 
Only used during encoding */ -@@ -66,6 +99,35 @@ typedef struct V4L2m2mContext { - - /* reference back to V4L2m2mPriv */ - void *priv; -+ -+ AVBufferRef *device_ref; -+ -+ /* generate DRM frames */ -+ int output_drm; -+ -+ /* input frames are drmprime */ -+ int input_drm; -+ -+ /* Frame tracking */ -+ xlat_track_t xlat; -+ -+ pts_stats_t pts_stat; -+ -+ /* req pkt */ -+ int req_pkt; -+ -+ /* Ext data sent */ -+ int extdata_sent; -+ /* Ext data sent in packet - overrides ctx */ -+ void * extdata_data; -+ size_t extdata_size; -+ -+#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 -+#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2 -+ /* Quirks */ -+ unsigned int quirks; -+ -+ struct dmabufs_ctl * db_ctl; - } V4L2m2mContext; - - typedef struct V4L2m2mPriv { -@@ -76,6 +138,8 @@ typedef struct V4L2m2mPriv { - - int num_output_buffers; - int num_capture_buffers; -+ const char * dmabuf_alloc; -+ enum AVPixelFormat pix_fmt; - } V4L2m2mPriv; - - /** -@@ -129,4 +193,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); - */ - int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); - -+ -+static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; -+} -+ -+static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; -+} -+ -+static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; -+} -+ -+static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx) -+{ -+ return ctx->flag_last; -+} -+ -+ - #endif /* AVCODEC_V4L2_M2M_H */ -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index ab07c0a24a..2bd113facb 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -23,6 +23,10 @@ - - #include - #include -+ -+#include "libavutil/avassert.h" -+#include "libavutil/hwcontext.h" -+#include "libavutil/hwcontext_drm.h" - #include "libavutil/pixfmt.h" - #include "libavutil/pixdesc.h" - #include "libavutil/opt.h" -@@ -30,75 +34,267 @@ - #include "libavcodec/decode.h" - #include "libavcodec/internal.h" - -+#include "libavcodec/hwaccels.h" -+#include "libavcodec/internal.h" -+#include "libavcodec/hwconfig.h" -+ - #include "v4l2_context.h" - #include "v4l2_m2m.h" - #include "v4l2_fmt.h" -+#include "v4l2_req_dmabufs.h" - --static int v4l2_try_start(AVCodecContext *avctx) -+// Pick 64 for max last count - that is >1sec at 60fps -+#define STATS_LAST_COUNT_MAX 64 -+#define STATS_INTERVAL_MAX (1 << 30) -+ -+#ifndef FF_API_BUFFER_SIZE_T -+#define FF_API_BUFFER_SIZE_T 1 -+#endif -+ -+#define DUMP_FAILED_EXTRADATA 0 -+ -+#if DUMP_FAILED_EXTRADATA -+static inline char hex1(unsigned int x) - { -- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; -- V4L2Context *const capture = &s->capture; -- V4L2Context *const output = &s->output; -- struct v4l2_selection selection = { 0 }; -- int ret; -+ x &= 0xf; -+ return x <= 9 ? '0' + x : 'a' + x - 10; -+} - -- /* 1. 
start the output process */ -- if (!output->streamon) { -- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON); -- if (ret < 0) { -- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n"); -- return ret; -- } -+static inline char * hex2(char * s, unsigned int x) -+{ -+ *s++ = hex1(x >> 4); -+ *s++ = hex1(x); -+ return s; -+} -+ -+static inline char * hex4(char * s, unsigned int x) -+{ -+ s = hex2(s, x >> 8); -+ s = hex2(s, x); -+ return s; -+} -+ -+static inline char * dash2(char * s) -+{ -+ *s++ = '-'; -+ *s++ = '-'; -+ return s; -+} -+ -+static void -+data16(char * s, const unsigned int offset, const uint8_t * m, const size_t len) -+{ -+ size_t i; -+ s = hex4(s, offset); -+ m += offset; -+ for (i = 0; i != 8; ++i) { -+ *s++ = ' '; -+ s = len > i + offset ? hex2(s, *m++) : dash2(s); - } -+ *s++ = ' '; -+ *s++ = ':'; -+ for (; i != 16; ++i) { -+ *s++ = ' '; -+ s = len > i + offset ? hex2(s, *m++) : dash2(s); -+ } -+ *s++ = 0; -+} - -- if (capture->streamon) -- return 0; -+static void -+log_dump(void * logctx, int lvl, const void * const data, const size_t len) -+{ -+ size_t i; -+ for (i = 0; i < len; i += 16) { -+ char buf[80]; -+ data16(buf, i, data, len); -+ av_log(logctx, lvl, "%s\n", buf); -+ } -+} -+#endif - -- /* 2. 
get the capture format */ -- capture->format.type = capture->type; -- ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format); -- if (ret) { -- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n"); -- return ret; -+static int64_t pts_stats_guess(const pts_stats_t * const stats) -+{ -+ if (stats->last_count <= 1) -+ return stats->last_pts; -+ if (stats->last_pts == AV_NOPTS_VALUE || -+ stats->last_interval == 0 || -+ stats->last_count >= STATS_LAST_COUNT_MAX) -+ return AV_NOPTS_VALUE; -+ return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval; -+} -+ -+static void pts_stats_add(pts_stats_t * const stats, int64_t pts) -+{ -+ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { -+ if (stats->last_count < STATS_LAST_COUNT_MAX) -+ ++stats->last_count; -+ return; - } - -- /* 2.1 update the AVCodecContext */ -- avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); -- capture->av_pix_fmt = avctx->pix_fmt; -+ if (stats->last_pts != AV_NOPTS_VALUE) { -+ const int64_t interval = pts - stats->last_pts; - -- /* 3. 
set the crop parameters */ -- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -- selection.r.height = avctx->coded_height; -- selection.r.width = avctx->coded_width; -- ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection); -- if (!ret) { -- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); -- if (ret) { -- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n"); -- } else { -- av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height); -- /* update the size of the resulting frame */ -- capture->height = selection.r.height; -- capture->width = selection.r.width; -+ if (interval < 0 || interval >= STATS_INTERVAL_MAX || -+ stats->last_count >= STATS_LAST_COUNT_MAX) { -+ if (stats->last_interval != 0) -+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", -+ __func__, stats->name, interval, stats->last_count); -+ stats->last_interval = 0; -+ } -+ else { -+ const int64_t frame_time = interval / (int64_t)stats->last_count; -+ -+ if (frame_time != stats->last_interval) -+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", -+ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); -+ stats->last_interval = frame_time; - } - } - -- /* 4. 
init the capture context now that we have the capture format */ -- if (!capture->buffers) { -- ret = ff_v4l2_context_init(capture); -- if (ret) { -- av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); -- return AVERROR(ENOMEM); -+ stats->last_pts = pts; -+ stats->last_count = 1; -+} -+ -+static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) -+{ -+ *stats = (pts_stats_t){ -+ .logctx = logctx, -+ .name = name, -+ .last_count = 1, -+ .last_interval = 0, -+ .last_pts = AV_NOPTS_VALUE -+ }; -+} -+ -+// If abdata == NULL then this just counts space required -+// Unpacks avcC if detected -+static int -+h264_xd_copy(const uint8_t * const extradata, const int extrasize, uint8_t * abdata) -+{ -+ const uint8_t * const xdend = extradata + extrasize; -+ const uint8_t * p = extradata; -+ uint8_t * d = abdata; -+ unsigned int n; -+ unsigned int len; -+ const unsigned int hdrlen = 4; -+ unsigned int need_pps = 1; -+ -+ if (extrasize < 8) -+ return AVERROR(EINVAL); -+ -+ if (p[0] == 0 && p[1] == 0) { -+ // Assume a couple of leading zeros are good enough to indicate NAL -+ if (abdata) -+ memcpy(d, p, extrasize); -+ return extrasize; -+ } -+ -+ // avcC starts with a 1 -+ if (p[0] != 1) -+ return AVERROR(EINVAL); -+ -+ p += 5; -+ n = *p++ & 0x1f; -+ -+doxps: -+ while (n--) { -+ if (xdend - p < 2) -+ return AVERROR(EINVAL); -+ len = (p[0] << 8) | p[1]; -+ p += 2; -+ if (xdend - p < (ptrdiff_t)len) -+ return AVERROR(EINVAL); -+ if (abdata) { -+ d[0] = 0; -+ d[1] = 0; -+ d[2] = 0; -+ d[3] = 1; -+ memcpy(d + 4, p, len); - } -+ d += len + hdrlen; -+ p += len; -+ } -+ if (need_pps) { -+ need_pps = 0; -+ if (p >= xdend) -+ return AVERROR(EINVAL); -+ n = *p++; -+ goto doxps; - } - -- /* 5. 
start the capture process */ -- ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); -- if (ret) { -- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n"); -+ return d - abdata; -+} -+ -+static int -+copy_extradata(AVCodecContext * const avctx, -+ const void * const src_data, const int src_len, -+ void ** const pdst_data, size_t * const pdst_len) -+{ -+ int len; -+ -+ *pdst_len = 0; -+ av_freep(pdst_data); -+ -+ if (avctx->codec_id == AV_CODEC_ID_H264) -+ len = h264_xd_copy(src_data, src_len, NULL); -+ else -+ len = src_len < 0 ? AVERROR(EINVAL) : src_len; -+ -+ // Zero length is OK but we want to stop - -ve is error val -+ if (len <= 0) -+ return len; -+ -+ if ((*pdst_data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL) -+ return AVERROR(ENOMEM); -+ -+ if (avctx->codec_id == AV_CODEC_ID_H264) -+ h264_xd_copy(src_data, src_len, *pdst_data); -+ else -+ memcpy(*pdst_data, src_data, len); -+ *pdst_len = len; -+ -+ return 0; -+} -+ -+ -+ -+static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) -+{ -+ int ret; -+ struct v4l2_decoder_cmd cmd = { -+ .cmd = V4L2_DEC_CMD_START, -+ .flags = 0, -+ }; -+ -+ if (s->output.streamon) -+ return 0; -+ -+ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); -+ if (ret != 0) { -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret)); - return ret; - } - -+ // STREAMON should do implicit START so this just for those that don't. -+ // It is optional so don't worry if it fails -+ if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) { -+ ret = AVERROR(errno); -+ av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret)); -+ } -+ else { -+ av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n"); -+ } -+ return 0; -+} -+ -+static int v4l2_try_start(AVCodecContext *avctx) -+{ -+ V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context; -+ int ret; -+ -+ /* 1. 
start the output process */ -+ if ((ret = check_output_streamon(avctx, s)) != 0) -+ return ret; - return 0; - } - -@@ -133,58 +329,548 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) - return 0; - } - --static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) -+static void -+set_best_effort_pts(AVCodecContext *const avctx, -+ pts_stats_t * const ps, -+ AVFrame *const frame) -+{ -+ pts_stats_add(ps, frame->pts); -+ -+#if FF_API_PKT_PTS -+FF_DISABLE_DEPRECATION_WARNINGS -+ frame->pkt_pts = frame->pts; -+FF_ENABLE_DEPRECATION_WARNINGS -+#endif -+ frame->best_effort_timestamp = pts_stats_guess(ps); -+ // If we can't guess from just PTS - try DTS -+ if (frame->best_effort_timestamp == AV_NOPTS_VALUE) -+ frame->best_effort_timestamp = frame->pkt_dts; -+ -+ // We can't emulate what s/w does in a useful manner and using the -+ // "correct" answer seems to just confuse things. -+ frame->pkt_dts = frame->pts; -+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", -+ frame->pts, frame->best_effort_timestamp, frame->pkt_dts); -+} -+ -+static void -+xlat_flush(xlat_track_t * const x) -+{ -+ unsigned int i; -+ // Do not reset track_no - this ensures that any frames left in the decoder -+ // that turn up later get discarded. 
-+ -+ x->last_pts = AV_NOPTS_VALUE; -+ x->last_opaque = 0; -+ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) { -+ x->track_els[i].pending = 0; -+ x->track_els[i].discard = 1; -+ } -+} -+ -+static void -+xlat_init(xlat_track_t * const x) -+{ -+ memset(x, 0, sizeof(*x)); -+ xlat_flush(x); -+} -+ -+static int -+xlat_pending(const xlat_track_t * const x) -+{ -+ unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; -+ int i; -+ const int64_t now = x->last_pts; -+ -+ for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) { -+ const V4L2m2mTrackEl * const t = x->track_els + n; -+ -+ // Discard only set on never-set or flushed entries -+ // So if we get here we've never successfully decoded a frame so allow -+ // more frames into the buffer before stalling -+ if (t->discard) -+ return i - 16; -+ -+ // If we've got this frame out then everything before this point -+ // must have entered the decoder -+ if (!t->pending) -+ break; -+ -+ // If we've never seen a pts all we can do is count frames -+ if (now == AV_NOPTS_VALUE) -+ continue; -+ -+ if (t->dts != AV_NOPTS_VALUE && now >= t->dts) -+ break; -+ } -+ -+ return i; -+} -+ -+static inline int stream_started(const V4L2m2mContext * const s) { -+ return s->output.streamon; -+} -+ -+#define NQ_OK 0 -+#define NQ_Q_FULL 1 -+#define NQ_SRC_EMPTY 2 -+#define NQ_NONE 3 -+#define NQ_DRAINING 4 -+#define NQ_DEAD 5 -+ -+#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) -+#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE) -+ -+// do_not_get If true then no new packet will be got but status will -+// be set appropriately -+ -+// AVERROR_EOF Flushing an already flushed stream -+// -ve Error (all errors except EOF are unexpected) -+// NQ_OK (0) OK -+// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now) -+// NQ_SRC_EMPTY Src empty (do not retry) -+// NQ_NONE Enqueue not attempted -+// NQ_DRAINING At EOS, dQ dest until EOS there too -+// 
NQ_DEAD Not running (do not retry, do not attempt capture dQ) -+ -+static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get) - { -- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; -- V4L2Context *const capture = &s->capture; -- V4L2Context *const output = &s->output; - int ret; - -- if (!s->buf_pkt.size) { -- ret = ff_decode_get_packet(avctx, &s->buf_pkt); -- if (ret < 0 && ret != AVERROR_EOF) -+ // If we don't already have a coded packet - get a new one -+ // We will already have a coded pkt if the output Q was full last time we -+ // tried to Q it -+ if (!s->buf_pkt.size && !do_not_get) { -+ unsigned int i; -+ -+ for (i = 0; i < 256; ++i) { -+ uint8_t * side_data; -+#if FF_API_BUFFER_SIZE_T -+ int side_size; -+#else -+ size_t side_size; -+#endif -+ ret = ff_decode_get_packet(avctx, &s->buf_pkt); -+ if (ret != 0) -+ break; -+ -+ // New extradata is the only side-data we undertand -+ side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); -+ if (side_data) { -+ av_log(avctx, AV_LOG_DEBUG, "New extradata\n"); -+ if ((ret = copy_extradata(avctx, side_data, (int)side_size, &s->extdata_data, &s->extdata_size)) < 0) -+ av_log(avctx, AV_LOG_WARNING, "Failed to copy new extra data: %s\n", av_err2str(ret)); -+ s->extdata_sent = 0; -+ } -+ -+ if (s->buf_pkt.size != 0) -+ break; -+ -+ if (s->buf_pkt.side_data_elems == 0) { -+ av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n"); -+ ret = AVERROR_EOF; -+ break; -+ } -+ -+ // Retry a side-data only pkt -+ } -+ // If i >= 256 something has gone wrong -+ if (i >= 256) { -+ av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n"); -+ return AVERROR(EIO); -+ } -+ -+ if (ret == AVERROR(EAGAIN)) { -+ if (!stream_started(s)) { -+ av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__); -+ return NQ_DEAD; -+ } -+ return NQ_SRC_EMPTY; -+ } -+ -+ if (ret == AVERROR_EOF) { 
-+ // EOF - enter drain mode -+ av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n", -+ ret, s->buf_pkt.size, stream_started(s), s->draining); -+ if (!stream_started(s)) { -+ av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n"); -+ s->draining = 1; -+ s->capture.done = 1; -+ return AVERROR_EOF; -+ } -+ -+ if (!s->draining) { -+ // Calling enqueue with an empty pkt starts drain -+ av_assert0(s->buf_pkt.size == 0); -+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); -+ if (ret) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret); -+ return ret; -+ } -+ } -+ return NQ_DRAINING; -+ } -+ -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); - return ret; -+ } - } - -- if (s->draining) -- goto dequeue; -+ if (s->draining) { -+ if (s->buf_pkt.size) { -+ av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n"); -+ av_packet_unref(&s->buf_pkt); -+ } -+ return NQ_DRAINING; -+ } - -- ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt); -- if (ret < 0 && ret != AVERROR(EAGAIN)) -- goto fail; -+ if (!s->buf_pkt.size) -+ return NQ_NONE; - -- /* if EAGAIN don't unref packet and try to enqueue in the next iteration */ -- if (ret != AVERROR(EAGAIN)) -+ if ((ret = check_output_streamon(avctx, s)) != 0) -+ return ret; -+ -+ if (s->extdata_sent) -+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); -+ else -+ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); -+ -+ if (ret == AVERROR(EAGAIN)) { -+ // Out of input buffers - keep packet -+ ret = NQ_Q_FULL; -+ } -+ else { -+ // In all other cases we are done with this packet - av_packet_unref(&s->buf_pkt); -+ s->extdata_sent = 1; - -- if (!s->draining) { -- ret = v4l2_try_start(avctx); - if (ret) { -- /* cant recover */ -- if (ret != AVERROR(ENOMEM)) -- ret = 0; -- goto fail; -+ av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: 
err=%d\n", ret); -+ return ret; -+ } -+ } -+ -+ // Start if we haven't -+ { -+ const int ret2 = v4l2_try_start(avctx); -+ if (ret2) { -+ av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2); -+ ret = (ret2 == AVERROR(ENOMEM)) ? ret2 : NQ_DEAD; - } - } - --dequeue: -- return ff_v4l2_context_dequeue_frame(capture, frame, -1); --fail: -- av_packet_unref(&s->buf_pkt); - return ret; - } - -+static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) -+{ -+ int rv = 0; -+ -+ ff_mutex_lock(&ctx->lock); -+ -+ while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) { -+ if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) { -+ rv = AVERROR(errno); -+ av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv)); -+ break; -+ } -+ } -+ -+ ff_mutex_unlock(&ctx->lock); -+ return rv; -+} -+ -+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) -+{ -+ V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; -+ int src_rv = NQ_OK; -+ int dst_rv = 1; // Non-zero (done), non-negative (error) number -+ unsigned int i = 0; -+ -+ do { -+ const int pending = xlat_pending(&s->xlat); -+ const int prefer_dq = (pending > 4); -+ const int last_src_rv = src_rv; -+ -+ av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt); -+ -+ // Enqueue another pkt for decode if -+ // (a) We don't have a lot of stuff in the buffer already OR -+ // (b) ... we (think we) do but we've failed to get a frame already OR -+ // (c) We've dequeued a lot of frames without asking for input -+ src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2)); -+ -+ // If we got a frame last time or we've already tried to get a frame and -+ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) -+ // indicating that we want more input. 
-+ // This should mean that once decode starts we enter a stable state where -+ // we alternately ask for input and produce output -+ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) -+ break; -+ -+ if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) { -+ av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n"); -+ break; -+ } -+ -+ // Try to get a new frame if -+ // (a) we haven't already got one AND -+ // (b) enqueue returned a status indicating that decode should be attempted -+ if (dst_rv != 0 && TRY_DQ(src_rv)) { -+ // Pick a timeout depending on state -+ // The pending count isn't completely reliable so it is good enough -+ // hint that we want a frame but not good enough to require it in -+ // all cases; however if it has got > 31 that exceeds its margin of -+ // error so require a frame to prevent ridiculous levels of latency -+ const int t = -+ src_rv == NQ_Q_FULL ? -1 : -+ src_rv == NQ_DRAINING ? 300 : -+ prefer_dq ? (s->running && pending > 31 ? 100 : 5) : 0; -+ -+ // Dequeue frame will unref any previous contents of frame -+ // if it returns success so we don't need an explicit unref -+ // when discarding -+ // This returns AVERROR(EAGAIN) on timeout or if -+ // there is room in the input Q and timeout == -1 -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); -+ -+ // Failure due to no buffer in Q? 
-+ if (dst_rv == AVERROR(ENOSPC)) { -+ // Wait & retry -+ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { -+ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); -+ } -+ } -+ -+ if (dst_rv == 0) { -+ set_best_effort_pts(avctx, &s->pts_stat, frame); -+ if (!s->running) { -+ s->running = 1; -+ av_log(avctx, AV_LOG_VERBOSE, "Decode running\n"); -+ } -+ } -+ -+ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { -+ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); -+ dst_rv = AVERROR_EOF; -+ s->capture.done = 1; -+ } -+ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) -+ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", -+ s->draining, s->capture.done); -+ else if (dst_rv && dst_rv != AVERROR(EAGAIN)) -+ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", -+ s->draining, s->capture.done, dst_rv); -+ } -+ -+ ++i; -+ if (i >= 256) { -+ av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i); -+ src_rv = AVERROR(EIO); -+ } -+ -+ // Continue trying to enqueue packets if either -+ // (a) we succeeded last time OR -+ // (b) we didn't ret a frame and we can retry the input -+ } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv))); -+ -+ // Ensure that the frame contains nothing if we aren't returning a frame -+ // (might happen when discarding) -+ if (dst_rv) -+ av_frame_unref(frame); -+ -+ // If we got a frame this time ask for a pkt next time -+ s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0; -+ -+#if 0 -+ if (dst_rv == 0) -+ { -+ static int z = 0; -+ if (++z > 50) { -+ av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n"); -+ ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); -+ return -1; -+ } -+ } -+#endif -+ -+ return dst_rv == 0 ? 0 : -+ src_rv < 0 ? src_rv : -+ dst_rv < 0 ? 
dst_rv : -+ AVERROR(EAGAIN); -+} -+ -+#if 0 -+#include -+static int64_t us_time(void) -+{ -+ struct timespec ts; -+ clock_gettime(CLOCK_MONOTONIC, &ts); -+ return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000; -+} -+ -+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) -+{ -+ int ret; -+ const int64_t now = us_time(); -+ int64_t done; -+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); -+ ret = v4l2_receive_frame2(avctx, frame); -+ done = us_time(); -+ av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret); -+ return ret; -+} -+#endif -+ -+static int -+check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) -+{ -+ unsigned int i; -+ const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format); -+ const uint32_t w = avctx->coded_width; -+ const uint32_t h = avctx->coded_height; -+ -+ if (w == 0 || h == 0 || fcc == 0) { -+ av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc)); -+ return 0; -+ } -+ if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) { -+ av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc)); -+ return 0; -+ } -+ -+ for (i = 0;; ++i) { -+ struct v4l2_frmsizeenum fs = { -+ .index = i, -+ .pixel_format = fcc, -+ }; -+ -+ while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) { -+ const int err = AVERROR(errno); -+ if (err == AVERROR(EINTR)) -+ continue; -+ if (i == 0 && err == AVERROR(ENOTTY)) { -+ av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n"); -+ return 0; -+ } -+ if (err != AVERROR(EINVAL)) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err)); -+ return err; -+ } -+ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n", -+ w, h, av_fourcc2str(fcc), i); -+ return err; -+ } -+ -+ switch (fs.type) { -+ case V4L2_FRMSIZE_TYPE_DISCRETE: -+ av_log(avctx, AV_LOG_TRACE, "%s[%d]: 
Discrete: %dx%d\n", __func__, i, -+ fs.discrete.width,fs.discrete.height); -+ if (w == fs.discrete.width && h == fs.discrete.height) -+ return 0; -+ break; -+ case V4L2_FRMSIZE_TYPE_STEPWISE: -+ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, -+ fs.stepwise.min_width, fs.stepwise.min_height, -+ fs.stepwise.max_width, fs.stepwise.max_height, -+ fs.stepwise.step_width,fs.stepwise.step_height); -+ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && -+ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height && -+ (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 && -+ (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0) -+ return 0; -+ break; -+ case V4L2_FRMSIZE_TYPE_CONTINUOUS: -+ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, -+ fs.stepwise.min_width, fs.stepwise.min_height, -+ fs.stepwise.max_width, fs.stepwise.max_height, -+ fs.stepwise.step_width,fs.stepwise.step_height); -+ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && -+ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height) -+ return 0; -+ break; -+ default: -+ av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d", fs.type); -+ return AVERROR(EINVAL); -+ } -+ } -+} -+ -+static int -+get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) -+{ -+ struct v4l2_capability cap; -+ -+ memset(&cap, 0, sizeof(cap)); -+ while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) { -+ int err = errno; -+ if (err == EINTR) -+ continue; -+ av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err)); -+ return AVERROR(err); -+ } -+ -+ // Could be made table driven if we have a few more but right now there -+ // seems no point -+ -+ // Meson (amlogic) always gives a resolution changed event after output -+ // streamon and userspace must (re)allocate capture buffers and streamon -+ // capture to clear the event even if the capture buffers 
were the right -+ // size in the first place. -+ if (strcmp(cap.driver, "meson-vdec") == 0) -+ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN; -+ -+ av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks); -+ return 0; -+} -+ -+// This heuristic is for H264 but use for everything -+static uint32_t max_coded_size(const AVCodecContext * const avctx) -+{ -+ uint32_t wxh = avctx->coded_width * avctx->coded_height; -+ uint32_t size; -+ -+ size = wxh * 3 / 2; -+ // H.264 Annex A table A-1 gives minCR which is either 2 or 4 -+ // unfortunately that doesn't yield an actually useful limit -+ // and it should be noted that frame 0 is special cased to allow -+ // a bigger number which really isn't helpful for us. So just pick -+ // frame_size / 2 -+ size /= 2; -+ // Add 64k to allow for any overheads and/or encoder hopefulness -+ // with small WxH -+ return size + (1 << 16); -+} -+ - static av_cold int v4l2_decode_init(AVCodecContext *avctx) - { - V4L2Context *capture, *output; - V4L2m2mContext *s; - V4L2m2mPriv *priv = avctx->priv_data; -+ int gf_pix_fmt; - int ret; - -+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); -+ -+ if (avctx->codec_id == AV_CODEC_ID_H264) { -+ if (avctx->ticks_per_frame == 1) { -+ if(avctx->time_base.den < INT_MAX/2) { -+ avctx->time_base.den *= 2; -+ } else -+ avctx->time_base.num /= 2; -+ } -+ avctx->ticks_per_frame = 2; -+ } -+ -+ av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level); - ret = ff_v4l2_m2m_create_context(priv, &s); - if (ret < 0) - return ret; - -+ xlat_init(&s->xlat); -+ pts_stats_init(&s->pts_stat, avctx, "decoder"); -+ - capture = &s->capture; - output = &s->output; - -@@ -192,14 +878,65 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - * by the v4l2 driver; this event will trigger a full pipeline reconfig and - * the proper values will be retrieved from the kernel driver. 
- */ -- output->height = capture->height = avctx->coded_height; -- output->width = capture->width = avctx->coded_width; -+// output->height = capture->height = avctx->coded_height; -+// output->width = capture->width = avctx->coded_width; -+ output->height = capture->height = 0; -+ output->width = capture->width = 0; - - output->av_codec_id = avctx->codec_id; - output->av_pix_fmt = AV_PIX_FMT_NONE; -+ output->min_buf_size = max_coded_size(avctx); - - capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; - capture->av_pix_fmt = avctx->pix_fmt; -+ capture->min_buf_size = 0; -+ -+ /* the client requests the codec to generate DRM frames: -+ * - data[0] will therefore point to the returned AVDRMFrameDescriptor -+ * check the ff_v4l2_buffer_to_avframe conversion function. -+ * - the DRM frame format is passed in the DRM frame descriptor layer. -+ * check the v4l2_get_drm_frame function. -+ */ -+ -+ avctx->sw_pix_fmt = avctx->pix_fmt; -+ gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); -+ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", -+ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), -+ avctx->coded_width, avctx->coded_height, -+ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); -+ -+ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { -+ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; -+ s->output_drm = 1; -+ } -+ else { -+ capture->av_pix_fmt = gf_pix_fmt; -+ s->output_drm = 0; -+ } -+ -+ s->db_ctl = NULL; -+ if (priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) { -+ if (strcmp(priv->dmabuf_alloc, "cma") == 0) -+ s->db_ctl = dmabufs_ctl_new(); -+ else { -+ av_log(avctx, AV_LOG_ERROR, "Unknown dmabuf alloc method: '%s'\n", priv->dmabuf_alloc); -+ return AVERROR(EINVAL); -+ } -+ if (!s->db_ctl) { -+ av_log(avctx, AV_LOG_ERROR, "Can't open dmabuf provider '%s'\n", priv->dmabuf_alloc); -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ s->device_ref = 
av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); -+ if (!s->device_ref) { -+ ret = AVERROR(ENOMEM); -+ return ret; -+ } -+ -+ ret = av_hwdevice_ctx_init(s->device_ref); -+ if (ret < 0) -+ return ret; - - s->avctx = avctx; - ret = ff_v4l2_m2m_codec_init(priv); -@@ -208,12 +945,84 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - return ret; - } - -- return v4l2_prepare_decoder(s); -+ if (avctx->extradata && -+ (ret = copy_extradata(avctx, avctx->extradata, avctx->extradata_size, &s->extdata_data, &s->extdata_size)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to copy extradata from context: %s\n", av_err2str(ret)); -+#if DUMP_FAILED_EXTRADATA -+ log_dump(avctx, AV_LOG_INFO, avctx->extradata, avctx->extradata_size); -+#endif -+ return ret; -+ } -+ -+ if ((ret = v4l2_prepare_decoder(s)) < 0) -+ return ret; -+ -+ if ((ret = get_quirks(avctx, s)) != 0) -+ return ret; -+ -+ if ((ret = check_size(avctx, s)) != 0) -+ return ret; -+ -+ return 0; - } - - static av_cold int v4l2_decode_close(AVCodecContext *avctx) - { -- return ff_v4l2_m2m_codec_end(avctx->priv_data); -+ int rv; -+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); -+ rv = ff_v4l2_m2m_codec_end(avctx->priv_data); -+ av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv); -+ return rv; -+} -+ -+static void v4l2_decode_flush(AVCodecContext *avctx) -+{ -+ // An alternatve and more drastic form of flush is to simply do this: -+ // v4l2_decode_close(avctx); -+ // v4l2_decode_init(avctx); -+ // The downside is that this keeps a decoder open until all the frames -+ // associated with it have been returned. This is a bit wasteful on -+ // possibly limited h/w resources and fails on a Pi for this reason unless -+ // more GPU mem is allocated than is the default. 
-+ -+ V4L2m2mPriv * const priv = avctx->priv_data; -+ V4L2m2mContext * const s = priv->context; -+ V4L2Context * const output = &s->output; -+ V4L2Context * const capture = &s->capture; -+ -+ av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); -+ -+ // Reflushing everything is benign, quick and avoids having to worry about -+ // states like EOS processing so don't try to optimize out (having got it -+ // wrong once) -+ -+ ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); -+ -+ // Clear any buffered input packet -+ av_packet_unref(&s->buf_pkt); -+ -+ // Clear a pending EOS -+ if (ff_v4l2_ctx_eos(capture)) { -+ // Arguably we could delay this but this is easy and doesn't require -+ // thought or extra vars -+ ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF); -+ ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); -+ } -+ -+ // V4L2 makes no guarantees about whether decoded frames are flushed or not -+ // so mark all frames we are tracking to be discarded if they appear -+ xlat_flush(&s->xlat); -+ -+ // resend extradata -+ s->extdata_sent = 0; -+ // clear status vars -+ s->running = 0; -+ s->draining = 0; -+ output->done = 0; -+ capture->done = 0; -+ -+ // Stream on will occur when we actually submit a new frame -+ av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__); - } - - #define OFFSET(x) offsetof(V4L2m2mPriv, x) -@@ -222,10 +1031,17 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) - static const AVOption options[] = { - V4L_M2M_DEFAULT_OPTS, - { "num_capture_buffers", "Number of buffers in the capture context", -- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 20, INT_MAX, FLAGS }, -+ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS }, -+ { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, -+ { "dmabuf_alloc", "Dmabuf alloc method", OFFSET(dmabuf_alloc), 
AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, - { NULL}, - }; - -+static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { -+ HW_CONFIG_INTERNAL(DRM_PRIME), -+ NULL -+}; -+ - #define M2MDEC_CLASS(NAME) \ - static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ - .class_name = #NAME "_v4l2m2m_decoder", \ -@@ -246,9 +1062,15 @@ static const AVOption options[] = { - .init = v4l2_decode_init, \ - .receive_frame = v4l2_receive_frame, \ - .close = v4l2_decode_close, \ -+ .flush = v4l2_decode_flush, \ - .bsfs = bsf_name, \ - .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ - .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ -+ .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ -+ AV_PIX_FMT_NV12, \ -+ AV_PIX_FMT_YUV420P, \ -+ AV_PIX_FMT_NONE}, \ -+ .hw_configs = v4l2_m2m_hw_configs, \ - .wrapper_name = "v4l2m2m", \ - } - -diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c -index f644b50133..6472b56030 100644 ---- a/libavcodec/v4l2_m2m_enc.c -+++ b/libavcodec/v4l2_m2m_enc.c -@@ -24,6 +24,8 @@ - #include - #include - #include -+#include -+ - #include "encode.h" - #include "libavcodec/avcodec.h" - #include "libavcodec/internal.h" -@@ -38,6 +40,34 @@ - #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x - #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x - -+// P030 should be defined in drm_fourcc.h and hopefully will be sometime -+// in the future but until then... 
-+#ifndef DRM_FORMAT_P030 -+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') -+#endif -+ -+#ifndef DRM_FORMAT_NV15 -+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') -+#endif -+ -+#ifndef DRM_FORMAT_NV20 -+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') -+#endif -+ -+#ifndef V4L2_CID_CODEC_BASE -+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE -+#endif -+ -+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined -+// in videodev2.h hopefully will be sometime in the future but until then... -+#ifndef V4L2_PIX_FMT_NV12_10_COL128 -+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') -+#endif -+ -+#ifndef V4L2_PIX_FMT_NV12_COL128 -+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ -+#endif -+ - static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den) - { - struct v4l2_streamparm parm = { 0 }; -@@ -148,15 +178,14 @@ static inline int v4l2_mpeg4_profile_from_ff(int p) - static int v4l2_check_b_frame_support(V4L2m2mContext *s) - { - if (s->avctx->max_b_frames) -- av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n"); -+ av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames); - -- v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0); -+ v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1); - v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0); - if (s->avctx->max_b_frames == 0) - return 0; - - avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding"); -- - return AVERROR_PATCHWELCOME; - } - -@@ -271,17 +300,208 @@ static int v4l2_prepare_encoder(V4L2m2mContext *s) - return 0; - } - -+static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame) -+{ -+ const AVDRMFrameDescriptor *const src = (const 
AVDRMFrameDescriptor *)frame->data[0]; -+ -+ const uint32_t drm_fmt = src->layers[0].format; -+ // Treat INVALID as LINEAR -+ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ? -+ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier; -+ uint32_t pix_fmt = 0; -+ uint32_t w = 0; -+ uint32_t h = 0; -+ uint32_t bpl = src->layers[0].planes[0].pitch; -+ -+ // We really don't expect multiple layers -+ // All formats that we currently cope with are single object -+ -+ if (src->nb_layers != 1 || src->nb_objects != 1) -+ return AVERROR(EINVAL); -+ -+ switch (drm_fmt) { -+ case DRM_FORMAT_YUV420: -+ if (mod == DRM_FORMAT_MOD_LINEAR) { -+ if (src->layers[0].nb_planes != 3) -+ break; -+ pix_fmt = V4L2_PIX_FMT_YUV420; -+ h = src->layers[0].planes[1].offset / bpl; -+ w = bpl; -+ } -+ break; -+ -+ case DRM_FORMAT_NV12: -+ if (mod == DRM_FORMAT_MOD_LINEAR) { -+ if (src->layers[0].nb_planes != 2) -+ break; -+ pix_fmt = V4L2_PIX_FMT_NV12; -+ h = src->layers[0].planes[1].offset / bpl; -+ w = bpl; -+ } -+ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { -+ if (src->layers[0].nb_planes != 2) -+ break; -+ pix_fmt = V4L2_PIX_FMT_NV12_COL128; -+ w = bpl; -+ h = src->layers[0].planes[1].offset / 128; -+ bpl = fourcc_mod_broadcom_param(mod); -+ } -+ break; -+ -+ case DRM_FORMAT_P030: -+ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { -+ if (src->layers[0].nb_planes != 2) -+ break; -+ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128; -+ w = bpl / 2; // Matching lie to how we construct this -+ h = src->layers[0].planes[1].offset / 128; -+ bpl = fourcc_mod_broadcom_param(mod); -+ } -+ break; -+ -+ default: -+ break; -+ } -+ -+ if (!pix_fmt) -+ return AVERROR(EINVAL); -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { -+ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp; -+ -+ pix->width = w; -+ pix->height = h; -+ pix->pixelformat = pix_fmt; -+ pix->plane_fmt[0].bytesperline = bpl; -+ pix->num_planes = 1; -+ } 
-+ else { -+ struct v4l2_pix_format *const pix = &format->fmt.pix; -+ -+ pix->width = w; -+ pix->height = h; -+ pix->pixelformat = pix_fmt; -+ pix->bytesperline = bpl; -+ } -+ -+ return 0; -+} -+ -+// Do we have similar enough formats to be usable? -+static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b) -+{ -+ if (a->type != b->type) -+ return 0; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) { -+ const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp; -+ const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp; -+ unsigned int i; -+ if (pa->pixelformat != pb->pixelformat || -+ pa->num_planes != pb->num_planes) -+ return 0; -+ for (i = 0; i != pa->num_planes; ++i) { -+ if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline) -+ return 0; -+ } -+ } -+ else { -+ const struct v4l2_pix_format *const pa = &a->fmt.pix; -+ const struct v4l2_pix_format *const pb = &b->fmt.pix; -+ if (pa->pixelformat != pb->pixelformat || -+ pa->bytesperline != pb->bytesperline) -+ return 0; -+ } -+ return 1; -+} -+ -+static inline int q_full(const V4L2Context *const output) -+{ -+ return ff_v4l2_context_q_count(output) == output->num_buffers; -+} -+ - static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) - { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const output = &s->output; -+ int rv; -+ const int needs_slot = q_full(output); -+ -+ av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot); -+ -+ // Signal EOF if needed (doesn't need q slot) -+ if (!frame) { -+ av_log(avctx, AV_LOG_TRACE, "--- %s: EOS\n", __func__); -+ return ff_v4l2_context_enqueue_frame(output, frame); -+ } -+ -+ if ((rv = ff_v4l2_dq_all(output, needs_slot? 500 : 0)) != 0) { -+ // We should be able to return AVERROR(EAGAIN) to indicate buffer -+ // exhaustion, but ffmpeg currently treats that as fatal. 
-+ av_log(avctx, AV_LOG_WARNING, "Failed to get buffer for src frame: %s\n", av_err2str(rv)); -+ return rv; -+ } -+ -+ if (s->input_drm && !output->streamon) { -+ struct v4l2_format req_format = {.type = output->format.type}; -+ -+ // Set format when we first get a buffer -+ if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n"); -+ return rv; -+ } -+ -+ ff_v4l2_context_release(output); -+ -+ output->format = req_format; -+ -+ if ((rv = ff_v4l2_context_set_format(output)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n"); -+ return rv; -+ } -+ -+ if (!fmt_eq(&req_format, &output->format)) { -+ av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ output->selection.top = frame->crop_top; -+ output->selection.left = frame->crop_left; -+ output->selection.width = av_frame_cropped_width(frame); -+ output->selection.height = av_frame_cropped_height(frame); -+ -+ if ((rv = ff_v4l2_context_init(output)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n"); -+ return rv; -+ } -+ -+ { -+ struct v4l2_selection selection = { -+ .type = V4L2_BUF_TYPE_VIDEO_OUTPUT, -+ .target = V4L2_SEL_TGT_CROP, -+ .r = output->selection -+ }; -+ if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) { -+ av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n", -+ selection.r.width, selection.r.height, selection.r.left, selection.r.top, -+ av_err2str(AVERROR(errno))); -+ } -+ av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n", -+ selection.r.width, selection.r.height, selection.r.left, selection.r.top); -+ } -+ } - - #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME -- if (frame && frame->pict_type == AV_PICTURE_TYPE_I) -+ if (frame->pict_type == AV_PICTURE_TYPE_I) - v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1); - #endif - -- return ff_v4l2_context_enqueue_frame(output, 
frame); -+ rv = ff_v4l2_context_enqueue_frame(output, frame); -+ if (rv) { -+ av_log(avctx, AV_LOG_ERROR, "Enqueue frame failed: %s\n", av_err2str(rv)); -+ } -+ -+ return rv; - } - - static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) -@@ -292,6 +512,11 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) - AVFrame *frame = s->frame; - int ret; - -+ av_log(avctx, AV_LOG_TRACE, "<<< %s: qlen out %d cap %d\n", __func__, -+ ff_v4l2_context_q_count(output), ff_v4l2_context_q_count(capture)); -+ -+ ff_v4l2_dq_all(output, 0); -+ - if (s->draining) - goto dequeue; - -@@ -328,7 +553,115 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) - } - - dequeue: -- return ff_v4l2_context_dequeue_packet(capture, avpkt); -+ // Dequeue a frame -+ for (;;) { -+ int t = q_full(output) ? -1 : s->draining ? 300 : 0; -+ int rv2; -+ -+ // If output is full wait for either a packet or output to become not full -+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, t); -+ -+ // If output was full retry packet dequeue -+ t = (ret != AVERROR(EAGAIN) || t != -1) ? 0 : 300; -+ rv2 = ff_v4l2_dq_all(output, t); -+ if (t == 0 || rv2 != 0) -+ break; -+ } -+ if (ret) -+ return (s->draining && ret == AVERROR(EAGAIN)) ? 
AVERROR_EOF : ret; -+ -+ if (capture->first_buf == 1) { -+ uint8_t * data; -+ const int len = avpkt->size; -+ -+ // 1st buffer after streamon should be SPS/PPS -+ capture->first_buf = 2; -+ -+ // Clear both possible stores so there is no chance of confusion -+ av_freep(&s->extdata_data); -+ s->extdata_size = 0; -+ av_freep(&avctx->extradata); -+ avctx->extradata_size = 0; -+ -+ if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL) -+ goto fail_no_mem; -+ -+ memcpy(data, avpkt->data, len); -+ av_packet_unref(avpkt); -+ -+ // We need to copy the header, but keep local if not global -+ if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) { -+ avctx->extradata = data; -+ avctx->extradata_size = len; -+ } -+ else { -+ s->extdata_data = data; -+ s->extdata_size = len; -+ } -+ -+ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, 0); -+ ff_v4l2_dq_all(output, 0); -+ if (ret) -+ return ret; -+ } -+ -+ // First frame must be key so mark as such even if encoder forgot -+ if (capture->first_buf == 2) { -+ avpkt->flags |= AV_PKT_FLAG_KEY; -+ -+ // Add any extradata to the 1st packet we emit as we cannot create it at init -+ if (avctx->extradata_size > 0 && avctx->extradata) { -+ void * const side = av_packet_new_side_data(avpkt, -+ AV_PKT_DATA_NEW_EXTRADATA, -+ avctx->extradata_size); -+ if (!side) -+ goto fail_no_mem; -+ -+ memcpy(side, avctx->extradata, avctx->extradata_size); -+ } -+ } -+ -+ // Add SPS/PPS to the start of every key frame if non-global headers -+ if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) { -+ const size_t newlen = s->extdata_size + avpkt->size; -+ AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE); -+ -+ if (buf == NULL) -+ goto fail_no_mem; -+ -+ memcpy(buf->data, s->extdata_data, s->extdata_size); -+ memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size); -+ -+ av_buffer_unref(&avpkt->buf); -+ avpkt->buf = buf; -+ avpkt->data = buf->data; -+ avpkt->size = newlen; -+ } -+ else 
if (ff_v4l2_context_q_count(capture) < 2) { -+ // Avoid running out of capture buffers -+ // In most cases the buffers will be returned quickly in which case -+ // we don't copy and can use the v4l2 buffers directly but sometimes -+ // ffmpeg seems to hold onto all of them for a long time (.mkv -+ // creation?) so avoid deadlock in those cases. -+ AVBufferRef * const buf = av_buffer_alloc(avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE); -+ if (buf == NULL) -+ goto fail_no_mem; -+ -+ memcpy(buf->data, avpkt->data, avpkt->size); -+ av_buffer_unref(&avpkt->buf); // Will recycle the V4L2 buffer -+ -+ avpkt->buf = buf; -+ avpkt->data = buf->data; -+ } -+ -+ capture->first_buf = 0; -+ return 0; -+ -+fail_no_mem: -+ av_log(avctx, AV_LOG_ERROR, "Rx pkt failed: No memory\n"); -+ ret = AVERROR(ENOMEM); -+ av_packet_unref(avpkt); -+ return ret; - } - - static av_cold int v4l2_encode_init(AVCodecContext *avctx) -@@ -340,6 +673,8 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) - uint32_t v4l2_fmt_output; - int ret; - -+ av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt); -+ - ret = ff_v4l2_m2m_create_context(priv, &s); - if (ret < 0) - return ret; -@@ -347,13 +682,17 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) - capture = &s->capture; - output = &s->output; - -+ s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME); -+ - /* common settings output/capture */ - output->height = capture->height = avctx->height; - output->width = capture->width = avctx->width; - - /* output context */ - output->av_codec_id = AV_CODEC_ID_RAWVIDEO; -- output->av_pix_fmt = avctx->pix_fmt; -+ output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt : -+ avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? 
avctx->sw_pix_fmt : -+ AV_PIX_FMT_YUV420P; - - /* capture context */ - capture->av_codec_id = avctx->codec_id; -@@ -372,7 +711,7 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) - v4l2_fmt_output = output->format.fmt.pix.pixelformat; - - pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO); -- if (pix_fmt_output != avctx->pix_fmt) { -+ if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) { - const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output); - av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name); - return AVERROR(EINVAL); -@@ -390,9 +729,10 @@ static av_cold int v4l2_encode_close(AVCodecContext *avctx) - #define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM - - #define V4L_M2M_CAPTURE_OPTS \ -- V4L_M2M_DEFAULT_OPTS,\ -+ { "num_output_buffers", "Number of buffers in the output context",\ -+ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },\ - { "num_capture_buffers", "Number of buffers in the capture context", \ -- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 4 }, 4, INT_MAX, FLAGS } -+ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 8 }, 8, INT_MAX, FLAGS } - - static const AVOption mpeg4_options[] = { - V4L_M2M_CAPTURE_OPTS, -diff --git a/libavcodec/v4l2_req_decode_q.c b/libavcodec/v4l2_req_decode_q.c -new file mode 100644 -index 0000000000..5b3fb958fa ---- /dev/null -+++ b/libavcodec/v4l2_req_decode_q.c -@@ -0,0 +1,84 @@ -+#include -+#include -+#include -+ -+#include "v4l2_req_decode_q.h" -+ -+int decode_q_in_q(const req_decode_ent * const d) -+{ -+ return d->in_q; -+} -+ -+void decode_q_add(req_decode_q * const q, req_decode_ent * const d) -+{ -+ pthread_mutex_lock(&q->q_lock); -+ if (!q->head) { -+ q->head = d; -+ q->tail = d; -+ d->prev = NULL; -+ } -+ else { -+ q->tail->next = d; -+ d->prev = q->tail; -+ q->tail = d; -+ } -+ d->next = NULL; -+ d->in_q = 1; -+ pthread_mutex_unlock(&q->q_lock); -+} -+ -+// Remove 
entry from Q - if head wake-up anything that was waiting -+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d) -+{ -+ int try_signal = 0; -+ -+ if (!d->in_q) -+ return; -+ -+ pthread_mutex_lock(&q->q_lock); -+ if (d->prev) -+ d->prev->next = d->next; -+ else { -+ try_signal = 1; // Only need to signal if we were head -+ q->head = d->next; -+ } -+ -+ if (d->next) -+ d->next->prev = d->prev; -+ else -+ q->tail = d->prev; -+ -+ // Not strictly needed but makes debug easier -+ d->next = NULL; -+ d->prev = NULL; -+ d->in_q = 0; -+ pthread_mutex_unlock(&q->q_lock); -+ -+ if (try_signal) -+ pthread_cond_broadcast(&q->q_cond); -+} -+ -+void decode_q_wait(req_decode_q * const q, req_decode_ent * const d) -+{ -+ pthread_mutex_lock(&q->q_lock); -+ -+ while (q->head != d) -+ pthread_cond_wait(&q->q_cond, &q->q_lock); -+ -+ pthread_mutex_unlock(&q->q_lock); -+} -+ -+void decode_q_uninit(req_decode_q * const q) -+{ -+ pthread_mutex_destroy(&q->q_lock); -+ pthread_cond_destroy(&q->q_cond); -+} -+ -+void decode_q_init(req_decode_q * const q) -+{ -+ memset(q, 0, sizeof(*q)); -+ pthread_mutex_init(&q->q_lock, NULL); -+ pthread_cond_init(&q->q_cond, NULL); -+} -+ -+ -diff --git a/libavcodec/v4l2_req_decode_q.h b/libavcodec/v4l2_req_decode_q.h -new file mode 100644 -index 0000000000..af7bbe1de4 ---- /dev/null -+++ b/libavcodec/v4l2_req_decode_q.h -@@ -0,0 +1,25 @@ -+#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H -+#define AVCODEC_V4L2_REQ_DECODE_Q_H -+ -+typedef struct req_decode_ent { -+ struct req_decode_ent * next; -+ struct req_decode_ent * prev; -+ int in_q; -+} req_decode_ent; -+ -+typedef struct req_decode_q { -+ pthread_mutex_t q_lock; -+ pthread_cond_t q_cond; -+ req_decode_ent * head; -+ req_decode_ent * tail; -+} req_decode_q; -+ -+int decode_q_in_q(const req_decode_ent * const d); -+void decode_q_add(req_decode_q * const q, req_decode_ent * const d); -+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d); -+void decode_q_wait(req_decode_q * const 
q, req_decode_ent * const d); -+void decode_q_uninit(req_decode_q * const q); -+void decode_q_init(req_decode_q * const q); -+ -+#endif -+ -diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c -new file mode 100644 -index 0000000000..cfa94d55c4 ---- /dev/null -+++ b/libavcodec/v4l2_req_devscan.c -@@ -0,0 +1,449 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+#include -+#include -+ -+#include "v4l2_req_devscan.h" -+#include "v4l2_req_utils.h" -+ -+struct decdev { -+ enum v4l2_buf_type src_type; -+ uint32_t src_fmt_v4l2; -+ const char * vname; -+ const char * mname; -+}; -+ -+struct devscan { -+ struct decdev env; -+ unsigned int dev_size; -+ unsigned int dev_count; -+ struct decdev *devs; -+}; -+ -+static int video_src_pixfmt_supported(uint32_t fmt) -+{ -+ return 1; -+} -+ -+static void v4l2_setup_format(struct v4l2_format *format, unsigned int type, -+ unsigned int width, unsigned int height, -+ unsigned int pixelformat) -+{ -+ unsigned int sizeimage; -+ -+ memset(format, 0, sizeof(*format)); -+ format->type = type; -+ -+ sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(type)) { -+ format->fmt.pix_mp.width = width; -+ format->fmt.pix_mp.height = height; -+ format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage; -+ format->fmt.pix_mp.pixelformat = pixelformat; -+ } else { -+ format->fmt.pix.width = width; -+ format->fmt.pix.height = height; -+ format->fmt.pix.sizeimage = sizeimage; -+ format->fmt.pix.pixelformat = pixelformat; -+ } -+} -+ -+static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat, -+ unsigned int width, unsigned int height) -+{ -+ struct v4l2_format format; -+ -+ v4l2_setup_format(&format, type, width, height, pixelformat); -+ -+ return ioctl(video_fd, VIDIOC_S_FMT, &format) ? 
-errno : 0; -+} -+ -+static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities) -+{ -+ struct v4l2_capability capability = { 0 }; -+ int rc; -+ -+ rc = ioctl(video_fd, VIDIOC_QUERYCAP, &capability); -+ if (rc < 0) -+ return -errno; -+ -+ if (capabilities != NULL) { -+ if ((capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0) -+ *capabilities = capability.device_caps; -+ else -+ *capabilities = capability.capabilities; -+ } -+ -+ return 0; -+} -+ -+static int devscan_add(struct devscan *const scan, -+ enum v4l2_buf_type src_type, -+ uint32_t src_fmt_v4l2, -+ const char * vname, -+ const char * mname) -+{ -+ struct decdev *d; -+ -+ if (scan->dev_size <= scan->dev_count) { -+ unsigned int n = !scan->dev_size ? 4 : scan->dev_size * 2; -+ d = realloc(scan->devs, n * sizeof(*d)); -+ if (!d) -+ return -ENOMEM; -+ scan->devs = d; -+ scan->dev_size = n; -+ } -+ -+ d = scan->devs + scan->dev_count; -+ d->src_type = src_type; -+ d->src_fmt_v4l2 = src_fmt_v4l2; -+ d->vname = strdup(vname); -+ if (!d->vname) -+ return -ENOMEM; -+ d->mname = strdup(mname); -+ if (!d->mname) { -+ free((char *)d->vname); -+ return -ENOMEM; -+ } -+ ++scan->dev_count; -+ return 0; -+} -+ -+void devscan_delete(struct devscan **const pScan) -+{ -+ unsigned int i; -+ struct devscan * const scan = *pScan; -+ -+ if (!scan) -+ return; -+ *pScan = NULL; -+ -+ for (i = 0; i < scan->dev_count; ++i) { -+ free((char*)scan->devs[i].mname); -+ free((char*)scan->devs[i].vname); -+ } -+ free(scan->devs); -+ free(scan); -+} -+ -+#define REQ_BUF_CAPS (\ -+ V4L2_BUF_CAP_SUPPORTS_DMABUF |\ -+ V4L2_BUF_CAP_SUPPORTS_REQUESTS |\ -+ V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) -+ -+static void probe_formats(void * const dc, -+ struct devscan *const scan, -+ const int fd, -+ const unsigned int type_v4l2, -+ const char *const mpath, -+ const char *const vpath) -+{ -+ unsigned int i; -+ for (i = 0;; ++i) { -+ struct v4l2_fmtdesc fmtdesc = { -+ .index = i, -+ .type = type_v4l2 -+ }; -+ struct 
v4l2_requestbuffers rbufs = { -+ .count = 0, -+ .type = type_v4l2, -+ .memory = V4L2_MEMORY_MMAP -+ }; -+ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) { -+ if (errno == EINTR) -+ continue; -+ if (errno != EINVAL) -+ request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2); -+ return; -+ } -+ if (!video_src_pixfmt_supported(fmtdesc.pixelformat)) -+ continue; -+ -+ if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) { -+ request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat); -+ continue; -+ } -+ -+ while (ioctl(fd, VIDIOC_REQBUFS, &rbufs)) { -+ if (errno != EINTR) { -+ request_debug(dc, "%s: Reqbufs failed\n", vpath); -+ continue; -+ } -+ } -+ -+ if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) { -+ request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities); -+ continue; -+ } -+ -+ request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n", -+ mpath, vpath, fmtdesc.pixelformat, type_v4l2); -+ devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath); -+ } -+} -+ -+ -+static int probe_video_device(void * const dc, -+ struct udev_device *const device, -+ struct devscan *const scan, -+ const char *const mpath) -+{ -+ int ret; -+ unsigned int capabilities = 0; -+ int video_fd = -1; -+ -+ const char *path = udev_device_get_devnode(device); -+ if (!path) { -+ request_err(dc, "%s: get video device devnode failed\n", __func__); -+ ret = -EINVAL; -+ goto fail; -+ } -+ -+ video_fd = open(path, O_RDWR, 0); -+ if (video_fd == -1) { -+ ret = -errno; -+ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); -+ goto fail; -+ } -+ -+ ret = v4l2_query_capabilities(video_fd, &capabilities); -+ if (ret < 0) { -+ request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret); -+ goto fail; -+ } -+ -+ request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities); -+ -+ if (!(capabilities & V4L2_CAP_STREAMING)) 
{ -+ request_debug(dc, "%s: missing required streaming capability\n", __func__); -+ ret = -EINVAL; -+ goto fail; -+ } -+ -+ if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) { -+ request_debug(dc, "%s: missing required mem2mem capability\n", __func__); -+ ret = -EINVAL; -+ goto fail; -+ } -+ -+ /* Should check capture formats too... */ -+ if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0) -+ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path); -+ if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) -+ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path); -+ -+ close(video_fd); -+ return 0; -+ -+fail: -+ if (video_fd >= 0) -+ close(video_fd); -+ return ret; -+} -+ -+static int probe_media_device(void * const dc, -+ struct udev_device *const device, -+ struct devscan *const scan) -+{ -+ int ret; -+ int rv; -+ struct media_device_info device_info = { 0 }; -+ struct media_v2_topology topology = { 0 }; -+ struct media_v2_interface *interfaces = NULL; -+ struct udev *udev = udev_device_get_udev(device); -+ struct udev_device *video_device; -+ dev_t devnum; -+ int media_fd = -1; -+ -+ const char *path = udev_device_get_devnode(device); -+ if (!path) { -+ request_err(dc, "%s: get media device devnode failed\n", __func__); -+ ret = -EINVAL; -+ goto fail; -+ } -+ -+ media_fd = open(path, O_RDWR, 0); -+ if (media_fd < 0) { -+ ret = -errno; -+ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret); -+ goto fail; -+ } -+ -+ rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info); -+ if (rv < 0) { -+ ret = -errno; -+ request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret); -+ goto fail; -+ } -+ -+ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); -+ if (rv < 0) { -+ ret = -errno; -+ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret); -+ goto fail; -+ } -+ -+ if (topology.num_interfaces <= 0) 
{ -+ request_err(dc, "%s: media device has no interfaces\n", __func__); -+ ret = -EINVAL; -+ goto fail; -+ } -+ -+ interfaces = calloc(topology.num_interfaces, sizeof(*interfaces)); -+ if (!interfaces) { -+ request_err(dc, "%s: allocating media interface struct failed\n", __func__); -+ ret = -ENOMEM; -+ goto fail; -+ } -+ -+ topology.ptr_interfaces = (__u64)(uintptr_t)interfaces; -+ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); -+ if (rv < 0) { -+ ret = -errno; -+ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret); -+ goto fail; -+ } -+ -+ for (int i = 0; i < topology.num_interfaces; i++) { -+ if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO) -+ continue; -+ -+ devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor); -+ video_device = udev_device_new_from_devnum(udev, 'c', devnum); -+ if (!video_device) { -+ ret = -errno; -+ request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device); -+ continue; -+ } -+ -+ ret = probe_video_device(dc, video_device, scan, path); -+ udev_device_unref(video_device); -+ -+ if (ret != 0) -+ goto fail; -+ } -+ -+fail: -+ free(interfaces); -+ if (media_fd != -1) -+ close(media_fd); -+ return ret; -+} -+ -+const char *decdev_media_path(const struct decdev *const dev) -+{ -+ return !dev ? NULL : dev->mname; -+} -+ -+const char *decdev_video_path(const struct decdev *const dev) -+{ -+ return !dev ? NULL : dev->vname; -+} -+ -+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev) -+{ -+ return !dev ? 0 : dev->src_type; -+} -+ -+uint32_t decdev_src_pixelformat(const struct decdev *const dev) -+{ -+ return !dev ? 0 : dev->src_fmt_v4l2; -+} -+ -+ -+const struct decdev *devscan_find(struct devscan *const scan, -+ const uint32_t src_fmt_v4l2) -+{ -+ unsigned int i; -+ -+ if (scan->env.mname && scan->env.vname) -+ return &scan->env; -+ -+ if (!src_fmt_v4l2) -+ return scan->dev_count ? 
scan->devs + 0 : NULL; -+ -+ for (i = 0; i != scan->dev_count; ++i) { -+ if (scan->devs[i].src_fmt_v4l2 == src_fmt_v4l2) -+ return scan->devs + i; -+ } -+ return NULL; -+} -+ -+int devscan_build(void * const dc, struct devscan **pscan) -+{ -+ int ret; -+ struct udev *udev; -+ struct udev_enumerate *enumerate; -+ struct udev_list_entry *devices; -+ struct udev_list_entry *entry; -+ struct udev_device *device; -+ struct devscan * scan; -+ -+ *pscan = NULL; -+ -+ scan = calloc(1, sizeof(*scan)); -+ if (!scan) { -+ ret = -ENOMEM; -+ goto fail; -+ } -+ -+ scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH"); -+ scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH"); -+ if (scan->env.mname && scan->env.vname) { -+ request_info(dc, "Media/video device env overrides found: %s,%s\n", -+ scan->env.mname, scan->env.vname); -+ *pscan = scan; -+ return 0; -+ } -+ -+ udev = udev_new(); -+ if (!udev) { -+ request_err(dc, "%s: allocating udev context failed\n", __func__); -+ ret = -ENOMEM; -+ goto fail; -+ } -+ -+ enumerate = udev_enumerate_new(udev); -+ if (!enumerate) { -+ request_err(dc, "%s: allocating udev enumerator failed\n", __func__); -+ ret = -ENOMEM; -+ goto fail; -+ } -+ -+ udev_enumerate_add_match_subsystem(enumerate, "media"); -+ udev_enumerate_scan_devices(enumerate); -+ -+ devices = udev_enumerate_get_list_entry(enumerate); -+ udev_list_entry_foreach(entry, devices) { -+ const char *path = udev_list_entry_get_name(entry); -+ if (!path) -+ continue; -+ -+ device = udev_device_new_from_syspath(udev, path); -+ if (!device) -+ continue; -+ -+ probe_media_device(dc, device, scan); -+ udev_device_unref(device); -+ } -+ -+ udev_enumerate_unref(enumerate); -+ -+ *pscan = scan; -+ return 0; -+ -+fail: -+ udev_unref(udev); -+ devscan_delete(&scan); -+ return ret; -+} -+ -diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h -new file mode 100644 -index 0000000000..956d9234f1 ---- /dev/null -+++ b/libavcodec/v4l2_req_devscan.h -@@ -0,0 +1,23 @@ 
-+#ifndef _DEVSCAN_H_ -+#define _DEVSCAN_H_ -+ -+#include -+ -+struct devscan; -+struct decdev; -+enum v4l2_buf_type; -+ -+/* These return pointers to data in the devscan structure and so are vaild -+ * for the lifetime of that -+ */ -+const char *decdev_media_path(const struct decdev *const dev); -+const char *decdev_video_path(const struct decdev *const dev); -+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev); -+uint32_t decdev_src_pixelformat(const struct decdev *const dev); -+ -+const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2); -+ -+int devscan_build(void * const dc, struct devscan **pscan); -+void devscan_delete(struct devscan **const pScan); -+ -+#endif -diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c -new file mode 100644 -index 0000000000..acc0366e76 ---- /dev/null -+++ b/libavcodec/v4l2_req_dmabufs.c -@@ -0,0 +1,369 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "v4l2_req_dmabufs.h" -+#include "v4l2_req_utils.h" -+ -+#define DMABUF_NAME1 "/dev/dma_heap/linux,cma" -+#define DMABUF_NAME2 "/dev/dma_heap/reserved" -+ -+#define TRACE_ALLOC 0 -+ -+struct dmabufs_ctl; -+struct dmabuf_h; -+ -+struct dmabuf_fns { -+ int (*buf_alloc)(struct dmabufs_ctl * dbsc, struct dmabuf_h * dh, size_t size); -+ void (*buf_free)(struct dmabuf_h * dh); -+ int (*ctl_new)(struct dmabufs_ctl * dbsc); -+ void (*ctl_free)(struct dmabufs_ctl * dbsc); -+}; -+ -+struct dmabufs_ctl { -+ atomic_int ref_count; -+ int fd; -+ size_t page_size; -+ void * v; -+ const struct dmabuf_fns * fns; -+}; -+ -+struct dmabuf_h { -+ int fd; -+ size_t size; -+ size_t len; -+ void * mapptr; -+ void * v; -+ const struct dmabuf_fns * fns; -+}; -+ -+#if TRACE_ALLOC -+static unsigned int total_bufs = 0; -+static size_t total_size = 0; -+#endif -+ -+struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size) -+{ 
-+ struct dmabuf_h *dh; -+ -+ if (mapptr == MAP_FAILED) -+ return NULL; -+ -+ dh = malloc(sizeof(*dh)); -+ if (!dh) -+ return NULL; -+ -+ *dh = (struct dmabuf_h) { -+ .fd = -1, -+ .size = size, -+ .mapptr = mapptr -+ }; -+ -+ return dh; -+} -+ -+struct dmabuf_h * dmabuf_import(int fd, size_t size) -+{ -+ struct dmabuf_h *dh; -+ -+ fd = dup(fd); -+ if (fd < 0 || size == 0) -+ return NULL; -+ -+ dh = malloc(sizeof(*dh)); -+ if (!dh) { -+ close(fd); -+ return NULL; -+ } -+ -+ *dh = (struct dmabuf_h) { -+ .fd = fd, -+ .size = size, -+ .mapptr = MAP_FAILED -+ }; -+ -+#if TRACE_ALLOC -+ ++total_bufs; -+ total_size += dh->size; -+ request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); -+#endif -+ -+ return dh; -+} -+ -+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size) -+{ -+ struct dmabuf_h * dh; -+ if (old != NULL) { -+ if (old->size >= size) { -+ return old; -+ } -+ dmabuf_free(old); -+ } -+ -+ if (size == 0 || -+ (dh = malloc(sizeof(*dh))) == NULL) -+ return NULL; -+ -+ *dh = (struct dmabuf_h){ -+ .fd = -1, -+ .mapptr = MAP_FAILED, -+ .fns = dbsc->fns -+ }; -+ -+ if (dh->fns->buf_alloc(dbsc, dh, size) != 0) -+ goto fail; -+ -+ -+#if TRACE_ALLOC -+ ++total_bufs; -+ total_size += dh->size; -+ request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); -+#endif -+ -+ return dh; -+ -+fail: -+ free(dh); -+ return NULL; -+} -+ -+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags) -+{ -+ struct dma_buf_sync sync = { -+ .flags = flags -+ }; -+ if (dh->fd == -1) -+ return 0; -+ while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) { -+ const int err = errno; -+ if (errno == EINTR) -+ continue; -+ request_log("%s: ioctl failed: flags=%#x\n", __func__, flags); -+ return -err; -+ } -+ return 0; -+} -+ -+int dmabuf_write_start(struct dmabuf_h * const dh) -+{ -+ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE); -+} -+ -+int 
dmabuf_write_end(struct dmabuf_h * const dh) -+{ -+ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE); -+} -+ -+int dmabuf_read_start(struct dmabuf_h * const dh) -+{ -+ if (!dmabuf_map(dh)) -+ return -1; -+ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ); -+} -+ -+int dmabuf_read_end(struct dmabuf_h * const dh) -+{ -+ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ); -+} -+ -+ -+void * dmabuf_map(struct dmabuf_h * const dh) -+{ -+ if (!dh) -+ return NULL; -+ if (dh->mapptr != MAP_FAILED) -+ return dh->mapptr; -+ dh->mapptr = mmap(NULL, dh->size, -+ PROT_READ | PROT_WRITE, -+ MAP_SHARED | MAP_POPULATE, -+ dh->fd, 0); -+ if (dh->mapptr == MAP_FAILED) { -+ request_log("%s: Map failed\n", __func__); -+ return NULL; -+ } -+ return dh->mapptr; -+} -+ -+int dmabuf_fd(const struct dmabuf_h * const dh) -+{ -+ if (!dh) -+ return -1; -+ return dh->fd; -+} -+ -+size_t dmabuf_size(const struct dmabuf_h * const dh) -+{ -+ if (!dh) -+ return 0; -+ return dh->size; -+} -+ -+size_t dmabuf_len(const struct dmabuf_h * const dh) -+{ -+ if (!dh) -+ return 0; -+ return dh->len; -+} -+ -+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len) -+{ -+ dh->len = len; -+} -+ -+void dmabuf_free(struct dmabuf_h * dh) -+{ -+ if (!dh) -+ return; -+ -+#if TRACE_ALLOC -+ --total_bufs; -+ total_size -= dh->size; -+ request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); -+#endif -+ -+ dh->fns->buf_free(dh); -+ -+ if (dh->mapptr != MAP_FAILED && dh->mapptr != NULL) -+ munmap(dh->mapptr, dh->size); -+ if (dh->fd != -1) -+ while (close(dh->fd) == -1 && errno == EINTR) -+ /* loop */; -+ free(dh); -+} -+ -+static struct dmabufs_ctl * dmabufs_ctl_new2(const struct dmabuf_fns * const fns) -+{ -+ struct dmabufs_ctl * dbsc = calloc(1, sizeof(*dbsc)); -+ -+ if (!dbsc) -+ return NULL; -+ -+ dbsc->fd = -1; -+ dbsc->fns = fns; -+ dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE); -+ -+ if (fns->ctl_new(dbsc) != 0) -+ goto 
fail; -+ -+ return dbsc; -+ -+fail: -+ free(dbsc); -+ return NULL; -+} -+ -+static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc) -+{ -+ request_debug(NULL, "Free dmabuf ctl\n"); -+ -+ dbsc->fns->ctl_free(dbsc); -+ -+ free(dbsc); -+} -+ -+void dmabufs_ctl_unref(struct dmabufs_ctl ** const pDbsc) -+{ -+ struct dmabufs_ctl * const dbsc = *pDbsc; -+ -+ if (!dbsc) -+ return; -+ *pDbsc = NULL; -+ -+ if (atomic_fetch_sub(&dbsc->ref_count, 1) != 0) -+ return; -+ -+ dmabufs_ctl_free(dbsc); -+} -+ -+struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc) -+{ -+ atomic_fetch_add(&dbsc->ref_count, 1); -+ return dbsc; -+} -+ -+//----------------------------------------------------------------------------- -+// -+// Alloc dmabuf via CMA -+ -+static int ctl_cma_new(struct dmabufs_ctl * dbsc) -+{ -+ while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 && -+ errno == EINTR) -+ /* Loop */; -+ -+ if (dbsc->fd == -1) { -+ while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 && -+ errno == EINTR) -+ /* Loop */; -+ if (dbsc->fd == -1) { -+ request_log("Unable to open either %s or %s\n", -+ DMABUF_NAME1, DMABUF_NAME2); -+ return -1; -+ } -+ } -+ return 0; -+} -+ -+static void ctl_cma_free(struct dmabufs_ctl * dbsc) -+{ -+ if (dbsc->fd != -1) -+ while (close(dbsc->fd) == -1 && errno == EINTR) -+ /* loop */; -+ -+} -+ -+static int buf_cma_alloc(struct dmabufs_ctl * const dbsc, struct dmabuf_h * dh, size_t size) -+{ -+ struct dma_heap_allocation_data data = { -+ .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1), -+ .fd = 0, -+ .fd_flags = O_RDWR, -+ .heap_flags = 0 -+ }; -+ -+ while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) { -+ int err = errno; -+ request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n", -+ (uint64_t)data.len, -+ dbsc->fd, -+ err, -+ strerror(err)); -+ if (err == EINTR) -+ continue; -+ return -err; -+ } -+ -+ dh->fd = data.fd; -+ dh->size = (size_t)data.len; -+ return 0; -+} -+ -+static void buf_cma_free(struct 
dmabuf_h * dh) -+{ -+ // Nothing needed -+} -+ -+static const struct dmabuf_fns dmabuf_cma_fns = { -+ .buf_alloc = buf_cma_alloc, -+ .buf_free = buf_cma_free, -+ .ctl_new = ctl_cma_new, -+ .ctl_free = ctl_cma_free, -+}; -+ -+struct dmabufs_ctl * dmabufs_ctl_new(void) -+{ -+ request_debug(NULL, "Dmabufs using CMA\n");; -+ return dmabufs_ctl_new2(&dmabuf_cma_fns); -+} -+ -diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h -new file mode 100644 -index 0000000000..381ba2708d ---- /dev/null -+++ b/libavcodec/v4l2_req_dmabufs.h -@@ -0,0 +1,44 @@ -+#ifndef DMABUFS_H -+#define DMABUFS_H -+ -+#include -+ -+struct dmabufs_ctl; -+struct dmabuf_h; -+ -+struct dmabufs_ctl * dmabufs_ctl_new(void); -+void dmabufs_ctl_unref(struct dmabufs_ctl ** const pdbsc); -+struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc); -+ -+// Need not preserve old contents -+// On NULL return old buffer is freed -+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size); -+ -+static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) { -+ return dmabuf_realloc(dbsc, NULL, size); -+} -+/* Create from existing fd - dups(fd) */ -+struct dmabuf_h * dmabuf_import(int fd, size_t size); -+/* Import an MMAP - return NULL if mapptr = MAP_FAIL */ -+struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size); -+ -+void * dmabuf_map(struct dmabuf_h * const dh); -+ -+/* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */ -+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags); -+ -+int dmabuf_write_start(struct dmabuf_h * const dh); -+int dmabuf_write_end(struct dmabuf_h * const dh); -+int dmabuf_read_start(struct dmabuf_h * const dh); -+int dmabuf_read_end(struct dmabuf_h * const dh); -+ -+int dmabuf_fd(const struct dmabuf_h * const dh); -+/* Allocated size */ -+size_t dmabuf_size(const struct dmabuf_h * const dh); -+/* Bytes in use */ -+size_t dmabuf_len(const struct dmabuf_h * const dh); -+/* Set bytes 
in use */ -+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len); -+void dmabuf_free(struct dmabuf_h * dh); -+ -+#endif -diff --git a/libavcodec/v4l2_req_hevc_v1.c b/libavcodec/v4l2_req_hevc_v1.c -new file mode 100644 -index 0000000000..169b532832 ---- /dev/null -+++ b/libavcodec/v4l2_req_hevc_v1.c -@@ -0,0 +1,3 @@ -+#define HEVC_CTRLS_VERSION 1 -+#include "v4l2_req_hevc_vx.c" -+ -diff --git a/libavcodec/v4l2_req_hevc_v2.c b/libavcodec/v4l2_req_hevc_v2.c -new file mode 100644 -index 0000000000..42af98e156 ---- /dev/null -+++ b/libavcodec/v4l2_req_hevc_v2.c -@@ -0,0 +1,3 @@ -+#define HEVC_CTRLS_VERSION 2 -+#include "v4l2_req_hevc_vx.c" -+ -diff --git a/libavcodec/v4l2_req_hevc_v3.c b/libavcodec/v4l2_req_hevc_v3.c -new file mode 100644 -index 0000000000..dcc8d95632 ---- /dev/null -+++ b/libavcodec/v4l2_req_hevc_v3.c -@@ -0,0 +1,3 @@ -+#define HEVC_CTRLS_VERSION 3 -+#include "v4l2_req_hevc_vx.c" -+ -diff --git a/libavcodec/v4l2_req_hevc_v4.c b/libavcodec/v4l2_req_hevc_v4.c -new file mode 100644 -index 0000000000..c35579d8e0 ---- /dev/null -+++ b/libavcodec/v4l2_req_hevc_v4.c -@@ -0,0 +1,3 @@ -+#define HEVC_CTRLS_VERSION 4 -+#include "v4l2_req_hevc_vx.c" -+ -diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c -new file mode 100644 -index 0000000000..b98d8464ca ---- /dev/null -+++ b/libavcodec/v4l2_req_hevc_vx.c -@@ -0,0 +1,1360 @@ -+// File included by v4l2_req_hevc_v* - not compiled on its own -+ -+#include "decode.h" -+#include "hevcdec.h" -+#include "hwconfig.h" -+ -+#if HEVC_CTRLS_VERSION == 1 -+#include "hevc-ctrls-v1.h" -+ -+// Fixup renamed entries -+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT -+ -+#elif HEVC_CTRLS_VERSION == 2 -+#include "hevc-ctrls-v2.h" -+#elif HEVC_CTRLS_VERSION == 3 -+#include "hevc-ctrls-v3.h" -+#elif HEVC_CTRLS_VERSION == 4 -+#include -+#if !defined(V4L2_CID_STATELESS_HEVC_SPS) -+#include "hevc-ctrls-v4.h" -+#endif -+#else -+#error Unknown 
HEVC_CTRLS_VERSION -+#endif -+ -+#ifndef V4L2_CID_STATELESS_HEVC_SPS -+#define V4L2_CID_STATELESS_HEVC_SPS V4L2_CID_MPEG_VIDEO_HEVC_SPS -+#define V4L2_CID_STATELESS_HEVC_PPS V4L2_CID_MPEG_VIDEO_HEVC_PPS -+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS -+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX -+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS -+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE -+#define V4L2_CID_STATELESS_HEVC_START_CODE V4L2_CID_MPEG_VIDEO_HEVC_START_CODE -+ -+#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED -+#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED -+#define V4L2_STATELESS_HEVC_START_CODE_NONE V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE -+#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B -+#endif -+ -+#include "v4l2_request_hevc.h" -+ -+#include "libavutil/hwcontext_drm.h" -+ -+#include -+#include -+ -+#include "v4l2_req_devscan.h" -+#include "v4l2_req_dmabufs.h" -+#include "v4l2_req_pollqueue.h" -+#include "v4l2_req_media.h" -+#include "v4l2_req_utils.h" -+ -+// Attached to buf[0] in frame -+// Pooled in hwcontext so generally create once - 1/frame -+typedef struct V4L2MediaReqDescriptor { -+ AVDRMFrameDescriptor drm; -+ -+ // Media -+ uint64_t timestamp; -+ struct qent_dst * qe_dst; -+ -+ // Decode only - should be NULL by the time we emit the frame -+ struct req_decode_ent decode_ent; -+ -+ struct media_request *req; -+ struct qent_src *qe_src; -+ -+#if HEVC_CTRLS_VERSION >= 2 -+ struct v4l2_ctrl_hevc_decode_params dec; -+#endif -+ -+ size_t num_slices; -+ size_t alloced_slices; -+ struct v4l2_ctrl_hevc_slice_params * slice_params; -+ struct slice_info * slices; -+ -+ size_t num_offsets; -+ size_t alloced_offsets; -+ uint32_t *offsets; -+ -+} 
V4L2MediaReqDescriptor; -+ -+struct slice_info { -+ const uint8_t * ptr; -+ size_t len; // bytes -+ size_t n_offsets; -+}; -+ -+// Handy container for accumulating controls before setting -+struct req_controls { -+ int has_scaling; -+ struct timeval tv; -+ struct v4l2_ctrl_hevc_sps sps; -+ struct v4l2_ctrl_hevc_pps pps; -+ struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix; -+}; -+ -+//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 }; -+ -+ -+// Get an FFmpeg format from the v4l2 format -+static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format) -+{ -+ switch (V4L2_TYPE_IS_MULTIPLANAR(format->type) ? -+ format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat) { -+ case V4L2_PIX_FMT_YUV420: -+ return AV_PIX_FMT_YUV420P; -+ case V4L2_PIX_FMT_NV12: -+ return AV_PIX_FMT_NV12; -+#if CONFIG_SAND -+ case V4L2_PIX_FMT_NV12_COL128: -+ return AV_PIX_FMT_RPI4_8; -+ case V4L2_PIX_FMT_NV12_10_COL128: -+ return AV_PIX_FMT_RPI4_10; -+#endif -+ default: -+ break; -+ } -+ return AV_PIX_FMT_NONE; -+} -+ -+static inline uint64_t frame_capture_dpb(const AVFrame * const frame) -+{ -+ const V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0]; -+ return rd->timestamp; -+} -+ -+static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp) -+{ -+ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0]; -+ rd->timestamp = dpb_stamp; -+} -+ -+static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table) -+{ -+ int32_t luma_weight_denom, chroma_weight_denom; -+ const SliceHeader *sh = &h->sh; -+ -+ if (sh->slice_type == HEVC_SLICE_I || -+ (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) || -+ (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag)) -+ return; -+ -+ table->luma_log2_weight_denom = sh->luma_log2_weight_denom; -+ -+ if (h->ps.sps->chroma_format_idc) -+ table->delta_chroma_log2_weight_denom = 
sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom; -+ -+ luma_weight_denom = (1 << sh->luma_log2_weight_denom); -+ chroma_weight_denom = (1 << sh->chroma_log2_weight_denom); -+ -+ for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) { -+ table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom; -+ table->luma_offset_l0[i] = sh->luma_offset_l0[i]; -+ table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom; -+ table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom; -+ table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0]; -+ table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1]; -+ } -+ -+ if (sh->slice_type != HEVC_SLICE_B) -+ return; -+ -+ for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) { -+ table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom; -+ table->luma_offset_l1[i] = sh->luma_offset_l1[i]; -+ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom; -+ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom; -+ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0]; -+ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1]; -+ } -+} -+ -+#if HEVC_CTRLS_VERSION <= 2 -+static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) -+{ -+ const HEVCFrame *frame; -+ int i; -+ -+ for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) { -+ frame = h->rps[ST_CURR_BEF].ref[i]; -+ if (frame && timestamp == frame_capture_dpb(frame->frame)) -+ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE; -+ } -+ -+ for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) { -+ frame = h->rps[ST_CURR_AFT].ref[i]; -+ if (frame && timestamp == frame_capture_dpb(frame->frame)) -+ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER; -+ } -+ -+ for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) { -+ frame = h->rps[LT_CURR].ref[i]; -+ if (frame && timestamp == frame_capture_dpb(frame->frame)) -+ return 
V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR; -+ } -+ -+ return 0; -+} -+#endif -+ -+static unsigned int -+get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, -+ const struct v4l2_hevc_dpb_entry * const entries, -+ const unsigned int num_entries) -+{ -+ uint64_t timestamp; -+ -+ if (!frame) -+ return 0; -+ -+ timestamp = frame_capture_dpb(frame->frame); -+ -+ for (unsigned int i = 0; i < num_entries; i++) { -+ if (entries[i].timestamp == timestamp) -+ return i; -+ } -+ -+ return 0; -+} -+ -+static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx) -+{ -+ unsigned int z = 0; -+ while (idx--) { -+ if (*b++ == 0) { -+ ++z; -+ if (z >= 2 && *b == 3) { -+ ++b; -+ z = 0; -+ } -+ } -+ else { -+ z = 0; -+ } -+ } -+ return b; -+} -+ -+static int slice_add(V4L2MediaReqDescriptor * const rd) -+{ -+ if (rd->num_slices >= rd->alloced_slices) { -+ struct v4l2_ctrl_hevc_slice_params * p2; -+ struct slice_info * s2; -+ size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2; -+ -+ p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2)); -+ if (p2 == NULL) -+ return AVERROR(ENOMEM); -+ rd->slice_params = p2; -+ -+ s2 = av_realloc_array(rd->slices, n2, sizeof(*s2)); -+ if (s2 == NULL) -+ return AVERROR(ENOMEM); -+ rd->slices = s2; -+ -+ rd->alloced_slices = n2; -+ } -+ ++rd->num_slices; -+ return 0; -+} -+ -+static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets) -+{ -+ if (rd->num_offsets + n > rd->alloced_offsets) { -+ size_t n2 = rd->alloced_slices == 0 ? 
128 : rd->alloced_slices * 2; -+ void * p2; -+ while (rd->num_offsets + n > n2) -+ n2 *= 2; -+ if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL) -+ return AVERROR(ENOMEM); -+ rd->offsets = p2; -+ rd->alloced_offsets = n2; -+ } -+ for (size_t i = 0; i != n; ++i) -+ rd->offsets[rd->num_offsets++] = offsets[i] - 1; -+ return 0; -+} -+ -+static unsigned int -+fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries) -+{ -+ unsigned int i; -+ unsigned int n = 0; -+ const HEVCFrame * const pic = h->ref; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) { -+ const HEVCFrame * const frame = &h->DPB[i]; -+ if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) { -+ struct v4l2_hevc_dpb_entry * const entry = entries + n++; -+ -+ entry->timestamp = frame_capture_dpb(frame->frame); -+#if HEVC_CTRLS_VERSION <= 2 -+ entry->rps = find_frame_rps_type(h, entry->timestamp); -+#else -+ entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 : -+ V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE; -+#endif -+ entry->field_pic = frame->frame->interlaced_frame; -+ -+#if HEVC_CTRLS_VERSION <= 3 -+ /* TODO: Interleaved: Get the POC for each field. 
*/ -+ entry->pic_order_cnt[0] = frame->poc; -+ entry->pic_order_cnt[1] = frame->poc; -+#else -+ entry->pic_order_cnt_val = frame->poc; -+#endif -+ } -+ } -+ return n; -+} -+ -+static void fill_slice_params(const HEVCContext * const h, -+#if HEVC_CTRLS_VERSION >= 2 -+ const struct v4l2_ctrl_hevc_decode_params * const dec, -+#endif -+ struct v4l2_ctrl_hevc_slice_params *slice_params, -+ uint32_t bit_size, uint32_t bit_offset) -+{ -+ const SliceHeader * const sh = &h->sh; -+#if HEVC_CTRLS_VERSION >= 2 -+ const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb; -+ const unsigned int dpb_n = dec->num_active_dpb_entries; -+#else -+ struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb; -+ unsigned int dpb_n; -+#endif -+ unsigned int i; -+ RefPicList *rpl; -+ -+ *slice_params = (struct v4l2_ctrl_hevc_slice_params) { -+ .bit_size = bit_size, -+#if HEVC_CTRLS_VERSION <= 3 -+ .data_bit_offset = bit_offset, -+#else -+ .data_byte_offset = bit_offset / 8 + 1, -+#endif -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ .slice_segment_addr = sh->slice_segment_addr, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ .nal_unit_type = h->nal_unit_type, -+ .nuh_temporal_id_plus1 = h->temporal_id + 1, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ .slice_type = sh->slice_type, -+ .colour_plane_id = sh->colour_plane_id, -+ .slice_pic_order_cnt = h->ref->poc, -+ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0, -+ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0, -+ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0, -+ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 
0 : 5 - sh->max_num_merge_cand, -+ .slice_qp_delta = sh->slice_qp_delta, -+ .slice_cb_qp_offset = sh->slice_cb_qp_offset, -+ .slice_cr_qp_offset = sh->slice_cr_qp_offset, -+ .slice_act_y_qp_offset = 0, -+ .slice_act_cb_qp_offset = 0, -+ .slice_act_cr_qp_offset = 0, -+ .slice_beta_offset_div2 = sh->beta_offset / 2, -+ .slice_tc_offset_div2 = sh->tc_offset / 2, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ .pic_struct = h->sei.picture_timing.picture_struct, -+ -+#if HEVC_CTRLS_VERSION < 2 -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, -+ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, -+ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs, -+#endif -+ }; -+ -+ if (sh->slice_sample_adaptive_offset_flag[0]) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA; -+ -+ if (sh->slice_sample_adaptive_offset_flag[1]) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA; -+ -+ if (sh->slice_temporal_mvp_enabled_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED; -+ -+ if (sh->mvd_l1_zero_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO; -+ -+ if (sh->cabac_init_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT; -+ -+ if (sh->collocated_list == L0) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0; -+ -+ if (sh->disable_deblocking_filter_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED; -+ -+ if (sh->slice_loop_filter_across_slices_enabled_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED; -+ -+ if (sh->dependent_slice_segment_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT; -+ -+#if HEVC_CTRLS_VERSION < 2 -+ dpb_n = fill_dpb_entries(h, dpb); -+ slice_params->num_active_dpb_entries = dpb_n; 
-+#endif -+ -+ if (sh->slice_type != HEVC_SLICE_I) { -+ rpl = &h->ref->refPicList[0]; -+ for (i = 0; i < rpl->nb_refs; i++) -+ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n); -+ } -+ -+ if (sh->slice_type == HEVC_SLICE_B) { -+ rpl = &h->ref->refPicList[1]; -+ for (i = 0; i < rpl->nb_refs; i++) -+ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n); -+ } -+ -+ fill_pred_table(h, &slice_params->pred_weight_table); -+ -+ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; -+#if HEVC_CTRLS_VERSION <= 3 -+ if (slice_params->num_entry_point_offsets > 256) { -+ slice_params->num_entry_point_offsets = 256; -+ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); -+ } -+ -+ for (i = 0; i < slice_params->num_entry_point_offsets; i++) -+ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; -+#endif -+} -+ -+#if HEVC_CTRLS_VERSION >= 2 -+static void -+fill_decode_params(const HEVCContext * const h, -+ struct v4l2_ctrl_hevc_decode_params * const dec) -+{ -+ unsigned int i; -+ -+ *dec = (struct v4l2_ctrl_hevc_decode_params){ -+ .pic_order_cnt_val = h->poc, -+ .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, -+ .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, -+ .num_poc_lt_curr = h->rps[LT_CURR].nb_refs, -+ }; -+ -+ dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb); -+ -+ // The docn does seem to ask that we fit our 32 bit signed POC into -+ // a U8 so... 
(To be fair 16 bits would be enough) -+ // Luckily we (Pi) don't use these fields -+ for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i) -+ dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc; -+ for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i) -+ dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc; -+ for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i) -+ dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc; -+ -+ if (IS_IRAP(h)) -+ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC; -+ if (IS_IDR(h)) -+ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC; -+ if (h->sh.no_output_of_prior_pics_flag) -+ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR; -+ -+} -+#endif -+ -+static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps) -+{ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ -+ *ctrl = (struct v4l2_ctrl_hevc_sps) { -+ .chroma_format_idc = sps->chroma_format_idc, -+ .pic_width_in_luma_samples = sps->width, -+ .pic_height_in_luma_samples = sps->height, -+ .bit_depth_luma_minus8 = sps->bit_depth - 8, -+ .bit_depth_chroma_minus8 = sps->bit_depth - 8, -+ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, -+ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1, -+ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics, -+ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1, -+ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3, -+ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size, -+ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2, -+ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size, -+ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter, -+ .max_transform_hierarchy_depth_intra = 
sps->max_transform_hierarchy_depth_intra, -+ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1, -+ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1, -+ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3, -+ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size, -+ .num_short_term_ref_pic_sets = sps->nb_st_rps, -+ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps, -+ .chroma_format_idc = sps->chroma_format_idc, -+ .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1, -+ }; -+ -+ if (sps->separate_colour_plane_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; -+ -+ if (sps->scaling_list_enable_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; -+ -+ if (sps->amp_enabled_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED; -+ -+ if (sps->sao_enabled) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; -+ -+ if (sps->pcm_enabled_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; -+ -+ if (sps->pcm.loop_filter_disable_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; -+ -+ if (sps->long_term_ref_pics_present_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; -+ -+ if (sps->sps_temporal_mvp_enabled_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; -+ -+ if (sps->sps_strong_intra_smoothing_enable_flag) -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; -+} -+ -+static void fill_scaling_matrix(const ScalingList * const sl, -+ struct v4l2_ctrl_hevc_scaling_matrix * const sm) -+{ -+ unsigned int i; -+ -+ for (i = 0; i < 6; i++) { -+ unsigned int j; -+ -+ for (j = 0; j < 16; j++) -+ sm->scaling_list_4x4[i][j] = sl->sl[0][i][j]; -+ for (j = 0; j < 64; j++) { -+ sm->scaling_list_8x8[i][j] = sl->sl[1][i][j]; -+ sm->scaling_list_16x16[i][j] = sl->sl[2][i][j]; -+ if (i < 2) -+ sm->scaling_list_32x32[i][j] = sl->sl[3][i * 3][j]; -+ } -+ 
sm->scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i]; -+ if (i < 2) -+ sm->scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3]; -+ } -+} -+ -+static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps) -+{ -+ uint64_t flags = 0; -+ -+ if (pps->dependent_slice_segments_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED; -+ -+ if (pps->output_flag_present_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT; -+ -+ if (pps->sign_data_hiding_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED; -+ -+ if (pps->cabac_init_present_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT; -+ -+ if (pps->constrained_intra_pred_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED; -+ -+ if (pps->transform_skip_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED; -+ -+ if (pps->cu_qp_delta_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED; -+ -+ if (pps->pic_slice_level_chroma_qp_offsets_present_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT; -+ -+ if (pps->weighted_pred_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED; -+ -+ if (pps->weighted_bipred_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED; -+ -+ if (pps->transquant_bypass_enable_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED; -+ -+ if (pps->tiles_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED; -+ -+ if (pps->entropy_coding_sync_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED; -+ -+ if (pps->loop_filter_across_tiles_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; -+ -+ if (pps->seq_loop_filter_across_slices_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED; -+ -+ if (pps->deblocking_filter_override_enabled_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED; -+ -+ if (pps->disable_dbf) -+ flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER; -+ 
-+ if (pps->lists_modification_present_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT; -+ -+ if (pps->slice_header_extension_present_flag) -+ flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ -+ *ctrl = (struct v4l2_ctrl_hevc_pps) { -+ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, -+ .init_qp_minus26 = pps->pic_init_qp_minus26, -+ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth, -+ .pps_cb_qp_offset = pps->cb_qp_offset, -+ .pps_cr_qp_offset = pps->cr_qp_offset, -+ .pps_beta_offset_div2 = pps->beta_offset / 2, -+ .pps_tc_offset_div2 = pps->tc_offset / 2, -+ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2, -+ .flags = flags -+ }; -+ -+ -+ if (pps->tiles_enabled_flag) { -+ ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1; -+ ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1; -+ -+ for (int i = 0; i < pps->num_tile_columns; i++) -+ ctrl->column_width_minus1[i] = pps->column_width[i] - 1; -+ -+ for (int i = 0; i < pps->num_tile_rows; i++) -+ ctrl->row_height_minus1[i] = pps->row_height[i] - 1; -+ } -+} -+ -+// Called before finally returning the frame to the user -+// Set corrupt flag here as this is actually the frame structure that -+// is going to the user (in MT land each thread has its own pool) -+static int frame_post_process(void *logctx, AVFrame *frame) -+{ -+ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)frame->data[0]; -+ -+// av_log(NULL, AV_LOG_INFO, "%s\n", __func__); -+ frame->flags &= ~AV_FRAME_FLAG_CORRUPT; -+ if (rd->qe_dst) { -+ MediaBufsStatus stat = qent_dst_wait(rd->qe_dst); -+ if (stat != MEDIABUFS_STATUS_SUCCESS) { -+ av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__); -+ frame->flags |= AV_FRAME_FLAG_CORRUPT; -+ } -+ } -+ -+ return 0; -+} -+ -+static inline struct timeval cvt_dpb_to_tv(uint64_t t) -+{ -+ t /= 1000; -+ return (struct timeval){ -+ .tv_usec = t % 1000000, 
-+ .tv_sec = t / 1000000 -+ }; -+} -+ -+static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t) -+{ -+ return (uint64_t)t * 1000; -+} -+ -+static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, -+ av_unused const uint8_t *buffer, -+ av_unused uint32_t size) -+{ -+ const HEVCContext *h = avctx->priv_data; -+ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0]; -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ -+// av_log(NULL, AV_LOG_INFO, "%s\n", __func__); -+ decode_q_add(&ctx->decode_q, &rd->decode_ent); -+ -+ rd->num_slices = 0; -+ ctx->timestamp++; -+ rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp); -+ -+ { -+ FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data; -+ fdd->post_process = frame_post_process; -+ } -+ -+ // qe_dst needs to be bound to the data buffer and only returned when that is -+ if (!rd->qe_dst) -+ { -+ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__); -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame -+ -+ return 0; -+} -+ -+// Object fd & size will be zapped by this & need setting later -+static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format) -+{ -+ AVDRMLayerDescriptor *layer = &desc->layers[0]; -+ unsigned int width; -+ unsigned int height; -+ unsigned int bpl; -+ uint32_t pixelformat; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { -+ width = format->fmt.pix_mp.width; -+ height = format->fmt.pix_mp.height; -+ pixelformat = format->fmt.pix_mp.pixelformat; -+ bpl = format->fmt.pix_mp.plane_fmt[0].bytesperline; -+ } -+ else { -+ width = format->fmt.pix.width; -+ height = format->fmt.pix.height; -+ pixelformat = format->fmt.pix.pixelformat; -+ bpl = format->fmt.pix.bytesperline; -+ } -+ -+ switch 
(pixelformat) { -+ case V4L2_PIX_FMT_NV12: -+ layer->format = DRM_FORMAT_NV12; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ break; -+#if CONFIG_SAND -+ case V4L2_PIX_FMT_NV12_COL128: -+ layer->format = DRM_FORMAT_NV12; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); -+ break; -+ case V4L2_PIX_FMT_NV12_10_COL128: -+ layer->format = DRM_FORMAT_P030; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); -+ break; -+#endif -+#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED -+ case V4L2_PIX_FMT_SUNXI_TILED_NV12: -+ layer->format = DRM_FORMAT_NV12; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED; -+ break; -+#endif -+#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15) -+ case V4L2_PIX_FMT_NV15: -+ layer->format = DRM_FORMAT_NV15; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ break; -+#endif -+ case V4L2_PIX_FMT_NV16: -+ layer->format = DRM_FORMAT_NV16; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ break; -+#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20) -+ case V4L2_PIX_FMT_NV20: -+ layer->format = DRM_FORMAT_NV20; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ break; -+#endif -+ default: -+ return -1; -+ } -+ -+ desc->nb_objects = 1; -+ desc->objects[0].fd = -1; -+ desc->objects[0].size = 0; -+ -+ desc->nb_layers = 1; -+ layer->nb_planes = 2; -+ -+ layer->planes[0].object_index = 0; -+ layer->planes[0].offset = 0; -+ layer->planes[0].pitch = bpl; -+#if CONFIG_SAND -+ if (pixelformat == V4L2_PIX_FMT_NV12_COL128) { -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = height * 128; -+ layer->planes[0].pitch = width; -+ layer->planes[1].pitch = width; -+ } -+ else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = height * 128; -+ layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy -+ 
layer->planes[1].pitch = width * 2; -+ } -+ else -+#endif -+ { -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = layer->planes[0].pitch * height; -+ layer->planes[1].pitch = layer->planes[0].pitch; -+ } -+ -+ return 0; -+} -+ -+static int -+set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, -+ struct req_controls *const controls, -+#if HEVC_CTRLS_VERSION >= 2 -+ struct v4l2_ctrl_hevc_decode_params * const dec, -+#endif -+ struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count, -+ void * const offsets, const size_t offset_count) -+{ -+ int rv; -+#if HEVC_CTRLS_VERSION >= 2 -+ unsigned int n = 3; -+#else -+ unsigned int n = 2; -+#endif -+ -+ struct v4l2_ext_control control[6] = { -+ { -+ .id = V4L2_CID_STATELESS_HEVC_SPS, -+ .ptr = &controls->sps, -+ .size = sizeof(controls->sps), -+ }, -+ { -+ .id = V4L2_CID_STATELESS_HEVC_PPS, -+ .ptr = &controls->pps, -+ .size = sizeof(controls->pps), -+ }, -+#if HEVC_CTRLS_VERSION >= 2 -+ { -+ .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS, -+ .ptr = dec, -+ .size = sizeof(*dec), -+ }, -+#endif -+ }; -+ -+ if (slices) -+ control[n++] = (struct v4l2_ext_control) { -+ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, -+ .ptr = slices, -+ .size = sizeof(*slices) * slice_count, -+ }; -+ -+ if (controls->has_scaling) -+ control[n++] = (struct v4l2_ext_control) { -+ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX, -+ .ptr = &controls->scaling_matrix, -+ .size = sizeof(controls->scaling_matrix), -+ }; -+ -+#if HEVC_CTRLS_VERSION >= 4 -+ if (offsets) -+ control[n++] = (struct v4l2_ext_control) { -+ .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, -+ .ptr = offsets, -+ .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count, -+ }; -+#endif -+ -+ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n); -+ -+ return rv; -+} -+ -+// This only works because we started out from a single coded frame buffer -+// that will remain intact until after 
end_frame -+static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) -+{ -+ const HEVCContext * const h = avctx->priv_data; -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0]; -+ int bcount = get_bits_count(&h->HEVClc->gb); -+ uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; -+ -+ const unsigned int n = rd->num_slices; -+ const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices; -+ -+ int rv; -+ struct slice_info * si; -+ -+ // This looks dodgy but we know that FFmpeg has parsed this from a buffer -+ // that contains the entire frame including the start code -+ if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { -+ buffer -= 3; -+ size += 3; -+ boff += 24; -+ if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) { -+ av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n", -+ buffer[0], buffer[1], buffer[2]); -+ } -+ } -+ -+ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { -+ if (rd->slices == NULL) { -+ if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL) -+ return AVERROR(ENOMEM); -+ rd->slices->ptr = buffer; -+ rd->num_slices = 1; -+ } -+ rd->slices->len = buffer - rd->slices->ptr + size; -+ return 0; -+ } -+ -+ if ((rv = slice_add(rd)) != 0) -+ return rv; -+ -+ si = rd->slices + n; -+ si->ptr = buffer; -+ si->len = size; -+ si->n_offsets = rd->num_offsets; -+ -+ if (n != block_start) { -+ struct slice_info *const si0 = rd->slices + block_start; -+ const size_t offset = (buffer - si0->ptr); -+ boff += offset * 8; -+ size += offset; -+ si0->len = si->len + offset; -+ } -+ -+#if HEVC_CTRLS_VERSION >= 2 -+ if (n == 0) -+ fill_decode_params(h, &rd->dec); -+ fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff); -+#else -+ fill_slice_params(h, rd->slice_params + n, size 
* 8, boff); -+#endif -+ if (ctx->max_offsets != 0 && -+ (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0) -+ return rv; -+ -+ return 0; -+} -+ -+static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx) -+{ -+ const HEVCContext * const h = avctx->priv_data; -+ if (h->ref != NULL) { -+ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0]; -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ -+ media_request_abort(&rd->req); -+ mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src); -+ -+ decode_q_remove(&ctx->decode_q, &rd->decode_ent); -+ } -+} -+ -+static int send_slice(AVCodecContext * const avctx, -+ V4L2MediaReqDescriptor * const rd, -+ struct req_controls *const controls, -+ const unsigned int i, const unsigned int j) -+{ -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ -+ const int is_last = (j == rd->num_slices); -+ struct slice_info *const si = rd->slices + i; -+ struct media_request * req = NULL; -+ struct qent_src * src = NULL; -+ MediaBufsStatus stat; -+ void * offsets = rd->offsets + rd->slices[i].n_offsets; -+ size_t n_offsets = (is_last ? 
rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets; -+ -+ if ((req = media_request_get(ctx->mpool)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__); -+ return AVERROR(ENOMEM); -+ } -+ -+ if (set_req_ctls(ctx, req, -+ controls, -+#if HEVC_CTRLS_VERSION >= 2 -+ &rd->dec, -+#endif -+ rd->slice_params + i, j - i, -+ offsets, n_offsets)) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__); -+ goto fail1; -+ } -+ -+ if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__); -+ goto fail1; -+ } -+ -+ if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__); -+ goto fail2; -+ } -+ -+ if (qent_src_params_set(src, &controls->tv)) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__); -+ goto fail2; -+ } -+ -+ stat = mediabufs_start_request(ctx->mbufs, &req, &src, -+ i == 0 ? rd->qe_dst : NULL, -+ is_last); -+ -+ if (stat != MEDIABUFS_STATUS_SUCCESS) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__); -+ return AVERROR_UNKNOWN; -+ } -+ return 0; -+ -+fail2: -+ mediabufs_src_qent_abort(ctx->mbufs, &src); -+fail1: -+ media_request_abort(&req); -+ return AVERROR_UNKNOWN; -+} -+ -+static int v4l2_request_hevc_end_frame(AVCodecContext *avctx) -+{ -+ const HEVCContext * const h = avctx->priv_data; -+ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0]; -+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; -+ struct req_controls rc; -+ unsigned int i; -+ int rv; -+ -+ // It is possible, though maybe a bug, to get an end_frame without -+ // a previous start_frame. If we do then give up. 
-+ if (!decode_q_in_q(&rd->decode_ent)) { -+ av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ { -+ const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ? -+ &h->ps.pps->scaling_list : -+ h->ps.sps->scaling_list_enable_flag ? -+ &h->ps.sps->scaling_list : NULL; -+ -+ -+ memset(&rc, 0, sizeof(rc)); -+ rc.tv = cvt_dpb_to_tv(rd->timestamp); -+ fill_sps(&rc.sps, h->ps.sps); -+ fill_pps(&rc.pps, h->ps.pps); -+ if (sl) { -+ rc.has_scaling = 1; -+ fill_scaling_matrix(sl, &rc.scaling_matrix); -+ } -+ } -+ -+ decode_q_wait(&ctx->decode_q, &rd->decode_ent); -+ -+ // qe_dst needs to be bound to the data buffer and only returned when that is -+ // Alloc almost certainly wants to be serialised if there is any chance of blocking -+ // so we get the next frame to be free in the thread that needs it for decode first. -+ // -+ // In our current world this probably isn't a concern but put it here anyway -+ if (!rd->qe_dst) -+ { -+ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__); -+ rv = AVERROR(ENOMEM); -+ goto fail; -+ } -+ } -+ -+ // Send as slices -+ for (i = 0; i < rd->num_slices; i += ctx->max_slices) { -+ const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices); -+ if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0) -+ goto fail; -+ } -+ -+ // Set the drm_prime desriptor -+ drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs)); -+ rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0)); -+ rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0)); -+ -+ decode_q_remove(&ctx->decode_q, &rd->decode_ent); -+ return 0; -+ -+fail: -+ decode_q_remove(&ctx->decode_q, &rd->decode_ent); -+ return rv; -+} -+ -+static inline int -+ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v) -+{ -+ return v >= c->minimum && v <= c->maximum; -+} -+ -+// Initial check & 
init -+static int -+probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) -+{ -+ const HEVCContext *h = avctx->priv_data; -+ const HEVCSPS * const sps = h->ps.sps; -+ struct v4l2_ctrl_hevc_sps ctrl_sps; -+ unsigned int i; -+ -+ // Check for var slice array -+ struct v4l2_query_ext_ctrl qc[] = { -+ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS }, -+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_STATELESS_HEVC_SPS }, -+ { .id = V4L2_CID_STATELESS_HEVC_PPS }, -+ { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX }, -+#if HEVC_CTRLS_VERSION >= 2 -+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS }, -+#endif -+ }; -+ // Order & size must match! -+ static const size_t ctrl_sizes[] = { -+ sizeof(struct v4l2_ctrl_hevc_slice_params), -+ sizeof(int32_t), -+ sizeof(struct v4l2_ctrl_hevc_sps), -+ sizeof(struct v4l2_ctrl_hevc_pps), -+ sizeof(struct v4l2_ctrl_hevc_scaling_matrix), -+#if HEVC_CTRLS_VERSION >= 2 -+ sizeof(struct v4l2_ctrl_hevc_decode_params), -+#endif -+ }; -+ const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc); -+ -+#if HEVC_CTRLS_VERSION == 2 -+ if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0)) -+ return AVERROR(EINVAL); -+#elif HEVC_CTRLS_VERSION == 3 -+ if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0)) -+ return AVERROR(EINVAL); -+#endif -+ -+ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls); -+ i = 0; -+#if HEVC_CTRLS_VERSION >= 4 -+ // Skip slice check if no slice mode -+ if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) -+ i = 1; -+#else -+ // Fail frame mode silently for anything prior to V4 -+ if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) -+ return AVERROR(EINVAL); -+#endif -+ for (; i != noof_ctrls; ++i) { -+ if (qc[i].type == 0) { -+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id); -+ return AVERROR(EINVAL); -+ } -+ if 
(ctrl_sizes[i] != (size_t)qc[i].elem_size) { -+ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n", -+ HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size); -+ return AVERROR(EINVAL); -+ } -+ } -+ -+ fill_sps(&ctrl_sps, sps); -+ -+ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ return 0; -+} -+ -+// Final init -+static int -+set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) -+{ -+ int ret; -+ -+ struct v4l2_query_ext_ctrl querys[] = { -+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, -+ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, }, -+#if HEVC_CTRLS_VERSION >= 4 -+ { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, }, -+#endif -+ }; -+ -+ struct v4l2_ext_control ctrls[] = { -+ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, -+ }; -+ -+ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); -+ -+ ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) || -+ querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ? -+ 1 : querys[2].dims[0]; -+ av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices); -+ -+#if HEVC_CTRLS_VERSION >= 4 -+ ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ? 
-+ 0 : querys[3].dims[0]; -+ av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); -+#else -+ ctx->max_offsets = 0; -+#endif -+ -+ if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED || -+ querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) -+ ctx->decode_mode = querys[0].default_value; -+ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)) -+ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED; -+ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) -+ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED; -+ else { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__); -+ return AVERROR(EINVAL); -+ } -+ -+ if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE || -+ querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) -+ ctx->start_code = querys[1].default_value; -+ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) -+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; -+ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) -+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; -+ else { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); -+ return AVERROR(EINVAL); -+ } -+ -+ // If we are in slice mode & START_CODE_NONE supported then pick that -+ // as it doesn't require the slightly dodgy look backwards in our raw buffer -+ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && -+ ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) -+ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; -+ -+ ctrls[0].value = ctx->decode_mode; -+ ctrls[1].value = ctx->start_code; -+ -+ ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls)); -+ return !ret ? 
0 : AVERROR(-ret); -+} -+ -+static void v4l2_req_frame_free(void *opaque, uint8_t *data) -+{ -+ AVCodecContext *avctx = opaque; -+ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data; -+ -+ av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data); -+ -+ qent_dst_unref(&rd->qe_dst); -+ -+ // We don't expect req or qe_src to be set -+ if (rd->req || rd->qe_src) -+ av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->req, rd->qe_src); -+ -+ av_freep(&rd->slices); -+ av_freep(&rd->slice_params); -+ av_freep(&rd->offsets); -+ -+ av_free(rd); -+} -+ -+static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size) -+{ -+ AVCodecContext *avctx = opaque; -+// V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; -+// V4L2MediaReqDescriptor *req; -+ AVBufferRef *ref; -+ uint8_t *data; -+// int ret; -+ -+ data = av_mallocz(size); -+ if (!data) -+ return NULL; -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data); -+ ref = av_buffer_create(data, size, v4l2_req_frame_free, avctx, 0); -+ if (!ref) { -+ av_freep(&data); -+ return NULL; -+ } -+ return ref; -+} -+ -+#if 0 -+static void v4l2_req_pool_free(void *opaque) -+{ -+ av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque); -+} -+ -+static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc) -+{ -+ av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool); -+ -+ av_buffer_pool_uninit(&hwfc->pool); -+} -+#endif -+ -+static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) -+{ -+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; -+ AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data; -+ const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs); -+ -+ hwfc->format = AV_PIX_FMT_DRM_PRIME; -+ hwfc->sw_format = pixel_format_from_format(vfmt); -+ if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) { -+ hwfc->width = vfmt->fmt.pix_mp.width; -+ 
hwfc->height = vfmt->fmt.pix_mp.height; -+ } else { -+ hwfc->width = vfmt->fmt.pix.width; -+ hwfc->height = vfmt->fmt.pix.height; -+ } -+#if 0 -+ hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free); -+ if (!hwfc->pool) -+ return AVERROR(ENOMEM); -+ -+ hwfc->free = v4l2_req_hwframe_ctx_free; -+ -+ hwfc->initial_pool_size = 1; -+ -+ switch (avctx->codec_id) { -+ case AV_CODEC_ID_VP9: -+ hwfc->initial_pool_size += 8; -+ break; -+ case AV_CODEC_ID_VP8: -+ hwfc->initial_pool_size += 3; -+ break; -+ default: -+ hwfc->initial_pool_size += 2; -+ } -+#endif -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size); -+ -+ return 0; -+} -+ -+static int alloc_frame(AVCodecContext * avctx, AVFrame *frame) -+{ -+ int rv; -+ -+ frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor)); -+ if (!frame->buf[0]) -+ return AVERROR(ENOMEM); -+ -+ frame->data[0] = frame->buf[0]->data; -+ -+ frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx); -+ -+ if ((rv = ff_attach_decode_data(frame)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n"); -+ av_frame_unref(frame); -+ return rv; -+ } -+ -+ return 0; -+} -+ -+const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = { -+ .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE, -+ .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION), -+ .probe = probe, -+ .set_controls = set_controls, -+ -+ .start_frame = v4l2_request_hevc_start_frame, -+ .decode_slice = v4l2_request_hevc_decode_slice, -+ .end_frame = v4l2_request_hevc_end_frame, -+ .abort_frame = v4l2_request_hevc_abort_frame, -+ .frame_params = frame_params, -+ .alloc_frame = alloc_frame, -+}; -+ -diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c -new file mode 100644 -index 
0000000000..1a9944774a ---- /dev/null -+++ b/libavcodec/v4l2_req_media.c -@@ -0,0 +1,1802 @@ -+/* -+ * Copyright (C) 2018 Paul Kocialkowski -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the -+ * "Software"), to deal in the Software without restriction, including -+ * without limitation the rights to use, copy, modify, merge, publish, -+ * distribute, sub license, and/or sell copies of the Software, and to -+ * permit persons to whom the Software is furnished to do so, subject to -+ * the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the -+ * next paragraph) shall be included in all copies or substantial portions -+ * of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. -+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR -+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include "v4l2_req_dmabufs.h" -+#include "v4l2_req_media.h" -+#include "v4l2_req_pollqueue.h" -+#include "v4l2_req_utils.h" -+#include "weak_link.h" -+ -+ -+/* floor(log2(x)) */ -+static unsigned int log2_size(size_t x) -+{ -+ unsigned int n = 0; -+ -+ if (x & ~0xffff) { -+ n += 16; -+ x >>= 16; -+ } -+ if (x & ~0xff) { -+ n += 8; -+ x >>= 8; -+ } -+ if (x & ~0xf) { -+ n += 4; -+ x >>= 4; -+ } -+ if (x & ~3) { -+ n += 2; -+ x >>= 2; -+ } -+ return (x & ~1) ? 
n + 1 : n; -+} -+ -+static size_t round_up_size(const size_t x) -+{ -+ /* Admit no size < 256 */ -+ const unsigned int n = x < 256 ? 8 : log2_size(x) - 1; -+ -+ return x >= (3 << n) ? 4 << n : (3 << n); -+} -+ -+struct media_request; -+ -+struct media_pool { -+ int fd; -+ sem_t sem; -+ pthread_mutex_t lock; -+ struct media_request * free_reqs; -+ struct pollqueue * pq; -+}; -+ -+struct media_request { -+ struct media_request * next; -+ struct media_pool * mp; -+ int fd; -+ struct polltask * pt; -+}; -+ -+static inline enum v4l2_memory -+mediabufs_memory_to_v4l2(const enum mediabufs_memory m) -+{ -+ return (enum v4l2_memory)m; -+} -+ -+const char * -+mediabufs_memory_name(const enum mediabufs_memory m) -+{ -+ switch (m) { -+ case MEDIABUFS_MEMORY_UNSET: -+ return "Unset"; -+ case MEDIABUFS_MEMORY_MMAP: -+ return "MMap"; -+ case MEDIABUFS_MEMORY_USERPTR: -+ return "UserPtr"; -+ case MEDIABUFS_MEMORY_OVERLAY: -+ return "Overlay"; -+ case MEDIABUFS_MEMORY_DMABUF: -+ return "DMABuf"; -+ default: -+ break; -+ } -+ return "Unknown"; -+} -+ -+ -+static inline int do_trywait(sem_t *const sem) -+{ -+ while (sem_trywait(sem)) { -+ if (errno != EINTR) -+ return -errno; -+ } -+ return 0; -+} -+ -+static inline int do_wait(sem_t *const sem) -+{ -+ while (sem_wait(sem)) { -+ if (errno != EINTR) -+ return -errno; -+ } -+ return 0; -+} -+ -+static int request_buffers(int video_fd, unsigned int type, -+ enum mediabufs_memory memory, unsigned int buffers_count) -+{ -+ struct v4l2_requestbuffers buffers; -+ int rc; -+ -+ memset(&buffers, 0, sizeof(buffers)); -+ buffers.type = type; -+ buffers.memory = mediabufs_memory_to_v4l2(memory); -+ buffers.count = buffers_count; -+ -+ rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers); -+ if (rc < 0) { -+ rc = -errno; -+ request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(-rc)); -+ return rc; -+ } -+ -+ return 0; -+} -+ -+ -+static int set_stream(int video_fd, unsigned int type, bool enable) -+{ -+ enum 
v4l2_buf_type buf_type = type; -+ int rc; -+ -+ rc = ioctl(video_fd, enable ? VIDIOC_STREAMON : VIDIOC_STREAMOFF, -+ &buf_type); -+ if (rc < 0) { -+ rc = -errno; -+ request_log("Unable to %sable stream: %s\n", -+ enable ? "en" : "dis", strerror(-rc)); -+ return rc; -+ } -+ -+ return 0; -+} -+ -+ -+ -+struct media_request * media_request_get(struct media_pool * const mp) -+{ -+ struct media_request *req = NULL; -+ -+ /* Timeout handled by poll code */ -+ if (do_wait(&mp->sem)) -+ return NULL; -+ -+ pthread_mutex_lock(&mp->lock); -+ req = mp->free_reqs; -+ if (req) { -+ mp->free_reqs = req->next; -+ req->next = NULL; -+ } -+ pthread_mutex_unlock(&mp->lock); -+ return req; -+} -+ -+int media_request_fd(const struct media_request * const req) -+{ -+ return req->fd; -+} -+ -+int media_request_start(struct media_request * const req) -+{ -+ while (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) == -1) -+ { -+ const int err = errno; -+ if (err == EINTR) -+ continue; -+ request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err)); -+ return -err; -+ } -+ -+ pollqueue_add_task(req->pt, 2000); -+ return 0; -+} -+ -+static void media_request_done(void *v, short revents) -+{ -+ struct media_request *const req = v; -+ struct media_pool *const mp = req->mp; -+ -+ /* ** Not sure what to do about timeout */ -+ -+ if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0) -+ request_log("Unable to reinit media request: %s\n", -+ strerror(errno)); -+ -+ pthread_mutex_lock(&mp->lock); -+ req->next = mp->free_reqs; -+ mp->free_reqs = req; -+ pthread_mutex_unlock(&mp->lock); -+ sem_post(&mp->sem); -+} -+ -+int media_request_abort(struct media_request ** const preq) -+{ -+ struct media_request * const req = *preq; -+ -+ if (req == NULL) -+ return 0; -+ *preq = NULL; -+ -+ media_request_done(req, 0); -+ return 0; -+} -+ -+static void delete_req_chain(struct media_request * const chain) -+{ -+ struct media_request * next = chain; -+ while (next) { -+ struct media_request * 
const req = next; -+ next = req->next; -+ if (req->pt) -+ polltask_delete(&req->pt); -+ if (req->fd != -1) -+ close(req->fd); -+ free(req); -+ } -+} -+ -+struct media_pool * media_pool_new(const char * const media_path, -+ struct pollqueue * const pq, -+ const unsigned int n) -+{ -+ struct media_pool * const mp = calloc(1, sizeof(*mp)); -+ unsigned int i; -+ -+ if (!mp) -+ goto fail0; -+ -+ mp->pq = pq; -+ pthread_mutex_init(&mp->lock, NULL); -+ mp->fd = open(media_path, O_RDWR | O_NONBLOCK); -+ if (mp->fd == -1) { -+ request_log("Failed to open '%s': %s\n", media_path, strerror(errno)); -+ goto fail1; -+ } -+ -+ for (i = 0; i != n; ++i) { -+ struct media_request * req = malloc(sizeof(*req)); -+ if (!req) -+ goto fail4; -+ -+ *req = (struct media_request){ -+ .next = mp->free_reqs, -+ .mp = mp, -+ .fd = -1 -+ }; -+ mp->free_reqs = req; -+ -+ if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) { -+ request_log("Failed to alloc request %d: %s\n", i, strerror(errno)); -+ goto fail4; -+ } -+ -+ req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req); -+ if (!req->pt) -+ goto fail4; -+ } -+ -+ sem_init(&mp->sem, 0, n); -+ -+ return mp; -+ -+fail4: -+ delete_req_chain(mp->free_reqs); -+ close(mp->fd); -+ pthread_mutex_destroy(&mp->lock); -+fail1: -+ free(mp); -+fail0: -+ return NULL; -+} -+ -+void media_pool_delete(struct media_pool ** pMp) -+{ -+ struct media_pool * const mp = *pMp; -+ -+ if (!mp) -+ return; -+ *pMp = NULL; -+ -+ delete_req_chain(mp->free_reqs); -+ close(mp->fd); -+ sem_destroy(&mp->sem); -+ pthread_mutex_destroy(&mp->lock); -+ free(mp); -+} -+ -+ -+#define INDEX_UNSET (~(uint32_t)0) -+ -+enum qent_status { -+ QENT_NEW = 0, // Initial state - shouldn't last -+ QENT_FREE, // On free chain -+ QENT_PENDING, // User has ent -+ QENT_WAITING, // On inuse -+ QENT_DONE, // Frame rx -+ QENT_ERROR, // Error -+ QENT_IMPORT -+}; -+ -+struct qent_base { -+ atomic_int ref_count; -+ struct qent_base *next; -+ struct qent_base *prev; -+ enum 
qent_status status; -+ enum mediabufs_memory memtype; -+ uint32_t index; -+ struct dmabuf_h *dh[VIDEO_MAX_PLANES]; -+ struct timeval timestamp; -+}; -+ -+struct qent_src { -+ struct qent_base base; -+ int fixed_size; -+}; -+ -+struct qent_dst { -+ struct qent_base base; -+ bool waiting; -+ pthread_mutex_t lock; -+ pthread_cond_t cond; -+ struct ff_weak_link_client * mbc_wl; -+}; -+ -+struct qe_list_head { -+ struct qent_base *head; -+ struct qent_base *tail; -+}; -+ -+struct buf_pool { -+ enum mediabufs_memory memtype; -+ pthread_mutex_t lock; -+ sem_t free_sem; -+ struct qe_list_head free; -+ struct qe_list_head inuse; -+}; -+ -+ -+static inline struct qent_dst *base_to_dst(struct qent_base *be) -+{ -+ return (struct qent_dst *)be; -+} -+ -+static inline struct qent_src *base_to_src(struct qent_base *be) -+{ -+ return (struct qent_src *)be; -+} -+ -+ -+#define QENT_BASE_INITIALIZER(mtype) {\ -+ .ref_count = ATOMIC_VAR_INIT(0),\ -+ .status = QENT_NEW,\ -+ .memtype = (mtype),\ -+ .index = INDEX_UNSET\ -+} -+ -+static void qe_base_uninit(struct qent_base *const be) -+{ -+ unsigned int i; -+ for (i = 0; i != VIDEO_MAX_PLANES; ++i) { -+ dmabuf_free(be->dh[i]); -+ be->dh[i] = NULL; -+ } -+} -+ -+static void qe_src_free(struct qent_src *const be_src) -+{ -+ if (!be_src) -+ return; -+ qe_base_uninit(&be_src->base); -+ free(be_src); -+} -+ -+static struct qent_src * qe_src_new(enum mediabufs_memory mtype) -+{ -+ struct qent_src *const be_src = malloc(sizeof(*be_src)); -+ if (!be_src) -+ return NULL; -+ *be_src = (struct qent_src){ -+ .base = QENT_BASE_INITIALIZER(mtype) -+ }; -+ return be_src; -+} -+ -+static void qe_dst_free(struct qent_dst *const be_dst) -+{ -+ if (!be_dst) -+ return; -+ -+ ff_weak_link_unref(&be_dst->mbc_wl); -+ pthread_cond_destroy(&be_dst->cond); -+ pthread_mutex_destroy(&be_dst->lock); -+ qe_base_uninit(&be_dst->base); -+ free(be_dst); -+} -+ -+static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl, const enum mediabufs_memory 
memtype) -+{ -+ struct qent_dst *const be_dst = malloc(sizeof(*be_dst)); -+ if (!be_dst) -+ return NULL; -+ *be_dst = (struct qent_dst){ -+ .base = QENT_BASE_INITIALIZER(memtype), -+ .lock = PTHREAD_MUTEX_INITIALIZER, -+ .cond = PTHREAD_COND_INITIALIZER, -+ .mbc_wl = ff_weak_link_ref(wl) -+ }; -+ return be_dst; -+} -+ -+static void ql_add_tail(struct qe_list_head * const ql, struct qent_base * be) -+{ -+ if (ql->tail) -+ ql->tail->next = be; -+ else -+ ql->head = be; -+ be->prev = ql->tail; -+ be->next = NULL; -+ ql->tail = be; -+} -+ -+static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be) -+{ -+ if (!be) -+ return NULL; -+ -+ if (be->next) -+ be->next->prev = be->prev; -+ else -+ ql->tail = be->prev; -+ if (be->prev) -+ be->prev->next = be->next; -+ else -+ ql->head = be->next; -+ be->next = NULL; -+ be->prev = NULL; -+ return be; -+} -+ -+ -+static void bq_put_free(struct buf_pool *const bp, struct qent_base * be) -+{ -+ ql_add_tail(&bp->free, be); -+} -+ -+static struct qent_base * bq_get_free(struct buf_pool *const bp) -+{ -+ return ql_extract(&bp->free, bp->free.head); -+} -+ -+static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be) -+{ -+ return ql_extract(&bp->inuse, be); -+} -+ -+static struct qent_base * bq_get_inuse(struct buf_pool *const bp) -+{ -+ return ql_extract(&bp->inuse, bp->inuse.head); -+} -+ -+static void bq_free_all_free_src(struct buf_pool *const bp) -+{ -+ struct qent_base *be; -+ while ((be = bq_get_free(bp)) != NULL) -+ qe_src_free(base_to_src(be)); -+} -+ -+static void bq_free_all_inuse_src(struct buf_pool *const bp) -+{ -+ struct qent_base *be; -+ while ((be = bq_get_inuse(bp)) != NULL) -+ qe_src_free(base_to_src(be)); -+} -+ -+static void bq_free_all_free_dst(struct buf_pool *const bp) -+{ -+ struct qent_base *be; -+ while ((be = bq_get_free(bp)) != NULL) -+ qe_dst_free(base_to_dst(be)); -+} -+ -+static void queue_put_free(struct buf_pool *const bp, struct 
qent_base *be) -+{ -+ unsigned int i; -+ -+ pthread_mutex_lock(&bp->lock); -+ /* Clear out state vars */ -+ be->timestamp.tv_sec = 0; -+ be->timestamp.tv_usec = 0; -+ be->status = QENT_FREE; -+ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) -+ dmabuf_len_set(be->dh[i], 0); -+ bq_put_free(bp, be); -+ pthread_mutex_unlock(&bp->lock); -+ sem_post(&bp->free_sem); -+} -+ -+static bool queue_is_inuse(const struct buf_pool *const bp) -+{ -+ return bp->inuse.tail != NULL; -+} -+ -+static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be) -+{ -+ if (!be) -+ return; -+ pthread_mutex_lock(&bp->lock); -+ ql_add_tail(&bp->inuse, be); -+ be->status = QENT_WAITING; -+ pthread_mutex_unlock(&bp->lock); -+} -+ -+static struct qent_base *queue_get_free(struct buf_pool *const bp) -+{ -+ struct qent_base *buf; -+ -+ if (do_wait(&bp->free_sem)) -+ return NULL; -+ pthread_mutex_lock(&bp->lock); -+ buf = bq_get_free(bp); -+ pthread_mutex_unlock(&bp->lock); -+ return buf; -+} -+ -+static struct qent_base *queue_tryget_free(struct buf_pool *const bp) -+{ -+ struct qent_base *buf; -+ -+ if (do_trywait(&bp->free_sem)) -+ return NULL; -+ pthread_mutex_lock(&bp->lock); -+ buf = bq_get_free(bp); -+ pthread_mutex_unlock(&bp->lock); -+ return buf; -+} -+ -+static struct qent_base * queue_find_extract_index(struct buf_pool *const bp, const unsigned int index) -+{ -+ struct qent_base *be; -+ -+ pthread_mutex_lock(&bp->lock); -+ /* Expect 1st in Q, but allow anywhere */ -+ for (be = bp->inuse.head; be; be = be->next) { -+ if (be->index == index) { -+ bq_extract_inuse(bp, be); -+ break; -+ } -+ } -+ pthread_mutex_unlock(&bp->lock); -+ -+ return be; -+} -+ -+static void queue_delete(struct buf_pool *const bp) -+{ -+ sem_destroy(&bp->free_sem); -+ pthread_mutex_destroy(&bp->lock); -+ free(bp); -+} -+ -+static struct buf_pool* queue_new(const int vfd) -+{ -+ struct buf_pool *bp = calloc(1, sizeof(*bp)); -+ if (!bp) -+ return NULL; -+ pthread_mutex_init(&bp->lock, NULL); -+ 
sem_init(&bp->free_sem, 0, 0); -+ return bp; -+} -+ -+ -+struct mediabufs_ctl { -+ atomic_int ref_count; /* 0 is single ref for easier atomics */ -+ void * dc; -+ int vfd; -+ bool stream_on; -+ bool polling; -+ bool dst_fixed; // Dst Q is fixed size -+ pthread_mutex_t lock; -+ struct buf_pool * src; -+ struct buf_pool * dst; -+ struct polltask * pt; -+ struct pollqueue * pq; -+ struct ff_weak_link_master * this_wlm; -+ -+ enum mediabufs_memory src_memtype; -+ enum mediabufs_memory dst_memtype; -+ struct v4l2_format src_fmt; -+ struct v4l2_format dst_fmt; -+ struct v4l2_capability capability; -+}; -+ -+static int qe_v4l2_queue(struct qent_base *const be, -+ const int vfd, struct media_request *const mreq, -+ const struct v4l2_format *const fmt, -+ const bool is_dst, const bool hold_flag) -+{ -+ struct v4l2_buffer buffer = { -+ .type = fmt->type, -+ .memory = mediabufs_memory_to_v4l2(be->memtype), -+ .index = be->index -+ }; -+ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ unsigned int i; -+ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) { -+ if (is_dst) -+ dmabuf_len_set(be->dh[i], 0); -+ -+ /* *** Really need a pixdesc rather than a format so we can fill in data_offset */ -+ planes[i].length = dmabuf_size(be->dh[i]); -+ planes[i].bytesused = dmabuf_len(be->dh[i]); -+ if (be->memtype == MEDIABUFS_MEMORY_DMABUF) -+ planes[i].m.fd = dmabuf_fd(be->dh[i]); -+ else -+ planes[i].m.mem_offset = 0; -+ } -+ buffer.m.planes = planes; -+ buffer.length = i; -+ } -+ else { -+ if (is_dst) -+ dmabuf_len_set(be->dh[0], 0); -+ -+ buffer.bytesused = dmabuf_len(be->dh[0]); -+ buffer.length = dmabuf_size(be->dh[0]); -+ if (be->memtype == MEDIABUFS_MEMORY_DMABUF) -+ buffer.m.fd = dmabuf_fd(be->dh[0]); -+ else -+ buffer.m.offset = 0; -+ } -+ -+ if (!is_dst && mreq) { -+ buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD; -+ buffer.request_fd = media_request_fd(mreq); -+ if (hold_flag) -+ buffer.flags |= 
V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF; -+ } -+ -+ if (is_dst) -+ be->timestamp = (struct timeval){0,0}; -+ -+ buffer.timestamp = be->timestamp; -+ -+ while (ioctl(vfd, VIDIOC_QBUF, &buffer)) { -+ const int err = errno; -+ if (err != EINTR) { -+ request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err)); -+ return -err; -+ } -+ } -+ return 0; -+} -+ -+static struct qent_base * qe_dequeue(struct buf_pool *const bp, -+ const int vfd, -+ const struct v4l2_format * const f) -+{ -+ struct qent_base *be; -+ int rc; -+ const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type); -+ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; -+ struct v4l2_buffer buffer = { -+ .type = f->type, -+ .memory = mediabufs_memory_to_v4l2(bp->memtype) -+ }; -+ if (mp) { -+ buffer.length = f->fmt.pix_mp.num_planes; -+ buffer.m.planes = planes; -+ } -+ -+ while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 && -+ errno == EINTR) -+ /* Loop */; -+ if (rc) { -+ request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno)); -+ return NULL; -+ } -+ -+ be = queue_find_extract_index(bp, buffer.index); -+ if (!be) { -+ request_log("Failed to find index %d in Q\n", buffer.index); -+ return NULL; -+ } -+ -+ if (mp) { -+ unsigned int i; -+ for (i = 0; i != buffer.length; ++i) -+ dmabuf_len_set(be->dh[i], V4L2_TYPE_IS_CAPTURE(f->type) ? planes[i].bytesused : 0); -+ } -+ else -+ dmabuf_len_set(be->dh[0], V4L2_TYPE_IS_CAPTURE(f->type) ? buffer.length : 0); -+ -+ be->timestamp = buffer.timestamp; -+ be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? 
QENT_ERROR : QENT_DONE; -+ return be; -+} -+ -+static void qe_dst_done(struct qent_dst * dst_be) -+{ -+ pthread_mutex_lock(&dst_be->lock); -+ dst_be->waiting = false; -+ pthread_cond_broadcast(&dst_be->cond); -+ pthread_mutex_unlock(&dst_be->lock); -+ -+ qent_dst_unref(&dst_be); -+} -+ -+static bool qe_dst_waiting(struct qent_dst *const dst_be) -+{ -+ bool waiting; -+ pthread_mutex_lock(&dst_be->lock); -+ waiting = dst_be->waiting; -+ dst_be->waiting = true; -+ pthread_mutex_unlock(&dst_be->lock); -+ return waiting; -+} -+ -+ -+static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc) -+{ -+ return queue_is_inuse(mbc->src) || queue_is_inuse(mbc->dst); -+} -+ -+static void mediabufs_poll_cb(void * v, short revents) -+{ -+ struct mediabufs_ctl *mbc = v; -+ struct qent_src *src_be = NULL; -+ struct qent_dst *dst_be = NULL; -+ -+ if (!revents) -+ request_err(mbc->dc, "%s: Timeout\n", __func__); -+ -+ pthread_mutex_lock(&mbc->lock); -+ mbc->polling = false; -+ -+ if ((revents & POLLOUT) != 0) -+ src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt)); -+ if ((revents & POLLIN) != 0) -+ dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt)); -+ -+ /* Reschedule */ -+ if (mediabufs_wants_poll(mbc)) { -+ mbc->polling = true; -+ pollqueue_add_task(mbc->pt, 2000); -+ } -+ pthread_mutex_unlock(&mbc->lock); -+ -+ if (src_be) -+ queue_put_free(mbc->src, &src_be->base); -+ if (dst_be) -+ qe_dst_done(dst_be); -+} -+ -+int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp) -+{ -+ struct qent_base *const be = &be_src->base; -+ -+ be->timestamp = *timestamp; -+ return 0; -+} -+ -+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst) -+{ -+ return be_dst->base.timestamp; -+} -+ -+static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc) -+{ -+ if (!be->dh[0] || len > dmabuf_size(be->dh[0])) { -+ size_t newsize = round_up_size(len); -+ 
request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, dmabuf_size(be->dh[0]), newsize); -+ if (!dbsc) { -+ request_log("%s: No dmbabuf_ctrl for realloc\n", __func__); -+ return -ENOMEM; -+ } -+ if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) { -+ request_log("%s: Realloc %zd failed\n", __func__, newsize); -+ return -ENOMEM; -+ } -+ } -+ return 0; -+} -+ -+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc) -+{ -+ struct qent_base *const be = &be_src->base; -+ return qent_base_realloc(be, len, dbsc); -+} -+ -+ -+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc) -+{ -+ void * dst; -+ struct qent_base *const be = &be_src->base; -+ int rv; -+ -+ // Realloc doesn't copy so don't alloc if offset != 0 -+ if ((rv = qent_base_realloc(be, offset + len, -+ be_src->fixed_size || offset ? NULL : dbsc)) != 0) -+ return rv; -+ -+ dmabuf_write_start(be->dh[0]); -+ dst = dmabuf_map(be->dh[0]); -+ if (!dst) -+ return -1; -+ memcpy((char*)dst + offset, src, len); -+ dmabuf_len_set(be->dh[0], len); -+ dmabuf_write_end(be->dh[0]); -+ return 0; -+} -+ -+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane) -+{ -+ const struct qent_base *const be = &be_dst->base; -+ -+ return (plane >= sizeof(be->dh)/sizeof(be->dh[0])) ? 
NULL : be->dh[plane]; -+} -+ -+int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane) -+{ -+ return dup(dmabuf_fd(qent_dst_dmabuf(be_dst, plane))); -+} -+ -+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc, -+ struct media_request **const pmreq, -+ struct qent_src **const psrc_be, -+ struct qent_dst *const dst_be, -+ const bool is_final) -+{ -+ struct media_request * mreq = *pmreq; -+ struct qent_src *const src_be = *psrc_be; -+ -+ // Req & src are always both "consumed" -+ *pmreq = NULL; -+ *psrc_be = NULL; -+ -+ pthread_mutex_lock(&mbc->lock); -+ -+ if (!src_be) -+ goto fail1; -+ -+ if (dst_be) { -+ if (qe_dst_waiting(dst_be)) { -+ request_info(mbc->dc, "Request buffer already waiting on start\n"); -+ goto fail1; -+ } -+ dst_be->base.timestamp = (struct timeval){0,0}; -+ if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false)) -+ goto fail1; -+ -+ qent_dst_ref(dst_be); -+ queue_put_inuse(mbc->dst, &dst_be->base); -+ } -+ -+ if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final)) -+ goto fail1; -+ queue_put_inuse(mbc->src, &src_be->base); -+ -+ if (!mbc->polling && mediabufs_wants_poll(mbc)) { -+ mbc->polling = true; -+ pollqueue_add_task(mbc->pt, 2000); -+ } -+ pthread_mutex_unlock(&mbc->lock); -+ -+ if (media_request_start(mreq)) -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ -+ return MEDIABUFS_STATUS_SUCCESS; -+ -+fail1: -+ media_request_abort(&mreq); -+ if (src_be) -+ queue_put_free(mbc->src, &src_be->base); -+ -+// *** TODO: If src Q fails this doesnt unwind properly - separate dst Q from src Q -+ if (dst_be) { -+ dst_be->base.status = QENT_ERROR; -+ qe_dst_done(dst_be); -+ } -+ pthread_mutex_unlock(&mbc->lock); -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+} -+ -+ -+static int qe_alloc_from_fmt(struct qent_base *const be, -+ struct dmabufs_ctl *const dbsc, -+ const struct v4l2_format *const fmt) -+{ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ unsigned int i; -+ 
for (i = 0; i != fmt->fmt.pix_mp.num_planes; ++i) { -+ be->dh[i] = dmabuf_realloc(dbsc, be->dh[i], -+ fmt->fmt.pix_mp.plane_fmt[i].sizeimage); -+ /* On failure tidy up and die */ -+ if (!be->dh[i]) { -+ while (i--) { -+ dmabuf_free(be->dh[i]); -+ be->dh[i] = NULL; -+ } -+ return -1; -+ } -+ } -+ } -+ else { -+// be->dh[0] = dmabuf_alloc(dbsc, fmt->fmt.pix.sizeimage); -+ size_t size = fmt->fmt.pix.sizeimage; -+ be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size); -+ if (!be->dh[0]) -+ return -1; -+ } -+ return 0; -+} -+ -+static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd, -+ const enum v4l2_buf_type buftype, -+ uint32_t pixfmt, -+ const unsigned int width, const unsigned int height, -+ const size_t bufsize) -+{ -+ *fmt = (struct v4l2_format){.type = buftype}; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) { -+ fmt->fmt.pix_mp.width = width; -+ fmt->fmt.pix_mp.height = height; -+ fmt->fmt.pix_mp.pixelformat = pixfmt; -+ if (bufsize) { -+ fmt->fmt.pix_mp.num_planes = 1; -+ fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize; -+ } -+ } -+ else { -+ fmt->fmt.pix.width = width; -+ fmt->fmt.pix.height = height; -+ fmt->fmt.pix.pixelformat = pixfmt; -+ fmt->fmt.pix.sizeimage = bufsize; -+ } -+ -+ while (ioctl(fd, VIDIOC_S_FMT, fmt)) -+ if (errno != EINTR) -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ -+ // Treat anything where we don't get at least what we asked for as a fail -+ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) { -+ if (fmt->fmt.pix_mp.width < width || -+ fmt->fmt.pix_mp.height < height || -+ fmt->fmt.pix_mp.pixelformat != pixfmt) { -+ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; -+ } -+ } -+ else { -+ if (fmt->fmt.pix.width < width || -+ fmt->fmt.pix.height < height || -+ fmt->fmt.pix.pixelformat != pixfmt) { -+ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; -+ } -+ } -+ -+ return MEDIABUFS_STATUS_SUCCESS; -+} -+ -+static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt, -+ const int fd, -+ const unsigned int type_v4l2, -+ const 
uint32_t flags_must, -+ const uint32_t flags_not, -+ const unsigned int width, -+ const unsigned int height, -+ mediabufs_dst_fmt_accept_fn *const accept_fn, -+ void *const accept_v) -+{ -+ unsigned int i; -+ -+ for (i = 0;; ++i) { -+ struct v4l2_fmtdesc fmtdesc = { -+ .index = i, -+ .type = type_v4l2 -+ }; -+ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) { -+ if (errno != EINTR) -+ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; -+ } -+ if ((fmtdesc.flags & flags_must) != flags_must || -+ (fmtdesc.flags & flags_not)) -+ continue; -+ if (!accept_fn(accept_v, &fmtdesc)) -+ continue; -+ -+ if (fmt_set(fmt, fd, fmtdesc.type, fmtdesc.pixelformat, -+ width, height, 0) == MEDIABUFS_STATUS_SUCCESS) -+ return MEDIABUFS_STATUS_SUCCESS; -+ } -+ return 0; -+} -+ -+ -+/* Wait for qent done */ -+ -+MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst) -+{ -+ struct qent_base *const be = &be_dst->base; -+ enum qent_status estat; -+ -+ pthread_mutex_lock(&be_dst->lock); -+ while (be_dst->waiting && -+ !pthread_cond_wait(&be_dst->cond, &be_dst->lock)) -+ /* Loop */; -+ estat = be->status; -+ pthread_mutex_unlock(&be_dst->lock); -+ -+ return estat == QENT_DONE ? MEDIABUFS_STATUS_SUCCESS : -+ estat == QENT_ERROR ? 
MEDIABUFS_ERROR_DECODING_ERROR : -+ MEDIABUFS_ERROR_OPERATION_FAILED; -+} -+ -+const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no) -+{ -+ struct qent_base *const be = &be_dst->base; -+ return dmabuf_map(be->dh[buf_no]); -+} -+ -+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst) -+{ -+ struct qent_base *const be = &be_dst->base; -+ unsigned int i; -+ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) { -+ if (dmabuf_read_start(be->dh[i])) { -+ while (i--) -+ dmabuf_read_end(be->dh[i]); -+ return MEDIABUFS_ERROR_ALLOCATION_FAILED; -+ } -+ } -+ return MEDIABUFS_STATUS_SUCCESS; -+} -+ -+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst) -+{ -+ struct qent_base *const be = &be_dst->base; -+ unsigned int i; -+ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS; -+ -+ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) { -+ if (dmabuf_read_end(be->dh[i])) -+ status = MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ return status; -+} -+ -+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst) -+{ -+ if (be_dst) -+ atomic_fetch_add(&be_dst->base.ref_count, 1); -+ return be_dst; -+} -+ -+void qent_dst_unref(struct qent_dst ** const pbe_dst) -+{ -+ struct qent_dst * const be_dst = *pbe_dst; -+ struct mediabufs_ctl * mbc; -+ if (!be_dst) -+ return; -+ *pbe_dst = NULL; -+ -+ if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0) -+ return; -+ -+ if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) { -+ queue_put_free(mbc->dst, &be_dst->base); -+ ff_weak_link_unlock(be_dst->mbc_wl); -+ } -+ else { -+ qe_dst_free(be_dst); -+ } -+} -+ -+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst, -+ unsigned int plane, -+ int fd, size_t size) -+{ -+ struct qent_base *const be = &be_dst->base; -+ struct dmabuf_h * dh; -+ -+ if (be->status != QENT_IMPORT || be->dh[plane]) -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ -+ dh = dmabuf_import(fd, size); -+ if (!dh) -+ return MEDIABUFS_ERROR_ALLOCATION_FAILED; -+ 
-+ be->dh[plane] = dh; -+ return MEDIABUFS_STATUS_SUCCESS; -+} -+ -+// Returns noof buffers created, -ve for error -+static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[]) -+{ -+ unsigned int i; -+ -+ struct v4l2_create_buffers cbuf = { -+ .count = n, -+ .memory = mediabufs_memory_to_v4l2(mbc->dst->memtype), -+ .format = mbc->dst_fmt, -+ }; -+ -+ while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) { -+ const int err = -errno; -+ if (err != EINTR) { -+ request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__); -+ return -err; -+ } -+ } -+ -+ if (cbuf.count != n) -+ request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n); -+ -+ for (i = 0; i != cbuf.count; ++i) -+ qes[i]->base.index = cbuf.index + i; -+ -+ return cbuf.count; -+} -+ -+static MediaBufsStatus -+qe_import_from_buf(struct mediabufs_ctl *const mbc, struct qent_base * const be, const struct v4l2_format *const fmt, -+ const unsigned int n, const bool x_dmabuf) -+{ -+ struct v4l2_buffer buf = { -+ .index = n, -+ .type = fmt->type, -+ }; -+ struct v4l2_plane planes[VIDEO_MAX_PLANES]; -+ int ret; -+ -+ if (be->dh[0]) -+ return 0; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ memset(planes, 0, sizeof(planes)); -+ buf.m.planes = planes; -+ buf.length = VIDEO_MAX_PLANES; -+ } -+ -+ if ((ret = ioctl(mbc->vfd, VIDIOC_QUERYBUF, &buf)) != 0) { -+ request_err(mbc->dc, "VIDIOC_QUERYBUF failed"); -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) -+ { -+ unsigned int i; -+ for (i = 0; i != buf.length; ++i) { -+ if (x_dmabuf) { -+ struct v4l2_exportbuffer xbuf = { -+ .type = buf.type, -+ .index = buf.index, -+ .plane = i, -+ .flags = O_RDWR, // *** Arguably O_RDONLY would be fine -+ }; -+ if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0) -+ be->dh[i] = dmabuf_import(xbuf.fd, planes[i].length); -+ } -+ else { -+ be->dh[i] = dmabuf_import_mmap( -+ mmap(NULL, planes[i].length, 
-+ PROT_READ | PROT_WRITE, -+ MAP_SHARED | MAP_POPULATE, -+ mbc->vfd, planes[i].m.mem_offset), -+ planes[i].length); -+ } -+ /* On failure tidy up and die */ -+ if (!be->dh[i]) { -+ while (i--) { -+ dmabuf_free(be->dh[i]); -+ be->dh[i] = NULL; -+ } -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ } -+ } -+ else -+ { -+ if (x_dmabuf) { -+ struct v4l2_exportbuffer xbuf = { -+ .type = buf.type, -+ .index = buf.index, -+ .flags = O_RDWR, // *** Arguably O_RDONLY would be fine -+ }; -+ if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0) -+ be->dh[0] = dmabuf_import(xbuf.fd, buf.length); -+ } -+ else { -+ be->dh[0] = dmabuf_import_mmap( -+ mmap(NULL, buf.length, -+ PROT_READ | PROT_WRITE, -+ MAP_SHARED | MAP_POPULATE, -+ mbc->vfd, buf.m.offset), -+ buf.length); -+ } -+ /* On failure tidy up and die */ -+ if (!be->dh[0]) { -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ } -+ -+ return 0; -+} -+ -+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc) -+{ -+ struct qent_dst * be_dst; -+ -+ if (mbc == NULL) { -+ be_dst = qe_dst_new(NULL, MEDIABUFS_MEMORY_DMABUF); -+ if (be_dst) -+ be_dst->base.status = QENT_IMPORT; -+ return be_dst; -+ } -+ -+ if (mbc->dst_fixed) { -+ be_dst = base_to_dst(queue_get_free(mbc->dst)); -+ if (!be_dst) -+ return NULL; -+ } -+ else { -+ be_dst = base_to_dst(queue_tryget_free(mbc->dst)); -+ if (!be_dst) { -+ be_dst = qe_dst_new(mbc->this_wlm, mbc->dst->memtype); -+ if (!be_dst) -+ return NULL; -+ -+ if (create_dst_bufs(mbc, 1, &be_dst) != 1) { -+ qe_dst_free(be_dst); -+ return NULL; -+ } -+ } -+ } -+ -+ if (mbc->dst->memtype == MEDIABUFS_MEMORY_MMAP) { -+ if (qe_import_from_buf(mbc, &be_dst->base, &mbc->dst_fmt, be_dst->base.index, true)) { -+ request_err(mbc->dc, "Failed to export as dmabuf\n"); -+ queue_put_free(mbc->dst, &be_dst->base); -+ return NULL; -+ } -+ } -+ else { -+ if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) { -+ /* Given how create buf works we can't uncreate it on 
alloc failure -+ * all we can do is put it on the free Q -+ */ -+ queue_put_free(mbc->dst, &be_dst->base); -+ return NULL; -+ } -+ } -+ -+ be_dst->base.status = QENT_PENDING; -+ atomic_store(&be_dst->base.ref_count, 0); -+ return be_dst; -+} -+ -+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc) -+{ -+ return &mbc->dst_fmt; -+} -+ -+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc, -+ const unsigned int width, -+ const unsigned int height, -+ mediabufs_dst_fmt_accept_fn *const accept_fn, -+ void *const accept_v) -+{ -+ MediaBufsStatus status; -+ unsigned int i; -+ const enum v4l2_buf_type buf_type = mbc->dst_fmt.type; -+ static const struct { -+ unsigned int flags_must; -+ unsigned int flags_not; -+ } trys[] = { -+ {0, V4L2_FMT_FLAG_EMULATED}, -+ {V4L2_FMT_FLAG_EMULATED, 0}, -+ }; -+ for (i = 0; i != sizeof(trys)/sizeof(trys[0]); ++i) { -+ status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd, -+ buf_type, -+ trys[i].flags_must, -+ trys[i].flags_not, -+ width, height, accept_fn, accept_v); -+ if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE) -+ return status; -+ } -+ -+ if (status != MEDIABUFS_STATUS_SUCCESS) -+ return status; -+ -+ /* Try to create a buffer - don't alloc */ -+ return status; -+} -+ -+// ** This is a mess if we get partial alloc but without any way to remove -+// individual V4L2 Q members we are somewhat stuffed -+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype) -+{ -+ unsigned int i; -+ int a = 0; -+ unsigned int qc; -+ struct qent_dst * qes[32]; -+ -+ if (n > 32) -+ return MEDIABUFS_ERROR_ALLOCATION_FAILED; -+ -+ mbc->dst->memtype = memtype; -+ -+ // Create qents first as it is hard to get rid of the V4L2 buffers on error -+ for (qc = 0; qc != n; ++qc) -+ { -+ if ((qes[qc] = qe_dst_new(mbc->this_wlm, mbc->dst->memtype)) == NULL) -+ goto fail; -+ } -+ -+ if ((a = create_dst_bufs(mbc, n, qes)) < 0) -+ 
goto fail; -+ -+ for (i = 0; i != a; ++i) -+ queue_put_free(mbc->dst, &qes[i]->base); -+ -+ if (a != n) -+ goto fail; -+ -+ mbc->dst_fixed = fixed; -+ return MEDIABUFS_STATUS_SUCCESS; -+ -+fail: -+ for (i = (a < 0 ? 0 : a); i != qc; ++i) -+ qe_dst_free(qes[i]); -+ -+ return MEDIABUFS_ERROR_ALLOCATION_FAILED; -+} -+ -+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc) -+{ -+ struct qent_base * buf = queue_get_free(mbc->src); -+ buf->status = QENT_PENDING; -+ return base_to_src(buf); -+} -+ -+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src) -+{ -+ struct qent_src *const qe_src = *pqe_src; -+ if (!qe_src) -+ return; -+ *pqe_src = NULL; -+ queue_put_free(mbc->src, &qe_src->base); -+} -+ -+static MediaBufsStatus -+chk_memory_type(struct mediabufs_ctl *const mbc, -+ const struct v4l2_format * const f, -+ const enum mediabufs_memory m) -+{ -+ struct v4l2_create_buffers cbuf = { -+ .count = 0, -+ .memory = V4L2_MEMORY_MMAP, -+ .format = *f -+ }; -+ -+ if (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf) != 0) -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ -+ switch (m) { -+ case MEDIABUFS_MEMORY_DMABUF: -+ // 0 = Unknown but assume not in that case -+ if ((cbuf.capabilities & V4L2_BUF_CAP_SUPPORTS_DMABUF) == 0) -+ return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY; -+ break; -+ case MEDIABUFS_MEMORY_MMAP: -+ break; -+ default: -+ return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY; -+ } -+ -+ return MEDIABUFS_STATUS_SUCCESS; -+} -+ -+MediaBufsStatus -+mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype) -+{ -+ return chk_memory_type(mbc, &mbc->src_fmt, memtype); -+} -+ -+MediaBufsStatus -+mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype) -+{ -+ return chk_memory_type(mbc, &mbc->dst_fmt, memtype); -+} -+ -+/* src format must have been set up before this */ -+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc, -+ struct 
dmabufs_ctl * const dbsc, -+ unsigned int n, const enum mediabufs_memory memtype) -+{ -+ unsigned int i; -+ struct v4l2_requestbuffers req = { -+ .count = n, -+ .type = mbc->src_fmt.type, -+ .memory = mediabufs_memory_to_v4l2(memtype) -+ }; -+ -+ bq_free_all_free_src(mbc->src); -+ -+ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) { -+ if (errno != EINTR) { -+ request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__); -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ } -+ -+ if (n > req.count) { -+ request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n); -+ n = req.count; -+ } -+ -+ for (i = 0; i != n; ++i) { -+ struct qent_src *const be_src = qe_src_new(memtype); -+ if (!be_src) { -+ request_err(mbc->dc, "Failed to create src be %d\n", i); -+ goto fail; -+ } -+ switch (memtype) { -+ case MEDIABUFS_MEMORY_MMAP: -+ if (qe_import_from_buf(mbc, &be_src->base, &mbc->src_fmt, i, false)) { -+ qe_src_free(be_src); -+ goto fail; -+ } -+ be_src->fixed_size = 1; -+ break; -+ case MEDIABUFS_MEMORY_DMABUF: -+ if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) { -+ qe_src_free(be_src); -+ goto fail; -+ } -+ be_src->fixed_size = !mediabufs_src_resizable(mbc); -+ break; -+ default: -+ request_err(mbc->dc, "Unexpected memorty type\n"); -+ goto fail; -+ } -+ be_src->base.index = i; -+ -+ queue_put_free(mbc->src, &be_src->base); -+ } -+ -+ mbc->src->memtype = memtype; -+ return MEDIABUFS_STATUS_SUCCESS; -+ -+fail: -+ bq_free_all_free_src(mbc->src); -+ req.count = 0; -+ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 && -+ errno == EINTR) -+ /* Loop */; -+ -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+} -+ -+ -+ -+/* -+ * Set stuff order: -+ * Set src fmt -+ * Set parameters (sps) on vfd -+ * Negotiate dst format (dst_fmt_set) -+ * Create src buffers -+ * Alloc a dst buffer or Create dst slots -+*/ -+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc) -+{ -+ if (mbc->stream_on) -+ return 
MEDIABUFS_STATUS_SUCCESS; -+ -+ if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) { -+ request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type); -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ -+ if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) { -+ request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type); -+ set_stream(mbc->vfd, mbc->src_fmt.type, false); -+ return MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ -+ mbc->stream_on = true; -+ return MEDIABUFS_STATUS_SUCCESS; -+} -+ -+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc) -+{ -+ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS; -+ -+ if (!mbc->stream_on) -+ return MEDIABUFS_STATUS_SUCCESS; -+ -+ if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) { -+ request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type); -+ status = MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ -+ if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) { -+ request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type); -+ status = MEDIABUFS_ERROR_OPERATION_FAILED; -+ } -+ -+ mbc->stream_on = false; -+ return status; -+} -+ -+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n) -+{ -+ struct v4l2_ext_controls controls = { -+ .controls = control_array, -+ .count = n -+ }; -+ -+ if (mreq) { -+ controls.which = V4L2_CTRL_WHICH_REQUEST_VAL; -+ controls.request_fd = media_request_fd(mreq); -+ } -+ -+ while (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &controls)) -+ { -+ const int err = errno; -+ if (err != EINTR) { -+ request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err)); -+ return -err; -+ } -+ } -+ -+ return 0; -+} -+ -+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc, -+ struct media_request * const mreq, -+ unsigned int id, void *data, -+ unsigned int size) -+{ -+ struct v4l2_ext_control control = { -+ .id = id, -+ .ptr = 
data, -+ .size = size -+ }; -+ -+ int rv = mediabufs_ctl_set_ext_ctrls(mbc, mreq, &control, 1); -+ return !rv ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_OPERATION_FAILED; -+} -+ -+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, -+ enum v4l2_buf_type buf_type, -+ const uint32_t pixfmt, -+ const uint32_t width, const uint32_t height, -+ const size_t bufsize) -+{ -+ MediaBufsStatus rv = fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize); -+ if (rv != MEDIABUFS_STATUS_SUCCESS) -+ request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height); -+ -+ return rv; -+} -+ -+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n) -+{ -+ int rv = 0; -+ while (n--) { -+ while (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, ctrls)) { -+ const int err = errno; -+ if (err != EINTR) { -+ // Often used for probing - errors are to be expected -+ request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", ctrls->id, err); -+ ctrls->type = 0; // 0 is invalid -+ rv = -err; -+ break; -+ } -+ } -+ ++ctrls; -+ } -+ return rv; -+} -+ -+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc) -+{ -+#if 1 -+ return 0; -+#else -+ // Single planar OUTPUT can only take exact size buffers -+ // Multiplanar will take larger than negotiated -+ return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type); -+#endif -+} -+ -+static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc) -+{ -+ if (!mbc) -+ return; -+ -+ // Break the weak link first -+ ff_weak_link_break(&mbc->this_wlm); -+ -+ polltask_delete(&mbc->pt); -+ -+ mediabufs_stream_off(mbc); -+ -+ // Empty v4l2 buffer stash -+ request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0); -+ request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0); -+ -+ bq_free_all_free_src(mbc->src); -+ bq_free_all_inuse_src(mbc->src); -+ bq_free_all_free_dst(mbc->dst); -+ -+ { -+ struct qent_dst 
*dst_be; -+ while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) { -+ dst_be->base.timestamp = (struct timeval){0}; -+ dst_be->base.status = QENT_ERROR; -+ qe_dst_done(dst_be); -+ } -+ } -+ -+ queue_delete(mbc->dst); -+ queue_delete(mbc->src); -+ close(mbc->vfd); -+ pthread_mutex_destroy(&mbc->lock); -+ -+ free(mbc); -+} -+ -+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc) -+{ -+ atomic_fetch_add(&mbc->ref_count, 1); -+ return mbc; -+} -+ -+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc) -+{ -+ struct mediabufs_ctl *const mbc = *pmbc; -+ int n; -+ -+ if (!mbc) -+ return; -+ *pmbc = NULL; -+ n = atomic_fetch_sub(&mbc->ref_count, 1); -+ if (n) -+ return; -+ mediabufs_ctl_delete(mbc); -+} -+ -+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc) -+{ -+ return mbc->capability.version; -+} -+ -+static int set_capabilities(struct mediabufs_ctl *const mbc) -+{ -+ uint32_t caps; -+ -+ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) { -+ int err = errno; -+ request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err)); -+ return -err; -+ } -+ -+ caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? 
-+ mbc->capability.device_caps : -+ mbc->capability.capabilities; -+ -+ if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) { -+ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; -+ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; -+ } -+ else if ((caps & V4L2_CAP_VIDEO_M2M) != 0) { -+ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -+ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ } -+ else { -+ request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+/* One of these per context */ -+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq) -+{ -+ struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc)); -+ -+ if (!mbc) -+ return NULL; -+ -+ mbc->dc = dc; -+ // Default mono planar -+ mbc->pq = pq; -+ pthread_mutex_init(&mbc->lock, NULL); -+ -+ /* Pick a default - could we scan for this? */ -+ if (vpath == NULL) -+ vpath = "/dev/media0"; -+ -+ while ((mbc->vfd = open(vpath, O_RDWR)) == -1) -+ { -+ const int err = errno; -+ if (err != EINTR) { -+ request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err)); -+ goto fail0; -+ } -+ } -+ -+ if (set_capabilities(mbc)) { -+ request_err(dc, "Bad capabilities for video dev '%s'\n", vpath); -+ goto fail1; -+ } -+ -+ mbc->src = queue_new(mbc->vfd); -+ if (!mbc->src) -+ goto fail1; -+ mbc->dst = queue_new(mbc->vfd); -+ if (!mbc->dst) -+ goto fail2; -+ mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc); -+ if (!mbc->pt) -+ goto fail3; -+ mbc->this_wlm = ff_weak_link_new(mbc); -+ if (!mbc->this_wlm) -+ goto fail4; -+ -+ /* Cannot add polltask now - polling with nothing pending -+ * generates infinite error polls -+ */ -+ return mbc; -+ -+fail4: -+ polltask_delete(&mbc->pt); -+fail3: -+ queue_delete(mbc->dst); -+fail2: -+ queue_delete(mbc->src); -+fail1: -+ close(mbc->vfd); -+fail0: -+ free(mbc); -+ request_info(dc, "%s: FAILED\n", __func__); -+ return NULL; -+} -+ -+ -+ -diff 
--git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h -new file mode 100644 -index 0000000000..890947b2e2 ---- /dev/null -+++ b/libavcodec/v4l2_req_media.h -@@ -0,0 +1,171 @@ -+/* -+e.h -+* -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the -+ * "Software"), to deal in the Software without restriction, including -+ * without limitation the rights to use, copy, modify, merge, publish, -+ * distribute, sub license, and/or sell copies of the Software, and to -+ * permit persons to whom the Software is furnished to do so, subject to -+ * the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the -+ * next paragraph) shall be included in all copies or substantial portions -+ * of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. -+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR -+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-+ */ -+ -+#ifndef _MEDIA_H_ -+#define _MEDIA_H_ -+ -+#include -+#include -+ -+struct v4l2_format; -+struct v4l2_fmtdesc; -+struct v4l2_query_ext_ctrl; -+ -+struct pollqueue; -+struct media_request; -+struct media_pool; -+ -+typedef enum media_buf_status { -+ MEDIABUFS_STATUS_SUCCESS = 0, -+ MEDIABUFS_ERROR_OPERATION_FAILED, -+ MEDIABUFS_ERROR_DECODING_ERROR, -+ MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE, -+ MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT, -+ MEDIABUFS_ERROR_ALLOCATION_FAILED, -+ MEDIABUFS_ERROR_UNSUPPORTED_MEMORY, -+} MediaBufsStatus; -+ -+struct media_pool * media_pool_new(const char * const media_path, -+ struct pollqueue * const pq, -+ const unsigned int n); -+void media_pool_delete(struct media_pool ** pmp); -+ -+// Obtain a media request -+// Will block if none availible - has a 2sec timeout -+struct media_request * media_request_get(struct media_pool * const mp); -+int media_request_fd(const struct media_request * const req); -+ -+// Start this request -+// Request structure is returned to pool once done -+int media_request_start(struct media_request * const req); -+ -+// Return an *unstarted* media_request to the pool -+// May later be upgraded to allow for aborting a started req -+int media_request_abort(struct media_request ** const preq); -+ -+ -+struct mediabufs_ctl; -+struct qent_src; -+struct qent_dst; -+struct dmabuf_h; -+struct dmabufs_ctl; -+ -+// 1-1 mammping to V4L2 type - just defined separetely to avoid some include versioning difficulties -+enum mediabufs_memory { -+ MEDIABUFS_MEMORY_UNSET = 0, -+ MEDIABUFS_MEMORY_MMAP = 1, -+ MEDIABUFS_MEMORY_USERPTR = 2, -+ MEDIABUFS_MEMORY_OVERLAY = 3, -+ MEDIABUFS_MEMORY_DMABUF = 4, -+}; -+ -+int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp); -+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst); -+ -+// prealloc -+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc); -+// dbsc may be NULL if realloc not 
required -+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc); -+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane); -+int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane); -+MediaBufsStatus qent_dst_wait(struct qent_dst *const be); -+void qent_dst_delete(struct qent_dst *const be); -+// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead -+void qent_dst_unref(struct qent_dst ** const pbe_dst); -+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst); -+ -+const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no); -+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be); -+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be); -+/* Import an fd unattached to any mediabuf */ -+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst, -+ unsigned int plane, -+ int fd, size_t size); -+ -+const char * mediabufs_memory_name(const enum mediabufs_memory m); -+ -+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc, -+ struct media_request **const pmreq, -+ struct qent_src **const psrc_be, -+ struct qent_dst *const dst_be, -+ const bool is_final); -+// Get / alloc a dst buffer & associate with a slot -+// If the dst pool is empty then behaviour depends on the fixed flag passed to -+// dst_slots_create. 
Default is !fixed = unlimited alloc -+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, -+ struct dmabufs_ctl *const dbsc); -+// Create dst slots without alloc -+// If fixed true then qent_alloc will only get slots from this pool and will -+// block until a qent has been unrefed -+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype); -+ -+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc); -+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc); -+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc); -+ -+typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc); -+ -+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc, -+ const unsigned int width, -+ const unsigned int height, -+ mediabufs_dst_fmt_accept_fn *const accept_fn, -+ void *const accept_v); -+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc); -+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src); -+ -+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, -+ struct v4l2_ext_control control_array[], unsigned int n); -+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc, -+ struct media_request * const mreq, -+ unsigned int id, void *data, -+ unsigned int size); -+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n); -+ -+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc); -+ -+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, -+ enum v4l2_buf_type buf_type, -+ const uint32_t pixfmt, -+ const uint32_t width, const uint32_t height, -+ const size_t bufsize); -+ -+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw, -+ struct dmabufs_ctl * 
const dbsc, -+ unsigned int n, -+ const enum mediabufs_memory memtype); -+ -+// Want to have appropriate formats set first -+MediaBufsStatus mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype); -+MediaBufsStatus mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype); -+ -+#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c)) -+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc); -+ -+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, -+ const char *vpath, struct pollqueue *const pq); -+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc); -+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc); -+ -+ -+#endif -diff --git a/libavcodec/v4l2_req_pollqueue.c b/libavcodec/v4l2_req_pollqueue.c -new file mode 100644 -index 0000000000..cc8a5d4001 ---- /dev/null -+++ b/libavcodec/v4l2_req_pollqueue.c -@@ -0,0 +1,361 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "v4l2_req_pollqueue.h" -+#include "v4l2_req_utils.h" -+ -+ -+struct pollqueue; -+ -+enum polltask_state { -+ POLLTASK_UNQUEUED = 0, -+ POLLTASK_QUEUED, -+ POLLTASK_RUNNING, -+ POLLTASK_Q_KILL, -+ POLLTASK_RUN_KILL, -+}; -+ -+struct polltask { -+ struct polltask *next; -+ struct polltask *prev; -+ struct pollqueue *q; -+ enum polltask_state state; -+ -+ int fd; -+ short events; -+ -+ void (*fn)(void *v, short revents); -+ void * v; -+ -+ uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */ -+ sem_t kill_sem; -+}; -+ -+struct pollqueue { -+ atomic_int ref_count; -+ pthread_mutex_t lock; -+ -+ struct polltask *head; -+ struct polltask *tail; -+ -+ bool kill; -+ bool no_prod; -+ int prod_fd; -+ struct polltask *prod_pt; -+ pthread_t worker; -+}; -+ -+struct polltask *polltask_new(struct pollqueue *const pq, -+ const int fd, const short events, 
-+ void (*const fn)(void *v, short revents), -+ void *const v) -+{ -+ struct polltask *pt; -+ -+ if (!events) -+ return NULL; -+ -+ pt = malloc(sizeof(*pt)); -+ if (!pt) -+ return NULL; -+ -+ *pt = (struct polltask){ -+ .next = NULL, -+ .prev = NULL, -+ .q = pollqueue_ref(pq), -+ .fd = fd, -+ .events = events, -+ .fn = fn, -+ .v = v -+ }; -+ -+ sem_init(&pt->kill_sem, 0, 0); -+ -+ return pt; -+} -+ -+static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt) -+{ -+ if (pt->prev) -+ pt->prev->next = pt->next; -+ else -+ pq->head = pt->next; -+ if (pt->next) -+ pt->next->prev = pt->prev; -+ else -+ pq->tail = pt->prev; -+ pt->next = NULL; -+ pt->prev = NULL; -+} -+ -+static void polltask_free(struct polltask * const pt) -+{ -+ sem_destroy(&pt->kill_sem); -+ free(pt); -+} -+ -+static int pollqueue_prod(const struct pollqueue *const pq) -+{ -+ static const uint64_t one = 1; -+ return write(pq->prod_fd, &one, sizeof(one)); -+} -+ -+void polltask_delete(struct polltask **const ppt) -+{ -+ struct polltask *const pt = *ppt; -+ struct pollqueue * pq; -+ enum polltask_state state; -+ bool prodme; -+ -+ if (!pt) -+ return; -+ -+ pq = pt->q; -+ pthread_mutex_lock(&pq->lock); -+ state = pt->state; -+ pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL; -+ prodme = !pq->no_prod; -+ pthread_mutex_unlock(&pq->lock); -+ -+ if (state != POLLTASK_UNQUEUED) { -+ if (prodme) -+ pollqueue_prod(pq); -+ while (sem_wait(&pt->kill_sem) && errno == EINTR) -+ /* loop */; -+ } -+ -+ // Leave zapping the ref until we have DQed the PT as might well be -+ // legitimately used in it -+ *ppt = NULL; -+ polltask_free(pt); -+ pollqueue_unref(&pq); -+} -+ -+static uint64_t pollqueue_now(int timeout) -+{ -+ struct timespec now; -+ uint64_t now_ms; -+ -+ if (clock_gettime(CLOCK_MONOTONIC, &now)) -+ return 0; -+ now_ms = (now.tv_nsec / 1000000) + (uint64_t)now.tv_sec * 1000 + timeout; -+ return now_ms ? 
now_ms : (uint64_t)1; -+} -+ -+void pollqueue_add_task(struct polltask *const pt, const int timeout) -+{ -+ bool prodme = false; -+ struct pollqueue * const pq = pt->q; -+ -+ pthread_mutex_lock(&pq->lock); -+ if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) { -+ if (pq->tail) -+ pq->tail->next = pt; -+ else -+ pq->head = pt; -+ pt->prev = pq->tail; -+ pt->next = NULL; -+ pt->state = POLLTASK_QUEUED; -+ pt->timeout = timeout < 0 ? 0 : pollqueue_now(timeout); -+ pq->tail = pt; -+ prodme = !pq->no_prod; -+ } -+ pthread_mutex_unlock(&pq->lock); -+ if (prodme) -+ pollqueue_prod(pq); -+} -+ -+static void *poll_thread(void *v) -+{ -+ struct pollqueue *const pq = v; -+ struct pollfd *a = NULL; -+ size_t asize = 0; -+ -+ pthread_mutex_lock(&pq->lock); -+ do { -+ unsigned int i; -+ unsigned int n = 0; -+ struct polltask *pt; -+ struct polltask *pt_next; -+ uint64_t now = pollqueue_now(0); -+ int timeout = -1; -+ int rv; -+ -+ for (pt = pq->head; pt; pt = pt_next) { -+ int64_t t; -+ -+ pt_next = pt->next; -+ -+ if (pt->state == POLLTASK_Q_KILL) { -+ pollqueue_rem_task(pq, pt); -+ sem_post(&pt->kill_sem); -+ continue; -+ } -+ -+ if (n >= asize) { -+ asize = asize ? asize * 2 : 4; -+ a = realloc(a, asize * sizeof(*a)); -+ if (!a) { -+ request_log("Failed to realloc poll array to %zd\n", asize); -+ goto fail_locked; -+ } -+ } -+ -+ a[n++] = (struct pollfd){ -+ .fd = pt->fd, -+ .events = pt->events -+ }; -+ -+ t = (int64_t)(pt->timeout - now); -+ if (pt->timeout && t < INT_MAX && -+ (timeout < 0 || (int)t < timeout)) -+ timeout = (t < 0) ? 
0 : (int)t; -+ } -+ pthread_mutex_unlock(&pq->lock); -+ -+ if ((rv = poll(a, n, timeout)) == -1) { -+ if (errno != EINTR) { -+ request_log("Poll error: %s\n", strerror(errno)); -+ goto fail_unlocked; -+ } -+ } -+ -+ pthread_mutex_lock(&pq->lock); -+ now = pollqueue_now(0); -+ -+ /* Prodding in this loop is pointless and might lead to -+ * infinite looping -+ */ -+ pq->no_prod = true; -+ for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) { -+ pt_next = pt->next; -+ -+ /* Pending? */ -+ if (a[i].revents || -+ (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) { -+ pollqueue_rem_task(pq, pt); -+ if (pt->state == POLLTASK_QUEUED) -+ pt->state = POLLTASK_RUNNING; -+ if (pt->state == POLLTASK_Q_KILL) -+ pt->state = POLLTASK_RUN_KILL; -+ pthread_mutex_unlock(&pq->lock); -+ -+ /* This can add new entries to the Q but as -+ * those are added to the tail our existing -+ * chain remains intact -+ */ -+ pt->fn(pt->v, a[i].revents); -+ -+ pthread_mutex_lock(&pq->lock); -+ if (pt->state == POLLTASK_RUNNING) -+ pt->state = POLLTASK_UNQUEUED; -+ if (pt->state == POLLTASK_RUN_KILL) -+ sem_post(&pt->kill_sem); -+ } -+ } -+ pq->no_prod = false; -+ -+ } while (!pq->kill); -+ -+fail_locked: -+ pthread_mutex_unlock(&pq->lock); -+fail_unlocked: -+ free(a); -+ return NULL; -+} -+ -+static void prod_fn(void *v, short revents) -+{ -+ struct pollqueue *const pq = v; -+ char buf[8]; -+ if (revents) -+ read(pq->prod_fd, buf, 8); -+ if (!pq->kill) -+ pollqueue_add_task(pq->prod_pt, -1); -+} -+ -+struct pollqueue * pollqueue_new(void) -+{ -+ struct pollqueue *pq = malloc(sizeof(*pq)); -+ if (!pq) -+ return NULL; -+ *pq = (struct pollqueue){ -+ .ref_count = ATOMIC_VAR_INIT(0), -+ .lock = PTHREAD_MUTEX_INITIALIZER, -+ .head = NULL, -+ .tail = NULL, -+ .kill = false, -+ .prod_fd = -1 -+ }; -+ -+ pq->prod_fd = eventfd(0, EFD_NONBLOCK); -+ if (pq->prod_fd == 1) -+ goto fail1; -+ pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq); -+ if (!pq->prod_pt) -+ goto fail2; -+ 
pollqueue_add_task(pq->prod_pt, -1); -+ if (pthread_create(&pq->worker, NULL, poll_thread, pq)) -+ goto fail3; -+ // Reset ref count which will have been inced by the add_task -+ atomic_store(&pq->ref_count, 0); -+ return pq; -+ -+fail3: -+ polltask_free(pq->prod_pt); -+fail2: -+ close(pq->prod_fd); -+fail1: -+ free(pq); -+ return NULL; -+} -+ -+static void pollqueue_free(struct pollqueue *const pq) -+{ -+ void *rv; -+ -+ pthread_mutex_lock(&pq->lock); -+ pq->kill = true; -+ pollqueue_prod(pq); -+ pthread_mutex_unlock(&pq->lock); -+ -+ pthread_join(pq->worker, &rv); -+ polltask_free(pq->prod_pt); -+ pthread_mutex_destroy(&pq->lock); -+ close(pq->prod_fd); -+ free(pq); -+} -+ -+struct pollqueue * pollqueue_ref(struct pollqueue *const pq) -+{ -+ atomic_fetch_add(&pq->ref_count, 1); -+ return pq; -+} -+ -+void pollqueue_unref(struct pollqueue **const ppq) -+{ -+ struct pollqueue * const pq = *ppq; -+ -+ if (!pq) -+ return; -+ *ppq = NULL; -+ -+ if (atomic_fetch_sub(&pq->ref_count, 1) != 0) -+ return; -+ -+ pollqueue_free(pq); -+} -+ -+ -+ -diff --git a/libavcodec/v4l2_req_pollqueue.h b/libavcodec/v4l2_req_pollqueue.h -new file mode 100644 -index 0000000000..e1182cb2fc ---- /dev/null -+++ b/libavcodec/v4l2_req_pollqueue.h -@@ -0,0 +1,18 @@ -+#ifndef POLLQUEUE_H_ -+#define POLLQUEUE_H_ -+ -+struct polltask; -+struct pollqueue; -+ -+struct polltask *polltask_new(struct pollqueue *const pq, -+ const int fd, const short events, -+ void (*const fn)(void *v, short revents), -+ void *const v); -+void polltask_delete(struct polltask **const ppt); -+ -+void pollqueue_add_task(struct polltask *const pt, const int timeout); -+struct pollqueue * pollqueue_new(void); -+void pollqueue_unref(struct pollqueue **const ppq); -+struct pollqueue * pollqueue_ref(struct pollqueue *const pq); -+ -+#endif /* POLLQUEUE_H_ */ -diff --git a/libavcodec/v4l2_req_utils.h b/libavcodec/v4l2_req_utils.h -new file mode 100644 -index 0000000000..a31cc1f4ec ---- /dev/null -+++ 
b/libavcodec/v4l2_req_utils.h -@@ -0,0 +1,27 @@ -+#ifndef AVCODEC_V4L2_REQ_UTILS_H -+#define AVCODEC_V4L2_REQ_UTILS_H -+ -+#include -+#include "libavutil/log.h" -+ -+#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__) -+ -+#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__) -+#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__) -+#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__) -+#define request_debug(_ctx, ...) av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__) -+ -+static inline char safechar(char c) { -+ return c > 0x20 && c < 0x7f ? c : '.'; -+} -+ -+static inline const char * strfourcc(char tbuf[5], uint32_t fcc) { -+ tbuf[0] = safechar((fcc >> 0) & 0xff); -+ tbuf[1] = safechar((fcc >> 8) & 0xff); -+ tbuf[2] = safechar((fcc >> 16) & 0xff); -+ tbuf[3] = safechar((fcc >> 24) & 0xff); -+ tbuf[4] = '\0'; -+ return tbuf; -+} -+ -+#endif -diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c -new file mode 100644 -index 0000000000..fbec16a93e ---- /dev/null -+++ b/libavcodec/v4l2_request_hevc.c -@@ -0,0 +1,347 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+ -+#include "config.h" -+#include "decode.h" -+#include "hevcdec.h" -+#include "hwconfig.h" -+ -+#include "v4l2_request_hevc.h" -+ -+#include "libavutil/hwcontext_drm.h" -+#include "libavutil/pixdesc.h" -+ -+#include "v4l2_req_devscan.h" -+#include "v4l2_req_dmabufs.h" -+#include "v4l2_req_pollqueue.h" -+#include "v4l2_req_media.h" -+#include "v4l2_req_utils.h" -+ -+static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8) -+{ -+ const size_t wxh = w * h; -+ size_t bits_alloc; -+ -+ /* Annex A gives a min compression of 2 @ lvl 3.1 -+ * (wxh <= 983040) and min 4 thereafter but avoid -+ * the odity of 983041 having a lower limit than -+ * 983040. -+ * Multiply by 3/2 for 4:2:0 -+ */ -+ bits_alloc = wxh < 983040 ? wxh * 3 / 4 : -+ wxh < 983040 * 2 ? 
983040 * 3 / 4 : -+ wxh * 3 / 8; -+ /* Allow for bit depth */ -+ bits_alloc += (bits_alloc * bits_minus8) / 8; -+ /* Add a few bytes (16k) for overhead */ -+ bits_alloc += 0x4000; -+ return bits_alloc; -+} -+ -+static int v4l2_req_hevc_start_frame(AVCodecContext *avctx, -+ av_unused const uint8_t *buffer, -+ av_unused uint32_t size) -+{ -+ const V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ return ctx->fns->start_frame(avctx, buffer, size); -+} -+ -+static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) -+{ -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ return ctx->fns->decode_slice(avctx, buffer, size); -+} -+ -+static int v4l2_req_hevc_end_frame(AVCodecContext *avctx) -+{ -+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; -+ return ctx->fns->end_frame(avctx); -+} -+ -+static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx) -+{ -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ ctx->fns->abort_frame(avctx); -+} -+ -+static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) -+{ -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ return ctx->fns->frame_params(avctx, hw_frames_ctx); -+} -+ -+static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) -+{ -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ return ctx->fns->alloc_frame(avctx, frame); -+} -+ -+ -+static int v4l2_request_hevc_uninit(AVCodecContext *avctx) -+{ -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ -+ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); -+ -+ decode_q_wait(&ctx->decode_q, NULL); // Wait for all other threads to be out of decode -+ -+ mediabufs_ctl_unref(&ctx->mbufs); -+ media_pool_delete(&ctx->mpool); -+ pollqueue_unref(&ctx->pq); -+ dmabufs_ctl_unref(&ctx->dbufs); -+ 
devscan_delete(&ctx->devscan); -+ -+ decode_q_uninit(&ctx->decode_q); -+ -+// if (avctx->hw_frames_ctx) { -+// AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data; -+// av_buffer_pool_flush(hwfc->pool); -+// } -+ return 0; -+} -+ -+static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc) -+{ -+ AVCodecContext *const avctx = v; -+ const HEVCContext *const h = avctx->priv_data; -+ -+ if (h->ps.sps->bit_depth == 8) { -+ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_COL128 || -+ fmtdesc->pixelformat == V4L2_PIX_FMT_NV12) { -+ return 1; -+ } -+ } -+ else if (h->ps.sps->bit_depth == 10) { -+ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { -+ return 1; -+ } -+ } -+ return 0; -+} -+ -+static int v4l2_request_hevc_init(AVCodecContext *avctx) -+{ -+ const HEVCContext *h = avctx->priv_data; -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ const HEVCSPS * const sps = h->ps.sps; -+ int ret; -+ const struct decdev * decdev; -+ const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 4).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes -+ size_t src_size; -+ enum mediabufs_memory src_memtype; -+ enum mediabufs_memory dst_memtype; -+ -+ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); -+ -+ // Give up immediately if this is something that we have no code to deal with -+ if (h->ps.sps->chroma_format_idc != 1) { -+ av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc); -+ return AVERROR_PATCHWELCOME; -+ } -+ if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) || -+ h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) { -+ av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma); -+ return AVERROR_PATCHWELCOME; -+ } -+ -+ if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) { -+ av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n"); -+ 
return (AVERROR(-ret)); -+ } -+ ret = AVERROR(ENOMEM); // Assume mem fail by default for these -+ -+ if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL) -+ { -+ av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n"); -+ ret = AVERROR(ENODEV); -+ goto fail0; -+ } -+ av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n", -+ decdev_media_path(decdev), decdev_video_path(decdev)); -+ -+ if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) { -+ av_log(avctx, AV_LOG_DEBUG, "Unable to open dmabufs - try mmap buffers\n"); -+ src_memtype = MEDIABUFS_MEMORY_MMAP; -+ dst_memtype = MEDIABUFS_MEMORY_MMAP; -+ } -+ else { -+ av_log(avctx, AV_LOG_DEBUG, "Dmabufs opened - try dmabuf buffers\n"); -+ src_memtype = MEDIABUFS_MEMORY_DMABUF; -+ dst_memtype = MEDIABUFS_MEMORY_DMABUF; -+ } -+ -+ if ((ctx->pq = pollqueue_new()) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n"); -+ goto fail1; -+ } -+ -+ if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n"); -+ goto fail2; -+ } -+ -+ if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n"); -+ goto fail3; -+ } -+ -+ // Ask for an initial bitbuf size of max size / 4 -+ // We will realloc if we need more -+ // Must use sps->h/w as avctx contains cropped size -+retry_src_memtype: -+ src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8); -+ if (src_memtype == MEDIABUFS_MEMORY_DMABUF && mediabufs_src_resizable(ctx->mbufs)) -+ src_size /= 4; -+ // Kludge for conformance tests which break Annex A limits -+ else if (src_size < 0x40000) -+ src_size = 0x40000; -+ -+ if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt, -+ sps->width, sps->height, src_size)) { -+ char tbuf1[5]; -+ av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", strfourcc(tbuf1, 
src_pix_fmt), sps->width, sps->height); -+ goto fail4; -+ } -+ -+ if (mediabufs_src_chk_memtype(ctx->mbufs, src_memtype)) { -+ if (src_memtype == MEDIABUFS_MEMORY_DMABUF) { -+ src_memtype = MEDIABUFS_MEMORY_MMAP; -+ goto retry_src_memtype; -+ } -+ av_log(avctx, AV_LOG_ERROR, "Failed to get src memory type\n"); -+ goto fail4; -+ } -+ -+ if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) { -+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); -+ ctx->fns = &V2(ff_v4l2_req_hevc, 4); -+ } -+#if CONFIG_V4L2_REQ_HEVC_VX -+ else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { -+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); -+ ctx->fns = &V2(ff_v4l2_req_hevc, 3); -+ } -+ else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { -+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n"); -+ ctx->fns = &V2(ff_v4l2_req_hevc, 2); -+ } -+ else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) { -+ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n"); -+ ctx->fns = &V2(ff_v4l2_req_hevc, 1); -+ } -+#endif -+ else { -+ av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n"); -+ ret = AVERROR(EINVAL); -+ goto fail4; -+ } -+ -+ if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) { -+ char tbuf1[5]; -+ av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height); -+ goto fail4; -+ } -+ -+ if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6, src_memtype)) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n"); -+ goto fail4; -+ } -+ -+ { -+ unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering + -+ avctx->thread_count + (avctx->extra_hw_frames > 0 ? 
avctx->extra_hw_frames : 6); -+ av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots, -+ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering, -+ avctx->thread_count, avctx->extra_hw_frames); -+ -+ if (mediabufs_dst_chk_memtype(ctx->mbufs, dst_memtype)) { -+ if (dst_memtype != MEDIABUFS_MEMORY_DMABUF) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to get dst memory type\n"); -+ goto fail4; -+ } -+ av_log(avctx, AV_LOG_DEBUG, "Dst DMABUF not supported - trying mmap\n"); -+ dst_memtype = MEDIABUFS_MEMORY_MMAP; -+ } -+ -+ // extra_hw_frames is -1 if unset -+ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0), dst_memtype)) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n"); -+ goto fail4; -+ } -+ } -+ -+ if (mediabufs_stream_on(ctx->mbufs)) { -+ av_log(avctx, AV_LOG_ERROR, "Failed stream on\n"); -+ goto fail4; -+ } -+ -+ if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n"); -+ goto fail4; -+ } -+ -+ if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed set controls\n"); -+ goto fail5; -+ } -+ -+ decode_q_init(&ctx->decode_q); -+ -+ // Set our s/w format -+ avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format; -+ -+ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s; swfmt=%s\n", -+ ctx->fns->name, -+ decdev_media_path(decdev), decdev_video_path(decdev), -+ mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype), -+ av_get_pix_fmt_name(avctx->sw_pix_fmt)); -+ -+ return 0; -+ -+fail5: -+ av_buffer_unref(&avctx->hw_frames_ctx); -+fail4: -+ mediabufs_ctl_unref(&ctx->mbufs); -+fail3: -+ media_pool_delete(&ctx->mpool); -+fail2: -+ pollqueue_unref(&ctx->pq); -+fail1: -+ dmabufs_ctl_unref(&ctx->dbufs); -+fail0: -+ devscan_delete(&ctx->devscan); -+ return ret; -+} -+ -+const AVHWAccel 
ff_hevc_v4l2request_hwaccel = { -+ .name = "hevc_v4l2request", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_HEVC, -+ .pix_fmt = AV_PIX_FMT_DRM_PRIME, -+ .alloc_frame = v4l2_req_hevc_alloc_frame, -+ .start_frame = v4l2_req_hevc_start_frame, -+ .decode_slice = v4l2_req_hevc_decode_slice, -+ .end_frame = v4l2_req_hevc_end_frame, -+ .abort_frame = v4l2_req_hevc_abort_frame, -+ .init = v4l2_request_hevc_init, -+ .uninit = v4l2_request_hevc_uninit, -+ .priv_data_size = sizeof(V4L2RequestContextHEVC), -+ .frame_params = v4l2_req_hevc_frame_params, -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, -+}; -diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h -new file mode 100644 -index 0000000000..99c90064ea ---- /dev/null -+++ b/libavcodec/v4l2_request_hevc.h -@@ -0,0 +1,102 @@ -+#ifndef AVCODEC_V4L2_REQUEST_HEVC_H -+#define AVCODEC_V4L2_REQUEST_HEVC_H -+ -+#include -+#include -+#include "v4l2_req_decode_q.h" -+ -+#ifndef DRM_FORMAT_NV15 -+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') -+#endif -+ -+#ifndef DRM_FORMAT_NV20 -+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') -+#endif -+ -+// P030 should be defined in drm_fourcc.h and hopefully will be sometime -+// in the future but until then... -+#ifndef DRM_FORMAT_P030 -+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') -+#endif -+ -+#ifndef DRM_FORMAT_NV15 -+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') -+#endif -+ -+#ifndef DRM_FORMAT_NV20 -+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') -+#endif -+ -+#include -+#ifndef V4L2_CID_CODEC_BASE -+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE -+#endif -+ -+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined -+// in drm_fourcc.h hopefully will be sometime in the future but until then... 
-+#ifndef V4L2_PIX_FMT_NV12_10_COL128 -+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') -+#endif -+ -+#ifndef V4L2_PIX_FMT_NV12_COL128 -+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ -+#endif -+ -+#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY -+#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800 -+#endif -+ -+#define VCAT(name, version) name##_v##version -+#define V2(n,v) VCAT(n, v) -+#define V(n) V2(n, HEVC_CTRLS_VERSION) -+ -+#define S2(x) #x -+#define STR(x) S2(x) -+ -+// 1 per decoder -+struct v4l2_req_decode_fns; -+ -+typedef struct V4L2RequestContextHEVC { -+// V4L2RequestContext base; -+ const struct v4l2_req_decode_fns * fns; -+ -+ unsigned int timestamp; // ?? maybe uint64_t -+ -+ int decode_mode; -+ int start_code; -+ unsigned int max_slices; // 0 => not wanted (frame mode) -+ unsigned int max_offsets; // 0 => not wanted -+ -+ req_decode_q decode_q; -+ -+ struct devscan *devscan; -+ struct dmabufs_ctl *dbufs; -+ struct pollqueue *pq; -+ struct media_pool * mpool; -+ struct mediabufs_ctl *mbufs; -+} V4L2RequestContextHEVC; -+ -+typedef struct v4l2_req_decode_fns { -+ int src_pix_fmt_v4l2; -+ const char * name; -+ -+ // Init setup -+ int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); -+ int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); -+ -+ // Passthrough of hwaccel fns -+ int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); -+ int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); -+ int (*end_frame)(AVCodecContext *avctx); -+ void (*abort_frame)(AVCodecContext *avctx); -+ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); -+ int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame); -+} v4l2_req_decode_fns; -+ -+ -+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); -+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); -+extern const 
v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3); -+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4); -+ -+#endif -diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c -index ea93e11588..a9e0c6323e 100644 ---- a/libavcodec/vc1dec.c -+++ b/libavcodec/vc1dec.c -@@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx) - size = next - start - 4; - if (size <= 0) - continue; -- buf2_size = vc1_unescape_buffer(start + 4, size, buf2); -+ buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); - init_get_bits(&gb, buf2, buf2_size * 8); - switch (AV_RB32(start)) { - case VC1_CODE_SEQHDR: -@@ -678,7 +678,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, - case VC1_CODE_FRAME: - if (avctx->hwaccel) - buf_start = start; -- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); -+ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); - break; - case VC1_CODE_FIELD: { - int buf_size3; -@@ -695,8 +695,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, - ret = AVERROR(ENOMEM); - goto err; - } -- buf_size3 = vc1_unescape_buffer(start + 4, size, -- slices[n_slices].buf); -+ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, -+ slices[n_slices].buf); - init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, - buf_size3 << 3); - slices[n_slices].mby_start = avctx->coded_height + 31 >> 5; -@@ -707,7 +707,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, - break; - } - case VC1_CODE_ENTRYPOINT: /* it should be before frame data */ -- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); -+ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); - init_get_bits(&s->gb, buf2, buf_size2 * 8); - ff_vc1_decode_entry_point(avctx, v, &s->gb); - break; -@@ -724,8 +724,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, - ret = AVERROR(ENOMEM); - goto err; - } -- buf_size3 = vc1_unescape_buffer(start + 4, size, -- slices[n_slices].buf); -+ buf_size3 = 
v->vc1dsp.vc1_unescape_buffer(start + 4, size, -+ slices[n_slices].buf); - init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, - buf_size3 << 3); - slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9); -@@ -759,7 +759,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, - ret = AVERROR(ENOMEM); - goto err; - } -- buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); -+ buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); - init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, - buf_size3 << 3); - slices[n_slices].mby_start = s->mb_height + 1 >> 1; -@@ -768,9 +768,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, - n_slices1 = n_slices - 1; - n_slices++; - } -- buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2); -+ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2); - } else { -- buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2); -+ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2); - } - init_get_bits(&s->gb, buf2, buf_size2*8); - } else -diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c -index c25a6f3adf..10182786b3 100644 ---- a/libavcodec/vc1dsp.c -+++ b/libavcodec/vc1dsp.c -@@ -32,6 +32,7 @@ - #include "rnd_avg.h" - #include "vc1dsp.h" - #include "startcode.h" -+#include "vc1_common.h" - - /* Apply overlap transform to horizontal edge */ - static void vc1_v_overlap_c(uint8_t *src, int stride) -@@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp) - #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */ - - dsp->startcode_find_candidate = ff_startcode_find_candidate_c; -+ dsp->vc1_unescape_buffer = vc1_unescape_buffer; - - if (ARCH_AARCH64) - ff_vc1dsp_init_aarch64(dsp); -diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h -index 75db62b1b4..e192b431be 100644 ---- a/libavcodec/vc1dsp.h -+++ b/libavcodec/vc1dsp.h -@@ -80,6 +80,9 @@ typedef 
struct VC1DSPContext { - * one or more further zero bytes and a one byte. - */ - int (*startcode_find_candidate)(const uint8_t *buf, int size); -+ -+ /* Copy a buffer, removing startcode emulation escape bytes as we go */ -+ int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst); - } VC1DSPContext; - - void ff_vc1dsp_init(VC1DSPContext* c); -diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c -new file mode 100644 -index 0000000000..f234a985b9 ---- /dev/null -+++ b/libavcodec/weak_link.c -@@ -0,0 +1,102 @@ -+#include -+#include -+#include -+#include "weak_link.h" -+ -+struct ff_weak_link_master { -+ atomic_int ref_count; /* 0 is single ref for easier atomics */ -+ pthread_rwlock_t lock; -+ void * ptr; -+}; -+ -+static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c) -+{ -+ return (struct ff_weak_link_master *)c; -+} -+ -+struct ff_weak_link_master * ff_weak_link_new(void * p) -+{ -+ struct ff_weak_link_master * w = malloc(sizeof(*w)); -+ if (!w) -+ return NULL; -+ w->ptr = p; -+ if (pthread_rwlock_init(&w->lock, NULL)) { -+ free(w); -+ return NULL; -+ } -+ return w; -+} -+ -+static void weak_link_do_unref(struct ff_weak_link_master * const w) -+{ -+ int n = atomic_fetch_sub(&w->ref_count, 1); -+ if (n) -+ return; -+ -+ pthread_rwlock_destroy(&w->lock); -+ free(w); -+} -+ -+// Unref & break link -+void ff_weak_link_break(struct ff_weak_link_master ** ppLink) -+{ -+ struct ff_weak_link_master * const w = *ppLink; -+ if (!w) -+ return; -+ -+ *ppLink = NULL; -+ pthread_rwlock_wrlock(&w->lock); -+ w->ptr = NULL; -+ pthread_rwlock_unlock(&w->lock); -+ -+ weak_link_do_unref(w); -+} -+ -+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w) -+{ -+ if (!w) -+ return NULL; -+ atomic_fetch_add(&w->ref_count, 1); -+ return (struct ff_weak_link_client*)w; -+} -+ -+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink) -+{ -+ struct ff_weak_link_master * const w = weak_link_x(*ppLink); -+ if 
(!w) -+ return; -+ -+ *ppLink = NULL; -+ weak_link_do_unref(w); -+} -+ -+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink) -+{ -+ struct ff_weak_link_master * const w = weak_link_x(*ppLink); -+ -+ if (!w) -+ return NULL; -+ -+ if (pthread_rwlock_rdlock(&w->lock)) -+ goto broken; -+ -+ if (w->ptr) -+ return w->ptr; -+ -+ pthread_rwlock_unlock(&w->lock); -+ -+broken: -+ *ppLink = NULL; -+ weak_link_do_unref(w); -+ return NULL; -+} -+ -+// Ignores a NULL c (so can be on the return path of both broken & live links) -+void ff_weak_link_unlock(struct ff_weak_link_client * c) -+{ -+ struct ff_weak_link_master * const w = weak_link_x(c); -+ if (w) -+ pthread_rwlock_unlock(&w->lock); -+} -+ -+ -diff --git a/libavcodec/weak_link.h b/libavcodec/weak_link.h -new file mode 100644 -index 0000000000..415b6a27a0 ---- /dev/null -+++ b/libavcodec/weak_link.h -@@ -0,0 +1,23 @@ -+struct ff_weak_link_master; -+struct ff_weak_link_client; -+ -+struct ff_weak_link_master * ff_weak_link_new(void * p); -+void ff_weak_link_break(struct ff_weak_link_master ** ppLink); -+ -+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w); -+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink); -+ -+// Returns NULL if link broken - in this case it will also zap -+// *ppLink and unref the weak_link. 
-+// Returns NULL if *ppLink is NULL (so a link once broken stays broken) -+// -+// The above does mean that there is a race if this is called simultainiously -+// by two threads using the same weak_link_client (so don't do that) -+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink); -+void ff_weak_link_unlock(struct ff_weak_link_client * c); -+ -+ -+ -+ -+ -+ -diff --git a/libavdevice/Makefile b/libavdevice/Makefile -index 0dfe47a1f4..ec7c7b4147 100644 ---- a/libavdevice/Makefile -+++ b/libavdevice/Makefile -@@ -47,6 +47,9 @@ OBJS-$(CONFIG_SNDIO_OUTDEV) += sndio_enc.o sndio.o - OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o - OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o - OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o -+OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o -+OBJS-$(CONFIG_VOUT_EGL_OUTDEV) += egl_vout.o -+OBJS-$(CONFIG_VOUT_RPI_OUTDEV) += rpi_vout.o - OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o - OBJS-$(CONFIG_XV_OUTDEV) += xv.o - -diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c -index 92b27a1d14..19d2a9de55 100644 ---- a/libavdevice/alldevices.c -+++ b/libavdevice/alldevices.c -@@ -53,6 +53,9 @@ extern AVOutputFormat ff_sndio_muxer; - extern AVInputFormat ff_v4l2_demuxer; - extern AVOutputFormat ff_v4l2_muxer; - extern AVInputFormat ff_vfwcap_demuxer; -+extern AVOutputFormat ff_vout_drm_muxer; -+extern AVOutputFormat ff_vout_egl_muxer; -+extern AVOutputFormat ff_vout_rpi_muxer; - extern AVInputFormat ff_xcbgrab_demuxer; - extern AVOutputFormat ff_xv_muxer; - -diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c -new file mode 100644 -index 0000000000..c7b90e6dd8 ---- /dev/null -+++ b/libavdevice/drm_vout.c -@@ -0,0 +1,680 @@ -+/* -+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading -+ * -+ * This file is part of FFmpeg. 
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+ -+// *** This module is a work in progress and its utility is strictly -+// limited to testing. -+ -+#include "libavutil/opt.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/hwcontext_drm.h" -+#include "libavformat/internal.h" -+#include "avdevice.h" -+ -+#include "pthread.h" -+#include -+#include -+ -+#include -+#include -+#include -+ -+#define TRACE_ALL 0 -+ -+#define DRM_MODULE "vc4" -+ -+#define ERRSTR strerror(errno) -+ -+struct drm_setup { -+ int conId; -+ uint32_t crtcId; -+ int crtcIdx; -+ uint32_t planeId; -+ unsigned int out_fourcc; -+ struct { -+ int x, y, width, height; -+ } compose; -+}; -+ -+typedef struct drm_aux_s { -+ unsigned int fb_handle; -+ uint32_t bo_handles[AV_DRM_MAX_PLANES]; -+ AVFrame * frame; -+} drm_aux_t; -+ -+// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS -+// we get initial flicker probably due to dodgy drm timing -+#define AUX_SIZE 3 -+typedef struct drm_display_env_s -+{ -+ AVClass *class; -+ -+ int drm_fd; -+ uint32_t con_id; -+ struct drm_setup setup; -+ enum AVPixelFormat avfmt; -+ -+ int show_all; -+ const char * drm_module; -+ -+ unsigned int ano; -+ drm_aux_t aux[AUX_SIZE]; -+ -+ pthread_t q_thread; -+ sem_t q_sem_in; -+ sem_t q_sem_out; 
-+ int q_terminate; -+ AVFrame * q_next; -+ -+} drm_display_env_t; -+ -+ -+static int drm_vout_write_trailer(AVFormatContext *s) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "%s\n", __func__); -+#endif -+ -+ return 0; -+} -+ -+static int drm_vout_write_header(AVFormatContext *s) -+{ -+ const AVCodecParameters * const par = s->streams[0]->codecpar; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "%s\n", __func__); -+#endif -+ if ( s->nb_streams > 1 -+ || par->codec_type != AVMEDIA_TYPE_VIDEO -+ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { -+ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ return 0; -+} -+ -+static int find_plane(struct AVFormatContext * const avctx, -+ const int drmfd, const int crtcidx, const uint32_t format, -+ uint32_t * const pplane_id) -+{ -+ drmModePlaneResPtr planes; -+ drmModePlanePtr plane; -+ drmModeObjectPropertiesPtr props = NULL; -+ drmModePropertyPtr prop = NULL; -+ unsigned int i; -+ unsigned int j; -+ int ret = -1; -+ -+ planes = drmModeGetPlaneResources(drmfd); -+ if (!planes) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR); -+ return -1; -+ } -+ -+ for (i = 0; i < planes->count_planes; ++i) { -+ plane = drmModeGetPlane(drmfd, planes->planes[i]); -+ if (!planes) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR); -+ break; -+ } -+ -+ if (!(plane->possible_crtcs & (1 << crtcidx))) { -+ drmModeFreePlane(plane); -+ continue; -+ } -+ -+ for (j = 0; j < plane->count_formats; ++j) { -+ if (plane->formats[j] == format) -+ break; -+ } -+ -+ if (j == plane->count_formats) { -+ drmModeFreePlane(plane); -+ continue; -+ } -+ -+ *pplane_id = plane->plane_id; -+ drmModeFreePlane(plane); -+ break; -+ } -+ -+ if (i == planes->count_planes) { -+ ret = -1; -+ goto fail; -+ } -+ -+ props = drmModeObjectGetProperties(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE); -+ if (!props) -+ goto fail; -+ for (i = 0; i != 
props->count_props; ++i) { -+ if (prop) -+ drmModeFreeProperty(prop); -+ prop = drmModeGetProperty(drmfd, props->props[i]); -+ if (!prop) -+ goto fail; -+ if (strcmp("zpos", prop->name) == 0) { -+ if (drmModeObjectSetProperty(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE, props->props[i], prop->values[1]) == 0) -+ av_log(avctx, AV_LOG_DEBUG, "ZPOS set to %d\n", (int)prop->values[1]); -+ else -+ av_log(avctx, AV_LOG_WARNING, "Failed to set ZPOS on DRM plane\n"); -+ break; -+ } -+ } -+ -+ ret = 0; -+fail: -+ if (props) -+ drmModeFreeObjectProperties(props); -+ if (prop) -+ drmModeFreeProperty(prop); -+ drmModeFreePlaneResources(planes); -+ return ret; -+} -+ -+static void da_uninit(drm_display_env_t * const de, drm_aux_t * da) -+{ -+ if (da->fb_handle != 0) { -+ drmModeRmFB(de->drm_fd, da->fb_handle); -+ da->fb_handle = 0; -+ } -+ -+ for (unsigned int i = 0; i != AV_DRM_MAX_PLANES; ++i) { -+ if (da->bo_handles[i]) { -+ struct drm_gem_close gem_close = {.handle = da->bo_handles[i]}; -+ drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close); -+ da->bo_handles[i] = 0; -+ } -+ } -+ av_frame_free(&da->frame); -+} -+ -+static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame) -+{ -+ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; -+ drm_aux_t * da = de->aux + de->ano; -+ const uint32_t format = desc->layers[0].format; -+ int ret = 0; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd); -+#endif -+ -+ if (de->setup.out_fourcc != format) { -+ if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) { -+ av_frame_free(&frame); -+ av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format); -+ return -1; -+ } -+ de->setup.out_fourcc = format; -+ } -+ -+ { -+ drmVBlank vbl = { -+ .request = { -+ .type = DRM_VBLANK_RELATIVE, -+ .sequence = 0 -+ } -+ }; -+ -+ while (drmWaitVBlank(de->drm_fd, &vbl)) { -+ if (errno != EINTR) { -+// av_log(s, AV_LOG_WARNING, 
"drmWaitVBlank failed: %s\n", ERRSTR); -+ break; -+ } -+ } -+ } -+ -+ da_uninit(de, da); -+ -+ { -+ uint32_t pitches[4] = {0}; -+ uint32_t offsets[4] = {0}; -+ uint64_t modifiers[4] = {0}; -+ uint32_t bo_handles[4] = {0}; -+ int has_mods = 0; -+ int i, j, n; -+ -+ da->frame = frame; -+ -+ for (i = 0; i < desc->nb_objects; ++i) { -+ if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) { -+ av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR); -+ return -1; -+ } -+ if (desc->objects[i].format_modifier != DRM_FORMAT_MOD_LINEAR && -+ desc->objects[i].format_modifier != DRM_FORMAT_MOD_INVALID) -+ has_mods = 1; -+ } -+ -+ n = 0; -+ for (i = 0; i < desc->nb_layers; ++i) { -+ for (j = 0; j < desc->layers[i].nb_planes; ++j) { -+ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; -+ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; -+ pitches[n] = p->pitch; -+ offsets[n] = p->offset; -+ modifiers[n] = obj->format_modifier; -+ bo_handles[n] = da->bo_handles[p->object_index]; -+ ++n; -+ } -+ } -+ -+#if 1 && TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," -+ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", -+ av_frame_cropped_width(frame), -+ av_frame_cropped_height(frame), -+ desc->layers[0].format, -+ bo_handles[0], -+ bo_handles[1], -+ bo_handles[2], -+ bo_handles[3], -+ pitches[0], -+ pitches[1], -+ pitches[2], -+ pitches[3], -+ offsets[0], -+ offsets[1], -+ offsets[2], -+ offsets[3], -+ (long long)modifiers[0], -+ (long long)modifiers[1], -+ (long long)modifiers[2], -+ (long long)modifiers[3] -+ ); -+#endif -+ -+ if (drmModeAddFB2WithModifiers(de->drm_fd, -+ av_frame_cropped_width(frame), -+ av_frame_cropped_height(frame), -+ desc->layers[0].format, bo_handles, -+ pitches, offsets, -+ has_mods ? modifiers : NULL, -+ &da->fb_handle, -+ has_mods ? 
DRM_MODE_FB_MODIFIERS : 0) != 0) { -+ av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR); -+ return -1; -+ } -+ } -+ -+ ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId, -+ da->fb_handle, 0, -+ de->setup.compose.x, de->setup.compose.y, -+ de->setup.compose.width, -+ de->setup.compose.height, -+ 0, 0, -+ av_frame_cropped_width(frame) << 16, -+ av_frame_cropped_height(frame) << 16); -+ -+ if (ret != 0) { -+ av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR); -+ } -+ -+ de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1; -+ -+ return ret; -+} -+ -+static int do_sem_wait(sem_t * const sem, const int nowait) -+{ -+ while (nowait ? sem_trywait(sem) : sem_wait(sem)) { -+ if (errno != EINTR) -+ return -errno; -+ } -+ return 0; -+} -+ -+static void * display_thread(void * v) -+{ -+ AVFormatContext * const s = v; -+ drm_display_env_t * const de = s->priv_data; -+ int i; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); -+#endif -+ -+ sem_post(&de->q_sem_out); -+ -+ for (;;) { -+ AVFrame * frame; -+ -+ do_sem_wait(&de->q_sem_in, 0); -+ -+ if (de->q_terminate) -+ break; -+ -+ frame = de->q_next; -+ de->q_next = NULL; -+ sem_post(&de->q_sem_out); -+ -+ do_display(s, de, frame); -+ } -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); -+#endif -+ -+ for (i = 0; i != AUX_SIZE; ++i) -+ da_uninit(de, de->aux + i); -+ -+ av_frame_free(&de->q_next); -+ -+ return NULL; -+} -+ -+static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt) -+{ -+ const AVFrame * const src_frame = (AVFrame *)pkt->data; -+ AVFrame * frame; -+ drm_display_env_t * const de = s->priv_data; -+ int ret; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "%s\n", __func__); -+#endif -+ -+ if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) { -+ av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts); -+ return 0; -+ } -+ -+ if (src_frame->format == 
AV_PIX_FMT_DRM_PRIME) { -+ frame = av_frame_alloc(); -+ av_frame_ref(frame, src_frame); -+ } -+ else if (src_frame->format == AV_PIX_FMT_VAAPI) { -+ frame = av_frame_alloc(); -+ frame->format = AV_PIX_FMT_DRM_PRIME; -+ if (av_hwframe_map(frame, src_frame, 0) != 0) -+ { -+ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); -+ av_frame_free(&frame); -+ return AVERROR(EINVAL); -+ } -+ } -+ else { -+ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format); -+ return AVERROR(EINVAL); -+ } -+ -+ ret = do_sem_wait(&de->q_sem_out, !de->show_all); -+ if (ret) { -+ av_frame_free(&frame); -+ } -+ else { -+ de->q_next = frame; -+ sem_post(&de->q_sem_in); -+ } -+ -+ return 0; -+} -+ -+static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, -+ unsigned flags) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags); -+#endif -+ -+ /* drm_vout_write_header() should have accepted only supported formats */ -+ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY)) -+ return 0; -+ -+ return 0; -+} -+ -+static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type); -+#endif -+ switch(type) { -+ case AV_APP_TO_DEV_WINDOW_REPAINT: -+ return 0; -+ default: -+ break; -+ } -+ return AVERROR(ENOSYS); -+} -+ -+static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId) -+{ -+ int ret = -1; -+ int i; -+ drmModeRes *res = drmModeGetResources(drmfd); -+ drmModeConnector *c; -+ -+ if(!res) -+ { -+ printf( "drmModeGetResources failed: %s\n", ERRSTR); -+ return -1; -+ } -+ -+ if (res->count_crtcs <= 0) -+ { -+ printf( "drm: no crts\n"); -+ goto fail_res; -+ } -+ -+ if (!s->conId) { -+ fprintf(stderr, -+ "No connector ID specified. 
Choosing default from list:\n"); -+ -+ for (i = 0; i < res->count_connectors; i++) { -+ drmModeConnector *con = -+ drmModeGetConnector(drmfd, res->connectors[i]); -+ drmModeEncoder *enc = NULL; -+ drmModeCrtc *crtc = NULL; -+ -+ if (con->encoder_id) { -+ enc = drmModeGetEncoder(drmfd, con->encoder_id); -+ if (enc->crtc_id) { -+ crtc = drmModeGetCrtc(drmfd, enc->crtc_id); -+ } -+ } -+ -+ if (!s->conId && crtc) { -+ s->conId = con->connector_id; -+ s->crtcId = crtc->crtc_id; -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n", -+ con->connector_id, -+ crtc ? crtc->crtc_id : 0, -+ con->connector_type, -+ crtc ? crtc->width : 0, -+ crtc ? crtc->height : 0, -+ (s->conId == (int)con->connector_id ? -+ " (chosen)" : "")); -+ } -+ -+ if (!s->conId) { -+ av_log(avctx, AV_LOG_ERROR, -+ "No suitable enabled connector found.\n"); -+ return -1;; -+ } -+ } -+ -+ s->crtcIdx = -1; -+ -+ for (i = 0; i < res->count_crtcs; ++i) { -+ if (s->crtcId == res->crtcs[i]) { -+ s->crtcIdx = i; -+ break; -+ } -+ } -+ -+ if (s->crtcIdx == -1) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId); -+ goto fail_res; -+ } -+ -+ if (res->count_connectors <= 0) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n"); -+ goto fail_res; -+ } -+ -+ c = drmModeGetConnector(drmfd, s->conId); -+ if (!c) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR); -+ goto fail_res; -+ } -+ -+ if (!c->count_modes) -+ { -+ av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n"); -+ goto fail_conn; -+ } -+ -+ { -+ drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId); -+ s->compose.x = crtc->x; -+ s->compose.y = crtc->y; -+ s->compose.width = crtc->width; -+ s->compose.height = crtc->height; -+ drmModeFreeCrtc(crtc); -+ } -+ -+ if (pConId) -+ *pConId = c->connector_id; -+ ret = 0; -+ -+fail_conn: -+ drmModeFreeConnector(c); -+ -+fail_res: -+ drmModeFreeResources(res); -+ -+ return ret; -+} -+ -+// deinit is called if init 
fails so no need to clean up explicity here -+static int drm_vout_init(struct AVFormatContext * s) -+{ -+ drm_display_env_t * const de = s->priv_data; -+ int rv; -+ -+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); -+ -+ de->drm_fd = -1; -+ de->con_id = 0; -+ de->setup = (struct drm_setup){0}; -+ de->q_terminate = 0; -+ -+ if ((de->drm_fd = drmOpen(de->drm_module, NULL)) < 0) -+ { -+ rv = AVERROR(errno); -+ av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", de->drm_module, av_err2str(rv)); -+ return rv; -+ } -+ -+ if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0) -+ { -+ av_log(s, AV_LOG_ERROR, "failed to find valid mode\n"); -+ rv = AVERROR(EINVAL); -+ goto fail_close; -+ } -+ -+ sem_init(&de->q_sem_in, 0, 0); -+ sem_init(&de->q_sem_out, 0, 0); -+ if (pthread_create(&de->q_thread, NULL, display_thread, s)) { -+ rv = AVERROR(errno); -+ av_log(s, AV_LOG_ERROR, "Failed to create display thread: %s\n", av_err2str(rv)); -+ goto fail_close; -+ } -+ -+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); -+ -+ return 0; -+ -+fail_close: -+ close(de->drm_fd); -+ de->drm_fd = -1; -+ av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__); -+ -+ return rv; -+} -+ -+static void drm_vout_deinit(struct AVFormatContext * s) -+{ -+ drm_display_env_t * const de = s->priv_data; -+ -+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); -+ -+ de->q_terminate = 1; -+ sem_post(&de->q_sem_in); -+ pthread_join(de->q_thread, NULL); -+ sem_destroy(&de->q_sem_in); -+ sem_destroy(&de->q_sem_out); -+ -+ for (unsigned int i = 0; i != AUX_SIZE; ++i) -+ da_uninit(de, de->aux + i); -+ -+ av_frame_free(&de->q_next); -+ -+ if (de->drm_fd >= 0) { -+ close(de->drm_fd); -+ de->drm_fd = -1; -+ } -+ -+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); -+} -+ -+ -+#define OFFSET(x) offsetof(drm_display_env_t, x) -+static const AVOption options[] = { -+ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "drm_module", "drm_module name to 
use, default=" DRM_MODULE, OFFSET(drm_module), AV_OPT_TYPE_STRING, { .str = DRM_MODULE }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, -+ { NULL } -+}; -+ -+static const AVClass drm_vout_class = { -+ .class_name = "drm vid outdev", -+ .item_name = av_default_item_name, -+ .option = options, -+ .version = LIBAVUTIL_VERSION_INT, -+ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, -+}; -+ -+AVOutputFormat ff_vout_drm_muxer = { -+ .name = "vout_drm", -+ .long_name = NULL_IF_CONFIG_SMALL("Drm video output device"), -+ .priv_data_size = sizeof(drm_display_env_t), -+ .audio_codec = AV_CODEC_ID_NONE, -+ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, -+ .write_header = drm_vout_write_header, -+ .write_packet = drm_vout_write_packet, -+ .write_uncoded_frame = drm_vout_write_frame, -+ .write_trailer = drm_vout_write_trailer, -+ .control_message = drm_vout_control_message, -+ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, -+ .priv_class = &drm_vout_class, -+ .init = drm_vout_init, -+ .deinit = drm_vout_deinit, -+}; -+ -diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c -new file mode 100644 -index 0000000000..cc6e310551 ---- /dev/null -+++ b/libavdevice/egl_vout.c -@@ -0,0 +1,788 @@ -+/* -+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+ -+// *** This module is a work in progress and its utility is strictly -+// limited to testing. -+// Amongst other issues it doesn't wait for the pic to be displayed before -+// returning the buffer so flikering does occur. -+ -+#include -+#include -+ -+#include "libavutil/opt.h" -+#include "libavutil/avassert.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/imgutils.h" -+#include "libavutil/hwcontext_drm.h" -+#include "libavformat/internal.h" -+#include "avdevice.h" -+ -+#include "pthread.h" -+#include -+#include -+#include -+ -+#include -+#include -+ -+#include "libavutil/rpi_sand_fns.h" -+ -+#define TRACE_ALL 0 -+ -+struct egl_setup { -+ int conId; -+ -+ Display *dpy; -+ EGLDisplay egl_dpy; -+ EGLContext ctx; -+ EGLSurface surf; -+ Window win; -+ -+ uint32_t crtcId; -+ int crtcIdx; -+ uint32_t planeId; -+ struct { -+ int x, y, width, height; -+ } compose; -+}; -+ -+typedef struct egl_aux_s { -+ int fd; -+ GLuint texture; -+ -+} egl_aux_t; -+ -+typedef struct egl_display_env_s { -+ AVClass *class; -+ -+ struct egl_setup setup; -+ enum AVPixelFormat avfmt; -+ -+ int show_all; -+ int window_width, window_height; -+ int window_x, window_y; -+ int fullscreen; -+ -+ egl_aux_t aux[32]; -+ -+ pthread_t q_thread; -+ pthread_mutex_t q_lock; -+ sem_t display_start_sem; -+ sem_t q_sem; -+ int q_terminate; -+ AVFrame *q_this; -+ AVFrame *q_next; -+ -+} egl_display_env_t; -+ -+ -+/** -+ * Remove window border/decorations. 
-+ */ -+static void -+no_border(Display *dpy, Window w) -+{ -+ static const unsigned MWM_HINTS_DECORATIONS = (1 << 1); -+ static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5; -+ -+ typedef struct { -+ unsigned long flags; -+ unsigned long functions; -+ unsigned long decorations; -+ long inputMode; -+ unsigned long status; -+ } PropMotifWmHints; -+ -+ PropMotifWmHints motif_hints; -+ Atom prop, proptype; -+ unsigned long flags = 0; -+ -+ /* setup the property */ -+ motif_hints.flags = MWM_HINTS_DECORATIONS; -+ motif_hints.decorations = flags; -+ -+ /* get the atom for the property */ -+ prop = XInternAtom(dpy, "_MOTIF_WM_HINTS", True); -+ if (!prop) { -+ /* something went wrong! */ -+ return; -+ } -+ -+ /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */ -+ proptype = prop; -+ -+ XChangeProperty(dpy, w, /* display, window */ -+ prop, proptype, /* property, type */ -+ 32, /* format: 32-bit datums */ -+ PropModeReplace, /* mode */ -+ (unsigned char *)&motif_hints, /* data */ -+ PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */ -+ ); -+} -+ -+ -+/* -+ * Create an RGB, double-buffered window. -+ * Return the window and context handles. -+ */ -+static int -+make_window(struct AVFormatContext *const s, -+ egl_display_env_t *const de, -+ Display *dpy, EGLDisplay egl_dpy, const char *name, -+ Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet) -+{ -+ int scrnum = DefaultScreen(dpy); -+ XSetWindowAttributes attr; -+ unsigned long mask; -+ Window root = RootWindow(dpy, scrnum); -+ Window win; -+ EGLContext ctx; -+ const int fullscreen = de->fullscreen; -+ EGLConfig config; -+ int x = de->window_x; -+ int y = de->window_y; -+ int width = de->window_width ? de->window_width : 1280; -+ int height = de->window_height ? 
de->window_height : 720; -+ -+ -+ if (fullscreen) { -+ int scrnum = DefaultScreen(dpy); -+ -+ x = 0; y = 0; -+ width = DisplayWidth(dpy, scrnum); -+ height = DisplayHeight(dpy, scrnum); -+ } -+ -+ { -+ EGLint num_configs; -+ static const EGLint attribs[] = { -+ EGL_RED_SIZE, 1, -+ EGL_GREEN_SIZE, 1, -+ EGL_BLUE_SIZE, 1, -+ EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, -+ EGL_NONE -+ }; -+ -+ if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) { -+ av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n"); -+ return -1; -+ } -+ } -+ -+ { -+ EGLint vid; -+ if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) { -+ av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n"); -+ return -1; -+ } -+ -+ { -+ XVisualInfo visTemplate = { -+ .visualid = vid, -+ }; -+ int num_visuals; -+ XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask, -+ &visTemplate, &num_visuals); -+ -+ /* window attributes */ -+ attr.background_pixel = 0; -+ attr.border_pixel = 0; -+ attr.colormap = XCreateColormap(dpy, root, visinfo->visual, AllocNone); -+ attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask; -+ /* XXX this is a bad way to get a borderless window! 
*/ -+ mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask; -+ -+ win = XCreateWindow(dpy, root, x, y, width, height, -+ 0, visinfo->depth, InputOutput, -+ visinfo->visual, mask, &attr); -+ XFree(visinfo); -+ } -+ } -+ -+ if (fullscreen) -+ no_border(dpy, win); -+ -+ /* set hints and properties */ -+ { -+ XSizeHints sizehints; -+ sizehints.x = x; -+ sizehints.y = y; -+ sizehints.width = width; -+ sizehints.height = height; -+ sizehints.flags = USSize | USPosition; -+ XSetNormalHints(dpy, win, &sizehints); -+ XSetStandardProperties(dpy, win, name, name, -+ None, (char **)NULL, 0, &sizehints); -+ } -+ -+ eglBindAPI(EGL_OPENGL_ES_API); -+ -+ { -+ static const EGLint ctx_attribs[] = { -+ EGL_CONTEXT_CLIENT_VERSION, 2, -+ EGL_NONE -+ }; -+ ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs); -+ if (!ctx) { -+ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); -+ return -1; -+ } -+ } -+ -+ -+ XMapWindow(dpy, win); -+ -+ { -+ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); -+ if (!surf) { -+ av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); -+ return -1; -+ } -+ -+ if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) { -+ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); -+ return -1; -+ } -+ -+ *winRet = win; -+ *ctxRet = ctx; -+ *surfRet = surf; -+ } -+ -+ return 0; -+} -+ -+static GLint -+compile_shader(struct AVFormatContext *const avctx, GLenum target, const char *source) -+{ -+ GLuint s = glCreateShader(target); -+ -+ if (s == 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n"); -+ return 0; -+ } -+ -+ glShaderSource(s, 1, (const GLchar **)&source, NULL); -+ glCompileShader(s); -+ -+ { -+ GLint ok; -+ glGetShaderiv(s, GL_COMPILE_STATUS, &ok); -+ -+ if (!ok) { -+ GLchar *info; -+ GLint size; -+ -+ glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size); -+ info = malloc(size); -+ -+ glGetShaderInfoLog(s, size, NULL, info); -+ av_log(avctx, AV_LOG_ERROR, "Failed to 
compile shader: %ssource:\n%s\n", info, source); -+ -+ return 0; -+ } -+ } -+ -+ return s; -+} -+ -+static GLuint link_program(struct AVFormatContext *const s, GLint vs, GLint fs) -+{ -+ GLuint prog = glCreateProgram(); -+ -+ if (prog == 0) { -+ av_log(s, AV_LOG_ERROR, "Failed to create program\n"); -+ return 0; -+ } -+ -+ glAttachShader(prog, vs); -+ glAttachShader(prog, fs); -+ glLinkProgram(prog); -+ -+ { -+ GLint ok; -+ glGetProgramiv(prog, GL_LINK_STATUS, &ok); -+ if (!ok) { -+ /* Some drivers return a size of 1 for an empty log. This is the size -+ * of a log that contains only a terminating NUL character. -+ */ -+ GLint size; -+ GLchar *info = NULL; -+ glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size); -+ if (size > 1) { -+ info = malloc(size); -+ glGetProgramInfoLog(prog, size, NULL, info); -+ } -+ -+ av_log(s, AV_LOG_ERROR, "Failed to link: %s\n", -+ (info != NULL) ? info : ""); -+ return 0; -+ } -+ } -+ -+ return prog; -+} -+ -+static int -+gl_setup(struct AVFormatContext *const s) -+{ -+ const char *vs = -+ "attribute vec4 pos;\n" -+ "varying vec2 texcoord;\n" -+ "\n" -+ "void main() {\n" -+ " gl_Position = pos;\n" -+ " texcoord.x = (pos.x + 1.0) / 2.0;\n" -+ " texcoord.y = (-pos.y + 1.0) / 2.0;\n" -+ "}\n"; -+ const char *fs = -+ "#extension GL_OES_EGL_image_external : enable\n" -+ "precision mediump float;\n" -+ "uniform samplerExternalOES s;\n" -+ "varying vec2 texcoord;\n" -+ "void main() {\n" -+ " gl_FragColor = texture2D(s, texcoord);\n" -+ "}\n"; -+ -+ GLuint vs_s; -+ GLuint fs_s; -+ GLuint prog; -+ -+ if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) || -+ !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) || -+ !(prog = link_program(s, vs_s, fs_s))) -+ return -1; -+ -+ glUseProgram(prog); -+ -+ { -+ static const float verts[] = { -+ -1, -1, -+ 1, -1, -+ 1, 1, -+ -1, 1, -+ }; -+ glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts); -+ } -+ -+ glEnableVertexAttribArray(0); -+ return 0; -+} -+ -+static int 
egl_vout_write_trailer(AVFormatContext *s) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s\n", __func__); -+#endif -+ -+ return 0; -+} -+ -+static int egl_vout_write_header(AVFormatContext *s) -+{ -+ const AVCodecParameters *const par = s->streams[0]->codecpar; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s\n", __func__); -+#endif -+ if (s->nb_streams > 1 -+ || par->codec_type != AVMEDIA_TYPE_VIDEO -+ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { -+ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ return 0; -+} -+ -+ -+static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVFrame *const frame) -+{ -+ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor *)frame->data[0]; -+ egl_aux_t *da = NULL; -+ unsigned int i; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); -+#endif -+ -+ for (i = 0; i != 32; ++i) { -+ if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) { -+ da = de->aux + i; -+ break; -+ } -+ } -+ -+ if (da == NULL) { -+ av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__); -+ return AVERROR(EINVAL); -+ } -+ -+ if (da->texture == 0) { -+ EGLint attribs[50]; -+ EGLint *a = attribs; -+ int i, j; -+ static const EGLint anames[] = { -+ EGL_DMA_BUF_PLANE0_FD_EXT, -+ EGL_DMA_BUF_PLANE0_OFFSET_EXT, -+ EGL_DMA_BUF_PLANE0_PITCH_EXT, -+ EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT, -+ EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, -+ EGL_DMA_BUF_PLANE1_FD_EXT, -+ EGL_DMA_BUF_PLANE1_OFFSET_EXT, -+ EGL_DMA_BUF_PLANE1_PITCH_EXT, -+ EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT, -+ EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT, -+ EGL_DMA_BUF_PLANE2_FD_EXT, -+ EGL_DMA_BUF_PLANE2_OFFSET_EXT, -+ EGL_DMA_BUF_PLANE2_PITCH_EXT, -+ EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT, -+ EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT, -+ }; -+ const EGLint *b = anames; -+ -+ *a++ = EGL_WIDTH; -+ *a++ = av_frame_cropped_width(frame); -+ *a++ = EGL_HEIGHT; -+ *a++ = av_frame_cropped_height(frame); -+ *a++ = 
EGL_LINUX_DRM_FOURCC_EXT; -+ *a++ = desc->layers[0].format; -+ -+ for (i = 0; i < desc->nb_layers; ++i) { -+ for (j = 0; j < desc->layers[i].nb_planes; ++j) { -+ const AVDRMPlaneDescriptor *const p = desc->layers[i].planes + j; -+ const AVDRMObjectDescriptor *const obj = desc->objects + p->object_index; -+ *a++ = *b++; -+ *a++ = obj->fd; -+ *a++ = *b++; -+ *a++ = p->offset; -+ *a++ = *b++; -+ *a++ = p->pitch; -+ if (obj->format_modifier == 0) { -+ b += 2; -+ } -+ else { -+ *a++ = *b++; -+ *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF); -+ *a++ = *b++; -+ *a++ = (EGLint)(obj->format_modifier >> 32); -+ } -+ } -+ } -+ -+ *a = EGL_NONE; -+ -+#if TRACE_ALL -+ for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) { -+ av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]); -+ } -+#endif -+ { -+ const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy, -+ EGL_NO_CONTEXT, -+ EGL_LINUX_DMA_BUF_EXT, -+ NULL, attribs); -+ if (!image) { -+ av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd); -+ return -1; -+ } -+ -+ glGenTextures(1, &da->texture); -+ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); -+ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR); -+ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR); -+ glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image); -+ -+ eglDestroyImageKHR(de->setup.egl_dpy, image); -+ } -+ -+ da->fd = desc->objects[0].fd; -+ } -+ -+ glClearColor(0.5, 0.5, 0.5, 0.5); -+ glClear(GL_COLOR_BUFFER_BIT); -+ -+ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); -+ glDrawArrays(GL_TRIANGLE_FAN, 0, 4); -+ eglSwapBuffers(de->setup.egl_dpy, de->setup.surf); -+ -+ glDeleteTextures(1, &da->texture); -+ da->texture = 0; -+ da->fd = -1; -+ -+ return 0; -+} -+ -+static void* display_thread(void *v) -+{ -+ AVFormatContext *const s = v; -+ egl_display_env_t *const de = s->priv_data; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); -+#endif -+ { -+ EGLint 
egl_major, egl_minor; -+ -+ de->setup.dpy = XOpenDisplay(NULL); -+ if (!de->setup.dpy) { -+ av_log(s, AV_LOG_ERROR, "Couldn't open X display\n"); -+ goto fail; -+ } -+ -+ de->setup.egl_dpy = eglGetDisplay(de->setup.dpy); -+ if (!de->setup.egl_dpy) { -+ av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n"); -+ goto fail; -+ } -+ -+ if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) { -+ av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n"); -+ goto fail; -+ } -+ -+ av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor); -+ -+ if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) { -+ av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n"); -+ goto fail; -+ } -+ } -+ -+ if (!de->window_width || !de->window_height) { -+ de->window_width = 1280; -+ de->window_height = 720; -+ } -+ if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout", -+ &de->setup.win, &de->setup.ctx, &de->setup.surf)) { -+ av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__); -+ goto fail; -+ } -+ -+ if (gl_setup(s)) { -+ av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__); -+ goto fail; -+ } -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__); -+#endif -+ sem_post(&de->display_start_sem); -+ -+ for (;;) { -+ AVFrame *frame; -+ -+ while (sem_wait(&de->q_sem) != 0) { -+ av_assert0(errno == EINTR); -+ } -+ -+ if (de->q_terminate) -+ break; -+ -+ pthread_mutex_lock(&de->q_lock); -+ frame = de->q_next; -+ de->q_next = NULL; -+ pthread_mutex_unlock(&de->q_lock); -+ -+ do_display(s, de, frame); -+ -+ av_frame_free(&de->q_this); -+ de->q_this = frame; -+ } -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); -+#endif -+ -+ return NULL; -+ -+fail: -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__); -+#endif -+ de->q_terminate = 1; -+ sem_post(&de->display_start_sem); -+ -+ return NULL; -+} -+ -+static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) -+{ 
-+ const AVFrame *const src_frame = (AVFrame *)pkt->data; -+ AVFrame *frame; -+ egl_display_env_t *const de = s->priv_data; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s\n", __func__); -+#endif -+ -+ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) { -+ frame = av_frame_alloc(); -+ av_frame_ref(frame, src_frame); -+ } -+ else if (src_frame->format == AV_PIX_FMT_VAAPI) { -+ frame = av_frame_alloc(); -+ frame->format = AV_PIX_FMT_DRM_PRIME; -+ if (av_hwframe_map(frame, src_frame, 0) != 0) { -+ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); -+ av_frame_free(&frame); -+ return AVERROR(EINVAL); -+ } -+ } -+ else { -+ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format); -+ return AVERROR(EINVAL); -+ } -+ -+ // Really hacky sync -+ while (de->show_all && de->q_next) { -+ usleep(3000); -+ } -+ -+ pthread_mutex_lock(&de->q_lock); -+ { -+ AVFrame *const t = de->q_next; -+ de->q_next = frame; -+ frame = t; -+ } -+ pthread_mutex_unlock(&de->q_lock); -+ -+ if (frame == NULL) -+ sem_post(&de->q_sem); -+ else -+ av_frame_free(&frame); -+ -+ return 0; -+} -+ -+static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, -+ unsigned flags) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags); -+#endif -+ -+ /* egl_vout_write_header() should have accepted only supported formats */ -+ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY)) -+ return 0; -+ -+ return 0; -+} -+ -+static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); -+#endif -+ switch (type) { -+ case AV_APP_TO_DEV_WINDOW_REPAINT: -+ return 0; -+ default: -+ break; -+ } -+ return AVERROR(ENOSYS); -+} -+ -+// deinit is called if init fails so no need to clean up explicity here -+static int egl_vout_init(struct AVFormatContext *s) -+{ -+ egl_display_env_t *const 
de = s->priv_data; -+ unsigned int i; -+ -+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); -+ -+ de->setup = (struct egl_setup) { 0 }; -+ -+ for (i = 0; i != 32; ++i) { -+ de->aux[i].fd = -1; -+ } -+ -+ de->q_terminate = 0; -+ pthread_mutex_init(&de->q_lock, NULL); -+ sem_init(&de->q_sem, 0, 0); -+ sem_init(&de->display_start_sem, 0, 0); -+ av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0); -+ -+ sem_wait(&de->display_start_sem); -+ if (de->q_terminate) { -+ av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__); -+ return -1; -+ } -+ -+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); -+ -+ return 0; -+} -+ -+static void egl_vout_deinit(struct AVFormatContext *s) -+{ -+ egl_display_env_t *const de = s->priv_data; -+ -+ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); -+ -+ de->q_terminate = 1; -+ sem_post(&de->q_sem); -+ pthread_join(de->q_thread, NULL); -+ sem_destroy(&de->q_sem); -+ pthread_mutex_destroy(&de->q_lock); -+ -+ av_frame_free(&de->q_next); -+ av_frame_free(&de->q_this); -+ -+ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); -+} -+ -+#define OFFSET(x) offsetof(egl_display_env_t, x) -+static const AVOption options[] = { -+ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, { .str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, -+ { NULL } -+ -+}; -+ -+static const AVClass egl_vout_class = { -+ .class_name = "egl vid outdev", -+ .item_name = av_default_item_name, -+ 
.option = options, -+ .version = LIBAVUTIL_VERSION_INT, -+ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, -+}; -+ -+AVOutputFormat ff_vout_egl_muxer = { -+ .name = "vout_egl", -+ .long_name = NULL_IF_CONFIG_SMALL("Egl video output device"), -+ .priv_data_size = sizeof(egl_display_env_t), -+ .audio_codec = AV_CODEC_ID_NONE, -+ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, -+ .write_header = egl_vout_write_header, -+ .write_packet = egl_vout_write_packet, -+ .write_uncoded_frame = egl_vout_write_frame, -+ .write_trailer = egl_vout_write_trailer, -+ .control_message = egl_vout_control_message, -+ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, -+ .priv_class = &egl_vout_class, -+ .init = egl_vout_init, -+ .deinit = egl_vout_deinit, -+}; -+ -diff --git a/libavdevice/rpi_vout.c b/libavdevice/rpi_vout.c -new file mode 100644 -index 0000000000..84723a34ad ---- /dev/null -+++ b/libavdevice/rpi_vout.c -@@ -0,0 +1,534 @@ -+/* -+ * Copyright (c) 2013 Jeff Moguillansky -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/** -+ * @file -+ * XVideo output device -+ * -+ * TODO: -+ * - add support to more formats -+ */ -+ -+#include "libavutil/opt.h" -+#include "libavutil/avassert.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/imgutils.h" -+#include "libavformat/internal.h" -+#include "avdevice.h" -+ -+#include -+#include -+ -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#pragma GCC diagnostic pop -+#include "libavutil/rpi_sand_fns.h" -+#include "libavcodec/rpi_zc.h" -+ -+#define TRACE_ALL 0 -+ -+#define DISPLAY_PORT_DEPTH 4 -+ -+typedef struct rpi_display_env_s -+{ -+ AVClass *class; -+ -+ MMAL_COMPONENT_T* display; -+ MMAL_COMPONENT_T* isp; -+ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup -+ MMAL_CONNECTION_T * conn; -+ -+ MMAL_POOL_T *rpi_pool; -+ volatile int rpi_display_count; -+ -+ MMAL_FOURCC_T req_fmt; -+ MMAL_VIDEO_FORMAT_T req_vfmt; -+ -+ AVZcEnvPtr zc; -+ -+ int window_width, window_height; -+ int window_x, window_y; -+ int layer, fullscreen; -+ int show_all; -+} rpi_display_env_t; -+ -+ -+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { -+ mmal_buffer_header_release(buffer); -+} -+ -+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) { -+ mmal_buffer_header_release(buffer); -+} -+ -+ -+static MMAL_FOURCC_T mmfmt_from_avfmt(const enum AVPixelFormat fmt) -+{ -+ switch (fmt) { -+ case AV_PIX_FMT_SAND128: -+ case AV_PIX_FMT_RPI4_8: -+ return MMAL_ENCODING_YUVUV128; -+ case AV_PIX_FMT_RPI4_10: -+ return MMAL_ENCODING_YUV10_COL; -+ case 
AV_PIX_FMT_SAND64_10: -+ return MMAL_ENCODING_YUVUV64_10; -+ case AV_PIX_FMT_SAND64_16: -+ return MMAL_ENCODING_YUVUV64_16; -+ case AV_PIX_FMT_YUV420P: -+ return MMAL_ENCODING_I420; -+ -+ default: -+ break; -+ } -+ return 0; -+} -+ -+ -+static void video_format_from_zc_frame(MMAL_ES_FORMAT_T* const es_fmt, -+ const AVFrame * const frame, const AVRpiZcRefPtr fr_ref) -+{ -+ MMAL_VIDEO_FORMAT_T *const vfmt = &es_fmt->es->video; -+ const AVRpiZcFrameGeometry * geo = av_rpi_zc_geometry(fr_ref); -+ if (av_rpi_is_sand_format(geo->format)) { -+ // Sand formats are a bit "special" -+ // stride1 implicit in format -+ // width = stride2 -+ vfmt->width = geo->stripe_is_yc ? -+ geo->height_y + geo->height_c : geo->height_y; -+// es->height = geo->video_height; //*** When we get the FLAG this will change -+ vfmt->height = geo->height_y; -+ es_fmt->flags = MMAL_ES_FORMAT_FLAG_COL_FMTS_WIDTH_IS_COL_STRIDE; -+ } -+ else { -+ vfmt->width = geo->stride_y / geo->bytes_per_pel; -+ vfmt->height = geo->height_y; -+ es_fmt->flags = 0; -+ } -+ -+ es_fmt->type = MMAL_ES_TYPE_VIDEO; -+ es_fmt->encoding = mmfmt_from_avfmt(geo->format); -+ es_fmt->encoding_variant = 0; -+ es_fmt->bitrate = 0; -+ -+ vfmt->crop.x = frame->crop_left; -+ vfmt->crop.y = frame->crop_top; -+ vfmt->crop.width = av_frame_cropped_width(frame); -+ vfmt->crop.height = av_frame_cropped_height(frame); -+ -+ vfmt->frame_rate.den = 0; // Don't think I know it here -+ vfmt->frame_rate.num = 0; -+ -+ vfmt->par.den = frame->sample_aspect_ratio.den; -+ vfmt->par.num = frame->sample_aspect_ratio.num; -+ -+ vfmt->color_space = 0; // Unknown currently -+} -+ -+static MMAL_BOOL_T buf_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata) -+{ -+ rpi_display_env_t * const de = userdata; -+ if (buf->user_data != NULL) { -+ av_rpi_zc_unref((AVRpiZcRefPtr)buf->user_data); -+ buf->user_data = NULL; -+ } -+ atomic_fetch_add(&de->rpi_display_count, -1); -+ return MMAL_FALSE; -+} -+ -+static inline int avfmt_needs_isp(const enum AVPixelFormat 
avfmt) -+{ -+ return avfmt == AV_PIX_FMT_SAND64_10; -+} -+ -+static void isp_remove(AVFormatContext * const s, rpi_display_env_t * const de) -+{ -+ if (de->isp != NULL) -+ { -+ if (de->isp->input[0]->is_enabled) -+ mmal_port_disable(de->isp->input[0]); -+ if (de->isp->control->is_enabled) -+ mmal_port_disable(de->isp->control); -+ } -+ if (de->conn != NULL) { -+ mmal_connection_destroy(de->conn); -+ de->conn = NULL; -+ } -+ if (de->isp != NULL) { -+ mmal_component_destroy(de->isp); -+ de->isp = NULL; -+ } -+} -+ -+static void display_frame(AVFormatContext * const s, rpi_display_env_t * const de, const AVFrame* const fr) -+{ -+ MMAL_BUFFER_HEADER_T* buf = NULL; -+ AVRpiZcRefPtr fr_buf = NULL; -+ -+ if (de == NULL) -+ return; -+ -+ if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { -+ av_log(s, AV_LOG_VERBOSE, "Frame dropped\n"); -+ return; -+ } -+ -+ if ((fr_buf = av_rpi_zc_ref(s, de->zc, fr, fr->format, 1)) == NULL) { -+ return; -+ } -+ -+ buf = mmal_queue_get(de->rpi_pool->queue); -+ if (!buf) { -+ // Running too fast so drop the frame (unexpected) -+ goto fail; -+ } -+ -+ buf->cmd = 0; -+ buf->offset = 0; -+ buf->flags = 0; -+ mmal_buffer_header_reset(buf); -+ -+ atomic_fetch_add(&de->rpi_display_count, 1); // Deced on release -+ mmal_buffer_header_pre_release_cb_set(buf, buf_release_cb, de); -+ -+ buf->user_data = fr_buf; -+ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal -+ buf->offset = av_rpi_zc_offset(fr_buf); -+ buf->length = av_rpi_zc_length(fr_buf); -+ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); -+ -+ while (de->show_all && atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { -+ usleep(5000); -+ } -+ -+ { -+ MMAL_ES_SPECIFIC_FORMAT_T new_ess = {.video = {0}}; -+ MMAL_ES_FORMAT_T new_es = {.es = &new_ess}; -+ MMAL_VIDEO_FORMAT_T * const new_vfmt = &new_ess.video; -+ -+ video_format_from_zc_frame(&new_es, fr, fr_buf); -+ if (de->req_fmt != new_es.encoding || -+ de->req_vfmt.width 
!= new_vfmt->width || -+ de->req_vfmt.height != new_vfmt->height || -+ de->req_vfmt.crop.x != new_vfmt->crop.x || -+ de->req_vfmt.crop.y != new_vfmt->crop.y || -+ de->req_vfmt.crop.width != new_vfmt->crop.width || -+ de->req_vfmt.crop.height != new_vfmt->crop.height) { -+ // Something has changed -+ -+ // If we have an ISP tear it down -+ isp_remove(s, de); -+ de->port_in = de->display->input[0]; -+ -+ // If we still need an ISP create it now -+ if (avfmt_needs_isp(fr->format)) -+ { -+ if (mmal_component_create("vc.ril.isp", &de->isp) != MMAL_SUCCESS) -+ { -+ av_log(s, AV_LOG_ERROR, "ISP creation failed\n"); -+ goto fail; -+ } -+ de->port_in = de->isp->input[0]; -+ } -+ -+ mmal_format_copy(de->port_in->format, &new_es); -+ -+ if (mmal_port_format_commit(de->port_in)) { -+ av_log(s, AV_LOG_ERROR, "Failed to commit input format\n"); -+ goto fail; -+ } -+ -+ // If we have an ISP then we must want to use it -+ if (de->isp != NULL) { -+ MMAL_PORT_T * const port_out = de->isp->output[0]; -+ MMAL_VIDEO_FORMAT_T* vfmt_in = &de->port_in->format->es->video; -+ MMAL_VIDEO_FORMAT_T* vfmt_out = &port_out->format->es->video; -+ -+ port_out->format->type = MMAL_ES_TYPE_VIDEO; -+ port_out->format->encoding = MMAL_ENCODING_YUVUV128; -+ port_out->format->encoding_variant = 0; -+ port_out->format->bitrate = 0; -+ port_out->format->flags = 0; -+ port_out->format->extradata = NULL; -+ port_out->format->extradata_size = 0; -+ -+ vfmt_out->width = (vfmt_in->crop.width + 31) & ~31; -+ vfmt_out->height = (vfmt_in->crop.height + 15) & ~15; -+ vfmt_out->crop.x = 0; -+ vfmt_out->crop.y = 0; -+ vfmt_out->crop.width = vfmt_in->crop.width; -+ vfmt_out->crop.height = vfmt_in->crop.height; -+ vfmt_out->frame_rate = vfmt_in->frame_rate; -+ vfmt_out->par = vfmt_in->par; -+ vfmt_out->color_space = vfmt_in->color_space; -+ -+ if (mmal_port_format_commit(port_out)) { -+ av_log(s, AV_LOG_ERROR, "Failed to commit output format\n"); -+ goto fail; -+ } -+ -+ if (mmal_connection_create(&de->conn, port_out, 
de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING) != MMAL_SUCCESS) { -+ av_log(s, AV_LOG_ERROR, "Failed to create connection\n"); -+ goto fail; -+ } -+ if (mmal_connection_enable(de->conn) != MMAL_SUCCESS) { -+ av_log(s, AV_LOG_ERROR, "Failed to enable connection\n"); -+ goto fail; -+ } -+ mmal_port_enable(de->isp->control,display_cb_control); -+ mmal_component_enable(de->isp); -+ } -+ -+ // Number of slots in my port Q -+ de->port_in->buffer_num = DISPLAY_PORT_DEPTH; -+ // Size to keep it happy - isn't used for anything other than error checking -+ de->port_in->buffer_size = buf->alloc_size; -+ if (!de->port_in->is_enabled) -+ { -+ mmal_port_parameter_set_boolean(de->port_in, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image? -+ if (mmal_port_enable(de->port_in, display_cb_input) != MMAL_SUCCESS) { -+ av_log(s, AV_LOG_ERROR, "Failed to enable input port\n"); -+ goto fail; -+ } -+ } -+ -+ de->req_fmt = new_es.encoding; -+ de->req_vfmt = *new_vfmt; -+ } -+ } -+ -+ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS) -+ { -+ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count); -+ goto fail; -+ } -+ return; -+ -+fail: -+ // If we have a buf then fr_buf is held by that -+ if (buf != NULL) -+ mmal_buffer_header_release(buf); -+ else if (fr_buf != NULL) -+ av_rpi_zc_unref(fr_buf); -+} -+ -+ -+static int xv_write_trailer(AVFormatContext *s) -+{ -+ rpi_display_env_t * const de = s->priv_data; -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s\n", __func__); -+#endif -+ if (de->port_in != NULL && de->port_in->is_enabled) { -+ mmal_port_disable(de->port_in); -+ } -+ -+ // The above disable should kick out all buffers - check that -+ if (atomic_load(&de->rpi_display_count) != 0) { -+ av_log(s, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count)); -+ } -+ -+ isp_remove(s, de); -+ if (de->rpi_pool != NULL) { -+ 
mmal_pool_destroy(de->rpi_pool); -+ de->rpi_pool = NULL; -+ } -+ if (de->display != NULL) { -+ mmal_component_destroy(de->display); -+ de->display = NULL; -+ } -+ -+ return 0; -+} -+ -+static int xv_write_header(AVFormatContext *s) -+{ -+ rpi_display_env_t * const de = s->priv_data; -+ const AVCodecParameters * const par = s->streams[0]->codecpar; -+ const unsigned int w = de->window_width ? de->window_width : par->width; -+ const unsigned int h = de->window_height ? de->window_height : par->height; -+ const unsigned int x = de->window_x; -+ const unsigned int y = de->window_y; -+ const int layer = de->layer ? de->layer : 2; -+ const MMAL_BOOL_T fullscreen = de->fullscreen; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s: %dx%d\n", __func__, w, h); -+#endif -+ if ( s->nb_streams > 1 -+ || par->codec_type != AVMEDIA_TYPE_VIDEO -+ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { -+ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ { -+ MMAL_DISPLAYREGION_T region = -+ { -+ .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)}, -+ .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | -+ MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_ALPHA, -+ .layer = layer, -+ .fullscreen = fullscreen, -+ .dest_rect = {x, y, w, h}, -+ .alpha = !fullscreen ? 0xff : 0xff | MMAL_DISPLAY_ALPHA_FLAGS_DISCARD_LOWER_LAYERS, -+ }; -+ -+ bcm_host_init(); // Needs to be done by someone... 
-+ -+ if (mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display) != MMAL_SUCCESS) -+ { -+ av_log(s, AV_LOG_ERROR, "Failed to create display component\n"); -+ goto fail; -+ } -+ de->port_in = de->display->input[0]; -+ -+ mmal_port_parameter_set(de->display->input[0], ®ion.hdr); -+ -+ if (mmal_component_enable(de->display) != MMAL_SUCCESS) -+ { -+ av_log(s, AV_LOG_ERROR, "Failed to enable display component\n"); -+ goto fail; -+ } -+ if (mmal_port_enable(de->display->control,display_cb_control) != MMAL_SUCCESS) -+ { -+ av_log(s, AV_LOG_ERROR, "Failed to enable display control port\n"); -+ goto fail; -+ } -+ -+ if ((de->rpi_pool = mmal_pool_create(DISPLAY_PORT_DEPTH, 0)) == NULL) -+ { -+ av_log(s, AV_LOG_ERROR, "Failed to create pool\n"); -+ goto fail; -+ } -+ } -+ -+ return 0; -+ -+fail: -+ xv_write_trailer(s); -+ return AVERROR_UNKNOWN; -+} -+ -+static int xv_write_packet(AVFormatContext *s, AVPacket *pkt) -+{ -+ AVFrame * const frame = (AVFrame *)pkt->data; -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s\n", __func__); -+#endif -+ display_frame(s, s->priv_data, frame); -+ return 0; -+} -+ -+static int xv_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, -+ unsigned flags) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags); -+#endif -+ -+ /* xv_write_header() should have accepted only supported formats */ -+ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY)) -+ return 0; -+// return write_picture(s, (*frame)->data, (*frame)->linesize); -+ -+ display_frame(s, s->priv_data, *ppframe); -+ return 0; -+} -+ -+static int xv_control_message(AVFormatContext *s, int type, void *data, size_t data_size) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); -+#endif -+ switch(type) { -+ case AV_APP_TO_DEV_WINDOW_REPAINT: -+ return 0; -+ default: -+ break; -+ } -+ return AVERROR(ENOSYS); -+} -+ -+// deinit is called if init fails so no need to clean up explicity here -+static 
int rpi_vout_init(struct AVFormatContext * s) -+{ -+ rpi_display_env_t * const de = s->priv_data; -+ -+ // Get a ZC context in case we need one - has little overhead if unused -+ if ((de->zc = av_rpi_zc_int_env_alloc(s)) == NULL) -+ return 1; -+ -+ return 0; -+} -+ -+static void rpi_vout_deinit(struct AVFormatContext * s) -+{ -+ rpi_display_env_t * const de = s->priv_data; -+ -+ av_rpi_zc_int_env_freep(&de->zc); -+} -+ -+ -+#define OFFSET(x) offsetof(rpi_display_env_t, x) -+static const AVOption options[] = { -+ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "display_layer","set display layer", OFFSET(layer), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, -+ { NULL } -+ -+}; -+ -+static const AVClass xv_class = { -+ .class_name = "rpi vid outdev", -+ .item_name = av_default_item_name, -+ .option = options, -+ .version = LIBAVUTIL_VERSION_INT, -+ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, -+}; -+ -+AVOutputFormat ff_vout_rpi_muxer = { -+ .name = "vout_rpi", -+ .long_name = NULL_IF_CONFIG_SMALL("Rpi (mmal) video output device"), -+ .priv_data_size = sizeof(rpi_display_env_t), -+ .audio_codec = AV_CODEC_ID_NONE, -+ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, -+ .write_header = xv_write_header, -+ .write_packet = xv_write_packet, -+ .write_uncoded_frame = xv_write_frame, -+ .write_trailer = xv_write_trailer, -+ 
.control_message = xv_control_message, -+ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, -+ .priv_class = &xv_class, -+ .init = rpi_vout_init, -+ .deinit = rpi_vout_deinit, -+}; -diff --git a/libavfilter/Makefile b/libavfilter/Makefile -index b2c254ea67..144fbda652 100644 ---- a/libavfilter/Makefile -+++ b/libavfilter/Makefile -@@ -233,6 +233,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) += vf_neighbor.o - OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o - OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_deinterlace_qsv.o - OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o -+OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o - OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o - OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o - OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o -@@ -459,6 +460,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER) += vf_transpose_opencl.o opencl.o o - OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o - OBJS-$(CONFIG_TRIM_FILTER) += trim.o - OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o -+OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o - OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o - OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \ - opencl/unsharp.o -diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c -index 0872c6e0f2..1dd05e4d75 100644 ---- a/libavfilter/allfilters.c -+++ b/libavfilter/allfilters.c -@@ -218,6 +218,7 @@ extern AVFilter ff_vf_dedot; - extern AVFilter ff_vf_deflate; - extern AVFilter ff_vf_deflicker; - extern AVFilter ff_vf_deinterlace_qsv; -+extern AVFilter ff_vf_deinterlace_v4l2m2m; - extern AVFilter ff_vf_deinterlace_vaapi; - extern AVFilter ff_vf_dejudder; - extern AVFilter ff_vf_delogo; -@@ -377,6 +378,7 @@ extern AVFilter ff_vf_scale; - extern AVFilter ff_vf_scale_cuda; - extern AVFilter ff_vf_scale_npp; - extern AVFilter ff_vf_scale_qsv; -+extern AVFilter ff_vf_scale_v4l2m2m; - extern 
AVFilter ff_vf_scale_vaapi; - extern AVFilter ff_vf_scale_vulkan; - extern AVFilter ff_vf_scale2ref; -@@ -438,6 +440,7 @@ extern AVFilter ff_vf_transpose_opencl; - extern AVFilter ff_vf_transpose_vaapi; - extern AVFilter ff_vf_trim; - extern AVFilter ff_vf_unpremultiply; -+extern AVFilter ff_vf_unsand; - extern AVFilter ff_vf_unsharp; - extern AVFilter ff_vf_unsharp_opencl; - extern AVFilter ff_vf_untile; -diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c -index f6b572b3de..44fe8b679c 100644 ---- a/libavfilter/avfiltergraph.c -+++ b/libavfilter/avfiltergraph.c -@@ -32,6 +32,9 @@ - #include "libavutil/internal.h" - #include "libavutil/opt.h" - #include "libavutil/pixdesc.h" -+#if CONFIG_UNSAND_FILTER -+#include "libavutil/rpi_sand_fns.h" -+#endif - - #define FF_INTERNAL_FIELDS 1 - #include "framequeue.h" -@@ -422,6 +425,19 @@ static int formats_declared(AVFilterContext *f) - return 1; - } - -+#if CONFIG_UNSAND_FILTER -+static int has_sand_format(const AVFilterFormats * const ff) -+{ -+ int i; -+ for (i = 0; i != ff->nb_formats; ++i) { -+ if (av_rpi_is_sand_format(ff->formats[i])) { -+ return 1; -+ } -+ } -+ return 0; -+} -+#endif -+ - /** - * Perform one round of query_formats() and merging formats lists on the - * filter graph. 
-@@ -462,6 +478,7 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) - for (j = 0; j < filter->nb_inputs; j++) { - AVFilterLink *link = filter->inputs[j]; - int convert_needed = 0; -+ unsigned int extra_convert_tried = 0; - - if (!link) - continue; -@@ -504,11 +521,14 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) - link->outcfg.formats, link->type) - #undef MERGE_DISPATCH - -- if (convert_needed) { -+ while (convert_needed) { - AVFilterContext *convert; - const AVFilter *filter; - AVFilterLink *inlink, *outlink; - char inst_name[30]; -+ int can_retry = 0; -+ -+ convert_needed = 0; - - if (graph->disable_auto_convert) { - av_log(log_ctx, AV_LOG_ERROR, -@@ -521,19 +541,45 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) - /* couldn't merge format lists. auto-insert conversion filter */ - switch (link->type) { - case AVMEDIA_TYPE_VIDEO: -- if (!(filter = avfilter_get_by_name("scale"))) { -- av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " -- "not present, cannot convert pixel formats.\n"); -- return AVERROR(EINVAL); -- } -- -- snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", -- scaler_count++); -+#if CONFIG_UNSAND_FILTER -+ // Only try each extra conversion once -+ // The unsand output pad should never trigger has_sand_format -+ // but it is better to be safe -+ if ((extra_convert_tried & 1) == 0 && has_sand_format(link->incfg.formats)) { -+ if (!(filter = avfilter_get_by_name("unsand"))) { -+ av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter " -+ "not present, cannot convert pixel formats.\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d", -+ scaler_count++); -+ -+ if ((ret = avfilter_graph_create_filter(&convert, filter, -+ inst_name, "", NULL, -+ graph)) < 0) -+ return ret; - -- if ((ret = avfilter_graph_create_filter(&convert, filter, -- inst_name, graph->scale_sws_opts, NULL, -- graph)) < 0) -- return ret; -+ extra_convert_tried |= 1; -+ can_retry = 1; 
-+ } -+ else -+#endif -+ { -+ if (!(filter = avfilter_get_by_name("scale"))) { -+ av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " -+ "not present, cannot convert pixel formats.\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", -+ scaler_count++); -+ -+ if ((ret = avfilter_graph_create_filter(&convert, filter, -+ inst_name, graph->scale_sws_opts, NULL, -+ graph)) < 0) -+ return ret; -+ } - break; - case AVMEDIA_TYPE_AUDIO: - if (!(filter = avfilter_get_by_name("aresample"))) { -@@ -589,6 +635,13 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) - outlink->outcfg.samplerates) || - CHECKED_MERGE(channel_layouts, outlink->incfg.channel_layouts, - outlink->outcfg.channel_layouts))) { -+ // Try adding an unsand filter & see if that helps -+ if (ret < 0 && can_retry) { -+ link = outlink; -+ convert_needed = 1; -+ continue; -+ } -+ - if (ret < 0) - return ret; - av_log(log_ctx, AV_LOG_ERROR, -diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c -index da1cf9941e..c588ed23cb 100644 ---- a/libavfilter/buffersrc.c -+++ b/libavfilter/buffersrc.c -@@ -188,7 +188,7 @@ int attribute_align_arg av_buffersrc_add_frame_flags(AVFilterContext *ctx, AVFra - - switch (ctx->outputs[0]->type) { - case AVMEDIA_TYPE_VIDEO: -- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height, -+ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame), - frame->format, frame->pts); - break; - case AVMEDIA_TYPE_AUDIO: -diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c -new file mode 100644 -index 0000000000..d4c11cfc51 ---- /dev/null -+++ b/libavfilter/vf_deinterlace_v4l2m2m.c -@@ -0,0 +1,2115 @@ -+/* -+ * This file is part of FFmpeg. 
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/** -+ * @file -+ * deinterlace video filter - V4L2 M2M -+ */ -+ -+#include <drm_fourcc.h> -+ -+#include <linux/videodev2.h> -+ -+#include <dirent.h> -+#include <fcntl.h> -+#include <poll.h> -+#include <stdatomic.h> -+#include <stdio.h> -+#include <string.h> -+#include <sys/ioctl.h> -+#include <sys/mman.h> -+#include <unistd.h> -+ -+#include "config.h" -+ -+#include "libavutil/avassert.h" -+#include "libavutil/avstring.h" -+#include "libavutil/common.h" -+#include "libavutil/hwcontext.h" -+#include "libavutil/hwcontext_drm.h" -+#include "libavutil/internal.h" -+#include "libavutil/mathematics.h" -+#include "libavutil/opt.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/time.h" -+ -+#define FF_INTERNAL_FIELDS 1 -+#include "framequeue.h" -+#include "filters.h" -+#include "avfilter.h" -+#include "formats.h" -+#include "internal.h" -+#include "scale_eval.h" -+#include "video.h" -+ -+#ifndef DRM_FORMAT_P030 -+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */ -+#endif -+ -+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined -+// in drm_fourcc.h hopefully will be sometime in the future but until then...
-+#ifndef V4L2_PIX_FMT_NV12_10_COL128 -+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') -+#endif -+ -+#ifndef V4L2_PIX_FMT_NV12_COL128 -+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ -+#endif -+ -+typedef struct V4L2Queue V4L2Queue; -+typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; -+ -+typedef enum filter_type_v4l2_e -+{ -+ FILTER_V4L2_DEINTERLACE = 1, -+ FILTER_V4L2_SCALE, -+} filter_type_v4l2_t; -+ -+typedef struct V4L2Buffer { -+ int enqueued; -+ int reenqueue; -+ struct v4l2_buffer buffer; -+ AVFrame frame; -+ struct v4l2_plane planes[VIDEO_MAX_PLANES]; -+ int num_planes; -+ AVDRMFrameDescriptor drm_frame; -+ V4L2Queue *q; -+} V4L2Buffer; -+ -+typedef struct V4L2Queue { -+ struct v4l2_format format; -+ struct v4l2_selection sel; -+ int eos; -+ int num_buffers; -+ V4L2Buffer *buffers; -+ const char * name; -+ DeintV4L2M2MContextShared *ctx; -+} V4L2Queue; -+ -+typedef struct pts_stats_s -+{ -+ void * logctx; -+ const char * name; // For debug -+ unsigned int last_count; -+ unsigned int last_interval; -+ int64_t last_pts; -+} pts_stats_t; -+ -+#define PTS_TRACK_SIZE 32 -+typedef struct pts_track_el_s -+{ -+ uint32_t n; -+ unsigned int interval; -+ AVFrame * props; -+} pts_track_el_t; -+ -+typedef struct pts_track_s -+{ -+ uint32_t n; -+ uint32_t last_n; -+ int got_2; -+ void * logctx; -+ pts_stats_t stats; -+ pts_track_el_t a[PTS_TRACK_SIZE]; -+} pts_track_t; -+ -+typedef enum drain_state_e -+{ -+ DRAIN_NONE = 0, // Not draining -+ DRAIN_TIMEOUT, // Drain until normal timeout setup yields no frame -+ DRAIN_LAST, // Drain with long timeout last_frame in received on output expected -+ DRAIN_EOS, // Drain with long timeout EOS expected -+ DRAIN_DONE // Drained -+} drain_state_t; -+ -+typedef struct DeintV4L2M2MContextShared { -+ void * logctx; // For logging - will be NULL when done -+ filter_type_v4l2_t filter_type; -+ -+ int fd; -+ int done; // fd closed - 
awating all refs dropped -+ int width; -+ int height; -+ -+ int drain; // EOS received (inlink status) -+ drain_state_t drain_state; -+ int64_t drain_pts; // PTS associated with inline status -+ -+ unsigned int frames_rx; -+ unsigned int frames_tx; -+ -+ // from options -+ int output_width; -+ int output_height; -+ enum AVPixelFormat output_format; -+ -+ int has_enc_stop; -+ // We expect to get exactly the same number of frames out as we put in -+ // We can drain by matching input to output -+ int one_to_one; -+ -+ int orig_width; -+ int orig_height; -+ atomic_uint refcount; -+ -+ AVBufferRef *hw_frames_ctx; -+ -+ unsigned int field_order; -+ -+ pts_track_t track; -+ -+ V4L2Queue output; -+ V4L2Queue capture; -+} DeintV4L2M2MContextShared; -+ -+typedef struct DeintV4L2M2MContext { -+ const AVClass *class; -+ -+ DeintV4L2M2MContextShared *shared; -+ -+ char * w_expr; -+ char * h_expr; -+ char * output_format_string;; -+ -+ int force_original_aspect_ratio; -+ int force_divisible_by; -+ -+ char *colour_primaries_string; -+ char *colour_transfer_string; -+ char *colour_matrix_string; -+ int colour_range; -+ char *chroma_location_string; -+ -+ enum AVColorPrimaries colour_primaries; -+ enum AVColorTransferCharacteristic colour_transfer; -+ enum AVColorSpace colour_matrix; -+ enum AVChromaLocation chroma_location; -+} DeintV4L2M2MContext; -+ -+ -+static inline int drain_frame_expected(const drain_state_t d) -+{ -+ return d == DRAIN_EOS || d == DRAIN_LAST; -+} -+ -+// These just list the ones we know we can cope with -+static uint32_t -+fmt_av_to_v4l2(const enum AVPixelFormat avfmt) -+{ -+ switch (avfmt) { -+ case AV_PIX_FMT_YUV420P: -+ return V4L2_PIX_FMT_YUV420; -+ case AV_PIX_FMT_NV12: -+ return V4L2_PIX_FMT_NV12; -+#if CONFIG_SAND -+ case AV_PIX_FMT_RPI4_8: -+ case AV_PIX_FMT_SAND128: -+ return V4L2_PIX_FMT_NV12_COL128; -+#endif -+ default: -+ break; -+ } -+ return 0; -+} -+ -+static enum AVPixelFormat -+fmt_v4l2_to_av(const uint32_t pixfmt) -+{ -+ switch (pixfmt) { 
-+ case V4L2_PIX_FMT_YUV420: -+ return AV_PIX_FMT_YUV420P; -+ case V4L2_PIX_FMT_NV12: -+ return AV_PIX_FMT_NV12; -+#if CONFIG_SAND -+ case V4L2_PIX_FMT_NV12_COL128: -+ return AV_PIX_FMT_RPI4_8; -+#endif -+ default: -+ break; -+ } -+ return AV_PIX_FMT_NONE; -+} -+ -+static unsigned int pts_stats_interval(const pts_stats_t * const stats) -+{ -+ return stats->last_interval; -+} -+ -+// Pick 64 for max last count - that is >1sec at 60fps -+#define STATS_LAST_COUNT_MAX 64 -+#define STATS_INTERVAL_MAX (1 << 30) -+static void pts_stats_add(pts_stats_t * const stats, int64_t pts) -+{ -+ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { -+ if (stats->last_count < STATS_LAST_COUNT_MAX) -+ ++stats->last_count; -+ return; -+ } -+ -+ if (stats->last_pts != AV_NOPTS_VALUE) { -+ const int64_t interval = pts - stats->last_pts; -+ -+ if (interval < 0 || interval >= STATS_INTERVAL_MAX || -+ stats->last_count >= STATS_LAST_COUNT_MAX) { -+ if (stats->last_interval != 0) -+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", -+ __func__, stats->name, interval, stats->last_count); -+ stats->last_interval = 0; -+ } -+ else { -+ const int64_t frame_time = interval / (int64_t)stats->last_count; -+ -+ if (frame_time != stats->last_interval) -+ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", -+ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); -+ stats->last_interval = frame_time; -+ } -+ } -+ -+ stats->last_pts = pts; -+ stats->last_count = 1; -+} -+ -+static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) -+{ -+ *stats = (pts_stats_t){ -+ .logctx = logctx, -+ .name = name, -+ .last_count = 1, -+ .last_interval = 0, -+ .last_pts = AV_NOPTS_VALUE -+ }; -+} -+ -+static inline uint32_t pts_track_next_n(pts_track_t * const trk) -+{ -+ if (++trk->n == 0) -+ trk->n = 1; -+ return trk->n; -+} -+ -+static int pts_track_get_frame(pts_track_t * const trk, 
const struct timeval tv, AVFrame * const dst) -+{ -+ uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000); -+ pts_track_el_t * t; -+ -+ // As a first guess assume that n==0 means last frame -+ if (n == 0) { -+ n = trk->last_n; -+ if (n == 0) -+ goto fail; -+ } -+ -+ t = trk->a + (n & (PTS_TRACK_SIZE - 1)); -+ -+ if (t->n != n) { -+ av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n); -+ goto fail; -+ } -+ -+ // 1st frame is simple - just believe it -+ if (n != trk->last_n) { -+ trk->last_n = n; -+ trk->got_2 = 0; -+ return av_frame_copy_props(dst, t->props); -+ } -+ -+ // Only believe in a single interpolated frame -+ if (trk->got_2) -+ goto fail; -+ trk->got_2 = 1; -+ -+ av_frame_copy_props(dst, t->props); -+ -+ -+ // If we can't guess - don't -+ if (t->interval == 0) { -+ dst->best_effort_timestamp = AV_NOPTS_VALUE; -+ dst->pts = AV_NOPTS_VALUE; -+ dst->pkt_dts = AV_NOPTS_VALUE; -+ } -+ else { -+ if (dst->best_effort_timestamp != AV_NOPTS_VALUE) -+ dst->best_effort_timestamp += t->interval / 2; -+ if (dst->pts != AV_NOPTS_VALUE) -+ dst->pts += t->interval / 2; -+ if (dst->pkt_dts != AV_NOPTS_VALUE) -+ dst->pkt_dts += t->interval / 2; -+ } -+ -+ return 0; -+ -+fail: -+ trk->last_n = 0; -+ trk->got_2 = 0; -+ dst->pts = AV_NOPTS_VALUE; -+ dst->pkt_dts = AV_NOPTS_VALUE; -+ return 0; -+} -+ -+// We are only ever expecting in-order frames so nothing more clever is required -+static unsigned int -+pts_track_count(const pts_track_t * const trk) -+{ -+ return (trk->n - trk->last_n) & (PTS_TRACK_SIZE - 1); -+} -+ -+static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src) -+{ -+ const uint32_t n = pts_track_next_n(trk); -+ pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1)); -+ -+ pts_stats_add(&trk->stats, src->pts); -+ -+ t->n = n; -+ t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last -+ av_frame_unref(t->props); -+ 
av_frame_copy_props(t->props, src); -+ -+ // We now know what the previous interval was, rather than having to guess, -+ // so set it. There is a better than decent chance that this is before -+ // we use it. -+ if (t->interval != 0) { -+ pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1)); -+ prev_t->interval = t->interval; -+ } -+ -+ // In case deinterlace interpolates frames use every other usec -+ return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2}; -+} -+ -+static void pts_track_uninit(pts_track_t * const trk) -+{ -+ unsigned int i; -+ for (i = 0; i != PTS_TRACK_SIZE; ++i) { -+ trk->a[i].n = 0; -+ av_frame_free(&trk->a[i].props); -+ } -+} -+ -+static int pts_track_init(pts_track_t * const trk, void *logctx) -+{ -+ unsigned int i; -+ trk->n = 1; -+ pts_stats_init(&trk->stats, logctx, "track"); -+ for (i = 0; i != PTS_TRACK_SIZE; ++i) { -+ trk->a[i].n = 0; -+ if ((trk->a[i].props = av_frame_alloc()) == NULL) { -+ pts_track_uninit(trk); -+ return AVERROR(ENOMEM); -+ } -+ } -+ return 0; -+} -+ -+static inline uint32_t -+fmt_bpl(const struct v4l2_format * const fmt, const unsigned int plane_n) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.plane_fmt[plane_n].bytesperline : fmt->fmt.pix.bytesperline; -+} -+ -+static inline uint32_t -+fmt_height(const struct v4l2_format * const fmt) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; -+} -+ -+static inline uint32_t -+fmt_width(const struct v4l2_format * const fmt) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; -+} -+ -+static inline uint32_t -+fmt_pixelformat(const struct v4l2_format * const fmt) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; -+} -+ -+static inline uint32_t -+buf_bytesused0(const struct v4l2_buffer * const buf) -+{ -+ return V4L2_TYPE_IS_MULTIPLANAR(buf->type) ? 
buf->m.planes[0].bytesused : buf->bytesused; -+} -+ -+static void -+init_format(V4L2Queue * const q, const uint32_t format_type) -+{ -+ memset(&q->format, 0, sizeof(q->format)); -+ memset(&q->sel, 0, sizeof(q->sel)); -+ q->format.type = format_type; -+ q->sel.type = format_type; -+} -+ -+static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) -+{ -+ struct v4l2_capability cap; -+ int ret; -+ -+ memset(&cap, 0, sizeof(cap)); -+ ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap); -+ if (ret < 0) -+ return ret; -+ -+ if (ctx->filter_type == FILTER_V4L2_SCALE && -+ strcmp("bcm2835-codec-isp", cap.card) != 0) -+ { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Not ISP\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ if (!(cap.capabilities & V4L2_CAP_STREAMING)) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "No streaming\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { -+ init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE); -+ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE); -+ } -+ else if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { -+ init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE); -+ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT); -+ } -+ else { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Not M2M\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ return 0; -+} -+ -+// Just use for probe - doesn't modify q format -+static int deint_v4l2m2m_try_format(V4L2Queue *queue, const uint32_t width, const uint32_t height, const enum AVPixelFormat avfmt) -+{ -+ struct v4l2_format fmt = {.type = queue->format.type}; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ int ret, field; -+ // Pick YUV to test with if not otherwise specified -+ uint32_t pixelformat = avfmt == AV_PIX_FMT_NONE ? 
V4L2_PIX_FMT_YUV420 : fmt_av_to_v4l2(avfmt); -+ enum AVPixelFormat r_avfmt; -+ -+ -+ ret = ioctl(ctx->fd, VIDIOC_G_FMT, &fmt); -+ if (ret) -+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret); -+ -+ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && V4L2_TYPE_IS_OUTPUT(fmt.type)) -+ field = V4L2_FIELD_INTERLACED_TB; -+ else -+ field = V4L2_FIELD_NONE; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { -+ fmt.fmt.pix_mp.pixelformat = pixelformat; -+ fmt.fmt.pix_mp.field = field; -+ fmt.fmt.pix_mp.width = width; -+ fmt.fmt.pix_mp.height = height; -+ } else { -+ fmt.fmt.pix.pixelformat = pixelformat; -+ fmt.fmt.pix.field = field; -+ fmt.fmt.pix.width = width; -+ fmt.fmt.pix.height = height; -+ } -+ -+ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, -+ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height, -+ fmt.fmt.pix_mp.pixelformat, -+ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline); -+ -+ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, &fmt); -+ if (ret) -+ return AVERROR(EINVAL); -+ -+ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, -+ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height, -+ fmt.fmt.pix_mp.pixelformat, -+ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline); -+ -+ r_avfmt = fmt_v4l2_to_av(fmt_pixelformat(&fmt)); -+ if (r_avfmt != avfmt && avfmt != AV_PIX_FMT_NONE) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Unable to set format %s on %s port\n", av_get_pix_fmt_name(avfmt), V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src"); -+ return AVERROR(EINVAL); -+ } -+ if (r_avfmt == AV_PIX_FMT_NONE) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "No supported format on %s port\n", V4L2_TYPE_IS_CAPTURE(fmt.type) ? 
"dest" : "src"); -+ return AVERROR(EINVAL); -+ } -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { -+ if (fmt.fmt.pix_mp.field != field) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type); -+ -+ return AVERROR(EINVAL); -+ } -+ } else { -+ if (fmt.fmt.pix.field != field) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type); -+ -+ return AVERROR(EINVAL); -+ } -+ } -+ -+ return 0; -+} -+ -+static int -+do_s_fmt(V4L2Queue * const q) -+{ -+ DeintV4L2M2MContextShared * const ctx = q->ctx; -+ const uint32_t pixelformat = fmt_pixelformat(&q->format); -+ int ret; -+ -+ ret = ioctl(ctx->fd, VIDIOC_S_FMT, &q->format); -+ if (ret) { -+ ret = AVERROR(errno); -+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %s\n", av_err2str(ret)); -+ return ret; -+ } -+ -+ if (pixelformat != fmt_pixelformat(&q->format)) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt_pixelformat(&q->format))); -+ return AVERROR(EINVAL); -+ } -+ -+ q->sel.target = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, -+ q->sel.flags = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? 
V4L2_SEL_FLAG_LE : V4L2_SEL_FLAG_GE; -+ -+ ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &q->sel); -+ if (ret) { -+ ret = AVERROR(errno); -+ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %s\n", av_err2str(ret)); -+ } -+ -+ return 0; -+} -+ -+static void -+set_fmt_color(struct v4l2_format *const fmt, -+ const enum AVColorPrimaries avcp, -+ const enum AVColorSpace avcs, -+ const enum AVColorTransferCharacteristic avxc) -+{ -+ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT; -+ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT; -+ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT; -+ -+ switch (avcp) { -+ case AVCOL_PRI_BT709: -+ cs = V4L2_COLORSPACE_REC709; -+ ycbcr = V4L2_YCBCR_ENC_709; -+ break; -+ case AVCOL_PRI_BT470M: -+ cs = V4L2_COLORSPACE_470_SYSTEM_M; -+ ycbcr = V4L2_YCBCR_ENC_601; -+ break; -+ case AVCOL_PRI_BT470BG: -+ cs = V4L2_COLORSPACE_470_SYSTEM_BG; -+ break; -+ case AVCOL_PRI_SMPTE170M: -+ cs = V4L2_COLORSPACE_SMPTE170M; -+ break; -+ case AVCOL_PRI_SMPTE240M: -+ cs = V4L2_COLORSPACE_SMPTE240M; -+ break; -+ case AVCOL_PRI_BT2020: -+ cs = V4L2_COLORSPACE_BT2020; -+ break; -+ case AVCOL_PRI_SMPTE428: -+ case AVCOL_PRI_SMPTE431: -+ case AVCOL_PRI_SMPTE432: -+ case AVCOL_PRI_EBU3213: -+ case AVCOL_PRI_RESERVED: -+ case AVCOL_PRI_FILM: -+ case AVCOL_PRI_UNSPECIFIED: -+ default: -+ break; -+ } -+ -+ switch (avcs) { -+ case AVCOL_SPC_RGB: -+ cs = V4L2_COLORSPACE_SRGB; -+ break; -+ case AVCOL_SPC_BT709: -+ cs = V4L2_COLORSPACE_REC709; -+ break; -+ case AVCOL_SPC_FCC: -+ cs = V4L2_COLORSPACE_470_SYSTEM_M; -+ break; -+ case AVCOL_SPC_BT470BG: -+ cs = V4L2_COLORSPACE_470_SYSTEM_BG; -+ break; -+ case AVCOL_SPC_SMPTE170M: -+ cs = V4L2_COLORSPACE_SMPTE170M; -+ break; -+ case AVCOL_SPC_SMPTE240M: -+ cs = V4L2_COLORSPACE_SMPTE240M; -+ break; -+ case AVCOL_SPC_BT2020_CL: -+ cs = V4L2_COLORSPACE_BT2020; -+ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM; -+ break; -+ case AVCOL_SPC_BT2020_NCL: -+ cs = V4L2_COLORSPACE_BT2020; -+ break; -+ default: -+ break; 
-+ } -+ -+ switch (xfer) { -+ case AVCOL_TRC_BT709: -+ xfer = V4L2_XFER_FUNC_709; -+ break; -+ case AVCOL_TRC_IEC61966_2_1: -+ xfer = V4L2_XFER_FUNC_SRGB; -+ break; -+ case AVCOL_TRC_SMPTE240M: -+ xfer = V4L2_XFER_FUNC_SMPTE240M; -+ break; -+ case AVCOL_TRC_SMPTE2084: -+ xfer = V4L2_XFER_FUNC_SMPTE2084; -+ break; -+ default: -+ break; -+ } -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.colorspace = cs; -+ fmt->fmt.pix_mp.ycbcr_enc = ycbcr; -+ fmt->fmt.pix_mp.xfer_func = xfer; -+ } else { -+ fmt->fmt.pix.colorspace = cs; -+ fmt->fmt.pix.ycbcr_enc = ycbcr; -+ fmt->fmt.pix.xfer_func = xfer; -+ } -+} -+ -+static void -+set_fmt_color_range(struct v4l2_format *const fmt, const enum AVColorRange avcr) -+{ -+ const enum v4l2_quantization q = -+ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE : -+ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE : -+ V4L2_QUANTIZATION_DEFAULT; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.quantization = q; -+ } else { -+ fmt->fmt.pix.quantization = q; -+ } -+} -+ -+static enum AVColorPrimaries get_color_primaries(const struct v4l2_format *const fmt) -+{ -+ enum v4l2_ycbcr_encoding ycbcr; -+ enum v4l2_colorspace cs; -+ -+ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? -+ fmt->fmt.pix_mp.colorspace : -+ fmt->fmt.pix.colorspace; -+ -+ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
-+ fmt->fmt.pix_mp.ycbcr_enc: -+ fmt->fmt.pix.ycbcr_enc; -+ -+ switch(ycbcr) { -+ case V4L2_YCBCR_ENC_XV709: -+ case V4L2_YCBCR_ENC_709: return AVCOL_PRI_BT709; -+ case V4L2_YCBCR_ENC_XV601: -+ case V4L2_YCBCR_ENC_601:return AVCOL_PRI_BT470M; -+ default: -+ break; -+ } -+ -+ switch(cs) { -+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_PRI_BT470BG; -+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_PRI_SMPTE170M; -+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_PRI_SMPTE240M; -+ case V4L2_COLORSPACE_BT2020: return AVCOL_PRI_BT2020; -+ default: -+ break; -+ } -+ -+ return AVCOL_PRI_UNSPECIFIED; -+} -+ -+static enum AVColorSpace get_color_space(const struct v4l2_format *const fmt) -+{ -+ enum v4l2_ycbcr_encoding ycbcr; -+ enum v4l2_colorspace cs; -+ -+ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? -+ fmt->fmt.pix_mp.colorspace : -+ fmt->fmt.pix.colorspace; -+ -+ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? -+ fmt->fmt.pix_mp.ycbcr_enc: -+ fmt->fmt.pix.ycbcr_enc; -+ -+ switch(cs) { -+ case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB; -+ case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709; -+ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC; -+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG; -+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M; -+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M; -+ case V4L2_COLORSPACE_BT2020: -+ if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM) -+ return AVCOL_SPC_BT2020_CL; -+ else -+ return AVCOL_SPC_BT2020_NCL; -+ default: -+ break; -+ } -+ -+ return AVCOL_SPC_UNSPECIFIED; -+} -+ -+static enum AVColorTransferCharacteristic get_color_trc(const struct v4l2_format *const fmt) -+{ -+ enum v4l2_ycbcr_encoding ycbcr; -+ enum v4l2_xfer_func xfer; -+ enum v4l2_colorspace cs; -+ -+ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? -+ fmt->fmt.pix_mp.colorspace : -+ fmt->fmt.pix.colorspace; -+ -+ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
-+ fmt->fmt.pix_mp.ycbcr_enc: -+ fmt->fmt.pix.ycbcr_enc; -+ -+ xfer = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? -+ fmt->fmt.pix_mp.xfer_func: -+ fmt->fmt.pix.xfer_func; -+ -+ switch (xfer) { -+ case V4L2_XFER_FUNC_709: return AVCOL_TRC_BT709; -+ case V4L2_XFER_FUNC_SRGB: return AVCOL_TRC_IEC61966_2_1; -+ default: -+ break; -+ } -+ -+ switch (cs) { -+ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_TRC_GAMMA22; -+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_TRC_GAMMA28; -+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_TRC_SMPTE170M; -+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_TRC_SMPTE240M; -+ default: -+ break; -+ } -+ -+ switch (ycbcr) { -+ case V4L2_YCBCR_ENC_XV709: -+ case V4L2_YCBCR_ENC_XV601: return AVCOL_TRC_BT1361_ECG; -+ default: -+ break; -+ } -+ -+ return AVCOL_TRC_UNSPECIFIED; -+} -+ -+static enum AVColorRange get_color_range(const struct v4l2_format *const fmt) -+{ -+ enum v4l2_quantization qt; -+ -+ qt = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? -+ fmt->fmt.pix_mp.quantization : -+ fmt->fmt.pix.quantization; -+ -+ switch (qt) { -+ case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG; -+ case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG; -+ default: -+ break; -+ } -+ -+ return AVCOL_RANGE_UNSPECIFIED; -+} -+ -+static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) -+{ -+ struct v4l2_format *const format = &q->format; -+ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; -+ -+ const uint32_t drm_fmt = src->layers[0].format; -+ // Treat INVALID as LINEAR -+ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ? 
-+ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier; -+ uint32_t pix_fmt = 0; -+ uint32_t w = 0; -+ uint32_t h = 0; -+ uint32_t bpl = src->layers[0].planes[0].pitch; -+ -+ // We really don't expect multiple layers -+ // All formats that we currently cope with are single object -+ -+ if (src->nb_layers != 1 || src->nb_objects != 1) -+ return AVERROR(EINVAL); -+ -+ switch (drm_fmt) { -+ case DRM_FORMAT_YUV420: -+ if (mod == DRM_FORMAT_MOD_LINEAR) { -+ if (src->layers[0].nb_planes != 3) -+ break; -+ pix_fmt = V4L2_PIX_FMT_YUV420; -+ h = src->layers[0].planes[1].offset / bpl; -+ w = bpl; -+ } -+ break; -+ -+ case DRM_FORMAT_NV12: -+ if (mod == DRM_FORMAT_MOD_LINEAR) { -+ if (src->layers[0].nb_planes != 2) -+ break; -+ pix_fmt = V4L2_PIX_FMT_NV12; -+ h = src->layers[0].planes[1].offset / bpl; -+ w = bpl; -+ } -+#if CONFIG_SAND -+ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { -+ if (src->layers[0].nb_planes != 2) -+ break; -+ pix_fmt = V4L2_PIX_FMT_NV12_COL128; -+ w = bpl; -+ h = src->layers[0].planes[1].offset / 128; -+ bpl = fourcc_mod_broadcom_param(mod); -+ } -+#endif -+ break; -+ -+ case DRM_FORMAT_P030: -+#if CONFIG_SAND -+ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { -+ if (src->layers[0].nb_planes != 2) -+ break; -+ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128; -+ w = bpl / 2; // Matching lie to how we construct this -+ h = src->layers[0].planes[1].offset / 128; -+ bpl = fourcc_mod_broadcom_param(mod); -+ } -+#endif -+ break; -+ -+ default: -+ break; -+ } -+ -+ if (!pix_fmt) -+ return AVERROR(EINVAL); -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { -+ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp; -+ -+ pix->width = w; -+ pix->height = h; -+ pix->pixelformat = pix_fmt; -+ pix->plane_fmt[0].bytesperline = bpl; -+ pix->num_planes = 1; -+ } -+ else { -+ struct v4l2_pix_format *const pix = &format->fmt.pix; -+ -+ pix->width = w; -+ pix->height = h; -+ pix->pixelformat = pix_fmt; -+ 
pix->bytesperline = bpl; -+ } -+ -+ set_fmt_color(format, frame->color_primaries, frame->colorspace, frame->color_trc); -+ set_fmt_color_range(format, frame->color_range); -+ -+ q->sel.r.width = frame->width - (frame->crop_left + frame->crop_right); -+ q->sel.r.height = frame->height - (frame->crop_top + frame->crop_bottom); -+ q->sel.r.left = frame->crop_left; -+ q->sel.r.top = frame->crop_top; -+ -+ return 0; -+} -+ -+ -+static int set_dst_format(DeintV4L2M2MContext * const priv, V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height) -+{ -+ struct v4l2_format * const fmt = &queue->format; -+ struct v4l2_selection *const sel = &queue->sel; -+ -+ memset(&fmt->fmt, 0, sizeof(fmt->fmt)); -+ -+ // Align w/h to 16 here in case there are alignment requirements at the next -+ // stage of the filter chain (also RPi deinterlace setup is bust and this -+ // fixes it) -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { -+ fmt->fmt.pix_mp.pixelformat = pixelformat; -+ fmt->fmt.pix_mp.field = field; -+ fmt->fmt.pix_mp.width = FFALIGN(width, 16); -+ fmt->fmt.pix_mp.height = FFALIGN(height, 16); -+ } else { -+ fmt->fmt.pix.pixelformat = pixelformat; -+ fmt->fmt.pix.field = field; -+ fmt->fmt.pix.width = FFALIGN(width, 16); -+ fmt->fmt.pix.height = FFALIGN(height, 16); -+ } -+ -+ set_fmt_color(fmt, priv->colour_primaries, priv->colour_matrix, priv->colour_transfer); -+ set_fmt_color_range(fmt, priv->colour_range); -+ -+ sel->r.width = width; -+ sel->r.height = height; -+ sel->r.left = 0; -+ sel->r.top = 0; -+ -+ return do_s_fmt(queue); -+} -+ -+static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) -+{ -+ int ret; -+ -+ ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0); -+ if (ctx->fd < 0) -+ return AVERROR(errno); -+ -+ ret = deint_v4l2m2m_prepare_context(ctx); -+ if (ret) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to prepare context\n"); -+ goto fail; -+ } -+ -+ ret = deint_v4l2m2m_try_format(&ctx->capture, ctx->output_width, 
ctx->output_height, ctx->output_format); -+ if (ret) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try dst format\n"); -+ goto fail; -+ } -+ -+ ret = deint_v4l2m2m_try_format(&ctx->output, ctx->width, ctx->height, AV_PIX_FMT_NONE); -+ if (ret) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try src format\n"); -+ goto fail; -+ } -+ -+ return 0; -+ -+fail: -+ close(ctx->fd); -+ ctx->fd = -1; -+ -+ return ret; -+} -+ -+static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx) -+{ -+ int ret = AVERROR(EINVAL); -+ struct dirent *entry; -+ char node[PATH_MAX]; -+ DIR *dirp; -+ -+ dirp = opendir("/dev"); -+ if (!dirp) -+ return AVERROR(errno); -+ -+ for (entry = readdir(dirp); entry; entry = readdir(dirp)) { -+ -+ if (strncmp(entry->d_name, "video", 5)) -+ continue; -+ -+ snprintf(node, sizeof(node), "/dev/%s", entry->d_name); -+ av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node); -+ ret = deint_v4l2m2m_probe_device(ctx, node); -+ if (!ret) -+ break; -+ } -+ -+ closedir(dirp); -+ -+ if (ret) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n"); -+ ctx->fd = -1; -+ -+ return ret; -+ } -+ -+ av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node); -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) -+{ -+ int ret; -+ -+ ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ buf->enqueued = 1; -+ -+ return 0; -+} -+ -+static void -+drm_frame_init(AVDRMFrameDescriptor * const d) -+{ -+ unsigned int i; -+ for (i = 0; i != AV_DRM_MAX_PLANES; ++i) { -+ d->objects[i].fd = -1; -+ } -+} -+ -+static void -+drm_frame_uninit(AVDRMFrameDescriptor * const d) -+{ -+ unsigned int i; -+ for (i = 0; i != d->nb_objects; ++i) { -+ if (d->objects[i].fd != -1) { -+ close(d->objects[i].fd); -+ d->objects[i].fd = -1; -+ } -+ } -+} -+ -+static void -+avbufs_delete(V4L2Buffer** ppavbufs, const unsigned int n) -+{ -+ unsigned int i; -+ V4L2Buffer* const avbufs = 
*ppavbufs; -+ -+ if (avbufs == NULL) -+ return; -+ *ppavbufs = NULL; -+ -+ for (i = 0; i != n; ++i) { -+ V4L2Buffer* const avbuf = avbufs + i; -+ drm_frame_uninit(&avbuf->drm_frame); -+ } -+ -+ av_free(avbufs); -+} -+ -+static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) -+{ -+ struct v4l2_exportbuffer expbuf; -+ int i, ret; -+ uint64_t mod = DRM_FORMAT_MOD_LINEAR; -+ -+ AVDRMFrameDescriptor * const drm_desc = &avbuf->drm_frame; -+ AVDRMLayerDescriptor * const layer = &drm_desc->layers[0]; -+ const struct v4l2_format *const fmt = &q->format; -+ const uint32_t height = fmt_height(fmt); -+ ptrdiff_t bpl0; -+ -+ /* fill the DRM frame descriptor */ -+ drm_desc->nb_layers = 1; -+ layer->nb_planes = avbuf->num_planes; -+ -+ for (int i = 0; i < avbuf->num_planes; i++) { -+ layer->planes[i].object_index = i; -+ layer->planes[i].offset = 0; -+ layer->planes[i].pitch = fmt_bpl(fmt, i); -+ } -+ bpl0 = layer->planes[0].pitch; -+ -+ switch (fmt_pixelformat(fmt)) { -+#if CONFIG_SAND -+ case V4L2_PIX_FMT_NV12_COL128: -+ mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0); -+ layer->format = V4L2_PIX_FMT_NV12; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 2; -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = height * 128; -+ layer->planes[0].pitch = fmt_width(fmt); -+ layer->planes[1].pitch = layer->planes[0].pitch; -+ break; -+#endif -+ -+ case DRM_FORMAT_NV12: -+ layer->format = V4L2_PIX_FMT_NV12; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 2; -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = bpl0 * height; -+ layer->planes[1].pitch = bpl0; -+ break; -+ -+ case V4L2_PIX_FMT_YUV420: -+ layer->format = DRM_FORMAT_YUV420; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 3; -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = bpl0 * height; -+ layer->planes[1].pitch = bpl0 / 2; -+ layer->planes[2].object_index = 0; -+ layer->planes[2].offset 
= layer->planes[1].offset + ((bpl0 * height) / 4); -+ layer->planes[2].pitch = bpl0 / 2; -+ break; -+ -+ default: -+ drm_desc->nb_layers = 0; -+ return AVERROR(EINVAL); -+ } -+ -+ drm_desc->nb_objects = 0; -+ for (i = 0; i < avbuf->num_planes; i++) { -+ memset(&expbuf, 0, sizeof(expbuf)); -+ -+ expbuf.index = avbuf->buffer.index; -+ expbuf.type = avbuf->buffer.type; -+ expbuf.plane = i; -+ -+ ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ drm_desc->objects[i].size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type) ? -+ avbuf->buffer.m.planes[i].length : avbuf->buffer.length; -+ drm_desc->objects[i].fd = expbuf.fd; -+ drm_desc->objects[i].format_modifier = mod; -+ drm_desc->nb_objects = i + 1; -+ } -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) -+{ -+ struct v4l2_format *fmt = &queue->format; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ struct v4l2_requestbuffers req; -+ int ret, i, multiplanar; -+ uint32_t memory; -+ -+ memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
-+ V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; -+ -+ multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type); -+ -+ memset(&req, 0, sizeof(req)); -+ req.count = queue->num_buffers; -+ req.memory = memory; -+ req.type = fmt->type; -+ -+ ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req); -+ if (ret < 0) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno)); -+ -+ return AVERROR(errno); -+ } -+ -+ queue->num_buffers = req.count; -+ queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer)); -+ if (!queue->buffers) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n"); -+ -+ return AVERROR(ENOMEM); -+ } -+ -+ for (i = 0; i < queue->num_buffers; i++) { -+ V4L2Buffer * const buf = &queue->buffers[i]; -+ -+ buf->enqueued = 0; -+ buf->q = queue; -+ -+ buf->buffer.type = fmt->type; -+ buf->buffer.memory = memory; -+ buf->buffer.index = i; -+ -+ if (multiplanar) { -+ buf->buffer.length = VIDEO_MAX_PLANES; -+ buf->buffer.m.planes = buf->planes; -+ } -+ -+ drm_frame_init(&buf->drm_frame); -+ } -+ -+ for (i = 0; i < queue->num_buffers; i++) { -+ V4L2Buffer * const buf = &queue->buffers[i]; -+ -+ ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); -+ if (ret < 0) { -+ ret = AVERROR(errno); -+ -+ goto fail; -+ } -+ -+ buf->num_planes = multiplanar ? 
buf->buffer.length : 1; -+ -+ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { -+ ret = deint_v4l2m2m_enqueue_buffer(buf); -+ if (ret) -+ goto fail; -+ -+ ret = v4l2_buffer_export_drm(queue, buf); -+ if (ret) -+ goto fail; -+ } -+ } -+ -+ return 0; -+ -+fail: -+ avbufs_delete(&queue->buffers, queue->num_buffers); -+ queue->num_buffers = 0; -+ return ret; -+} -+ -+static int deint_v4l2m2m_streamon(V4L2Queue *queue) -+{ -+ DeintV4L2M2MContextShared * const ctx = queue->ctx; -+ int type = queue->format.type; -+ int ret; -+ -+ ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type); -+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ return 0; -+} -+ -+static int deint_v4l2m2m_streamoff(V4L2Queue *queue) -+{ -+ DeintV4L2M2MContextShared * const ctx = queue->ctx; -+ int type = queue->format.type; -+ int ret; -+ -+ ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type); -+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ return 0; -+} -+ -+// timeout in ms -+static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout) -+{ -+ struct v4l2_plane planes[VIDEO_MAX_PLANES]; -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ struct v4l2_buffer buf = { 0 }; -+ V4L2Buffer* avbuf = NULL; -+ struct pollfd pfd; -+ short events; -+ int ret; -+ -+ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) -+ events = POLLOUT | POLLWRNORM; -+ else -+ events = POLLIN | POLLRDNORM; -+ -+ pfd.events = events; -+ pfd.fd = ctx->fd; -+ -+ for (;;) { -+ ret = poll(&pfd, 1, timeout); -+ if (ret > 0) -+ break; -+ if (errno == EINTR) -+ continue; -+ return NULL; -+ } -+ -+ if (pfd.revents & POLLERR) -+ return NULL; -+ -+ if (pfd.revents & events) { -+ memset(&buf, 0, sizeof(buf)); -+ buf.memory = V4L2_MEMORY_MMAP; -+ buf.type = queue->format.type; -+ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { -+ 
memset(planes, 0, sizeof(planes)); -+ buf.length = VIDEO_MAX_PLANES; -+ buf.m.planes = planes; -+ } -+ -+ ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf); -+ if (ret) { -+ if (errno != EAGAIN) -+ av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n", -+ av_err2str(AVERROR(errno))); -+ return NULL; -+ } -+ -+ avbuf = &queue->buffers[buf.index]; -+ avbuf->enqueued = 0; -+ avbuf->buffer = buf; -+ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { -+ memcpy(avbuf->planes, planes, sizeof(planes)); -+ avbuf->buffer.m.planes = avbuf->planes; -+ } -+ return avbuf; -+ } -+ -+ return NULL; -+} -+ -+static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue) -+{ -+ int i; -+ V4L2Buffer *buf = NULL; -+ -+ for (i = 0; i < queue->num_buffers; i++) -+ if (!queue->buffers[i].enqueued) { -+ buf = &queue->buffers[i]; -+ break; -+ } -+ return buf; -+} -+ -+static void deint_v4l2m2m_unref_queued(V4L2Queue *queue) -+{ -+ int i; -+ V4L2Buffer *buf = NULL; -+ -+ if (!queue || !queue->buffers) -+ return; -+ for (i = 0; i < queue->num_buffers; i++) { -+ buf = &queue->buffers[i]; -+ if (queue->buffers[i].enqueued) -+ av_frame_unref(&buf->frame); -+ } -+} -+ -+static void recycle_q(V4L2Queue * const queue) -+{ -+ V4L2Buffer* avbuf; -+ while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) { -+ av_frame_unref(&avbuf->frame); -+ } -+} -+ -+static int count_enqueued(V4L2Queue *queue) -+{ -+ int i; -+ int n = 0; -+ -+ if (queue->buffers == NULL) -+ return 0; -+ -+ for (i = 0; i < queue->num_buffers; i++) -+ if (queue->buffers[i].enqueued) -+ ++n; -+ return n; -+} -+ -+static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame) -+{ -+ DeintV4L2M2MContextShared *const ctx = queue->ctx; -+ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0]; -+ V4L2Buffer *buf; -+ int i; -+ -+ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) -+ recycle_q(queue); -+ -+ buf = deint_v4l2m2m_find_free_buf(queue); -+ if (!buf) { -+ av_log(ctx->logctx, 
AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0); -+ return AVERROR(EAGAIN); -+ } -+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) -+ for (i = 0; i < drm_desc->nb_objects; i++) -+ buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd; -+ else -+ buf->buffer.m.fd = drm_desc->objects[0].fd; -+ -+ buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE : -+ frame->top_field_first ? V4L2_FIELD_INTERLACED_TB : -+ V4L2_FIELD_INTERLACED_BT; -+ -+ if (ctx->field_order != buf->buffer.field) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field); -+ ctx->field_order = buf->buffer.field; -+ } -+ -+ buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame); -+ -+ buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd; -+ -+ av_frame_move_ref(&buf->frame, frame); -+ -+ return deint_v4l2m2m_enqueue_buffer(buf); -+} -+ -+static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) -+{ -+ if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { -+ V4L2Queue *capture = &ctx->capture; -+ V4L2Queue *output = &ctx->output; -+ -+ av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); -+ -+ if (ctx->fd >= 0) { -+ deint_v4l2m2m_streamoff(capture); -+ deint_v4l2m2m_streamoff(output); -+ } -+ -+ avbufs_delete(&capture->buffers, capture->num_buffers); -+ -+ deint_v4l2m2m_unref_queued(output); -+ -+ av_buffer_unref(&ctx->hw_frames_ctx); -+ -+ if (capture->buffers) -+ av_free(capture->buffers); -+ -+ if (output->buffers) -+ av_free(output->buffers); -+ -+ if (ctx->fd >= 0) { -+ close(ctx->fd); -+ ctx->fd = -1; -+ } -+ -+ av_free(ctx); -+ } -+} -+ -+static void v4l2_free_buffer(void *opaque, uint8_t *unused) -+{ -+ V4L2Buffer *buf = opaque; -+ DeintV4L2M2MContextShared *ctx = buf->q->ctx; -+ -+ if (!ctx->done) -+ deint_v4l2m2m_enqueue_buffer(buf); -+ -+ deint_v4l2m2m_destroy_context(ctx); -+} -+ -+// timeout in ms -+static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, 
int timeout) -+{ -+ DeintV4L2M2MContextShared *ctx = queue->ctx; -+ V4L2Buffer* avbuf; -+ enum AVColorPrimaries color_primaries; -+ enum AVColorSpace colorspace; -+ enum AVColorTransferCharacteristic color_trc; -+ enum AVColorRange color_range; -+ -+ av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); -+ -+ if (queue->eos) { -+ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: EOS\n", __func__); -+ return AVERROR_EOF; -+ } -+ -+ avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); -+ if (!avbuf) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout); -+ return AVERROR(EAGAIN); -+ } -+ -+ if (V4L2_TYPE_IS_CAPTURE(avbuf->buffer.type)) { -+ if ((avbuf->buffer.flags & V4L2_BUF_FLAG_LAST) != 0) -+ queue->eos = 1; -+ if (buf_bytesused0(&avbuf->buffer) == 0) -+ return queue->eos ? AVERROR_EOF : AVERROR(EINVAL); -+ } -+ -+ // Fill in PTS and anciliary info from src frame -+ pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); -+ -+ frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame, -+ sizeof(avbuf->drm_frame), v4l2_free_buffer, -+ avbuf, AV_BUFFER_FLAG_READONLY); -+ if (!frame->buf[0]) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0); -+ return AVERROR(ENOMEM); -+ } -+ -+ atomic_fetch_add(&ctx->refcount, 1); -+ -+ frame->data[0] = (uint8_t *)&avbuf->drm_frame; -+ frame->format = AV_PIX_FMT_DRM_PRIME; -+ if (ctx->hw_frames_ctx) -+ frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); -+ frame->height = ctx->output_height; -+ frame->width = ctx->output_width; -+ -+ color_primaries = get_color_primaries(&ctx->capture.format); -+ colorspace = get_color_space(&ctx->capture.format); -+ color_trc = get_color_trc(&ctx->capture.format); -+ color_range = get_color_range(&ctx->capture.format); -+ -+ // If the color parameters are unspecified by V4L2 then leave alone as they -+ // will have been copied from src -+ if (color_primaries != AVCOL_PRI_UNSPECIFIED) -+ 
frame->color_primaries = color_primaries; -+ if (colorspace != AVCOL_SPC_UNSPECIFIED) -+ frame->colorspace = colorspace; -+ if (color_trc != AVCOL_TRC_UNSPECIFIED) -+ frame->color_trc = color_trc; -+ if (color_range != AVCOL_RANGE_UNSPECIFIED) -+ frame->color_range = color_range; -+ -+ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE) { -+ // Not interlaced now -+ frame->interlaced_frame = 0; // *** Fill in from dst buffer? -+ frame->top_field_first = 0; -+ // Pkt duration halved -+ frame->pkt_duration /= 2; -+ } -+ -+ if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { -+ av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n"); -+ frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM; -+ } -+ -+ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts); -+ return 0; -+} -+ -+static int deint_v4l2m2m_config_props(AVFilterLink *outlink) -+{ -+ AVFilterLink *inlink = outlink->src->inputs[0]; -+ AVFilterContext *avctx = outlink->src; -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ int ret; -+ -+ ctx->height = avctx->inputs[0]->h; -+ ctx->width = avctx->inputs[0]->w; -+ -+ if (ctx->filter_type == FILTER_V4L2_SCALE) { -+ if ((ret = ff_scale_eval_dimensions(priv, -+ priv->w_expr, priv->h_expr, -+ inlink, outlink, -+ &ctx->output_width, &ctx->output_height)) < 0) -+ return ret; -+ -+ ff_scale_adjust_dimensions(inlink, &ctx->output_width, &ctx->output_height, -+ priv->force_original_aspect_ratio, priv->force_divisible_by); -+ } -+ else { -+ ctx->output_width = ctx->width; -+ ctx->output_height = ctx->height; -+ } -+ -+ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d FR: %d/%d->%d/%d\n", __func__, -+ ctx->width, ctx->height, ctx->output_width, ctx->output_height, -+ inlink->frame_rate.num, inlink->frame_rate.den, outlink->frame_rate.num, outlink->frame_rate.den); -+ -+ outlink->time_base = inlink->time_base; -+ outlink->w = ctx->output_width; -+ outlink->h = ctx->output_height; -+ outlink->format 
= inlink->format; -+ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && inlink->frame_rate.den != 0) -+ outlink->frame_rate = (AVRational){inlink->frame_rate.num * 2, inlink->frame_rate.den}; -+ -+ if (inlink->sample_aspect_ratio.num) -+ outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio); -+ else -+ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; -+ -+ ret = deint_v4l2m2m_find_device(ctx); -+ if (ret) -+ return ret; -+ -+ if (inlink->hw_frames_ctx) { -+ ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx); -+ if (!ctx->hw_frames_ctx) -+ return AVERROR(ENOMEM); -+ } -+ return 0; -+} -+ -+static int deint_v4l2m2m_query_formats(AVFilterContext *avctx) -+{ -+ static const enum AVPixelFormat pixel_formats[] = { -+ AV_PIX_FMT_DRM_PRIME, -+// AV_PIX_FMT_YUV420P, -+ AV_PIX_FMT_NONE, -+ }; -+ -+ return ff_set_common_formats(avctx, ff_make_format_list(pixel_formats)); -+} -+ -+static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc) -+{ -+ const uint64_t mod = drm_desc->objects[0].format_modifier; -+ const int is_linear = (mod == DRM_FORMAT_MOD_LINEAR || mod == DRM_FORMAT_MOD_INVALID); -+ -+ // Only currently support single object things -+ if (drm_desc->nb_objects != 1) -+ return 0; -+ -+ switch (drm_desc->layers[0].format) { -+ case DRM_FORMAT_YUV420: -+ return is_linear ? V4L2_PIX_FMT_YUV420 : 0; -+ case DRM_FORMAT_NV12: -+ return is_linear ? V4L2_PIX_FMT_NV12 : -+#if CONFIG_SAND -+ fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? 
V4L2_PIX_FMT_NV12_COL128 : -+#endif -+ 0; -+ default: -+ break; -+ } -+ return 0; -+} -+ -+static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) -+{ -+ AVFilterContext *avctx = link->dst; -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ V4L2Queue *capture = &ctx->capture; -+ V4L2Queue *output = &ctx->output; -+ int ret; -+ -+ av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" dts: %"PRId64" field :%d interlaced: %d aspect:%d/%d\n", -+ __func__, in->pts, in->pkt_dts, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); -+ av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__, -+ avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); -+ -+ if (ctx->field_order == V4L2_FIELD_ANY) { -+ const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0]; -+ uint32_t pixelformat = desc_pixelformat(drm_desc); -+ -+ if (pixelformat == 0) { -+ av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n", -+ av_fourcc2str(drm_desc->layers[0].format), -+ drm_desc->nb_objects, drm_desc->objects[0].format_modifier); -+ return AVERROR(EINVAL); -+ } -+ -+ ctx->orig_width = drm_desc->layers[0].planes[0].pitch; -+ ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; -+ -+ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height, -+ drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); -+ -+ if ((ret = set_src_fmt(output, in)) != 0) { -+ av_log(avctx, AV_LOG_WARNING, "Unknown input DRM format: %s mod: %#" PRIx64 "\n", -+ av_fourcc2str(drm_desc->layers[0].format), drm_desc->objects[0].format_modifier); -+ return ret; -+ } -+ -+ ret = do_s_fmt(output); -+ if (ret) { -+ av_log(avctx, AV_LOG_WARNING, "Failed to set source format\n"); -+ return 
ret; -+ } -+ -+ if (ctx->output_format != AV_PIX_FMT_NONE) -+ pixelformat = fmt_av_to_v4l2(ctx->output_format); -+ ret = set_dst_format(priv, capture, pixelformat, V4L2_FIELD_NONE, ctx->output_width, ctx->output_height); -+ if (ret) { -+ av_log(avctx, AV_LOG_WARNING, "Failed to set destination format\n"); -+ return ret; -+ } -+ -+ ret = deint_v4l2m2m_allocate_buffers(capture); -+ if (ret) { -+ av_log(avctx, AV_LOG_WARNING, "Failed to allocate destination buffers\n"); -+ return ret; -+ } -+ -+ ret = deint_v4l2m2m_streamon(capture); -+ if (ret) { -+ av_log(avctx, AV_LOG_WARNING, "Failed set destination streamon: %s\n", av_err2str(ret)); -+ return ret; -+ } -+ -+ ret = deint_v4l2m2m_allocate_buffers(output); -+ if (ret) { -+ av_log(avctx, AV_LOG_WARNING, "Failed to allocate src buffers\n"); -+ return ret; -+ } -+ -+ ret = deint_v4l2m2m_streamon(output); -+ if (ret) { -+ av_log(avctx, AV_LOG_WARNING, "Failed set src streamon: %s\n", av_err2str(ret)); -+ return ret; -+ } -+ -+ if (in->top_field_first) -+ ctx->field_order = V4L2_FIELD_INTERLACED_TB; -+ else -+ ctx->field_order = V4L2_FIELD_INTERLACED_BT; -+ -+ { -+ struct v4l2_encoder_cmd ecmd = { -+ .cmd = V4L2_ENC_CMD_STOP -+ }; -+ ctx->has_enc_stop = 0; -+ if (ioctl(ctx->fd, VIDIOC_TRY_ENCODER_CMD, &ecmd) == 0) { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop succeeded\n"); -+ ctx->has_enc_stop = 1; -+ } -+ else { -+ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop fail: %s\n", av_err2str(AVERROR(errno))); -+ } -+ -+ } -+ } -+ -+ ret = deint_v4l2m2m_enqueue_frame(output, in); -+ -+ av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret)); -+ return ret; -+} -+ -+static int -+ack_inlink(AVFilterContext * const avctx, DeintV4L2M2MContextShared *const s, -+ AVFilterLink * const inlink) -+{ -+ int instatus; -+ int64_t inpts; -+ -+ if (ff_inlink_acknowledge_status(inlink, &instatus, &inpts) <= 0) -+ return 0; -+ -+ s->drain = instatus; -+ s->drain_pts = inpts; -+ s->drain_state = DRAIN_TIMEOUT; -+ 
-+ if (s->field_order == V4L2_FIELD_ANY) { // Not yet started -+ s->drain_state = DRAIN_DONE; -+ } -+ else if (s->one_to_one) { -+ s->drain_state = DRAIN_LAST; -+ } -+ else if (s->has_enc_stop) { -+ struct v4l2_encoder_cmd ecmd = { -+ .cmd = V4L2_ENC_CMD_STOP -+ }; -+ if (ioctl(s->fd, VIDIOC_ENCODER_CMD, &ecmd) == 0) { -+ av_log(avctx->priv, AV_LOG_DEBUG, "Do Encode stop\n"); -+ s->drain_state = DRAIN_EOS; -+ } -+ else { -+ av_log(avctx->priv, AV_LOG_WARNING, "Encode stop fail: %s\n", av_err2str(AVERROR(errno))); -+ } -+ } -+ return 1; -+} -+ -+static int deint_v4l2m2m_activate(AVFilterContext *avctx) -+{ -+ DeintV4L2M2MContext * const priv = avctx->priv; -+ DeintV4L2M2MContextShared *const s = priv->shared; -+ AVFilterLink * const outlink = avctx->outputs[0]; -+ AVFilterLink * const inlink = avctx->inputs[0]; -+ int n = 0; -+ int cn = 99; -+ int did_something = 0; -+ -+ av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__); -+ -+ FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx); -+ -+ ack_inlink(avctx, s, inlink); -+ -+ if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! -+ { -+ AVFrame * frame = av_frame_alloc(); -+ int rv; -+ -+ recycle_q(&s->output); -+ n = count_enqueued(&s->output); -+ -+ if (frame == NULL) { -+ av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__); -+ return AVERROR(ENOMEM); -+ } -+ -+ rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, -+ drain_frame_expected(s->drain_state) || n > 4 ? 
300 : 0); -+ if (rv != 0) { -+ av_frame_free(&frame); -+ if (rv == AVERROR_EOF) { -+ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ EOF\n", __func__); -+ s->drain_state = DRAIN_DONE; -+ } -+ else if (rv == AVERROR(EAGAIN)) { -+ if (s->drain_state != DRAIN_NONE) { -+ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ empty - drain done\n", __func__); -+ s->drain_state = DRAIN_DONE; -+ } -+ } -+ else { -+ av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv)); -+ return rv; -+ } -+ } -+ else { -+ frame->interlaced_frame = 0; -+ // frame is always consumed by filter_frame - even on error despite -+ // a somewhat confusing comment in the header -+ rv = ff_filter_frame(outlink, frame); -+ ++s->frames_tx; -+ -+ av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv)); -+ did_something = 1; -+ -+ if (s->drain_state != DRAIN_NONE && pts_track_count(&s->track) == 0) { -+ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ last - drain done\n", __func__); -+ s->drain_state = DRAIN_DONE; -+ } -+ } -+ -+ cn = count_enqueued(&s->capture); -+ } -+ -+ if (s->drain_state == DRAIN_DONE) { -+ ff_outlink_set_status(outlink, s->drain, s->drain_pts); -+ av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(s->drain)); -+ return 0; -+ } -+ -+ recycle_q(&s->output); -+ n = count_enqueued(&s->output); -+ -+ while (n < 6 && !s->drain) { -+ AVFrame * frame; -+ int rv; -+ -+ if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { -+ av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); -+ return rv; -+ } -+ -+ if (frame == NULL) { -+ av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); -+ if (!ack_inlink(avctx, s, inlink)) { -+ ff_inlink_request_frame(inlink); -+ av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); -+ } -+ break; -+ } -+ ++s->frames_rx; -+ -+ rv = deint_v4l2m2m_filter_frame(inlink, frame); -+ av_frame_free(&frame); -+ -+ if (rv != 0) -+ return rv; -+ -+ av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", 
__func__); -+ did_something = 1; -+ ++n; -+ } -+ -+ if ((n > 4 || s->drain) && ff_outlink_frame_wanted(outlink)) { -+ ff_filter_set_ready(avctx, 1); -+ did_something = 1; -+ av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__); -+ } -+ -+ av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn); -+ return did_something ? 0 : FFERROR_NOT_READY; -+} -+ -+static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filter_type_v4l2_t filter_type) -+{ -+ DeintV4L2M2MContext * const priv = avctx->priv; -+ DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); -+ -+ if (!ctx) { -+ av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0); -+ return AVERROR(ENOMEM); -+ } -+ priv->shared = ctx; -+ ctx->logctx = priv; -+ ctx->filter_type = filter_type; -+ ctx->fd = -1; -+ ctx->output.ctx = ctx; -+ ctx->output.num_buffers = 8; -+ ctx->output.name = "OUTPUT"; -+ ctx->capture.ctx = ctx; -+ ctx->capture.num_buffers = 12; -+ ctx->capture.name = "CAPTURE"; -+ ctx->done = 0; -+ ctx->field_order = V4L2_FIELD_ANY; -+ -+ pts_track_init(&ctx->track, priv); -+ -+ atomic_init(&ctx->refcount, 1); -+ -+ if (priv->output_format_string) { -+ ctx->output_format = av_get_pix_fmt(priv->output_format_string); -+ if (ctx->output_format == AV_PIX_FMT_NONE) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid ffmpeg output format '%s'.\n", priv->output_format_string); -+ return AVERROR(EINVAL); -+ } -+ if (fmt_av_to_v4l2(ctx->output_format) == 0) { -+ av_log(avctx, AV_LOG_ERROR, "Unsupported output format for V4L2: %s.\n", av_get_pix_fmt_name(ctx->output_format)); -+ return AVERROR(EINVAL); -+ } -+ } else { -+ // Use the input format once that is configured. 
-+ ctx->output_format = AV_PIX_FMT_NONE; -+ } -+ -+#define STRING_OPTION(var_name, func_name, default_value) do { \ -+ if (priv->var_name ## _string) { \ -+ int var = av_ ## func_name ## _from_name(priv->var_name ## _string); \ -+ if (var < 0) { \ -+ av_log(avctx, AV_LOG_ERROR, "Invalid %s.\n", #var_name); \ -+ return AVERROR(EINVAL); \ -+ } \ -+ priv->var_name = var; \ -+ } else { \ -+ priv->var_name = default_value; \ -+ } \ -+ } while (0) -+ -+ STRING_OPTION(colour_primaries, color_primaries, AVCOL_PRI_UNSPECIFIED); -+ STRING_OPTION(colour_transfer, color_transfer, AVCOL_TRC_UNSPECIFIED); -+ STRING_OPTION(colour_matrix, color_space, AVCOL_SPC_UNSPECIFIED); -+ STRING_OPTION(chroma_location, chroma_location, AVCHROMA_LOC_UNSPECIFIED); -+ -+ return 0; -+} -+ -+static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) -+{ -+ return common_v4l2m2m_init(avctx, FILTER_V4L2_DEINTERLACE); -+} -+ -+static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx) -+{ -+ int rv; -+ DeintV4L2M2MContext * priv; -+ DeintV4L2M2MContextShared * ctx; -+ -+ if ((rv = common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE)) != 0) -+ return rv; -+ -+ priv = avctx->priv; -+ ctx = priv->shared; -+ -+ ctx->one_to_one = 1; -+ return 0; -+} -+ -+static void deint_v4l2m2m_uninit(AVFilterContext *avctx) -+{ -+ DeintV4L2M2MContext *priv = avctx->priv; -+ DeintV4L2M2MContextShared *ctx = priv->shared; -+ -+ av_log(priv, AV_LOG_VERBOSE, "Frames Rx: %u, Frames Tx: %u\n", -+ ctx->frames_rx, ctx->frames_tx); -+ ctx->done = 1; -+ ctx->logctx = NULL; // Log to NULL works, log to missing crashes -+ pts_track_uninit(&ctx->track); -+ deint_v4l2m2m_destroy_context(ctx); -+} -+ -+static const AVOption deinterlace_v4l2m2m_options[] = { -+ { NULL }, -+}; -+ -+AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); -+ -+#define OFFSET(x) offsetof(DeintV4L2M2MContext, x) -+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) -+ -+static const AVOption scale_v4l2m2m_options[] = { -+ { "w", "Output video width", 
-+ OFFSET(w_expr), AV_OPT_TYPE_STRING, {.str = "iw"}, .flags = FLAGS }, -+ { "h", "Output video height", -+ OFFSET(h_expr), AV_OPT_TYPE_STRING, {.str = "ih"}, .flags = FLAGS }, -+ { "format", "Output video format (software format of hardware frames)", -+ OFFSET(output_format_string), AV_OPT_TYPE_STRING, .flags = FLAGS }, -+ // These colour properties match the ones of the same name in vf_scale. -+ { "out_color_matrix", "Output colour matrix coefficient set", -+ OFFSET(colour_matrix_string), AV_OPT_TYPE_STRING, { .str = NULL }, .flags = FLAGS }, -+ { "out_range", "Output colour range", -+ OFFSET(colour_range), AV_OPT_TYPE_INT, { .i64 = AVCOL_RANGE_UNSPECIFIED }, -+ AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_JPEG, FLAGS, "range" }, -+ { "full", "Full range", -+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, -+ { "limited", "Limited range", -+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, -+ { "jpeg", "Full range", -+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, -+ { "mpeg", "Limited range", -+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, -+ { "tv", "Limited range", -+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, -+ { "pc", "Full range", -+ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, -+ // These colour properties match the ones in the VAAPI scaler -+ { "out_color_primaries", "Output colour primaries", -+ OFFSET(colour_primaries_string), AV_OPT_TYPE_STRING, -+ { .str = NULL }, .flags = FLAGS }, -+ { "out_color_transfer", "Output colour transfer characteristics", -+ OFFSET(colour_transfer_string), AV_OPT_TYPE_STRING, -+ { .str = NULL }, .flags = FLAGS }, -+ { "out_chroma_location", "Output chroma sample location", -+ OFFSET(chroma_location_string), AV_OPT_TYPE_STRING, -+ { .str = NULL }, .flags = FLAGS }, -+ { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original 
AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" }, -+ { "force_divisible_by", "enforce that the output resolution is divisible by a defined integer when force_original_aspect_ratio is used", OFFSET(force_divisible_by), AV_OPT_TYPE_INT, { .i64 = 1}, 1, 256, FLAGS }, -+ { NULL }, -+}; -+ -+AVFILTER_DEFINE_CLASS(scale_v4l2m2m); -+ -+static const AVFilterPad deint_v4l2m2m_inputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ }, -+ { NULL } -+}; -+ -+static const AVFilterPad deint_v4l2m2m_outputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .config_props = deint_v4l2m2m_config_props, -+ }, -+ { NULL } -+}; -+ -+AVFilter ff_vf_deinterlace_v4l2m2m = { -+ .name = "deinterlace_v4l2m2m", -+ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"), -+ .priv_size = sizeof(DeintV4L2M2MContext), -+ .init = &deint_v4l2m2m_init, -+ .uninit = &deint_v4l2m2m_uninit, -+ .query_formats = &deint_v4l2m2m_query_formats, -+ .inputs = deint_v4l2m2m_inputs, -+ .outputs = deint_v4l2m2m_outputs, -+ .priv_class = &deinterlace_v4l2m2m_class, -+ .activate = deint_v4l2m2m_activate, -+}; -+ -+AVFilter ff_vf_scale_v4l2m2m = { -+ .name = "scale_v4l2m2m", -+ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M scaler"), -+ .priv_size = sizeof(DeintV4L2M2MContext), -+ .init = &scale_v4l2m2m_init, -+ .uninit = &deint_v4l2m2m_uninit, -+ .query_formats = &deint_v4l2m2m_query_formats, -+ .inputs = deint_v4l2m2m_inputs, -+ .outputs = deint_v4l2m2m_outputs, -+ .priv_class = &scale_v4l2m2m_class, -+ .activate = deint_v4l2m2m_activate, -+}; -+ -diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c -new file mode 100644 -index 0000000000..61c03a385c ---- /dev/null -+++ b/libavfilter/vf_unsand.c -@@ -0,0 +1,229 @@ -+/* -+ * Copyright (c) 2007 Bobby Bingham -+ * -+ * This file is part of FFmpeg. 
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/** -+ * @file -+ * format and noformat video filters -+ */ -+ -+#include -+ -+#include "libavutil/internal.h" -+#include "libavutil/mem.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/opt.h" -+#include "libavutil/rpi_sand_fns.h" -+ -+#include "avfilter.h" -+#include "formats.h" -+#include "internal.h" -+#include "video.h" -+ -+typedef struct UnsandContext { -+ const AVClass *class; -+} UnsandContext; -+ -+static av_cold void uninit(AVFilterContext *ctx) -+{ -+// UnsandContext *s = ctx->priv; -+} -+ -+static av_cold int init(AVFilterContext *ctx) -+{ -+// UnsandContext *s = ctx->priv; -+ -+ return 0; -+} -+ -+ -+static int filter_frame(AVFilterLink *link, AVFrame *in) -+{ -+ AVFilterLink * const outlink = link->dst->outputs[0]; -+ AVFrame *out = NULL; -+ int rv = 0; -+ -+ if (outlink->format == in->format) { -+ // If nothing to do then do nothing -+ out = in; -+ } -+ else -+ { -+ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL) -+ { -+ rv = AVERROR(ENOMEM); -+ goto fail; -+ } -+ if (av_rpi_sand_to_planar_frame(out, in) != 0) -+ { -+ rv = -1; -+ goto fail; -+ } -+ -+ av_frame_free(&in); -+ } -+ -+ return ff_filter_frame(outlink, out); -+ -+fail: -+ 
av_frame_free(&out); -+ av_frame_free(&in); -+ return rv; -+} -+ -+#if 0 -+static void dump_fmts(const AVFilterFormats * fmts) -+{ -+ int i; -+ if (fmts== NULL) { -+ printf("NULL\n"); -+ return; -+ } -+ for (i = 0; i < fmts->nb_formats; ++i) { -+ printf(" %d", fmts->formats[i]); -+ } -+ printf("\n"); -+} -+#endif -+ -+static int query_formats(AVFilterContext *ctx) -+{ -+// UnsandContext *s = ctx->priv; -+ int ret; -+ -+ // If we aren't connected at both ends then just do nothing -+ if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL) -+ return 0; -+ -+ // Our output formats depend on our input formats and we can't/don't -+ // want to convert between bit depths so we need to wait for the source -+ // to have an opinion before we do -+ if (ctx->inputs[0]->incfg.formats == NULL) -+ return AVERROR(EAGAIN); -+ -+ // Accept anything -+ if (ctx->inputs[0]->outcfg.formats == NULL && -+ (ret = ff_formats_ref(ctx->inputs[0]->incfg.formats, &ctx->inputs[0]->outcfg.formats)) < 0) -+ return ret; -+ -+ // Filter out sand formats -+ -+ // Generate a container if we don't already have one -+ if (ctx->outputs[0]->incfg.formats == NULL) -+ { -+ // Somewhat rubbish way of ensuring we have a good structure -+ const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE}; -+ AVFilterFormats *formats = ff_make_format_list(out_fmts); -+ -+ if (formats == NULL) -+ return AVERROR(ENOMEM); -+ if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0) -+ return ret; -+ } -+ -+ // Replace old format list with new filtered list derived from what our -+ // input says it can do -+ { -+ const AVFilterFormats * const src_ff = ctx->inputs[0]->outcfg.formats; -+ AVFilterFormats * const dst_ff = ctx->outputs[0]->incfg.formats; -+ enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats); -+ int i; -+ int n = 0; -+ int seen_420p = 0; -+ int seen_420p10 = 0; -+ -+ for (i = 0; i < src_ff->nb_formats; ++i) { -+ 
const enum AVPixelFormat f = src_ff->formats[i]; -+ -+ switch (f){ -+ case AV_PIX_FMT_YUV420P: -+ case AV_PIX_FMT_SAND128: -+ case AV_PIX_FMT_RPI4_8: -+ if (!seen_420p) { -+ seen_420p = 1; -+ dst_fmts[n++] = AV_PIX_FMT_YUV420P; -+ } -+ break; -+ case AV_PIX_FMT_SAND64_10: -+ case AV_PIX_FMT_YUV420P10: -+ case AV_PIX_FMT_RPI4_10: -+ if (!seen_420p10) { -+ seen_420p10 = 1; -+ dst_fmts[n++] = AV_PIX_FMT_YUV420P10; -+ } -+ break; -+ default: -+ dst_fmts[n++] = f; -+ break; -+ } -+ } -+ -+ av_freep(&dst_ff->formats); -+ dst_ff->formats = dst_fmts; -+ dst_ff->nb_formats = n; -+ } -+ -+// printf("Unsand: %s calc: ", __func__); -+// dump_fmts(ctx->outputs[0]->incfg.formats); -+ -+ return 0; -+} -+ -+ -+#define OFFSET(x) offsetof(UnsandContext, x) -+static const AVOption unsand_options[] = { -+ { NULL } -+}; -+ -+ -+AVFILTER_DEFINE_CLASS(unsand); -+ -+static const AVFilterPad avfilter_vf_unsand_inputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .filter_frame = filter_frame, -+ }, -+ { NULL } -+}; -+ -+static const AVFilterPad avfilter_vf_unsand_outputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO -+ }, -+ { NULL } -+}; -+ -+AVFilter ff_vf_unsand = { -+ .name = "unsand", -+ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"), -+ -+ .init = init, -+ .uninit = uninit, -+ -+ .query_formats = query_formats, -+ -+ .priv_size = sizeof(UnsandContext), -+ .priv_class = &unsand_class, -+ -+ .inputs = avfilter_vf_unsand_inputs, -+ .outputs = avfilter_vf_unsand_outputs, -+}; -+ -diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c -index bbf231f2a4..22571c89a3 100644 ---- a/libavformat/matroskaenc.c -+++ b/libavformat/matroskaenc.c -@@ -58,6 +58,9 @@ - * Info, Tracks, Chapters, Attachments, Tags (potentially twice) and Cues */ - #define MAX_SEEKHEAD_ENTRIES 7 - -+/* Reserved size for H264 headers if not extant at init time */ -+#define MAX_H264_HEADER_SIZE 1024 -+ - #define IS_SEEKABLE(pb, mkv) (((pb)->seekable & 
AVIO_SEEKABLE_NORMAL) && \ - !(mkv)->is_live) - -@@ -721,8 +724,12 @@ static int mkv_write_native_codecprivate(AVFormatContext *s, AVIOContext *pb, - case AV_CODEC_ID_WAVPACK: - return put_wv_codecpriv(dyn_cp, par); - case AV_CODEC_ID_H264: -- return ff_isom_write_avcc(dyn_cp, par->extradata, -- par->extradata_size); -+ if (par->extradata_size) -+ return ff_isom_write_avcc(dyn_cp, par->extradata, -+ par->extradata_size); -+ else -+ put_ebml_void(pb, MAX_H264_HEADER_SIZE); -+ break; - case AV_CODEC_ID_HEVC: - return ff_isom_write_hvcc(dyn_cp, par->extradata, - par->extradata_size, 0); -@@ -2258,7 +2265,9 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt) - break; - // FIXME: Remove the following once libaom starts propagating extradata during init() - // See https://bugs.chromium.org/p/aomedia/issues/detail?id=2012 -+ // H264 V4L2 has a similar issue - case AV_CODEC_ID_AV1: -+ case AV_CODEC_ID_H264: - if (side_data_size && mkv->track.bc && !par->extradata_size) { - AVIOContext *dyn_cp; - uint8_t *codecpriv; -@@ -2266,7 +2275,10 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt) - ret = avio_open_dyn_buf(&dyn_cp); - if (ret < 0) - return ret; -- ff_isom_write_av1c(dyn_cp, side_data, side_data_size); -+ if (par->codec_id == AV_CODEC_ID_H264) -+ ff_isom_write_avcc(dyn_cp, side_data, side_data_size); -+ else -+ ff_isom_write_av1c(dyn_cp, side_data, side_data_size); - codecpriv_size = avio_get_dyn_buf(dyn_cp, &codecpriv); - if ((ret = dyn_cp->error) < 0 || - !codecpriv_size && (ret = AVERROR_INVALIDDATA)) { -@@ -2274,8 +2286,25 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt) - return ret; - } - avio_seek(mkv->track.bc, track->codecpriv_offset, SEEK_SET); -- // Do not write the OBUs as we don't have space saved for them -- put_ebml_binary(mkv->track.bc, MATROSKA_ID_CODECPRIVATE, codecpriv, 4); -+ if (par->codec_id == AV_CODEC_ID_H264) { -+ int filler; -+ // Up to 6 bytes for header 
and the filler must be at least 2 -+ if (codecpriv_size > MAX_H264_HEADER_SIZE - 8) { -+ av_log(s, AV_LOG_ERROR, "H264 header size %d > %d bytes\n", codecpriv_size, MAX_H264_HEADER_SIZE - 8); -+ return AVERROR_INVALIDDATA; -+ } -+ put_ebml_binary(mkv->track.bc, MATROSKA_ID_CODECPRIVATE, codecpriv, codecpriv_size); -+ filler = MAX_H264_HEADER_SIZE - (avio_tell(mkv->track.bc) - track->codecpriv_offset); -+ if (filler < 2) { -+ av_log(s, AV_LOG_ERROR, "Unexpected SPS/PPS filler length: %d\n", filler); -+ return AVERROR_BUG; -+ } -+ put_ebml_void(mkv->track.bc, filler); -+ } -+ else { -+ // Do not write the OBUs as we don't have space saved for them -+ put_ebml_binary(mkv->track.bc, MATROSKA_ID_CODECPRIVATE, codecpriv, 4); -+ } - ffio_free_dyn_buf(&dyn_cp); - ret = ff_alloc_extradata(par, side_data_size); - if (ret < 0) -diff --git a/libavformat/movenc.c b/libavformat/movenc.c -index bade57dcea..d23101b23f 100644 ---- a/libavformat/movenc.c -+++ b/libavformat/movenc.c -@@ -5913,6 +5913,7 @@ static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt) - if (trk->par->codec_id == AV_CODEC_ID_MP4ALS || - trk->par->codec_id == AV_CODEC_ID_AAC || - trk->par->codec_id == AV_CODEC_ID_AV1 || -+ trk->par->codec_id == AV_CODEC_ID_H264 || - trk->par->codec_id == AV_CODEC_ID_FLAC) { - buffer_size_t side_size; - uint8_t *side = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); -diff --git a/libavformat/utils.c b/libavformat/utils.c -index 1384b56771..27479e3c40 100644 ---- a/libavformat/utils.c -+++ b/libavformat/utils.c -@@ -3011,6 +3011,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr) - return 1; - } - -+#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER -+// This should be quite general purpose but avoid possible conflicts -+// by limiting usage to cases wehere we know it works. 
-+static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts) -+{ -+ // Only try fallback if we know it is supported (HEVC only) -+ const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL : -+ avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE); -+ int err; -+ -+ // Failed to find fallback or we are already at the fallback -+ if (new_codec == NULL || new_codec == old_codec) -+ { -+ return AVERROR_DECODER_NOT_FOUND; -+ } -+ -+ // * This may be dodgy - header says to not use this fn, -+ // especially if we are going to reopen the context... -+ // (but it does seem to work for our cases) -+ if (avcodec_is_open(avctx)) { -+ avcodec_close(avctx); -+ } -+ -+ if ((err = avcodec_open2(avctx, new_codec, opts)) < 0) -+ { -+ return err; -+ } -+ -+ return 0; -+} -+#else -+#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND) -+#endif -+ - /* returns 1 or 0 if or if not decoded data was returned, or a negative error */ - static int try_decode_frame(AVFormatContext *s, AVStream *st, - const AVPacket *avpkt, AVDictionary **options) -@@ -3049,7 +3083,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, - av_dict_set(options ? options : &thread_opt, "lowres", "0", 0); - if (s->codec_whitelist) - av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0); -- ret = avcodec_open2(avctx, codec, options ? options : &thread_opt); -+ if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND) -+ { -+ // Try fallback if if looks worth a try -+ ret = try_fallback_decoder(avctx, codec, options ? 
options : &thread_opt); -+ } - if (!options) - av_dict_free(&thread_opt); - if (ret < 0) { -@@ -3080,6 +3118,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, - if (avctx->codec_type == AVMEDIA_TYPE_VIDEO || - avctx->codec_type == AVMEDIA_TYPE_AUDIO) { - ret = avcodec_send_packet(avctx, &pkt); -+ -+ // If we are going to want to fall back we should know here -+ if (ret == AVERROR_DECODER_NOT_FOUND) { -+ if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0) -+ break; -+ continue; -+ } -+ - if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) - break; - if (ret >= 0) -@@ -3708,9 +3754,20 @@ FF_ENABLE_DEPRECATION_WARNINGS - // Try to just open decoders, in case this is enough to get parameters. - if (!has_codec_parameters(st, NULL) && st->internal->request_probe <= 0) { - if (codec && !avctx->codec) -- if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0) -- av_log(ic, AV_LOG_WARNING, -- "Failed to open codec in %s\n",__FUNCTION__); -+ { -+ int err; -+ -+ if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0) -+ { -+ if (err == AVERROR_DECODER_NOT_FOUND) { -+ err = try_fallback_decoder(avctx, codec, options ? 
&options[i] : &thread_opt); -+ } -+ if (err < 0) { -+ av_log(ic, AV_LOG_WARNING, -+ "Failed to open codec in %s\n",__FUNCTION__); -+ } -+ } -+ } - } - if (!options) - av_dict_free(&thread_opt); -diff --git a/libavutil/Makefile b/libavutil/Makefile -index 27bafe9e12..c9075ddf8a 100644 ---- a/libavutil/Makefile -+++ b/libavutil/Makefile -@@ -68,6 +68,7 @@ HEADERS = adler32.h \ - rational.h \ - replaygain.h \ - ripemd.h \ -+ rpi_sand_fns.h \ - samplefmt.h \ - sha.h \ - sha512.h \ -@@ -87,6 +88,7 @@ HEADERS = adler32.h \ - film_grain_params.h \ - - HEADERS-$(CONFIG_LZO) += lzo.h -+HEADERS-$(CONFIG-RPI) += rpi_sand_fn_pw.h - - ARCH_HEADERS = bswap.h \ - intmath.h \ -@@ -182,6 +184,7 @@ OBJS-$(CONFIG_LZO) += lzo.o - OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o - OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o - OBJS-$(CONFIG_QSV) += hwcontext_qsv.o -+OBJS-$(CONFIG_SAND) += rpi_sand_fns.o - OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o - OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o - OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o -diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile -index 5613813ba8..ab8bcfcf34 100644 ---- a/libavutil/aarch64/Makefile -+++ b/libavutil/aarch64/Makefile -@@ -1,4 +1,6 @@ - OBJS += aarch64/cpu.o \ - aarch64/float_dsp_init.o \ - --NEON-OBJS += aarch64/float_dsp_neon.o -+NEON-OBJS += aarch64/float_dsp_neon.o \ -+ aarch64/rpi_sand_neon.o \ -+ -diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S -new file mode 100644 -index 0000000000..2f07d9674c ---- /dev/null -+++ b/libavutil/aarch64/rpi_sand_neon.S -@@ -0,0 +1,781 @@ -+/* -+Copyright (c) 2021 Michael Eiler -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. 
-+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: Michael Eiler -+*/ -+ -+#include "asm.S" -+ -+// void ff_rpi_sand8_lines_to_planar_y8( -+// uint8_t * dest, : x0 -+// unsigned int dst_stride, : w1 -+// const uint8_t * src, : x2 -+// unsigned int src_stride1, : w3, always 128 -+// unsigned int src_stride2, : w4 -+// unsigned int _x, : w5 -+// unsigned int y, : w6 -+// unsigned int _w, : w7 -+// unsigned int h); : [sp, #0] -+ -+function ff_rpi_sand8_lines_to_planar_y8, export=1 -+ // w15 contains the number of rows we need to process -+ ldr w15, [sp, #0] -+ -+ // w8 will contain the number of blocks per row -+ // w8 = floor(_w/stride1) -+ // stride1 is assumed to always be 128 -+ mov w8, w1 -+ lsr w8, w8, #7 -+ -+ // in case the width of the image is not a multiple of 128, there will -+ // be an incomplete block at the end of every row -+ // w9 contains the number of pixels stored within this block -+ // w9 = _w - w8 * 128 -+ lsl w9, w8, #7 -+ sub w9, w7, w9 -+ -+ // this is the value we have to add to the src pointer after reading a complete block -+ // it will move the address to the start of the next block -+ // w10 = stride2 * stride1 - stride1 -+ mov w10, w4 -+ lsl w10, w10, #7 -+ sub w10, w10, #128 -+ -+ // w11 is the row offset, meaning the start offset of the first block of every collumn -+ // this will be increased with stride1 within every iteration of the row_loop -+ eor w11, w11, w11 -+ -+ // w12 = 0, processed row count -+ eor w12, w12, w12 -+row_loop: -+ // start of the first block within the current row -+ // x13 = row offset + src -+ mov x13, x2 -+ add x13, x13, x11 -+ -+ // w14 = 0, processed block count -+ eor w14, w14, w14 -+ -+ cmp w8, #0 -+ beq no_main_y8 -+ -+block_loop: -+ // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128 -+ // fortunately these aren't callee saved ones, meaning we don't need to backup them -+ ld1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x13], #64 -+ ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64 -+ -+ // write 
these registers back to the destination vector and increase the dst address by 128 -+ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 -+ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], #64 -+ -+ // move the source register to the beginning of the next block (x13 = src + block offset) -+ add x13, x13, x10 -+ // increase the block counter -+ add w14, w14, #1 -+ -+ // continue with the block_loop if we haven't copied all full blocks yet -+ cmp w8, w14 -+ bgt block_loop -+ -+ // handle the last block at the end of each row -+ // at most 127 byte values copied from src to dst -+no_main_y8: -+ eor w5, w5, w5 // i = 0 -+incomplete_block_loop_y8: -+ cmp w5, w9 -+ bge incomplete_block_loop_end_y8 -+ -+ ldrb w6, [x13] -+ strb w6, [x0] -+ add x13, x13, #1 -+ add x0, x0, #1 -+ -+ add w5, w5, #1 -+ b incomplete_block_loop_y8 -+incomplete_block_loop_end_y8: -+ -+ -+ // increase the row offset by 128 (stride1) -+ add w11, w11, #128 -+ // increment the row counter -+ add w12, w12, #1 -+ -+ // process the next row if we haven't finished yet -+ cmp w15, w12 -+ bgt row_loop -+ -+ ret -+endfunc -+ -+ -+ -+// void ff_rpi_sand8_lines_to_planar_c8( -+// uint8_t * dst_u, : x0 -+// unsigned int dst_stride_u, : w1 == width -+// uint8_t * dst_v, : x2 -+// unsigned int dst_stride_v, : w3 == width -+// const uint8_t * src, : x4 -+// unsigned int stride1, : w5 == 128 -+// unsigned int stride2, : w6 -+// unsigned int _x, : w7 -+// unsigned int y, : [sp, #0] -+// unsigned int _w, : [sp, #8] -+// unsigned int h); : [sp, #16] -+ -+function ff_rpi_sand8_lines_to_planar_c8, export=1 -+ // w7 = width -+ ldr w7, [sp, #8] -+ -+ // w15 contains the number of rows we need to process -+ // counts down -+ ldr w15, [sp, #16] -+ -+ // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6 -+ mov w8, w7 -+ lsr w8, w8, #6 -+ -+ // number of pixels in block at the end of every row -+ // w9 = _w - (w8 * 64) -+ lsl w9, w8, #6 -+ sub w9, w7, w9 -+ -+ // Skip at the end of the line to account for 
stride -+ sub w12, w1, w7 -+ -+ // address delta to the beginning of the next block -+ // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128 -+ lsl w10, w6, #7 -+ sub w10, w10, #128 -+ -+ // w11 = row address start offset = 0 -+ eor w11, w11, w11 -+ -+row_loop_c8: -+ // start of the first block within the current row -+ // x13 = row offset + src -+ mov x13, x4 -+ add x13, x13, x11 -+ -+ // w14 = 0, processed block count -+ eor w14, w14, w14 -+ -+ cmp w8, #0 -+ beq no_main_c8 -+ -+block_loop_c8: -+ // load the full block -> 128 bytes, the block contains 64 interleaved U and V values -+ ld2 { v0.16b, v1.16b }, [x13], #32 -+ ld2 { v2.16b, v3.16b }, [x13], #32 -+ ld2 { v4.16b, v5.16b }, [x13], #32 -+ ld2 { v6.16b, v7.16b }, [x13], #32 -+ -+ // swap register so that we can write them out with a single instruction -+ mov v16.16b, v1.16b -+ mov v17.16b, v3.16b -+ mov v18.16b, v5.16b -+ mov v1.16b, v2.16b -+ mov v2.16b, v4.16b -+ mov v3.16b, v6.16b -+ mov v4.16b, v16.16b -+ mov v5.16b, v17.16b -+ mov v6.16b, v18.16b -+ -+ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 -+ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x2], #64 -+ -+ // increment row counter and move src to the beginning of the next block -+ add w14, w14, #1 -+ add x13, x13, x10 -+ -+ // jump to block_loop_c8 iff the block count is smaller than the number of full blocks -+ cmp w8, w14 -+ bgt block_loop_c8 -+ -+no_main_c8: -+ // handle incomplete block at the end of every row -+ eor w5, w5, w5 // point counter, this might be -+incomplete_block_loop_c8: -+ cmp w5, w9 -+ bge incomplete_block_loop_end_c8 -+ -+ ldrb w1, [x13] -+ strb w1, [x0] -+ add x13, x13, #1 -+ -+ ldrb w1, [x13] -+ strb w1, [x2] -+ add x13, x13, #1 -+ -+ add x0, x0, #1 -+ add x2, x2, #1 -+ -+ add w5, w5, #1 -+ b incomplete_block_loop_c8 -+incomplete_block_loop_end_c8: -+ -+ // increase row_offset by stride1 -+ add w11, w11, #128 -+ add x0, x0, w12, sxtw -+ add x2, x2, w12, sxtw -+ -+ // jump to row_Loop_c8 iff the row count is small than 
the height -+ subs w15, w15, #1 -+ bgt row_loop_c8 -+ -+ ret -+endfunc -+ -+//void ff_rpi_sand30_lines_to_planar_c16( -+// uint8_t * dst_u, // [x0] -+// unsigned int dst_stride_u, // [w1] == _w*2 -+// uint8_t * dst_v, // [x2] -+// unsigned int dst_stride_v, // [w3] == _w*2 -+// const uint8_t * src, // [x4] -+// unsigned int stride1, // [w5] == 128 -+// unsigned int stride2, // [w6] -+// unsigned int _x, // [w7] == 0 -+// unsigned int y, // [sp, #0] == 0 -+// unsigned int _w, // [sp, #8] -> w3 -+// unsigned int h); // [sp, #16] -> w7 -+ -+.macro rpi_sand30_lines_to_planar_c16_block_half -+ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 -+ -+ xtn v4.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v5.4h, v0.4s -+ ushr v0.4s, v0.4s, #10 -+ xtn v6.4h, v0.4s -+ xtn2 v4.8h, v1.4s -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v5.8h, v1.4s -+ ushr v1.4s, v1.4s, #10 -+ xtn2 v6.8h, v1.4s -+ and v4.16b, v4.16b, v16.16b -+ and v5.16b, v5.16b, v16.16b -+ and v6.16b, v6.16b, v16.16b -+ st3 { v4.8h, v5.8h, v6.8h }, [sp], #48 -+ -+ xtn v4.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v5.4h, v2.4s -+ ushr v2.4s, v2.4s, #10 -+ xtn v6.4h, v2.4s -+ xtn2 v4.8h, v3.4s -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v5.8h, v3.4s -+ ushr v3.4s, v3.4s, #10 -+ xtn2 v6.8h, v3.4s -+ and v4.16b, v4.16b, v16.16b -+ and v5.16b, v5.16b, v16.16b -+ and v6.16b, v6.16b, v16.16b -+ st3 { v4.8h, v5.8h, v6.8h }, [sp] -+ sub sp, sp, #48 -+.endm -+ -+function ff_rpi_sand30_lines_to_planar_c16, export=1 -+ stp x19, x20, [sp, #-48]! 
-+ stp x21, x22, [sp, #16] -+ stp x23, x24, [sp, #32] -+ -+ ldr w3, [sp, #48+8] // w3 = width -+ ldr w7, [sp, #48+16] // w7 = height -+ -+ // reserve space on the stack for intermediate results -+ sub sp, sp, #256 -+ -+ // number of 128byte blocks per row, w8 = width / 48 -+ mov w9, #48 -+ udiv w8, w3, w9 -+ -+ // remaining pixels (rem_pix) per row, w9 = width - w8 * 48 -+ mul w9, w8, w9 -+ sub w9, w3, w9 -+ -+ // row offset, the beginning of the next row to process -+ eor w10, w10, w10 -+ -+ // offset to the beginning of the next block, w11 = stride2 * 128 - 128 -+ lsl w11, w6, #7 -+ sub w11, w11, #128 -+ -+ // decrease the height by one and in case of remaining pixels increase the block count by one -+ sub w7, w7, #1 -+ cmp w9, #0 -+ cset w19, ne // w19 == 1 iff reamining pixels != 0 -+ add w8, w8, w19 -+ -+ // bytes we have to move dst back by at the end of every row -+ mov w21, #48*2 -+ mul w21, w21, w8 -+ sub w21, w1, w21 -+ -+ mov w20, #0 // w20 = flag, last row processed -+ -+ mov x12, #0x03ff03ff03ff03ff -+ dup v16.2d, x12 -+ -+ // iterate through rows, row counter = w12 = 0 -+ eor w12, w12, w12 -+row_loop_c16: -+ cmp w12, w7 -+ bge row_loop_c16_fin -+ -+ // address of row data = src + row_offset -+ mov x13, x4 -+ add x13, x13, x10 -+ -+ eor w14, w14, w14 -+block_loop_c16: -+ cmp w14, w8 -+ bge block_loop_c16_fin -+ -+ rpi_sand30_lines_to_planar_c16_block_half -+ -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp] -+ sub sp, sp, #64 -+ -+ st1 { v0.8h }, [x0], #16 -+ st1 { v2.8h }, [x0], #16 -+ st1 { v4.8h }, [x0], #16 -+ st1 { v1.8h }, [x2], #16 -+ st1 { v3.8h }, [x2], #16 -+ st1 { v5.8h }, [x2], #16 -+ -+ rpi_sand30_lines_to_planar_c16_block_half -+ -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp] -+ sub sp, sp, #64 -+ -+ st1 { v0.8h }, [x0], #16 -+ st1 { v2.8h }, [x0], #16 -+ st1 { v4.8h }, [x0], #16 -+ st1 { v1.8h }, [x2], #16 -+ st1 { v3.8h }, [x2], #16 -+ 
st1 { v5.8h }, [x2], #16 -+ -+ add x13, x13, x11 // offset to next block -+ add w14, w14, #1 -+ b block_loop_c16 -+block_loop_c16_fin: -+ -+ add w10, w10, #128 -+ add w12, w12, #1 -+ add x0, x0, w21, sxtw // move dst pointers back by x21 -+ add x2, x2, w21, sxtw -+ b row_loop_c16 -+row_loop_c16_fin: -+ -+ cmp w20, #1 -+ beq row_loop_c16_fin2 -+ mov w20, #1 -+ sub w8, w8, w19 // decrease block count by w19 -+ add w7, w7, #1 // increase height -+ b row_loop_c16 -+ -+row_loop_c16_fin2: -+ sub x0, x0, w21, sxtw // readd x21 in case of the last row -+ sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels -+ -+ // last incomplete block to be finished -+ // read operations are fine, stride2 is more than large enough even if rem_pix is 0 -+ rpi_sand30_lines_to_planar_c16_block_half -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp], #32 -+ rpi_sand30_lines_to_planar_c16_block_half -+ ld2 { v0.8h, v1.8h }, [sp], #32 -+ ld2 { v2.8h, v3.8h }, [sp], #32 -+ ld2 { v4.8h, v5.8h }, [sp] -+ sub sp, sp, #160 -+ -+ mov x4, sp -+ eor w20, w20, w20 -+rem_pix_c16_loop: -+ cmp w20, w9 -+ bge rem_pix_c16_fin -+ -+ ldr w22, [x4], #4 -+ str w22, [x0], #2 -+ lsr w22, w22, #16 -+ str w22, [x2], #2 -+ -+ add w20, w20, #1 -+ b rem_pix_c16_loop -+rem_pix_c16_fin: -+ -+ add sp, sp, #256 -+ -+ ldp x23, x24, [sp, #32] -+ ldp x21, x22, [sp, #16] -+ ldp x19, x20, [sp], #48 -+ ret -+endfunc -+ -+ -+ -+//void ff_rpi_sand30_lines_to_planar_p010( -+// uint8_t * dest, -+// unsigned int dst_stride, -+// const uint8_t * src, -+// unsigned int src_stride1, -+// unsigned int src_stride2, -+// unsigned int _x, -+// unsigned int y, -+// unsigned int _w, -+// unsigned int h); -+ -+// void ff_rpi_sand30_lines_to_planar_y8( -+// uint8_t * dest, : x0 -+// unsigned int dst_stride, : w1 -+// const uint8_t * src, : x2 -+// unsigned int src_stride1, : w3, always 128 -+// unsigned int src_stride2, : w4 -+// unsigned int _x, : w5 -+// unsigned int y, : 
w6 -+// unsigned int _w, : w7 -+// unsigned int h); : [sp, #0] -+// -+// Assumes that we are starting on a stripe boundary and that overreading -+// within the stripe is OK. However it does respect the dest size for wri -+ -+function ff_rpi_sand30_lines_to_planar_y16, export=1 -+ lsl w4, w4, #7 -+ sub w4, w4, #64 -+ sub w1, w1, w7, lsl #1 -+ uxtw x6, w6 -+ add x8, x2, x6, lsl #7 -+ ldr w6, [sp, #0] -+ -+10: -+ mov x2, x8 -+ mov w5, w7 -+1: -+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 -+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 -+ -+ subs w5, w5, #96 -+ -+ // v0, v1 -+ -+ shrn v18.4h, v0.4s, #14 -+ xtn v16.4h, v0.4s -+ shrn v17.4h, v0.4s, #10 -+ -+ shrn2 v18.8h, v1.4s, #14 -+ xtn2 v16.8h, v1.4s -+ shrn2 v17.8h, v1.4s, #10 -+ -+ ushr v18.8h, v18.8h, #6 -+ bic v16.8h, #0xfc, lsl #8 -+ bic v17.8h, #0xfc, lsl #8 -+ -+ // v2, v3 -+ -+ shrn v21.4h, v2.4s, #14 -+ xtn v19.4h, v2.4s -+ shrn v20.4h, v2.4s, #10 -+ -+ shrn2 v21.8h, v3.4s, #14 -+ xtn2 v19.8h, v3.4s -+ shrn2 v20.8h, v3.4s, #10 -+ -+ ushr v21.8h, v21.8h, #6 -+ bic v19.8h, #0xfc, lsl #8 -+ bic v20.8h, #0xfc, lsl #8 -+ -+ // v4, v5 -+ -+ shrn v24.4h, v4.4s, #14 -+ xtn v22.4h, v4.4s -+ shrn v23.4h, v4.4s, #10 -+ -+ shrn2 v24.8h, v5.4s, #14 -+ xtn2 v22.8h, v5.4s -+ shrn2 v23.8h, v5.4s, #10 -+ -+ ushr v24.8h, v24.8h, #6 -+ bic v22.8h, #0xfc, lsl #8 -+ bic v23.8h, #0xfc, lsl #8 -+ -+ // v6, v7 -+ -+ shrn v27.4h, v6.4s, #14 -+ xtn v25.4h, v6.4s -+ shrn v26.4h, v6.4s, #10 -+ -+ shrn2 v27.8h, v7.4s, #14 -+ xtn2 v25.8h, v7.4s -+ shrn2 v26.8h, v7.4s, #10 -+ -+ ushr v27.8h, v27.8h, #6 -+ bic v25.8h, #0xfc, lsl #8 -+ bic v26.8h, #0xfc, lsl #8 -+ -+ blt 2f -+ -+ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 -+ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 -+ st3 {v22.8h, v23.8h, v24.8h}, [x0], #48 -+ st3 {v25.8h, v26.8h, v27.8h}, [x0], #48 -+ -+ bne 1b -+ -+11: -+ subs w6, w6, #1 -+ add x0, x0, w1, uxtw -+ add x8, x8, #128 -+ bne 10b -+ -+ ret -+ -+// Partial final write -+2: -+ cmp w5, #48-96 -+ blt 1f -+ st3 {v16.8h, v17.8h, v18.8h}, 
[x0], #48 -+ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 -+ beq 11b -+ mov v16.16b, v22.16b -+ mov v17.16b, v23.16b -+ sub w5, w5, #48 -+ mov v18.16b, v24.16b -+ mov v19.16b, v25.16b -+ mov v20.16b, v26.16b -+ mov v21.16b, v27.16b -+1: -+ cmp w5, #24-96 -+ blt 1f -+ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 -+ beq 11b -+ mov v16.16b, v19.16b -+ mov v17.16b, v20.16b -+ sub w5, w5, #24 -+ mov v18.16b, v21.16b -+1: -+ cmp w5, #12-96 -+ blt 1f -+ st3 {v16.4h, v17.4h, v18.4h}, [x0], #24 -+ beq 11b -+ mov v16.2d[0], v16.2d[1] -+ sub w5, w5, #12 -+ mov v17.2d[0], v17.2d[1] -+ mov v18.2d[0], v18.2d[1] -+1: -+ cmp w5, #6-96 -+ blt 1f -+ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 -+ st3 {v16.h, v17.h, v18.h}[1], [x0], #6 -+ beq 11b -+ mov v16.2s[0], v16.2s[1] -+ sub w5, w5, #6 -+ mov v17.2s[0], v17.2s[1] -+ mov v18.2s[0], v18.2s[1] -+1: -+ cmp w5, #3-96 -+ blt 1f -+ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 -+ beq 11b -+ mov v16.4h[0], v16.4h[1] -+ sub w5, w5, #3 -+ mov v17.4h[0], v17.4h[1] -+1: -+ cmp w5, #2-96 -+ blt 1f -+ st2 {v16.h, v17.h}[0], [x0], #4 -+ b 11b -+1: -+ st1 {v16.h}[0], [x0], #2 -+ b 11b -+ -+endfunc -+ -+// void ff_rpi_sand30_lines_to_planar_y8( -+// uint8_t * dest, : x0 -+// unsigned int dst_stride, : w1 -+// const uint8_t * src, : x2 -+// unsigned int src_stride1, : w3, always 128 -+// unsigned int src_stride2, : w4 -+// unsigned int _x, : w5 -+// unsigned int y, : w6 -+// unsigned int _w, : w7 -+// unsigned int h); : [sp, #0] -+// -+// Assumes that we are starting on a stripe boundary and that overreading -+// within the stripe is OK. 
However it does respect the dest size for wri -+ -+function ff_rpi_sand30_lines_to_planar_y8, export=1 -+ lsl w4, w4, #7 -+ sub w4, w4, #64 -+ sub w1, w1, w7 -+ uxtw x6, w6 -+ add x8, x2, x6, lsl #7 -+ ldr w6, [sp, #0] -+ -+10: -+ mov x2, x8 -+ mov w5, w7 -+1: -+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 -+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 -+ -+ subs w5, w5, #96 -+ -+ // v0, v1 -+ -+ shrn v18.4h, v0.4s, #16 -+ xtn v16.4h, v0.4s -+ shrn v17.4h, v0.4s, #12 -+ -+ shrn2 v18.8h, v1.4s, #16 -+ xtn2 v16.8h, v1.4s -+ shrn2 v17.8h, v1.4s, #12 -+ -+ shrn v18.8b, v18.8h, #6 -+ shrn v16.8b, v16.8h, #2 -+ xtn v17.8b, v17.8h -+ -+ // v2, v3 -+ -+ shrn v21.4h, v2.4s, #16 -+ xtn v19.4h, v2.4s -+ shrn v20.4h, v2.4s, #12 -+ -+ shrn2 v21.8h, v3.4s, #16 -+ xtn2 v19.8h, v3.4s -+ shrn2 v20.8h, v3.4s, #12 -+ -+ shrn2 v18.16b, v21.8h, #6 -+ shrn2 v16.16b, v19.8h, #2 -+ xtn2 v17.16b, v20.8h -+ -+ // v4, v5 -+ -+ shrn v24.4h, v4.4s, #16 -+ xtn v22.4h, v4.4s -+ shrn v23.4h, v4.4s, #12 -+ -+ shrn2 v24.8h, v5.4s, #16 -+ xtn2 v22.8h, v5.4s -+ shrn2 v23.8h, v5.4s, #12 -+ -+ shrn v21.8b, v24.8h, #6 -+ shrn v19.8b, v22.8h, #2 -+ xtn v20.8b, v23.8h -+ -+ // v6, v7 -+ -+ shrn v27.4h, v6.4s, #16 -+ xtn v25.4h, v6.4s -+ shrn v26.4h, v6.4s, #12 -+ -+ shrn2 v27.8h, v7.4s, #16 -+ xtn2 v25.8h, v7.4s -+ shrn2 v26.8h, v7.4s, #12 -+ -+ shrn2 v21.16b, v27.8h, #6 -+ shrn2 v19.16b, v25.8h, #2 -+ xtn2 v20.16b, v26.8h -+ -+ blt 2f -+ -+ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 -+ st3 {v19.16b, v20.16b, v21.16b}, [x0], #48 -+ -+ bne 1b -+ -+11: -+ subs w6, w6, #1 -+ add x0, x0, w1, uxtw -+ add x8, x8, #128 -+ bne 10b -+ -+ ret -+ -+// Partial final write -+2: -+ cmp w5, #48-96 -+ blt 1f -+ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 -+ beq 11b -+ mov v16.16b, v22.16b -+ mov v17.16b, v23.16b -+ sub w5, w5, #48 -+ mov v18.16b, v24.16b -+1: -+ cmp w5, #24-96 -+ blt 1f -+ st3 {v16.8b, v17.8b, v18.8b}, [x0], #24 -+ beq 11b -+ mov v16.2d[0], v16.2d[1] -+ sub w5, w5, #24 -+ mov v17.2d[0], v17.2d[1] -+ mov 
v18.2d[0], v18.2d[1] -+1: -+ cmp w5, #12-96 -+ blt 1f -+ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 -+ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 -+ st3 {v16.b, v17.b, v18.b}[2], [x0], #3 -+ st3 {v16.b, v17.b, v18.b}[3], [x0], #3 -+ beq 11b -+ mov v16.2s[0], v16.2s[1] -+ sub w5, w5, #12 -+ mov v17.2s[0], v17.2s[1] -+ mov v18.2s[0], v18.2s[1] -+1: -+ cmp w5, #6-96 -+ blt 1f -+ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 -+ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 -+ beq 11b -+ mov v16.4h[0], v16.4h[1] -+ sub w5, w5, #6 -+ mov v17.4h[0], v17.4h[1] -+ mov v18.4h[0], v18.4h[1] -+1: -+ cmp w5, #3-96 -+ blt 1f -+ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 -+ beq 11b -+ mov v16.8b[0], v16.8b[1] -+ sub w5, w5, #3 -+ mov v17.8b[0], v17.8b[1] -+1: -+ cmp w5, #2-96 -+ blt 1f -+ st2 {v16.b, v17.b}[0], [x0], #2 -+ b 11b -+1: -+ st1 {v16.b}[0], [x0], #1 -+ b 11b -+ -+endfunc -+ -diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h -new file mode 100644 -index 0000000000..2a56135bc3 ---- /dev/null -+++ b/libavutil/aarch64/rpi_sand_neon.h -@@ -0,0 +1,59 @@ -+/* -+Copyright (c) 2021 Michael Eiler -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. 
-+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: Michael Eiler -+*/ -+ -+#pragma once -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, -+ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, -+ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); -+ -+void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u, -+ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride, -+ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, -+ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); -+ -+void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u, -+ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1, -+ unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); -+ -+void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, -+ const uint8_t * src, unsigned int 
src_stride1, unsigned int src_stride2, -+ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); -+ -+#ifdef __cplusplus -+} -+#endif -+ -diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile -index 5da44b0542..b74b7c4e2f 100644 ---- a/libavutil/arm/Makefile -+++ b/libavutil/arm/Makefile -@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ - - NEON-OBJS += arm/float_dsp_init_neon.o \ - arm/float_dsp_neon.o \ -+ arm/rpi_sand_neon.o \ -diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S -new file mode 100644 -index 0000000000..60e697f681 ---- /dev/null -+++ b/libavutil/arm/rpi_sand_neon.S -@@ -0,0 +1,925 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox -+*/ -+ -+#include "libavutil/arm/asm.S" -+ -+ -+@ General notes: -+@ Having done some timing on this in sand8->y8 (Pi4) -+@ vst1 (680fps) is a bit faster than vstm (660fps) -+@ vldm (680fps) is noticably faster than vld1 (480fps) -+@ (or it might be that a mix is what is required) -+@ -+@ At least on a Pi4 it is no more expensive to have a single auto-inc register -+@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted -+@ the latter was better) -+@ -+@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless -+@ the memory is uncached. -+@ As these are Sand -> planar we can assume that src is going to be aligned but -+@ it is possible that dest isn't (converting to .yuv or other packed format). -+@ Luckily vst1 is faster than vstm :-) so all is well -+@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4 -+@ .8 stores would let us do non-word aligned stores into uncached but it -+@ probably isn't worth it. 
-+ -+ -+ -+ -+@ void ff_rpi_sand128b_stripe_to_8_10( -+@ uint8_t * dest, // [r0] -+@ const uint8_t * src1, // [r1] -+@ const uint8_t * src2, // [r2] -+@ unsigned int lines); // [r3] -+ -+.macro stripe2_to_8, bit_depth -+ vpush {q4-q7} -+1: -+ vldm r1!, {q0-q7} -+ subs r3, #1 -+ vldm r2!, {q8-q15} -+ vqrshrn.u16 d0, q0, #\bit_depth - 8 -+ vqrshrn.u16 d1, q1, #\bit_depth - 8 -+ vqrshrn.u16 d2, q2, #\bit_depth - 8 -+ vqrshrn.u16 d3, q3, #\bit_depth - 8 -+ vqrshrn.u16 d4, q4, #\bit_depth - 8 -+ vqrshrn.u16 d5, q5, #\bit_depth - 8 -+ vqrshrn.u16 d6, q6, #\bit_depth - 8 -+ vqrshrn.u16 d7, q7, #\bit_depth - 8 -+ vqrshrn.u16 d8, q8, #\bit_depth - 8 -+ vqrshrn.u16 d9, q9, #\bit_depth - 8 -+ vqrshrn.u16 d10, q10, #\bit_depth - 8 -+ vqrshrn.u16 d11, q11, #\bit_depth - 8 -+ vqrshrn.u16 d12, q12, #\bit_depth - 8 -+ vqrshrn.u16 d13, q13, #\bit_depth - 8 -+ vqrshrn.u16 d14, q14, #\bit_depth - 8 -+ vqrshrn.u16 d15, q15, #\bit_depth - 8 -+ vstm r0!, {q0-q7} -+ bne 1b -+ vpop {q4-q7} -+ bx lr -+.endm -+ -+function ff_rpi_sand128b_stripe_to_8_10, export=1 -+ stripe2_to_8 10 -+endfunc -+ -+@ void ff_rpi_sand8_lines_to_planar_y8( -+@ uint8_t * dest, // [r0] -+@ unsigned int dst_stride, // [r1] -+@ const uint8_t * src, // [r2] -+@ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+@ unsigned int src_stride2, // [sp, #0] -> r3 -+@ unsigned int _x, // [sp, #4] Ignored - 0 -+@ unsigned int y, // [sp, #8] (r7 in prefix) -+@ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+@ unsigned int h); // [sp, #16] -> r7 -+@ -+@ Assumes that we are starting on a stripe boundary and that overreading -+@ within the stripe is OK. 
However it does respect the dest size for writing -+ -+function ff_rpi_sand8_lines_to_planar_y8, export=1 -+ push {r4-r8, lr} @ +24 L -+ ldr r3, [sp, #24] -+ ldr r6, [sp, #36] -+ ldr r7, [sp, #32] @ y -+ lsl r3, #7 -+ sub r1, r6 -+ add r8, r2, r7, lsl #7 -+ ldr r7, [sp, #40] -+ -+10: -+ mov r2, r8 -+ add r4, r0, #24 -+ mov r5, r6 -+ mov lr, #0 -+1: -+ vldm r2, {q8-q15} -+ add r2, r3 -+ subs r5, #128 -+ blt 2f -+ vst1.8 {d16, d17, d18, d19}, [r0]! -+ vst1.8 {d20, d21, d22, d23}, [r0]! -+ vst1.8 {d24, d25, d26, d27}, [r0]! -+ vst1.8 {d28, d29, d30, d31}, [r0]! -+ bne 1b -+11: -+ subs r7, #1 -+ add r0, r1 -+ add r8, #128 -+ bne 10b -+ -+ pop {r4-r8, pc} -+ -+@ Partial final write -+2: -+ cmp r5, #64-128 -+ blt 1f -+ vst1.8 {d16, d17, d18, d19}, [r0]! -+ vst1.8 {d20, d21, d22, d23}, [r0]! -+ beq 11b -+ vmov q8, q12 -+ vmov q9, q13 -+ sub r5, #64 -+ vmov q10, q14 -+ vmov q11, q15 -+1: -+ cmp r5, #32-128 -+ blt 1f -+ vst1.8 {d16, d17, d18, d19}, [r0]! -+ beq 11b -+ vmov q8, q10 -+ sub r5, #32 -+ vmov q9, q11 -+1: -+ cmp r5, #16-128 -+ blt 1f -+ vst1.8 {d16, d17}, [r0]! -+ beq 11b -+ sub r5, #16 -+ vmov q8, q9 -+1: -+ cmp r5, #8-128 -+ blt 1f -+ vst1.8 {d16}, [r0]! -+ beq 11b -+ sub r5, #8 -+ vmov d16, d17 -+1: -+ cmp r5, #4-128 -+ blt 1f -+ vst1.32 {d16[0]}, [r0]! -+ beq 11b -+ sub r5, #4 -+ vshr.u64 d16, #32 -+1: -+ cmp r5, #2-128 -+ blt 1f -+ vst1.16 {d16[0]}, [r0]! -+ beq 11b -+ vst1.8 {d16[2]}, [r0]! -+ b 11b -+1: -+ vst1.8 {d16[0]}, [r0]! 
-+ b 11b -+endfunc -+ -+@ void ff_rpi_sand8_lines_to_planar_c8( -+@ uint8_t * dst_u, // [r0] -+@ unsigned int dst_stride_u, // [r1] -+@ uint8_t * dst_v, // [r2] -+@ unsigned int dst_stride_v, // [r3] -+@ const uint8_t * src, // [sp, #0] -> r4, r5 -+@ unsigned int stride1, // [sp, #4] 128 -+@ unsigned int stride2, // [sp, #8] -> r8 -+@ unsigned int _x, // [sp, #12] 0 -+@ unsigned int y, // [sp, #16] (r7 in prefix) -+@ unsigned int _w, // [sp, #20] -> r12, r6 -+@ unsigned int h); // [sp, #24] -> r7 -+@ -+@ Assumes that we are starting on a stripe boundary and that overreading -+@ within the stripe is OK. However it does respect the dest size for writing -+ -+function ff_rpi_sand8_lines_to_planar_c8, export=1 -+ push {r4-r8, lr} @ +24 -+ -+ ldr r5, [sp, #24] -+ ldr r8, [sp, #32] -+ ldr r7, [sp, #40] -+ ldr r6, [sp, #44] -+ lsl r8, #7 -+ add r5, r5, r7, lsl #7 -+ sub r1, r1, r6 -+ sub r3, r3, r6 -+ ldr r7, [sp, #48] -+ vpush {q4-q7} -+ -+10: -+ mov r4, r5 -+ mov r12, r6 -+1: -+ subs r12, #64 -+ vldm r4, {q0-q7} -+ add r4, r8 -+ it gt -+ vldmgt r4, {q8-q15} -+ add r4, r8 -+ -+ vuzp.8 q0, q1 -+ vuzp.8 q2, q3 -+ vuzp.8 q4, q5 -+ vuzp.8 q6, q7 -+ -+ vuzp.8 q8, q9 -+ vuzp.8 q10, q11 -+ vuzp.8 q12, q13 -+ vuzp.8 q14, q15 -+ subs r12, #64 -+ -+ @ Rearrange regs so we can use vst1 with 4 regs -+ vswp q1, q2 -+ vswp q5, q6 -+ vswp q9, q10 -+ vswp q13, q14 -+ blt 2f -+ -+ vst1.8 {d0, d1, d2, d3 }, [r0]! -+ vst1.8 {d8, d9, d10, d11}, [r0]! -+ vst1.8 {d16, d17, d18, d19}, [r0]! -+ vst1.8 {d24, d25, d26, d27}, [r0]! -+ -+ vst1.8 {d4, d5, d6, d7 }, [r2]! -+ vst1.8 {d12, d13, d14, d15}, [r2]! -+ vst1.8 {d20, d21, d22, d23}, [r2]! -+ vst1.8 {d28, d29, d30, d31}, [r2]! -+ bne 1b -+11: -+ subs r7, #1 -+ add r5, #128 -+ add r0, r1 -+ add r2, r3 -+ bne 10b -+ vpop {q4-q7} -+ pop {r4-r8,pc} -+ -+2: -+ cmp r12, #64-128 -+ blt 1f -+ vst1.8 {d0, d1, d2, d3 }, [r0]! -+ vst1.8 {d8, d9, d10, d11}, [r0]! -+ vst1.8 {d4, d5, d6, d7 }, [r2]! -+ vst1.8 {d12, d13, d14, d15}, [r2]! 
-+ beq 11b -+ sub r12, #64 -+ vmov q0, q8 -+ vmov q1, q9 -+ vmov q2, q10 -+ vmov q3, q11 -+ vmov q4, q12 -+ vmov q5, q13 -+ vmov q6, q14 -+ vmov q7, q15 -+1: -+ cmp r12, #32-128 -+ blt 1f -+ vst1.8 {d0, d1, d2, d3 }, [r0]! -+ vst1.8 {d4, d5, d6, d7 }, [r2]! -+ beq 11b -+ sub r12, #32 -+ vmov q0, q4 -+ vmov q1, q5 -+ vmov q2, q6 -+ vmov q3, q7 -+1: -+ cmp r12, #16-128 -+ blt 1f -+ vst1.8 {d0, d1 }, [r0]! -+ vst1.8 {d4, d5 }, [r2]! -+ beq 11b -+ sub r12, #16 -+ vmov q0, q1 -+ vmov q2, q3 -+1: -+ cmp r12, #8-128 -+ blt 1f -+ vst1.8 {d0}, [r0]! -+ vst1.8 {d4}, [r2]! -+ beq 11b -+ sub r12, #8 -+ vmov d0, d1 -+ vmov d4, d5 -+1: -+ cmp r12, #4-128 -+ blt 1f -+ vst1.32 {d0[0]}, [r0]! -+ vst1.32 {d4[0]}, [r2]! -+ beq 11b -+ sub r12, #4 -+ vmov s0, s1 -+ vmov s8, s9 -+1: -+ cmp r12, #2-128 -+ blt 1f -+ vst1.16 {d0[0]}, [r0]! -+ vst1.16 {d4[0]}, [r2]! -+ beq 11b -+ vst1.8 {d0[2]}, [r0]! -+ vst1.8 {d4[2]}, [r2]! -+ b 11b -+1: -+ vst1.8 {d0[0]}, [r0]! -+ vst1.8 {d4[0]}, [r2]! -+ b 11b -+endfunc -+ -+ -+ -+@ void ff_rpi_sand30_lines_to_planar_y16( -+@ uint8_t * dest, // [r0] -+@ unsigned int dst_stride, // [r1] -+@ const uint8_t * src, // [r2] -+@ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+@ unsigned int src_stride2, // [sp, #0] -> r3 -+@ unsigned int _x, // [sp, #4] Ignored - 0 -+@ unsigned int y, // [sp, #8] (r7 in prefix) -+@ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+@ unsigned int h); // [sp, #16] -> r7 -+@ -+@ Assumes that we are starting on a stripe boundary and that overreading -+@ within the stripe is OK. 
However it does respect the dest size for writing -+ -+function ff_rpi_sand30_lines_to_planar_y16, export=1 -+ push {r4-r8, lr} @ +24 -+ ldr r3, [sp, #24] -+ ldr r6, [sp, #36] -+ ldr r7, [sp, #32] @ y -+ mov r12, #48 -+ sub r3, #1 -+ lsl r3, #7 -+ sub r1, r1, r6, lsl #1 -+ add r8, r2, r7, lsl #7 -+ ldr r7, [sp, #40] -+ -+10: -+ mov r2, r8 -+ add r4, r0, #24 -+ mov r5, r6 -+ mov lr, #0 -+1: -+ vldm r2!, {q10-q13} -+ add lr, #64 -+ -+ vshrn.u32 d4 , q10, #14 @ Cannot vshrn.u32 #20! -+ ands lr, #127 -+ vshrn.u32 d2, q10, #10 -+ vmovn.u32 d0, q10 -+ -+ vshrn.u32 d5, q11, #14 -+ it eq -+ addeq r2, r3 -+ vshrn.u32 d3, q11, #10 -+ vmovn.u32 d1, q11 -+ -+ subs r5, #48 -+ vshr.u16 q2, #6 -+ vbic.u16 q0, #0xfc00 -+ vbic.u16 q1, #0xfc00 -+ -+ vshrn.u32 d20, q12, #14 -+ vshrn.u32 d18, q12, #10 -+ vmovn.u32 d16, q12 -+ -+ vshrn.u32 d21, q13, #14 -+ vshrn.u32 d19, q13, #10 -+ vmovn.u32 d17, q13 -+ -+ vshr.u16 q10, #6 -+ vbic.u16 q8, #0xfc00 -+ vbic.u16 q9 , #0xfc00 -+ blt 2f -+ -+ vst3.16 {d0, d2, d4}, [r0], r12 -+ vst3.16 {d1, d3, d5}, [r4], r12 -+ vst3.16 {d16, d18, d20}, [r0], r12 -+ vst3.16 {d17, d19, d21}, [r4], r12 -+ -+ bne 1b -+ -+11: -+ subs r7, #1 -+ add r0, r1 -+ add r8, #128 -+ bne 10b -+ -+ pop {r4-r8, pc} -+ -+@ Partial final write -+2: -+ cmp r5, #24-48 -+ blt 1f -+ vst3.16 {d0, d2, d4}, [r0], r12 -+ vst3.16 {d1, d3, d5}, [r4] -+ beq 11b -+ vmov q0, q8 -+ sub r5, #24 -+ vmov q1, q9 -+ vmov q2, q10 -+1: -+ cmp r5, #12-48 -+ blt 1f -+ vst3.16 {d0, d2, d4}, [r0]! -+ beq 11b -+ vmov d0, d1 -+ sub r5, #12 -+ vmov d2, d3 -+ vmov d4, d5 -+1: -+ cmp r5, #6-48 -+ add r4, r0, #6 @ avoid [r0]! on sequential instructions -+ blt 1f -+ vst3.16 {d0[0], d2[0], d4[0]}, [r0] -+ vst3.16 {d0[1], d2[1], d4[1]}, [r4] -+ add r0, #12 -+ beq 11b -+ vmov s0, s1 -+ sub r5, #6 -+ vmov s4, s5 -+ vmov s8, s9 -+1: -+ cmp r5, #3-48 -+ blt 1f -+ vst3.16 {d0[0], d2[0], d4[0]}, [r0]! 
-+ beq 11b -+ sub r5, #3 -+ vshr.u32 d0, #16 -+ vshr.u32 d2, #16 -+1: -+ cmp r5, #2-48 -+ blt 1f -+ vst2.16 {d0[0], d2[0]}, [r0]! -+ b 11b -+1: -+ vst1.16 {d0[0]}, [r0]! -+ b 11b -+ -+endfunc -+ -+ -+@ void ff_rpi_sand30_lines_to_planar_c16( -+@ uint8_t * dst_u, // [r0] -+@ unsigned int dst_stride_u, // [r1] -+@ uint8_t * dst_v, // [r2] -+@ unsigned int dst_stride_v, // [r3] -+@ const uint8_t * src, // [sp, #0] -> r4, r5 -+@ unsigned int stride1, // [sp, #4] 128 -+@ unsigned int stride2, // [sp, #8] -> r8 -+@ unsigned int _x, // [sp, #12] 0 -+@ unsigned int y, // [sp, #16] (r7 in prefix) -+@ unsigned int _w, // [sp, #20] -> r6, r9 -+@ unsigned int h); // [sp, #24] -> r7 -+@ -+@ Assumes that we are starting on a stripe boundary and that overreading -+@ within the stripe is OK. However it does respect the dest size for writing -+ -+function ff_rpi_sand30_lines_to_planar_c16, export=1 -+ push {r4-r10, lr} @ +32 -+ ldr r5, [sp, #32] -+ ldr r8, [sp, #40] -+ ldr r7, [sp, #48] -+ ldr r9, [sp, #52] -+ mov r12, #48 -+ sub r8, #1 -+ lsl r8, #7 -+ add r5, r5, r7, lsl #7 -+ sub r1, r1, r9, lsl #1 -+ sub r3, r3, r9, lsl #1 -+ ldr r7, [sp, #56] -+10: -+ mov lr, #0 -+ mov r4, r5 -+ mov r6, r9 -+1: -+ vldm r4!, {q0-q3} -+ add lr, #64 -+ -+ @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2 -+ vshrn.u32 d20, q0, #14 -+ vmovn.u32 d18, q0 -+ vshrn.u32 d0, q0, #10 -+ ands lr, #127 -+ -+ vshrn.u32 d21, q1, #14 -+ vmovn.u32 d19, q1 -+ vshrn.u32 d1, q1, #10 -+ -+ vshrn.u32 d22, q2, #10 -+ vmovn.u32 d2, q2 -+ vshrn.u32 d4, q2, #14 -+ -+ add r10, r0, #24 -+ vshrn.u32 d23, q3, #10 -+ vmovn.u32 d3, q3 -+ vshrn.u32 d5, q3, #14 -+ -+ it eq -+ addeq r4, r8 -+ vuzp.16 q0, q11 -+ vuzp.16 q9, q1 -+ vuzp.16 q10, q2 -+ -+ @ q0 V0, V3,.. -+ @ q9 U0, U3... -+ @ q10 U1, U4... -+ @ q11 U2, U5,.. -+ @ q1 V1, V4, -+ @ q2 V2, V5,.. 
-+ -+ subs r6, #24 -+ vbic.u16 q11, #0xfc00 -+ vbic.u16 q9, #0xfc00 -+ vshr.u16 q10, #6 -+ vshr.u16 q2, #6 -+ vbic.u16 q0, #0xfc00 -+ vbic.u16 q1, #0xfc00 -+ -+ blt 2f -+ -+ vst3.16 {d18, d20, d22}, [r0], r12 -+ vst3.16 {d19, d21, d23}, [r10] -+ add r10, r2, #24 -+ vst3.16 {d0, d2, d4}, [r2], r12 -+ vst3.16 {d1, d3, d5}, [r10] -+ -+ bne 1b -+ -+11: -+ subs r7, #1 -+ add r5, #128 -+ add r0, r1 -+ add r2, r3 -+ bne 10b -+ -+ pop {r4-r10, pc} -+ -+@ Partial final write -+2: -+ cmp r6, #-12 -+ blt 1f -+ vst3.16 {d18, d20, d22}, [r0]! -+ vst3.16 {d0, d2, d4}, [r2]! -+ beq 11b -+ vmov d18, d19 -+ vmov d20, d21 -+ vmov d22, d23 -+ sub r6, #12 -+ vmov d0, d1 -+ vmov d2, d3 -+ vmov d4, d5 -+1: -+ cmp r6, #-18 -+ @ Rezip here as it makes the remaining tail handling easier -+ vzip.16 d0, d18 -+ vzip.16 d2, d20 -+ vzip.16 d4, d22 -+ blt 1f -+ vst3.16 {d0[1], d2[1], d4[1]}, [r0]! -+ vst3.16 {d0[0], d2[0], d4[0]}, [r2]! -+ vst3.16 {d0[3], d2[3], d4[3]}, [r0]! -+ vst3.16 {d0[2], d2[2], d4[2]}, [r2]! -+ beq 11b -+ vmov d0, d18 -+ vmov d2, d20 -+ sub r6, #6 -+ vmov d4, d22 -+1: -+ cmp r6, #-21 -+ blt 1f -+ vst3.16 {d0[1], d2[1], d4[1]}, [r0]! -+ vst3.16 {d0[0], d2[0], d4[0]}, [r2]! -+ beq 11b -+ vmov s4, s5 -+ sub r6, #3 -+ vmov s0, s1 -+1: -+ cmp r6, #-22 -+ blt 1f -+ vst2.16 {d0[1], d2[1]}, [r0]! -+ vst2.16 {d0[0], d2[0]}, [r2]! -+ b 11b -+1: -+ vst1.16 {d0[1]}, [r0]! -+ vst1.16 {d0[0]}, [r2]! -+ b 11b -+ -+endfunc -+ -+@ void ff_rpi_sand30_lines_to_planar_p010( -+@ uint8_t * dest, // [r0] -+@ unsigned int dst_stride, // [r1] -+@ const uint8_t * src, // [r2] -+@ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+@ unsigned int src_stride2, // [sp, #0] -> r3 -+@ unsigned int _x, // [sp, #4] Ignored - 0 -+@ unsigned int y, // [sp, #8] (r7 in prefix) -+@ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+@ unsigned int h); // [sp, #16] -> r7 -+@ -+@ Assumes that we are starting on a stripe boundary and that overreading -+@ within the stripe is OK. 
However it does respect the dest size for writing -+ -+function ff_rpi_sand30_lines_to_planar_p010, export=1 -+ push {r4-r8, lr} @ +24 -+ ldr r3, [sp, #24] -+ ldr r6, [sp, #36] -+ ldr r7, [sp, #32] @ y -+ mov r12, #48 -+ vmov.u16 q15, #0xffc0 -+ sub r3, #1 -+ lsl r3, #7 -+ sub r1, r1, r6, lsl #1 -+ add r8, r2, r7, lsl #7 -+ ldr r7, [sp, #40] -+ -+10: -+ mov r2, r8 -+ add r4, r0, #24 -+ mov r5, r6 -+ mov lr, #0 -+1: -+ vldm r2!, {q10-q13} -+ add lr, #64 -+ -+ vshl.u32 q14, q10, #6 -+ ands lr, #127 -+ vshrn.u32 d4, q10, #14 -+ vshrn.u32 d2, q10, #4 -+ vmovn.u32 d0, q14 -+ -+ vshl.u32 q14, q11, #6 -+ it eq -+ addeq r2, r3 -+ vshrn.u32 d5, q11, #14 -+ vshrn.u32 d3, q11, #4 -+ vmovn.u32 d1, q14 -+ -+ subs r5, #48 -+ vand q2, q15 -+ vand q1, q15 -+ vand q0, q15 -+ -+ vshl.u32 q14, q12, #6 -+ vshrn.u32 d20, q12, #14 -+ vshrn.u32 d18, q12, #4 -+ vmovn.u32 d16, q14 -+ -+ vshl.u32 q14, q13, #6 -+ vshrn.u32 d21, q13, #14 -+ vshrn.u32 d19, q13, #4 -+ vmovn.u32 d17, q14 -+ -+ vand q10, q15 -+ vand q9, q15 -+ vand q8, q15 -+ blt 2f -+ -+ vst3.16 {d0, d2, d4}, [r0], r12 -+ vst3.16 {d1, d3, d5}, [r4], r12 -+ vst3.16 {d16, d18, d20}, [r0], r12 -+ vst3.16 {d17, d19, d21}, [r4], r12 -+ -+ bne 1b -+ -+11: -+ subs r7, #1 -+ add r0, r1 -+ add r8, #128 -+ bne 10b -+ -+ pop {r4-r8, pc} -+ -+@ Partial final write -+2: -+ cmp r5, #24-48 -+ blt 1f -+ vst3.16 {d0, d2, d4}, [r0], r12 -+ vst3.16 {d1, d3, d5}, [r4] -+ beq 11b -+ vmov q0, q8 -+ sub r5, #24 -+ vmov q1, q9 -+ vmov q2, q10 -+1: -+ cmp r5, #12-48 -+ blt 1f -+ vst3.16 {d0, d2, d4}, [r0]! -+ beq 11b -+ vmov d0, d1 -+ sub r5, #12 -+ vmov d2, d3 -+ vmov d4, d5 -+1: -+ cmp r5, #6-48 -+ add r4, r0, #6 @ avoid [r0]! on sequential instructions -+ blt 1f -+ vst3.16 {d0[0], d2[0], d4[0]}, [r0] -+ vst3.16 {d0[1], d2[1], d4[1]}, [r4] -+ add r0, #12 -+ beq 11b -+ vmov s0, s1 -+ sub r5, #6 -+ vmov s4, s5 -+ vmov s8, s9 -+1: -+ cmp r5, #3-48 -+ blt 1f -+ vst3.16 {d0[0], d2[0], d4[0]}, [r0]! 
-+ beq 11b -+ sub r5, #3 -+ vshr.u32 d0, #16 -+ vshr.u32 d2, #16 -+1: -+ cmp r5, #2-48 -+ blt 1f -+ vst2.16 {d0[0], d2[0]}, [r0]! -+ b 11b -+1: -+ vst1.16 {d0[0]}, [r0]! -+ b 11b -+ -+endfunc -+ -+ -+@ void ff_rpi_sand30_lines_to_planar_y8( -+@ uint8_t * dest, // [r0] -+@ unsigned int dst_stride, // [r1] -+@ const uint8_t * src, // [r2] -+@ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+@ unsigned int src_stride2, // [sp, #0] -> r3 -+@ unsigned int _x, // [sp, #4] Ignored - 0 -+@ unsigned int y, // [sp, #8] (r7 in prefix) -+@ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+@ unsigned int h); // [sp, #16] -> r7 -+@ -+@ Assumes that we are starting on a stripe boundary and that overreading -+@ within the stripe is OK. However it does respect the dest size for wri -+ -+function ff_rpi_sand30_lines_to_planar_y8, export=1 -+ push {r4-r8, lr} @ +24 -+ ldr r3, [sp, #24] -+ ldr r6, [sp, #36] -+ ldr r7, [sp, #32] @ y -+ mov r12, #48 -+ lsl r3, #7 -+ sub r1, r1, r6 -+ add r8, r2, r7, lsl #7 -+ ldr r7, [sp, #40] -+ -+10: -+ mov r2, r8 -+ add r4, r0, #24 -+ mov r5, r6 -+1: -+ vldm r2, {q8-q15} -+ -+ subs r5, #96 -+ -+ vmovn.u32 d0, q8 -+ vshrn.u32 d2, q8, #12 -+ vshrn.u32 d4, q8, #16 @ Cannot vshrn.u32 #20! 
-+ -+ add r2, r3 -+ -+ vmovn.u32 d1, q9 -+ vshrn.u32 d3, q9, #12 -+ vshrn.u32 d5, q9, #16 -+ -+ pld [r2, #0] -+ -+ vshrn.u16 d0, q0, #2 -+ vmovn.u16 d1, q1 -+ vshrn.u16 d2, q2, #6 -+ -+ vmovn.u32 d16, q10 -+ vshrn.u32 d18, q10, #12 -+ vshrn.u32 d20, q10, #16 -+ -+ vmovn.u32 d17, q11 -+ vshrn.u32 d19, q11, #12 -+ vshrn.u32 d21, q11, #16 -+ -+ pld [r2, #64] -+ -+ vshrn.u16 d4, q8, #2 -+ vmovn.u16 d5, q9 -+ vshrn.u16 d6, q10, #6 -+ -+ vmovn.u32 d16, q12 -+ vshrn.u32 d18, q12, #12 -+ vshrn.u32 d20, q12, #16 -+ -+ vmovn.u32 d17, q13 -+ vshrn.u32 d19, q13, #12 -+ vshrn.u32 d21, q13, #16 -+ -+ vshrn.u16 d16, q8, #2 -+ vmovn.u16 d17, q9 -+ vshrn.u16 d18, q10, #6 -+ -+ vmovn.u32 d20, q14 -+ vshrn.u32 d22, q14, #12 -+ vshrn.u32 d24, q14, #16 -+ -+ vmovn.u32 d21, q15 -+ vshrn.u32 d23, q15, #12 -+ vshrn.u32 d25, q15, #16 -+ -+ vshrn.u16 d20, q10, #2 -+ vmovn.u16 d21, q11 -+ vshrn.u16 d22, q12, #6 -+ -+ blt 2f -+ -+ vst3.8 {d0, d1, d2}, [r0], r12 -+ vst3.8 {d4, d5, d6}, [r4], r12 -+ vst3.8 {d16, d17, d18}, [r0], r12 -+ vst3.8 {d20, d21, d22}, [r4], r12 -+ -+ bne 1b -+ -+11: -+ subs r7, #1 -+ add r0, r1 -+ add r8, #128 -+ bne 10b -+ -+ pop {r4-r8, pc} -+ -+@ Partial final write -+2: -+ cmp r5, #48-96 -+ blt 1f -+ vst3.8 {d0, d1, d2}, [r0], r12 -+ vst3.8 {d4, d5, d6}, [r4], r12 -+ beq 11b -+ vmov q0, q8 -+ vmov q2, q10 -+ sub r5, #48 -+ vmov d2, d18 -+ vmov d6, d22 -+1: -+ cmp r5, #24-96 -+ blt 1f -+ vst3.8 {d0, d1, d2}, [r0]! -+ beq 11b -+ vmov q0, q2 -+ sub r5, #24 -+ vmov d2, d6 -+1: -+ cmp r5, #12-96 -+ blt 1f -+ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! -+ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! -+ vst3.8 {d0[2], d1[2], d2[2]}, [r0]! -+ vst3.8 {d0[3], d1[3], d2[3]}, [r0]! -+ beq 11b -+ vmov s0, s1 -+ sub r5, #12 -+ vmov s2, s3 -+ vmov s4, s5 -+1: -+ cmp r5, #6-96 -+ blt 1f -+ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! -+ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! 
-+ add r0, #12 -+ beq 11b -+ vshr.u32 d0, #16 -+ sub r5, #6 -+ vshr.u32 d1, #16 -+ vshr.u32 d2, #16 -+1: -+ cmp r5, #3-96 -+ blt 1f -+ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! -+ beq 11b -+ sub r5, #3 -+ vshr.u32 d0, #8 -+ vshr.u32 d1, #8 -+1: -+ cmp r5, #2-96 -+ blt 1f -+ vst2.8 {d0[0], d1[0]}, [r0]! -+ b 11b -+1: -+ vst1.8 {d0[0]}, [r0]! -+ b 11b -+ -+endfunc -+ -+ -diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h -new file mode 100644 -index 0000000000..d457c10870 ---- /dev/null -+++ b/libavutil/arm/rpi_sand_neon.h -@@ -0,0 +1,110 @@ -+/* -+Copyright (c) 2020 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox -+*/ -+ -+#ifndef AVUTIL_ARM_SAND_NEON_H -+#define AVUTIL_ARM_SAND_NEON_H -+ -+void ff_rpi_sand128b_stripe_to_8_10( -+ uint8_t * dest, // [r0] -+ const uint8_t * src1, // [r1] -+ const uint8_t * src2, // [r2] -+ unsigned int lines); // [r3] -+ -+void ff_rpi_sand8_lines_to_planar_y8( -+ uint8_t * dest, // [r0] -+ unsigned int dst_stride, // [r1] -+ const uint8_t * src, // [r2] -+ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+ unsigned int src_stride2, // [sp, #0] -> r3 -+ unsigned int _x, // [sp, #4] Ignored - 0 -+ unsigned int y, // [sp, #8] (r7 in prefix) -+ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+ unsigned int h); // [sp, #16] -> r7 -+ -+void ff_rpi_sand8_lines_to_planar_c8( -+ uint8_t * dst_u, // [r0] -+ unsigned int dst_stride_u, // [r1] -+ uint8_t * dst_v, // [r2] -+ unsigned int dst_stride_v, // [r3] -+ const uint8_t * src, // [sp, #0] -> r4, r5 -+ unsigned int stride1, // [sp, #4] 128 -+ unsigned int stride2, // [sp, #8] -> r8 -+ unsigned int _x, // [sp, #12] 0 -+ unsigned int y, // [sp, #16] (r7 in prefix) -+ unsigned int _w, // [sp, #20] -> r12, r6 -+ unsigned int h); // [sp, #24] -> r7 -+ -+void ff_rpi_sand30_lines_to_planar_y16( -+ uint8_t * dest, // [r0] -+ unsigned int dst_stride, // [r1] -+ const uint8_t * src, // [r2] -+ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+ unsigned int src_stride2, // [sp, #0] -> r3 -+ unsigned int _x, // [sp, #4] Ignored - 0 -+ unsigned int y, // [sp, 
#8] (r7 in prefix) -+ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+ unsigned int h); // [sp, #16] -> r7 -+ -+void ff_rpi_sand30_lines_to_planar_c16( -+ uint8_t * dst_u, // [r0] -+ unsigned int dst_stride_u, // [r1] -+ uint8_t * dst_v, // [r2] -+ unsigned int dst_stride_v, // [r3] -+ const uint8_t * src, // [sp, #0] -> r4, r5 -+ unsigned int stride1, // [sp, #4] 128 -+ unsigned int stride2, // [sp, #8] -> r8 -+ unsigned int _x, // [sp, #12] 0 -+ unsigned int y, // [sp, #16] (r7 in prefix) -+ unsigned int _w, // [sp, #20] -> r6, r9 -+ unsigned int h); // [sp, #24] -> r7 -+ -+void ff_rpi_sand30_lines_to_planar_p010( -+ uint8_t * dest, // [r0] -+ unsigned int dst_stride, // [r1] -+ const uint8_t * src, // [r2] -+ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+ unsigned int src_stride2, // [sp, #0] -> r3 -+ unsigned int _x, // [sp, #4] Ignored - 0 -+ unsigned int y, // [sp, #8] (r7 in prefix) -+ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+ unsigned int h); // [sp, #16] -> r7 -+ -+void ff_rpi_sand30_lines_to_planar_y8( -+ uint8_t * dest, // [r0] -+ unsigned int dst_stride, // [r1] -+ const uint8_t * src, // [r2] -+ unsigned int src_stride1, // [r3] Ignored - assumed 128 -+ unsigned int src_stride2, // [sp, #0] -> r3 -+ unsigned int _x, // [sp, #4] Ignored - 0 -+ unsigned int y, // [sp, #8] (r7 in prefix) -+ unsigned int _w, // [sp, #12] -> r6 (cur r5) -+ unsigned int h); // [sp, #16] -> r7 -+ -+#endif // AVUTIL_ARM_SAND_NEON_H -+ -diff --git a/libavutil/frame.c b/libavutil/frame.c -index 75e347bf2f..daa6477485 100644 ---- a/libavutil/frame.c -+++ b/libavutil/frame.c -@@ -16,6 +16,8 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -+#include "config.h" -+ - #include "channel_layout.h" - #include "avassert.h" - #include "buffer.h" -@@ -26,6 +28,9 @@ - #include "mem.h" - #include "samplefmt.h" - #include "hwcontext.h" -+#if CONFIG_SAND -+#include "rpi_sand_fns.h" -+#endif - - #if FF_API_FRAME_GET_SET - 
MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp) -@@ -903,6 +908,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags) - (frame->crop_top + frame->crop_bottom) >= frame->height) - return AVERROR(ERANGE); - -+#if CONFIG_SAND -+ // Sand cannot be cropped - do not try -+ if (av_rpi_is_sand_format(frame->format)) -+ return 0; -+#endif -+ - desc = av_pix_fmt_desc_get(frame->format); - if (!desc) - return AVERROR_BUG; -diff --git a/libavutil/frame.h b/libavutil/frame.h -index 7d1f8e2935..a4e7dc915d 100644 ---- a/libavutil/frame.h -+++ b/libavutil/frame.h -@@ -990,6 +990,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags); - */ - const char *av_frame_side_data_name(enum AVFrameSideDataType type); - -+ -+static inline int av_frame_cropped_width(const AVFrame * const frame) -+{ -+ return frame->width - (frame->crop_left + frame->crop_right); -+} -+static inline int av_frame_cropped_height(const AVFrame * const frame) -+{ -+ return frame->height - (frame->crop_top + frame->crop_bottom); -+} -+ - /** - * @} - */ -diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c -index 7a9fdbd263..2f825b7e16 100644 ---- a/libavutil/hwcontext_drm.c -+++ b/libavutil/hwcontext_drm.c -@@ -21,6 +21,7 @@ - #include - #include - #include -+#include - - /* This was introduced in version 4.6. And may not exist all without an - * optional package. 
So to prevent a hard dependency on needing the Linux -@@ -31,6 +32,7 @@ - #endif - - #include -+#include - #include - - #include "avassert.h" -@@ -38,7 +40,9 @@ - #include "hwcontext_drm.h" - #include "hwcontext_internal.h" - #include "imgutils.h" -- -+#if CONFIG_SAND -+#include "libavutil/rpi_sand_fns.h" -+#endif - - static void drm_device_free(AVHWDeviceContext *hwdev) - { -@@ -53,6 +57,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device, - AVDRMDeviceContext *hwctx = hwdev->hwctx; - drmVersionPtr version; - -+ if (device == NULL) { -+ hwctx->fd = -1; -+ return 0; -+ } -+ - hwctx->fd = open(device, O_RDWR); - if (hwctx->fd < 0) - return AVERROR(errno); -@@ -139,6 +148,8 @@ static int drm_map_frame(AVHWFramesContext *hwfc, - if (flags & AV_HWFRAME_MAP_WRITE) - mmap_prot |= PROT_WRITE; - -+ if (dst->format == AV_PIX_FMT_NONE) -+ dst->format = hwfc->sw_format; - #if HAVE_LINUX_DMA_BUF_H - if (flags & AV_HWFRAME_MAP_READ) - map->sync_flags |= DMA_BUF_SYNC_READ; -@@ -185,6 +196,23 @@ static int drm_map_frame(AVHWFramesContext *hwfc, - - dst->width = src->width; - dst->height = src->height; -+ dst->crop_top = src->crop_top; -+ dst->crop_bottom = src->crop_bottom; -+ dst->crop_left = src->crop_left; -+ dst->crop_right = src->crop_right; -+ -+#if CONFIG_SAND -+ // Rework for sand frames -+ if (av_rpi_is_sand_frame(dst)) { -+ // As it stands the sand formats hold stride2 in linesize[3] -+ // linesize[0] & [1] contain stride1 which is always 128 for everything we do -+ // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1] -+ dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier); -+ dst->linesize[0] = 128; -+ dst->linesize[1] = 128; -+ // *** Are we sure src->height is actually what we want ??? 
-+ } -+#endif - - err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, - &drm_unmap_frame, map); -@@ -206,16 +234,29 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, - enum AVHWFrameTransferDirection dir, - enum AVPixelFormat **formats) - { -- enum AVPixelFormat *pix_fmts; -+ enum AVPixelFormat *p; - -- pix_fmts = av_malloc_array(2, sizeof(*pix_fmts)); -- if (!pix_fmts) -+ p = *formats = av_malloc_array(3, sizeof(*p)); -+ if (!p) - return AVERROR(ENOMEM); - -- pix_fmts[0] = ctx->sw_format; -- pix_fmts[1] = AV_PIX_FMT_NONE; -+ // **** Offer native sand too ???? -+ *p++ = -+#if CONFIG_SAND -+ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? -+ AV_PIX_FMT_YUV420P : -+ ctx->sw_format == AV_PIX_FMT_RPI4_10 ? -+ AV_PIX_FMT_YUV420P10LE : -+#endif -+ ctx->sw_format; -+ -+#if CONFIG_SAND -+ if (ctx->sw_format == AV_PIX_FMT_RPI4_10 || -+ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128) -+ *p++ = AV_PIX_FMT_NV12; -+#endif - -- *formats = pix_fmts; -+ *p = AV_PIX_FMT_NONE; - return 0; - } - -@@ -231,18 +272,63 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, - map = av_frame_alloc(); - if (!map) - return AVERROR(ENOMEM); -- map->format = dst->format; - -+ // Map to default -+ map->format = AV_PIX_FMT_NONE; - err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ); - if (err) - goto fail; - -- map->width = dst->width; -- map->height = dst->height; -+#if 0 -+ av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__, -+ hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE, -+ map->width, map->height, -+ map->linesize[0], -+ map->linesize[1], -+ map->linesize[2], -+ map->linesize[3], -+ dst->width, dst->height, -+ dst->linesize[0], -+ dst->linesize[1], -+ dst->linesize[2]); -+#endif -+#if CONFIG_SAND -+ if (av_rpi_is_sand_frame(map)) { -+ // Preserve crop - later ffmpeg code assumes that we have in that it -+ // 
overwrites any crop that we create with the old values -+ unsigned int stride2 = map->linesize[3]; -+ const unsigned int w = FFMIN(dst->width, map->width); -+ const unsigned int h = FFMIN(dst->height, map->height); -+ -+ map->crop_top = 0; -+ map->crop_bottom = 0; -+ map->crop_left = 0; -+ map->crop_right = 0; -+ -+ if (av_rpi_sand_to_planar_frame(dst, map) != 0) -+ { -+ av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); -+ err = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ dst->width = w; -+ dst->height = h; -+ } -+ else -+#endif -+ { -+ // Kludge mapped h/w s.t. frame_copy works -+ map->width = dst->width; -+ map->height = dst->height; -+ err = av_frame_copy(dst, map); -+ } - -- err = av_frame_copy(dst, map); - if (err) -+ { -+ av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__); - goto fail; -+ } - - err = 0; - fail: -@@ -257,7 +343,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc, - int err; - - if (src->width > hwfc->width || src->height > hwfc->height) -+ { -+ av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height); - return AVERROR(EINVAL); -+ } - - map = av_frame_alloc(); - if (!map) -diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c -index 18c7a0efc8..bab13a4d50 100644 ---- a/libavutil/pixdesc.c -+++ b/libavutil/pixdesc.c -@@ -2395,6 +2395,50 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { - .name = "vulkan", - .flags = AV_PIX_FMT_FLAG_HWACCEL, - }, -+ [AV_PIX_FMT_SAND128] = { -+ .name = "sand128", -+ .nb_components = 3, -+ .log2_chroma_w = 1, -+ .log2_chroma_h = 1, -+ .comp = { -+ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */ -+ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */ -+ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ -+ }, -+ .flags = 0, -+ }, -+ [AV_PIX_FMT_SAND64_10] = { -+ .name = "sand64_10", -+ .nb_components = 3, -+ .log2_chroma_w = 1, -+ .log2_chroma_h = 1, -+ .comp = { -+ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ -+ { 1, 4, 0, 0, 
10, 3, 9, 1 }, /* U */ -+ { 1, 4, 2, 0, 10, 3, 9, 3 }, /* V */ -+ }, -+ .flags = 0, -+ }, -+ [AV_PIX_FMT_SAND64_16] = { -+ .name = "sand64_16", -+ .nb_components = 3, -+ .log2_chroma_w = 1, -+ .log2_chroma_h = 1, -+ .comp = { -+ { 0, 2, 0, 0, 16, 0, 15, 1 }, /* Y */ -+ { 1, 4, 0, 0, 16, 3, 15, 1 }, /* U */ -+ { 1, 4, 2, 0, 16, 3, 15, 3 }, /* V */ -+ }, -+ .flags = 0, -+ }, -+ [AV_PIX_FMT_RPI4_8] = { -+ .name = "rpi4_8", -+ .flags = AV_PIX_FMT_FLAG_HWACCEL, -+ }, -+ [AV_PIX_FMT_RPI4_10] = { -+ .name = "rpi4_10", -+ .flags = AV_PIX_FMT_FLAG_HWACCEL, -+ }, - }; - #if FF_API_PLUS1_MINUS1 - FF_ENABLE_DEPRECATION_WARNINGS -diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h -index 46ef211add..9195ead15f 100644 ---- a/libavutil/pixfmt.h -+++ b/libavutil/pixfmt.h -@@ -357,6 +357,14 @@ enum AVPixelFormat { - - AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian - AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian -+// RPI - not on ifdef so can be got at by calling progs -+// #define so code that uses this can know it is there -+#define AVUTIL_HAVE_PIX_FMT_SAND 1 -+ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding -+ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding -+ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding -+ AV_PIX_FMT_RPI4_8, -+ AV_PIX_FMT_RPI4_10, - - AV_PIX_FMT_X2RGB10LE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), little-endian, X=unused/undefined - AV_PIX_FMT_X2RGB10BE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), big-endian, X=unused/undefined -diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h -new file mode 100644 -index 0000000000..0d5d203dc3 ---- /dev/null -+++ b/libavutil/rpi_sand_fn_pw.h -@@ -0,0 +1,227 @@ -+/* -+Copyright (c) 2018 Raspberry 
Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: John Cox -+*/ -+ -+// * Included twice from rpi_sand_fn with different PW -+ -+#define STRCAT(x,y) x##y -+ -+#if PW == 1 -+#define pixel uint8_t -+#define FUNC(f) STRCAT(f, 8) -+#elif PW == 2 -+#define pixel uint16_t -+#define FUNC(f) STRCAT(f, 16) -+#else -+#error Unexpected PW -+#endif -+ -+// Fetches a single patch - offscreen fixup not done here -+// w <= stride1 -+// unclipped -+void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x = _x; -+ const unsigned int w = _w; -+ const unsigned int mask = stride1 - 1; -+ -+#if PW == 1 && HAVE_SAND_ASM -+ if (_x == 0) { -+ ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride, -+ src, stride1, stride2, _x, y, _w, h); -+ return; -+ } -+#endif -+ -+ if ((x & ~mask) == ((x + w) & ~mask)) { -+ // All in one sand stripe -+ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { -+ memcpy(dst, p, w); -+ } -+ } -+ else -+ { -+ // Two+ stripe -+ const unsigned int sstride = stride1 * stride2; -+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ const uint8_t * p2 = p1 + sstride - (x & mask); -+ const unsigned int w1 = stride1 - (x & mask); -+ const unsigned int w3 = (x + w) & mask; -+ const unsigned int w2 = w - (w1 + w3); -+ -+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { -+ unsigned int j; -+ const uint8_t * p = p2; -+ uint8_t * d = dst; -+ memcpy(d, p1, w1); -+ d += w1; -+ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { -+ memcpy(d, p, stride1); -+ } -+ memcpy(d, p, w3); -+ } -+ } -+} -+ -+// x & w in bytes but not of interleave (i.e. 
offset = x*2 for U&V) -+ -+void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x = _x * 2; -+ const unsigned int w = _w * 2; -+ const unsigned int mask = stride1 - 1; -+ -+#if PW == 1 && HAVE_SAND_ASM -+ if (_x == 0) { -+ ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v, -+ src, stride1, stride2, _x, y, _w, h); -+ return; -+ } -+#endif -+ -+ if ((x & ~mask) == ((x + w) & ~mask)) { -+ // All in one sand stripe -+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { -+ pixel * du = (pixel *)dst_u; -+ pixel * dv = (pixel *)dst_v; -+ const pixel * p = (const pixel *)p1; -+ for (unsigned int k = 0; k < w; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ } -+ } -+ else -+ { -+ // Two+ stripe -+ const unsigned int sstride = stride1 * stride2; -+ const unsigned int sstride_p = (sstride - stride1) / PW; -+ -+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ const uint8_t * p2 = p1 + sstride - (x & mask); -+ const unsigned int w1 = stride1 - (x & mask); -+ const unsigned int w3 = (x + w) & mask; -+ const unsigned int w2 = w - (w1 + w3); -+ -+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { -+ unsigned int j; -+ const pixel * p = (const pixel *)p1; -+ pixel * du = (pixel *)dst_u; -+ pixel * dv = (pixel *)dst_v; -+ for (unsigned int k = 0; k < w1; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { -+ for (unsigned int k = 0; k < stride1; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ 
} -+ for (unsigned int k = 0; k < w3; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ } -+ } -+} -+ -+void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, -+ unsigned int stride1, unsigned int stride2, -+ const uint8_t * src_u, const unsigned int src_stride_u, -+ const uint8_t * src_v, const unsigned int src_stride_v, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x = _x * 2; -+ const unsigned int w = _w * 2; -+ const unsigned int mask = stride1 - 1; -+ if ((x & ~mask) == ((x + w) & ~mask)) { -+ // All in one sand stripe -+ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { -+ const pixel * su = (const pixel *)src_u; -+ const pixel * sv = (const pixel *)src_v; -+ pixel * p = (pixel *)p1; -+ for (unsigned int k = 0; k < w; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ } -+ } -+ else -+ { -+ // Two+ stripe -+ const unsigned int sstride = stride1 * stride2; -+ const unsigned int sstride_p = (sstride - stride1) / PW; -+ -+ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ const uint8_t * p2 = p1 + sstride - (x & mask); -+ const unsigned int w1 = stride1 - (x & mask); -+ const unsigned int w3 = (x + w) & mask; -+ const unsigned int w2 = w - (w1 + w3); -+ -+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { -+ unsigned int j; -+ const pixel * su = (const pixel *)src_u; -+ const pixel * sv = (const pixel *)src_v; -+ pixel * p = (pixel *)p1; -+ for (unsigned int k = 0; k < w1; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { -+ for (unsigned int k = 0; k < stride1; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ } -+ for (unsigned int k = 0; k < w3; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ 
} -+ } -+ } -+} -+ -+ -+#undef pixel -+#undef STRCAT -+#undef FUNC -+ -diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c -new file mode 100644 -index 0000000000..b6071e2928 ---- /dev/null -+++ b/libavutil/rpi_sand_fns.c -@@ -0,0 +1,445 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: John Cox -+*/ -+ -+#include "config.h" -+#include -+#include -+#include "rpi_sand_fns.h" -+#include "avassert.h" -+#include "frame.h" -+ -+#if ARCH_ARM && HAVE_NEON -+#include "arm/rpi_sand_neon.h" -+#define HAVE_SAND_ASM 1 -+#elif ARCH_AARCH64 && HAVE_NEON -+#include "aarch64/rpi_sand_neon.h" -+#define HAVE_SAND_ASM 1 -+#else -+#define HAVE_SAND_ASM 0 -+#endif -+ -+#define PW 1 -+#include "rpi_sand_fn_pw.h" -+#undef PW -+ -+#define PW 2 -+#include "rpi_sand_fn_pw.h" -+#undef PW -+ -+#if 1 -+// Simple round -+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) -+{ -+ const unsigned int rnd = (1 << shr) >> 1; -+ const uint16_t * src = (const uint16_t *)_src; -+ -+ for (; n != 0; --n) { -+ *dst++ = (*src++ + rnd) >> shr; -+ } -+} -+#else -+// Dithered variation -+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) -+{ -+ unsigned int rnd = (1 << shr) >> 1; -+ const unsigned int mask = ((1 << shr) - 1); -+ const uint16_t * src = (const uint16_t *)_src; -+ -+ for (; n != 0; --n) { -+ rnd = *src++ + (rnd & mask); -+ *dst++ = rnd >> shr; -+ } -+} -+#endif -+ -+// Fetches a single patch - offscreen fixup not done here -+// w <= stride1 -+// unclipped -+// _x & _w in pixels, strides in bytes -+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word -+ const unsigned int xskip0 = _x - (x0 >> 2) * 3; -+ const unsigned int x1 = ((_x + _w) / 3) * 4; -+ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; -+ const unsigned int mask = stride1 - 1; -+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; -+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words -+ 
-+#if HAVE_SAND_ASM -+ if (_x == 0) { -+ ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); -+ return; -+ } -+#endif -+ -+ if (x0 == x1) { -+ // ******************* -+ // Partial single word xfer -+ return; -+ } -+ -+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) -+ { -+ unsigned int x = x0; -+ const uint32_t * p = (const uint32_t *)p0; -+ uint16_t * d = (uint16_t *)dst; -+ -+ if (xskip0 != 0) { -+ const uint32_t p3 = *p++; -+ -+ if (xskip0 == 1) -+ *d++ = (p3 >> 10) & 0x3ff; -+ *d++ = (p3 >> 20) & 0x3ff; -+ -+ if (((x += 4) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ while (x != x1) { -+ const uint32_t p3 = *p++; -+ *d++ = p3 & 0x3ff; -+ *d++ = (p3 >> 10) & 0x3ff; -+ *d++ = (p3 >> 20) & 0x3ff; -+ -+ if (((x += 4) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ if (xrem1 != 0) { -+ const uint32_t p3 = *p; -+ -+ *d++ = p3 & 0x3ff; -+ if (xrem1 == 2) -+ *d++ = (p3 >> 10) & 0x3ff; -+ } -+ } -+} -+ -+ -+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word -+ const unsigned int xskip0 = _x - (x0 >> 3) * 3; -+ const unsigned int x1 = ((_x + _w) / 3) * 8; -+ const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3; -+ const unsigned int mask = stride1 - 1; -+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; -+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words -+ -+#if HAVE_SAND_ASM -+ if (_x == 0) { -+ ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v, -+ src, stride1, stride2, _x, y, _w, h); -+ return; -+ } -+#endif -+ -+ if (x0 == x1) { -+ // ******************* -+ // Partial single word xfer -+ return; -+ } -+ -+ 
for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1) -+ { -+ unsigned int x = x0; -+ const uint32_t * p = (const uint32_t *)p0; -+ uint16_t * du = (uint16_t *)dst_u; -+ uint16_t * dv = (uint16_t *)dst_v; -+ -+ if (xskip0 != 0) { -+ const uint32_t p3a = *p++; -+ const uint32_t p3b = *p++; -+ -+ if (xskip0 == 1) -+ { -+ *du++ = (p3a >> 20) & 0x3ff; -+ *dv++ = (p3b >> 0) & 0x3ff; -+ } -+ *du++ = (p3b >> 10) & 0x3ff; -+ *dv++ = (p3b >> 20) & 0x3ff; -+ -+ if (((x += 8) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ while (x != x1) { -+ const uint32_t p3a = *p++; -+ const uint32_t p3b = *p++; -+ -+ *du++ = p3a & 0x3ff; -+ *dv++ = (p3a >> 10) & 0x3ff; -+ *du++ = (p3a >> 20) & 0x3ff; -+ *dv++ = p3b & 0x3ff; -+ *du++ = (p3b >> 10) & 0x3ff; -+ *dv++ = (p3b >> 20) & 0x3ff; -+ -+ if (((x += 8) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ if (xrem1 != 0) { -+ const uint32_t p3a = *p++; -+ const uint32_t p3b = *p++; -+ -+ *du++ = p3a & 0x3ff; -+ *dv++ = (p3a >> 10) & 0x3ff; -+ if (xrem1 == 2) -+ { -+ *du++ = (p3a >> 20) & 0x3ff; -+ *dv++ = p3b & 0x3ff; -+ } -+ } -+ } -+} -+ -+// Fetches a single patch - offscreen fixup not done here -+// w <= stride1 -+// single lose bottom 2 bits truncation -+// _x & _w in pixels, strides in bytes -+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word -+ const unsigned int xskip0 = _x - (x0 >> 2) * 3; -+ const unsigned int x1 = ((_x + _w) / 3) * 4; -+ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; -+ const unsigned int mask = stride1 - 1; -+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; -+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words -+ -+#if HAVE_SAND_ASM -+ if (_x == 
0) { -+ ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); -+ return; -+ } -+#endif -+ -+ if (x0 == x1) { -+ // ******************* -+ // Partial single word xfer -+ return; -+ } -+ -+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) -+ { -+ unsigned int x = x0; -+ const uint32_t * p = (const uint32_t *)p0; -+ uint8_t * d = dst; -+ -+ if (xskip0 != 0) { -+ const uint32_t p3 = *p++; -+ -+ if (xskip0 == 1) -+ *d++ = (p3 >> 12) & 0xff; -+ *d++ = (p3 >> 22) & 0xff; -+ -+ if (((x += 4) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ while (x != x1) { -+ const uint32_t p3 = *p++; -+ *d++ = (p3 >> 2) & 0xff; -+ *d++ = (p3 >> 12) & 0xff; -+ *d++ = (p3 >> 22) & 0xff; -+ -+ if (((x += 4) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ if (xrem1 != 0) { -+ const uint32_t p3 = *p; -+ -+ *d++ = (p3 >> 2) & 0xff; -+ if (xrem1 == 2) -+ *d++ = (p3 >> 12) & 0xff; -+ } -+ } -+} -+ -+ -+ -+// w/h in pixels -+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, -+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, -+ unsigned int w, unsigned int h, const unsigned int shr) -+{ -+ const unsigned int n = dst_stride1 / 2; -+ unsigned int j; -+ -+ // This is true for our current layouts -+ av_assert0(dst_stride1 == src_stride1); -+ -+ // As we have the same stride1 for src & dest and src is wider than dest -+ // then if we loop on src we can always write contiguously to dest -+ // We make no effort to copy an exact width - round up to nearest src stripe -+ // as we will always have storage in dest for that -+ -+#if ARCH_ARM && HAVE_NEON -+ if (shr == 3 && src_stride1 == 128) { -+ for (j = 0; j + n < w; j += dst_stride1) { -+ uint8_t * d = dst + j * dst_stride2; -+ const uint8_t * s1 = src + j * 2 * src_stride2; -+ const uint8_t * s2 = s1 + src_stride1 * src_stride2; -+ -+ ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h); -+ } -+ } -+ else -+#endif -+ { -+ 
for (j = 0; j + n < w; j += dst_stride1) { -+ uint8_t * d = dst + j * dst_stride2; -+ const uint8_t * s1 = src + j * 2 * src_stride2; -+ const uint8_t * s2 = s1 + src_stride1 * src_stride2; -+ -+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { -+ cpy16_to_8(d, s1, n, shr); -+ cpy16_to_8(d + n, s2, n, shr); -+ } -+ } -+ } -+ -+ // Fix up a trailing dest half stripe -+ if (j < w) { -+ uint8_t * d = dst + j * dst_stride2; -+ const uint8_t * s1 = src + j * 2 * src_stride2; -+ -+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { -+ cpy16_to_8(d, s1, n, shr); -+ } -+ } -+} -+ -+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) -+{ -+ const int w = av_frame_cropped_width(src); -+ const int h = av_frame_cropped_height(src); -+ const int x = src->crop_left; -+ const int y = src->crop_top; -+ -+ // We will crop as part of the conversion -+ dst->crop_top = 0; -+ dst->crop_left = 0; -+ dst->crop_bottom = 0; -+ dst->crop_right = 0; -+ -+ switch (src->format){ -+ case AV_PIX_FMT_SAND128: -+ case AV_PIX_FMT_RPI4_8: -+ switch (dst->format){ -+ case AV_PIX_FMT_YUV420P: -+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y, w, h); -+ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x/2, y/2, w/2, h/2); -+ break; -+ case AV_PIX_FMT_NV12: -+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y, w, h); -+ av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x/2, y/2, w, h/2); -+ break; -+ default: -+ return -1; -+ } -+ break; -+ case AV_PIX_FMT_SAND64_10: -+ switch 
(dst->format){ -+ case AV_PIX_FMT_YUV420P10: -+ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x*2, y, w*2, h); -+ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y/2, w, h/2); -+ break; -+ default: -+ return -1; -+ } -+ break; -+ case AV_PIX_FMT_RPI4_10: -+ switch (dst->format){ -+ case AV_PIX_FMT_YUV420P10: -+ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y, w, h); -+ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x/2, y/2, w/2, h/2); -+ break; -+ case AV_PIX_FMT_NV12: -+ av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y, w, h); -+ av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x/2, y/2, w, h/2); -+ break; -+ default: -+ return -1; -+ } -+ break; -+ default: -+ return -1; -+ } -+ -+ return av_frame_copy_props(dst, src); -+} -diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h -new file mode 100644 -index 0000000000..462ccb8abd ---- /dev/null -+++ b/libavutil/rpi_sand_fns.h -@@ -0,0 +1,188 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. 
-+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: John Cox -+*/ -+ -+#ifndef AVUTIL_RPI_SAND_FNS -+#define AVUTIL_RPI_SAND_FNS -+ -+#include "libavutil/frame.h" -+ -+// For all these fns _x & _w are measured as coord * PW -+// For the C fns coords are in chroma pels (so luma / 2) -+// Strides are in bytes -+ -+void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void av_rpi_planar_to_sand_c8(uint8_t * dst_c, -+ unsigned int stride1, unsigned int stride2, -+ const uint8_t * src_u, const unsigned int src_stride_u, -+ const uint8_t * src_v, const unsigned int src_stride_v, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_planar_to_sand_c16(uint8_t * dst_c, -+ unsigned int stride1, unsigned int stride2, -+ const uint8_t * src_u, const unsigned int src_stride_u, -+ const uint8_t * src_v, const unsigned int src_stride_v, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, 
unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+// w/h in pixels -+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, -+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, -+ unsigned int w, unsigned int h, const unsigned int shr); -+ -+ -+// dst must contain required pixel format & allocated data buffers -+// Cropping on the src buffer will be honoured and dst crop will be set to zero -+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src); -+ -+ -+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) -+{ -+#ifdef RPI_ZC_SAND128_ONLY -+ // If we are sure we only only support 128 byte sand formats replace the -+ // var with a constant which should allow for better optimisation -+ return 128; -+#else -+ return frame->linesize[0]; -+#endif -+} -+ -+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame) -+{ -+ return frame->linesize[3]; -+} -+ -+ -+static inline int av_rpi_is_sand_format(const int format) -+{ -+ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10); -+} -+ -+static inline int av_rpi_is_sand_frame(const AVFrame * const frame) -+{ -+ return av_rpi_is_sand_format(frame->format); -+} -+ -+static inline int av_rpi_is_sand8_frame(const AVFrame * const frame) -+{ -+ return (frame->format == AV_PIX_FMT_SAND128 || 
frame->format == AV_PIX_FMT_RPI4_8); -+} -+ -+static inline int av_rpi_is_sand16_frame(const AVFrame * const frame) -+{ -+ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16); -+} -+ -+static inline int av_rpi_is_sand30_frame(const AVFrame * const frame) -+{ -+ return (frame->format == AV_PIX_FMT_RPI4_10); -+} -+ -+static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame) -+{ -+ return av_rpi_is_sand8_frame(frame) ? 0 : 1; -+} -+ -+// If x is measured in bytes (not pixels) then this works for sand64_16 as -+// well as sand128 - but in the general case we work that out -+ -+static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y) -+{ -+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); -+ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame); -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y + stride2 * x2; -+} -+ -+static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) -+{ -+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); -+ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1); -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y_c + stride2 * x2; -+} -+ -+static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y); -+} -+ -+static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y); -+} -+ -+#endif -+ +From 
2443e873ef3a82087fad1d392a26bf1d8a1cc9d5 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 26 Apr 2021 12:34:50 +0100 +Subject: [PATCH 001/113] Add pi configs and scripts + +--- + pi-util/BUILD.txt | 59 ++++++++ + pi-util/NOTES.txt | 69 +++++++++ + pi-util/TESTMESA.txt | 82 +++++++++++ + pi-util/clean_usr_libs.sh | 26 ++++ + pi-util/conf_arm64_native.sh | 45 ++++++ + pi-util/conf_h265.2016.csv | 195 ++++++++++++++++++++++++++ + pi-util/conf_h265.2016_HEVC_v1.csv | 147 ++++++++++++++++++++ + pi-util/conf_h265.csv | 144 +++++++++++++++++++ + pi-util/conf_native.sh | 108 +++++++++++++++ + pi-util/ffconf.py | 215 +++++++++++++++++++++++++++++ + pi-util/ffperf.py | 128 +++++++++++++++++ + pi-util/genpatch.sh | 35 +++++ + pi-util/make_array.py | 23 +++ + pi-util/mkinst.sh | 5 + + pi-util/patkodi.sh | 9 ++ + pi-util/perfcmp.py | 101 ++++++++++++++ + pi-util/qem.sh | 9 ++ + pi-util/v3dusage.py | 128 +++++++++++++++++ + 18 files changed, 1528 insertions(+) + create mode 100644 pi-util/BUILD.txt + create mode 100644 pi-util/NOTES.txt + create mode 100644 pi-util/TESTMESA.txt + create mode 100755 pi-util/clean_usr_libs.sh + create mode 100644 pi-util/conf_arm64_native.sh + create mode 100644 pi-util/conf_h265.2016.csv + create mode 100644 pi-util/conf_h265.2016_HEVC_v1.csv + create mode 100644 pi-util/conf_h265.csv + create mode 100755 pi-util/conf_native.sh + create mode 100755 pi-util/ffconf.py + create mode 100755 pi-util/ffperf.py + create mode 100755 pi-util/genpatch.sh + create mode 100755 pi-util/make_array.py + create mode 100755 pi-util/mkinst.sh + create mode 100644 pi-util/patkodi.sh + create mode 100755 pi-util/perfcmp.py + create mode 100755 pi-util/qem.sh + create mode 100755 pi-util/v3dusage.py + diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt new file mode 100644 index 0000000000..b050971f63 @@ -68847,10 +859,10 @@ index 0000000000..fc14f2a3c2 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git 
a/pi-util/conf_native.sh b/pi-util/conf_native.sh new file mode 100755 -index 0000000000..a9e053801c +index 0000000000..65576846e8 --- /dev/null +++ b/pi-util/conf_native.sh -@@ -0,0 +1,107 @@ +@@ -0,0 +1,108 @@ +echo "Configure for native build" + +FFSRC=`pwd` @@ -68944,9 +956,10 @@ index 0000000000..a9e053801c + --disable-thumb\ + --enable-v4l2-request\ + --enable-libdrm\ ++ --enable-epoxy\ ++ --enable-libudev\ + --enable-vout-egl\ + --enable-vout-drm\ -+ --enable-gpl\ + $SHARED_LIBS\ + $RPIOPTS\ + --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ @@ -69531,95 +1544,6 @@ index 0000000000..a4dbb6eacd +$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c +$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h + -diff --git a/pi-util/testfilt.py b/pi-util/testfilt.py -new file mode 100755 -index 0000000000..b322dac0c2 ---- /dev/null -+++ b/pi-util/testfilt.py -@@ -0,0 +1,83 @@ -+#!/usr/bin/env python3 -+ -+import string -+import os -+import subprocess -+import re -+import argparse -+import sys -+import csv -+from stat import * -+ -+class validator: -+ def __init__(self): -+ self.ok = False -+ -+ def isok(self): -+ return self.ok -+ -+ def setok(self): -+ self.ok = True -+ -+class valid_regex(validator): -+ def __init__(self, regex): -+ super().__init__() -+ self.regex = re.compile(regex) -+ -+ def scanline(self, line): -+ if self.isok() or self.regex.search(line): -+ self.setok() -+ -+ -+def validate(validators, flog): -+ for line in flog: -+ for v in validators: -+ v.scanline(line) -+ -+ ok = True -+ for v in validators: -+ if not v.isok(): -+ ok = False -+ # complain -+ print("Test failed") -+ -+ if ok: -+ print("OK") -+ return ok -+ -+def runtest(name, ffmpeg, args, suffix, validators): -+ log_root = os.path.join("/tmp", "testfilt", name) -+ ofilename = os.path.join(log_root, name + suffix) -+ -+ if not os.path.exists(log_root): -+ os.makedirs(log_root) -+ -+ try: -+ os.remove(ofilename) -+ except: -+ 
pass -+ -+ flog = open(os.path.join(log_root, name + ".log"), "wb") -+ ffargs = [ffmpeg] + args + [ofilename] -+ -+ subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT, text=False) -+ flog.close -+ -+ flog = open(os.path.join(log_root, name + ".log"), "rt") -+ return validate(validators, flog) -+ -+def sayok(log_root, flog): -+ print("Woohoo") -+ return True -+ -+if __name__ == '__main__': -+ -+ argp = argparse.ArgumentParser(description="FFmpeg filter tester") -+ argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name") -+ args = argp.parse_args() -+ -+ runtest("ATest", args.ffmpeg, ["-v", "verbose", "-no_cvt_hw", "-an", "-c:v", "h264_v4l2m2m", "-i", -+ "/home/johncox/server/TestMedia/Sony/jellyfish-10-mbps-hd-h264.mkv", -+# "/home/jc/rpi/streams/jellyfish-3-mbps-hd-h264.mkv", -+ "-c:v", "h264_v4l2m2m", "-b:v", "2M"], ".mkv", -+ [valid_regex(r'Output stream #0:0 \(video\): 900 frames encoded; 900 packets muxed')]) diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py new file mode 100755 index 0000000000..5935a11ca5 @@ -69754,644 +1678,30381 @@ index 0000000000..5935a11ca5 + + do_logparse(args.logfile) + -diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile -index 1827a4e134..08da4166ef 100644 ---- a/tests/checkasm/Makefile -+++ b/tests/checkasm/Makefile -@@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP) += g722dsp.o - AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o - AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o - AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o -+AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o - AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o - AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o -+AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o - AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o - AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o + +From bde822d2612b59911b0eb44409c8815aa4ff1fef Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 27 Apr 2021 11:27:39 +0100 +Subject: [PATCH 002/113] Add sand pix fmts & conversion fns + +--- + 
configure | 3 + + libavutil/Makefile | 3 + + libavutil/arm/Makefile | 1 + + libavutil/arm/rpi_sand_neon.S | 768 ++++++++++++++++++++++++++++++++++ + libavutil/arm/rpi_sand_neon.h | 99 +++++ + libavutil/pixdesc.c | 44 ++ + libavutil/pixfmt.h | 6 + + libavutil/rpi_sand_fn_pw.h | 227 ++++++++++ + libavutil/rpi_sand_fns.c | 353 ++++++++++++++++ + libavutil/rpi_sand_fns.h | 183 ++++++++ + 10 files changed, 1687 insertions(+) + create mode 100644 libavutil/arm/rpi_sand_neon.S + create mode 100644 libavutil/arm/rpi_sand_neon.h + create mode 100644 libavutil/rpi_sand_fn_pw.h + create mode 100644 libavutil/rpi_sand_fns.c + create mode 100644 libavutil/rpi_sand_fns.h + +diff --git a/configure b/configure +index ba5793b2ff..a0213c039a 100755 +--- a/configure ++++ b/configure +@@ -343,6 +343,7 @@ External library support: + --enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no] + --enable-libnpp enable Nvidia Performance Primitives-based code [no] + --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] ++ --enable-sand enable sand video formats [rpi] + --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] + --disable-nvenc disable Nvidia video encoding code [autodetect] + --enable-omx enable OpenMAX IL code [no] +@@ -1941,6 +1942,7 @@ FEATURE_LIST=" + omx_rpi + runtime_cpudetect + safe_bitstream_reader ++ sand + shared + small + static +@@ -2501,6 +2503,7 @@ CONFIG_EXTRA=" + rtpdec + rtpenc_chain + rv34dsp ++ sand + scene_sad + sinewin + snappy +diff --git a/libavutil/Makefile b/libavutil/Makefile +index 9435a0bfb0..a2dc31c198 100644 +--- a/libavutil/Makefile ++++ b/libavutil/Makefile +@@ -72,6 +72,7 @@ HEADERS = adler32.h \ + rational.h \ + replaygain.h \ + ripemd.h \ ++ rpi_sand_fns.h \ + samplefmt.h \ + sha.h \ + sha512.h \ +@@ -191,6 +192,7 @@ OBJS-$(CONFIG_MACOS_KPERF) += macos_kperf.o + OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o + OBJS-$(CONFIG_OPENCL) += 
hwcontext_opencl.o + OBJS-$(CONFIG_QSV) += hwcontext_qsv.o ++OBJS-$(CONFIG_SAND) += rpi_sand_fns.o + OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o + OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o + OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o +@@ -211,6 +213,7 @@ SKIPHEADERS-$(CONFIG_D3D11VA) += hwcontext_d3d11va.h + SKIPHEADERS-$(CONFIG_DXVA2) += hwcontext_dxva2.h + SKIPHEADERS-$(CONFIG_QSV) += hwcontext_qsv.h + SKIPHEADERS-$(CONFIG_OPENCL) += hwcontext_opencl.h ++SKIPHEADERS-$(CONFIG-RPI) += rpi_sand_fn_pw.h + SKIPHEADERS-$(CONFIG_VAAPI) += hwcontext_vaapi.h + SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.h + SKIPHEADERS-$(CONFIG_VDPAU) += hwcontext_vdpau.h +diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile +index 5da44b0542..b74b7c4e2f 100644 +--- a/libavutil/arm/Makefile ++++ b/libavutil/arm/Makefile +@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ -diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c -index 8338e8ff58..81ef182f04 100644 ---- a/tests/checkasm/checkasm.c -+++ b/tests/checkasm/checkasm.c -@@ -131,6 +131,9 @@ static const struct { - #if CONFIG_HUFFYUV_DECODER - { "huffyuvdsp", checkasm_check_huffyuvdsp }, - #endif -+ #if CONFIG_IDCTDSP -+ { "idctdsp", checkasm_check_idctdsp }, -+ #endif - #if CONFIG_JPEG2000_DECODER - { "jpeg2000dsp", checkasm_check_jpeg2000dsp }, - #endif -@@ -155,6 +158,9 @@ static const struct { - #if CONFIG_V210_ENCODER - { "v210enc", checkasm_check_v210enc }, - #endif -+ #if CONFIG_VC1DSP -+ { "vc1dsp", checkasm_check_vc1dsp }, -+ #endif - #if CONFIG_VP8DSP - { "vp8dsp", checkasm_check_vp8dsp }, - #endif -diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h -index ef6645e3a2..1a1e17d835 100644 ---- a/tests/checkasm/checkasm.h -+++ b/tests/checkasm/checkasm.h -@@ -70,6 +70,7 @@ void checkasm_check_hevc_epel_bi(void); - void checkasm_check_hevc_epel_bi_w(void); - void checkasm_check_hevc_sao(void); - void checkasm_check_huffyuvdsp(void); -+void 
checkasm_check_idctdsp(void); - void checkasm_check_jpeg2000dsp(void); - void checkasm_check_llviddsp(void); - void checkasm_check_llviddspenc(void); -@@ -83,6 +84,7 @@ void checkasm_check_sw_scale(void); - void checkasm_check_utvideodsp(void); - void checkasm_check_v210dec(void); - void checkasm_check_v210enc(void); -+void checkasm_check_vc1dsp(void); - void checkasm_check_vf_eq(void); - void checkasm_check_vf_gblur(void); - void checkasm_check_vf_hflip(void); -diff --git a/tests/checkasm/idctdsp.c b/tests/checkasm/idctdsp.c + NEON-OBJS += arm/float_dsp_init_neon.o \ + arm/float_dsp_neon.o \ ++ arm/rpi_sand_neon.o \ +diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S new file mode 100644 -index 0000000000..02724536a7 +index 0000000000..80890fe985 --- /dev/null -+++ b/tests/checkasm/idctdsp.c -@@ -0,0 +1,98 @@ ++++ b/libavutil/arm/rpi_sand_neon.S +@@ -0,0 +1,768 @@ +/* -+ * Copyright (c) 2022 Ben Avison -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License along -+ * with FFmpeg; if not, write to the Free Software Foundation, Inc., -+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ */ ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. 
+ -+#include ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. + -+#include "checkasm.h" ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ -+#include "libavcodec/idctdsp.h" ++Authors: John Cox ++*/ + -+#include "libavutil/common.h" -+#include "libavutil/internal.h" -+#include "libavutil/intreadwrite.h" -+#include "libavutil/mem_internal.h" ++#include "libavutil/arm/asm.S" + -+#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) }, + -+typedef struct { -+ const char *name; -+ size_t offset; -+} test; ++@ General notes: ++@ Having done some timing on this in sand8->y8 (Pi4) ++@ vst1 (680fps) is a bit faster than vstm (660fps) ++@ vldm (680fps) is noticably faster than vld1 (480fps) ++@ (or it might be that a mix is what is required) ++@ ++@ At least on a Pi4 it is no more expensive to have a single auto-inc register ++@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted ++@ the latter was better) ++@ ++@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless ++@ the memory is uncached. ++@ As these are Sand -> planar we can assume that src is going to be aligned but ++@ it is possible that dest isn't (converting to .yuv or other packed format). ++@ Luckily vst1 is faster than vstm :-) so all is well ++@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4 ++@ .8 stores would let us do non-word aligned stores into uncached but it ++@ probably isn't worth it. 
+ -+#define RANDOMIZE_BUFFER16(name, size) \ -+ do { \ -+ int i; \ -+ for (i = 0; i < size; ++i) { \ -+ uint16_t r = rnd() % 0x201 - 0x100; \ -+ AV_WN16A(name##0 + i, r); \ -+ AV_WN16A(name##1 + i, r); \ -+ } \ -+ } while (0) + -+#define RANDOMIZE_BUFFER8(name, size) \ -+ do { \ -+ int i; \ -+ for (i = 0; i < size; ++i) { \ -+ uint8_t r = rnd(); \ -+ name##0[i] = r; \ -+ name##1[i] = r; \ -+ } \ -+ } while (0) + -+static void check_add_put_clamped(void) ++ ++@ void ff_rpi_sand128b_stripe_to_8_10( ++@ uint8_t * dest, // [r0] ++@ const uint8_t * src1, // [r1] ++@ const uint8_t * src2, // [r2] ++@ unsigned int lines); // [r3] ++ ++.macro stripe2_to_8, bit_depth ++ vpush {q4-q7} ++1: ++ vldm r1!, {q0-q7} ++ subs r3, #1 ++ vldm r2!, {q8-q15} ++ vqrshrn.u16 d0, q0, #\bit_depth - 8 ++ vqrshrn.u16 d1, q1, #\bit_depth - 8 ++ vqrshrn.u16 d2, q2, #\bit_depth - 8 ++ vqrshrn.u16 d3, q3, #\bit_depth - 8 ++ vqrshrn.u16 d4, q4, #\bit_depth - 8 ++ vqrshrn.u16 d5, q5, #\bit_depth - 8 ++ vqrshrn.u16 d6, q6, #\bit_depth - 8 ++ vqrshrn.u16 d7, q7, #\bit_depth - 8 ++ vqrshrn.u16 d8, q8, #\bit_depth - 8 ++ vqrshrn.u16 d9, q9, #\bit_depth - 8 ++ vqrshrn.u16 d10, q10, #\bit_depth - 8 ++ vqrshrn.u16 d11, q11, #\bit_depth - 8 ++ vqrshrn.u16 d12, q12, #\bit_depth - 8 ++ vqrshrn.u16 d13, q13, #\bit_depth - 8 ++ vqrshrn.u16 d14, q14, #\bit_depth - 8 ++ vqrshrn.u16 d15, q15, #\bit_depth - 8 ++ vstm r0!, {q0-q7} ++ bne 1b ++ vpop {q4-q7} ++ bx lr ++.endm ++ ++function ff_rpi_sand128b_stripe_to_8_10, export=1 ++ stripe2_to_8 10 ++endfunc ++ ++@ void ff_rpi_sand8_lines_to_planar_y8( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting 
on a stripe boundary and that overreading ++@ within the stripe is OK. However it does respect the dest size for writing ++ ++function ff_rpi_sand8_lines_to_planar_y8, export=1 ++ push {r4-r8, lr} @ +24 L ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ lsl r3, #7 ++ sub r1, r6 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++ mov lr, #0 ++1: ++ vldm r2, {q8-q15} ++ add r2, r3 ++ subs r5, #128 ++ blt 2f ++ vst1.8 {d16, d17, d18, d19}, [r0]! ++ vst1.8 {d20, d21, d22, d23}, [r0]! ++ vst1.8 {d24, d25, d26, d27}, [r0]! ++ vst1.8 {d28, d29, d30, d31}, [r0]! ++ bne 1b ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #64-128 ++ blt 1f ++ vst1.8 {d16, d17, d18, d19}, [r0]! ++ vst1.8 {d20, d21, d22, d23}, [r0]! ++ beq 11b ++ vmov q8, q12 ++ vmov q9, q13 ++ sub r5, #64 ++ vmov q10, q14 ++ vmov q11, q15 ++1: ++ cmp r5, #32-128 ++ blt 1f ++ vst1.8 {d16, d17, d18, d19}, [r0]! ++ beq 11b ++ vmov q8, q10 ++ sub r5, #32 ++ vmov q9, q11 ++1: ++ cmp r5, #16-128 ++ blt 1f ++ vst1.8 {d16, d17}, [r0]! ++ beq 11b ++ sub r5, #16 ++ vmov q8, q9 ++1: ++ cmp r5, #8-128 ++ blt 1f ++ vst1.8 {d16}, [r0]! ++ beq 11b ++ sub r5, #8 ++ vmov d16, d17 ++1: ++ cmp r5, #4-128 ++ blt 1f ++ vst1.32 {d16[0]}, [r0]! ++ beq 11b ++ sub r5, #4 ++ vshr.u64 d16, #32 ++1: ++ cmp r5, #2-128 ++ blt 1f ++ vst1.16 {d16[0]}, [r0]! ++ beq 11b ++ vst1.8 {d16[2]}, [r0]! ++ b 11b ++1: ++ vst1.8 {d16[0]}, [r0]! 
++ b 11b ++endfunc ++ ++@ void ff_rpi_sand8_lines_to_planar_c8( ++@ uint8_t * dst_u, // [r0] ++@ unsigned int dst_stride_u, // [r1] ++@ uint8_t * dst_v, // [r2] ++@ unsigned int dst_stride_v, // [r3] ++@ const uint8_t * src, // [sp, #0] -> r4, r5 ++@ unsigned int stride1, // [sp, #4] 128 ++@ unsigned int stride2, // [sp, #8] -> r8 ++@ unsigned int _x, // [sp, #12] 0 ++@ unsigned int y, // [sp, #16] (r7 in prefix) ++@ unsigned int _w, // [sp, #20] -> r12, r6 ++@ unsigned int h); // [sp, #24] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. However it does respect the dest size for writing ++ ++function ff_rpi_sand8_lines_to_planar_c8, export=1 ++ push {r4-r8, lr} @ +24 ++ ++ ldr r5, [sp, #24] ++ ldr r8, [sp, #32] ++ ldr r7, [sp, #40] ++ ldr r6, [sp, #44] ++ lsl r8, #7 ++ add r5, r5, r7, lsl #7 ++ sub r1, r1, r6 ++ sub r3, r3, r6 ++ ldr r7, [sp, #48] ++ vpush {q4-q7} ++ ++10: ++ mov r4, r5 ++ mov r12, r6 ++1: ++ subs r12, #64 ++ vldm r4, {q0-q7} ++ add r4, r8 ++ it gt ++ vldmgt r4, {q8-q15} ++ add r4, r8 ++ ++ vuzp.8 q0, q1 ++ vuzp.8 q2, q3 ++ vuzp.8 q4, q5 ++ vuzp.8 q6, q7 ++ ++ vuzp.8 q8, q9 ++ vuzp.8 q10, q11 ++ vuzp.8 q12, q13 ++ vuzp.8 q14, q15 ++ subs r12, #64 ++ ++ @ Rearrange regs so we can use vst1 with 4 regs ++ vswp q1, q2 ++ vswp q5, q6 ++ vswp q9, q10 ++ vswp q13, q14 ++ blt 2f ++ ++ vst1.8 {d0, d1, d2, d3 }, [r0]! ++ vst1.8 {d8, d9, d10, d11}, [r0]! ++ vst1.8 {d16, d17, d18, d19}, [r0]! ++ vst1.8 {d24, d25, d26, d27}, [r0]! ++ ++ vst1.8 {d4, d5, d6, d7 }, [r2]! ++ vst1.8 {d12, d13, d14, d15}, [r2]! ++ vst1.8 {d20, d21, d22, d23}, [r2]! ++ vst1.8 {d28, d29, d30, d31}, [r2]! ++ bne 1b ++11: ++ subs r7, #1 ++ add r5, #128 ++ add r0, r1 ++ add r2, r3 ++ bne 10b ++ vpop {q4-q7} ++ pop {r4-r8,pc} ++ ++2: ++ cmp r12, #64-128 ++ blt 1f ++ vst1.8 {d0, d1, d2, d3 }, [r0]! ++ vst1.8 {d8, d9, d10, d11}, [r0]! ++ vst1.8 {d4, d5, d6, d7 }, [r2]! ++ vst1.8 {d12, d13, d14, d15}, [r2]! 
++ beq 11b ++ sub r12, #64 ++ vmov q0, q8 ++ vmov q1, q9 ++ vmov q2, q10 ++ vmov q3, q11 ++ vmov q4, q12 ++ vmov q5, q13 ++ vmov q6, q14 ++ vmov q7, q15 ++1: ++ cmp r12, #32-128 ++ blt 1f ++ vst1.8 {d0, d1, d2, d3 }, [r0]! ++ vst1.8 {d4, d5, d6, d7 }, [r2]! ++ beq 11b ++ sub r12, #32 ++ vmov q0, q4 ++ vmov q1, q5 ++ vmov q2, q6 ++ vmov q3, q7 ++1: ++ cmp r12, #16-128 ++ blt 1f ++ vst1.8 {d0, d1 }, [r0]! ++ vst1.8 {d4, d5 }, [r2]! ++ beq 11b ++ sub r12, #16 ++ vmov q0, q1 ++ vmov q2, q3 ++1: ++ cmp r12, #8-128 ++ blt 1f ++ vst1.8 {d0}, [r0]! ++ vst1.8 {d4}, [r2]! ++ beq 11b ++ sub r12, #8 ++ vmov d0, d1 ++ vmov d4, d5 ++1: ++ cmp r12, #4-128 ++ blt 1f ++ vst1.32 {d0[0]}, [r0]! ++ vst1.32 {d4[0]}, [r2]! ++ beq 11b ++ sub r12, #4 ++ vmov s0, s1 ++ vmov s8, s9 ++1: ++ cmp r12, #2-128 ++ blt 1f ++ vst1.16 {d0[0]}, [r0]! ++ vst1.16 {d4[0]}, [r2]! ++ beq 11b ++ vst1.8 {d0[2]}, [r0]! ++ vst1.8 {d4[2]}, [r2]! ++ b 11b ++1: ++ vst1.8 {d0[0]}, [r0]! ++ vst1.8 {d4[0]}, [r2]! ++ b 11b ++endfunc ++ ++ ++ ++@ void ff_rpi_sand30_lines_to_planar_y16( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. 
However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_y16, export=1 ++ push {r4-r8, lr} @ +24 ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ mov r12, #48 ++ vmov.u16 q15, #0x3ff ++ sub r3, #1 ++ lsl r3, #7 ++ sub r1, r1, r6, lsl #1 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++ mov lr, #0 ++1: ++ vldm r2!, {q10-q13} ++ add lr, #64 ++ ++ vshr.u32 q14, q10, #20 @ Cannot vshrn.u32 #20! ++ ands lr, #127 ++ vshrn.u32 d2, q10, #10 ++ vmovn.u32 d0, q10 ++ vmovn.u32 d4, q14 ++ ++ vshr.u32 q14, q11, #20 ++ it eq ++ addeq r2, r3 ++ vshrn.u32 d3, q11, #10 ++ vmovn.u32 d1, q11 ++ vmovn.u32 d5, q14 ++ ++ subs r5, #48 ++ vand q0, q15 ++ vand q1, q15 ++ vand q2, q15 ++ ++ vshr.u32 q14, q12, #20 ++ vshrn.u32 d18, q12, #10 ++ vmovn.u32 d16, q12 ++ vmovn.u32 d20, q14 ++ ++ vshr.u32 q14, q13, #20 ++ vshrn.u32 d19, q13, #10 ++ vmovn.u32 d17, q13 ++ vmovn.u32 d21, q14 ++ ++ vand q8, q15 ++ vand q9, q15 ++ vand q10, q15 ++ blt 2f ++ ++ vst3.16 {d0, d2, d4}, [r0], r12 ++ vst3.16 {d1, d3, d5}, [r4], r12 ++ vst3.16 {d16, d18, d20}, [r0], r12 ++ vst3.16 {d17, d19, d21}, [r4], r12 ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #24-48 ++ blt 1f ++ vst3.16 {d0, d2, d4}, [r0], r12 ++ vst3.16 {d1, d3, d5}, [r4] ++ beq 11b ++ vmov q0, q8 ++ sub r5, #24 ++ vmov q1, q9 ++ vmov q2, q10 ++1: ++ cmp r5, #12-48 ++ blt 1f ++ vst3.16 {d0, d2, d4}, [r0]! ++ beq 11b ++ vmov d0, d1 ++ sub r5, #12 ++ vmov d2, d3 ++ vmov d4, d5 ++1: ++ cmp r5, #6-48 ++ add r4, r0, #6 @ avoid [r0]! on sequential instructions ++ blt 1f ++ vst3.16 {d0[0], d2[0], d4[0]}, [r0] ++ vst3.16 {d0[1], d2[1], d4[1]}, [r4] ++ add r0, #12 ++ beq 11b ++ vmov s0, s1 ++ sub r5, #6 ++ vmov s4, s5 ++ vmov s8, s9 ++1: ++ cmp r5, #3-48 ++ blt 1f ++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]! 
++ beq 11b ++ sub r5, #3 ++ vshr.u32 d0, #16 ++ vshr.u32 d2, #16 ++1: ++ cmp r5, #2-48 ++ blt 1f ++ vst2.16 {d0[0], d2[0]}, [r0]! ++ b 11b ++1: ++ vst1.16 {d0[0]}, [r0]! ++ b 11b ++ ++endfunc ++ ++ ++@ void ff_rpi_sand30_lines_to_planar_c16( ++@ uint8_t * dst_u, // [r0] ++@ unsigned int dst_stride_u, // [r1] ++@ uint8_t * dst_v, // [r2] ++@ unsigned int dst_stride_v, // [r3] ++@ const uint8_t * src, // [sp, #0] -> r4, r5 ++@ unsigned int stride1, // [sp, #4] 128 ++@ unsigned int stride2, // [sp, #8] -> r8 ++@ unsigned int _x, // [sp, #12] 0 ++@ unsigned int y, // [sp, #16] (r7 in prefix) ++@ unsigned int _w, // [sp, #20] -> r6, r9 ++@ unsigned int h); // [sp, #24] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_c16, export=1 ++ push {r4-r10, lr} @ +32 ++ ldr r5, [sp, #32] ++ ldr r8, [sp, #40] ++ ldr r7, [sp, #48] ++ ldr r9, [sp, #52] ++ mov r12, #48 ++ vmov.u16 q15, #0x3ff ++ sub r8, #1 ++ lsl r8, #7 ++ add r5, r5, r7, lsl #7 ++ sub r1, r1, r9, lsl #1 ++ sub r3, r3, r9, lsl #1 ++ ldr r7, [sp, #56] ++10: ++ mov lr, #0 ++ mov r4, r5 ++ mov r6, r9 ++1: ++ vldm r4!, {q0-q3} ++ add lr, #64 ++ ++ @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2 ++ vshr.u32 q14, q0, #20 ++ vshrn.u32 d16, q0, #10 ++ vmovn.u32 d18, q0 ++ ands lr, #127 ++ vmovn.u32 d20, q14 ++ ++ vshr.u32 q14, q1, #20 ++ vshrn.u32 d17, q1, #10 ++ vmovn.u32 d19, q1 ++ vmovn.u32 d21, q14 ++ ++ vshr.u32 q14, q2, #20 ++ vshrn.u32 d22, q2, #10 ++ vmovn.u32 d24, q2 ++ vmovn.u32 d26, q14 ++ ++ vshr.u32 q14, q3, #20 ++ vshrn.u32 d23, q3, #10 ++ vmovn.u32 d25, q3 ++ add r10, r0, #24 ++ vmovn.u32 d27, q14 ++ ++ it eq ++ addeq r4, r8 ++ vuzp.16 q8, q11 ++ vuzp.16 q9, q12 ++ vuzp.16 q10, q13 ++ ++ @ q8 V0, V3,.. -> q0 ++ @ q9 U0, U3... ++ @ q10 U1, U4... ++ @ q11 U2, U5,.. ++ @ q12 V1, V4,.. -> q1 ++ @ q13 V2, V5,.. 
-> q2 ++ ++ subs r6, #24 ++ vand q11, q15 ++ vand q9, q15 ++ vand q10, q15 ++ vand q0, q8, q15 ++ vand q1, q12, q15 ++ vand q2, q13, q15 ++ ++ blt 2f ++ ++ vst3.16 {d18, d20, d22}, [r0], r12 ++ vst3.16 {d19, d21, d23}, [r10] ++ add r10, r2, #24 ++ vst3.16 {d0, d2, d4}, [r2], r12 ++ vst3.16 {d1, d3, d5}, [r10] ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r5, #128 ++ add r0, r1 ++ add r2, r3 ++ bne 10b ++ ++ pop {r4-r10, pc} ++ ++@ Partial final write ++2: ++ cmp r6, #-12 ++ blt 1f ++ vst3.16 {d18, d20, d22}, [r0]! ++ vst3.16 {d0, d2, d4}, [r2]! ++ beq 11b ++ vmov d18, d19 ++ vmov d20, d21 ++ vmov d22, d23 ++ sub r6, #12 ++ vmov d0, d1 ++ vmov d2, d3 ++ vmov d4, d5 ++1: ++ cmp r6, #-18 ++ @ Rezip here as it makes the remaining tail handling easier ++ vzip.16 d0, d18 ++ vzip.16 d2, d20 ++ vzip.16 d4, d22 ++ blt 1f ++ vst3.16 {d0[1], d2[1], d4[1]}, [r0]! ++ vst3.16 {d0[0], d2[0], d4[0]}, [r2]! ++ vst3.16 {d0[3], d2[3], d4[3]}, [r0]! ++ vst3.16 {d0[2], d2[2], d4[2]}, [r2]! ++ beq 11b ++ vmov d0, d18 ++ vmov d2, d20 ++ sub r6, #6 ++ vmov d4, d22 ++1: ++ cmp r6, #-21 ++ blt 1f ++ vst3.16 {d0[1], d2[1], d4[1]}, [r0]! ++ vst3.16 {d0[0], d2[0], d4[0]}, [r2]! ++ beq 11b ++ vmov s4, s5 ++ sub r6, #3 ++ vmov s0, s1 ++1: ++ cmp r6, #-22 ++ blt 1f ++ vst2.16 {d0[1], d2[1]}, [r0]! ++ vst2.16 {d0[0], d2[0]}, [r2]! ++ b 11b ++1: ++ vst1.16 {d0[1]}, [r0]! ++ vst1.16 {d0[0]}, [r2]! ++ b 11b ++ ++endfunc ++ ++@ void ff_rpi_sand30_lines_to_planar_p010( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. 
However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_p010, export=1 ++ push {r4-r8, lr} @ +24 ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ mov r12, #48 ++ vmov.u16 q15, #0xffc0 ++ sub r3, #1 ++ lsl r3, #7 ++ sub r1, r1, r6, lsl #1 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++ mov lr, #0 ++1: ++ vldm r2!, {q10-q13} ++ add lr, #64 ++ ++ vshl.u32 q14, q10, #6 ++ ands lr, #127 ++ vshrn.u32 d4, q10, #14 ++ vshrn.u32 d2, q10, #4 ++ vmovn.u32 d0, q14 ++ ++ vshl.u32 q14, q11, #6 ++ it eq ++ addeq r2, r3 ++ vshrn.u32 d5, q11, #14 ++ vshrn.u32 d3, q11, #4 ++ vmovn.u32 d1, q14 ++ ++ subs r5, #48 ++ vand q2, q15 ++ vand q1, q15 ++ vand q0, q15 ++ ++ vshl.u32 q14, q12, #6 ++ vshrn.u32 d20, q12, #14 ++ vshrn.u32 d18, q12, #4 ++ vmovn.u32 d16, q14 ++ ++ vshl.u32 q14, q13, #6 ++ vshrn.u32 d21, q13, #14 ++ vshrn.u32 d19, q13, #4 ++ vmovn.u32 d17, q14 ++ ++ vand q10, q15 ++ vand q9, q15 ++ vand q8, q15 ++ blt 2f ++ ++ vst3.16 {d0, d2, d4}, [r0], r12 ++ vst3.16 {d1, d3, d5}, [r4], r12 ++ vst3.16 {d16, d18, d20}, [r0], r12 ++ vst3.16 {d17, d19, d21}, [r4], r12 ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #24-48 ++ blt 1f ++ vst3.16 {d0, d2, d4}, [r0], r12 ++ vst3.16 {d1, d3, d5}, [r4] ++ beq 11b ++ vmov q0, q8 ++ sub r5, #24 ++ vmov q1, q9 ++ vmov q2, q10 ++1: ++ cmp r5, #12-48 ++ blt 1f ++ vst3.16 {d0, d2, d4}, [r0]! ++ beq 11b ++ vmov d0, d1 ++ sub r5, #12 ++ vmov d2, d3 ++ vmov d4, d5 ++1: ++ cmp r5, #6-48 ++ add r4, r0, #6 @ avoid [r0]! on sequential instructions ++ blt 1f ++ vst3.16 {d0[0], d2[0], d4[0]}, [r0] ++ vst3.16 {d0[1], d2[1], d4[1]}, [r4] ++ add r0, #12 ++ beq 11b ++ vmov s0, s1 ++ sub r5, #6 ++ vmov s4, s5 ++ vmov s8, s9 ++1: ++ cmp r5, #3-48 ++ blt 1f ++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]! 
++ beq 11b ++ sub r5, #3 ++ vshr.u32 d0, #16 ++ vshr.u32 d2, #16 ++1: ++ cmp r5, #2-48 ++ blt 1f ++ vst2.16 {d0[0], d2[0]}, [r0]! ++ b 11b ++1: ++ vst1.16 {d0[0]}, [r0]! ++ b 11b ++ ++endfunc ++ ++ ++ +diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h +new file mode 100644 +index 0000000000..447f367bea +--- /dev/null ++++ b/libavutil/arm/rpi_sand_neon.h +@@ -0,0 +1,99 @@ ++/* ++Copyright (c) 2020 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox ++*/ ++ ++#ifndef AVUTIL_ARM_SAND_NEON_H ++#define AVUTIL_ARM_SAND_NEON_H ++ ++void ff_rpi_sand128b_stripe_to_8_10( ++ uint8_t * dest, // [r0] ++ const uint8_t * src1, // [r1] ++ const uint8_t * src2, // [r2] ++ unsigned int lines); // [r3] ++ ++void ff_rpi_sand8_lines_to_planar_y8( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, #8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ ++void ff_rpi_sand8_lines_to_planar_c8( ++ uint8_t * dst_u, // [r0] ++ unsigned int dst_stride_u, // [r1] ++ uint8_t * dst_v, // [r2] ++ unsigned int dst_stride_v, // [r3] ++ const uint8_t * src, // [sp, #0] -> r4, r5 ++ unsigned int stride1, // [sp, #4] 128 ++ unsigned int stride2, // [sp, #8] -> r8 ++ unsigned int _x, // [sp, #12] 0 ++ unsigned int y, // [sp, #16] (r7 in prefix) ++ unsigned int _w, // [sp, #20] -> r12, r6 ++ unsigned int h); // [sp, #24] -> r7 ++ ++void ff_rpi_sand30_lines_to_planar_y16( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, #8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ ++void ff_rpi_sand30_lines_to_planar_c16( ++ uint8_t * dst_u, // [r0] ++ unsigned int dst_stride_u, // [r1] ++ uint8_t * dst_v, // [r2] ++ unsigned int dst_stride_v, // [r3] ++ const uint8_t * src, // [sp, #0] -> r4, r5 ++ unsigned int stride1, // [sp, #4] 128 ++ unsigned int stride2, // [sp, #8] -> r8 ++ unsigned int _x, // [sp, #12] 0 ++ unsigned int y, // [sp, #16] (r7 in prefix) ++ unsigned int _w, // [sp, #20] 
-> r6, r9 ++ unsigned int h); // [sp, #24] -> r7 ++ ++void ff_rpi_sand30_lines_to_planar_p010( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, #8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ ++#endif // AVUTIL_ARM_SAND_NEON_H ++ +diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c +index 6e57a82cb6..3b7b136ce3 100644 +--- a/libavutil/pixdesc.c ++++ b/libavutil/pixdesc.c +@@ -2491,6 +2491,50 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { + }, + .flags = AV_PIX_FMT_FLAG_PLANAR, + }, ++ [AV_PIX_FMT_SAND128] = { ++ .name = "sand128", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 1, 0, 0, 8 }, /* Y */ ++ { 1, 2, 0, 0, 8 }, /* U */ ++ { 1, 2, 1, 0, 8 }, /* V */ ++ }, ++ .flags = 0, ++ }, ++ [AV_PIX_FMT_SAND64_10] = { ++ .name = "sand64_10", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 2, 0, 0, 10 }, /* Y */ ++ { 1, 4, 0, 0, 10 }, /* U */ ++ { 1, 4, 2, 0, 10 }, /* V */ ++ }, ++ .flags = 0, ++ }, ++ [AV_PIX_FMT_SAND64_16] = { ++ .name = "sand64_16", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 2, 0, 0, 16 }, /* Y */ ++ { 1, 4, 0, 0, 16 }, /* U */ ++ { 1, 4, 2, 0, 16 }, /* V */ ++ }, ++ .flags = 0, ++ }, ++ [AV_PIX_FMT_RPI4_8] = { ++ .name = "rpi4_8", ++ .flags = AV_PIX_FMT_FLAG_HWACCEL, ++ }, ++ [AV_PIX_FMT_RPI4_10] = { ++ .name = "rpi4_10", ++ .flags = AV_PIX_FMT_FLAG_HWACCEL, ++ }, + }; + + static const char * const color_range_names[] = { +diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h +index 2d3927cc3f..b0dae0fe83 100644 +--- a/libavutil/pixfmt.h ++++ b/libavutil/pixfmt.h +@@ -349,6 +349,12 @@ enum AVPixelFormat { + + 
AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian + AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian ++// RPI - not on ifdef so can be got at by calling progs ++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_RPI4_8, ++ AV_PIX_FMT_RPI4_10, + + AV_PIX_FMT_X2RGB10LE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), little-endian, X=unused/undefined + AV_PIX_FMT_X2RGB10BE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), big-endian, X=unused/undefined +diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h +new file mode 100644 +index 0000000000..0324f6826d +--- /dev/null ++++ b/libavutil/rpi_sand_fn_pw.h +@@ -0,0 +1,227 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. 
++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox ++*/ ++ ++// * Included twice from rpi_sand_fn with different PW ++ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// unclipped ++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) +{ -+ /* Source buffers are only as big as needed, since any over-read won't affect results */ -+ LOCAL_ALIGNED_16(int16_t, src0, [64]); -+ LOCAL_ALIGNED_16(int16_t, src1, [64]); -+ /* Destination buffers have borders of one row above/below and 8 columns left/right to catch overflows */ -+ LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]); -+ LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]); ++ const unsigned int x = _x; ++ const unsigned int w = _w; ++ const unsigned int mask = stride1 - 1; + -+ AVCodecContext avctx = { 0 }; -+ IDCTDSPContext h; ++#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64) ++ if 
(_x == 0) { ++ ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride, ++ src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif + -+ const test tests[] = { -+ IDCTDSP_TEST(add_pixels_clamped) -+ IDCTDSP_TEST(put_pixels_clamped) -+ IDCTDSP_TEST(put_signed_pixels_clamped) -+ }; ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { ++ memcpy(dst, p, w); ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); + -+ ff_idctdsp_init(&h, &avctx); -+ -+ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { -+ void (*func)(const int16_t *, uint8_t * ptrdiff_t) = *(void **)((intptr_t) &h + tests[t].offset); -+ if (check_func(func, "idctdsp.%s", tests[t].name)) { -+ declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *, uint8_t *, ptrdiff_t); -+ RANDOMIZE_BUFFER16(src, 64); -+ RANDOMIZE_BUFFER8(dst, 10 * 24); -+ call_ref(src0, dst0 + 24 + 8, 24); -+ call_new(src1, dst1 + 24 + 8, 24); -+ if (memcmp(dst0, dst1, 10 * 24)) -+ fail(); -+ bench_new(src1, dst1 + 24 + 8, 24); ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const uint8_t * p = p2; ++ uint8_t * d = dst; ++ memcpy(d, p1, w1); ++ d += w1; ++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { ++ memcpy(d, p, stride1); ++ } ++ memcpy(d, p, w3); + } + } +} + -+void checkasm_check_idctdsp(void) ++// x & w in bytes but not of interleave (i.e. 
offset = x*2 for U&V) ++ ++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) +{ -+ check_add_put_clamped(); -+ report("idctdsp"); ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ ++#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64) ++ if (_x == 0) { ++ ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v, ++ src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ const pixel * p = (const pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * p = (const pixel *)p1; ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; 
k < stride1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } +} -diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c ++ ++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) 
{ ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++} ++ ++ ++#undef pixel ++#undef STRCAT ++#undef FUNC ++ +diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c new file mode 100644 -index 0000000000..52628d15e4 +index 0000000000..ed0261b02f --- /dev/null -+++ b/tests/checkasm/vc1dsp.c -@@ -0,0 +1,452 @@ ++++ b/libavutil/rpi_sand_fns.c +@@ -0,0 +1,353 @@ +/* -+ * Copyright (c) 2022 Ben Avison ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox ++*/ ++ ++#include "config.h" ++#include ++#include ++#include "rpi_sand_fns.h" ++#include "avassert.h" ++#include "frame.h" ++ ++#if ARCH_ARM && HAVE_NEON ++#include "arm/rpi_sand_neon.h" ++#define HAVE_SAND_ASM 1 ++#else ++#define HAVE_SAND_ASM 0 ++#endif ++ ++#define PW 1 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#define PW 2 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#if 1 ++// Simple round ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ const unsigned int rnd = (1 << shr) >> 1; ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n) { ++ *dst++ = (*src++ + rnd) >> shr; ++ } ++} ++#else ++// Dithered variation ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ unsigned int rnd = (1 << shr) >> 1; ++ const unsigned int mask = ((1 << shr) - 1); ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n) { ++ rnd = *src++ + (rnd & mask); ++ *dst++ = rnd >> shr; ++ } ++} ++#endif ++ ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// unclipped ++// _x & _w in pixels, strides in bytes ++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x0 = 
(_x / 3) * 4; // Byte offset of the word ++ const unsigned int xskip0 = _x - (x0 >> 2) * 3; ++ const unsigned int x1 = ((_x + _w) / 3) * 4; ++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; ++ const unsigned int mask = stride1 - 1; ++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; ++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words ++ ++#if HAVE_SAND_ASM ++ if (_x == 0) { ++ ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if (x0 == x1) { ++ // ******************* ++ // Partial single word xfer ++ return; ++ } ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) ++ { ++ unsigned int x = x0; ++ const uint32_t * p = (const uint32_t *)p0; ++ uint16_t * d = (uint16_t *)dst; ++ ++ if (xskip0 != 0) { ++ const uint32_t p3 = *p++; ++ ++ if (xskip0 == 1) ++ *d++ = (p3 >> 10) & 0x3ff; ++ *d++ = (p3 >> 20) & 0x3ff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ while (x != x1) { ++ const uint32_t p3 = *p++; ++ *d++ = p3 & 0x3ff; ++ *d++ = (p3 >> 10) & 0x3ff; ++ *d++ = (p3 >> 20) & 0x3ff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ if (xrem1 != 0) { ++ const uint32_t p3 = *p; ++ ++ *d++ = p3 & 0x3ff; ++ if (xrem1 == 2) ++ *d++ = (p3 >> 10) & 0x3ff; ++ } ++ } ++} ++ ++ ++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word ++ const unsigned int xskip0 = _x - (x0 >> 3) * 3; ++ const unsigned int x1 = ((_x + _w) / 3) * 8; ++ const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3; ++ const unsigned int mask = stride1 - 1; ++ const uint8_t * p0 = src + (x0 & mask) + y * 
stride1 + (x0 & ~mask) * stride2; ++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words ++ ++#if HAVE_SAND_ASM ++ if (_x == 0) { ++ ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v, ++ src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if (x0 == x1) { ++ // ******************* ++ // Partial single word xfer ++ return; ++ } ++ ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1) ++ { ++ unsigned int x = x0; ++ const uint32_t * p = (const uint32_t *)p0; ++ uint16_t * du = (uint16_t *)dst_u; ++ uint16_t * dv = (uint16_t *)dst_v; ++ ++ if (xskip0 != 0) { ++ const uint32_t p3a = *p++; ++ const uint32_t p3b = *p++; ++ ++ if (xskip0 == 1) ++ { ++ *du++ = (p3a >> 20) & 0x3ff; ++ *dv++ = (p3b >> 0) & 0x3ff; ++ } ++ *du++ = (p3b >> 10) & 0x3ff; ++ *dv++ = (p3b >> 20) & 0x3ff; ++ ++ if (((x += 8) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ while (x != x1) { ++ const uint32_t p3a = *p++; ++ const uint32_t p3b = *p++; ++ ++ *du++ = p3a & 0x3ff; ++ *dv++ = (p3a >> 10) & 0x3ff; ++ *du++ = (p3a >> 20) & 0x3ff; ++ *dv++ = p3b & 0x3ff; ++ *du++ = (p3b >> 10) & 0x3ff; ++ *dv++ = (p3b >> 20) & 0x3ff; ++ ++ if (((x += 8) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ if (xrem1 != 0) { ++ const uint32_t p3a = *p++; ++ const uint32_t p3b = *p++; ++ ++ *du++ = p3a & 0x3ff; ++ *dv++ = (p3a >> 10) & 0x3ff; ++ if (xrem1 == 2) ++ { ++ *du++ = (p3a >> 20) & 0x3ff; ++ *dv++ = p3b & 0x3ff; ++ } ++ } ++ } ++} ++ ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr) ++{ ++ const unsigned int n = dst_stride1 / 2; ++ unsigned int j; ++ ++ // This is true for our current layouts ++ av_assert0(dst_stride1 == src_stride1); ++ ++ // As we 
have the same stride1 for src & dest and src is wider than dest ++ // then if we loop on src we can always write contiguously to dest ++ // We make no effort to copy an exact width - round up to nearest src stripe ++ // as we will always have storage in dest for that ++ ++#if ARCH_ARM && HAVE_NEON ++ if (shr == 3 && src_stride1 == 128) { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h); ++ } ++ } ++ else ++#endif ++ { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ cpy16_to_8(d + n, s2, n, shr); ++ } ++ } ++ } ++ ++ // Fix up a trailing dest half stripe ++ if (j < w) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ } ++ } ++} ++ ++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) ++{ ++ const int w = av_frame_cropped_width(src); ++ const int h = av_frame_cropped_height(src); ++ const int x = src->crop_left; ++ const int y = src->crop_top; ++ ++ // We will crop as part of the conversion ++ dst->crop_top = 0; ++ dst->crop_left = 0; ++ dst->crop_bottom = 0; ++ dst->crop_right = 0; ++ ++ switch (src->format){ ++ case AV_PIX_FMT_SAND128: ++ case AV_PIX_FMT_RPI4_8: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P: ++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand_to_planar_c8(dst->data[1], 
dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w/2, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ case AV_PIX_FMT_SAND64_10: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P10: ++ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x*2, y, w*2, h); ++ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y/2, w, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ case AV_PIX_FMT_RPI4_10: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P10: ++ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w/2, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ default: ++ return -1; ++ } ++ ++ return av_frame_copy_props(dst, src); ++} +diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h +new file mode 100644 +index 0000000000..634b55e800 +--- /dev/null ++++ b/libavutil/rpi_sand_fns.h +@@ -0,0 +1,183 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. 
++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox ++*/ ++ ++#ifndef AVUTIL_RPI_SAND_FNS ++#define AVUTIL_RPI_SAND_FNS ++ ++#include "libavutil/frame.h" ++ ++// For all these fns _x & _w are measured as coord * PW ++// For the C fns coords are in chroma pels (so luma / 2) ++// Strides are in bytes ++ ++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_planar_to_sand_c8(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_planar_to_sand_c16(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, 
unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr); ++ ++ ++// dst must contain required pixel format & allocated data buffers ++// Cropping on the src buffer will be honoured and dst crop will be set to zero ++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src); ++ ++ ++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) ++{ ++#ifdef RPI_ZC_SAND128_ONLY ++ // If we are sure we only only support 128 byte sand formats replace the ++ // var with a constant which should allow for better optimisation ++ return 128; ++#else ++ return frame->linesize[0]; ++#endif ++} ++ ++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame) ++{ ++ return frame->linesize[3]; ++} ++ ++ ++static inline int av_rpi_is_sand_format(const int format) ++{ ++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10); ++} ++ ++static inline int av_rpi_is_sand_frame(const AVFrame * const frame) ++{ ++ return av_rpi_is_sand_format(frame->format); ++} ++ ++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame) ++{ ++ return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8); ++} ++ ++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame) ++{ ++ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16); ++} ++ ++static inline 
int av_rpi_is_sand30_frame(const AVFrame * const frame) ++{ ++ return (frame->format == AV_PIX_FMT_RPI4_10); ++} ++ ++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame) ++{ ++ return av_rpi_is_sand8_frame(frame) ? 0 : 1; ++} ++ ++// If x is measured in bytes (not pixels) then this works for sand64_16 as ++// well as sand128 - but in the general case we work that out ++ ++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y) ++{ ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame); ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y + stride2 * x2; ++} ++ ++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) ++{ ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1); ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y_c + stride2 * x2; ++} ++ ++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y); ++} ++ ++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y); ++} ++ ++#endif ++ + +From 3d994f09aa704c80cd2e0496c00f5767f88244cc Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 27 Apr 2021 11:36:47 +0100 +Subject: [PATCH 003/113] Add aarch64 asm sand conv functions + +Many thanks to eiler.mike@gmail.com (Michael Eiler) for these 
+optimizations +--- + libavutil/aarch64/Makefile | 4 +- + libavutil/aarch64/rpi_sand_neon.S | 676 ++++++++++++++++++++++++++++++ + libavutil/aarch64/rpi_sand_neon.h | 55 +++ + libavutil/rpi_sand_fn_pw.h | 4 +- + libavutil/rpi_sand_fns.c | 3 + + 5 files changed, 739 insertions(+), 3 deletions(-) + create mode 100644 libavutil/aarch64/rpi_sand_neon.S + create mode 100644 libavutil/aarch64/rpi_sand_neon.h + +diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile +index 5613813ba8..ab8bcfcf34 100644 +--- a/libavutil/aarch64/Makefile ++++ b/libavutil/aarch64/Makefile +@@ -1,4 +1,6 @@ + OBJS += aarch64/cpu.o \ + aarch64/float_dsp_init.o \ + +-NEON-OBJS += aarch64/float_dsp_neon.o ++NEON-OBJS += aarch64/float_dsp_neon.o \ ++ aarch64/rpi_sand_neon.o \ ++ +diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S +new file mode 100644 +index 0000000000..cdcf71ee67 +--- /dev/null ++++ b/libavutil/aarch64/rpi_sand_neon.S +@@ -0,0 +1,676 @@ ++/* ++Copyright (c) 2021 Michael Eiler ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: Michael Eiler ++*/ ++ ++#include "asm.S" ++ ++// void ff_rpi_sand8_lines_to_planar_y8( ++// uint8_t * dest, : x0 ++// unsigned int dst_stride, : w1 ++// const uint8_t * src, : x2 ++// unsigned int src_stride1, : w3, always 128 ++// unsigned int src_stride2, : w4 ++// unsigned int _x, : w5 ++// unsigned int y, : w6 ++// unsigned int _w, : w7 ++// unsigned int h); : [sp, #0] ++ ++function ff_rpi_sand8_lines_to_planar_y8, export=1 ++ // w15 contains the number of rows we need to process ++ ldr w15, [sp, #0] ++ ++ // w8 will contain the number of blocks per row ++ // w8 = floor(w1 / stride1); NOTE(review): w1 is dst_stride, not _w (w7) - confirm this is intended ++ // stride1 is assumed to always be 128 ++ mov w8, w1 ++ lsr w8, w8, #7 ++ ++ // in case the width of the image is not a multiple of 128, there will ++ // be an incomplete block at the end of every row ++ // w9 contains the number of pixels stored within this block ++ // w9 = _w - w8 * 128 ++ lsl w9, w8, #7 ++ sub w9, w7, w9 ++ ++ // this is the value we have to add to the src pointer after reading a complete block ++ // it will move the address to the start of the next block ++ // w10 = stride2 * stride1 - stride1 ++ mov w10, w4 ++ lsl w10, w10, #7 ++ sub w10, w10, #128 ++ ++ // w11 is the row offset, meaning the start offset of the first block of every column ++ // this will be increased by stride1 within every iteration of the row_loop ++ eor w11, w11, w11 ++ ++ // w12 = 0, processed row count ++ eor w12, w12, w12 ++row_loop: ++ // start of the
first block within the current row ++ // x13 = row offset + src ++ mov x13, x2 ++ add x13, x13, x11 ++ ++ // w14 = 0, processed block count ++ eor w14, w14, w14 ++ ++ cmp w8, #0 ++ beq no_main_y8 ++ ++block_loop: ++ // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128 ++ // fortunately these aren't callee saved ones, meaning we don't need to backup them ++ ld1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x13], #64 ++ ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64 ++ ++ // write these registers back to the destination vector and increase the dst address by 128 ++ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 ++ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], #64 ++ ++ // move the source register to the beginning of the next block (x13 = src + block offset) ++ add x13, x13, x10 ++ // increase the block counter ++ add w14, w14, #1 ++ ++ // continue with the block_loop if we haven't copied all full blocks yet ++ cmp w8, w14 ++ bgt block_loop ++ ++ // handle the last block at the end of each row ++ // at most 127 byte values copied from src to dst ++no_main_y8: ++ eor w5, w5, w5 // i = 0 ++incomplete_block_loop_y8: ++ cmp w5, w9 ++ bge incomplete_block_loop_end_y8 ++ ++ ldrb w6, [x13] ++ strb w6, [x0] ++ add x13, x13, #1 ++ add x0, x0, #1 ++ ++ add w5, w5, #1 ++ b incomplete_block_loop_y8 ++incomplete_block_loop_end_y8: ++ ++ ++ // increase the row offset by 128 (stride1) ++ add w11, w11, #128 ++ // increment the row counter ++ add w12, w12, #1 ++ ++ // process the next row if we haven't finished yet ++ cmp w15, w12 ++ bgt row_loop ++ ++ ret ++endfunc ++ ++ ++ ++// void ff_rpi_sand8_lines_to_planar_c8( ++// uint8_t * dst_u, : x0 ++// unsigned int dst_stride_u, : w1 == width ++// uint8_t * dst_v, : x2 ++// unsigned int dst_stride_v, : w3 == width ++// const uint8_t * src, : x4 ++// unsigned int stride1, : w5 == 128 ++// unsigned int stride2, : w6 ++// unsigned int _x, : w7 ++// unsigned int y, : [sp, #0] ++// unsigned int _w, : 
[sp, #8] ++// unsigned int h); : [sp, #16] ++ ++function ff_rpi_sand8_lines_to_planar_c8, export=1 ++ // w7 = width ++ ldr w7, [sp, #8] ++ ++ // w15 contains the number of rows we need to process ++ // counts down ++ ldr w15, [sp, #16] ++ ++ // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6 ++ mov w8, w7 ++ lsr w8, w8, #6 ++ ++ // number of pixels in block at the end of every row ++ // w9 = _w - (w8 * 64) ++ lsl w9, w8, #6 ++ sub w9, w7, w9 ++ ++ // Skip at the end of the line to account for stride ++ sub w12, w1, w7 ++ ++ // address delta to the beginning of the next block ++ // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128 ++ lsl w10, w6, #7 ++ sub w10, w10, #128 ++ ++ // w11 = row address start offset = 0 ++ eor w11, w11, w11 ++ ++row_loop_c8: ++ // start of the first block within the current row ++ // x13 = row offset + src ++ mov x13, x4 ++ add x13, x13, x11 ++ ++ // w14 = 0, processed block count ++ eor w14, w14, w14 ++ ++ cmp w8, #0 ++ beq no_main_c8 ++ ++block_loop_c8: ++ // load the full block -> 128 bytes, the block contains 64 interleaved U/V pairs (ld2 de-interleaves them) ++ ld2 { v0.16b, v1.16b }, [x13], #32 ++ ld2 { v2.16b, v3.16b }, [x13], #32 ++ ld2 { v4.16b, v5.16b }, [x13], #32 ++ ld2 { v6.16b, v7.16b }, [x13], #32 ++ ++ // reorder so v0-v3 hold the dst_u bytes and v4-v7 the dst_v bytes, letting each plane be written with a single st1 ++ mov v16.16b, v1.16b ++ mov v17.16b, v3.16b ++ mov v18.16b, v5.16b ++ mov v1.16b, v2.16b ++ mov v2.16b, v4.16b ++ mov v3.16b, v6.16b ++ mov v4.16b, v16.16b ++ mov v5.16b, v17.16b ++ mov v6.16b, v18.16b ++ ++ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 ++ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x2], #64 ++ ++ // increment block counter and move src to the beginning of the next block ++ add w14, w14, #1 ++ add x13, x13, x10 ++ ++ // jump to block_loop_c8 iff the block count is smaller than the number of full blocks ++ cmp w8, w14 ++ bgt block_loop_c8 ++ ++no_main_c8: ++ // handle incomplete block at the end of every row ++ eor w5, w5, w5
// w5 = 0: counts the U/V byte pairs copied from the incomplete block ++incomplete_block_loop_c8: ++ cmp w5, w9 ++ bge incomplete_block_loop_end_c8 ++ ++ ldrb w1, [x13] ++ strb w1, [x0] ++ add x13, x13, #1 ++ ++ ldrb w1, [x13] ++ strb w1, [x2] ++ add x13, x13, #1 ++ ++ add x0, x0, #1 ++ add x2, x2, #1 ++ ++ add w5, w5, #1 ++ b incomplete_block_loop_c8 ++incomplete_block_loop_end_c8: ++ ++ // increase row_offset by stride1 and advance both dst pointers by w12 ++ add w11, w11, #128 ++ add x0, x0, w12, sxtw ++ add x2, x2, w12, sxtw ++ ++ // decrement the remaining-row count (w15) and loop back to row_loop_c8 while it is still positive ++ subs w15, w15, #1 ++ bgt row_loop_c8 ++ ++ ret ++endfunc ++ ++//void ff_rpi_sand30_lines_to_planar_y16( ++// uint8_t * dest, // [x0] ++// unsigned int dst_stride, // [w1] -> assumed to be equal to _w ++// const uint8_t * src, // [x2] ++// unsigned int src_stride1, // [w3] -> 128 ++// unsigned int src_stride2, // [w4] ++// unsigned int _x, // [w5] ++// unsigned int y, // [w6] ++// unsigned int _w, // [w7] ++// unsigned int h); // [sp, #0] ++ ++function ff_rpi_sand30_lines_to_planar_y16, export=1 ++ stp x19, x20, [sp, #-48]!
++ stp x21, x22, [sp, #16] ++ stp x23, x24, [sp, #32] ++ ++ // w6 = argument h ++ ldr w6, [sp, #48] ++ ++ // slice_inc = ((stride2 - 1) * stride1) ++ mov w5, w4 ++ sub w5, w5, #1 ++ lsl w5, w5, #7 ++ ++ // total number of bytes per row = (width / 3) * 4 ++ mov w8, w7 ++ mov w9, #3 ++ udiv w8, w8, w9 ++ lsl w8, w8, #2 ++ ++ // number of full 128 byte blocks to be processed ++ mov w9, #96 ++ udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96 ++ ++ // w10 = number of full integers to process (4 bytes) ++ // w11 = remaning zero to two 10bit values still to copy over ++ mov w12, #96 ++ mul w12, w9, w12 ++ sub w12, w7, w12 // width - blocks*96 = remaining points per row ++ mov w11, #3 ++ udiv w10, w12, w11 // full integers to process = w12 / 3 ++ mul w11, w10, w11 // #integers *3 ++ sub w11, w12, w11 // remaining 0-2 points = remaining points - integers*3 ++ ++ // increase w9 by one if w10+w11 is not zero, and decrease the row count by one ++ // this is to efficiently copy incomplete blocks at the end of the rows ++ // the last row is handled explicitly to avoid writing out of bounds ++ add w22, w10, w11 ++ cmp w22, #0 ++ cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise ++ add w9, w9, w22 ++ sub w6, w6, #1 ++ ++ // store the number of bytes in w20 which we copy too much for every row ++ // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values) ++ mov w20, #96*2 ++ mul w20, w20, w9 ++ sub w20, w1, w20 ++ ++ mov w23, #0 // flag to check whether the last line had already been processed ++ ++ // bitmask to clear the uppper 6bits of the result values ++ mov x19, #0x03ff03ff03ff03ff ++ dup v22.2d, x19 ++ ++ // row counter = 0 ++ eor w12, w12, w12 ++row_loop_y16: ++ cmp w12, w6 // jump to row_loop_y16_fin if we processed all rows ++ bge row_loop_y16_fin ++ ++ mov x13, x2 // row src ++ eor w14, w14, w14 // full block counter ++block_loop_y16: ++ cmp w14, w9 ++ bge block_loop_y16_fin ++ ++ // load 64 bytes ++ ld1 { v0.4s, v1.4s, v2.4s, v3.4s 
}, [x13], #64 ++ ++ // process v0 and v1 ++ xtn v16.4h, v0.4s ++ ushr v0.4s, v0.4s, #10 ++ xtn v17.4h, v0.4s ++ ushr v0.4s, v0.4s, #10 ++ xtn v18.4h, v0.4s ++ ++ xtn2 v16.8h, v1.4s ++ and v16.16b, v16.16b, v22.16b ++ ushr v1.4s, v1.4s, #10 ++ xtn2 v17.8h, v1.4s ++ and v17.16b, v17.16b, v22.16b ++ ushr v1.4s, v1.4s, #10 ++ xtn2 v18.8h, v1.4s ++ and v18.16b, v18.16b, v22.16b ++ ++ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 ++ ++ // process v2 and v3 ++ xtn v23.4h, v2.4s ++ ushr v2.4s, v2.4s, #10 ++ xtn v24.4h, v2.4s ++ ushr v2.4s, v2.4s, #10 ++ xtn v25.4h, v2.4s ++ ++ xtn2 v23.8h, v3.4s ++ and v23.16b, v23.16b, v22.16b ++ ushr v3.4s, v3.4s, #10 ++ xtn2 v24.8h, v3.4s ++ and v24.16b, v24.16b, v22.16b ++ ushr v3.4s, v3.4s, #10 ++ xtn2 v25.8h, v3.4s ++ and v25.16b, v25.16b, v22.16b ++ ++ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 ++ ++ // load the second half of the block -> 64 bytes into registers v4-v7 ++ ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13], #64 ++ ++ // process v4 and v5 ++ xtn v16.4h, v4.4s ++ ushr v4.4s, v4.4s, #10 ++ xtn v17.4h, v4.4s ++ ushr v4.4s, v4.4s, #10 ++ xtn v18.4h, v4.4s ++ ++ xtn2 v16.8h, v5.4s ++ and v16.16b, v16.16b, v22.16b ++ ushr v5.4s, v5.4s, #10 ++ xtn2 v17.8h, v5.4s ++ and v17.16b, v17.16b, v22.16b ++ ushr v5.4s, v5.4s, #10 ++ xtn2 v18.8h, v5.4s ++ and v18.16b, v18.16b, v22.16b ++ ++ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 ++ ++ // v6 and v7 ++ xtn v23.4h, v6.4s ++ ushr v6.4s, v6.4s, #10 ++ xtn v24.4h, v6.4s ++ ushr v6.4s, v6.4s, #10 ++ xtn v25.4h, v6.4s ++ ++ xtn2 v23.8h, v7.4s ++ and v23.16b, v23.16b, v22.16b ++ ushr v7.4s, v7.4s, #10 ++ xtn2 v24.8h, v7.4s ++ and v24.16b, v24.16b, v22.16b ++ ushr v7.4s, v7.4s, #10 ++ xtn2 v25.8h, v7.4s ++ and v25.16b, v25.16b, v22.16b ++ ++ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 ++ ++ add x13, x13, x5 // row src += slice_inc ++ add w14, w14, #1 ++ b block_loop_y16 ++block_loop_y16_fin: ++ ++ ++ ++ ++ add x2, x2, #128 // src += stride1 (start of the next row) ++ add x0, x0, w20, sxtw // subtract the 
bytes we copied too much from dst ++ add w12, w12, #1 ++ b row_loop_y16 ++row_loop_y16_fin: ++ ++ // check whether we have incomplete blocks at the end of every row ++ // in that case decrease row block count by one ++ // change height back to its original value (meaning increase it by 1) ++ // and jump back to another iteration of row_loop_y16 ++ ++ cmp w23, #1 ++ beq row_loop_y16_fin2 // don't continue here if we already processed the last row ++ add w6, w6, #1 // increase height to the original value ++ sub w9, w9, w22 // block count - 1 or 0, depending on the remaining bytes count ++ mov w23, #1 ++ b row_loop_y16 ++row_loop_y16_fin2: ++ ++ sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr too far ahead, therefore re-add the difference ++ ++ // now we've got to handle the last block in the last row ++ eor w12, w12, w12 // w12 = 0 = counter ++integer_loop_y16: ++ cmp w12, w10 ++ bge integer_loop_y16_fin ++ ldr w14, [x13], #4 ++ and w15, w14, #0x3ff ++ strh w15, [x0], #2 ++ lsr w14, w14, #10 ++ and w15, w14, #0x3ff ++ strh w15, [x0], #2 ++ lsr w14, w14, #10 ++ and w15, w14, #0x3ff ++ strh w15, [x0], #2 ++ add w12, w12, #1 ++ b integer_loop_y16 ++integer_loop_y16_fin: ++ ++final_values_y16: ++ // remaining point count = w11 ++ ldr w14, [x13], #4 ++ cmp w11, #0 ++ beq final_values_y16_fin ++ and w15, w14, #0x3ff ++ strh w15, [x0], #2 ++ cmp w11, #1 ++ beq final_values_y16_fin ++ lsr w14, w14, #10 ++ and w15, w14, #0x3ff ++ strh w15, [x0], #2 ++final_values_y16_fin: ++ ++ ldp x23, x24, [sp, #32] ++ ldp x21, x22, [sp, #16] ++ ldp x19, x20, [sp], #48 ++ ret ++endfunc ++ ++//void ff_rpi_sand30_lines_to_planar_c16( ++// uint8_t * dst_u, // [x0] ++// unsigned int dst_stride_u, // [w1] == _w*2 ++// uint8_t * dst_v, // [x2] ++// unsigned int dst_stride_v, // [w3] == _w*2 ++// const uint8_t * src, // [x4] ++// unsigned int stride1, // [w5] == 128 ++// unsigned int stride2, // [w6] ++// unsigned int _x, // [w7] == 0 ++// unsigned int y, // [sp, #0]
== 0 ++// unsigned int _w, // [sp, #8] -> w3 ++// unsigned int h); // [sp, #16] -> w7 ++ ++.macro rpi_sand30_lines_to_planar_c16_block_half ++ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 ++ ++ xtn v4.4h, v0.4s ++ ushr v0.4s, v0.4s, #10 ++ xtn v5.4h, v0.4s ++ ushr v0.4s, v0.4s, #10 ++ xtn v6.4h, v0.4s ++ xtn2 v4.8h, v1.4s ++ ushr v1.4s, v1.4s, #10 ++ xtn2 v5.8h, v1.4s ++ ushr v1.4s, v1.4s, #10 ++ xtn2 v6.8h, v1.4s ++ and v4.16b, v4.16b, v16.16b ++ and v5.16b, v5.16b, v16.16b ++ and v6.16b, v6.16b, v16.16b ++ st3 { v4.8h, v5.8h, v6.8h }, [sp], #48 ++ ++ xtn v4.4h, v2.4s ++ ushr v2.4s, v2.4s, #10 ++ xtn v5.4h, v2.4s ++ ushr v2.4s, v2.4s, #10 ++ xtn v6.4h, v2.4s ++ xtn2 v4.8h, v3.4s ++ ushr v3.4s, v3.4s, #10 ++ xtn2 v5.8h, v3.4s ++ ushr v3.4s, v3.4s, #10 ++ xtn2 v6.8h, v3.4s ++ and v4.16b, v4.16b, v16.16b ++ and v5.16b, v5.16b, v16.16b ++ and v6.16b, v6.16b, v16.16b ++ st3 { v4.8h, v5.8h, v6.8h }, [sp] ++ sub sp, sp, #48 ++.endm ++ ++function ff_rpi_sand30_lines_to_planar_c16, export=1 ++ stp x19, x20, [sp, #-48]! 
++ stp x21, x22, [sp, #16] ++ stp x23, x24, [sp, #32] ++ ++ ldr w3, [sp, #48+8] // w3 = width ++ ldr w7, [sp, #48+16] // w7 = height ++ ++ // reserve space on the stack for intermediate results ++ sub sp, sp, #256 ++ ++ // number of 128byte blocks per row, w8 = width / 48 ++ mov w9, #48 ++ udiv w8, w3, w9 ++ ++ // remaining pixels (rem_pix) per row, w9 = width - w8 * 48 ++ mul w9, w8, w9 ++ sub w9, w3, w9 ++ ++ // row offset, the beginning of the next row to process ++ eor w10, w10, w10 ++ ++ // offset to the beginning of the next block, w11 = stride2 * 128 - 128 ++ lsl w11, w6, #7 ++ sub w11, w11, #128 ++ ++ // decrease the height by one and in case of remaining pixels increase the block count by one ++ sub w7, w7, #1 ++ cmp w9, #0 ++ cset w19, ne // w19 == 1 iff reamining pixels != 0 ++ add w8, w8, w19 ++ ++ // bytes we have to move dst back by at the end of every row ++ mov w21, #48*2 ++ mul w21, w21, w8 ++ sub w21, w1, w21 ++ ++ mov w20, #0 // w20 = flag, last row processed ++ ++ mov x12, #0x03ff03ff03ff03ff ++ dup v16.2d, x12 ++ ++ // iterate through rows, row counter = w12 = 0 ++ eor w12, w12, w12 ++row_loop_c16: ++ cmp w12, w7 ++ bge row_loop_c16_fin ++ ++ // address of row data = src + row_offset ++ mov x13, x4 ++ add x13, x13, x10 ++ ++ eor w14, w14, w14 ++block_loop_c16: ++ cmp w14, w8 ++ bge block_loop_c16_fin ++ ++ rpi_sand30_lines_to_planar_c16_block_half ++ ++ ld2 { v0.8h, v1.8h }, [sp], #32 ++ ld2 { v2.8h, v3.8h }, [sp], #32 ++ ld2 { v4.8h, v5.8h }, [sp] ++ sub sp, sp, #64 ++ ++ st1 { v0.8h }, [x0], #16 ++ st1 { v2.8h }, [x0], #16 ++ st1 { v4.8h }, [x0], #16 ++ st1 { v1.8h }, [x2], #16 ++ st1 { v3.8h }, [x2], #16 ++ st1 { v5.8h }, [x2], #16 ++ ++ rpi_sand30_lines_to_planar_c16_block_half ++ ++ ld2 { v0.8h, v1.8h }, [sp], #32 ++ ld2 { v2.8h, v3.8h }, [sp], #32 ++ ld2 { v4.8h, v5.8h }, [sp] ++ sub sp, sp, #64 ++ ++ st1 { v0.8h }, [x0], #16 ++ st1 { v2.8h }, [x0], #16 ++ st1 { v4.8h }, [x0], #16 ++ st1 { v1.8h }, [x2], #16 ++ st1 { v3.8h }, [x2], #16 ++ 
st1 { v5.8h }, [x2], #16 ++ ++ add x13, x13, x11 // offset to next block ++ add w14, w14, #1 ++ b block_loop_c16 ++block_loop_c16_fin: ++ ++ add w10, w10, #128 ++ add w12, w12, #1 ++ add x0, x0, w21, sxtw // move dst pointers back by x21 ++ add x2, x2, w21, sxtw ++ b row_loop_c16 ++row_loop_c16_fin: ++ ++ cmp w20, #1 ++ beq row_loop_c16_fin2 ++ mov w20, #1 ++ sub w8, w8, w19 // decrease block count by w19 ++ add w7, w7, #1 // increase height ++ b row_loop_c16 ++ ++row_loop_c16_fin2: ++ sub x0, x0, w21, sxtw // re-add x21 in case of the last row ++ sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels ++ ++ // last incomplete block to be finished ++ // read operations are fine, stride2 is more than large enough even if rem_pix is 0 ++ rpi_sand30_lines_to_planar_c16_block_half ++ ld2 { v0.8h, v1.8h }, [sp], #32 ++ ld2 { v2.8h, v3.8h }, [sp], #32 ++ ld2 { v4.8h, v5.8h }, [sp], #32 ++ rpi_sand30_lines_to_planar_c16_block_half ++ ld2 { v0.8h, v1.8h }, [sp], #32 ++ ld2 { v2.8h, v3.8h }, [sp], #32 ++ ld2 { v4.8h, v5.8h }, [sp] ++ sub sp, sp, #160 ++ ++ mov x4, sp ++ eor w20, w20, w20 ++rem_pix_c16_loop: ++ cmp w20, w9 ++ bge rem_pix_c16_fin ++ ++ ldr w22, [x4], #4 ++ strh w22, [x0], #2 // 16-bit store: low half of the word goes to dst_u; a 32-bit str would spill 2 bytes past the sample ++ lsr w22, w22, #16 ++ strh w22, [x2], #2 // 16-bit store: high half of the word goes to dst_v ++ ++ add w20, w20, #1 ++ b rem_pix_c16_loop ++rem_pix_c16_fin: ++ ++ add sp, sp, #256 ++ ++ ldp x23, x24, [sp, #32] ++ ldp x21, x22, [sp, #16] ++ ldp x19, x20, [sp], #48 ++ ret ++endfunc ++ ++ ++ ++//void ff_rpi_sand30_lines_to_planar_p010( ++// uint8_t * dest, ++// unsigned int dst_stride, ++// const uint8_t * src, ++// unsigned int src_stride1, ++// unsigned int src_stride2, ++// unsigned int _x, ++// unsigned int y, ++// unsigned int _w, ++// unsigned int h); ++ +diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h +new file mode 100644 +index 0000000000..b3aa481ea4 +--- /dev/null ++++ b/libavutil/aarch64/rpi_sand_neon.h +@@ -0,0 +1,55 @@ ++/* ++Copyright (c) 2021 Michael Eiler ++ ++Redistribution
and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: Michael Eiler ++*/ ++ ++#pragma once ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, ++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, ++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); ++ ++void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u, ++ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride, ++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, ++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); ++ ++void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u, ++ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1, ++ unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); ++ ++#ifdef __cplusplus ++} ++#endif ++ +diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h +index 0324f6826d..0d5d203dc3 100644 +--- a/libavutil/rpi_sand_fn_pw.h ++++ b/libavutil/rpi_sand_fn_pw.h +@@ -54,7 +54,7 @@ void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, + const unsigned int w = _w; + const unsigned int mask = stride1 - 1; + +-#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64) ++#if PW == 1 && HAVE_SAND_ASM + if (_x == 0) { + ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride, + src, stride1, stride2, _x, y, _w, h); +@@ -106,7 +106,7 @@ void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_strid + const unsigned int w = _w * 2; + const unsigned int mask = stride1 - 1; + +-#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64) ++#if PW == 1 && HAVE_SAND_ASM + if (_x == 0) { + ff_rpi_sand8_lines_to_planar_c8(dst_u, 
dst_stride_u, dst_v, dst_stride_v, + src, stride1, stride2, _x, y, _w, h); +diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c +index ed0261b02f..1f543e9357 100644 +--- a/libavutil/rpi_sand_fns.c ++++ b/libavutil/rpi_sand_fns.c +@@ -37,6 +37,9 @@ Authors: John Cox + #if ARCH_ARM && HAVE_NEON + #include "arm/rpi_sand_neon.h" + #define HAVE_SAND_ASM 1 ++#elif ARCH_AARCH64 && HAVE_NEON ++#include "aarch64/rpi_sand_neon.h" ++#define HAVE_SAND_ASM 1 + #else + #define HAVE_SAND_ASM 0 + #endif + +From 0d53002c00ccc9c51321f423a62cb04e523a908f Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 27 Apr 2021 11:56:02 +0100 +Subject: [PATCH 004/113] Add raw encoding for sand + +--- + libavcodec/raw.c | 6 +++ + libavcodec/rawenc.c | 92 ++++++++++++++++++++++++++++++++++++++++++++- + 2 files changed, 96 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/raw.c b/libavcodec/raw.c +index a371bb36c4..5e965dfa08 100644 +--- a/libavcodec/raw.c ++++ b/libavcodec/raw.c +@@ -294,6 +294,12 @@ static const PixelFormatTag raw_pix_fmt_tags[] = { + { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ + { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ + ++ /* RPI (Might as well define for everything) */ ++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, ++ { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') }, ++ + { AV_PIX_FMT_NONE, 0 }, + }; + +diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c +index 34d7a1bef4..0cd8eaffee 100644 +--- a/libavcodec/rawenc.c ++++ b/libavcodec/rawenc.c +@@ -24,6 +24,7 @@ + * Raw Video Encoder + */ + ++#include "config.h" + #include "avcodec.h" + #include "codec_internal.h" + #include "encode.h" +@@ -33,6 +34,10 @@ + #include "libavutil/intreadwrite.h" + #include "libavutil/imgutils.h" + #include "libavutil/internal.h" ++#include "libavutil/avassert.h" ++#if CONFIG_SAND ++#include 
"libavutil/rpi_sand_fns.h" ++#endif + + static av_cold int raw_encode_init(AVCodecContext *avctx) + { +@@ -46,12 +51,95 @@ static av_cold int raw_encode_init(AVCodecContext *avctx) + return 0; + } + ++#if CONFIG_SAND ++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const int width = av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3 / 2; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); ++ dst += width * height; ++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); ++ return 0; ++} ++ ++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const int width = av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); ++ dst += width * height * 2; ++ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); ++ return 0; ++} ++ ++static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const int width = 
av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); ++ dst += width * height * 2; ++ av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2); ++ return 0; ++} ++#endif ++ ++ + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame, int *got_packet) + { +- int ret = av_image_get_buffer_size(frame->format, +- frame->width, frame->height, 1); ++ int ret; + ++#if CONFIG_SAND ++ if (av_rpi_is_sand_frame(frame)) { ++ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : ++ av_rpi_is_sand16_frame(frame) ? raw_sand16_as_yuv420(avctx, pkt, frame) : ++ av_rpi_is_sand30_frame(frame) ? 
raw_sand30_as_yuv420(avctx, pkt, frame) : -1; ++ *got_packet = (ret == 0); ++ return ret; ++ } ++#endif ++ ++ ret = av_image_get_buffer_size(frame->format, ++ frame->width, frame->height, 1); + if (ret < 0) + return ret; + + +From dd7e385da41ab751d41a44d6395b4cc841b2839a Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 27 Apr 2021 12:02:09 +0100 +Subject: [PATCH 005/113] Deal with the lack of trivial sand cropping + +--- + fftools/ffmpeg.c | 4 ++-- + fftools/ffmpeg_filter.c | 4 ++-- + libavutil/frame.c | 11 +++++++++++ + libavutil/frame.h | 10 ++++++++++ + 4 files changed, 25 insertions(+), 4 deletions(-) + +diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c +index e7384f052a..c68f96006b 100644 +--- a/fftools/ffmpeg.c ++++ b/fftools/ffmpeg.c +@@ -1953,8 +1953,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame, int keep_ref + av_channel_layout_compare(&ifilter->ch_layout, &frame->ch_layout); + break; + case AVMEDIA_TYPE_VIDEO: +- need_reinit |= ifilter->width != frame->width || +- ifilter->height != frame->height; ++ need_reinit |= ifilter->width != av_frame_cropped_width(frame) || ++ ifilter->height != av_frame_cropped_height(frame); + break; + } + +diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c +index 0845c631a5..a052635927 100644 +--- a/fftools/ffmpeg_filter.c ++++ b/fftools/ffmpeg_filter.c +@@ -1175,8 +1175,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame) + + ifilter->format = frame->format; + +- ifilter->width = frame->width; +- ifilter->height = frame->height; ++ ifilter->width = av_frame_cropped_width(frame); ++ ifilter->height = av_frame_cropped_height(frame); + ifilter->sample_aspect_ratio = frame->sample_aspect_ratio; + + ifilter->sample_rate = frame->sample_rate; +diff --git a/libavutil/frame.c b/libavutil/frame.c +index 4c16488c66..6ff74a919b 100644 +--- a/libavutil/frame.c ++++ b/libavutil/frame.c +@@ -16,6 +16,8 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 
02110-1301 USA + */ + ++#include "config.h" ++ + #include "channel_layout.h" + #include "avassert.h" + #include "buffer.h" +@@ -27,6 +29,9 @@ + #include "mem.h" + #include "samplefmt.h" + #include "hwcontext.h" ++#if CONFIG_SAND ++#include "rpi_sand_fns.h" ++#endif + + #if FF_API_OLD_CHANNEL_LAYOUT + #define CHECK_CHANNELS_CONSISTENCY(frame) \ +@@ -875,6 +880,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags) + (frame->crop_top + frame->crop_bottom) >= frame->height) + return AVERROR(ERANGE); + ++#if CONFIG_SAND ++ // Sand cannot be cropped - do not try ++ if (av_rpi_is_sand_format(frame->format)) ++ return 0; ++#endif ++ + desc = av_pix_fmt_desc_get(frame->format); + if (!desc) + return AVERROR_BUG; +diff --git a/libavutil/frame.h b/libavutil/frame.h +index 33fac2054c..a112e296f7 100644 +--- a/libavutil/frame.h ++++ b/libavutil/frame.h +@@ -940,6 +940,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags); + */ + const char *av_frame_side_data_name(enum AVFrameSideDataType type); + ++ ++static inline int av_frame_cropped_width(const AVFrame * const frame) ++{ ++ return frame->width - (frame->crop_left + frame->crop_right); ++} ++static inline int av_frame_cropped_height(const AVFrame * const frame) ++{ ++ return frame->height - (frame->crop_top + frame->crop_bottom); ++} ++ + /** + * @} + */ + +From 15e1a26da5d1b6a244a8a663dd469010c1da55e6 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 27 Apr 2021 12:31:16 +0100 +Subject: [PATCH 006/113] Add an unsand filter + +--- + configure | 1 + + libavfilter/Makefile | 1 + + libavfilter/allfilters.c | 1 + + libavfilter/buffersrc.c | 2 +- + libavfilter/vf_unsand.c | 228 +++++++++++++++++++++++++++++++++++++++ + 5 files changed, 232 insertions(+), 1 deletion(-) + create mode 100644 libavfilter/vf_unsand.c + +diff --git a/configure b/configure +index a0213c039a..c8674a7dad 100755 +--- a/configure ++++ b/configure +@@ -3748,6 +3748,7 @@ tonemap_opencl_filter_deps="opencl const_nan" + 
transpose_opencl_filter_deps="opencl" + transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags" + transpose_vulkan_filter_deps="vulkan spirv_compiler" ++unsand_filter_select="sand" + unsharp_opencl_filter_deps="opencl" + uspp_filter_deps="gpl avcodec" + vaguedenoiser_filter_deps="gpl" +diff --git a/libavfilter/Makefile b/libavfilter/Makefile +index 30cc329fb6..6db336e74a 100644 +--- a/libavfilter/Makefile ++++ b/libavfilter/Makefile +@@ -509,6 +509,7 @@ OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o + OBJS-$(CONFIG_TRANSPOSE_VULKAN_FILTER) += vf_transpose_vulkan.o vulkan.o vulkan_filter.o + OBJS-$(CONFIG_TRIM_FILTER) += trim.o + OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o ++OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o + OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o + OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \ + opencl/unsharp.o +diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c +index 5ebacfde27..4bf3a5cfd8 100644 +--- a/libavfilter/allfilters.c ++++ b/libavfilter/allfilters.c +@@ -483,6 +483,7 @@ extern const AVFilter ff_vf_trim; + extern const AVFilter ff_vf_unpremultiply; + extern const AVFilter ff_vf_unsharp; + extern const AVFilter ff_vf_unsharp_opencl; ++extern const AVFilter ff_vf_unsand; + extern const AVFilter ff_vf_untile; + extern const AVFilter ff_vf_uspp; + extern const AVFilter ff_vf_v360; +diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c +index a3190468bb..1ba381ca9f 100644 +--- a/libavfilter/buffersrc.c ++++ b/libavfilter/buffersrc.c +@@ -204,7 +204,7 @@ FF_ENABLE_DEPRECATION_WARNINGS + + switch (ctx->outputs[0]->type) { + case AVMEDIA_TYPE_VIDEO: +- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height, ++ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame), + frame->format, frame->pts); + break; + case AVMEDIA_TYPE_AUDIO: +diff --git a/libavfilter/vf_unsand.c 
b/libavfilter/vf_unsand.c +new file mode 100644 +index 0000000000..7100f2fc9b +--- /dev/null ++++ b/libavfilter/vf_unsand.c +@@ -0,0 +1,228 @@ ++/* ++ * Copyright (c) 2007 Bobby Bingham + * + * This file is part of FFmpeg. + * -+ * FFmpeg is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. + * -+ * You should have received a copy of the GNU General Public License along -+ * with FFmpeg; if not, write to the Free Software Foundation, Inc., -+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * format and noformat video filters + */ + +#include + -+#include "checkasm.h" -+ -+#include "libavcodec/vc1dsp.h" -+ -+#include "libavutil/common.h" +#include "libavutil/internal.h" -+#include "libavutil/intreadwrite.h" -+#include "libavutil/mem_internal.h" ++#include "libavutil/mem.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/opt.h" ++#include "libavutil/rpi_sand_fns.h" + -+#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) }, -+#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height }, ++#include "avfilter.h" ++#include "formats.h" ++#include "internal.h" ++#include "video.h" + -+typedef struct { -+ const char *name; -+ size_t offset; ++typedef struct UnsandContext { ++ const AVClass *class; ++} UnsandContext; ++ ++static av_cold void uninit(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++} ++ ++static av_cold int init(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++ ++ return 0; ++} ++ ++ ++static int filter_frame(AVFilterLink *link, AVFrame *in) ++{ ++ AVFilterLink * const outlink = link->dst->outputs[0]; ++ AVFrame *out = NULL; ++ int rv = 0; ++ ++ if (outlink->format == in->format) { ++ // If nothing to do then do nothing ++ out = in; ++ } ++ else ++ { ++ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL) ++ { ++ rv = AVERROR(ENOMEM); ++ goto fail; ++ } ++ if (av_rpi_sand_to_planar_frame(out, in) != 0) ++ { ++ rv = -1; ++ goto fail; ++ } ++ ++ av_frame_free(&in); ++ } ++ ++ return ff_filter_frame(outlink, out); ++ ++fail: ++ av_frame_free(&out); ++ av_frame_free(&in); ++ return rv; ++} ++ ++#if 0 ++static void dump_fmts(const AVFilterFormats * fmts) ++{ ++ int 
i; ++ if (fmts== NULL) { ++ printf("NULL\n"); ++ return; ++ } ++ for (i = 0; i < fmts->nb_formats; ++i) { ++ printf(" %d", fmts->formats[i]); ++ } ++ printf("\n"); ++} ++#endif ++ ++static int query_formats(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++ int ret; ++ ++ // If we aren't connected at both ends then just do nothing ++ if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL) ++ return 0; ++ ++ // Our output formats depend on our input formats and we can't/don't ++ // want to convert between bit depths so we need to wait for the source ++ // to have an opinion before we do ++ if (ctx->inputs[0]->incfg.formats == NULL) ++ return AVERROR(EAGAIN); ++ ++ // Accept anything ++ if (ctx->inputs[0]->outcfg.formats == NULL && ++ (ret = ff_formats_ref(ctx->inputs[0]->incfg.formats, &ctx->inputs[0]->outcfg.formats)) < 0) ++ return ret; ++ ++ // Filter out sand formats ++ ++ // Generate a container if we don't already have one ++ if (ctx->outputs[0]->incfg.formats == NULL) ++ { ++ // Somewhat rubbish way of ensuring we have a good structure ++ const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE}; ++ AVFilterFormats *formats = ff_make_format_list(out_fmts); ++ ++ if (formats == NULL) ++ return AVERROR(ENOMEM); ++ if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0) ++ return ret; ++ } ++ ++ // Replace old format list with new filtered list derived from what our ++ // input says it can do ++ { ++ const AVFilterFormats * const src_ff = ctx->inputs[0]->outcfg.formats; ++ AVFilterFormats * const dst_ff = ctx->outputs[0]->incfg.formats; ++ enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats); ++ int i; ++ int n = 0; ++ int seen_420p = 0; ++ int seen_420p10 = 0; ++ ++ for (i = 0; i < src_ff->nb_formats; ++i) { ++ const enum AVPixelFormat f = src_ff->formats[i]; ++ ++ switch (f){ ++ case AV_PIX_FMT_YUV420P: ++ case AV_PIX_FMT_SAND128: ++ case 
AV_PIX_FMT_RPI4_8: ++ if (!seen_420p) { ++ seen_420p = 1; ++ dst_fmts[n++] = AV_PIX_FMT_YUV420P; ++ } ++ break; ++ case AV_PIX_FMT_SAND64_10: ++ case AV_PIX_FMT_YUV420P10: ++ case AV_PIX_FMT_RPI4_10: ++ if (!seen_420p10) { ++ seen_420p10 = 1; ++ dst_fmts[n++] = AV_PIX_FMT_YUV420P10; ++ } ++ break; ++ default: ++ dst_fmts[n++] = f; ++ break; ++ } ++ } ++ ++ av_freep(&dst_ff->formats); ++ dst_ff->formats = dst_fmts; ++ dst_ff->nb_formats = n; ++ } ++ ++// printf("Unsand: %s calc: ", __func__); ++// dump_fmts(ctx->outputs[0]->incfg.formats); ++ ++ return 0; ++} ++ ++ ++#define OFFSET(x) offsetof(UnsandContext, x) ++static const AVOption unsand_options[] = { ++ { NULL } ++}; ++ ++ ++AVFILTER_DEFINE_CLASS(unsand); ++ ++static const AVFilterPad avfilter_vf_unsand_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .filter_frame = filter_frame, ++ }, ++ { NULL } ++}; ++ ++static const AVFilterPad avfilter_vf_unsand_outputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO ++ }, ++}; ++ ++AVFilter ff_vf_unsand = { ++ .name = "unsand", ++ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"), ++ ++ .init = init, ++ .uninit = uninit, ++ ++ FILTER_QUERY_FUNC(query_formats), ++ ++ .priv_size = sizeof(UnsandContext), ++ .priv_class = &unsand_class, ++ ++ FILTER_INPUTS(avfilter_vf_unsand_inputs), ++ FILTER_OUTPUTS(avfilter_vf_unsand_outputs), ++}; ++ + +From 5737587f9f3566b8e963d7059136bf6d2f987810 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 27 Apr 2021 12:37:07 +0100 +Subject: [PATCH 007/113] Reduce mmal compile warnings + +--- + libavcodec/mmaldec.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c +index 7fd24ad3b7..03cf5c3bba 100644 +--- a/libavcodec/mmaldec.c ++++ b/libavcodec/mmaldec.c +@@ -24,6 +24,9 @@ + * MMAL Video Decoder + */ + ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" 
+ #include + #include + #include +@@ -31,6 +34,7 @@ + #include + #include + #include ++#pragma GCC diagnostic pop + #include + + #include "avcodec.h" + +From 32db01e1f0c7aa86adad47a013d6ffe2eccae5b8 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 27 Apr 2021 17:56:16 +0100 +Subject: [PATCH 008/113] Add chroma location to hevc parse + +--- + libavcodec/hevc_parser.c | 13 +++++++++++++ + libavcodec/hevcdec.c | 13 +++++++++++++ + 2 files changed, 26 insertions(+) + +diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c +index 59f9a0ff3e..8b3fee6329 100644 +--- a/libavcodec/hevc_parser.c ++++ b/libavcodec/hevc_parser.c +@@ -97,6 +97,19 @@ static int hevc_parse_slice_header(AVCodecParserContext *s, H2645NAL *nal, + avctx->profile = ps->sps->ptl.general_ptl.profile_idc; + avctx->level = ps->sps->ptl.general_ptl.level_idc; + ++ if (ps->sps->chroma_format_idc == 1) { ++ avctx->chroma_sample_location = ps->sps->vui.chroma_loc_info_present_flag ? ++ ps->sps->vui.chroma_sample_loc_type_top_field + 1 : ++ AVCHROMA_LOC_LEFT; ++ } ++ else if (ps->sps->chroma_format_idc == 2 || ++ ps->sps->chroma_format_idc == 3) { ++ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;; ++ } ++ else { ++ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED; ++ } ++ + if (ps->vps->vps_timing_info_present_flag) { + num = ps->vps->vps_num_units_in_tick; + den = ps->vps->vps_time_scale; +diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c +index f8f981e838..00e14f0115 100644 +--- a/libavcodec/hevcdec.c ++++ b/libavcodec/hevcdec.c +@@ -340,6 +340,19 @@ static void export_stream_params(HEVCContext *s, const HEVCSPS *sps) + + ff_set_sar(avctx, sps->vui.sar); + ++ if (sps->chroma_format_idc == 1) { ++ avctx->chroma_sample_location = sps->vui.chroma_loc_info_present_flag ? 
++ sps->vui.chroma_sample_loc_type_top_field + 1 : ++ AVCHROMA_LOC_LEFT; ++ } ++ else if (sps->chroma_format_idc == 2 || ++ sps->chroma_format_idc == 3) { ++ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;; ++ } ++ else { ++ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED; ++ } ++ + if (sps->vui.video_signal_type_present_flag) + avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG + : AVCOL_RANGE_MPEG; + +From afc9e6c5f8f092de1a7c8a686abd87f7be06e14c Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 26 Sep 2022 18:20:50 +0100 +Subject: [PATCH 009/113] hwaccel: Add .abort_frame & use in hevcdec + +--- + libavcodec/avcodec.h | 11 +++++++++++ + libavcodec/hevcdec.c | 7 ++++++- + 2 files changed, 17 insertions(+), 1 deletion(-) + +diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h +index cb5c25bf63..62a3ca4d85 100644 +--- a/libavcodec/avcodec.h ++++ b/libavcodec/avcodec.h +@@ -2212,6 +2212,17 @@ typedef struct AVHWAccel { + * that avctx->hwaccel_priv_data is invalid. 
+ */ + int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); ++ ++ /** ++ * Called if parsing fails ++ * ++ * An error has occured, end_frame will not be called ++ * start_frame & decode_slice may or may not have been called ++ * Optional ++ * ++ * @param avctx the codec context ++ */ ++ void (*abort_frame)(AVCodecContext *avctx); + } AVHWAccel; + + /** +diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c +index 00e14f0115..ec3dd3cfa7 100644 +--- a/libavcodec/hevcdec.c ++++ b/libavcodec/hevcdec.c +@@ -3517,8 +3517,13 @@ static int hevc_decode_frame(AVCodecContext *avctx, AVFrame *rframe, + + s->ref = NULL; + ret = decode_nal_units(s, avpkt->data, avpkt->size); +- if (ret < 0) ++ if (ret < 0) { ++ // Ensure that hwaccel knows this frame is over ++ if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) ++ s->avctx->hwaccel->abort_frame(s->avctx); ++ + return ret; ++ } + + if (avctx->hwaccel) { + if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) { + +From 376135da2c0d311cad5287b8ee0055ea1e4c1eaf Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 26 Sep 2022 18:26:17 +0100 +Subject: [PATCH 010/113] hwaccel: Add CAP_MT_SAFE for accels that can use + multi-thread + +--- + libavcodec/hwconfig.h | 1 + + libavcodec/pthread_frame.c | 7 +++++-- + 2 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h +index 721424912c..c43ad55245 100644 +--- a/libavcodec/hwconfig.h ++++ b/libavcodec/hwconfig.h +@@ -24,6 +24,7 @@ + + + #define HWACCEL_CAP_ASYNC_SAFE (1 << 0) ++#define HWACCEL_CAP_MT_SAFE (1 << 1) + + + typedef struct AVCodecHWConfigInternal { +diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c +index 43d6cc8ff4..d98b885b0e 100644 +--- a/libavcodec/pthread_frame.c ++++ b/libavcodec/pthread_frame.c +@@ -217,7 +217,8 @@ FF_ENABLE_DEPRECATION_WARNINGS + + /* if the previous thread uses hwaccel then we take the lock to ensure + * the threads don't run concurrently */ +- if 
(avctx->hwaccel) { ++ if (avctx->hwaccel && ++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { + pthread_mutex_lock(&p->parent->hwaccel_mutex); + p->hwaccel_serializing = 1; + } +@@ -656,7 +657,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { + + if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; + +- if (avctx->hwaccel && !p->hwaccel_serializing) { ++ if (avctx->hwaccel && ++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && ++ !p->hwaccel_serializing) { + pthread_mutex_lock(&p->parent->hwaccel_mutex); + p->hwaccel_serializing = 1; + } + +From 653671fba5f2ac43703576d3d8ee41bbf175f003 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 27 Apr 2021 17:59:08 +0100 +Subject: [PATCH 011/113] Weak link utils + +--- + libavcodec/weak_link.c | 102 +++++++++++++++++++++++++++++++++++++++++ + libavcodec/weak_link.h | 23 ++++++++++ + 2 files changed, 125 insertions(+) + create mode 100644 libavcodec/weak_link.c + create mode 100644 libavcodec/weak_link.h + +diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c +new file mode 100644 +index 0000000000..f234a985b9 +--- /dev/null ++++ b/libavcodec/weak_link.c +@@ -0,0 +1,102 @@ ++#include ++#include ++#include ++#include "weak_link.h" ++ ++struct ff_weak_link_master { ++ atomic_int ref_count; /* 0 is single ref for easier atomics */ ++ pthread_rwlock_t lock; ++ void * ptr; ++}; ++ ++static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c) ++{ ++ return (struct ff_weak_link_master *)c; ++} ++ ++struct ff_weak_link_master * ff_weak_link_new(void * p) ++{ ++ struct ff_weak_link_master * w = malloc(sizeof(*w)); ++ if (!w) ++ return NULL; ++ w->ptr = p; ++ if (pthread_rwlock_init(&w->lock, NULL)) { ++ free(w); ++ return NULL; ++ } ++ return w; ++} ++ ++static void weak_link_do_unref(struct ff_weak_link_master * const w) ++{ ++ int n = atomic_fetch_sub(&w->ref_count, 1); ++ if (n) ++ return; ++ ++ pthread_rwlock_destroy(&w->lock); ++ free(w); ++} ++ ++// Unref & 
break link ++void ff_weak_link_break(struct ff_weak_link_master ** ppLink) ++{ ++ struct ff_weak_link_master * const w = *ppLink; ++ if (!w) ++ return; ++ ++ *ppLink = NULL; ++ pthread_rwlock_wrlock(&w->lock); ++ w->ptr = NULL; ++ pthread_rwlock_unlock(&w->lock); ++ ++ weak_link_do_unref(w); ++} ++ ++struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w) ++{ ++ if (!w) ++ return NULL; ++ atomic_fetch_add(&w->ref_count, 1); ++ return (struct ff_weak_link_client*)w; ++} ++ ++void ff_weak_link_unref(struct ff_weak_link_client ** ppLink) ++{ ++ struct ff_weak_link_master * const w = weak_link_x(*ppLink); ++ if (!w) ++ return; ++ ++ *ppLink = NULL; ++ weak_link_do_unref(w); ++} ++ ++void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink) ++{ ++ struct ff_weak_link_master * const w = weak_link_x(*ppLink); ++ ++ if (!w) ++ return NULL; ++ ++ if (pthread_rwlock_rdlock(&w->lock)) ++ goto broken; ++ ++ if (w->ptr) ++ return w->ptr; ++ ++ pthread_rwlock_unlock(&w->lock); ++ ++broken: ++ *ppLink = NULL; ++ weak_link_do_unref(w); ++ return NULL; ++} ++ ++// Ignores a NULL c (so can be on the return path of both broken & live links) ++void ff_weak_link_unlock(struct ff_weak_link_client * c) ++{ ++ struct ff_weak_link_master * const w = weak_link_x(c); ++ if (w) ++ pthread_rwlock_unlock(&w->lock); ++} ++ ++ +diff --git a/libavcodec/weak_link.h b/libavcodec/weak_link.h +new file mode 100644 +index 0000000000..415b6a27a0 +--- /dev/null ++++ b/libavcodec/weak_link.h +@@ -0,0 +1,23 @@ ++struct ff_weak_link_master; ++struct ff_weak_link_client; ++ ++struct ff_weak_link_master * ff_weak_link_new(void * p); ++void ff_weak_link_break(struct ff_weak_link_master ** ppLink); ++ ++struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w); ++void ff_weak_link_unref(struct ff_weak_link_client ** ppLink); ++ ++// Returns NULL if link broken - in this case it will also zap ++// *ppLink and unref the weak_link. 
++// Returns NULL if *ppLink is NULL (so a link once broken stays broken) ++// ++// The above does mean that there is a race if this is called simultainiously ++// by two threads using the same weak_link_client (so don't do that) ++void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink); ++void ff_weak_link_unlock(struct ff_weak_link_client * c); ++ ++ ++ ++ ++ ++ + +From 172b496797b51a4c2638acacf45c8f6be03faf95 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 27 Apr 2021 19:23:26 +0100 +Subject: [PATCH 012/113] Add v4l2_req V4L2 request H265 drm_prime decode + +Has the abiliy to switch between kernel API versions at runtime. This +could be removed later once teher is no chance of usage on an old +kernel. +--- + configure | 14 + + libavcodec/Makefile | 4 + + libavcodec/hevc-ctrls-v1.h | 229 +++++ + libavcodec/hevc-ctrls-v2.h | 257 +++++ + libavcodec/hevcdec.c | 10 + + libavcodec/hwaccels.h | 1 + + libavcodec/hwconfig.h | 2 + + libavcodec/v4l2_req_decode_q.c | 84 ++ + libavcodec/v4l2_req_decode_q.h | 25 + + libavcodec/v4l2_req_devscan.c | 449 +++++++++ + libavcodec/v4l2_req_devscan.h | 23 + + libavcodec/v4l2_req_dmabufs.c | 266 ++++++ + libavcodec/v4l2_req_dmabufs.h | 40 + + libavcodec/v4l2_req_hevc_v1.c | 3 + + libavcodec/v4l2_req_hevc_v2.c | 3 + + libavcodec/v4l2_req_hevc_vx.c | 1213 +++++++++++++++++++++++ + libavcodec/v4l2_req_media.c | 1596 +++++++++++++++++++++++++++++++ + libavcodec/v4l2_req_media.h | 151 +++ + libavcodec/v4l2_req_pollqueue.c | 361 +++++++ + libavcodec/v4l2_req_pollqueue.h | 18 + + libavcodec/v4l2_req_utils.h | 27 + + libavcodec/v4l2_request_hevc.c | 297 ++++++ + libavcodec/v4l2_request_hevc.h | 102 ++ + 23 files changed, 5175 insertions(+) + create mode 100644 libavcodec/hevc-ctrls-v1.h + create mode 100644 libavcodec/hevc-ctrls-v2.h + create mode 100644 libavcodec/v4l2_req_decode_q.c + create mode 100644 libavcodec/v4l2_req_decode_q.h + create mode 100644 libavcodec/v4l2_req_devscan.c + create mode 100644 
libavcodec/v4l2_req_devscan.h + create mode 100644 libavcodec/v4l2_req_dmabufs.c + create mode 100644 libavcodec/v4l2_req_dmabufs.h + create mode 100644 libavcodec/v4l2_req_hevc_v1.c + create mode 100644 libavcodec/v4l2_req_hevc_v2.c + create mode 100644 libavcodec/v4l2_req_hevc_vx.c + create mode 100644 libavcodec/v4l2_req_media.c + create mode 100644 libavcodec/v4l2_req_media.h + create mode 100644 libavcodec/v4l2_req_pollqueue.c + create mode 100644 libavcodec/v4l2_req_pollqueue.h + create mode 100644 libavcodec/v4l2_req_utils.h + create mode 100644 libavcodec/v4l2_request_hevc.c + create mode 100644 libavcodec/v4l2_request_hevc.h + +diff --git a/configure b/configure +index c8674a7dad..b64a6cf822 100755 +--- a/configure ++++ b/configure +@@ -281,6 +281,7 @@ External library support: + if openssl, gnutls or mbedtls is not used [no] + --enable-libtwolame enable MP2 encoding via libtwolame [no] + --enable-libuavs3d enable AVS3 decoding via libuavs3d [no] ++ --enable-libudev enable libudev [no] + --enable-libv4l2 enable libv4l2/v4l-utils [no] + --enable-libvidstab enable video stabilization using vid.stab [no] + --enable-libvmaf enable vmaf filter via libvmaf [no] +@@ -350,6 +351,7 @@ External library support: + --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no] + --enable-rkmpp enable Rockchip Media Process Platform code [no] + --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect] ++ --enable-v4l2-request enable V4L2 request API code [no] + --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect] + --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] + --disable-videotoolbox disable VideoToolbox code [autodetect] +@@ -1870,6 +1872,7 @@ EXTERNAL_LIBRARY_LIST=" + libtheora + libtwolame + libuavs3d ++ libudev + libv4l2 + libvmaf + libvorbis +@@ -1925,6 +1928,7 @@ HWACCEL_LIBRARY_LIST=" + mmal + omx + opencl ++ v4l2_request + " + + DOCUMENT_LIST=" +@@ -3014,6 +3018,7 @@ 
d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext" + dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" + ffnvcodec_deps_any="libdl LoadLibrary" + nvdec_deps="ffnvcodec" ++v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev" + vaapi_x11_deps="xlib_x11" + videotoolbox_hwaccel_deps="videotoolbox pthreads" + videotoolbox_hwaccel_extralibs="-framework QuartzCore" +@@ -3057,6 +3062,8 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" + hevc_dxva2_hwaccel_select="hevc_decoder" + hevc_nvdec_hwaccel_deps="nvdec" + hevc_nvdec_hwaccel_select="hevc_decoder" ++hevc_v4l2request_hwaccel_deps="v4l2_request" ++hevc_v4l2request_hwaccel_select="hevc_decoder" + hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" + hevc_vaapi_hwaccel_select="hevc_decoder" + hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" +@@ -6638,6 +6645,7 @@ enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame + { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || + die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; } + enabled libuavs3d && require_pkg_config libuavs3d "uavs3d >= 1.1.41" uavs3d.h uavs3d_decode ++enabled libudev && require_pkg_config libudev libudev libudev.h udev_new + enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl + enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit + enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 2.0.0" libvmaf.h vmaf_init +@@ -6739,6 +6747,10 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r + { enabled libdrm || + die "ERROR: rkmpp requires --enable-libdrm"; } + } ++enabled v4l2_request && { enabled libdrm || ++ die "ERROR: v4l2-request requires --enable-libdrm"; } && ++ { enabled libudev || ++ die "ERROR: v4l2-request requires --enable-libudev"; } + enabled vapoursynth && require_pkg_config vapoursynth 
"vapoursynth-script >= 42" VSScript.h vsscript_init + + +@@ -6821,6 +6833,8 @@ if enabled v4l2_m2m; then + check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" + fi + ++check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns ++check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" + check_headers sys/videoio.h + test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 457ec58377..df7659d0b8 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -162,6 +162,8 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o + OBJS-$(CONFIG_VP56DSP) += vp56dsp.o + OBJS-$(CONFIG_VP8DSP) += vp8dsp.o + OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o ++OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\ ++ v4l2_req_devscan.o weak_link.o + OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o + OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o + +@@ -972,6 +974,8 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o + OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o ++OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ ++ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o + OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o + OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o + OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o +diff --git a/libavcodec/hevc-ctrls-v1.h b/libavcodec/hevc-ctrls-v1.h +new file mode 100644 +index 0000000000..72cbba0953 +--- /dev/null ++++ b/libavcodec/hevc-ctrls-v1.h +@@ -0,0 +1,229 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the HEVC state controls for use with stateless HEVC ++ * codec drivers. 
++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _HEVC_CTRLS_H_ ++#define _HEVC_CTRLS_H_ ++ ++#include ++ ++/* The pixel format isn't stable at the moment and will likely be renamed. */ ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++ ++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008) ++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015) ++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 ++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 ++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 ++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 ++ ++enum v4l2_mpeg_video_hevc_decode_mode { ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_mpeg_video_hevc_start_code { ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) 
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/* The controls are not stable at the moment and will likely be reworked. */ ++struct v4l2_ctrl_hevc_sps { ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ __u8 sps_max_sub_layers_minus1; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define 
V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++ ++struct v4l2_ctrl_hevc_pps { ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ ++ __u8 num_extra_slice_header_bits; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ ++ __u8 padding[4]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 ++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 rps; ++ __u8 field_pic; ++ __u16 pic_order_cnt[2]; ++ __u8 padding[2]; ++}; ++ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 padding[6]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; 
++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __u16 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ ++ __u8 num_active_dpb_entries; ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ __u8 num_rps_poc_st_curr_before; ++ __u8 num_rps_poc_st_curr_after; ++ __u8 num_rps_poc_lt_curr; ++ ++ __u8 padding; ++ ++ __u32 entry_point_offset_minus1[256]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u64 flags; ++}; ++ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++#endif +diff --git a/libavcodec/hevc-ctrls-v2.h b/libavcodec/hevc-ctrls-v2.h +new file mode 100644 +index 0000000000..7cbbbf055f +--- /dev/null ++++ b/libavcodec/hevc-ctrls-v2.h +@@ -0,0 +1,257 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the HEVC state controls for use with stateless HEVC ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _HEVC_CTRLS_H_ ++#define _HEVC_CTRLS_H_ ++ ++#include ++ ++/* The pixel format isn't stable at the moment and will likely be renamed. 
*/ ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++ ++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) ++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) ++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 ++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 ++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 ++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 ++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 ++ ++enum v4l2_mpeg_video_hevc_decode_mode { ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_mpeg_video_hevc_start_code { ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) ++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/* The controls are not stable at the moment and will likely be reworked. 
*/ ++struct v4l2_ctrl_hevc_sps { ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ __u8 sps_max_sub_layers_minus1; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define 
V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) ++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) ++ ++struct v4l2_ctrl_hevc_pps { ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ ++ __u8 num_extra_slice_header_bits; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ ++ __u8 padding[4]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 ++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 rps; ++ __u8 field_pic; ++ __u16 pic_order_cnt[2]; ++ __u8 padding[2]; ++}; ++ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 
padding[6]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __u16 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ __u8 padding[5]; ++ ++ __u32 entry_point_offset_minus1[256]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 ++ ++struct v4l2_ctrl_hevc_decode_params { ++ __s32 pic_order_cnt_val; ++ __u8 num_active_dpb_entries; ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 num_poc_st_curr_before; ++ __u8 num_poc_st_curr_after; ++ __u8 num_poc_lt_curr; ++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u64 flags; ++}; ++ ++/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ ++#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) ++/* ++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - ++ * the number of data (in bits) to skip in the ++ * slice segment header. ++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" ++ * to before syntax element "slice_temporal_mvp_enabled_flag". ++ * If IDR, the skipped bits are just "pic_output_flag" ++ * (separate_colour_plane_flag is not supported). 
++ */ ++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) ++ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++#endif +diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c +index ec3dd3cfa7..a2c43b888b 100644 +--- a/libavcodec/hevcdec.c ++++ b/libavcodec/hevcdec.c +@@ -415,6 +415,7 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ + CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ + CONFIG_HEVC_NVDEC_HWACCEL + \ ++ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \ + CONFIG_HEVC_VAAPI_HWACCEL + \ + CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ + CONFIG_HEVC_VDPAU_HWACCEL) +@@ -441,6 +442,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #endif + #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + case AV_PIX_FMT_YUV420P10: +@@ -462,6 +466,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #endif + #if CONFIG_HEVC_NVDEC_HWACCEL + *fmt++ = AV_PIX_FMT_CUDA; ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + case AV_PIX_FMT_YUV444P: +@@ -3915,6 +3922,9 @@ const FFCodec ff_hevc_decoder = { + #if CONFIG_HEVC_NVDEC_HWACCEL + HWACCEL_NVDEC(hevc), + #endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(hevc), ++#endif + #if CONFIG_HEVC_VAAPI_HWACCEL + HWACCEL_VAAPI(hevc), + #endif +diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h +index aca55831f3..f32d1c4ec4 100644 +--- a/libavcodec/hwaccels.h ++++ b/libavcodec/hwaccels.h +@@ -40,6 +40,7 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel; + extern const AVHWAccel ff_hevc_d3d11va2_hwaccel; 
+ extern const AVHWAccel ff_hevc_dxva2_hwaccel; + extern const AVHWAccel ff_hevc_nvdec_hwaccel; ++extern const AVHWAccel ff_hevc_v4l2request_hwaccel; + extern const AVHWAccel ff_hevc_vaapi_hwaccel; + extern const AVHWAccel ff_hevc_vdpau_hwaccel; + extern const AVHWAccel ff_hevc_videotoolbox_hwaccel; +diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h +index c43ad55245..b8aa383071 100644 +--- a/libavcodec/hwconfig.h ++++ b/libavcodec/hwconfig.h +@@ -71,6 +71,8 @@ typedef struct AVCodecHWConfigInternal { + HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel) + #define HWACCEL_NVDEC(codec) \ + HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel) ++#define HWACCEL_V4L2REQUEST(codec) \ ++ HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel) + #define HWACCEL_VAAPI(codec) \ + HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel) + #define HWACCEL_VDPAU(codec) \ +diff --git a/libavcodec/v4l2_req_decode_q.c b/libavcodec/v4l2_req_decode_q.c +new file mode 100644 +index 0000000000..5b3fb958fa +--- /dev/null ++++ b/libavcodec/v4l2_req_decode_q.c +@@ -0,0 +1,84 @@ ++#include ++#include ++#include ++ ++#include "v4l2_req_decode_q.h" ++ ++int decode_q_in_q(const req_decode_ent * const d) ++{ ++ return d->in_q; ++} ++ ++void decode_q_add(req_decode_q * const q, req_decode_ent * const d) ++{ ++ pthread_mutex_lock(&q->q_lock); ++ if (!q->head) { ++ q->head = d; ++ q->tail = d; ++ d->prev = NULL; ++ } ++ else { ++ q->tail->next = d; ++ d->prev = q->tail; ++ q->tail = d; ++ } ++ d->next = NULL; ++ d->in_q = 1; ++ pthread_mutex_unlock(&q->q_lock); ++} ++ ++// Remove entry from Q - if head wake-up anything that was waiting ++void decode_q_remove(req_decode_q * const q, req_decode_ent * const d) ++{ ++ int try_signal = 0; ++ ++ if (!d->in_q) ++ return; ++ ++ pthread_mutex_lock(&q->q_lock); ++ if (d->prev) ++ d->prev->next = d->next; ++ else { ++ try_signal = 1; // Only need to signal if we 
were head ++ q->head = d->next; ++ } ++ ++ if (d->next) ++ d->next->prev = d->prev; ++ else ++ q->tail = d->prev; ++ ++ // Not strictly needed but makes debug easier ++ d->next = NULL; ++ d->prev = NULL; ++ d->in_q = 0; ++ pthread_mutex_unlock(&q->q_lock); ++ ++ if (try_signal) ++ pthread_cond_broadcast(&q->q_cond); ++} ++ ++void decode_q_wait(req_decode_q * const q, req_decode_ent * const d) ++{ ++ pthread_mutex_lock(&q->q_lock); ++ ++ while (q->head != d) ++ pthread_cond_wait(&q->q_cond, &q->q_lock); ++ ++ pthread_mutex_unlock(&q->q_lock); ++} ++ ++void decode_q_uninit(req_decode_q * const q) ++{ ++ pthread_mutex_destroy(&q->q_lock); ++ pthread_cond_destroy(&q->q_cond); ++} ++ ++void decode_q_init(req_decode_q * const q) ++{ ++ memset(q, 0, sizeof(*q)); ++ pthread_mutex_init(&q->q_lock, NULL); ++ pthread_cond_init(&q->q_cond, NULL); ++} ++ ++ +diff --git a/libavcodec/v4l2_req_decode_q.h b/libavcodec/v4l2_req_decode_q.h +new file mode 100644 +index 0000000000..af7bbe1de4 +--- /dev/null ++++ b/libavcodec/v4l2_req_decode_q.h +@@ -0,0 +1,25 @@ ++#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H ++#define AVCODEC_V4L2_REQ_DECODE_Q_H ++ ++typedef struct req_decode_ent { ++ struct req_decode_ent * next; ++ struct req_decode_ent * prev; ++ int in_q; ++} req_decode_ent; ++ ++typedef struct req_decode_q { ++ pthread_mutex_t q_lock; ++ pthread_cond_t q_cond; ++ req_decode_ent * head; ++ req_decode_ent * tail; ++} req_decode_q; ++ ++int decode_q_in_q(const req_decode_ent * const d); ++void decode_q_add(req_decode_q * const q, req_decode_ent * const d); ++void decode_q_remove(req_decode_q * const q, req_decode_ent * const d); ++void decode_q_wait(req_decode_q * const q, req_decode_ent * const d); ++void decode_q_uninit(req_decode_q * const q); ++void decode_q_init(req_decode_q * const q); ++ ++#endif ++ +diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c +new file mode 100644 +index 0000000000..cfa94d55c4 +--- /dev/null ++++ b/libavcodec/v4l2_req_devscan.c +@@ -0,0 
+1,449 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++ ++#include "v4l2_req_devscan.h" ++#include "v4l2_req_utils.h" ++ ++struct decdev { ++ enum v4l2_buf_type src_type; ++ uint32_t src_fmt_v4l2; ++ const char * vname; ++ const char * mname; ++}; ++ ++struct devscan { ++ struct decdev env; ++ unsigned int dev_size; ++ unsigned int dev_count; ++ struct decdev *devs; ++}; ++ ++static int video_src_pixfmt_supported(uint32_t fmt) ++{ ++ return 1; ++} ++ ++static void v4l2_setup_format(struct v4l2_format *format, unsigned int type, ++ unsigned int width, unsigned int height, ++ unsigned int pixelformat) ++{ ++ unsigned int sizeimage; ++ ++ memset(format, 0, sizeof(*format)); ++ format->type = type; ++ ++ sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(type)) { ++ format->fmt.pix_mp.width = width; ++ format->fmt.pix_mp.height = height; ++ format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage; ++ format->fmt.pix_mp.pixelformat = pixelformat; ++ } else { ++ format->fmt.pix.width = width; ++ format->fmt.pix.height = height; ++ format->fmt.pix.sizeimage = sizeimage; ++ format->fmt.pix.pixelformat = pixelformat; ++ } ++} ++ ++static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat, ++ unsigned int width, unsigned int height) ++{ ++ struct v4l2_format format; ++ ++ v4l2_setup_format(&format, type, width, height, pixelformat); ++ ++ return ioctl(video_fd, VIDIOC_S_FMT, &format) ? 
-errno : 0; ++} ++ ++static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities) ++{ ++ struct v4l2_capability capability = { 0 }; ++ int rc; ++ ++ rc = ioctl(video_fd, VIDIOC_QUERYCAP, &capability); ++ if (rc < 0) ++ return -errno; ++ ++ if (capabilities != NULL) { ++ if ((capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0) ++ *capabilities = capability.device_caps; ++ else ++ *capabilities = capability.capabilities; ++ } ++ ++ return 0; ++} ++ ++static int devscan_add(struct devscan *const scan, ++ enum v4l2_buf_type src_type, ++ uint32_t src_fmt_v4l2, ++ const char * vname, ++ const char * mname) ++{ ++ struct decdev *d; ++ ++ if (scan->dev_size <= scan->dev_count) { ++ unsigned int n = !scan->dev_size ? 4 : scan->dev_size * 2; ++ d = realloc(scan->devs, n * sizeof(*d)); ++ if (!d) ++ return -ENOMEM; ++ scan->devs = d; ++ scan->dev_size = n; ++ } ++ ++ d = scan->devs + scan->dev_count; ++ d->src_type = src_type; ++ d->src_fmt_v4l2 = src_fmt_v4l2; ++ d->vname = strdup(vname); ++ if (!d->vname) ++ return -ENOMEM; ++ d->mname = strdup(mname); ++ if (!d->mname) { ++ free((char *)d->vname); ++ return -ENOMEM; ++ } ++ ++scan->dev_count; ++ return 0; ++} ++ ++void devscan_delete(struct devscan **const pScan) ++{ ++ unsigned int i; ++ struct devscan * const scan = *pScan; ++ ++ if (!scan) ++ return; ++ *pScan = NULL; ++ ++ for (i = 0; i < scan->dev_count; ++i) { ++ free((char*)scan->devs[i].mname); ++ free((char*)scan->devs[i].vname); ++ } ++ free(scan->devs); ++ free(scan); ++} ++ ++#define REQ_BUF_CAPS (\ ++ V4L2_BUF_CAP_SUPPORTS_DMABUF |\ ++ V4L2_BUF_CAP_SUPPORTS_REQUESTS |\ ++ V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) ++ ++static void probe_formats(void * const dc, ++ struct devscan *const scan, ++ const int fd, ++ const unsigned int type_v4l2, ++ const char *const mpath, ++ const char *const vpath) ++{ ++ unsigned int i; ++ for (i = 0;; ++i) { ++ struct v4l2_fmtdesc fmtdesc = { ++ .index = i, ++ .type = type_v4l2 ++ }; ++ struct 
v4l2_requestbuffers rbufs = { ++ .count = 0, ++ .type = type_v4l2, ++ .memory = V4L2_MEMORY_MMAP ++ }; ++ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) { ++ if (errno == EINTR) ++ continue; ++ if (errno != EINVAL) ++ request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2); ++ return; ++ } ++ if (!video_src_pixfmt_supported(fmtdesc.pixelformat)) ++ continue; ++ ++ if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) { ++ request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat); ++ continue; ++ } ++ ++ while (ioctl(fd, VIDIOC_REQBUFS, &rbufs)) { ++ if (errno != EINTR) { ++ request_debug(dc, "%s: Reqbufs failed\n", vpath); ++ continue; ++ } ++ } ++ ++ if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) { ++ request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities); ++ continue; ++ } ++ ++ request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n", ++ mpath, vpath, fmtdesc.pixelformat, type_v4l2); ++ devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath); ++ } ++} ++ ++ ++static int probe_video_device(void * const dc, ++ struct udev_device *const device, ++ struct devscan *const scan, ++ const char *const mpath) ++{ ++ int ret; ++ unsigned int capabilities = 0; ++ int video_fd = -1; ++ ++ const char *path = udev_device_get_devnode(device); ++ if (!path) { ++ request_err(dc, "%s: get video device devnode failed\n", __func__); ++ ret = -EINVAL; ++ goto fail; ++ } ++ ++ video_fd = open(path, O_RDWR, 0); ++ if (video_fd == -1) { ++ ret = -errno; ++ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); ++ goto fail; ++ } ++ ++ ret = v4l2_query_capabilities(video_fd, &capabilities); ++ if (ret < 0) { ++ request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret); ++ goto fail; ++ } ++ ++ request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities); ++ ++ if (!(capabilities & V4L2_CAP_STREAMING)) 
{ ++ request_debug(dc, "%s: missing required streaming capability\n", __func__); ++ ret = -EINVAL; ++ goto fail; ++ } ++ ++ if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) { ++ request_debug(dc, "%s: missing required mem2mem capability\n", __func__); ++ ret = -EINVAL; ++ goto fail; ++ } ++ ++ /* Should check capture formats too... */ ++ if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0) ++ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path); ++ if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) ++ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path); ++ ++ close(video_fd); ++ return 0; ++ ++fail: ++ if (video_fd >= 0) ++ close(video_fd); ++ return ret; ++} ++ ++static int probe_media_device(void * const dc, ++ struct udev_device *const device, ++ struct devscan *const scan) ++{ ++ int ret; ++ int rv; ++ struct media_device_info device_info = { 0 }; ++ struct media_v2_topology topology = { 0 }; ++ struct media_v2_interface *interfaces = NULL; ++ struct udev *udev = udev_device_get_udev(device); ++ struct udev_device *video_device; ++ dev_t devnum; ++ int media_fd = -1; ++ ++ const char *path = udev_device_get_devnode(device); ++ if (!path) { ++ request_err(dc, "%s: get media device devnode failed\n", __func__); ++ ret = -EINVAL; ++ goto fail; ++ } ++ ++ media_fd = open(path, O_RDWR, 0); ++ if (media_fd < 0) { ++ ret = -errno; ++ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret); ++ goto fail; ++ } ++ ++ rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info); ++ if (rv < 0) { ++ ret = -errno; ++ request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret); ++ goto fail; ++ } ++ ++ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); ++ if (rv < 0) { ++ ret = -errno; ++ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret); ++ goto fail; ++ } ++ ++ if (topology.num_interfaces <= 0) 
{ ++ request_err(dc, "%s: media device has no interfaces\n", __func__); ++ ret = -EINVAL; ++ goto fail; ++ } ++ ++ interfaces = calloc(topology.num_interfaces, sizeof(*interfaces)); ++ if (!interfaces) { ++ request_err(dc, "%s: allocating media interface struct failed\n", __func__); ++ ret = -ENOMEM; ++ goto fail; ++ } ++ ++ topology.ptr_interfaces = (__u64)(uintptr_t)interfaces; ++ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); ++ if (rv < 0) { ++ ret = -errno; ++ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret); ++ goto fail; ++ } ++ ++ for (int i = 0; i < topology.num_interfaces; i++) { ++ if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO) ++ continue; ++ ++ devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor); ++ video_device = udev_device_new_from_devnum(udev, 'c', devnum); ++ if (!video_device) { ++ ret = -errno; ++ request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device); ++ continue; ++ } ++ ++ ret = probe_video_device(dc, video_device, scan, path); ++ udev_device_unref(video_device); ++ ++ if (ret != 0) ++ goto fail; ++ } ++ ++fail: ++ free(interfaces); ++ if (media_fd != -1) ++ close(media_fd); ++ return ret; ++} ++ ++const char *decdev_media_path(const struct decdev *const dev) ++{ ++ return !dev ? NULL : dev->mname; ++} ++ ++const char *decdev_video_path(const struct decdev *const dev) ++{ ++ return !dev ? NULL : dev->vname; ++} ++ ++enum v4l2_buf_type decdev_src_type(const struct decdev *const dev) ++{ ++ return !dev ? 0 : dev->src_type; ++} ++ ++uint32_t decdev_src_pixelformat(const struct decdev *const dev) ++{ ++ return !dev ? 0 : dev->src_fmt_v4l2; ++} ++ ++ ++const struct decdev *devscan_find(struct devscan *const scan, ++ const uint32_t src_fmt_v4l2) ++{ ++ unsigned int i; ++ ++ if (scan->env.mname && scan->env.vname) ++ return &scan->env; ++ ++ if (!src_fmt_v4l2) ++ return scan->dev_count ? 
scan->devs + 0 : NULL; ++ ++ for (i = 0; i != scan->dev_count; ++i) { ++ if (scan->devs[i].src_fmt_v4l2 == src_fmt_v4l2) ++ return scan->devs + i; ++ } ++ return NULL; ++} ++ ++int devscan_build(void * const dc, struct devscan **pscan) ++{ ++ int ret; ++ struct udev *udev; ++ struct udev_enumerate *enumerate; ++ struct udev_list_entry *devices; ++ struct udev_list_entry *entry; ++ struct udev_device *device; ++ struct devscan * scan; ++ ++ *pscan = NULL; ++ ++ scan = calloc(1, sizeof(*scan)); ++ if (!scan) { ++ ret = -ENOMEM; ++ goto fail; ++ } ++ ++ scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH"); ++ scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH"); ++ if (scan->env.mname && scan->env.vname) { ++ request_info(dc, "Media/video device env overrides found: %s,%s\n", ++ scan->env.mname, scan->env.vname); ++ *pscan = scan; ++ return 0; ++ } ++ ++ udev = udev_new(); ++ if (!udev) { ++ request_err(dc, "%s: allocating udev context failed\n", __func__); ++ ret = -ENOMEM; ++ goto fail; ++ } ++ ++ enumerate = udev_enumerate_new(udev); ++ if (!enumerate) { ++ request_err(dc, "%s: allocating udev enumerator failed\n", __func__); ++ ret = -ENOMEM; ++ goto fail; ++ } ++ ++ udev_enumerate_add_match_subsystem(enumerate, "media"); ++ udev_enumerate_scan_devices(enumerate); ++ ++ devices = udev_enumerate_get_list_entry(enumerate); ++ udev_list_entry_foreach(entry, devices) { ++ const char *path = udev_list_entry_get_name(entry); ++ if (!path) ++ continue; ++ ++ device = udev_device_new_from_syspath(udev, path); ++ if (!device) ++ continue; ++ ++ probe_media_device(dc, device, scan); ++ udev_device_unref(device); ++ } ++ ++ udev_enumerate_unref(enumerate); ++ ++ *pscan = scan; ++ return 0; ++ ++fail: ++ udev_unref(udev); ++ devscan_delete(&scan); ++ return ret; ++} ++ +diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h +new file mode 100644 +index 0000000000..956d9234f1 +--- /dev/null ++++ b/libavcodec/v4l2_req_devscan.h +@@ -0,0 +1,23 @@ 
++#ifndef _DEVSCAN_H_ ++#define _DEVSCAN_H_ ++ ++#include ++ ++struct devscan; ++struct decdev; ++enum v4l2_buf_type; ++ ++/* These return pointers to data in the devscan structure and so are valid ++ * for the lifetime of that ++ */ ++const char *decdev_media_path(const struct decdev *const dev); ++const char *decdev_video_path(const struct decdev *const dev); ++enum v4l2_buf_type decdev_src_type(const struct decdev *const dev); ++uint32_t decdev_src_pixelformat(const struct decdev *const dev); ++ ++const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2); ++ ++int devscan_build(void * const dc, struct devscan **pscan); ++void devscan_delete(struct devscan **const pScan); ++ ++#endif +diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c +new file mode 100644 +index 0000000000..ae6c648369 +--- /dev/null ++++ b/libavcodec/v4l2_req_dmabufs.c +@@ -0,0 +1,266 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "v4l2_req_dmabufs.h" ++#include "v4l2_req_utils.h" ++ ++#define DMABUF_NAME1 "/dev/dma_heap/linux,cma" ++#define DMABUF_NAME2 "/dev/dma_heap/reserved" ++ ++#define TRACE_ALLOC 0 ++ ++struct dmabufs_ctl { ++ int fd; ++ size_t page_size; ++}; ++ ++struct dmabuf_h { ++ int fd; ++ size_t size; ++ size_t len; ++ void * mapptr; ++}; ++ ++#if TRACE_ALLOC ++static unsigned int total_bufs = 0; ++static size_t total_size = 0; ++#endif ++ ++struct dmabuf_h * dmabuf_import(int fd, size_t size) ++{ ++ struct dmabuf_h *dh; ++ ++ fd = dup(fd); ++ if (fd < 0 || size == 0) ++ return NULL; ++ ++ dh = malloc(sizeof(*dh)); ++ if (!dh) { ++ close(fd); ++ return NULL; ++ } ++ ++ *dh = (struct dmabuf_h) { ++ .fd = fd, ++ .size = size, ++ .mapptr = MAP_FAILED ++ }; ++ ++#if TRACE_ALLOC ++ ++total_bufs; ++ total_size += dh->size; ++ request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); ++#endif
++ ++ return dh; ++} ++ ++struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size) ++{ ++ struct dmabuf_h * dh; ++ struct dma_heap_allocation_data data = { ++ .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1), ++ .fd = 0, ++ .fd_flags = O_RDWR, ++ .heap_flags = 0 ++ }; ++ ++ if (old != NULL) { ++ if (old->size == data.len) { ++ return old; ++ } ++ dmabuf_free(old); ++ } ++ ++ if (size == 0 || ++ (dh = malloc(sizeof(*dh))) == NULL) ++ return NULL; ++ ++ while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) { ++ int err = errno; ++ request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n", ++ (uint64_t)data.len, ++ dbsc->fd, ++ err, ++ strerror(err)); ++ if (err == EINTR) ++ continue; ++ goto fail; ++ } ++ ++ *dh = (struct dmabuf_h){ ++ .fd = data.fd, ++ .size = (size_t)data.len, ++ .mapptr = MAP_FAILED ++ }; ++ ++#if TRACE_ALLOC ++ ++total_bufs; ++ total_size += dh->size; ++ request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); ++#endif ++ ++ return dh; ++ ++fail: ++ free(dh); ++ return NULL; ++} ++ ++int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags) ++{ ++ struct dma_buf_sync sync = { ++ .flags = flags ++ }; ++ while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) { ++ const int err = errno; ++ if (errno == EINTR) ++ continue; ++ request_log("%s: ioctl failed: flags=%#x\n", __func__, flags); ++ return -err; ++ } ++ return 0; ++} ++ ++int dmabuf_write_start(struct dmabuf_h * const dh) ++{ ++ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE); ++} ++ ++int dmabuf_write_end(struct dmabuf_h * const dh) ++{ ++ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE); ++} ++ ++int dmabuf_read_start(struct dmabuf_h * const dh) ++{ ++ if (!dmabuf_map(dh)) ++ return -1; ++ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ); ++} ++ ++int dmabuf_read_end(struct dmabuf_h * const dh) ++{ ++ return dmabuf_sync(dh, 
DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ); ++} ++ ++ ++void * dmabuf_map(struct dmabuf_h * const dh) ++{ ++ if (!dh) ++ return NULL; ++ if (dh->mapptr != MAP_FAILED) ++ return dh->mapptr; ++ dh->mapptr = mmap(NULL, dh->size, ++ PROT_READ | PROT_WRITE, ++ MAP_SHARED | MAP_POPULATE, ++ dh->fd, 0); ++ if (dh->mapptr == MAP_FAILED) { ++ request_log("%s: Map failed\n", __func__); ++ return NULL; ++ } ++ return dh->mapptr; ++} ++ ++int dmabuf_fd(const struct dmabuf_h * const dh) ++{ ++ if (!dh) ++ return -1; ++ return dh->fd; ++} ++ ++size_t dmabuf_size(const struct dmabuf_h * const dh) ++{ ++ if (!dh) ++ return 0; ++ return dh->size; ++} ++ ++size_t dmabuf_len(const struct dmabuf_h * const dh) ++{ ++ if (!dh) ++ return 0; ++ return dh->len; ++} ++ ++void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len) ++{ ++ dh->len = len; ++} ++ ++ ++ ++void dmabuf_free(struct dmabuf_h * dh) ++{ ++ if (!dh) ++ return; ++ ++#if TRACE_ALLOC ++ --total_bufs; ++ total_size -= dh->size; ++ request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); ++#endif ++ ++ if (dh->mapptr != MAP_FAILED) ++ munmap(dh->mapptr, dh->size); ++ while (close(dh->fd) == -1 && errno == EINTR) ++ /* loop */; ++ free(dh); ++} ++ ++struct dmabufs_ctl * dmabufs_ctl_new(void) ++{ ++ struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc)); ++ ++ if (!dbsc) ++ return NULL; ++ ++ while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 && ++ errno == EINTR) ++ /* Loop */; ++ ++ if (dbsc->fd == -1) { ++ while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 && ++ errno == EINTR) ++ /* Loop */; ++ if (dbsc->fd == -1) { ++ request_log("Unable to open either %s or %s\n", ++ DMABUF_NAME1, DMABUF_NAME2); ++ goto fail; ++ } ++ } ++ ++ dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE); ++ ++ return dbsc; ++ ++fail: ++ free(dbsc); ++ return NULL; ++} ++ ++void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) ++{ ++ struct dmabufs_ctl * const dbsc = *pDbsc; ++ ++ if (!dbsc) ++ return; ++ 
*pDbsc = NULL; ++ ++ while (close(dbsc->fd) == -1 && errno == EINTR) ++ /* loop */; ++ ++ free(dbsc); ++} ++ ++ +diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h +new file mode 100644 +index 0000000000..cfb17e801d +--- /dev/null ++++ b/libavcodec/v4l2_req_dmabufs.h +@@ -0,0 +1,40 @@ ++#ifndef DMABUFS_H ++#define DMABUFS_H ++ ++#include ++ ++struct dmabufs_ctl; ++struct dmabuf_h; ++ ++struct dmabufs_ctl * dmabufs_ctl_new(void); ++void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc); ++ ++// Need not preserve old contents ++// On NULL return old buffer is freed ++struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size); ++ ++static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) { ++ return dmabuf_realloc(dbsc, NULL, size); ++} ++/* Create from existing fd - dups(fd) */ ++struct dmabuf_h * dmabuf_import(int fd, size_t size); ++void * dmabuf_map(struct dmabuf_h * const dh); ++ ++/* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */ ++int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags); ++ ++int dmabuf_write_start(struct dmabuf_h * const dh); ++int dmabuf_write_end(struct dmabuf_h * const dh); ++int dmabuf_read_start(struct dmabuf_h * const dh); ++int dmabuf_read_end(struct dmabuf_h * const dh); ++ ++int dmabuf_fd(const struct dmabuf_h * const dh); ++/* Allocated size */ ++size_t dmabuf_size(const struct dmabuf_h * const dh); ++/* Bytes in use */ ++size_t dmabuf_len(const struct dmabuf_h * const dh); ++/* Set bytes in use */ ++void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len); ++void dmabuf_free(struct dmabuf_h * dh); ++ ++#endif +diff --git a/libavcodec/v4l2_req_hevc_v1.c b/libavcodec/v4l2_req_hevc_v1.c +new file mode 100644 +index 0000000000..169b532832 +--- /dev/null ++++ b/libavcodec/v4l2_req_hevc_v1.c +@@ -0,0 +1,3 @@ ++#define HEVC_CTRLS_VERSION 1 ++#include "v4l2_req_hevc_vx.c" ++ +diff --git a/libavcodec/v4l2_req_hevc_v2.c 
b/libavcodec/v4l2_req_hevc_v2.c +new file mode 100644 +index 0000000000..42af98e156 +--- /dev/null ++++ b/libavcodec/v4l2_req_hevc_v2.c +@@ -0,0 +1,3 @@ ++#define HEVC_CTRLS_VERSION 2 ++#include "v4l2_req_hevc_vx.c" ++ +diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c +new file mode 100644 +index 0000000000..0ae03b10c4 +--- /dev/null ++++ b/libavcodec/v4l2_req_hevc_vx.c +@@ -0,0 +1,1213 @@ ++// File included by v4l2_req_hevc_v* - not compiled on its own ++ ++#include "decode.h" ++#include "hevcdec.h" ++#include "hwconfig.h" ++#include "internal.h" ++#include "thread.h" ++ ++#include "v4l2_request_hevc.h" ++ ++#if HEVC_CTRLS_VERSION == 1 ++#include "hevc-ctrls-v1.h" ++ ++// Fixup renamed entries ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT ++ ++#elif HEVC_CTRLS_VERSION == 2 ++#include "hevc-ctrls-v2.h" ++#else ++#error Unknown HEVC_CTRLS_VERSION ++#endif ++ ++#include "libavutil/hwcontext_drm.h" ++ ++#include ++#include ++ ++#include "v4l2_req_devscan.h" ++#include "v4l2_req_dmabufs.h" ++#include "v4l2_req_pollqueue.h" ++#include "v4l2_req_media.h" ++#include "v4l2_req_utils.h" ++ ++// Attached to buf[0] in frame ++// Pooled in hwcontext so generally create once - 1/frame ++typedef struct V4L2MediaReqDescriptor { ++ AVDRMFrameDescriptor drm; ++ ++ // Media ++ uint64_t timestamp; ++ struct qent_dst * qe_dst; ++ ++ // Decode only - should be NULL by the time we emit the frame ++ struct req_decode_ent decode_ent; ++ ++ struct media_request *req; ++ struct qent_src *qe_src; ++ ++#if HEVC_CTRLS_VERSION >= 2 ++ struct v4l2_ctrl_hevc_decode_params dec; ++#endif ++ ++ size_t num_slices; ++ size_t alloced_slices; ++ struct v4l2_ctrl_hevc_slice_params * slice_params; ++ struct slice_info * slices; ++ ++} V4L2MediaReqDescriptor; ++ ++struct slice_info { ++ const uint8_t * ptr; ++ size_t len; // bytes ++}; ++ ++// Handy container for accumulating controls before setting ++struct req_controls { 
++ int has_scaling; ++ struct timeval tv; ++ struct v4l2_ctrl_hevc_sps sps; ++ struct v4l2_ctrl_hevc_pps pps; ++ struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix; ++}; ++ ++//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 }; ++ ++ ++// Get an FFmpeg format from the v4l2 format ++static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format) ++{ ++ switch (V4L2_TYPE_IS_MULTIPLANAR(format->type) ? ++ format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat) { ++ case V4L2_PIX_FMT_YUV420: ++ return AV_PIX_FMT_YUV420P; ++ case V4L2_PIX_FMT_NV12: ++ return AV_PIX_FMT_NV12; ++#if CONFIG_SAND ++ case V4L2_PIX_FMT_NV12_COL128: ++ return AV_PIX_FMT_RPI4_8; ++ case V4L2_PIX_FMT_NV12_10_COL128: ++ return AV_PIX_FMT_RPI4_10; ++#endif ++ default: ++ break; ++ } ++ return AV_PIX_FMT_NONE; ++} ++ ++static inline uint64_t frame_capture_dpb(const AVFrame * const frame) ++{ ++ const V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0]; ++ return rd->timestamp; ++} ++ ++static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp) ++{ ++ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0]; ++ rd->timestamp = dpb_stamp; ++} ++ ++static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table) ++{ ++ int32_t luma_weight_denom, chroma_weight_denom; ++ const SliceHeader *sh = &h->sh; ++ ++ if (sh->slice_type == HEVC_SLICE_I || ++ (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) || ++ (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag)) ++ return; ++ ++ table->luma_log2_weight_denom = sh->luma_log2_weight_denom; ++ ++ if (h->ps.sps->chroma_format_idc) ++ table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom; ++ ++ luma_weight_denom = (1 << sh->luma_log2_weight_denom); ++ chroma_weight_denom = (1 << sh->chroma_log2_weight_denom); ++ ++ for (int i = 0; i < 15 && i 
< sh->nb_refs[L0]; i++) { ++ table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom; ++ table->luma_offset_l0[i] = sh->luma_offset_l0[i]; ++ table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom; ++ table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom; ++ table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0]; ++ table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1]; ++ } ++ ++ if (sh->slice_type != HEVC_SLICE_B) ++ return; ++ ++ for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) { ++ table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom; ++ table->luma_offset_l1[i] = sh->luma_offset_l1[i]; ++ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom; ++ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom; ++ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0]; ++ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1]; ++ } ++} ++ ++static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) ++{ ++ const HEVCFrame *frame; ++ int i; ++ ++ for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) { ++ frame = h->rps[ST_CURR_BEF].ref[i]; ++ if (frame && timestamp == frame_capture_dpb(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE; ++ } ++ ++ for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) { ++ frame = h->rps[ST_CURR_AFT].ref[i]; ++ if (frame && timestamp == frame_capture_dpb(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER; ++ } ++ ++ for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) { ++ frame = h->rps[LT_CURR].ref[i]; ++ if (frame && timestamp == frame_capture_dpb(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR; ++ } ++ ++ return 0; ++} ++ ++static unsigned int ++get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, ++ const struct v4l2_hevc_dpb_entry * const entries, ++ const unsigned int num_entries) ++{ ++ uint64_t 
timestamp; ++ ++ if (!frame) ++ return 0; ++ ++ timestamp = frame_capture_dpb(frame->frame); ++ ++ for (unsigned int i = 0; i < num_entries; i++) { ++ if (entries[i].timestamp == timestamp) ++ return i; ++ } ++ ++ return 0; ++} ++ ++static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx) ++{ ++ unsigned int z = 0; ++ while (idx--) { ++ if (*b++ == 0) { ++ ++z; ++ if (z >= 2 && *b == 3) { ++ ++b; ++ z = 0; ++ } ++ } ++ else { ++ z = 0; ++ } ++ } ++ return b; ++} ++ ++static int slice_add(V4L2MediaReqDescriptor * const rd) ++{ ++ if (rd->num_slices >= rd->alloced_slices) { ++ struct v4l2_ctrl_hevc_slice_params * p2; ++ struct slice_info * s2; ++ size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2; ++ ++ p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2)); ++ if (p2 == NULL) ++ return AVERROR(ENOMEM); ++ rd->slice_params = p2; ++ ++ s2 = av_realloc_array(rd->slices, n2, sizeof(*s2)); ++ if (s2 == NULL) ++ return AVERROR(ENOMEM); ++ rd->slices = s2; ++ ++ rd->alloced_slices = n2; ++ } ++ ++rd->num_slices; ++ return 0; ++} ++ ++static unsigned int ++fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries) ++{ ++ unsigned int i; ++ unsigned int n = 0; ++ const HEVCFrame * const pic = h->ref; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) { ++ const HEVCFrame * const frame = &h->DPB[i]; ++ if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) { ++ struct v4l2_hevc_dpb_entry * const entry = entries + n++; ++ ++ entry->timestamp = frame_capture_dpb(frame->frame); ++ entry->rps = find_frame_rps_type(h, entry->timestamp); ++ entry->field_pic = frame->frame->interlaced_frame; ++ ++ /* TODO: Interleaved: Get the POC for each field. 
*/ ++ entry->pic_order_cnt[0] = frame->poc; ++ entry->pic_order_cnt[1] = frame->poc; ++ } ++ } ++ return n; ++} ++ ++static void fill_slice_params(const HEVCContext * const h, ++#if HEVC_CTRLS_VERSION >= 2 ++ const struct v4l2_ctrl_hevc_decode_params * const dec, ++#endif ++ struct v4l2_ctrl_hevc_slice_params *slice_params, ++ uint32_t bit_size, uint32_t bit_offset) ++{ ++ const SliceHeader * const sh = &h->sh; ++#if HEVC_CTRLS_VERSION >= 2 ++ const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb; ++ const unsigned int dpb_n = dec->num_active_dpb_entries; ++#else ++ struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb; ++ unsigned int dpb_n; ++#endif ++ unsigned int i; ++ RefPicList *rpl; ++ ++ *slice_params = (struct v4l2_ctrl_hevc_slice_params) { ++ .bit_size = bit_size, ++ .data_bit_offset = bit_offset, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ .slice_segment_addr = sh->slice_segment_addr, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ .nal_unit_type = h->nal_unit_type, ++ .nuh_temporal_id_plus1 = h->temporal_id + 1, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ .slice_type = sh->slice_type, ++ .colour_plane_id = sh->colour_plane_id, ++ .slice_pic_order_cnt = h->ref->poc, ++ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0, ++ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0, ++ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0, ++ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 
0 : 5 - sh->max_num_merge_cand, ++ .slice_qp_delta = sh->slice_qp_delta, ++ .slice_cb_qp_offset = sh->slice_cb_qp_offset, ++ .slice_cr_qp_offset = sh->slice_cr_qp_offset, ++ .slice_act_y_qp_offset = 0, ++ .slice_act_cb_qp_offset = 0, ++ .slice_act_cr_qp_offset = 0, ++ .slice_beta_offset_div2 = sh->beta_offset / 2, ++ .slice_tc_offset_div2 = sh->tc_offset / 2, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ .pic_struct = h->sei.picture_timing.picture_struct, ++ ++#if HEVC_CTRLS_VERSION < 2 ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, ++ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, ++ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs, ++#endif ++ }; ++ ++ if (sh->slice_sample_adaptive_offset_flag[0]) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA; ++ ++ if (sh->slice_sample_adaptive_offset_flag[1]) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA; ++ ++ if (sh->slice_temporal_mvp_enabled_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED; ++ ++ if (sh->mvd_l1_zero_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO; ++ ++ if (sh->cabac_init_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT; ++ ++ if (sh->collocated_list == L0) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0; ++ ++ if (sh->disable_deblocking_filter_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED; ++ ++ if (sh->slice_loop_filter_across_slices_enabled_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED; ++ ++ if (sh->dependent_slice_segment_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT; ++ ++#if HEVC_CTRLS_VERSION < 2 ++ dpb_n = fill_dpb_entries(h, dpb); ++ slice_params->num_active_dpb_entries = dpb_n; 
++#endif ++ ++ if (sh->slice_type != HEVC_SLICE_I) { ++ rpl = &h->ref->refPicList[0]; ++ for (i = 0; i < rpl->nb_refs; i++) ++ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n); ++ } ++ ++ if (sh->slice_type == HEVC_SLICE_B) { ++ rpl = &h->ref->refPicList[1]; ++ for (i = 0; i < rpl->nb_refs; i++) ++ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n); ++ } ++ ++ fill_pred_table(h, &slice_params->pred_weight_table); ++ ++ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; ++ if (slice_params->num_entry_point_offsets > 256) { ++ slice_params->num_entry_point_offsets = 256; ++ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); ++ } ++ ++ for (i = 0; i < slice_params->num_entry_point_offsets; i++) ++ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; ++} ++ ++#if HEVC_CTRLS_VERSION >= 2 ++static void ++fill_decode_params(const HEVCContext * const h, ++ struct v4l2_ctrl_hevc_decode_params * const dec) ++{ ++ unsigned int i; ++ ++ *dec = (struct v4l2_ctrl_hevc_decode_params){ ++ .pic_order_cnt_val = h->poc, ++ .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, ++ .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, ++ .num_poc_lt_curr = h->rps[LT_CURR].nb_refs, ++ }; ++ ++ dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb); ++ ++ // The docn does seem to ask that we fit our 32 bit signed POC into ++ // a U8 so... 
(To be fair 16 bits would be enough) ++ // Luckily we (Pi) don't use these fields ++ for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i) ++ dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc; ++ for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i) ++ dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc; ++ for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i) ++ dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc; ++ ++ if (IS_IRAP(h)) ++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC; ++ if (IS_IDR(h)) ++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC; ++ if (h->sh.no_output_of_prior_pics_flag) ++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR; ++ ++} ++#endif ++ ++static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps) ++{ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ ++ *ctrl = (struct v4l2_ctrl_hevc_sps) { ++ .chroma_format_idc = sps->chroma_format_idc, ++ .pic_width_in_luma_samples = sps->width, ++ .pic_height_in_luma_samples = sps->height, ++ .bit_depth_luma_minus8 = sps->bit_depth - 8, ++ .bit_depth_chroma_minus8 = sps->bit_depth - 8, ++ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, ++ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1, ++ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics, ++ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1, ++ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3, ++ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size, ++ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2, ++ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size, ++ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter, ++ .max_transform_hierarchy_depth_intra = 
sps->max_transform_hierarchy_depth_intra, ++ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1, ++ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1, ++ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3, ++ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size, ++ .num_short_term_ref_pic_sets = sps->nb_st_rps, ++ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps, ++ .chroma_format_idc = sps->chroma_format_idc, ++ .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1, ++ }; ++ ++ if (sps->separate_colour_plane_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; ++ ++ if (sps->scaling_list_enable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; ++ ++ if (sps->amp_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED; ++ ++ if (sps->sao_enabled) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; ++ ++ if (sps->pcm_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; ++ ++ if (sps->pcm.loop_filter_disable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; ++ ++ if (sps->long_term_ref_pics_present_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; ++ ++ if (sps->sps_temporal_mvp_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; ++ ++ if (sps->sps_strong_intra_smoothing_enable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; ++} ++ ++static void fill_scaling_matrix(const ScalingList * const sl, ++ struct v4l2_ctrl_hevc_scaling_matrix * const sm) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < 6; i++) { ++ unsigned int j; ++ ++ for (j = 0; j < 16; j++) ++ sm->scaling_list_4x4[i][j] = sl->sl[0][i][j]; ++ for (j = 0; j < 64; j++) { ++ sm->scaling_list_8x8[i][j] = sl->sl[1][i][j]; ++ sm->scaling_list_16x16[i][j] = sl->sl[2][i][j]; ++ if (i < 2) ++ sm->scaling_list_32x32[i][j] = sl->sl[3][i * 3][j]; ++ } ++ 
sm->scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i]; ++ if (i < 2) ++ sm->scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3]; ++ } ++} ++ ++static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps) ++{ ++ uint64_t flags = 0; ++ ++ if (pps->dependent_slice_segments_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED; ++ ++ if (pps->output_flag_present_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT; ++ ++ if (pps->sign_data_hiding_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED; ++ ++ if (pps->cabac_init_present_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT; ++ ++ if (pps->constrained_intra_pred_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED; ++ ++ if (pps->transform_skip_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED; ++ ++ if (pps->cu_qp_delta_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED; ++ ++ if (pps->pic_slice_level_chroma_qp_offsets_present_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT; ++ ++ if (pps->weighted_pred_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED; ++ ++ if (pps->weighted_bipred_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED; ++ ++ if (pps->transquant_bypass_enable_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED; ++ ++ if (pps->tiles_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED; ++ ++ if (pps->entropy_coding_sync_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED; ++ ++ if (pps->loop_filter_across_tiles_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; ++ ++ if (pps->seq_loop_filter_across_slices_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED; ++ ++ if (pps->deblocking_filter_override_enabled_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED; ++ ++ if (pps->disable_dbf) ++ flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER; ++ 
++ if (pps->lists_modification_present_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT; ++ ++ if (pps->slice_header_extension_present_flag) ++ flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ ++ *ctrl = (struct v4l2_ctrl_hevc_pps) { ++ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, ++ .init_qp_minus26 = pps->pic_init_qp_minus26, ++ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth, ++ .pps_cb_qp_offset = pps->cb_qp_offset, ++ .pps_cr_qp_offset = pps->cr_qp_offset, ++ .pps_beta_offset_div2 = pps->beta_offset / 2, ++ .pps_tc_offset_div2 = pps->tc_offset / 2, ++ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2, ++ .flags = flags ++ }; ++ ++ ++ if (pps->tiles_enabled_flag) { ++ ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1; ++ ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1; ++ ++ for (int i = 0; i < pps->num_tile_columns; i++) ++ ctrl->column_width_minus1[i] = pps->column_width[i] - 1; ++ ++ for (int i = 0; i < pps->num_tile_rows; i++) ++ ctrl->row_height_minus1[i] = pps->row_height[i] - 1; ++ } ++} ++ ++// Called before finally returning the frame to the user ++// Set corrupt flag here as this is actually the frame structure that ++// is going to the user (in MT land each thread has its own pool) ++static int frame_post_process(void *logctx, AVFrame *frame) ++{ ++ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)frame->data[0]; ++ ++// av_log(NULL, AV_LOG_INFO, "%s\n", __func__); ++ frame->flags &= ~AV_FRAME_FLAG_CORRUPT; ++ if (rd->qe_dst) { ++ MediaBufsStatus stat = qent_dst_wait(rd->qe_dst); ++ if (stat != MEDIABUFS_STATUS_SUCCESS) { ++ av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__); ++ frame->flags |= AV_FRAME_FLAG_CORRUPT; ++ } ++ } ++ ++ return 0; ++} ++ ++static inline struct timeval cvt_dpb_to_tv(uint64_t t) ++{ ++ t /= 1000; ++ return (struct timeval){ ++ .tv_usec = t % 1000000, 
++ .tv_sec = t / 1000000 ++ }; ++} ++ ++static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t) ++{ ++ return (uint64_t)t * 1000; ++} ++ ++static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0]; ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ ++// av_log(NULL, AV_LOG_INFO, "%s\n", __func__); ++ decode_q_add(&ctx->decode_q, &rd->decode_ent); ++ ++ rd->num_slices = 0; ++ ctx->timestamp++; ++ rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp); ++ ++ { ++ FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data; ++ fdd->post_process = frame_post_process; ++ } ++ ++ // qe_dst needs to be bound to the data buffer and only returned when that is ++ if (!rd->qe_dst) ++ { ++ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__); ++ return AVERROR(ENOMEM); ++ } ++ } ++ ++ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame ++ ++ return 0; ++} ++ ++// Object fd & size will be zapped by this & need setting later ++static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format) ++{ ++ AVDRMLayerDescriptor *layer = &desc->layers[0]; ++ unsigned int width; ++ unsigned int height; ++ unsigned int bpl; ++ uint32_t pixelformat; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { ++ width = format->fmt.pix_mp.width; ++ height = format->fmt.pix_mp.height; ++ pixelformat = format->fmt.pix_mp.pixelformat; ++ bpl = format->fmt.pix_mp.plane_fmt[0].bytesperline; ++ } ++ else { ++ width = format->fmt.pix.width; ++ height = format->fmt.pix.height; ++ pixelformat = format->fmt.pix.pixelformat; ++ bpl = format->fmt.pix.bytesperline; ++ } ++ ++ switch 
(pixelformat) { ++ case V4L2_PIX_FMT_NV12: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#if CONFIG_SAND ++ case V4L2_PIX_FMT_NV12_COL128: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); ++ break; ++ case V4L2_PIX_FMT_NV12_10_COL128: ++ layer->format = DRM_FORMAT_P030; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); ++ break; ++#endif ++#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED ++ case V4L2_PIX_FMT_SUNXI_TILED_NV12: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED; ++ break; ++#endif ++#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15) ++ case V4L2_PIX_FMT_NV15: ++ layer->format = DRM_FORMAT_NV15; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#endif ++ case V4L2_PIX_FMT_NV16: ++ layer->format = DRM_FORMAT_NV16; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20) ++ case V4L2_PIX_FMT_NV20: ++ layer->format = DRM_FORMAT_NV20; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#endif ++ default: ++ return -1; ++ } ++ ++ desc->nb_objects = 1; ++ desc->objects[0].fd = -1; ++ desc->objects[0].size = 0; ++ ++ desc->nb_layers = 1; ++ layer->nb_planes = 2; ++ ++ layer->planes[0].object_index = 0; ++ layer->planes[0].offset = 0; ++ layer->planes[0].pitch = bpl; ++#if CONFIG_SAND ++ if (pixelformat == V4L2_PIX_FMT_NV12_COL128) { ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = height * 128; ++ layer->planes[0].pitch = width; ++ layer->planes[1].pitch = width; ++ } ++ else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = height * 128; ++ layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy ++ 
layer->planes[1].pitch = width * 2; ++ } ++ else ++#endif ++ { ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = layer->planes[0].pitch * height; ++ layer->planes[1].pitch = layer->planes[0].pitch; ++ } ++ ++ return 0; ++} ++ ++static int ++set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, ++ struct req_controls *const controls, ++#if HEVC_CTRLS_VERSION >= 2 ++ struct v4l2_ctrl_hevc_decode_params * const dec, ++#endif ++ struct v4l2_ctrl_hevc_slice_params * const slices, ++ const unsigned int slice_no, ++ const unsigned int slice_count) ++{ ++ int rv; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, ++ .ptr = &controls->sps, ++ .size = sizeof(controls->sps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, ++ .ptr = &controls->pps, ++ .size = sizeof(controls->pps), ++ }, ++#if HEVC_CTRLS_VERSION >= 2 ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS, ++ .ptr = dec, ++ .size = sizeof(*dec), ++ }, ++#endif ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, ++ .ptr = slices + slice_no, ++ .size = sizeof(*slices) * slice_count, ++ }, ++ // Optional ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, ++ .ptr = &controls->scaling_matrix, ++ .size = sizeof(controls->scaling_matrix), ++ }, ++ }; ++ ++ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, ++ controls->has_scaling ? 
++ FF_ARRAY_ELEMS(control) : ++ FF_ARRAY_ELEMS(control) - 1); ++ ++ return rv; ++} ++ ++static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ const HEVCContext * const h = avctx->priv_data; ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0]; ++ int bcount = get_bits_count(&h->HEVClc->gb); ++ uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; ++ ++ int rv; ++ struct slice_info * si; ++ ++ if ((rv = slice_add(rd)) != 0) ++ return rv; ++ ++ si = rd->slices + rd->num_slices - 1; ++ si->ptr = buffer; ++ si->len = size; ++ ++ if (ctx->multi_slice && rd->num_slices > 1) { ++ struct slice_info *const si0 = rd->slices; ++ const size_t offset = (buffer - si0->ptr); ++ boff += offset * 8; ++ size += offset; ++ si0->len = si->len + offset; ++ } ++ ++#if HEVC_CTRLS_VERSION >= 2 ++ if (rd->num_slices == 1) ++ fill_decode_params(h, &rd->dec); ++ fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff); ++#else ++ fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff); ++#endif ++ ++ return 0; ++} ++ ++static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx) ++{ ++ const HEVCContext * const h = avctx->priv_data; ++ if (h->ref != NULL) { ++ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0]; ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ ++ media_request_abort(&rd->req); ++ mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src); ++ ++ decode_q_remove(&ctx->decode_q, &rd->decode_ent); ++ } ++} ++ ++static int send_slice(AVCodecContext * const avctx, ++ V4L2MediaReqDescriptor * const rd, ++ struct req_controls *const controls, ++ const unsigned int i, const unsigned int j) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; 
++ ++ struct slice_info *const si = rd->slices + i; ++ struct media_request * req = NULL; ++ struct qent_src * src = NULL; ++ MediaBufsStatus stat; ++ ++ if ((req = media_request_get(ctx->mpool)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__); ++ return AVERROR(ENOMEM); ++ } ++ ++ if (set_req_ctls(ctx, req, ++ controls, ++#if HEVC_CTRLS_VERSION >= 2 ++ &rd->dec, ++#endif ++ rd->slice_params, ++ i, j - i)) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__); ++ goto fail1; ++ } ++ ++ if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__); ++ goto fail1; ++ } ++ ++ if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__); ++ goto fail2; ++ } ++ ++ if (qent_src_params_set(src, &controls->tv)) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__); ++ goto fail2; ++ } ++ ++#warning ANNEX_B start code ++// if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { ++// } ++ ++ stat = mediabufs_start_request(ctx->mbufs, &req, &src, ++ i == 0 ? rd->qe_dst : NULL, ++ j == rd->num_slices); ++ ++ if (stat != MEDIABUFS_STATUS_SUCCESS) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__); ++ return AVERROR_UNKNOWN; ++ } ++ return 0; ++ ++fail2: ++ mediabufs_src_qent_abort(ctx->mbufs, &src); ++fail1: ++ media_request_abort(&req); ++ return AVERROR_UNKNOWN; ++} ++ ++static int v4l2_request_hevc_end_frame(AVCodecContext *avctx) ++{ ++ const HEVCContext * const h = avctx->priv_data; ++ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0]; ++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; ++ struct req_controls rc; ++ unsigned int i; ++ int rv; ++ ++ // It is possible, though maybe a bug, to get an end_frame without ++ // a previous start_frame. If we do then give up. 
++ if (!decode_q_in_q(&rd->decode_ent)) { ++ av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ { ++ const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ? ++ &h->ps.pps->scaling_list : ++ h->ps.sps->scaling_list_enable_flag ? ++ &h->ps.sps->scaling_list : NULL; ++ ++ ++ memset(&rc, 0, sizeof(rc)); ++ rc.tv = cvt_dpb_to_tv(rd->timestamp); ++ fill_sps(&rc.sps, h->ps.sps); ++ fill_pps(&rc.pps, h->ps.pps); ++ if (sl) { ++ rc.has_scaling = 1; ++ fill_scaling_matrix(sl, &rc.scaling_matrix); ++ } ++ } ++ ++ decode_q_wait(&ctx->decode_q, &rd->decode_ent); ++ ++ // qe_dst needs to be bound to the data buffer and only returned when that is ++ // Alloc almost certainly wants to be serialised if there is any chance of blocking ++ // so we get the next frame to be free in the thread that needs it for decode first. ++ // ++ // In our current world this probably isn't a concern but put it here anyway ++ if (!rd->qe_dst) ++ { ++ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__); ++ rv = AVERROR(ENOMEM); ++ goto fail; ++ } ++ } ++ ++ // Send as slices ++ if (ctx->multi_slice) ++ { ++ if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0) ++ goto fail; ++ } ++ else ++ { ++ for (i = 0; i != rd->num_slices; ++i) { ++ if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0) ++ goto fail; ++ } ++ } ++ ++ // Set the drm_prime desriptor ++ drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs)); ++ rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0)); ++ rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0)); ++ ++ decode_q_remove(&ctx->decode_q, &rd->decode_ent); ++ return 0; ++ ++fail: ++ decode_q_remove(&ctx->decode_q, &rd->decode_ent); ++ return rv; ++} ++ ++// Initial check & init ++static int ++probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) ++{ ++ 
const HEVCContext *h = avctx->priv_data; ++ const HEVCSPS * const sps = h->ps.sps; ++ struct v4l2_ctrl_hevc_sps ctrl_sps; ++ unsigned int i; ++ ++ // Check for var slice array ++ struct v4l2_query_ext_ctrl qc[] = { ++ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS }, ++ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS }, ++ { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS }, ++ { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX }, ++#if HEVC_CTRLS_VERSION >= 2 ++ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS }, ++#endif ++ }; ++ // Order & size must match! ++ static const size_t ctrl_sizes[] = { ++ sizeof(struct v4l2_ctrl_hevc_slice_params), ++ sizeof(struct v4l2_ctrl_hevc_sps), ++ sizeof(struct v4l2_ctrl_hevc_pps), ++ sizeof(struct v4l2_ctrl_hevc_scaling_matrix), ++#if HEVC_CTRLS_VERSION >= 2 ++ sizeof(struct v4l2_ctrl_hevc_decode_params), ++#endif ++ }; ++ const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc); ++ ++ if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) { ++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION); ++ return AVERROR(EINVAL); ++ } ++ for (i = 0; i != noof_ctrls; ++i) { ++ if (ctrl_sizes[i] != (size_t)qc[i].elem_size) { ++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n", ++ HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size); ++ return AVERROR(EINVAL); ++ } ++ } ++ ++ fill_sps(&ctrl_sps, sps); ++ ++ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0; ++ return 0; ++} ++ ++// Final init ++static int ++set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) ++{ ++ int ret; ++ ++ struct v4l2_query_ext_ctrl querys[] = { ++ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, ++ { .id = 
V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, }, ++ }; ++ ++ struct v4l2_ext_control ctrls[] = { ++ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, ++ }; ++ ++ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); ++ ++ ctx->decode_mode = querys[0].default_value; ++ ++ if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && ++ ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); ++ return AVERROR(EINVAL); ++ } ++ ++ ctx->start_code = querys[1].default_value; ++ if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && ++ ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); ++ return AVERROR(EINVAL); ++ } ++ ++ ctx->max_slices = querys[2].elems; ++ if (ctx->max_slices > MAX_SLICES) { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices); ++ return AVERROR(EINVAL); ++ } ++ ++ ctrls[0].value = ctx->decode_mode; ++ ctrls[1].value = ctx->start_code; ++ ++ ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls)); ++ return !ret ? 
0 : AVERROR(-ret); ++} ++ ++static void v4l2_req_frame_free(void *opaque, uint8_t *data) ++{ ++ AVCodecContext *avctx = opaque; ++ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data; ++ ++ av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data); ++ ++ qent_dst_unref(&rd->qe_dst); ++ ++ // We don't expect req or qe_src to be set ++ if (rd->req || rd->qe_src) ++ av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->req, rd->qe_src); ++ ++ av_freep(&rd->slices); ++ av_freep(&rd->slice_params); ++ ++ av_free(rd); ++} ++ ++static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size) ++{ ++ AVCodecContext *avctx = opaque; ++// V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; ++// V4L2MediaReqDescriptor *req; ++ AVBufferRef *ref; ++ uint8_t *data; ++// int ret; ++ ++ data = av_mallocz(size); ++ if (!data) ++ return NULL; ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data); ++ ref = av_buffer_create(data, size, v4l2_req_frame_free, avctx, 0); ++ if (!ref) { ++ av_freep(&data); ++ return NULL; ++ } ++ return ref; ++} ++ ++#if 0 ++static void v4l2_req_pool_free(void *opaque) ++{ ++ av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque); ++} ++ ++static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc) ++{ ++ av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool); ++ ++ av_buffer_pool_uninit(&hwfc->pool); ++} ++#endif ++ ++static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) ++{ ++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; ++ AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data; ++ const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs); ++ ++ hwfc->format = AV_PIX_FMT_DRM_PRIME; ++ hwfc->sw_format = pixel_format_from_format(vfmt); ++ if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) { ++ hwfc->width = vfmt->fmt.pix_mp.width; ++ hwfc->height = 
vfmt->fmt.pix_mp.height; ++ } else { ++ hwfc->width = vfmt->fmt.pix.width; ++ hwfc->height = vfmt->fmt.pix.height; ++ } ++#if 0 ++ hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free); ++ if (!hwfc->pool) ++ return AVERROR(ENOMEM); ++ ++ hwfc->free = v4l2_req_hwframe_ctx_free; ++ ++ hwfc->initial_pool_size = 1; ++ ++ switch (avctx->codec_id) { ++ case AV_CODEC_ID_VP9: ++ hwfc->initial_pool_size += 8; ++ break; ++ case AV_CODEC_ID_VP8: ++ hwfc->initial_pool_size += 3; ++ break; ++ default: ++ hwfc->initial_pool_size += 2; ++ } ++#endif ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size); ++ ++ return 0; ++} ++ ++static int alloc_frame(AVCodecContext * avctx, AVFrame *frame) ++{ ++ int rv; ++ ++ frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor)); ++ if (!frame->buf[0]) ++ return AVERROR(ENOMEM); ++ ++ frame->data[0] = frame->buf[0]->data; ++ ++ frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx); ++ ++ if ((rv = ff_attach_decode_data(frame)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n"); ++ av_frame_unref(frame); ++ return rv; ++ } ++ ++ return 0; ++} ++ ++const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = { ++ .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE, ++ .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION), ++ .probe = probe, ++ .set_controls = set_controls, ++ ++ .start_frame = v4l2_request_hevc_start_frame, ++ .decode_slice = v4l2_request_hevc_decode_slice, ++ .end_frame = v4l2_request_hevc_end_frame, ++ .abort_frame = v4l2_request_hevc_abort_frame, ++ .frame_params = frame_params, ++ .alloc_frame = alloc_frame, ++}; ++ +diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c +new file mode 100644 +index 0000000000..eb00ecb406 +--- 
/dev/null ++++ b/libavcodec/v4l2_req_media.c +@@ -0,0 +1,1596 @@ ++/* ++ * Copyright (C) 2018 Paul Kocialkowski ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sub license, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the ++ * next paragraph) shall be included in all copies or substantial portions ++ * of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. ++ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "v4l2_req_dmabufs.h" ++#include "v4l2_req_media.h" ++#include "v4l2_req_pollqueue.h" ++#include "v4l2_req_utils.h" ++#include "weak_link.h" ++ ++ ++/* floor(log2(x)) */ ++static unsigned int log2_size(size_t x) ++{ ++ unsigned int n = 0; ++ ++ if (x & ~0xffff) { ++ n += 16; ++ x >>= 16; ++ } ++ if (x & ~0xff) { ++ n += 8; ++ x >>= 8; ++ } ++ if (x & ~0xf) { ++ n += 4; ++ x >>= 4; ++ } ++ if (x & ~3) { ++ n += 2; ++ x >>= 2; ++ } ++ return (x & ~1) ? 
n + 1 : n; ++} ++ ++static size_t round_up_size(const size_t x) ++{ ++ /* Admit no size < 256 */ ++ const unsigned int n = x < 256 ? 8 : log2_size(x) - 1; ++ ++ return x >= (3 << n) ? 4 << n : (3 << n); ++} ++ ++struct media_request; ++ ++struct media_pool { ++ int fd; ++ sem_t sem; ++ pthread_mutex_t lock; ++ struct media_request * free_reqs; ++ struct pollqueue * pq; ++}; ++ ++struct media_request { ++ struct media_request * next; ++ struct media_pool * mp; ++ int fd; ++ struct polltask * pt; ++}; ++ ++ ++static inline int do_trywait(sem_t *const sem) ++{ ++ while (sem_trywait(sem)) { ++ if (errno != EINTR) ++ return -errno; ++ } ++ return 0; ++} ++ ++static inline int do_wait(sem_t *const sem) ++{ ++ while (sem_wait(sem)) { ++ if (errno != EINTR) ++ return -errno; ++ } ++ return 0; ++} ++ ++static int request_buffers(int video_fd, unsigned int type, ++ enum v4l2_memory memory, unsigned int buffers_count) ++{ ++ struct v4l2_requestbuffers buffers; ++ int rc; ++ ++ memset(&buffers, 0, sizeof(buffers)); ++ buffers.type = type; ++ buffers.memory = memory; ++ buffers.count = buffers_count; ++ ++ rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers); ++ if (rc < 0) { ++ rc = -errno; ++ request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(-rc)); ++ return rc; ++ } ++ ++ return 0; ++} ++ ++ ++static int set_stream(int video_fd, unsigned int type, bool enable) ++{ ++ enum v4l2_buf_type buf_type = type; ++ int rc; ++ ++ rc = ioctl(video_fd, enable ? VIDIOC_STREAMON : VIDIOC_STREAMOFF, ++ &buf_type); ++ if (rc < 0) { ++ rc = -errno; ++ request_log("Unable to %sable stream: %s\n", ++ enable ? 
"en" : "dis", strerror(-rc)); ++ return rc; ++ } ++ ++ return 0; ++} ++ ++ ++ ++struct media_request * media_request_get(struct media_pool * const mp) ++{ ++ struct media_request *req = NULL; ++ ++ /* Timeout handled by poll code */ ++ if (do_wait(&mp->sem)) ++ return NULL; ++ ++ pthread_mutex_lock(&mp->lock); ++ req = mp->free_reqs; ++ if (req) { ++ mp->free_reqs = req->next; ++ req->next = NULL; ++ } ++ pthread_mutex_unlock(&mp->lock); ++ return req; ++} ++ ++int media_request_fd(const struct media_request * const req) ++{ ++ return req->fd; ++} ++ ++int media_request_start(struct media_request * const req) ++{ ++ while (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) == -1) ++ { ++ const int err = errno; ++ if (err == EINTR) ++ continue; ++ request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err)); ++ return -err; ++ } ++ ++ pollqueue_add_task(req->pt, 2000); ++ return 0; ++} ++ ++static void media_request_done(void *v, short revents) ++{ ++ struct media_request *const req = v; ++ struct media_pool *const mp = req->mp; ++ ++ /* ** Not sure what to do about timeout */ ++ ++ if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0) ++ request_log("Unable to reinit media request: %s\n", ++ strerror(errno)); ++ ++ pthread_mutex_lock(&mp->lock); ++ req->next = mp->free_reqs; ++ mp->free_reqs = req; ++ pthread_mutex_unlock(&mp->lock); ++ sem_post(&mp->sem); ++} ++ ++int media_request_abort(struct media_request ** const preq) ++{ ++ struct media_request * const req = *preq; ++ ++ if (req == NULL) ++ return 0; ++ *preq = NULL; ++ ++ media_request_done(req, 0); ++ return 0; ++} ++ ++static void delete_req_chain(struct media_request * const chain) ++{ ++ struct media_request * next = chain; ++ while (next) { ++ struct media_request * const req = next; ++ next = req->next; ++ if (req->pt) ++ polltask_delete(&req->pt); ++ if (req->fd != -1) ++ close(req->fd); ++ free(req); ++ } ++} ++ ++struct media_pool * media_pool_new(const char * const media_path, ++ 
struct pollqueue * const pq, ++ const unsigned int n) ++{ ++ struct media_pool * const mp = calloc(1, sizeof(*mp)); ++ unsigned int i; ++ ++ if (!mp) ++ goto fail0; ++ ++ mp->pq = pq; ++ pthread_mutex_init(&mp->lock, NULL); ++ mp->fd = open(media_path, O_RDWR | O_NONBLOCK); ++ if (mp->fd == -1) { ++ request_log("Failed to open '%s': %s\n", media_path, strerror(errno)); ++ goto fail1; ++ } ++ ++ for (i = 0; i != n; ++i) { ++ struct media_request * req = malloc(sizeof(*req)); ++ if (!req) ++ goto fail4; ++ ++ *req = (struct media_request){ ++ .next = mp->free_reqs, ++ .mp = mp, ++ .fd = -1 ++ }; ++ mp->free_reqs = req; ++ ++ if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) { ++ request_log("Failed to alloc request %d: %s\n", i, strerror(errno)); ++ goto fail4; ++ } ++ ++ req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req); ++ if (!req->pt) ++ goto fail4; ++ } ++ ++ sem_init(&mp->sem, 0, n); ++ ++ return mp; ++ ++fail4: ++ delete_req_chain(mp->free_reqs); ++ close(mp->fd); ++ pthread_mutex_destroy(&mp->lock); ++fail1: ++ free(mp); ++fail0: ++ return NULL; ++} ++ ++void media_pool_delete(struct media_pool ** pMp) ++{ ++ struct media_pool * const mp = *pMp; ++ ++ if (!mp) ++ return; ++ *pMp = NULL; ++ ++ delete_req_chain(mp->free_reqs); ++ close(mp->fd); ++ sem_destroy(&mp->sem); ++ pthread_mutex_destroy(&mp->lock); ++ free(mp); ++} ++ ++ ++#define INDEX_UNSET (~(uint32_t)0) ++ ++enum qent_status { ++ QENT_NEW = 0, // Initial state - shouldn't last ++ QENT_FREE, // On free chain ++ QENT_PENDING, // User has ent ++ QENT_WAITING, // On inuse ++ QENT_DONE, // Frame rx ++ QENT_ERROR, // Error ++ QENT_IMPORT ++}; ++ ++struct qent_base { ++ atomic_int ref_count; ++ struct qent_base *next; ++ struct qent_base *prev; ++ enum qent_status status; ++ uint32_t index; ++ struct dmabuf_h *dh[VIDEO_MAX_PLANES]; ++ struct timeval timestamp; ++}; ++ ++struct qent_src { ++ struct qent_base base; ++ int fixed_size; ++}; ++ ++struct qent_dst { ++ struct 
qent_base base; ++ bool waiting; ++ pthread_mutex_t lock; ++ pthread_cond_t cond; ++ struct ff_weak_link_client * mbc_wl; ++}; ++ ++struct qe_list_head { ++ struct qent_base *head; ++ struct qent_base *tail; ++}; ++ ++struct buf_pool { ++ pthread_mutex_t lock; ++ sem_t free_sem; ++ enum v4l2_buf_type buf_type; ++ struct qe_list_head free; ++ struct qe_list_head inuse; ++}; ++ ++ ++static inline struct qent_dst *base_to_dst(struct qent_base *be) ++{ ++ return (struct qent_dst *)be; ++} ++ ++static inline struct qent_src *base_to_src(struct qent_base *be) ++{ ++ return (struct qent_src *)be; ++} ++ ++ ++#define QENT_BASE_INITIALIZER {\ ++ .ref_count = ATOMIC_VAR_INIT(0),\ ++ .status = QENT_NEW,\ ++ .index = INDEX_UNSET\ ++} ++ ++static void qe_base_uninit(struct qent_base *const be) ++{ ++ unsigned int i; ++ for (i = 0; i != VIDEO_MAX_PLANES; ++i) { ++ dmabuf_free(be->dh[i]); ++ be->dh[i] = NULL; ++ } ++} ++ ++static void qe_src_free(struct qent_src *const be_src) ++{ ++ if (!be_src) ++ return; ++ qe_base_uninit(&be_src->base); ++ free(be_src); ++} ++ ++static struct qent_src * qe_src_new(void) ++{ ++ struct qent_src *const be_src = malloc(sizeof(*be_src)); ++ if (!be_src) ++ return NULL; ++ *be_src = (struct qent_src){ ++ .base = QENT_BASE_INITIALIZER ++ }; ++ return be_src; ++} ++ ++static void qe_dst_free(struct qent_dst *const be_dst) ++{ ++ if (!be_dst) ++ return; ++ ++ ff_weak_link_unref(&be_dst->mbc_wl); ++ pthread_cond_destroy(&be_dst->cond); ++ pthread_mutex_destroy(&be_dst->lock); ++ qe_base_uninit(&be_dst->base); ++ free(be_dst); ++} ++ ++static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl) ++{ ++ struct qent_dst *const be_dst = malloc(sizeof(*be_dst)); ++ if (!be_dst) ++ return NULL; ++ *be_dst = (struct qent_dst){ ++ .base = QENT_BASE_INITIALIZER, ++ .lock = PTHREAD_MUTEX_INITIALIZER, ++ .cond = PTHREAD_COND_INITIALIZER, ++ .mbc_wl = ff_weak_link_ref(wl) ++ }; ++ return be_dst; ++} ++ ++static void ql_add_tail(struct qe_list_head * 
const ql, struct qent_base * be) ++{ ++ if (ql->tail) ++ ql->tail->next = be; ++ else ++ ql->head = be; ++ be->prev = ql->tail; ++ be->next = NULL; ++ ql->tail = be; ++} ++ ++static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be) ++{ ++ if (!be) ++ return NULL; ++ ++ if (be->next) ++ be->next->prev = be->prev; ++ else ++ ql->tail = be->prev; ++ if (be->prev) ++ be->prev->next = be->next; ++ else ++ ql->head = be->next; ++ be->next = NULL; ++ be->prev = NULL; ++ return be; ++} ++ ++ ++static void bq_put_free(struct buf_pool *const bp, struct qent_base * be) ++{ ++ ql_add_tail(&bp->free, be); ++} ++ ++static struct qent_base * bq_get_free(struct buf_pool *const bp) ++{ ++ return ql_extract(&bp->free, bp->free.head); ++} ++ ++static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be) ++{ ++ return ql_extract(&bp->inuse, be); ++} ++ ++static struct qent_base * bq_get_inuse(struct buf_pool *const bp) ++{ ++ return ql_extract(&bp->inuse, bp->inuse.head); ++} ++ ++static void bq_free_all_free_src(struct buf_pool *const bp) ++{ ++ struct qent_base *be; ++ while ((be = bq_get_free(bp)) != NULL) ++ qe_src_free(base_to_src(be)); ++} ++ ++static void bq_free_all_inuse_src(struct buf_pool *const bp) ++{ ++ struct qent_base *be; ++ while ((be = bq_get_inuse(bp)) != NULL) ++ qe_src_free(base_to_src(be)); ++} ++ ++static void bq_free_all_free_dst(struct buf_pool *const bp) ++{ ++ struct qent_base *be; ++ while ((be = bq_get_free(bp)) != NULL) ++ qe_dst_free(base_to_dst(be)); ++} ++ ++static void queue_put_free(struct buf_pool *const bp, struct qent_base *be) ++{ ++ unsigned int i; ++ ++ pthread_mutex_lock(&bp->lock); ++ /* Clear out state vars */ ++ be->timestamp.tv_sec = 0; ++ be->timestamp.tv_usec = 0; ++ be->status = QENT_FREE; ++ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) ++ dmabuf_len_set(be->dh[i], 0); ++ bq_put_free(bp, be); ++ pthread_mutex_unlock(&bp->lock); ++ sem_post(&bp->free_sem); 
++} ++ ++static bool queue_is_inuse(const struct buf_pool *const bp) ++{ ++ return bp->inuse.tail != NULL; ++} ++ ++static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be) ++{ ++ if (!be) ++ return; ++ pthread_mutex_lock(&bp->lock); ++ ql_add_tail(&bp->inuse, be); ++ be->status = QENT_WAITING; ++ pthread_mutex_unlock(&bp->lock); ++} ++ ++static struct qent_base *queue_get_free(struct buf_pool *const bp) ++{ ++ struct qent_base *buf; ++ ++ if (do_wait(&bp->free_sem)) ++ return NULL; ++ pthread_mutex_lock(&bp->lock); ++ buf = bq_get_free(bp); ++ pthread_mutex_unlock(&bp->lock); ++ return buf; ++} ++ ++static struct qent_base *queue_tryget_free(struct buf_pool *const bp) ++{ ++ struct qent_base *buf; ++ ++ if (do_trywait(&bp->free_sem)) ++ return NULL; ++ pthread_mutex_lock(&bp->lock); ++ buf = bq_get_free(bp); ++ pthread_mutex_unlock(&bp->lock); ++ return buf; ++} ++ ++static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd) ++{ ++ struct qent_base *be; ++ ++ pthread_mutex_lock(&bp->lock); ++ /* Expect 1st in Q, but allow anywhere */ ++ for (be = bp->inuse.head; be; be = be->next) { ++ if (dmabuf_fd(be->dh[0]) == fd) { ++ bq_extract_inuse(bp, be); ++ break; ++ } ++ } ++ pthread_mutex_unlock(&bp->lock); ++ ++ return be; ++} ++ ++static void queue_delete(struct buf_pool *const bp) ++{ ++ sem_destroy(&bp->free_sem); ++ pthread_mutex_destroy(&bp->lock); ++ free(bp); ++} ++ ++static struct buf_pool* queue_new(const int vfd) ++{ ++ struct buf_pool *bp = calloc(1, sizeof(*bp)); ++ if (!bp) ++ return NULL; ++ pthread_mutex_init(&bp->lock, NULL); ++ sem_init(&bp->free_sem, 0, 0); ++ return bp; ++} ++ ++ ++struct mediabufs_ctl { ++ atomic_int ref_count; /* 0 is single ref for easier atomics */ ++ void * dc; ++ int vfd; ++ bool stream_on; ++ bool polling; ++ bool dst_fixed; // Dst Q is fixed size ++ pthread_mutex_t lock; ++ struct buf_pool * src; ++ struct buf_pool * dst; ++ struct polltask * pt; ++ struct pollqueue * pq; ++ 
struct ff_weak_link_master * this_wlm; ++ ++ struct v4l2_format src_fmt; ++ struct v4l2_format dst_fmt; ++}; ++ ++static int qe_v4l2_queue(struct qent_base *const be, ++ const int vfd, struct media_request *const mreq, ++ const struct v4l2_format *const fmt, ++ const bool is_dst, const bool hold_flag) ++{ ++ struct v4l2_buffer buffer = { ++ .type = fmt->type, ++ .memory = V4L2_MEMORY_DMABUF, ++ .index = be->index ++ }; ++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ unsigned int i; ++ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) { ++ if (is_dst) ++ dmabuf_len_set(be->dh[i], 0); ++ ++ /* *** Really need a pixdesc rather than a format so we can fill in data_offset */ ++ planes[i].length = dmabuf_size(be->dh[i]); ++ planes[i].bytesused = dmabuf_len(be->dh[i]); ++ planes[i].m.fd = dmabuf_fd(be->dh[i]); ++ } ++ buffer.m.planes = planes; ++ buffer.length = i; ++ } ++ else { ++ if (is_dst) ++ dmabuf_len_set(be->dh[0], 0); ++ ++ buffer.bytesused = dmabuf_len(be->dh[0]); ++ buffer.length = dmabuf_size(be->dh[0]); ++ buffer.m.fd = dmabuf_fd(be->dh[0]); ++ } ++ ++ if (!is_dst && mreq) { ++ buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD; ++ buffer.request_fd = media_request_fd(mreq); ++ if (hold_flag) ++ buffer.flags |= V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF; ++ } ++ ++ if (is_dst) ++ be->timestamp = (struct timeval){0,0}; ++ ++ buffer.timestamp = be->timestamp; ++ ++ while (ioctl(vfd, VIDIOC_QBUF, &buffer)) { ++ const int err = errno; ++ if (err != EINTR) { ++ request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err)); ++ return -err; ++ } ++ } ++ return 0; ++} ++ ++static struct qent_base * qe_dequeue(struct buf_pool *const bp, ++ const int vfd, ++ const struct v4l2_format * const f) ++{ ++ int fd; ++ struct qent_base *be; ++ int rc; ++ const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type); ++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; ++ struct v4l2_buffer buffer = { ++ .type = f->type, ++ 
.memory = V4L2_MEMORY_DMABUF ++ }; ++ if (mp) { ++ buffer.length = f->fmt.pix_mp.num_planes; ++ buffer.m.planes = planes; ++ } ++ ++ while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 && ++ errno == EINTR) ++ /* Loop */; ++ if (rc) { ++ request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno)); ++ return NULL; ++ } ++ ++ fd = mp ? planes[0].m.fd : buffer.m.fd; ++ be = queue_find_extract_fd(bp, fd); ++ if (!be) { ++ request_log("Failed to find fd %d in Q\n", fd); ++ return NULL; ++ } ++ ++ be->timestamp = buffer.timestamp; ++ be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE; ++ return be; ++} ++ ++static void qe_dst_done(struct qent_dst * dst_be) ++{ ++ pthread_mutex_lock(&dst_be->lock); ++ dst_be->waiting = false; ++ pthread_cond_broadcast(&dst_be->cond); ++ pthread_mutex_unlock(&dst_be->lock); ++ ++ qent_dst_unref(&dst_be); ++} ++ ++static bool qe_dst_waiting(struct qent_dst *const dst_be) ++{ ++ bool waiting; ++ pthread_mutex_lock(&dst_be->lock); ++ waiting = dst_be->waiting; ++ dst_be->waiting = true; ++ pthread_mutex_unlock(&dst_be->lock); ++ return waiting; ++} ++ ++ ++static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc) ++{ ++ return queue_is_inuse(mbc->src) || queue_is_inuse(mbc->dst); ++} ++ ++static void mediabufs_poll_cb(void * v, short revents) ++{ ++ struct mediabufs_ctl *mbc = v; ++ struct qent_src *src_be = NULL; ++ struct qent_dst *dst_be = NULL; ++ ++ if (!revents) ++ request_err(mbc->dc, "%s: Timeout\n", __func__); ++ ++ pthread_mutex_lock(&mbc->lock); ++ mbc->polling = false; ++ ++ if ((revents & POLLOUT) != 0) ++ src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt)); ++ if ((revents & POLLIN) != 0) ++ dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt)); ++ ++ /* Reschedule */ ++ if (mediabufs_wants_poll(mbc)) { ++ mbc->polling = true; ++ pollqueue_add_task(mbc->pt, 2000); ++ } ++ pthread_mutex_unlock(&mbc->lock); ++ ++ if (src_be) ++ 
queue_put_free(mbc->src, &src_be->base); ++ if (dst_be) ++ qe_dst_done(dst_be); ++} ++ ++int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp) ++{ ++ struct qent_base *const be = &be_src->base; ++ ++ be->timestamp = *timestamp; ++ return 0; ++} ++ ++struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst) ++{ ++ return be_dst->base.timestamp; ++} ++ ++static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc) ++{ ++ if (!be->dh[0] || len > dmabuf_size(be->dh[0])) { ++ size_t newsize = round_up_size(len); ++ request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, dmabuf_size(be->dh[0]), newsize); ++ if (!dbsc) { ++ request_log("%s: No dmbabuf_ctrl for realloc\n", __func__); ++ return -ENOMEM; ++ } ++ if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) { ++ request_log("%s: Realloc %zd failed\n", __func__, newsize); ++ return -ENOMEM; ++ } ++ } ++ return 0; ++} ++ ++int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc) ++{ ++ struct qent_base *const be = &be_src->base; ++ return qent_base_realloc(be, len, dbsc); ++} ++ ++ ++int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc) ++{ ++ void * dst; ++ struct qent_base *const be = &be_src->base; ++ int rv; ++ ++ // Realloc doesn't copy so don't alloc if offset != 0 ++ if ((rv = qent_base_realloc(be, offset + len, ++ be_src->fixed_size || offset ? 
NULL : dbsc)) != 0) ++ return rv; ++ ++ dmabuf_write_start(be->dh[0]); ++ dst = dmabuf_map(be->dh[0]); ++ if (!dst) ++ return -1; ++ memcpy((char*)dst + offset, src, len); ++ dmabuf_len_set(be->dh[0], len); ++ dmabuf_write_end(be->dh[0]); ++ return 0; ++} ++ ++const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane) ++{ ++ const struct qent_base *const be = &be_dst->base; ++ ++ return (plane >= sizeof(be->dh)/sizeof(be->dh[0])) ? NULL : be->dh[plane]; ++} ++ ++int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane) ++{ ++ return dup(dmabuf_fd(qent_dst_dmabuf(be_dst, plane))); ++} ++ ++MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc, ++ struct media_request **const pmreq, ++ struct qent_src **const psrc_be, ++ struct qent_dst *const dst_be, ++ const bool is_final) ++{ ++ struct media_request * mreq = *pmreq; ++ struct qent_src *const src_be = *psrc_be; ++ ++ // Req & src are always both "consumed" ++ *pmreq = NULL; ++ *psrc_be = NULL; ++ ++ pthread_mutex_lock(&mbc->lock); ++ ++ if (!src_be) ++ goto fail1; ++ ++ if (dst_be) { ++ if (qe_dst_waiting(dst_be)) { ++ request_info(mbc->dc, "Request buffer already waiting on start\n"); ++ goto fail1; ++ } ++ dst_be->base.timestamp = (struct timeval){0,0}; ++ if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false)) ++ goto fail1; ++ ++ qent_dst_ref(dst_be); ++ queue_put_inuse(mbc->dst, &dst_be->base); ++ } ++ ++ if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final)) ++ goto fail1; ++ queue_put_inuse(mbc->src, &src_be->base); ++ ++ if (!mbc->polling && mediabufs_wants_poll(mbc)) { ++ mbc->polling = true; ++ pollqueue_add_task(mbc->pt, 2000); ++ } ++ pthread_mutex_unlock(&mbc->lock); ++ ++ if (media_request_start(mreq)) ++ return MEDIABUFS_ERROR_OPERATION_FAILED; ++ ++ return MEDIABUFS_STATUS_SUCCESS; ++ ++fail1: ++ media_request_abort(&mreq); ++ if (src_be) ++ queue_put_free(mbc->src, 
&src_be->base); ++ ++// *** TODO: If src Q fails this doesnt unwind properly - separate dst Q from src Q ++ if (dst_be) { ++ dst_be->base.status = QENT_ERROR; ++ qe_dst_done(dst_be); ++ } ++ pthread_mutex_unlock(&mbc->lock); ++ return MEDIABUFS_ERROR_OPERATION_FAILED; ++} ++ ++ ++static int qe_alloc_from_fmt(struct qent_base *const be, ++ struct dmabufs_ctl *const dbsc, ++ const struct v4l2_format *const fmt) ++{ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ unsigned int i; ++ for (i = 0; i != fmt->fmt.pix_mp.num_planes; ++i) { ++ be->dh[i] = dmabuf_realloc(dbsc, be->dh[i], ++ fmt->fmt.pix_mp.plane_fmt[i].sizeimage); ++ /* On failure tidy up and die */ ++ if (!be->dh[i]) { ++ while (i--) { ++ dmabuf_free(be->dh[i]); ++ be->dh[i] = NULL; ++ } ++ return -1; ++ } ++ } ++ } ++ else { ++// be->dh[0] = dmabuf_alloc(dbsc, fmt->fmt.pix.sizeimage); ++ size_t size = fmt->fmt.pix.sizeimage; ++ be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size); ++ if (!be->dh[0]) ++ return -1; ++ } ++ return 0; ++} ++ ++static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd, ++ const enum v4l2_buf_type buftype, ++ uint32_t pixfmt, ++ const unsigned int width, const unsigned int height, ++ const size_t bufsize) ++{ ++ *fmt = (struct v4l2_format){.type = buftype}; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) { ++ fmt->fmt.pix_mp.width = width; ++ fmt->fmt.pix_mp.height = height; ++ fmt->fmt.pix_mp.pixelformat = pixfmt; ++ if (bufsize) { ++ fmt->fmt.pix_mp.num_planes = 1; ++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize; ++ } ++ } ++ else { ++ fmt->fmt.pix.width = width; ++ fmt->fmt.pix.height = height; ++ fmt->fmt.pix.pixelformat = pixfmt; ++ fmt->fmt.pix.sizeimage = bufsize; ++ } ++ ++ while (ioctl(fd, VIDIOC_S_FMT, fmt)) ++ if (errno != EINTR) ++ return MEDIABUFS_ERROR_OPERATION_FAILED; ++ ++ // Treat anything where we don't get at least what we asked for as a fail ++ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) { ++ if (fmt->fmt.pix_mp.width < width || ++ 
fmt->fmt.pix_mp.height < height || ++ fmt->fmt.pix_mp.pixelformat != pixfmt) { ++ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; ++ } ++ } ++ else { ++ if (fmt->fmt.pix.width < width || ++ fmt->fmt.pix.height < height || ++ fmt->fmt.pix.pixelformat != pixfmt) { ++ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; ++ } ++ } ++ ++ return MEDIABUFS_STATUS_SUCCESS; ++} ++ ++static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt, ++ const int fd, ++ const unsigned int type_v4l2, ++ const uint32_t flags_must, ++ const uint32_t flags_not, ++ const unsigned int width, ++ const unsigned int height, ++ mediabufs_dst_fmt_accept_fn *const accept_fn, ++ void *const accept_v) ++{ ++ unsigned int i; ++ ++ for (i = 0;; ++i) { ++ struct v4l2_fmtdesc fmtdesc = { ++ .index = i, ++ .type = type_v4l2 ++ }; ++ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) { ++ if (errno != EINTR) ++ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; ++ } ++ if ((fmtdesc.flags & flags_must) != flags_must || ++ (fmtdesc.flags & flags_not)) ++ continue; ++ if (!accept_fn(accept_v, &fmtdesc)) ++ continue; ++ ++ if (fmt_set(fmt, fd, fmtdesc.type, fmtdesc.pixelformat, ++ width, height, 0) == MEDIABUFS_STATUS_SUCCESS) ++ return MEDIABUFS_STATUS_SUCCESS; ++ } ++ return 0; ++} ++ ++ ++/* Wait for qent done */ ++ ++MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst) ++{ ++ struct qent_base *const be = &be_dst->base; ++ enum qent_status estat; ++ ++ pthread_mutex_lock(&be_dst->lock); ++ while (be_dst->waiting && ++ !pthread_cond_wait(&be_dst->cond, &be_dst->lock)) ++ /* Loop */; ++ estat = be->status; ++ pthread_mutex_unlock(&be_dst->lock); ++ ++ return estat == QENT_DONE ? MEDIABUFS_STATUS_SUCCESS : ++ estat == QENT_ERROR ? 
MEDIABUFS_ERROR_DECODING_ERROR : ++ MEDIABUFS_ERROR_OPERATION_FAILED; ++} ++ ++const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no) ++{ ++ struct qent_base *const be = &be_dst->base; ++ return dmabuf_map(be->dh[buf_no]); ++} ++ ++MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst) ++{ ++ struct qent_base *const be = &be_dst->base; ++ unsigned int i; ++ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) { ++ if (dmabuf_read_start(be->dh[i])) { ++ while (i--) ++ dmabuf_read_end(be->dh[i]); ++ return MEDIABUFS_ERROR_ALLOCATION_FAILED; ++ } ++ } ++ return MEDIABUFS_STATUS_SUCCESS; ++} ++ ++MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst) ++{ ++ struct qent_base *const be = &be_dst->base; ++ unsigned int i; ++ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS; ++ ++ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) { ++ if (dmabuf_read_end(be->dh[i])) ++ status = MEDIABUFS_ERROR_OPERATION_FAILED; ++ } ++ return status; ++} ++ ++struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst) ++{ ++ if (be_dst) ++ atomic_fetch_add(&be_dst->base.ref_count, 1); ++ return be_dst; ++} ++ ++void qent_dst_unref(struct qent_dst ** const pbe_dst) ++{ ++ struct qent_dst * const be_dst = *pbe_dst; ++ struct mediabufs_ctl * mbc; ++ if (!be_dst) ++ return; ++ *pbe_dst = NULL; ++ ++ if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0) ++ return; ++ ++ if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) { ++ queue_put_free(mbc->dst, &be_dst->base); ++ ff_weak_link_unlock(be_dst->mbc_wl); ++ } ++ else { ++ qe_dst_free(be_dst); ++ } ++} ++ ++MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst, ++ unsigned int plane, ++ int fd, size_t size) ++{ ++ struct qent_base *const be = &be_dst->base; ++ struct dmabuf_h * dh; ++ ++ if (be->status != QENT_IMPORT || be->dh[plane]) ++ return MEDIABUFS_ERROR_OPERATION_FAILED; ++ ++ dh = dmabuf_import(fd, size); ++ if (!dh) ++ return MEDIABUFS_ERROR_ALLOCATION_FAILED; ++ 
++ ++ be->dh[plane] = dh; ++ return MEDIABUFS_STATUS_SUCCESS; ++} ++ ++// Returns number of buffers created (may be fewer than n), or a negative errno value on error ++static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[]) ++{ ++ unsigned int i; ++ ++ struct v4l2_create_buffers cbuf = { ++ .count = n, ++ .memory = V4L2_MEMORY_DMABUF, ++ .format = mbc->dst_fmt, ++ }; ++ ++ while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) { ++ const int err = errno; /* keep errno positive so the EINTR retry test can match */ ++ if (err != EINTR) { ++ request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__); ++ return -err; /* negative: callers test for < 0 or != 1 */ ++ } ++ } ++ ++ if (cbuf.count != n) ++ request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n); ++ ++ for (i = 0; i != cbuf.count; ++i) ++ qes[i]->base.index = cbuf.index + i; ++ ++ return cbuf.count; ++} ++ ++struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc) ++{ ++ struct qent_dst * be_dst; ++ ++ if (mbc == NULL) { ++ be_dst = qe_dst_new(NULL); ++ if (be_dst) ++ be_dst->base.status = QENT_IMPORT; ++ return be_dst; ++ } ++ ++ if (mbc->dst_fixed) { ++ be_dst = base_to_dst(queue_get_free(mbc->dst)); ++ if (!be_dst) ++ return NULL; ++ } ++ else { ++ be_dst = base_to_dst(queue_tryget_free(mbc->dst)); ++ if (!be_dst) { ++ be_dst = qe_dst_new(mbc->this_wlm); ++ if (!be_dst) ++ return NULL; ++ ++ if (create_dst_bufs(mbc, 1, &be_dst) != 1) { ++ qe_dst_free(be_dst); ++ return NULL; ++ } ++ } ++ } ++ ++ if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) { ++ /* Given how create buf works we can't uncreate it on alloc failure ++ * all we can do is put it on the free Q ++ */ ++ queue_put_free(mbc->dst, &be_dst->base); ++ return NULL; ++ } ++ ++ be_dst->base.status = QENT_PENDING; ++ atomic_store(&be_dst->base.ref_count, 0); ++ return be_dst; ++} ++ ++const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc) ++{ ++ return &mbc->dst_fmt; ++} ++ ++MediaBufsStatus mediabufs_dst_fmt_set(struct 
mediabufs_ctl *const mbc, ++ const unsigned int width, ++ const unsigned int height, ++ mediabufs_dst_fmt_accept_fn *const accept_fn, ++ void *const accept_v) ++{ ++ MediaBufsStatus status; ++ unsigned int i; ++ const enum v4l2_buf_type buf_type = mbc->dst_fmt.type; ++ static const struct { ++ unsigned int flags_must; ++ unsigned int flags_not; ++ } trys[] = { ++ {0, V4L2_FMT_FLAG_EMULATED}, ++ {V4L2_FMT_FLAG_EMULATED, 0}, ++ }; ++ for (i = 0; i != sizeof(trys)/sizeof(trys[0]); ++i) { ++ status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd, ++ buf_type, ++ trys[i].flags_must, ++ trys[i].flags_not, ++ width, height, accept_fn, accept_v); ++ if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE) ++ return status; ++ } ++ ++ if (status != MEDIABUFS_STATUS_SUCCESS) ++ return status; ++ ++ /* Try to create a buffer - don't alloc */ ++ return status; ++} ++ ++// ** This is a mess if we get partial alloc but without any way to remove ++// individual V4L2 Q members we are somewhat stuffed ++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed) ++{ ++ unsigned int i; ++ int a = 0; ++ unsigned int qc; ++ struct qent_dst * qes[32]; ++ ++ if (n > 32) ++ return MEDIABUFS_ERROR_ALLOCATION_FAILED; ++ ++ // Create qents first as it is hard to get rid of the V4L2 buffers on error ++ for (qc = 0; qc != n; ++qc) ++ { ++ if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL) ++ goto fail; ++ } ++ ++ if ((a = create_dst_bufs(mbc, n, qes)) < 0) ++ goto fail; ++ ++ for (i = 0; i != a; ++i) ++ queue_put_free(mbc->dst, &qes[i]->base); ++ ++ if (a != n) ++ goto fail; ++ ++ mbc->dst_fixed = fixed; ++ return MEDIABUFS_STATUS_SUCCESS; ++ ++fail: ++ for (i = (a < 0 ? 
0 : a); i != qc; ++i) ++ qe_dst_free(qes[i]); ++ ++ return MEDIABUFS_ERROR_ALLOCATION_FAILED; ++} ++ ++struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc) ++{ ++ struct qent_base * buf = queue_get_free(mbc->src); ++ buf->status = QENT_PENDING; ++ return base_to_src(buf); ++} ++ ++void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src) ++{ ++ struct qent_src *const qe_src = *pqe_src; ++ if (!qe_src) ++ return; ++ *pqe_src = NULL; ++ queue_put_free(mbc->src, &qe_src->base); ++} ++ ++/* src format must have been set up before this */ ++MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc, ++ struct dmabufs_ctl * const dbsc, ++ unsigned int n) ++{ ++ unsigned int i; ++ struct v4l2_requestbuffers req = { ++ .count = n, ++ .type = mbc->src_fmt.type, ++ .memory = V4L2_MEMORY_DMABUF ++ }; ++ ++ bq_free_all_free_src(mbc->src); ++ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) { ++ if (errno != EINTR) { ++ request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__); ++ return MEDIABUFS_ERROR_OPERATION_FAILED; ++ } ++ } ++ ++ if (n > req.count) { ++ request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n); ++ n = req.count; ++ } ++ ++ for (i = 0; i != n; ++i) { ++ struct qent_src *const be_src = qe_src_new(); ++ if (!be_src) { ++ request_err(mbc->dc, "Failed to create src be %d\n", i); ++ goto fail; ++ } ++ if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) { ++ qe_src_free(be_src); ++ goto fail; ++ } ++ be_src->base.index = i; ++ be_src->fixed_size = !mediabufs_src_resizable(mbc); ++ ++ queue_put_free(mbc->src, &be_src->base); ++ } ++ ++ return MEDIABUFS_STATUS_SUCCESS; ++ ++fail: ++ bq_free_all_free_src(mbc->src); ++ req.count = 0; ++ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 && ++ errno == EINTR) ++ /* Loop */; ++ ++ return MEDIABUFS_ERROR_OPERATION_FAILED; ++} ++ ++ ++ ++/* ++ * Set stuff order: ++ * Set src fmt ++ * Set parameters 
(sps) on vfd ++ * Negotiate dst format (dst_fmt_set) ++ * Create src buffers ++ * Alloc a dst buffer or Create dst slots ++*/ ++MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc) ++{ ++ if (mbc->stream_on) ++ return MEDIABUFS_STATUS_SUCCESS; ++ ++ if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) { ++ request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type); ++ return MEDIABUFS_ERROR_OPERATION_FAILED; ++ } ++ ++ if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) { ++ request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type); ++ set_stream(mbc->vfd, mbc->src_fmt.type, false); ++ return MEDIABUFS_ERROR_OPERATION_FAILED; ++ } ++ ++ mbc->stream_on = true; ++ return MEDIABUFS_STATUS_SUCCESS; ++} ++ ++MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc) ++{ ++ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS; ++ ++ if (!mbc->stream_on) ++ return MEDIABUFS_STATUS_SUCCESS; ++ ++ if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) { ++ request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type); ++ status = MEDIABUFS_ERROR_OPERATION_FAILED; ++ } ++ ++ if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) { ++ request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type); ++ status = MEDIABUFS_ERROR_OPERATION_FAILED; ++ } ++ ++ mbc->stream_on = false; ++ return status; ++} ++ ++int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n) ++{ ++ struct v4l2_ext_controls controls = { ++ .controls = control_array, ++ .count = n ++ }; ++ ++ if (mreq) { ++ controls.which = V4L2_CTRL_WHICH_REQUEST_VAL; ++ controls.request_fd = media_request_fd(mreq); ++ } ++ ++ while (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &controls)) ++ { ++ const int err = errno; ++ if (err != EINTR) { ++ request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err)); ++ return -err; ++ } ++ } ++ ++ return 0; ++} ++ 
++MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc, ++ struct media_request * const mreq, ++ unsigned int id, void *data, ++ unsigned int size) ++{ ++ struct v4l2_ext_control control = { ++ .id = id, ++ .ptr = data, ++ .size = size ++ }; ++ ++ int rv = mediabufs_ctl_set_ext_ctrls(mbc, mreq, &control, 1); ++ return !rv ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_OPERATION_FAILED; ++} ++ ++MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, ++ enum v4l2_buf_type buf_type, ++ const uint32_t pixfmt, ++ const uint32_t width, const uint32_t height, ++ const size_t bufsize) ++{ ++ MediaBufsStatus rv = fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize); ++ if (rv != MEDIABUFS_STATUS_SUCCESS) ++ request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height); ++ ++ return rv; ++} ++ ++int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n) ++{ ++ int rv = 0; ++ while (n--) { ++ while (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, ctrls)) { ++ const int err = errno; ++ if (err != EINTR) { ++ // Often used for probing - errors are to be expected ++ request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", ctrls->id, err); ++ ctrls->type = 0; // 0 is invalid ++ rv = -err; ++ break; ++ } ++ } ++ ++ctrls; ++ } ++ return rv; ++} ++ ++int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc) ++{ ++ // Single planar OUTPUT can only take exact size buffers ++ // Multiplanar will take larger than negotiated ++ return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type); ++} ++ ++static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc) ++{ ++ if (!mbc) ++ return; ++ ++ // Break the weak link first ++ ff_weak_link_break(&mbc->this_wlm); ++ ++ polltask_delete(&mbc->pt); ++ ++ mediabufs_stream_off(mbc); ++ ++ // Empty v4l2 buffer stash ++ request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0); ++ 
request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0); ++ ++ bq_free_all_free_src(mbc->src); ++ bq_free_all_inuse_src(mbc->src); ++ bq_free_all_free_dst(mbc->dst); ++ ++ { ++ struct qent_dst *dst_be; ++ while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) { ++ dst_be->base.timestamp = (struct timeval){0}; ++ dst_be->base.status = QENT_ERROR; ++ qe_dst_done(dst_be); ++ } ++ } ++ ++ queue_delete(mbc->dst); ++ queue_delete(mbc->src); ++ close(mbc->vfd); ++ pthread_mutex_destroy(&mbc->lock); ++ ++ free(mbc); ++} ++ ++struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc) ++{ ++ atomic_fetch_add(&mbc->ref_count, 1); ++ return mbc; ++} ++ ++void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc) ++{ ++ struct mediabufs_ctl *const mbc = *pmbc; ++ int n; ++ ++ if (!mbc) ++ return; ++ *pmbc = NULL; ++ n = atomic_fetch_sub(&mbc->ref_count, 1); ++ if (n) ++ return; ++ mediabufs_ctl_delete(mbc); ++} ++ ++static int set_capabilities(struct mediabufs_ctl *const mbc) ++{ ++ struct v4l2_capability capability = { 0 }; ++ uint32_t caps; ++ ++ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &capability)) { ++ int err = errno; ++ request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err)); ++ return -err; ++ } ++ ++ caps = (capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? 
++ capability.device_caps : ++ capability.capabilities; ++ ++ if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) { ++ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; ++ } ++ else if ((caps & V4L2_CAP_VIDEO_M2M) != 0) { ++ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ } ++ else { ++ request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++/* One of these per context */ ++struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq) ++{ ++ struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc)); ++ ++ if (!mbc) ++ return NULL; ++ ++ mbc->dc = dc; ++ // Default mono planar ++ mbc->pq = pq; ++ pthread_mutex_init(&mbc->lock, NULL); ++ ++ /* Pick a default - could we scan for this? */ ++ if (vpath == NULL) ++ vpath = "/dev/media0"; ++ ++ while ((mbc->vfd = open(vpath, O_RDWR)) == -1) ++ { ++ const int err = errno; ++ if (err != EINTR) { ++ request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err)); ++ goto fail0; ++ } ++ } ++ ++ if (set_capabilities(mbc)) { ++ request_err(dc, "Bad capabilities for video dev '%s'\n", vpath); ++ goto fail1; ++ } ++ ++ mbc->src = queue_new(mbc->vfd); ++ if (!mbc->src) ++ goto fail1; ++ mbc->dst = queue_new(mbc->vfd); ++ if (!mbc->dst) ++ goto fail2; ++ mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc); ++ if (!mbc->pt) ++ goto fail3; ++ mbc->this_wlm = ff_weak_link_new(mbc); ++ if (!mbc->this_wlm) ++ goto fail4; ++ ++ /* Cannot add polltask now - polling with nothing pending ++ * generates infinite error polls ++ */ ++ return mbc; ++ ++fail4: ++ polltask_delete(&mbc->pt); ++fail3: ++ queue_delete(mbc->dst); ++fail2: ++ queue_delete(mbc->src); ++fail1: ++ close(mbc->vfd); ++fail0: ++ free(mbc); ++ request_info(dc, "%s: FAILED\n", __func__); ++ return NULL; ++} ++ ++ ++ +diff --git 
a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h +new file mode 100644 +index 0000000000..2f826cfb14 +--- /dev/null ++++ b/libavcodec/v4l2_req_media.h +@@ -0,0 +1,151 @@ ++/* ++e.h ++* ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sub license, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the ++ * next paragraph) shall be included in all copies or substantial portions ++ * of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. ++ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
++ */ ++ ++#ifndef _MEDIA_H_ ++#define _MEDIA_H_ ++ ++#include ++#include ++ ++struct v4l2_format; ++struct v4l2_fmtdesc; ++struct v4l2_query_ext_ctrl; ++ ++struct pollqueue; ++struct media_request; ++struct media_pool; ++ ++typedef enum media_buf_status { ++ MEDIABUFS_STATUS_SUCCESS = 0, ++ MEDIABUFS_ERROR_OPERATION_FAILED, ++ MEDIABUFS_ERROR_DECODING_ERROR, ++ MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE, ++ MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT, ++ MEDIABUFS_ERROR_ALLOCATION_FAILED, ++} MediaBufsStatus; ++ ++struct media_pool * media_pool_new(const char * const media_path, ++ struct pollqueue * const pq, ++ const unsigned int n); ++void media_pool_delete(struct media_pool ** pmp); ++ ++// Obtain a media request ++// Will block if none availible - has a 2sec timeout ++struct media_request * media_request_get(struct media_pool * const mp); ++int media_request_fd(const struct media_request * const req); ++ ++// Start this request ++// Request structure is returned to pool once done ++int media_request_start(struct media_request * const req); ++ ++// Return an *unstarted* media_request to the pool ++// May later be upgraded to allow for aborting a started req ++int media_request_abort(struct media_request ** const preq); ++ ++ ++struct mediabufs_ctl; ++struct qent_src; ++struct qent_dst; ++struct dmabuf_h; ++struct dmabufs_ctl; ++ ++int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp); ++struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst); ++ ++// prealloc ++int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc); ++// dbsc may be NULL if realloc not required ++int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc); ++const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane); ++int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane); 
++MediaBufsStatus qent_dst_wait(struct qent_dst *const be); ++void qent_dst_delete(struct qent_dst *const be); ++// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead ++void qent_dst_unref(struct qent_dst ** const pbe_dst); ++struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst); ++ ++const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no); ++MediaBufsStatus qent_dst_read_start(struct qent_dst *const be); ++MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be); ++/* Import an fd unattached to any mediabuf */ ++MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst, ++ unsigned int plane, ++ int fd, size_t size); ++ ++MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc, ++ struct media_request **const pmreq, ++ struct qent_src **const psrc_be, ++ struct qent_dst *const dst_be, ++ const bool is_final); ++// Get / alloc a dst buffer & associate with a slot ++// If the dst pool is empty then behaviour depends on the fixed flag passed to ++// dst_slots_create. 
Default is !fixed = unlimited alloc ++struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, ++ struct dmabufs_ctl *const dbsc); ++// Create dst slots without alloc ++// If fixed true then qent_alloc will only get slots from this pool and will ++// block until a qent has been unrefed ++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed); ++ ++MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc); ++MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc); ++const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc); ++ ++typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc); ++ ++MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc, ++ const unsigned int width, ++ const unsigned int height, ++ mediabufs_dst_fmt_accept_fn *const accept_fn, ++ void *const accept_v); ++struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc); ++void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src); ++ ++int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, ++ struct v4l2_ext_control control_array[], unsigned int n); ++MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc, ++ struct media_request * const mreq, ++ unsigned int id, void *data, ++ unsigned int size); ++int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n); ++ ++int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc); ++ ++MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, ++ enum v4l2_buf_type buf_type, ++ const uint32_t pixfmt, ++ const uint32_t width, const uint32_t height, ++ const size_t bufsize); ++ ++MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw, ++ struct dmabufs_ctl * const dbsc, ++ unsigned int n); ++ 
++struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, ++ const char *vpath, struct pollqueue *const pq); ++void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc); ++struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc); ++ ++ ++#endif +diff --git a/libavcodec/v4l2_req_pollqueue.c b/libavcodec/v4l2_req_pollqueue.c +new file mode 100644 +index 0000000000..cc8a5d4001 +--- /dev/null ++++ b/libavcodec/v4l2_req_pollqueue.c +@@ -0,0 +1,361 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "v4l2_req_pollqueue.h" ++#include "v4l2_req_utils.h" ++ ++ ++struct pollqueue; ++ ++enum polltask_state { ++ POLLTASK_UNQUEUED = 0, ++ POLLTASK_QUEUED, ++ POLLTASK_RUNNING, ++ POLLTASK_Q_KILL, ++ POLLTASK_RUN_KILL, ++}; ++ ++struct polltask { ++ struct polltask *next; ++ struct polltask *prev; ++ struct pollqueue *q; ++ enum polltask_state state; ++ ++ int fd; ++ short events; ++ ++ void (*fn)(void *v, short revents); ++ void * v; ++ ++ uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */ ++ sem_t kill_sem; ++}; ++ ++struct pollqueue { ++ atomic_int ref_count; ++ pthread_mutex_t lock; ++ ++ struct polltask *head; ++ struct polltask *tail; ++ ++ bool kill; ++ bool no_prod; ++ int prod_fd; ++ struct polltask *prod_pt; ++ pthread_t worker; ++}; ++ ++struct polltask *polltask_new(struct pollqueue *const pq, ++ const int fd, const short events, ++ void (*const fn)(void *v, short revents), ++ void *const v) ++{ ++ struct polltask *pt; ++ ++ if (!events) ++ return NULL; ++ ++ pt = malloc(sizeof(*pt)); ++ if (!pt) ++ return NULL; ++ ++ *pt = (struct polltask){ ++ .next = NULL, ++ .prev = NULL, ++ .q = pollqueue_ref(pq), ++ .fd = fd, ++ .events = events, ++ .fn = fn, ++ .v = v ++ }; ++ ++ sem_init(&pt->kill_sem, 0, 0); ++ ++ return pt; ++} ++ ++static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt) ++{ ++ if (pt->prev) ++ 
pt->prev->next = pt->next; ++ else ++ pq->head = pt->next; ++ if (pt->next) ++ pt->next->prev = pt->prev; ++ else ++ pq->tail = pt->prev; ++ pt->next = NULL; ++ pt->prev = NULL; ++} ++ ++static void polltask_free(struct polltask * const pt) ++{ ++ sem_destroy(&pt->kill_sem); ++ free(pt); ++} ++ ++static int pollqueue_prod(const struct pollqueue *const pq) ++{ ++ static const uint64_t one = 1; ++ return write(pq->prod_fd, &one, sizeof(one)); ++} ++ ++void polltask_delete(struct polltask **const ppt) ++{ ++ struct polltask *const pt = *ppt; ++ struct pollqueue * pq; ++ enum polltask_state state; ++ bool prodme; ++ ++ if (!pt) ++ return; ++ ++ pq = pt->q; ++ pthread_mutex_lock(&pq->lock); ++ state = pt->state; ++ pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL; ++ prodme = !pq->no_prod; ++ pthread_mutex_unlock(&pq->lock); ++ ++ if (state != POLLTASK_UNQUEUED) { ++ if (prodme) ++ pollqueue_prod(pq); ++ while (sem_wait(&pt->kill_sem) && errno == EINTR) ++ /* loop */; ++ } ++ ++ // Leave zapping the ref until we have DQed the PT as might well be ++ // legitimately used in it ++ *ppt = NULL; ++ polltask_free(pt); ++ pollqueue_unref(&pq); ++} ++ ++static uint64_t pollqueue_now(int timeout) ++{ ++ struct timespec now; ++ uint64_t now_ms; ++ ++ if (clock_gettime(CLOCK_MONOTONIC, &now)) ++ return 0; ++ now_ms = (now.tv_nsec / 1000000) + (uint64_t)now.tv_sec * 1000 + timeout; ++ return now_ms ? now_ms : (uint64_t)1; ++} ++ ++void pollqueue_add_task(struct polltask *const pt, const int timeout) ++{ ++ bool prodme = false; ++ struct pollqueue * const pq = pt->q; ++ ++ pthread_mutex_lock(&pq->lock); ++ if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) { ++ if (pq->tail) ++ pq->tail->next = pt; ++ else ++ pq->head = pt; ++ pt->prev = pq->tail; ++ pt->next = NULL; ++ pt->state = POLLTASK_QUEUED; ++ pt->timeout = timeout < 0 ? 
0 : pollqueue_now(timeout); ++ pq->tail = pt; ++ prodme = !pq->no_prod; ++ } ++ pthread_mutex_unlock(&pq->lock); ++ if (prodme) ++ pollqueue_prod(pq); ++} ++ ++static void *poll_thread(void *v) ++{ ++ struct pollqueue *const pq = v; ++ struct pollfd *a = NULL; ++ size_t asize = 0; ++ ++ pthread_mutex_lock(&pq->lock); ++ do { ++ unsigned int i; ++ unsigned int n = 0; ++ struct polltask *pt; ++ struct polltask *pt_next; ++ uint64_t now = pollqueue_now(0); ++ int timeout = -1; ++ int rv; ++ ++ for (pt = pq->head; pt; pt = pt_next) { ++ int64_t t; ++ ++ pt_next = pt->next; ++ ++ if (pt->state == POLLTASK_Q_KILL) { ++ pollqueue_rem_task(pq, pt); ++ sem_post(&pt->kill_sem); ++ continue; ++ } ++ ++ if (n >= asize) { ++ asize = asize ? asize * 2 : 4; ++ a = realloc(a, asize * sizeof(*a)); ++ if (!a) { ++ request_log("Failed to realloc poll array to %zd\n", asize); ++ goto fail_locked; ++ } ++ } ++ ++ a[n++] = (struct pollfd){ ++ .fd = pt->fd, ++ .events = pt->events ++ }; ++ ++ t = (int64_t)(pt->timeout - now); ++ if (pt->timeout && t < INT_MAX && ++ (timeout < 0 || (int)t < timeout)) ++ timeout = (t < 0) ? 0 : (int)t; ++ } ++ pthread_mutex_unlock(&pq->lock); ++ ++ if ((rv = poll(a, n, timeout)) == -1) { ++ if (errno != EINTR) { ++ request_log("Poll error: %s\n", strerror(errno)); ++ goto fail_unlocked; ++ } ++ } ++ ++ pthread_mutex_lock(&pq->lock); ++ now = pollqueue_now(0); ++ ++ /* Prodding in this loop is pointless and might lead to ++ * infinite looping ++ */ ++ pq->no_prod = true; ++ for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) { ++ pt_next = pt->next; ++ ++ /* Pending? 
*/ ++ if (a[i].revents || ++ (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) { ++ pollqueue_rem_task(pq, pt); ++ if (pt->state == POLLTASK_QUEUED) ++ pt->state = POLLTASK_RUNNING; ++ if (pt->state == POLLTASK_Q_KILL) ++ pt->state = POLLTASK_RUN_KILL; ++ pthread_mutex_unlock(&pq->lock); ++ ++ /* This can add new entries to the Q but as ++ * those are added to the tail our existing ++ * chain remains intact ++ */ ++ pt->fn(pt->v, a[i].revents); ++ ++ pthread_mutex_lock(&pq->lock); ++ if (pt->state == POLLTASK_RUNNING) ++ pt->state = POLLTASK_UNQUEUED; ++ if (pt->state == POLLTASK_RUN_KILL) ++ sem_post(&pt->kill_sem); ++ } ++ } ++ pq->no_prod = false; ++ ++ } while (!pq->kill); ++ ++fail_locked: ++ pthread_mutex_unlock(&pq->lock); ++fail_unlocked: ++ free(a); ++ return NULL; ++} ++ ++static void prod_fn(void *v, short revents) ++{ ++ struct pollqueue *const pq = v; ++ char buf[8]; ++ if (revents) ++ read(pq->prod_fd, buf, 8); ++ if (!pq->kill) ++ pollqueue_add_task(pq->prod_pt, -1); ++} ++ ++struct pollqueue * pollqueue_new(void) ++{ ++ struct pollqueue *pq = malloc(sizeof(*pq)); ++ if (!pq) ++ return NULL; ++ *pq = (struct pollqueue){ ++ .ref_count = ATOMIC_VAR_INIT(0), ++ .lock = PTHREAD_MUTEX_INITIALIZER, ++ .head = NULL, ++ .tail = NULL, ++ .kill = false, ++ .prod_fd = -1 ++ }; ++ ++ pq->prod_fd = eventfd(0, EFD_NONBLOCK); ++ if (pq->prod_fd == -1) /* eventfd() reports failure as -1; any other value is a usable fd */ ++ goto fail1; ++ pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq); ++ if (!pq->prod_pt) ++ goto fail2; ++ pollqueue_add_task(pq->prod_pt, -1); ++ if (pthread_create(&pq->worker, NULL, poll_thread, pq)) ++ goto fail3; ++ // Reset ref count which will have been inced by polltask_new for the internal prod task ++ atomic_store(&pq->ref_count, 0); ++ return pq; ++ ++fail3: ++ polltask_free(pq->prod_pt); ++fail2: ++ close(pq->prod_fd); ++fail1: ++ free(pq); ++ return NULL; ++} ++ ++static void pollqueue_free(struct pollqueue *const pq) ++{ ++ void *rv; ++ ++ pthread_mutex_lock(&pq->lock); ++ pq->kill = true; ++ pollqueue_prod(pq); 
++ pthread_mutex_unlock(&pq->lock); ++ ++ pthread_join(pq->worker, &rv); ++ polltask_free(pq->prod_pt); ++ pthread_mutex_destroy(&pq->lock); ++ close(pq->prod_fd); ++ free(pq); ++} ++ ++struct pollqueue * pollqueue_ref(struct pollqueue *const pq) ++{ ++ atomic_fetch_add(&pq->ref_count, 1); ++ return pq; ++} ++ ++void pollqueue_unref(struct pollqueue **const ppq) ++{ ++ struct pollqueue * const pq = *ppq; ++ ++ if (!pq) ++ return; ++ *ppq = NULL; ++ ++ if (atomic_fetch_sub(&pq->ref_count, 1) != 0) ++ return; ++ ++ pollqueue_free(pq); ++} ++ ++ ++ +diff --git a/libavcodec/v4l2_req_pollqueue.h b/libavcodec/v4l2_req_pollqueue.h +new file mode 100644 +index 0000000000..e1182cb2fc +--- /dev/null ++++ b/libavcodec/v4l2_req_pollqueue.h +@@ -0,0 +1,18 @@ ++#ifndef POLLQUEUE_H_ ++#define POLLQUEUE_H_ ++ ++struct polltask; ++struct pollqueue; ++ ++struct polltask *polltask_new(struct pollqueue *const pq, ++ const int fd, const short events, ++ void (*const fn)(void *v, short revents), ++ void *const v); ++void polltask_delete(struct polltask **const ppt); ++ ++void pollqueue_add_task(struct polltask *const pt, const int timeout); ++struct pollqueue * pollqueue_new(void); ++void pollqueue_unref(struct pollqueue **const ppq); ++struct pollqueue * pollqueue_ref(struct pollqueue *const pq); ++ ++#endif /* POLLQUEUE_H_ */ +diff --git a/libavcodec/v4l2_req_utils.h b/libavcodec/v4l2_req_utils.h +new file mode 100644 +index 0000000000..a31cc1f4ec +--- /dev/null ++++ b/libavcodec/v4l2_req_utils.h +@@ -0,0 +1,27 @@ ++#ifndef AVCODEC_V4L2_REQ_UTILS_H ++#define AVCODEC_V4L2_REQ_UTILS_H ++ ++#include ++#include "libavutil/log.h" ++ ++#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__) ++ ++#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__) ++#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__) ++#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__) ++#define request_debug(_ctx, ...) 
av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__) ++ ++static inline char safechar(char c) { ++ return c > 0x20 && c < 0x7f ? c : '.'; ++} ++ ++static inline const char * strfourcc(char tbuf[5], uint32_t fcc) { ++ tbuf[0] = safechar((fcc >> 0) & 0xff); ++ tbuf[1] = safechar((fcc >> 8) & 0xff); ++ tbuf[2] = safechar((fcc >> 16) & 0xff); ++ tbuf[3] = safechar((fcc >> 24) & 0xff); ++ tbuf[4] = '\0'; ++ return tbuf; ++} ++ ++#endif +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +new file mode 100644 +index 0000000000..b0a5930844 +--- /dev/null ++++ b/libavcodec/v4l2_request_hevc.c +@@ -0,0 +1,297 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++ ++#include "decode.h" ++#include "hevcdec.h" ++#include "hwconfig.h" ++#include "internal.h" ++ ++#include "v4l2_request_hevc.h" ++ ++#include "libavutil/hwcontext_drm.h" ++ ++#include "v4l2_req_devscan.h" ++#include "v4l2_req_dmabufs.h" ++#include "v4l2_req_pollqueue.h" ++#include "v4l2_req_media.h" ++#include "v4l2_req_utils.h" ++ ++static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8) ++{ ++ const size_t wxh = w * h; ++ size_t bits_alloc; ++ ++ /* Annex A gives a min compression of 2 @ lvl 3.1 ++ * (wxh <= 983040) and min 4 thereafter but avoid ++ * the odity of 983041 having a lower limit than ++ * 983040. ++ * Multiply by 3/2 for 4:2:0 ++ */ ++ bits_alloc = wxh < 983040 ? wxh * 3 / 4 : ++ wxh < 983040 * 2 ? 983040 * 3 / 4 : ++ wxh * 3 / 8; ++ /* Allow for bit depth */ ++ bits_alloc += (bits_alloc * bits_minus8) / 8; ++ /* Add a few bytes (16k) for overhead */ ++ bits_alloc += 0x4000; ++ return bits_alloc; ++} ++ ++static int v4l2_req_hevc_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ return ctx->fns->start_frame(avctx, buffer, size); ++} ++ ++static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ return ctx->fns->decode_slice(avctx, buffer, size); ++} ++ ++static int v4l2_req_hevc_end_frame(AVCodecContext *avctx) ++{ ++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; ++ return ctx->fns->end_frame(avctx); ++} ++ ++static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx) ++{ ++ V4L2RequestContextHEVC * 
const ctx = avctx->internal->hwaccel_priv_data; ++ ctx->fns->abort_frame(avctx); ++} ++ ++static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ return ctx->fns->frame_params(avctx, hw_frames_ctx); ++} ++ ++static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ return ctx->fns->alloc_frame(avctx, frame); ++} ++ ++ ++static int v4l2_request_hevc_uninit(AVCodecContext *avctx) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ ++ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); ++ ++ decode_q_wait(&ctx->decode_q, NULL); // Wait for all other threads to be out of decode ++ ++ mediabufs_ctl_unref(&ctx->mbufs); ++ media_pool_delete(&ctx->mpool); ++ pollqueue_unref(&ctx->pq); ++ dmabufs_ctl_delete(&ctx->dbufs); ++ devscan_delete(&ctx->devscan); ++ ++ decode_q_uninit(&ctx->decode_q); ++ ++// if (avctx->hw_frames_ctx) { ++// AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data; ++// av_buffer_pool_flush(hwfc->pool); ++// } ++ return 0; ++} ++ ++static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc) ++{ ++ AVCodecContext *const avctx = v; ++ const HEVCContext *const h = avctx->priv_data; ++ ++ if (h->ps.sps->bit_depth == 8) { ++ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_COL128 || ++ fmtdesc->pixelformat == V4L2_PIX_FMT_NV12) { ++ return 1; ++ } ++ } ++ else if (h->ps.sps->bit_depth == 10) { ++ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { ++ return 1; ++ } ++ } ++ return 0; ++} ++ ++static int v4l2_request_hevc_init(AVCodecContext *avctx) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ const HEVCSPS * const sps = h->ps.sps; ++ int ret; ++ const struct decdev * decdev; ++ const uint32_t 
src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes ++ size_t src_size; ++ ++ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); ++ ++ if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n"); ++ return (AVERROR(-ret)); ++ } ++ ret = AVERROR(ENOMEM); // Assume mem fail by default for these ++ ++ if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL) ++ { ++ av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n"); ++ ret = AVERROR(ENODEV); ++ goto fail0; ++ } ++ av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n", ++ decdev_media_path(decdev), decdev_video_path(decdev)); ++ ++ if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n"); ++ goto fail0; ++ } ++ ++ if ((ctx->pq = pollqueue_new()) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n"); ++ goto fail1; ++ } ++ ++ if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n"); ++ goto fail2; ++ } ++ ++ if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n"); ++ goto fail3; ++ } ++ ++ // Ask for an initial bitbuf size of max size / 4 ++ // We will realloc if we need more ++ // Must use sps->h/w as avctx contains cropped size ++ src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8); ++ if (mediabufs_src_resizable(ctx->mbufs)) ++ src_size /= 4; ++ // Kludge for conformance tests which break Annex A limits ++ else if (src_size < 0x40000) ++ src_size = 0x40000; ++ ++ if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt, ++ sps->width, sps->height, src_size)) { ++ char tbuf1[5]; ++ av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", 
strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height); ++ goto fail4; ++ } ++ ++ if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n"); ++ ctx->fns = &V2(ff_v4l2_req_hevc, 2); ++ } ++ else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n"); ++ ctx->fns = &V2(ff_v4l2_req_hevc, 1); ++ } ++ else { ++ av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n"); ++ ret = AVERROR(EINVAL); ++ goto fail4; ++ } ++ ++ if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) { ++ char tbuf1[5]; ++ av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height); ++ goto fail4; ++ } ++ ++ if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n"); ++ goto fail4; ++ } ++ ++ { ++ unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering + ++ avctx->thread_count + (avctx->extra_hw_frames > 0 ? 
avctx->extra_hw_frames : 6); ++ av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots, ++ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering, ++ avctx->thread_count, avctx->extra_hw_frames); ++ ++ // extra_hw_frames is -1 if unset ++ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n"); ++ goto fail4; ++ } ++ } ++ ++ if (mediabufs_stream_on(ctx->mbufs)) { ++ av_log(avctx, AV_LOG_ERROR, "Failed stream on\n"); ++ goto fail4; ++ } ++ ++ if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n"); ++ goto fail4; ++ } ++ ++ if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed set controls\n"); ++ goto fail5; ++ } ++ ++ decode_q_init(&ctx->decode_q); ++ ++ // Set our s/w format ++ avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format; ++ ++ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n", ++ ctx->fns->name, ++ decdev_media_path(decdev), decdev_video_path(decdev)); ++ ++ return 0; ++ ++fail5: ++ av_buffer_unref(&avctx->hw_frames_ctx); ++fail4: ++ mediabufs_ctl_unref(&ctx->mbufs); ++fail3: ++ media_pool_delete(&ctx->mpool); ++fail2: ++ pollqueue_unref(&ctx->pq); ++fail1: ++ dmabufs_ctl_delete(&ctx->dbufs); ++fail0: ++ devscan_delete(&ctx->devscan); ++ return ret; ++} ++ ++const AVHWAccel ff_hevc_v4l2request_hwaccel = { ++ .name = "hevc_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_HEVC, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .alloc_frame = v4l2_req_hevc_alloc_frame, ++ .start_frame = v4l2_req_hevc_start_frame, ++ .decode_slice = v4l2_req_hevc_decode_slice, ++ .end_frame = v4l2_req_hevc_end_frame, ++ .abort_frame = v4l2_req_hevc_abort_frame, ++ .init = v4l2_request_hevc_init, ++ .uninit = v4l2_request_hevc_uninit, ++ .priv_data_size = 
sizeof(V4L2RequestContextHEVC), ++ .frame_params = v4l2_req_hevc_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, ++}; +diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h +new file mode 100644 +index 0000000000..f14f594564 +--- /dev/null ++++ b/libavcodec/v4l2_request_hevc.h +@@ -0,0 +1,102 @@ ++#ifndef AVCODEC_V4L2_REQUEST_HEVC_H ++#define AVCODEC_V4L2_REQUEST_HEVC_H ++ ++#include ++#include ++#include "v4l2_req_decode_q.h" ++ ++#ifndef DRM_FORMAT_NV15 ++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') ++#endif ++ ++#ifndef DRM_FORMAT_NV20 ++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') ++#endif ++ ++// P030 should be defined in drm_fourcc.h and hopefully will be sometime ++// in the future but until then... ++#ifndef DRM_FORMAT_P030 ++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') ++#endif ++ ++#ifndef DRM_FORMAT_NV15 ++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') ++#endif ++ ++#ifndef DRM_FORMAT_NV20 ++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') ++#endif ++ ++#include ++#ifndef V4L2_CID_CODEC_BASE ++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE ++#endif ++ ++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined ++// in drm_fourcc.h hopefully will be sometime in the future but until then... 
++#ifndef V4L2_PIX_FMT_NV12_10_COL128 ++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') ++#endif ++ ++#ifndef V4L2_PIX_FMT_NV12_COL128 ++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ ++#endif ++ ++#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY ++#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800 ++#endif ++ ++#define MAX_SLICES 128 ++ ++#define VCAT(name, version) name##_v##version ++#define V2(n,v) VCAT(n, v) ++#define V(n) V2(n, HEVC_CTRLS_VERSION) ++ ++#define S2(x) #x ++#define STR(x) S2(x) ++ ++// 1 per decoder ++struct v4l2_req_decode_fns; ++ ++typedef struct V4L2RequestContextHEVC { ++// V4L2RequestContext base; ++ const struct v4l2_req_decode_fns * fns; ++ ++ unsigned int timestamp; // ?? maybe uint64_t ++ ++ int multi_slice; ++ int decode_mode; ++ int start_code; ++ int max_slices; ++ ++ req_decode_q decode_q; ++ ++ struct devscan *devscan; ++ struct dmabufs_ctl *dbufs; ++ struct pollqueue *pq; ++ struct media_pool * mpool; ++ struct mediabufs_ctl *mbufs; ++} V4L2RequestContextHEVC; ++ ++typedef struct v4l2_req_decode_fns { ++ int src_pix_fmt_v4l2; ++ const char * name; ++ ++ // Init setup ++ int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); ++ int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); ++ ++ // Passthrough of hwaccel fns ++ int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); ++ int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); ++ int (*end_frame)(AVCodecContext *avctx); ++ void (*abort_frame)(AVCodecContext *avctx); ++ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); ++ int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame); ++} v4l2_req_decode_fns; ++ ++ ++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); ++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); ++ ++#endif + +From 
297e4d885dd99aa0d1703c854fcb1f926d60abbb Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 27 Apr 2021 19:30:36 +0100 +Subject: [PATCH 013/113] Add no_cvt_hw option to ffmpeg + +--- + fftools/ffmpeg.c | 6 ++++-- + fftools/ffmpeg.h | 1 + + fftools/ffmpeg_opt.c | 3 +++ + 3 files changed, 8 insertions(+), 2 deletions(-) + +diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c +index c68f96006b..e98f0b8149 100644 +--- a/fftools/ffmpeg.c ++++ b/fftools/ffmpeg.c +@@ -1965,6 +1965,9 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame, int keep_ref + (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data)) + need_reinit = 1; + ++ if (no_cvt_hw && fg->graph) ++ need_reinit = 0; ++ + if (sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DISPLAYMATRIX)) { + if (!ifilter->displaymatrix || memcmp(sd->data, ifilter->displaymatrix, sizeof(int32_t) * 9)) + need_reinit = 1; +@@ -2220,8 +2223,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ + decoded_frame->top_field_first = ist->top_field_first; + + ist->frames_decoded++; +- +- if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { ++ if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { + err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame); + if (err < 0) + goto fail; +diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h +index 391a35cf50..f43438b1ed 100644 +--- a/fftools/ffmpeg.h ++++ b/fftools/ffmpeg.h +@@ -626,6 +626,7 @@ extern enum VideoSyncMethod video_sync_method; + extern float frame_drop_threshold; + extern int do_benchmark; + extern int do_benchmark_all; ++extern int no_cvt_hw; + extern int do_deinterlace; + extern int do_hex_dump; + extern int do_pkt_dump; +diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c +index 6e18a4a23e..877689f8bc 100644 +--- a/fftools/ffmpeg_opt.c ++++ b/fftools/ffmpeg_opt.c +@@ -162,6 +162,7 @@ enum VideoSyncMethod video_sync_method 
= VSYNC_AUTO; + float frame_drop_threshold = 0; + int do_benchmark = 0; + int do_benchmark_all = 0; ++int no_cvt_hw = 0; + int do_hex_dump = 0; + int do_pkt_dump = 0; + int copy_ts = 0; +@@ -3718,6 +3719,8 @@ const OptionDef options[] = { + "add timings for benchmarking" }, + { "benchmark_all", OPT_BOOL | OPT_EXPERT, { &do_benchmark_all }, + "add timings for each task" }, ++ { "no_cvt_hw", OPT_BOOL | OPT_EXPERT, { &no_cvt_hw }, ++ "do not auto-convert hw frames to sw" }, + { "progress", HAS_ARG | OPT_EXPERT, { .func_arg = opt_progress }, + "write program-readable progress information", "url" }, + { "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction }, + +From 1707f2fac047809232ec19c6c9f038461ff1ec94 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 28 Apr 2021 10:16:39 +0100 +Subject: [PATCH 014/113] Add vout_drm + +--- + configure | 4 + + libavdevice/Makefile | 1 + + libavdevice/alldevices.c | 1 + + libavdevice/drm_vout.c | 636 +++++++++++++++++++++++++++++++++++++++ + 4 files changed, 642 insertions(+) + create mode 100644 libavdevice/drm_vout.c + +diff --git a/configure b/configure +index b64a6cf822..e6440cd9f3 100755 +--- a/configure ++++ b/configure +@@ -345,6 +345,7 @@ External library support: + --enable-libnpp enable Nvidia Performance Primitives-based code [no] + --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] + --enable-sand enable sand video formats [rpi] ++ --enable-vout-drm enable the vout_drm module - for internal testing only [no] + --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] + --disable-nvenc disable Nvidia video encoding code [autodetect] + --enable-omx enable OpenMAX IL code [no] +@@ -1951,6 +1952,7 @@ FEATURE_LIST=" + small + static + swscale_alpha ++ vout_drm + " + + # this list should be kept in linking order +@@ -3549,8 +3551,10 @@ sndio_indev_deps="sndio" + sndio_outdev_deps="sndio" + v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h" + 
v4l2_indev_suggest="libv4l2" ++v4l2_outdev_deps="libdrm" + v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" + v4l2_outdev_suggest="libv4l2" ++vout_drm_outdev_deps="libdrm vout_drm" + vfwcap_indev_deps="vfw32 vfwcap_defines" + xcbgrab_indev_deps="libxcb" + xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" +diff --git a/libavdevice/Makefile b/libavdevice/Makefile +index bbe2f69dcc..447dbfdf9a 100644 +--- a/libavdevice/Makefile ++++ b/libavdevice/Makefile +@@ -48,6 +48,7 @@ OBJS-$(CONFIG_SNDIO_OUTDEV) += sndio_enc.o sndio.o + OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o + OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o + OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o ++OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o + OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o + OBJS-$(CONFIG_XV_OUTDEV) += xv.o + +diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c +index 22323a0a44..23351f0316 100644 +--- a/libavdevice/alldevices.c ++++ b/libavdevice/alldevices.c +@@ -51,6 +51,7 @@ extern const AVOutputFormat ff_sndio_muxer; + extern const AVInputFormat ff_v4l2_demuxer; + extern const AVOutputFormat ff_v4l2_muxer; + extern const AVInputFormat ff_vfwcap_demuxer; ++extern const AVOutputFormat ff_vout_drm_muxer; + extern const AVInputFormat ff_xcbgrab_demuxer; + extern const AVOutputFormat ff_xv_muxer; + +diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c +new file mode 100644 +index 0000000000..15ed1b8825 +--- /dev/null ++++ b/libavdevice/drm_vout.c +@@ -0,0 +1,636 @@ ++/* ++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. 
++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++// *** This module is a work in progress and its utility is strictly ++// limited to testing. ++ ++#include "libavutil/opt.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/hwcontext_drm.h" ++#include "libavformat/internal.h" ++#include "avdevice.h" ++ ++#include "pthread.h" ++#include ++#include ++ ++#include ++#include ++ ++#define TRACE_ALL 0 ++ ++#define DRM_MODULE "vc4" ++ ++#define ERRSTR strerror(errno) ++ ++struct drm_setup { ++ int conId; ++ uint32_t crtcId; ++ int crtcIdx; ++ uint32_t planeId; ++ unsigned int out_fourcc; ++ struct { ++ int x, y, width, height; ++ } compose; ++}; ++ ++typedef struct drm_aux_s { ++ unsigned int fb_handle; ++ uint32_t bo_handles[AV_DRM_MAX_PLANES]; ++ AVFrame * frame; ++} drm_aux_t; ++ ++// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS ++// we get initial flicker probably due to dodgy drm timing ++#define AUX_SIZE 3 ++typedef struct drm_display_env_s ++{ ++ AVClass *class; ++ ++ int drm_fd; ++ uint32_t con_id; ++ struct drm_setup setup; ++ enum AVPixelFormat avfmt; ++ int show_all; ++ ++ unsigned int ano; ++ drm_aux_t aux[AUX_SIZE]; ++ ++ pthread_t q_thread; ++ sem_t q_sem_in; ++ sem_t q_sem_out; ++ int q_terminate; ++ AVFrame * q_next; ++ ++} drm_display_env_t; ++ ++ ++static int drm_vout_write_trailer(AVFormatContext *s) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "%s\n", __func__); ++#endif ++ ++ return 0; ++} ++ ++static int drm_vout_write_header(AVFormatContext *s) ++{ ++ const 
AVCodecParameters * const par = s->streams[0]->codecpar; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "%s\n", __func__); ++#endif ++ if ( s->nb_streams > 1 ++ || par->codec_type != AVMEDIA_TYPE_VIDEO ++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { ++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ ++static int find_plane(struct AVFormatContext * const avctx, ++ const int drmfd, const int crtcidx, const uint32_t format, ++ uint32_t * const pplane_id) ++{ ++ drmModePlaneResPtr planes; ++ drmModePlanePtr plane; ++ unsigned int i; ++ unsigned int j; ++ int ret = 0; ++ ++ planes = drmModeGetPlaneResources(drmfd); ++ if (!planes) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR); ++ return -1; ++ } ++ ++ for (i = 0; i < planes->count_planes; ++i) { ++ plane = drmModeGetPlane(drmfd, planes->planes[i]); ++ if (!plane) /* check the plane just fetched, not the resource list */ ++ { ++ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR); ++ ret = -1; break; /* fail: *pplane_id was not set */ ++ } ++ ++ if (!(plane->possible_crtcs & (1 << crtcidx))) { ++ drmModeFreePlane(plane); ++ continue; ++ } ++ ++ for (j = 0; j < plane->count_formats; ++j) { ++ if (plane->formats[j] == format) ++ break; ++ } ++ ++ if (j == plane->count_formats) { ++ drmModeFreePlane(plane); ++ continue; ++ } ++ ++ *pplane_id = plane->plane_id; ++ drmModeFreePlane(plane); ++ break; ++ } ++ ++ if (i == planes->count_planes) ++ ret = -1; ++ ++ drmModeFreePlaneResources(planes); ++ return ret; ++} ++ ++static void da_uninit(drm_display_env_t * const de, drm_aux_t * da) ++{ ++ if (da->fb_handle != 0) { ++ drmModeRmFB(de->drm_fd, da->fb_handle); ++ da->fb_handle = 0; ++ } ++ ++ for (unsigned int i = 0; i != AV_DRM_MAX_PLANES; ++i) { ++ if (da->bo_handles[i]) { ++ struct drm_gem_close gem_close = {.handle = da->bo_handles[i]}; ++ drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close); ++ da->bo_handles[i] = 0; ++ } ++ } ++ av_frame_free(&da->frame); ++} ++ ++static int
do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame) ++{ ++ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; ++ drm_aux_t * da = de->aux + de->ano; ++ const uint32_t format = desc->layers[0].format; ++ int ret = 0; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd); ++#endif ++ ++ if (de->setup.out_fourcc != format) { ++ if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) { ++ av_frame_free(&frame); ++ av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format); ++ return -1; ++ } ++ de->setup.out_fourcc = format; ++ } ++ ++ { ++ drmVBlank vbl = { ++ .request = { ++ .type = DRM_VBLANK_RELATIVE, ++ .sequence = 0 ++ } ++ }; ++ ++ while (drmWaitVBlank(de->drm_fd, &vbl)) { ++ if (errno != EINTR) { ++// av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR); ++ break; ++ } ++ } ++ } ++ ++ da_uninit(de, da); ++ ++ { ++ uint32_t pitches[4] = {0}; ++ uint32_t offsets[4] = {0}; ++ uint64_t modifiers[4] = {0}; ++ uint32_t bo_handles[4] = {0}; ++ int i, j, n; ++ ++ da->frame = frame; ++ ++ for (i = 0; i < desc->nb_objects; ++i) { ++ if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) { ++ av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR); ++ return -1; ++ } ++ } ++ ++ n = 0; ++ for (i = 0; i < desc->nb_layers; ++i) { ++ for (j = 0; j < desc->layers[i].nb_planes; ++j) { ++ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; ++ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; ++ pitches[n] = p->pitch; ++ offsets[n] = p->offset; ++ modifiers[n] = obj->format_modifier; ++ bo_handles[n] = da->bo_handles[p->object_index]; ++ ++n; ++ } ++ } ++ ++#if 1 && TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," ++ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", ++ 
av_frame_cropped_width(frame), ++ av_frame_cropped_height(frame), ++ desc->layers[0].format, ++ bo_handles[0], ++ bo_handles[1], ++ bo_handles[2], ++ bo_handles[3], ++ pitches[0], ++ pitches[1], ++ pitches[2], ++ pitches[3], ++ offsets[0], ++ offsets[1], ++ offsets[2], ++ offsets[3], ++ (long long)modifiers[0], ++ (long long)modifiers[1], ++ (long long)modifiers[2], ++ (long long)modifiers[3] ++ ); ++#endif ++ ++ if (drmModeAddFB2WithModifiers(de->drm_fd, ++ av_frame_cropped_width(frame), ++ av_frame_cropped_height(frame), ++ desc->layers[0].format, bo_handles, ++ pitches, offsets, modifiers, ++ &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) { ++ av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR); ++ return -1; ++ } ++ } ++ ++ ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId, ++ da->fb_handle, 0, ++ de->setup.compose.x, de->setup.compose.y, ++ de->setup.compose.width, ++ de->setup.compose.height, ++ 0, 0, ++ av_frame_cropped_width(frame) << 16, ++ av_frame_cropped_height(frame) << 16); ++ ++ if (ret != 0) { ++ av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR); ++ } ++ ++ de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1; ++ ++ return ret; ++} ++ ++static int do_sem_wait(sem_t * const sem, const int nowait) ++{ ++ while (nowait ? 
sem_trywait(sem) : sem_wait(sem)) { ++ if (errno != EINTR) ++ return -errno; ++ } ++ return 0; ++} ++ ++static void * display_thread(void * v) ++{ ++ AVFormatContext * const s = v; ++ drm_display_env_t * const de = s->priv_data; ++ int i; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); ++#endif ++ ++ sem_post(&de->q_sem_out); ++ ++ for (;;) { ++ AVFrame * frame; ++ ++ do_sem_wait(&de->q_sem_in, 0); ++ ++ if (de->q_terminate) ++ break; ++ ++ frame = de->q_next; ++ de->q_next = NULL; ++ sem_post(&de->q_sem_out); ++ ++ do_display(s, de, frame); ++ } ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); ++#endif ++ ++ for (i = 0; i != AUX_SIZE; ++i) ++ da_uninit(de, de->aux + i); ++ ++ av_frame_free(&de->q_next); ++ ++ return NULL; ++} ++ ++static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt) ++{ ++ const AVFrame * const src_frame = (AVFrame *)pkt->data; ++ AVFrame * frame; ++ drm_display_env_t * const de = s->priv_data; ++ int ret; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "%s\n", __func__); ++#endif ++ ++ if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) { ++ av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts); ++ return 0; ++ } ++ ++ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) { ++ frame = av_frame_alloc(); ++ av_frame_ref(frame, src_frame); ++ } ++ else if (src_frame->format == AV_PIX_FMT_VAAPI) { ++ frame = av_frame_alloc(); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ if (av_hwframe_map(frame, src_frame, 0) != 0) ++ { ++ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); ++ av_frame_free(&frame); ++ return AVERROR(EINVAL); ++ } ++ } ++ else { ++ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format); ++ return AVERROR(EINVAL); ++ } ++ ++ ret = do_sem_wait(&de->q_sem_out, !de->show_all); ++ if (ret) { ++ av_frame_free(&frame); ++ } ++ else { ++ de->q_next = frame; ++ 
sem_post(&de->q_sem_in); ++ } ++ ++ return 0; ++} ++ ++static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, ++ unsigned flags) ++{ ++ av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags); ++ return AVERROR_PATCHWELCOME; ++} ++ ++static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type); ++#endif ++ switch(type) { ++ case AV_APP_TO_DEV_WINDOW_REPAINT: ++ return 0; ++ default: ++ break; ++ } ++ return AVERROR(ENOSYS); ++} ++ ++static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId) ++{ ++ int ret = -1; ++ int i; ++ drmModeRes *res = drmModeGetResources(drmfd); ++ drmModeConnector *c; ++ ++ if(!res) ++ { ++ printf( "drmModeGetResources failed: %s\n", ERRSTR); ++ return -1; ++ } ++ ++ if (res->count_crtcs <= 0) ++ { ++ printf( "drm: no crts\n"); ++ goto fail_res; ++ } ++ ++ if (!s->conId) { ++ fprintf(stderr, ++ "No connector ID specified. Choosing default from list:\n"); ++ ++ for (i = 0; i < res->count_connectors; i++) { ++ drmModeConnector *con = ++ drmModeGetConnector(drmfd, res->connectors[i]); ++ drmModeEncoder *enc = NULL; ++ drmModeCrtc *crtc = NULL; ++ ++ if (con->encoder_id) { ++ enc = drmModeGetEncoder(drmfd, con->encoder_id); ++ if (enc->crtc_id) { ++ crtc = drmModeGetCrtc(drmfd, enc->crtc_id); ++ } ++ } ++ ++ if (!s->conId && crtc) { ++ s->conId = con->connector_id; ++ s->crtcId = crtc->crtc_id; ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n", ++ con->connector_id, ++ crtc ? crtc->crtc_id : 0, ++ con->connector_type, ++ crtc ? crtc->width : 0, ++ crtc ? crtc->height : 0, ++ (s->conId == (int)con->connector_id ? 
++ " (chosen)" : "")); ++ } ++ ++ if (!s->conId) { ++ av_log(avctx, AV_LOG_ERROR, ++ "No suitable enabled connector found.\n"); ++ return -1;; ++ } ++ } ++ ++ s->crtcIdx = -1; ++ ++ for (i = 0; i < res->count_crtcs; ++i) { ++ if (s->crtcId == res->crtcs[i]) { ++ s->crtcIdx = i; ++ break; ++ } ++ } ++ ++ if (s->crtcIdx == -1) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId); ++ goto fail_res; ++ } ++ ++ if (res->count_connectors <= 0) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n"); ++ goto fail_res; ++ } ++ ++ c = drmModeGetConnector(drmfd, s->conId); ++ if (!c) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR); ++ goto fail_res; ++ } ++ ++ if (!c->count_modes) ++ { ++ av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n"); ++ goto fail_conn; ++ } ++ ++ { ++ drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId); ++ s->compose.x = crtc->x; ++ s->compose.y = crtc->y; ++ s->compose.width = crtc->width; ++ s->compose.height = crtc->height; ++ drmModeFreeCrtc(crtc); ++ } ++ ++ if (pConId) ++ *pConId = c->connector_id; ++ ret = 0; ++ ++fail_conn: ++ drmModeFreeConnector(c); ++ ++fail_res: ++ drmModeFreeResources(res); ++ ++ return ret; ++} ++ ++// deinit is called if init fails so no need to clean up explicity here ++static int drm_vout_init(struct AVFormatContext * s) ++{ ++ drm_display_env_t * const de = s->priv_data; ++ int rv; ++ const char * drm_module = DRM_MODULE; ++ ++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); ++ ++ de->drm_fd = -1; ++ de->con_id = 0; ++ de->setup = (struct drm_setup){0}; ++ de->q_terminate = 0; ++ ++ if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0) ++ { ++ rv = AVERROR(errno); ++ av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv)); ++ return rv; ++ } ++ ++ if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0) ++ { ++ av_log(s, AV_LOG_ERROR, "failed to find valid mode\n"); ++ rv = AVERROR(EINVAL); ++ goto fail_close; ++ } ++ 
++ sem_init(&de->q_sem_in, 0, 0); ++ sem_init(&de->q_sem_out, 0, 0); ++ if (pthread_create(&de->q_thread, NULL, display_thread, s)) { ++ rv = AVERROR(errno); ++ av_log(s, AV_LOG_ERROR, "Failed to creatye display thread: %s\n", av_err2str(rv)); ++ goto fail_close; ++ } ++ ++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); ++ ++ return 0; ++ ++fail_close: ++ close(de->drm_fd); ++ de->drm_fd = -1; ++ av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__); ++ ++ return rv; ++} ++ ++static void drm_vout_deinit(struct AVFormatContext * s) ++{ ++ drm_display_env_t * const de = s->priv_data; ++ ++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); ++ ++ de->q_terminate = 1; ++ sem_post(&de->q_sem_in); ++ pthread_join(de->q_thread, NULL); ++ sem_destroy(&de->q_sem_in); ++ sem_destroy(&de->q_sem_out); ++ ++ for (unsigned int i = 0; i != AUX_SIZE; ++i) ++ da_uninit(de, de->aux + i); ++ ++ av_frame_free(&de->q_next); ++ ++ if (de->drm_fd >= 0) { ++ close(de->drm_fd); ++ de->drm_fd = -1; ++ } ++ ++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); ++} ++ ++ ++#define OFFSET(x) offsetof(drm_display_env_t, x) ++static const AVOption options[] = { ++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, ++ { NULL } ++}; ++ ++static const AVClass drm_vout_class = { ++ .class_name = "drm vid outdev", ++ .item_name = av_default_item_name, ++ .option = options, ++ .version = LIBAVUTIL_VERSION_INT, ++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, ++}; ++ ++AVOutputFormat ff_vout_drm_muxer = { ++ .name = "vout_drm", ++ .long_name = NULL_IF_CONFIG_SMALL("Drm video output device"), ++ .priv_data_size = sizeof(drm_display_env_t), ++ .audio_codec = AV_CODEC_ID_NONE, ++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, ++ .write_header = drm_vout_write_header, ++ .write_packet = drm_vout_write_packet, ++ .write_uncoded_frame = drm_vout_write_frame, ++ .write_trailer = drm_vout_write_trailer, ++ .control_message = drm_vout_control_message, 
++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, ++ .priv_class = &drm_vout_class, ++ .init = drm_vout_init, ++ .deinit = drm_vout_deinit, ++}; ++ + +From 4b2ffa315b5a20a84a38e17e74d8a2e6b778b200 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 28 Apr 2021 11:34:18 +0100 +Subject: [PATCH 015/113] Add vout_egl + +--- + configure | 6 + + libavdevice/Makefile | 1 + + libavdevice/alldevices.c | 1 + + libavdevice/egl_vout.c | 809 +++++++++++++++++++++++++++++++++++++++ + 4 files changed, 817 insertions(+) + create mode 100644 libavdevice/egl_vout.c + +diff --git a/configure b/configure +index e6440cd9f3..8424d451fb 100755 +--- a/configure ++++ b/configure +@@ -346,6 +346,7 @@ External library support: + --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] + --enable-sand enable sand video formats [rpi] + --enable-vout-drm enable the vout_drm module - for internal testing only [no] ++ --enable-vout-egl enable the vout_egl module - for internal testing only [no] + --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] + --disable-nvenc disable Nvidia video encoding code [autodetect] + --enable-omx enable OpenMAX IL code [no] +@@ -1830,6 +1831,7 @@ EXTERNAL_LIBRARY_LIST=" + libdav1d + libdc1394 + libdrm ++ epoxy + libflite + libfontconfig + libfreetype +@@ -1953,6 +1955,7 @@ FEATURE_LIST=" + static + swscale_alpha + vout_drm ++ vout_egl + " + + # this list should be kept in linking order +@@ -3555,6 +3558,8 @@ v4l2_outdev_deps="libdrm" + v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" + v4l2_outdev_suggest="libv4l2" + vout_drm_outdev_deps="libdrm vout_drm" ++vout_egl_outdev_deps="xlib" ++vout_egl_outdev_select="epoxy" + vfwcap_indev_deps="vfw32 vfwcap_defines" + xcbgrab_indev_deps="libxcb" + xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" +@@ -6556,6 +6561,7 @@ enabled libdav1d && require_pkg_config libdav1d "dav1d >= 0.5.0" "dav1d + enabled libdavs2 && 
require_pkg_config libdavs2 "davs2 >= 1.6.0" davs2.h davs2_decoder_open + enabled libdc1394 && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new + enabled libdrm && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion ++enabled epoxy && require_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version + enabled libfdk_aac && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen || + { require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac && + warn "using libfdk without pkg-config"; } } +diff --git a/libavdevice/Makefile b/libavdevice/Makefile +index 447dbfdf9a..8d83b0f19e 100644 +--- a/libavdevice/Makefile ++++ b/libavdevice/Makefile +@@ -49,6 +49,7 @@ OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o + OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o + OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o + OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o ++OBJS-$(CONFIG_VOUT_EGL_OUTDEV) += egl_vout.o + OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o + OBJS-$(CONFIG_XV_OUTDEV) += xv.o + +diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c +index 23351f0316..90d81d69ac 100644 +--- a/libavdevice/alldevices.c ++++ b/libavdevice/alldevices.c +@@ -52,6 +52,7 @@ extern const AVInputFormat ff_v4l2_demuxer; + extern const AVOutputFormat ff_v4l2_muxer; + extern const AVInputFormat ff_vfwcap_demuxer; + extern const AVOutputFormat ff_vout_drm_muxer; ++extern const AVOutputFormat ff_vout_egl_muxer; + extern const AVInputFormat ff_xcbgrab_demuxer; + extern const AVOutputFormat ff_xv_muxer; + +diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c +new file mode 100644 +index 0000000000..0195c9d026 +--- /dev/null ++++ b/libavdevice/egl_vout.c +@@ -0,0 +1,809 @@ ++/* ++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++// *** This module is a work in progress and its utility is strictly ++// limited to testing. ++// Amongst other issues it doesn't wait for the pic to be displayed before ++// returning the buffer so flikering does occur. ++ ++#include ++#include ++ ++#include "libavutil/opt.h" ++#include "libavutil/avassert.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/imgutils.h" ++#include "libavutil/hwcontext_drm.h" ++#include "libavformat/internal.h" ++#include "avdevice.h" ++ ++#include "pthread.h" ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "libavutil/rpi_sand_fns.h" ++ ++#define TRACE_ALL 0 ++ ++struct egl_setup { ++ int conId; ++ ++ Display *dpy; ++ EGLDisplay egl_dpy; ++ EGLContext ctx; ++ EGLSurface surf; ++ Window win; ++ ++ uint32_t crtcId; ++ int crtcIdx; ++ uint32_t planeId; ++ struct { ++ int x, y, width, height; ++ } compose; ++}; ++ ++typedef struct egl_aux_s { ++ int fd; ++ GLuint texture; ++ ++} egl_aux_t; ++ ++typedef struct egl_display_env_s ++{ ++ AVClass *class; ++ ++ struct egl_setup setup; ++ enum AVPixelFormat avfmt; ++ ++ int show_all; ++ int window_width, window_height; ++ int window_x, window_y; ++ int fullscreen; ++ ++ egl_aux_t aux[32]; ++ ++ pthread_t 
q_thread; ++ pthread_mutex_t q_lock; ++ sem_t display_start_sem; ++ sem_t q_sem; ++ int q_terminate; ++ AVFrame * q_this; ++ AVFrame * q_next; ++ ++} egl_display_env_t; ++ ++ ++/** ++ * Remove window border/decorations. ++ */ ++static void ++no_border( Display *dpy, Window w) ++{ ++ static const unsigned MWM_HINTS_DECORATIONS = (1 << 1); ++ static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5; ++ ++ typedef struct ++ { ++ unsigned long flags; ++ unsigned long functions; ++ unsigned long decorations; ++ long inputMode; ++ unsigned long status; ++ } PropMotifWmHints; ++ ++ PropMotifWmHints motif_hints; ++ Atom prop, proptype; ++ unsigned long flags = 0; ++ ++ /* setup the property */ ++ motif_hints.flags = MWM_HINTS_DECORATIONS; ++ motif_hints.decorations = flags; ++ ++ /* get the atom for the property */ ++ prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True ); ++ if (!prop) { ++ /* something went wrong! */ ++ return; ++ } ++ ++ /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */ ++ proptype = prop; ++ ++ XChangeProperty( dpy, w, /* display, window */ ++ prop, proptype, /* property, type */ ++ 32, /* format: 32-bit datums */ ++ PropModeReplace, /* mode */ ++ (unsigned char *) &motif_hints, /* data */ ++ PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */ ++ ); ++} ++ ++ ++/* ++ * Create an RGB, double-buffered window. ++ * Return the window and context handles. ++ */ ++static int ++make_window(struct AVFormatContext * const s, ++ egl_display_env_t * const de, ++ Display *dpy, EGLDisplay egl_dpy, const char *name, ++ Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet) ++{ ++ int scrnum = DefaultScreen( dpy ); ++ XSetWindowAttributes attr; ++ unsigned long mask; ++ Window root = RootWindow( dpy, scrnum ); ++ Window win; ++ EGLContext ctx; ++ const int fullscreen = de->fullscreen; ++ EGLConfig config; ++ int x = de->window_x; ++ int y = de->window_y; ++ int width = de->window_width ? de->window_width : 1280; ++ int height = de->window_height ? 
de->window_height : 720; ++ ++ ++ if (fullscreen) { ++ int scrnum = DefaultScreen(dpy); ++ ++ x = 0; y = 0; ++ width = DisplayWidth(dpy, scrnum); ++ height = DisplayHeight(dpy, scrnum); ++ } ++ ++ { ++ EGLint num_configs; ++ static const EGLint attribs[] = { ++ EGL_RED_SIZE, 1, ++ EGL_GREEN_SIZE, 1, ++ EGL_BLUE_SIZE, 1, ++ EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, ++ EGL_NONE ++ }; ++ ++ if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) { ++ av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n"); ++ return -1; ++ } ++ } ++ ++ { ++ EGLint vid; ++ if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) { ++ av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n"); ++ return -1; ++ } ++ ++ { ++ XVisualInfo visTemplate = { ++ .visualid = vid, ++ }; ++ int num_visuals; ++ XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask, ++ &visTemplate, &num_visuals); ++ ++ /* window attributes */ ++ attr.background_pixel = 0; ++ attr.border_pixel = 0; ++ attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone); ++ attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask; ++ /* XXX this is a bad way to get a borderless window! 
*/ ++ mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask; ++ ++ win = XCreateWindow( dpy, root, x, y, width, height, ++ 0, visinfo->depth, InputOutput, ++ visinfo->visual, mask, &attr ); ++ XFree(visinfo); ++ } ++ } ++ ++ if (fullscreen) ++ no_border(dpy, win); ++ ++ /* set hints and properties */ ++ { ++ XSizeHints sizehints; ++ sizehints.x = x; ++ sizehints.y = y; ++ sizehints.width = width; ++ sizehints.height = height; ++ sizehints.flags = USSize | USPosition; ++ XSetNormalHints(dpy, win, &sizehints); ++ XSetStandardProperties(dpy, win, name, name, ++ None, (char **)NULL, 0, &sizehints); ++ } ++ ++ eglBindAPI(EGL_OPENGL_ES_API); ++ ++ { ++ static const EGLint ctx_attribs[] = { ++ EGL_CONTEXT_CLIENT_VERSION, 2, ++ EGL_NONE ++ }; ++ ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs ); ++ if (!ctx) { ++ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); ++ return -1; ++ } ++ } ++ ++ ++ XMapWindow(dpy, win); ++ ++ { ++ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); ++ if (!surf) { ++ av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); ++ return -1; ++ } ++ ++ if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) { ++ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); ++ return -1; ++ } ++ ++ *winRet = win; ++ *ctxRet = ctx; ++ *surfRet = surf; ++ } ++ ++ return 0; ++} ++ ++static GLint ++compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source) ++{ ++ GLuint s = glCreateShader(target); ++ ++ if (s == 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n"); ++ return 0; ++ } ++ ++ glShaderSource(s, 1, (const GLchar **) &source, NULL); ++ glCompileShader(s); ++ ++ { ++ GLint ok; ++ glGetShaderiv(s, GL_COMPILE_STATUS, &ok); ++ ++ if (!ok) { ++ GLchar *info; ++ GLint size; ++ ++ glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size); ++ info = malloc(size); ++ ++ glGetShaderInfoLog(s, size, NULL, info); ++ av_log(avctx, AV_LOG_ERROR, "Failed 
to compile shader: %ssource:\n%s\n", info, source); ++ ++ return 0; ++ } ++ } ++ ++ return s; ++} ++ ++static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs) ++{ ++ GLuint prog = glCreateProgram(); ++ ++ if (prog == 0) { ++ av_log(s, AV_LOG_ERROR, "Failed to create program\n"); ++ return 0; ++ } ++ ++ glAttachShader(prog, vs); ++ glAttachShader(prog, fs); ++ glLinkProgram(prog); ++ ++ { ++ GLint ok; ++ glGetProgramiv(prog, GL_LINK_STATUS, &ok); ++ if (!ok) { ++ /* Some drivers return a size of 1 for an empty log. This is the size ++ * of a log that contains only a terminating NUL character. ++ */ ++ GLint size; ++ GLchar *info = NULL; ++ glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size); ++ if (size > 1) { ++ info = malloc(size); ++ glGetProgramInfoLog(prog, size, NULL, info); ++ } ++ ++ av_log(s, AV_LOG_ERROR, "Failed to link: %s\n", ++ (info != NULL) ? info : ""); ++ return 0; ++ } ++ } ++ ++ return prog; ++} ++ ++static int ++gl_setup(struct AVFormatContext * const s) ++{ ++ const char *vs = ++ "attribute vec4 pos;\n" ++ "varying vec2 texcoord;\n" ++ "\n" ++ "void main() {\n" ++ " gl_Position = pos;\n" ++ " texcoord.x = (pos.x + 1.0) / 2.0;\n" ++ " texcoord.y = (-pos.y + 1.0) / 2.0;\n" ++ "}\n"; ++ const char *fs = ++ "#extension GL_OES_EGL_image_external : enable\n" ++ "precision mediump float;\n" ++ "uniform samplerExternalOES s;\n" ++ "varying vec2 texcoord;\n" ++ "void main() {\n" ++ " gl_FragColor = texture2D(s, texcoord);\n" ++ "}\n"; ++ ++ GLuint vs_s; ++ GLuint fs_s; ++ GLuint prog; ++ ++ if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) || ++ !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) || ++ !(prog = link_program(s, vs_s, fs_s))) ++ return -1; ++ ++ glUseProgram(prog); ++ ++ { ++ static const float verts[] = { ++ -1, -1, ++ 1, -1, ++ 1, 1, ++ -1, 1, ++ }; ++ glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts); ++ } ++ ++ glEnableVertexAttribArray(0); ++ return 0; ++} ++ ++static int 
egl_vout_write_trailer(AVFormatContext *s) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ ++ return 0; ++} ++ ++static int egl_vout_write_header(AVFormatContext *s) ++{ ++ const AVCodecParameters * const par = s->streams[0]->codecpar; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ if ( s->nb_streams > 1 ++ || par->codec_type != AVMEDIA_TYPE_VIDEO ++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { ++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ ++ ++static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame) ++{ ++ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; ++ egl_aux_t * da = NULL; ++ unsigned int i; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++#endif ++ ++ for (i = 0; i != 32; ++i) { ++ if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) { ++ da = de->aux + i; ++ break; ++ } ++ } ++ ++ if (da == NULL) { ++ av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__); ++ return AVERROR(EINVAL); ++ } ++ ++ if (da->texture == 0) { ++ EGLint attribs[50]; ++ EGLint * a = attribs; ++ int i, j; ++ static const EGLint anames[] = { ++ EGL_DMA_BUF_PLANE0_FD_EXT, ++ EGL_DMA_BUF_PLANE0_OFFSET_EXT, ++ EGL_DMA_BUF_PLANE0_PITCH_EXT, ++ EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT, ++ EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, ++ EGL_DMA_BUF_PLANE1_FD_EXT, ++ EGL_DMA_BUF_PLANE1_OFFSET_EXT, ++ EGL_DMA_BUF_PLANE1_PITCH_EXT, ++ EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT, ++ EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT, ++ EGL_DMA_BUF_PLANE2_FD_EXT, ++ EGL_DMA_BUF_PLANE2_OFFSET_EXT, ++ EGL_DMA_BUF_PLANE2_PITCH_EXT, ++ EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT, ++ EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT, ++ }; ++ const EGLint * b = anames; ++ ++ *a++ = EGL_WIDTH; ++ *a++ = av_frame_cropped_width(frame); ++ *a++ = EGL_HEIGHT; ++ *a++ = av_frame_cropped_height(frame); ++ *a++ = 
EGL_LINUX_DRM_FOURCC_EXT; ++ *a++ = desc->layers[0].format; ++ ++ for (i = 0; i < desc->nb_layers; ++i) { ++ for (j = 0; j < desc->layers[i].nb_planes; ++j) { ++ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; ++ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; ++ *a++ = *b++; ++ *a++ = obj->fd; ++ *a++ = *b++; ++ *a++ = p->offset; ++ *a++ = *b++; ++ *a++ = p->pitch; ++ if (obj->format_modifier == 0) { ++ b += 2; ++ } ++ else { ++ *a++ = *b++; ++ *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF); ++ *a++ = *b++; ++ *a++ = (EGLint)(obj->format_modifier >> 32); ++ } ++ } ++ } ++ ++ *a = EGL_NONE; ++ ++#if TRACE_ALL ++ for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) { ++ av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]); ++ } ++#endif ++ { ++ const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy, ++ EGL_NO_CONTEXT, ++ EGL_LINUX_DMA_BUF_EXT, ++ NULL, attribs); ++ if (!image) { ++ av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd); ++ return -1; ++ } ++ ++ glGenTextures(1, &da->texture); ++ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); ++ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR); ++ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR); ++ glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image); ++ ++ eglDestroyImageKHR(de->setup.egl_dpy, image); ++ } ++ ++ da->fd = desc->objects[0].fd; ++ ++#if 0 ++ av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," ++ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", ++ av_frame_cropped_width(frame), ++ av_frame_cropped_height(frame), ++ desc->layers[0].format, ++ bo_plane_handles[0], ++ bo_plane_handles[1], ++ bo_plane_handles[2], ++ bo_plane_handles[3], ++ pitches[0], ++ pitches[1], ++ pitches[2], ++ pitches[3], ++ offsets[0], ++ offsets[1], ++ offsets[2], ++ offsets[3], ++ (long long)modifiers[0], ++ (long long)modifiers[1], ++ (long long)modifiers[2], ++ 
(long long)modifiers[3] ++ ); ++#endif ++ } ++ ++ glClearColor(0.5, 0.5, 0.5, 0.5); ++ glClear(GL_COLOR_BUFFER_BIT); ++ ++ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); ++ glDrawArrays(GL_TRIANGLE_FAN, 0, 4); ++ eglSwapBuffers(de->setup.egl_dpy, de->setup.surf); ++ ++ glDeleteTextures(1, &da->texture); ++ da->texture = 0; ++ da->fd = -1; ++ ++ return 0; ++} ++ ++static void * display_thread(void * v) ++{ ++ AVFormatContext * const s = v; ++ egl_display_env_t * const de = s->priv_data; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++#endif ++ { ++ EGLint egl_major, egl_minor; ++ ++ de->setup.dpy = XOpenDisplay(NULL); ++ if (!de->setup.dpy) { ++ av_log(s, AV_LOG_ERROR, "Couldn't open X display\n"); ++ goto fail; ++ } ++ ++ de->setup.egl_dpy = eglGetDisplay(de->setup.dpy); ++ if (!de->setup.egl_dpy) { ++ av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n"); ++ goto fail; ++ } ++ ++ if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) { ++ av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n"); ++ goto fail; ++ } ++ ++ av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor); ++ ++ if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) { ++ av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n"); ++ goto fail; ++ } ++ } ++ ++ if (!de->window_width || !de->window_height) { ++ de->window_width = 1280; ++ de->window_height = 720; ++ } ++ if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout", ++ &de->setup.win, &de->setup.ctx, &de->setup.surf)) { ++ av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__); ++ goto fail; ++ } ++ ++ if (gl_setup(s)) { ++ av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__); ++ goto fail; ++ } ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__); ++#endif ++ sem_post(&de->display_start_sem); ++ ++ for (;;) { ++ AVFrame * frame; ++ ++ while (sem_wait(&de->q_sem) != 0) { ++ av_assert0(errno == EINTR); ++ } ++ ++ if 
(de->q_terminate) ++ break; ++ ++ pthread_mutex_lock(&de->q_lock); ++ frame = de->q_next; ++ de->q_next = NULL; ++ pthread_mutex_unlock(&de->q_lock); ++ ++ do_display(s, de, frame); ++ ++ av_frame_free(&de->q_this); ++ de->q_this = frame; ++ } ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); ++#endif ++ ++ return NULL; ++ ++fail: ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__); ++#endif ++ de->q_terminate = 1; ++ sem_post(&de->display_start_sem); ++ ++ return NULL; ++} ++ ++static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) ++{ ++ const AVFrame * const src_frame = (AVFrame *)pkt->data; ++ AVFrame * frame; ++ egl_display_env_t * const de = s->priv_data; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ ++ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) { ++ frame = av_frame_alloc(); ++ av_frame_ref(frame, src_frame); ++ } ++ else if (src_frame->format == AV_PIX_FMT_VAAPI) { ++ frame = av_frame_alloc(); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ if (av_hwframe_map(frame, src_frame, 0) != 0) ++ { ++ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); ++ av_frame_free(&frame); ++ return AVERROR(EINVAL); ++ } ++ } ++ else { ++ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format); ++ return AVERROR(EINVAL); ++ } ++ ++ // Really hacky sync ++ while (de->show_all && de->q_next) { ++ usleep(3000); ++ } ++ ++ pthread_mutex_lock(&de->q_lock); ++ { ++ AVFrame * const t = de->q_next; ++ de->q_next = frame; ++ frame = t; ++ } ++ pthread_mutex_unlock(&de->q_lock); ++ ++ if (frame == NULL) ++ sem_post(&de->q_sem); ++ else ++ av_frame_free(&frame); ++ ++ return 0; ++} ++ ++static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, ++ unsigned flags) ++{ ++ av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags); ++ return AVERROR_PATCHWELCOME; ++} ++ ++static int 
egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); ++#endif ++ switch(type) { ++ case AV_APP_TO_DEV_WINDOW_REPAINT: ++ return 0; ++ default: ++ break; ++ } ++ return AVERROR(ENOSYS); ++} ++ ++// deinit is called if init fails so no need to clean up explicity here ++static int egl_vout_init(struct AVFormatContext * s) ++{ ++ egl_display_env_t * const de = s->priv_data; ++ unsigned int i; ++ ++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); ++ ++ de->setup = (struct egl_setup){0}; ++ ++ for (i = 0; i != 32; ++i) { ++ de->aux[i].fd = -1; ++ } ++ ++ de->q_terminate = 0; ++ pthread_mutex_init(&de->q_lock, NULL); ++ sem_init(&de->q_sem, 0, 0); ++ sem_init(&de->display_start_sem, 0, 0); ++ av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0); ++ ++ sem_wait(&de->display_start_sem); ++ if (de->q_terminate) { ++ av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__); ++ return -1; ++ } ++ ++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); ++ ++ return 0; ++} ++ ++static void egl_vout_deinit(struct AVFormatContext * s) ++{ ++ egl_display_env_t * const de = s->priv_data; ++ ++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); ++ ++ de->q_terminate = 1; ++ sem_post(&de->q_sem); ++ pthread_join(de->q_thread, NULL); ++ sem_destroy(&de->q_sem); ++ pthread_mutex_destroy(&de->q_lock); ++ ++ av_frame_free(&de->q_next); ++ av_frame_free(&de->q_this); ++ ++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); ++} ++ ++#define OFFSET(x) offsetof(egl_display_env_t, x) ++static const AVOption options[] = { ++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, 
-INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, ++ { NULL } ++ ++}; ++ ++static const AVClass egl_vout_class = { ++ .class_name = "egl vid outdev", ++ .item_name = av_default_item_name, ++ .option = options, ++ .version = LIBAVUTIL_VERSION_INT, ++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, ++}; ++ ++AVOutputFormat ff_vout_egl_muxer = { ++ .name = "vout_egl", ++ .long_name = NULL_IF_CONFIG_SMALL("Egl video output device"), ++ .priv_data_size = sizeof(egl_display_env_t), ++ .audio_codec = AV_CODEC_ID_NONE, ++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, ++ .write_header = egl_vout_write_header, ++ .write_packet = egl_vout_write_packet, ++ .write_uncoded_frame = egl_vout_write_frame, ++ .write_trailer = egl_vout_write_trailer, ++ .control_message = egl_vout_control_message, ++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, ++ .priv_class = &egl_vout_class, ++ .init = egl_vout_init, ++ .deinit = egl_vout_deinit, ++}; ++ + +From 183f6ae1b65b695630c8ac8a7cb5659a2ded42f6 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 28 Apr 2021 12:51:22 +0100 +Subject: [PATCH 016/113] V4L2 stateful rework + +--- + libavcodec/Makefile | 3 +- + libavcodec/v4l2_buffers.c | 556 +++++++++++++++++++++++++++----------- + libavcodec/v4l2_buffers.h | 28 +- + libavcodec/v4l2_context.c | 536 +++++++++++++++++++++++++++--------- + libavcodec/v4l2_context.h | 20 +- + libavcodec/v4l2_m2m.c | 20 +- + libavcodec/v4l2_m2m.h | 33 ++- + libavcodec/v4l2_m2m_dec.c | 448 ++++++++++++++++++++++++++---- + 8 files changed, 1288 insertions(+), 356 deletions(-) + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index df7659d0b8..a40c46bf93 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -161,7 
+161,8 @@ OBJS-$(CONFIG_VIDEODSP) += videodsp.o + OBJS-$(CONFIG_VP3DSP) += vp3dsp.o + OBJS-$(CONFIG_VP56DSP) += vp56dsp.o + OBJS-$(CONFIG_VP8DSP) += vp8dsp.o +-OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o ++OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\ ++ weak_link.o + OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\ + v4l2_req_devscan.o weak_link.o + OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 3f5471067a..a003934ca1 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -21,6 +21,7 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include + #include + #include + #include +@@ -29,12 +30,14 @@ + #include + #include "libavcodec/avcodec.h" + #include "libavutil/pixdesc.h" ++#include "libavutil/hwcontext.h" + #include "v4l2_context.h" + #include "v4l2_buffers.h" + #include "v4l2_m2m.h" ++#include "weak_link.h" + + #define USEC_PER_SEC 1000000 +-static AVRational v4l2_timebase = { 1, USEC_PER_SEC }; ++static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; + + static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf) + { +@@ -51,34 +54,44 @@ static inline AVCodecContext *logger(V4L2Buffer *buf) + static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) + { + V4L2m2mContext *s = buf_to_m2mctx(avbuf); +- +- if (s->avctx->pkt_timebase.num) +- return s->avctx->pkt_timebase; +- return s->avctx->time_base; ++ const AVRational tb = s->avctx->pkt_timebase.num ? ++ s->avctx->pkt_timebase : ++ s->avctx->time_base; ++ return tb.num && tb.den ? 
tb : v4l2_timebase; + } + +-static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts) ++static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts, int no_rescale) + { +- int64_t v4l2_pts; +- +- if (pts == AV_NOPTS_VALUE) +- pts = 0; +- + /* convert pts to v4l2 timebase */ +- v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); ++ const int64_t v4l2_pts = ++ no_rescale ? pts : ++ pts == AV_NOPTS_VALUE ? 0 : ++ av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); + out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; + out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; + } + +-static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf) ++static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf, int no_rescale) + { +- int64_t v4l2_pts; +- + /* convert pts back to encoder timebase */ +- v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + ++ const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + + avbuf->buf.timestamp.tv_usec; + +- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); ++ return ++ no_rescale ? v4l2_pts : ++ v4l2_pts == 0 ? 
AV_NOPTS_VALUE : ++ av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); ++} ++ ++static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) ++{ ++ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { ++ out->planes[plane].bytesused = bytesused; ++ out->planes[plane].length = length; ++ } else { ++ out->buf.bytesused = bytesused; ++ out->buf.length = length; ++ } + } + + static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) +@@ -209,68 +222,143 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) + return AVCOL_TRC_UNSPECIFIED; + } + +-static void v4l2_free_buffer(void *opaque, uint8_t *unused) ++static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) + { +- V4L2Buffer* avbuf = opaque; +- V4L2m2mContext *s = buf_to_m2mctx(avbuf); ++ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; ++ AVDRMLayerDescriptor *layer; + +- if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) { +- atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel); ++ /* fill the DRM frame descriptor */ ++ drm_desc->nb_objects = avbuf->num_planes; ++ drm_desc->nb_layers = 1; + +- if (s->reinit) { +- if (!atomic_load(&s->refcount)) +- sem_post(&s->refsync); +- } else { +- if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) { +- /* no need to queue more buffers to the driver */ +- avbuf->status = V4L2BUF_AVAILABLE; +- } +- else if (avbuf->context->streamon) +- ff_v4l2_buffer_enqueue(avbuf); +- } ++ layer = &drm_desc->layers[0]; ++ layer->nb_planes = avbuf->num_planes; ++ ++ for (int i = 0; i < avbuf->num_planes; i++) { ++ layer->planes[i].object_index = i; ++ layer->planes[i].offset = 0; ++ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; ++ } ++ ++ switch (avbuf->context->av_pix_fmt) { ++ case AV_PIX_FMT_YUYV422: ++ ++ layer->format = DRM_FORMAT_YUYV; ++ layer->nb_planes = 1; ++ ++ break; ++ ++ case AV_PIX_FMT_NV12: ++ case AV_PIX_FMT_NV21: ++ ++ layer->format = 
avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ? ++ DRM_FORMAT_NV12 : DRM_FORMAT_NV21; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 2; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ avbuf->context->format.fmt.pix.height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; ++ break; ++ ++ case AV_PIX_FMT_YUV420P: ++ ++ layer->format = DRM_FORMAT_YUV420; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 3; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ avbuf->context->format.fmt.pix.height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ ++ layer->planes[2].object_index = 0; ++ layer->planes[2].offset = layer->planes[1].offset + ++ ((avbuf->plane_info[0].bytesperline * ++ avbuf->context->format.fmt.pix.height) >> 2); ++ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ break; + +- av_buffer_unref(&avbuf->context_ref); ++ default: ++ drm_desc->nb_layers = 0; ++ break; + } ++ ++ return (uint8_t *) drm_desc; + } + +-static int v4l2_buf_increase_ref(V4L2Buffer *in) ++static void v4l2_free_bufref(void *opaque, uint8_t *data) + { +- V4L2m2mContext *s = buf_to_m2mctx(in); ++ AVBufferRef * bufref = (AVBufferRef *)data; ++ V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data; ++ struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl); + +- if (in->context_ref) +- atomic_fetch_add(&in->context_refcount, 1); +- else { +- in->context_ref = av_buffer_ref(s->self_ref); +- if (!in->context_ref) +- return AVERROR(ENOMEM); ++ if (ctx != NULL) { ++ // Buffer still attached to context ++ V4L2m2mContext *s = buf_to_m2mctx(avbuf); + +- in->context_refcount = 1; +- } ++ ff_mutex_lock(&ctx->lock); + +- in->status = V4L2BUF_RET_USER; +- atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed); ++ avbuf->status = V4L2BUF_AVAILABLE; + +- return 0; ++ if (s->draining && 
V4L2_TYPE_IS_OUTPUT(ctx->type)) { ++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name); ++ /* no need to queue more buffers to the driver */ ++ } ++ else if (ctx->streamon) { ++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name); ++ avbuf->buf.timestamp.tv_sec = 0; ++ avbuf->buf.timestamp.tv_usec = 0; ++ ff_v4l2_buffer_enqueue(avbuf); // will set to IN_DRIVER ++ } ++ else { ++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name); ++ } ++ ++ ff_mutex_unlock(&ctx->lock); ++ } ++ ++ ff_weak_link_unlock(avbuf->context_wl); ++ av_buffer_unref(&bufref); + } + +-static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) ++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) + { +- int ret; ++ struct v4l2_exportbuffer expbuf; ++ int i, ret; + +- if (plane >= in->num_planes) +- return AVERROR(EINVAL); ++ for (i = 0; i < avbuf->num_planes; i++) { ++ memset(&expbuf, 0, sizeof(expbuf)); + +- /* even though most encoders return 0 in data_offset encoding vp8 does require this value */ +- *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset, +- in->plane_info[plane].length, v4l2_free_buffer, in, 0); +- if (!*buf) +- return AVERROR(ENOMEM); ++ expbuf.index = avbuf->buf.index; ++ expbuf.type = avbuf->buf.type; ++ expbuf.plane = i; + +- ret = v4l2_buf_increase_ref(in); +- if (ret) +- av_buffer_unref(buf); ++ ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf); ++ if (ret < 0) ++ return AVERROR(errno); + +- return ret; ++ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) { ++ /* drm frame */ ++ avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; ++ avbuf->drm_frame.objects[i].fd = expbuf.fd; ++ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ } else { ++ /* drm frame */ ++ avbuf->drm_frame.objects[0].size = avbuf->buf.length; ++ avbuf->drm_frame.objects[0].fd = expbuf.fd; ++ 
avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ } ++ } ++ ++ return 0; + } + + static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset) +@@ -285,30 +373,50 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i + + memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset)); + +- if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { +- out->planes[plane].bytesused = bytesused; +- out->planes[plane].length = length; +- } else { +- out->buf.bytesused = bytesused; +- out->buf.length = length; +- } ++ set_buf_length(out, plane, bytesused, length); + + return 0; + } + ++static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf) ++{ ++ AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]); ++ AVBufferRef * newbuf; ++ ++ if (!bufref) ++ return NULL; ++ ++ newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0); ++ if (newbuf == NULL) ++ av_buffer_unref(&bufref); ++ ++ avbuf->status = V4L2BUF_RET_USER; ++ return newbuf; ++} ++ + static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) + { +- int i, ret; ++ int i; + + frame->format = avbuf->context->av_pix_fmt; + +- for (i = 0; i < avbuf->num_planes; i++) { +- ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); +- if (ret) +- return ret; ++ frame->buf[0] = wrap_avbuf(avbuf); ++ if (frame->buf[0] == NULL) ++ return AVERROR(ENOMEM); ++ ++ if (buf_to_m2mctx(avbuf)->output_drm) { ++ /* 1. get references to the actual data */ ++ frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref); ++ return 0; ++ } ++ + ++ /* 1. 
get references to the actual data */ ++ for (i = 0; i < avbuf->num_planes; i++) { ++ frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset; + frame->linesize[i] = avbuf->plane_info[i].bytesperline; +- frame->data[i] = frame->buf[i]->data; + } + + /* fixup special cases */ +@@ -337,68 +445,95 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) + return 0; + } + ++static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h) ++{ ++ if (dst_stride == src_stride && w + 32 >= dst_stride) { ++ memcpy(dst, src, dst_stride * h); ++ } ++ else { ++ while (--h >= 0) { ++ memcpy(dst, src, w); ++ dst += dst_stride; ++ src += src_stride; ++ } ++ } ++} ++ ++static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes) ++{ ++ return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); ++} ++ + static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + { +- int i, ret; +- struct v4l2_format fmt = out->context->format; +- int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? +- fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat; +- int height = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? 
+- fmt.fmt.pix_mp.height : fmt.fmt.pix.height; +- int is_planar_format = 0; +- +- switch (pixel_format) { +- case V4L2_PIX_FMT_YUV420M: +- case V4L2_PIX_FMT_YVU420M: +-#ifdef V4L2_PIX_FMT_YUV422M +- case V4L2_PIX_FMT_YUV422M: +-#endif +-#ifdef V4L2_PIX_FMT_YVU422M +- case V4L2_PIX_FMT_YVU422M: +-#endif +-#ifdef V4L2_PIX_FMT_YUV444M +- case V4L2_PIX_FMT_YUV444M: +-#endif +-#ifdef V4L2_PIX_FMT_YVU444M +- case V4L2_PIX_FMT_YVU444M: +-#endif +- case V4L2_PIX_FMT_NV12M: +- case V4L2_PIX_FMT_NV21M: +- case V4L2_PIX_FMT_NV12MT_16X16: +- case V4L2_PIX_FMT_NV12MT: +- case V4L2_PIX_FMT_NV16M: +- case V4L2_PIX_FMT_NV61M: +- is_planar_format = 1; +- } +- +- if (!is_planar_format) { +- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); +- int planes_nb = 0; +- int offset = 0; +- +- for (i = 0; i < desc->nb_components; i++) +- planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1); +- +- for (i = 0; i < planes_nb; i++) { +- int size, h = height; +- if (i == 1 || i == 2) { ++ int i; ++ int num_planes = 0; ++ int pel_strides[4] = {0}; ++ ++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); ++ ++ if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) { ++ av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__); ++ return -1; ++ } ++ ++ for (i = 0; i != desc->nb_components; ++i) { ++ if (desc->comp[i].plane >= num_planes) ++ num_planes = desc->comp[i].plane + 1; ++ pel_strides[desc->comp[i].plane] = desc->comp[i].step; ++ } ++ ++ if (out->num_planes > 1) { ++ if (num_planes != out->num_planes) { ++ av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes); ++ return -1; ++ } ++ for (i = 0; i != num_planes; ++i) { ++ int w = frame->width; ++ int h = frame->height; ++ if (is_chroma(desc, i, num_planes)) { ++ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); + h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); + } +- size = frame->linesize[i] * h; +- ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, 
offset); +- if (ret) +- return ret; +- offset += size; ++ ++ cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline, ++ frame->data[i], frame->linesize[i], ++ w * pel_strides[i], h); ++ set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length); + } +- return 0; + } ++ else ++ { ++ unsigned int offset = 0; ++ ++ for (i = 0; i != num_planes; ++i) { ++ int w = frame->width; ++ int h = frame->height; ++ int dst_stride = out->plane_info[0].bytesperline; ++ uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset; ++ ++ if (is_chroma(desc, i, num_planes)) { ++ // Is chroma ++ dst_stride >>= desc->log2_chroma_w; ++ offset += dst_stride * (out->context->height >> desc->log2_chroma_h); ++ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); ++ h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); ++ } ++ else { ++ // Is luma or alpha ++ offset += dst_stride * out->context->height; ++ } ++ if (offset > out->plane_info[0].length) { ++ av_log(NULL, AV_LOG_ERROR, "%s: Plane total %d > buffer size %d\n", __func__, offset, out->plane_info[0].length); ++ return -1; ++ } + +- for (i = 0; i < out->num_planes; i++) { +- ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0); +- if (ret) +- return ret; ++ cpy_2d(dst, dst_stride, ++ frame->data[i], frame->linesize[i], ++ w * pel_strides[i], h); ++ } ++ set_buf_length(out, 0, offset, out->plane_info[0].length); + } +- + return 0; + } + +@@ -410,14 +545,15 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + + int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + { +- v4l2_set_pts(out, frame->pts); ++ v4l2_set_pts(out, frame->pts, 0); + + return v4l2_buffer_swframe_to_buf(frame, out); + } + +-int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) ++int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_rescale_pts) + { + int ret; ++ V4L2Context * const ctx = avbuf->context; + + 
av_frame_unref(frame); + +@@ -432,13 +568,22 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) + frame->colorspace = v4l2_get_color_space(avbuf); + frame->color_range = v4l2_get_color_range(avbuf); + frame->color_trc = v4l2_get_color_trc(avbuf); +- frame->pts = v4l2_get_pts(avbuf); ++ frame->pts = v4l2_get_pts(avbuf, no_rescale_pts); + frame->pkt_dts = AV_NOPTS_VALUE; + + /* these values are updated also during re-init in v4l2_process_driver_event */ +- frame->height = avbuf->context->height; +- frame->width = avbuf->context->width; +- frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio; ++ frame->height = ctx->height; ++ frame->width = ctx->width; ++ frame->sample_aspect_ratio = ctx->sample_aspect_ratio; ++ ++ if (ctx->selection.height && ctx->selection.width) { ++ frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0; ++ frame->crop_top = ctx->selection.top < frame->height ? ctx->selection.top : 0; ++ frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ? ++ frame->width - (ctx->selection.left + ctx->selection.width) : 0; ++ frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ? ++ frame->height - (ctx->selection.top + ctx->selection.height) : 0; ++ } + + /* 3. report errors upstream */ + if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) { +@@ -451,15 +596,14 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) + + int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) + { +- int ret; +- + av_packet_unref(pkt); +- ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf); +- if (ret) +- return ret; ++ ++ pkt->buf = wrap_avbuf(avbuf); ++ if (pkt->buf == NULL) ++ return AVERROR(ENOMEM); + + pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? 
avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; +- pkt->data = pkt->buf->data; ++ pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset; + + if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) + pkt->flags |= AV_PKT_FLAG_KEY; +@@ -469,20 +613,27 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) + pkt->flags |= AV_PKT_FLAG_CORRUPT; + } + +- pkt->dts = pkt->pts = v4l2_get_pts(avbuf); ++ pkt->dts = pkt->pts = v4l2_get_pts(avbuf, 0); + + return 0; + } + +-int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, ++ const void *extdata, size_t extlen, int no_rescale_pts) + { + int ret; + +- ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0); ++ if (extlen) { ++ ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0); ++ if (ret) ++ return ret; ++ } ++ ++ ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen); + if (ret) + return ret; + +- v4l2_set_pts(out, pkt->pts); ++ v4l2_set_pts(out, pkt->pts, no_rescale_pts); + + if (pkt->flags & AV_PKT_FLAG_KEY) + out->flags = V4L2_BUF_FLAG_KEYFRAME; +@@ -490,15 +641,61 @@ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) + return 0; + } + +-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) ++int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) ++{ ++ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); ++} ++ ++ ++static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) ++{ ++ V4L2Buffer * const avbuf = (V4L2Buffer *)data; ++ int i; ++ ++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) { ++ struct V4L2Plane_info *p = avbuf->plane_info + i; ++ if (p->mm_addr != NULL) ++ munmap(p->mm_addr, p->length); ++ } ++ ++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { ++ if (avbuf->drm_frame.objects[i].fd != -1) ++ close(avbuf->drm_frame.objects[i].fd); ++ } ++ ++ ff_weak_link_unref(&avbuf->context_wl); 
++ ++ av_free(avbuf); ++} ++ ++ ++int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx) + { +- V4L2Context *ctx = avbuf->context; + int ret, i; ++ V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); ++ AVBufferRef * bufref; ++ ++ *pbufref = NULL; ++ if (avbuf == NULL) ++ return AVERROR(ENOMEM); ++ ++ bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0); ++ if (bufref == NULL) { ++ av_free(avbuf); ++ return AVERROR(ENOMEM); ++ } + ++ avbuf->context = ctx; + avbuf->buf.memory = V4L2_MEMORY_MMAP; + avbuf->buf.type = ctx->type; + avbuf->buf.index = index; + ++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { ++ avbuf->drm_frame.objects[i].fd = -1; ++ } ++ ++ avbuf->context_wl = ff_weak_link_ref(ctx->wl_master); ++ + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->buf.length = VIDEO_MAX_PLANES; + avbuf->buf.m.planes = avbuf->planes; +@@ -506,7 +703,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + + ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf); + if (ret < 0) +- return AVERROR(errno); ++ goto fail; + + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->num_planes = 0; +@@ -526,25 +723,33 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; +- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, +- PROT_READ | PROT_WRITE, MAP_SHARED, +- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); ++ ++ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || ++ !buf_to_m2mctx(avbuf)->output_drm) { ++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); ++ } + } else { + avbuf->plane_info[i].length = avbuf->buf.length; +- avbuf->plane_info[i].mm_addr = 
mmap(NULL, avbuf->buf.length, +- PROT_READ | PROT_WRITE, MAP_SHARED, +- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); ++ ++ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || ++ !buf_to_m2mctx(avbuf)->output_drm) { ++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); ++ } + } + +- if (avbuf->plane_info[i].mm_addr == MAP_FAILED) +- return AVERROR(ENOMEM); ++ if (avbuf->plane_info[i].mm_addr == MAP_FAILED) { ++ avbuf->plane_info[i].mm_addr = NULL; ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } + } + + avbuf->status = V4L2BUF_AVAILABLE; + +- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) +- return 0; +- + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->buf.m.planes = avbuf->planes; + avbuf->buf.length = avbuf->num_planes; +@@ -554,7 +759,20 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + avbuf->buf.length = avbuf->planes[0].length; + } + +- return ff_v4l2_buffer_enqueue(avbuf); ++ if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { ++ if (buf_to_m2mctx(avbuf)->output_drm) { ++ ret = v4l2_buffer_export_drm(avbuf); ++ if (ret) ++ goto fail; ++ } ++ } ++ ++ *pbufref = bufref; ++ return 0; ++ ++fail: ++ av_buffer_unref(&bufref); ++ return ret; + } + + int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) +@@ -563,9 +781,27 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) + + avbuf->buf.flags = avbuf->flags; + ++ if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) { ++ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", ++ avbuf->context->name, avbuf->buf.index, ++ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, ++ avbuf->context->q_count); ++ } ++ + ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf); +- if (ret < 0) +- return AVERROR(errno); ++ if (ret < 0) { ++ int err = errno; ++ av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n", ++ 
avbuf->context->name, avbuf->buf.index, ++ err, strerror(err)); ++ return AVERROR(err); ++ } ++ ++ ++avbuf->context->q_count; ++ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", ++ avbuf->context->name, avbuf->buf.index, ++ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, ++ avbuf->context->q_count); + + avbuf->status = V4L2BUF_IN_DRIVER; + +diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h +index 3d2ff1b9a5..111526aee3 100644 +--- a/libavcodec/v4l2_buffers.h ++++ b/libavcodec/v4l2_buffers.h +@@ -28,27 +28,37 @@ + #include + #include + ++#include "avcodec.h" + #include "libavutil/buffer.h" + #include "libavutil/frame.h" ++#include "libavutil/hwcontext_drm.h" + #include "packet.h" + + enum V4L2Buffer_status { + V4L2BUF_AVAILABLE, + V4L2BUF_IN_DRIVER, ++ V4L2BUF_IN_USE, + V4L2BUF_RET_USER, + }; + + /** + * V4L2Buffer (wrapper for v4l2_buffer management) + */ ++struct V4L2Context; ++struct ff_weak_link_client; ++ + typedef struct V4L2Buffer { +- /* each buffer needs to have a reference to its context */ ++ /* each buffer needs to have a reference to its context ++ * The pointer is good enough for most operation but once the buffer has ++ * been passed to the user the buffer may become orphaned so for free ops ++ * the weak link must be used to ensure that the context is actually ++ * there ++ */ + struct V4L2Context *context; ++ struct ff_weak_link_client *context_wl; + +- /* This object is refcounted per-plane, so we need to keep track +- * of how many context-refs we are holding. 
*/ +- AVBufferRef *context_ref; +- atomic_uint context_refcount; ++ /* DRM descriptor */ ++ AVDRMFrameDescriptor drm_frame; + + /* keep track of the mmap address and mmap length */ + struct V4L2Plane_info { +@@ -73,11 +83,12 @@ typedef struct V4L2Buffer { + * + * @param[in] frame The AVFRame to push the information to + * @param[in] buf The V4L2Buffer to get the information from ++ * @param[in] no_rescale_pts If non-zero do not rescale PTS + * + * @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect, + * AVERROR(ENOMEM) if the AVBufferRef can't be created. + */ +-int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf); ++int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf, int no_rescale_pts); + + /** + * Extracts the data from a V4L2Buffer to an AVPacket +@@ -101,6 +112,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); + */ + int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); + ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, ++ const void *extdata, size_t extlen, int no_rescale_pts); ++ + /** + * Extracts the data from an AVFrame to a V4L2Buffer + * +@@ -119,7 +133,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); + * + * @returns 0 in case of success, a negative AVERROR code otherwise + */ +-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); ++int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx); + + /** + * Enqueues a V4L2Buffer +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index e891649f92..cbbd0551a7 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -27,11 +27,13 @@ + #include + #include + #include ++#include "libavutil/avassert.h" + #include "libavcodec/avcodec.h" + #include "libavcodec/internal.h" + #include "v4l2_buffers.h" + #include "v4l2_fmt.h" + #include "v4l2_m2m.h" ++#include "weak_link.h" + + struct 
v4l2_format_update { + uint32_t v4l2_fmt; +@@ -153,21 +155,99 @@ static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_upd + } + } + +-static int v4l2_start_decode(V4L2Context *ctx) ++static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r) + { +- struct v4l2_decoder_cmd cmd = { +- .cmd = V4L2_DEC_CMD_START, +- .flags = 0, ++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); ++ struct v4l2_selection selection = { ++ .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, ++ .target = V4L2_SEL_TGT_COMPOSE + }; +- int ret; + +- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DECODER_CMD, &cmd); +- if (ret) ++ memset(r, 0, sizeof(*r)); ++ if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection)) + return AVERROR(errno); + ++ *r = selection.r; + return 0; + } + ++static int do_source_change(V4L2m2mContext * const s) ++{ ++ AVCodecContext *const avctx = s->avctx; ++ ++ int ret; ++ int reinit; ++ int full_reinit; ++ struct v4l2_format cap_fmt = s->capture.format; ++ ++ s->resize_pending = 0; ++ s->capture.done = 0; ++ ++ ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); ++ if (ret) { ++ av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name); ++ return 0; ++ } ++ ++ s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); ++ ++ get_default_selection(&s->capture, &s->capture.selection); ++ ++ reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); ++ if (reinit) { ++ s->capture.height = v4l2_get_height(&cap_fmt); ++ s->capture.width = v4l2_get_width(&cap_fmt); ++ } ++ s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); ++ ++ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n", ++ s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, ++ s->capture.selection.width, s->capture.selection.height, ++ s->capture.selection.left, s->capture.selection.top); ++ ++ s->reinit = 1; ++ ++ if (reinit) { ++ if (avctx) ++ ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); ++ if (ret < 0) ++ 
av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n"); ++ ++ ret = ff_v4l2_m2m_codec_reinit(s); ++ if (ret) { ++ av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); ++ return AVERROR(EINVAL); ++ } ++ goto reinit_run; ++ } ++ ++ /* Buffers are OK so just stream off to ack */ ++ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only\n", __func__); ++ ++ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); ++ if (ret) ++ av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n"); ++ s->draining = 0; ++ ++ /* reinit executed */ ++reinit_run: ++ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON); ++ return 1; ++} ++ ++static int ctx_done(V4L2Context * const ctx) ++{ ++ int rv = 0; ++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); ++ ++ ctx->done = 1; ++ ++ if (s->resize_pending && !V4L2_TYPE_IS_OUTPUT(ctx->type)) ++ rv = do_source_change(s); ++ ++ return rv; ++} ++ + /** + * handle resolution change event and end of stream event + * returns 1 if reinit was successful, negative if it failed +@@ -175,8 +255,7 @@ static int v4l2_start_decode(V4L2Context *ctx) + */ + static int v4l2_handle_event(V4L2Context *ctx) + { +- V4L2m2mContext *s = ctx_to_m2mctx(ctx); +- struct v4l2_format cap_fmt = s->capture.format; ++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); + struct v4l2_event evt = { 0 }; + int ret; + +@@ -186,44 +265,22 @@ static int v4l2_handle_event(V4L2Context *ctx) + return 0; + } + ++ av_log(logger(ctx), AV_LOG_INFO, "Dq event %d\n", evt.type); ++ + if (evt.type == V4L2_EVENT_EOS) { +- ctx->done = 1; ++// ctx->done = 1; ++ av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name); + return 0; + } + + if (evt.type != V4L2_EVENT_SOURCE_CHANGE) + return 0; + +- ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); +- if (ret) { +- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name); +- return 0; +- } +- +- if (v4l2_resolution_changed(&s->capture, &cap_fmt)) { +- s->capture.height = 
v4l2_get_height(&cap_fmt); +- s->capture.width = v4l2_get_width(&cap_fmt); +- s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); +- } else { +- v4l2_start_decode(ctx); ++ s->resize_pending = 1; ++ if (!ctx->done) + return 0; +- } +- +- s->reinit = 1; +- +- if (s->avctx) +- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); +- if (ret < 0) +- av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n"); +- +- ret = ff_v4l2_m2m_codec_reinit(s); +- if (ret) { +- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n"); +- return AVERROR(EINVAL); +- } + +- /* reinit executed */ +- return 1; ++ return do_source_change(s); + } + + static int v4l2_stop_decode(V4L2Context *ctx) +@@ -266,8 +323,26 @@ static int v4l2_stop_encode(V4L2Context *ctx) + return 0; + } + ++static int count_in_driver(const V4L2Context * const ctx) ++{ ++ int i; ++ int n = 0; ++ ++ if (!ctx->bufrefs) ++ return -1; ++ ++ for (i = 0; i < ctx->num_buffers; ++i) { ++ V4L2Buffer *const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; ++ if (avbuf->status == V4L2BUF_IN_DRIVER) ++ ++n; ++ } ++ return n; ++} ++ + static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) + { ++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); ++ const int is_capture = !V4L2_TYPE_IS_OUTPUT(ctx->type); + struct v4l2_plane planes[VIDEO_MAX_PLANES]; + struct v4l2_buffer buf = { 0 }; + V4L2Buffer *avbuf; +@@ -276,50 +351,84 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) + .fd = ctx_to_m2mctx(ctx)->fd, + }; + int i, ret; ++ int no_rx_means_done = 0; + +- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) { ++ if (is_capture && ctx->bufrefs) { + for (i = 0; i < ctx->num_buffers; i++) { +- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) ++ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; ++ if (avbuf->status == V4L2BUF_IN_DRIVER) + break; + } + if (i == ctx->num_buffers) +- av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to " ++ 
av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to " + "userspace. Increase num_capture_buffers " + "to prevent device deadlock or dropped " +- "packets/frames.\n"); ++ "packets/frames.\n", i); + } + ++#if 0 ++ // I think this is true but pointless ++ // we will get some other form of EOF signal ++ + /* if we are draining and there are no more capture buffers queued in the driver we are done */ +- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) { ++ if (is_capture && ctx_to_m2mctx(ctx)->draining) { + for (i = 0; i < ctx->num_buffers; i++) { + /* capture buffer initialization happens during decode hence + * detection happens at runtime + */ +- if (!ctx->buffers) ++ if (!ctx->bufrefs) + break; + +- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) ++ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; ++ if (avbuf->status == V4L2BUF_IN_DRIVER) + goto start; + } + ctx->done = 1; + return NULL; + } ++#endif + + start: +- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) +- pfd.events = POLLOUT | POLLWRNORM; +- else { ++ if (is_capture) { + /* no need to listen to requests for more input while draining */ + if (ctx_to_m2mctx(ctx)->draining) + pfd.events = POLLIN | POLLRDNORM | POLLPRI; ++ } else { ++ pfd.events = POLLOUT | POLLWRNORM; + } ++ no_rx_means_done = s->resize_pending && is_capture; + + for (;;) { +- ret = poll(&pfd, 1, timeout); ++ // If we have a resize pending then all buffers should be Qed ++ // With a resize pending we should be in drain but evidence suggests ++ // that not all decoders do this so poll to clear ++ int t2 = no_rx_means_done ? 0 : timeout < 0 ? 
3000 : timeout; ++ const int e = pfd.events; ++ ++ ret = poll(&pfd, 1, t2); ++ + if (ret > 0) + break; +- if (errno == EINTR) +- continue; ++ ++ if (ret < 0) { ++ int err = errno; ++ if (err == EINTR) ++ continue; ++ av_log(logger(ctx), AV_LOG_ERROR, "=== poll error %d (%s): events=%#x, cap buffers=%d\n", ++ err, strerror(err), ++ e, count_in_driver(ctx)); ++ return NULL; ++ } ++ ++ // ret == 0 (timeout) ++ if (no_rx_means_done) { ++ av_log(logger(ctx), AV_LOG_DEBUG, "Ctx done on timeout\n"); ++ ret = ctx_done(ctx); ++ if (ret > 0) ++ goto start; ++ } ++ if (timeout == -1) ++ av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));; + return NULL; + } + +@@ -329,7 +438,8 @@ start: + no need to raise a warning */ + if (timeout == 0) { + for (i = 0; i < ctx->num_buffers; i++) { +- if (ctx->buffers[i].status != V4L2BUF_AVAILABLE) ++ avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; ++ if (avbuf->status != V4L2BUF_AVAILABLE) + av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); + } + } +@@ -347,22 +457,25 @@ start: + ctx->done = 1; + return NULL; + } +- if (ret) { +- /* if re-init was successful drop the buffer (if there was one) +- * since we had to reconfigure capture (unmap all buffers) +- */ +- return NULL; +- } ++ if (ret > 0) ++ goto start; + } + + /* 2. 
dequeue the buffer */ + if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { + +- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { ++ if (is_capture) { + /* there is a capture buffer ready */ + if (pfd.revents & (POLLIN | POLLRDNORM)) + goto dequeue; + ++ // CAPTURE Q drained ++ if (no_rx_means_done) { ++ if (ctx_done(ctx) > 0) ++ goto start; ++ return NULL; ++ } ++ + /* the driver is ready to accept more input; instead of waiting for the capture + * buffer to complete we return NULL so input can proceed (we are single threaded) + */ +@@ -380,37 +493,58 @@ dequeue: + buf.m.planes = planes; + } + +- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf); +- if (ret) { +- if (errno != EAGAIN) { +- ctx->done = 1; +- if (errno != EPIPE) ++ while ((ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf)) == -1) { ++ const int err = errno; ++ if (err == EINTR) ++ continue; ++ if (err != EAGAIN) { ++ // EPIPE on CAPTURE can be used instead of BUF_FLAG_LAST ++ if (err != EPIPE || !is_capture) + av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", +- ctx->name, av_err2str(AVERROR(errno))); ++ ctx->name, av_err2str(AVERROR(err))); ++ if (ctx_done(ctx) > 0) ++ goto start; + } + return NULL; + } ++ --ctx->q_count; ++ av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d\n", ++ ctx->name, buf.index, ++ buf.timestamp.tv_sec, buf.timestamp.tv_usec, ++ ctx->q_count, ++ctx->dq_count); + +- if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) { ++ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; ++ avbuf->status = V4L2BUF_AVAILABLE; ++ avbuf->buf = buf; ++ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { ++ memcpy(avbuf->planes, planes, sizeof(planes)); ++ avbuf->buf.m.planes = avbuf->planes; ++ } ++ ++ if (ctx_to_m2mctx(ctx)->draining && is_capture) { + int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? 
+ buf.m.planes[0].bytesused : buf.bytesused; + if (bytesused == 0) { +- ctx->done = 1; ++ av_log(logger(ctx), AV_LOG_DEBUG, "Buffer empty - reQ\n"); ++ ++ // Must reQ so we don't leak ++ // May not matter if the next thing we do is release all the ++ // buffers but better to be tidy. ++ ff_v4l2_buffer_enqueue(avbuf); ++ ++ if (ctx_done(ctx) > 0) ++ goto start; + return NULL; + } + #ifdef V4L2_BUF_FLAG_LAST +- if (buf.flags & V4L2_BUF_FLAG_LAST) +- ctx->done = 1; ++ if (buf.flags & V4L2_BUF_FLAG_LAST) { ++ av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n"); ++ avbuf->status = V4L2BUF_IN_USE; // Avoid flushing this buffer ++ ctx_done(ctx); ++ } + #endif + } + +- avbuf = &ctx->buffers[buf.index]; +- avbuf->status = V4L2BUF_AVAILABLE; +- avbuf->buf = buf; +- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { +- memcpy(avbuf->planes, planes, sizeof(planes)); +- avbuf->buf.m.planes = avbuf->planes; +- } + return avbuf; + } + +@@ -429,8 +563,9 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) + } + + for (i = 0; i < ctx->num_buffers; i++) { +- if (ctx->buffers[i].status == V4L2BUF_AVAILABLE) +- return &ctx->buffers[i]; ++ V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; ++ if (avbuf->status == V4L2BUF_AVAILABLE) ++ return avbuf; + } + + return NULL; +@@ -438,25 +573,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) + + static int v4l2_release_buffers(V4L2Context* ctx) + { +- struct v4l2_requestbuffers req = { +- .memory = V4L2_MEMORY_MMAP, +- .type = ctx->type, +- .count = 0, /* 0 -> unmaps buffers from the driver */ +- }; +- int i, j; ++ int i; ++ int ret = 0; ++ const int fd = ctx_to_m2mctx(ctx)->fd; + +- for (i = 0; i < ctx->num_buffers; i++) { +- V4L2Buffer *buffer = &ctx->buffers[i]; ++ // Orphan any buffers in the wild ++ ff_weak_link_break(&ctx->wl_master); ++ ++ if (ctx->bufrefs) { ++ for (i = 0; i < ctx->num_buffers; i++) ++ av_buffer_unref(ctx->bufrefs + i); ++ } ++ ++ if (fd != -1) { ++ struct v4l2_requestbuffers req = { ++ 
.memory = V4L2_MEMORY_MMAP, ++ .type = ctx->type, ++ .count = 0, /* 0 -> unmap all buffers from the driver */ ++ }; ++ ++ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { ++ if (errno == EINTR) ++ continue; ++ ++ ret = AVERROR(errno); + +- for (j = 0; j < buffer->num_planes; j++) { +- struct V4L2Plane_info *p = &buffer->plane_info[j]; +- if (p->mm_addr && p->length) +- if (munmap(p->mm_addr, p->length) < 0) +- av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); ++ av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", ++ ctx->name, av_err2str(AVERROR(errno))); ++ ++ if (ctx_to_m2mctx(ctx)->output_drm) ++ av_log(logger(ctx), AV_LOG_ERROR, ++ "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n" ++ "for all buffers: \n" ++ " 1. drmModeRmFB(..)\n" ++ " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n"); + } + } ++ ctx->q_count = 0; + +- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); ++ return ret; + } + + static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) +@@ -485,6 +640,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm + + static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) + { ++ V4L2m2mContext* s = ctx_to_m2mctx(ctx); ++ V4L2m2mPriv *priv = s->avctx->priv_data; + enum AVPixelFormat pixfmt = ctx->av_pix_fmt; + struct v4l2_fmtdesc fdesc; + int ret; +@@ -503,6 +660,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) + if (ret) + return AVERROR(EINVAL); + ++ if (priv->pix_fmt != AV_PIX_FMT_NONE) { ++ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) { ++ fdesc.index++; ++ continue; ++ } ++ } ++ + pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); + ret = v4l2_try_raw_format(ctx, pixfmt); + if (ret){ +@@ -555,18 +719,73 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) + * + 
*****************************************************************************/ + ++ ++static void flush_all_buffers_status(V4L2Context* const ctx) ++{ ++ int i; ++ for (i = 0; i < ctx->num_buffers; ++i) { ++ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; ++ if (buf->status == V4L2BUF_IN_DRIVER) ++ buf->status = V4L2BUF_AVAILABLE; ++ } ++ ctx->q_count = 0; ++} ++ ++static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) ++{ ++ int i; ++ int rv; ++ ++ if (!ctx->bufrefs) { ++ rv = ff_v4l2_context_init(ctx); ++ if (rv) { ++ av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); ++ return rv; ++ } ++ } ++ ++ for (i = 0; i < ctx->num_buffers; ++i) { ++ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; ++ if (buf->status == V4L2BUF_AVAILABLE) { ++ rv = ff_v4l2_buffer_enqueue(buf); ++ if (rv < 0) ++ return rv; ++ } ++ } ++ return 0; ++} ++ + int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) + { + int type = ctx->type; + int ret; ++ AVCodecContext * const avctx = logger(ctx); ++ ++ ff_mutex_lock(&ctx->lock); ++ ++ if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) ++ stuff_all_buffers(avctx, ctx); + + ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); +- if (ret < 0) +- return AVERROR(errno); ++ if (ret < 0) { ++ const int err = errno; ++ av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name, ++ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err); ++ ret = AVERROR(err); ++ } ++ else ++ { ++ if (cmd == VIDIOC_STREAMOFF) ++ flush_all_buffers_status(ctx); + +- ctx->streamon = (cmd == VIDIOC_STREAMON); ++ ctx->streamon = (cmd == VIDIOC_STREAMON); ++ av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, ++ cmd, (cmd == VIDIOC_STREAMON) ? 
"ON" : "OFF"); ++ } + +- return 0; ++ ff_mutex_unlock(&ctx->lock); ++ ++ return ret; + } + + int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) +@@ -594,7 +813,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) + return ff_v4l2_buffer_enqueue(avbuf); + } + +-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) ++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, ++ const void * extdata, size_t extlen, int no_rescale_pts) + { + V4L2m2mContext *s = ctx_to_m2mctx(ctx); + V4L2Buffer* avbuf; +@@ -602,8 +822,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) + + if (!pkt->size) { + ret = v4l2_stop_decode(ctx); ++ // Log but otherwise ignore stop failure + if (ret) +- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name); ++ av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); + s->draining = 1; + return 0; + } +@@ -612,14 +833,14 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) + if (!avbuf) + return AVERROR(EAGAIN); + +- ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf); ++ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts); + if (ret) + return ret; + + return ff_v4l2_buffer_enqueue(avbuf); + } + +-int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) ++int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, int no_rescale_pts) + { + V4L2Buffer *avbuf; + +@@ -636,7 +857,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) + return AVERROR(EAGAIN); + } + +- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); ++ return ff_v4l2_buffer_buf_to_avframe(frame, avbuf, no_rescale_pts); + } + + int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) +@@ -695,54 +916,57 @@ void ff_v4l2_context_release(V4L2Context* ctx) + { + int ret; + +- if (!ctx->buffers) ++ if 
(!ctx->bufrefs) + return; + + ret = v4l2_release_buffers(ctx); + if (ret) + av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name); + +- av_freep(&ctx->buffers); ++ av_freep(&ctx->bufrefs); ++ av_buffer_unref(&ctx->frames_ref); ++ ++ ff_mutex_destroy(&ctx->lock); + } + +-int ff_v4l2_context_init(V4L2Context* ctx) ++ ++static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers) + { +- V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); + struct v4l2_requestbuffers req; +- int ret, i; +- +- if (!v4l2_type_supported(ctx)) { +- av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); +- return AVERROR_PATCHWELCOME; +- } +- +- ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); +- if (ret) +- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name); ++ int ret; ++ int i; + + memset(&req, 0, sizeof(req)); +- req.count = ctx->num_buffers; ++ req.count = req_buffers; + req.memory = V4L2_MEMORY_MMAP; + req.type = ctx->type; +- ret = ioctl(s->fd, VIDIOC_REQBUFS, &req); +- if (ret < 0) { +- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno)); +- return AVERROR(errno); ++ while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) { ++ if (errno != EINTR) { ++ ret = AVERROR(errno); ++ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret)); ++ return ret; ++ } + } + + ctx->num_buffers = req.count; +- ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer)); +- if (!ctx->buffers) { ++ ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs)); ++ if (!ctx->bufrefs) { + av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name); +- return AVERROR(ENOMEM); ++ goto fail_release; + } + +- for (i = 0; i < req.count; i++) { +- ctx->buffers[i].context = ctx; +- ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i); +- if (ret < 0) { ++ ctx->wl_master = 
ff_weak_link_new(ctx); ++ if (!ctx->wl_master) { ++ ret = AVERROR(ENOMEM); ++ goto fail_release; ++ } ++ ++ for (i = 0; i < ctx->num_buffers; i++) { ++ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx); ++ if (ret) { + av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret)); +- goto error; ++ goto fail_release; + } + } + +@@ -756,10 +980,62 @@ int ff_v4l2_context_init(V4L2Context* ctx) + + return 0; + +-error: ++fail_release: + v4l2_release_buffers(ctx); ++ av_freep(&ctx->bufrefs); ++ return ret; ++} ++ ++int ff_v4l2_context_init(V4L2Context* ctx) ++{ ++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); ++ int ret; ++ ++ // It is not valid to reinit a context without a previous release ++ av_assert0(ctx->bufrefs == NULL); ++ ++ if (!v4l2_type_supported(ctx)) { ++ av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); ++ return AVERROR_PATCHWELCOME; ++ } ++ ++ ff_mutex_init(&ctx->lock, NULL); + +- av_freep(&ctx->buffers); ++ if (s->output_drm) { ++ AVHWFramesContext *hwframes; ++ ++ ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref); ++ if (!ctx->frames_ref) { ++ ret = AVERROR(ENOMEM); ++ goto fail_unlock; ++ } ++ ++ hwframes = (AVHWFramesContext*)ctx->frames_ref->data; ++ hwframes->format = AV_PIX_FMT_DRM_PRIME; ++ hwframes->sw_format = ctx->av_pix_fmt; ++ hwframes->width = ctx->width; ++ hwframes->height = ctx->height; ++ ret = av_hwframe_ctx_init(ctx->frames_ref); ++ if (ret < 0) ++ goto fail_unref_hwframes; ++ } ++ ++ ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); ++ if (ret) { ++ ret = AVERROR(errno); ++ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret)); ++ goto fail_unref_hwframes; ++ } ++ ++ ret = create_buffers(ctx, ctx->num_buffers); ++ if (ret < 0) ++ goto fail_unref_hwframes; ++ ++ return 0; + ++fail_unref_hwframes: ++ av_buffer_unref(&ctx->frames_ref); ++fail_unlock: ++ ff_mutex_destroy(&ctx->lock); + return ret; + } +diff --git 
a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h +index 6f7460c89a..59009d11d1 100644 +--- a/libavcodec/v4l2_context.h ++++ b/libavcodec/v4l2_context.h +@@ -32,6 +32,8 @@ + #include "libavutil/rational.h" + #include "codec_id.h" + #include "packet.h" ++#include "libavutil/buffer.h" ++#include "libavutil/thread.h" + #include "v4l2_buffers.h" + + typedef struct V4L2Context { +@@ -71,11 +73,12 @@ typedef struct V4L2Context { + */ + int width, height; + AVRational sample_aspect_ratio; ++ struct v4l2_rect selection; + + /** +- * Indexed array of V4L2Buffers ++ * Indexed array of pointers to V4L2Buffers + */ +- V4L2Buffer *buffers; ++ AVBufferRef **bufrefs; + + /** + * Readonly after init. +@@ -93,6 +96,12 @@ typedef struct V4L2Context { + */ + int done; + ++ AVBufferRef *frames_ref; ++ int q_count; ++ int dq_count; ++ struct ff_weak_link_master *wl_master; ++ ++ AVMutex lock; + } V4L2Context; + + /** +@@ -157,9 +166,12 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); + * @param[in] ctx The V4L2Context to dequeue from. + * @param[inout] f The AVFrame to dequeue to. + * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) ++ * @param[in] no_rescale_pts (0 rescale pts, 1 use pts as ++ * timestamp directly) ++ * + * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. + */ +-int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); ++int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int no_rescale_pts); + + /** + * Enqueues a buffer to a V4L2Context from an AVPacket +@@ -171,7 +183,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); + * @param[in] pkt A pointer to an AVPacket. + * @return 0 in case of success, a negative error otherwise. 
+ */ +-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); ++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size, int no_rescale_pts); + + /** + * Enqueues a buffer to a V4L2Context from an AVFrame +diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c +index 984936004d..50a192933b 100644 +--- a/libavcodec/v4l2_m2m.c ++++ b/libavcodec/v4l2_m2m.c +@@ -214,13 +214,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) + av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n"); + + /* 2. unmap the capture buffers (v4l2 and ffmpeg): +- * we must wait for all references to be released before being allowed +- * to queue new buffers. + */ +- av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n"); +- if (atomic_load(&s->refcount)) +- while(sem_wait(&s->refsync) == -1 && errno == EINTR); +- + ff_v4l2_context_release(&s->capture); + + /* 3. get the new capture format */ +@@ -257,6 +251,8 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context) + av_frame_free(&s->frame); + av_packet_unref(&s->buf_pkt); + ++ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n"); ++ + av_free(s); + } + +@@ -268,6 +264,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) + if (!s) + return 0; + ++ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n"); ++ ++ if (av_codec_is_decoder(s->avctx->codec)) ++ av_packet_unref(&s->buf_pkt); ++ + if (s->fd >= 0) { + ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); + if (ret) +@@ -280,7 +281,14 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) + + ff_v4l2_context_release(&s->output); + ++ close(s->fd); ++ s->fd = -1; ++ + s->self_ref = NULL; ++ // This is only called on avctx close so after this point we don't have that ++ // Crash sooner if we find we are using it (can still log with avctx = NULL) ++ s->avctx = NULL; ++ priv->context = NULL; + av_buffer_unref(&priv->context_ref); + + return 0; +diff --git 
a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index b67b216331..24a9c94864 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -30,6 +30,7 @@ + #include + + #include "libavcodec/avcodec.h" ++#include "libavutil/pixfmt.h" + #include "v4l2_context.h" + + #define container_of(ptr, type, member) ({ \ +@@ -38,7 +39,18 @@ + + #define V4L_M2M_DEFAULT_OPTS \ + { "num_output_buffers", "Number of buffers in the output context",\ +- OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS } ++ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS } ++ ++#define FF_V4L2_M2M_TRACK_SIZE 128 ++typedef struct V4L2m2mTrackEl { ++ int discard; // If we see this buffer its been flushed, so discard ++ int pkt_size; ++ int64_t pts; ++ int64_t reordered_opaque; ++ int64_t pkt_pos; ++ int64_t pkt_duration; ++ int64_t track_pts; ++} V4L2m2mTrackEl; + + typedef struct V4L2m2mContext { + char devname[PATH_MAX]; +@@ -53,6 +65,7 @@ typedef struct V4L2m2mContext { + sem_t refsync; + atomic_uint refcount; + int reinit; ++ int resize_pending; + + /* null frame/packet received */ + int draining; +@@ -66,6 +79,23 @@ typedef struct V4L2m2mContext { + + /* reference back to V4L2m2mPriv */ + void *priv; ++ ++ AVBufferRef *device_ref; ++ ++ /* generate DRM frames */ ++ int output_drm; ++ ++ /* Frame tracking */ ++ int64_t last_pkt_dts; ++ int64_t last_opaque; ++ unsigned int track_no; ++ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; ++ ++ /* req pkt */ ++ int req_pkt; ++ ++ /* Ext data sent */ ++ int extdata_sent; + } V4L2m2mContext; + + typedef struct V4L2m2mPriv { +@@ -76,6 +106,7 @@ typedef struct V4L2m2mPriv { + + int num_output_buffers; + int num_capture_buffers; ++ enum AVPixelFormat pix_fmt; + } V4L2m2mPriv; + + /** +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 8a51dec3fa..e1d34f4ccd 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -23,6 +23,10 @@ + + #include + #include 
++ ++#include "libavutil/avassert.h" ++#include "libavutil/hwcontext.h" ++#include "libavutil/hwcontext_drm.h" + #include "libavutil/pixfmt.h" + #include "libavutil/pixdesc.h" + #include "libavutil/opt.h" +@@ -30,26 +34,51 @@ + #include "codec_internal.h" + #include "libavcodec/decode.h" + ++#include "libavcodec/hwaccels.h" ++#include "libavcodec/internal.h" ++#include "libavcodec/hwconfig.h" ++ + #include "v4l2_context.h" + #include "v4l2_m2m.h" + #include "v4l2_fmt.h" + ++static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) ++{ ++ int ret; ++ struct v4l2_decoder_cmd cmd = { ++ .cmd = V4L2_DEC_CMD_START, ++ .flags = 0, ++ }; ++ ++ if (s->output.streamon) ++ return 0; ++ ++ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n"); ++ ++ if (!s->capture.streamon || ret < 0) ++ return ret; ++ ++ ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); ++ else ++ av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n"); ++ ++ return ret; ++} ++ + static int v4l2_try_start(AVCodecContext *avctx) + { + V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; + V4L2Context *const capture = &s->capture; +- V4L2Context *const output = &s->output; + struct v4l2_selection selection = { 0 }; + int ret; + + /* 1. 
start the output process */ +- if (!output->streamon) { +- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON); +- if (ret < 0) { +- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n"); +- return ret; +- } +- } ++ if ((ret = check_output_streamon(avctx, s)) != 0) ++ return ret; + + if (capture->streamon) + return 0; +@@ -63,15 +92,29 @@ static int v4l2_try_start(AVCodecContext *avctx) + } + + /* 2.1 update the AVCodecContext */ +- avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); +- capture->av_pix_fmt = avctx->pix_fmt; ++ capture->av_pix_fmt = ++ ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); ++ if (s->output_drm) { ++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; ++ avctx->sw_pix_fmt = capture->av_pix_fmt; ++ } ++ else ++ avctx->pix_fmt = capture->av_pix_fmt; + + /* 3. set the crop parameters */ ++#if 1 ++ selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ selection.target = V4L2_SEL_TGT_CROP_DEFAULT; ++ ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); ++ av_log(avctx, AV_LOG_INFO, "Post G selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); ++#else + selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + selection.r.height = avctx->coded_height; + selection.r.width = avctx->coded_width; ++ av_log(avctx, AV_LOG_INFO, "Try selection %dx%d\n", avctx->coded_width, avctx->coded_height); + ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection); +- if (!ret) { ++ av_log(avctx, AV_LOG_INFO, "Post S selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); ++ if (1) { + ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); + if (ret) { + av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n"); +@@ -82,15 +125,7 @@ static int v4l2_try_start(AVCodecContext *avctx) + capture->width = selection.r.width; + } + } +- +- /* 4. 
init the capture context now that we have the capture format */ +- if (!capture->buffers) { +- ret = ff_v4l2_context_init(capture); +- if (ret) { +- av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); +- return AVERROR(ENOMEM); +- } +- } ++#endif + + /* 5. start the capture process */ + ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); +@@ -133,50 +168,287 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) + return 0; + } + +-static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) ++static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) ++{ ++ return (int64_t)n; ++} ++ ++static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) ++{ ++ return (unsigned int)pts; ++} ++ ++// FFmpeg requires us to propagate a number of vars from the coded pkt into ++// the decoded frame. The only thing that tracks like that in V4L2 stateful ++// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no ++// guarantees about PTS being unique or specified for every frame so replace ++// the supplied PTS with a simple incrementing number and keep a circular ++// buffer of all the things we want preserved (including the original PTS) ++// indexed by the tracking no. 
++static void ++xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt) ++{ ++ int64_t track_pts; ++ ++ // Avoid 0 ++ if (++s->track_no == 0) ++ s->track_no = 1; ++ ++ track_pts = track_to_pts(avctx, s->track_no); ++ ++ av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no); ++ s->last_pkt_dts = avpkt->dts; ++ s->track_els[s->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ ++ .discard = 0, ++ .pkt_size = avpkt->size, ++ .pts = avpkt->pts, ++ .reordered_opaque = avctx->reordered_opaque, ++ .pkt_pos = avpkt->pos, ++ .pkt_duration = avpkt->duration, ++ .track_pts = track_pts ++ }; ++ avpkt->pts = track_pts; ++} ++ ++// Returns -1 if we should discard the frame ++static int ++xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame) ++{ ++ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; ++ const V4L2m2mTrackEl *const t = s->track_els + n; ++ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) ++ { ++ av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ frame->pts = AV_NOPTS_VALUE; ++ frame->pkt_dts = s->last_pkt_dts; ++ frame->reordered_opaque = s->last_opaque; ++ frame->pkt_pos = -1; ++ frame->pkt_duration = 0; ++ frame->pkt_size = -1; ++ } ++ else if (!t->discard) ++ { ++ frame->pts = t->pts; ++ frame->pkt_dts = s->last_pkt_dts; ++ frame->reordered_opaque = t->reordered_opaque; ++ frame->pkt_pos = t->pkt_pos; ++ frame->pkt_duration = t->pkt_duration; ++ frame->pkt_size = t->pkt_size; ++ ++ s->last_opaque = s->track_els[n].reordered_opaque; ++ s->track_els[n].pts = AV_NOPTS_VALUE; // If we hit this again deny accurate knowledge of PTS ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ return -1; ++ } ++ ++ 
frame->best_effort_timestamp = frame->pts; ++ frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? ++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 ", DTS=%" PRId64 "\n", frame->pts, frame->pkt_dts); ++ return 0; ++} ++ ++static inline int stream_started(const V4L2m2mContext * const s) { ++ return s->capture.streamon && s->output.streamon; ++} ++ ++#define NQ_OK 0 ++#define NQ_Q_FULL 1 ++#define NQ_SRC_EMPTY 2 ++#define NQ_DRAINING 3 ++#define NQ_DEAD 4 ++ ++#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) ++ ++// AVERROR_EOF Flushing an already flushed stream ++// -ve Error (all errors except EOF are unexpected) ++// NQ_OK (0) OK ++// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now) ++// NQ_SRC_EMPTY Src empty (do not retry) ++// NQ_DRAINING At EOS, dQ dest until EOS there too ++// NQ_DEAD Not running (do not retry, do not attempt capture dQ) ++ ++static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s) + { +- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; +- V4L2Context *const capture = &s->capture; +- V4L2Context *const output = &s->output; + int ret; + ++ // If we don't already have a coded packet - get a new one ++ // We will already have a coded pkt if the output Q was full last time we ++ // tried to Q it + if (!s->buf_pkt.size) { + ret = ff_decode_get_packet(avctx, &s->buf_pkt); ++ ++ if (ret == AVERROR(EAGAIN)) { ++ if (!stream_started(s)) { ++ av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__); ++ return NQ_DEAD; ++ } ++ return NQ_SRC_EMPTY; ++ } ++ ++ if (ret == AVERROR_EOF) { ++ // EOF - enter drain mode ++ av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n", ++ ret, s->buf_pkt.size, stream_started(s), s->draining); ++ if (!stream_started(s)) { ++ av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n"); ++ s->draining = 1; ++ s->capture.done = 1; ++ return AVERROR_EOF; ++ } ++ 
++ if (!s->draining) { ++ // Calling enqueue with an empty pkt starts drain ++ av_assert0(s->buf_pkt.size == 0); ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0, 1); ++ if (ret) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret); ++ return ret; ++ } ++ } ++ return NQ_DRAINING; ++ } ++ + if (ret < 0) { +- if (ret == AVERROR(EAGAIN)) +- return ff_v4l2_context_dequeue_frame(capture, frame, 0); +- else if (ret != AVERROR_EOF) +- return ret; ++ av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); ++ return ret; + } ++ ++ xlat_pts_in(avctx, s, &s->buf_pkt); + } + +- if (s->draining) +- goto dequeue; ++ if ((ret = check_output_streamon(avctx, s)) != 0) ++ return ret; + +- ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt); +- if (ret < 0 && ret != AVERROR(EAGAIN)) +- goto fail; ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, ++ avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size, ++ 1); + +- /* if EAGAIN don't unref packet and try to enqueue in the next iteration */ +- if (ret != AVERROR(EAGAIN)) ++ if (ret == AVERROR(EAGAIN)) { ++ // Out of input buffers - keep packet ++ ret = NQ_Q_FULL; ++ } ++ else { ++ // In all other cases we are done with this packet + av_packet_unref(&s->buf_pkt); ++ s->extdata_sent = 1; + +- if (!s->draining) { +- ret = v4l2_try_start(avctx); + if (ret) { +- /* cant recover */ +- if (ret != AVERROR(ENOMEM)) +- ret = 0; +- goto fail; ++ av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret); ++ return ret; ++ } ++ } ++ ++ // Start if we haven't ++ { ++ const int ret2 = v4l2_try_start(avctx); ++ if (ret2) { ++ av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2); ++ ret = (ret2 == AVERROR(ENOMEM)) ? 
ret2 : NQ_DEAD; ++ } ++ } ++ ++ return ret; ++} ++ ++static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) ++{ ++ V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; ++ int src_rv; ++ int dst_rv = 1; // Non-zero (done), non-negative (error) number ++ ++ do { ++ src_rv = try_enqueue_src(avctx, s); ++ ++ // If we got a frame last time and we have nothing to enqueue then ++ // return now. rv will be AVERROR(EAGAIN) indicating that we want more input ++ // This should mean that once decode starts we enter a stable state where ++ // we alternately ask for input and produce output ++ if (s->req_pkt && src_rv == NQ_SRC_EMPTY) ++ break; ++ ++ if (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) { ++ av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue fail"); ++ src_rv = NQ_SRC_EMPTY; // If we can't enqueue pretend that there is nothing to enqueue ++ } ++ ++ // Try to get a new frame if ++ // (a) we haven't already got one AND ++ // (b) enqueue returned a status indicating that decode should be attempted ++ if (dst_rv != 0 && TRY_DQ(src_rv)) { ++ do { ++ // Dequeue frame will unref any previous contents of frame ++ // if it returns success so we don't need an explicit unref ++ // when discarding ++ // This returns AVERROR(EAGAIN) if there isn't a frame ready yet ++ // but there is room in the input Q ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, -1, 1); ++ ++ if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) ++ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", ++ s->draining, s->capture.done); ++ else if (dst_rv && dst_rv != AVERROR(EAGAIN)) ++ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", ++ s->draining, s->capture.done, dst_rv); ++ ++ // Go again if we got a frame that we need to discard ++ } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame)); ++ } ++ ++ // Continue trying to enqueue packets if either ++ // (a) we succeeded last 
time OR ++ // (b) enqueue failed due to input Q full AND there is now room ++ } while (src_rv == NQ_OK || (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) ); ++ ++ // Ensure that the frame contains nothing if we aren't returning a frame ++ // (might happen when discarding) ++ if (dst_rv) ++ av_frame_unref(frame); ++ ++ // If we got a frame this time ask for a pkt next time ++ s->req_pkt = (dst_rv == 0); ++ ++#if 0 ++ if (dst_rv == 0) ++ { ++ static int z = 0; ++ if (++z > 50) { ++ av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n"); ++ ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); ++ return -1; + } + } ++#endif ++ ++ return dst_rv == 0 ? 0 : ++ src_rv < 0 ? src_rv : ++ dst_rv < 0 ? dst_rv : ++ AVERROR(EAGAIN); ++} ++ ++#if 0 ++#include ++static int64_t us_time(void) ++{ ++ struct timespec ts; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000; ++} + +-dequeue: +- return ff_v4l2_context_dequeue_frame(capture, frame, -1); +-fail: +- av_packet_unref(&s->buf_pkt); ++static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) ++{ ++ int ret; ++ const int64_t now = us_time(); ++ int64_t done; ++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ret = v4l2_receive_frame2(avctx, frame); ++ done = us_time(); ++ av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret); + return ret; + } ++#endif + + static av_cold int v4l2_decode_init(AVCodecContext *avctx) + { +@@ -185,6 +457,9 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + V4L2m2mPriv *priv = avctx->priv_data; + int ret; + ++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; ++ + ret = ff_v4l2_m2m_create_context(priv, &s); + if (ret < 0) + return ret; +@@ -205,6 +480,28 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; + capture->av_pix_fmt = avctx->pix_fmt; + ++ /* the client requests the 
codec to generate DRM frames: ++ * - data[0] will therefore point to the returned AVDRMFrameDescriptor ++ * check the ff_v4l2_buffer_to_avframe conversion function. ++ * - the DRM frame format is passed in the DRM frame descriptor layer. ++ * check the v4l2_get_drm_frame function. ++ */ ++ switch (ff_get_format(avctx, avctx->codec->pix_fmts)) { ++ default: ++ s->output_drm = 1; ++ break; ++ } ++ ++ s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); ++ if (!s->device_ref) { ++ ret = AVERROR(ENOMEM); ++ return ret; ++ } ++ ++ ret = av_hwdevice_ctx_init(s->device_ref); ++ if (ret < 0) ++ return ret; ++ + s->avctx = avctx; + ret = ff_v4l2_m2m_codec_init(priv); + if (ret) { +@@ -217,7 +514,53 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + + static av_cold int v4l2_decode_close(AVCodecContext *avctx) + { +- return ff_v4l2_m2m_codec_end(avctx->priv_data); ++ int rv; ++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ rv = ff_v4l2_m2m_codec_end(avctx->priv_data); ++ av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv); ++ return rv; ++} ++ ++static void v4l2_decode_flush(AVCodecContext *avctx) ++{ ++ // An alternatve and more drastic form of flush is to simply do this: ++ // v4l2_decode_close(avctx); ++ // v4l2_decode_init(avctx); ++ // The downside is that this keeps a decoder open until all the frames ++ // associated with it have been returned. This is a bit wasteful on ++ // possibly limited h/w resources and fails on a Pi for this reason unless ++ // more GPU mem is allocated than is the default. 
++ ++ V4L2m2mPriv * const priv = avctx->priv_data; ++ V4L2m2mContext * const s = priv->context; ++ V4L2Context * const output = &s->output; ++ V4L2Context * const capture = &s->capture; ++ int ret, i; ++ ++ av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); ++ ++ // Reflushing everything is benign, quick and avoids having to worry about ++ // states like EOS processing so don't try to optimize out (having got it ++ // wrong once) ++ ++ ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); ++ ++ // V4L2 makes no guarantees about whether decoded frames are flushed or not ++ // so mark all frames we are tracking to be discarded if they appear ++ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) ++ s->track_els[i].discard = 1; ++ ++ // resend extradata ++ s->extdata_sent = 0; ++ // clear EOS status vars ++ s->draining = 0; ++ output->done = 0; ++ capture->done = 0; ++ ++ // Stream on will occur when we actually submit a new frame ++ av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__); + } + + #define OFFSET(x) offsetof(V4L2m2mPriv, x) +@@ -226,10 +569,16 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) + static const AVOption options[] = { + V4L_M2M_DEFAULT_OPTS, + { "num_capture_buffers", "Number of buffers in the capture context", +- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 20, INT_MAX, FLAGS }, ++ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS }, ++ { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, + { NULL}, + }; + ++static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { ++ HW_CONFIG_INTERNAL(DRM_PRIME), ++ NULL ++}; ++ + #define M2MDEC_CLASS(NAME) \ + static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ + .class_name = #NAME 
"_v4l2m2m_decoder", \ +@@ -250,10 +599,15 @@ static const AVOption options[] = { + .init = v4l2_decode_init, \ + FF_CODEC_RECEIVE_FRAME_CB(v4l2_receive_frame), \ + .close = v4l2_decode_close, \ ++ .flush = v4l2_decode_flush, \ + .bsfs = bsf_name, \ + .p.capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ + .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ + .p.wrapper_name = "v4l2m2m", \ ++ .p.pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ ++ AV_PIX_FMT_NV12, \ ++ AV_PIX_FMT_NONE}, \ ++ .hw_configs = v4l2_m2m_hw_configs, \ + } + + M2MDEC(h264, "H.264", AV_CODEC_ID_H264, "h264_mp4toannexb"); + +From d01e2d278c959db020910e62071dd0abe460b0e7 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 10 Jun 2021 18:46:21 +0100 +Subject: [PATCH 017/113] Fix crash in hw_device_default_name if type not found + (NONE) + +--- + fftools/ffmpeg_hw.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fftools/ffmpeg_hw.c b/fftools/ffmpeg_hw.c +index 14e702bd92..4194647faa 100644 +--- a/fftools/ffmpeg_hw.c ++++ b/fftools/ffmpeg_hw.c +@@ -75,6 +75,8 @@ static char *hw_device_default_name(enum AVHWDeviceType type) + char *name; + size_t index_pos; + int index, index_limit = 1000; ++ if (!type_name) ++ return NULL; + index_pos = strlen(type_name); + name = av_malloc(index_pos + 4); + if (!name) + +From 40f4215a999a91398f8e627efe4de2674610d0c6 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 10 Jun 2021 18:59:18 +0100 +Subject: [PATCH 018/113] Allow v4l2m2m to select non-drm_prime output formats + +--- + libavcodec/v4l2_buffers.c | 2 +- + libavcodec/v4l2_m2m_dec.c | 14 ++++++++++---- + 2 files changed, 11 insertions(+), 5 deletions(-) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index a003934ca1..1ca1128db6 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -524,7 +524,7 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer 
*out) + offset += dst_stride * out->context->height; + } + if (offset > out->plane_info[0].length) { +- av_log(NULL, AV_LOG_ERROR, "%s: Plane total %d > buffer size %d\n", __func__, offset, out->plane_info[0].length); ++ av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length); + return -1; + } + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index e1d34f4ccd..d258fed28b 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -455,10 +455,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + V4L2Context *capture, *output; + V4L2m2mContext *s; + V4L2m2mPriv *priv = avctx->priv_data; ++ int gf_pix_fmt; + int ret; + + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); +- avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; + + ret = ff_v4l2_m2m_create_context(priv, &s); + if (ret < 0) +@@ -486,10 +486,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + * - the DRM frame format is passed in the DRM frame descriptor layer. + * check the v4l2_get_drm_frame function. 
+ */ +- switch (ff_get_format(avctx, avctx->codec->pix_fmts)) { +- default: ++ ++ gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); ++ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n", ++ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); ++ ++ s->output_drm = 0; ++ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { ++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; + s->output_drm = 1; +- break; + } + + s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); +@@ -606,6 +611,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { + .p.wrapper_name = "v4l2m2m", \ + .p.pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ + AV_PIX_FMT_NV12, \ ++ AV_PIX_FMT_YUV420P, \ + AV_PIX_FMT_NONE}, \ + .hw_configs = v4l2_m2m_hw_configs, \ + } + +From 353911de1297691582400e34c2edc23b128b1d09 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 10 Jun 2021 18:59:38 +0100 +Subject: [PATCH 019/113] Fix YUV420P output from v4l2m2m + +Also put get_width get_height inlines in header as they are generally +useful. 
+--- + libavcodec/v4l2_buffers.c | 12 ++++++------ + libavcodec/v4l2_context.c | 22 ++++++---------------- + libavcodec/v4l2_m2m.h | 12 ++++++++++++ + 3 files changed, 24 insertions(+), 22 deletions(-) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 1ca1128db6..f4c11ca8d0 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -425,17 +425,17 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) + case AV_PIX_FMT_NV21: + if (avbuf->num_planes > 1) + break; +- frame->linesize[1] = avbuf->plane_info[0].bytesperline; +- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; ++ frame->linesize[1] = frame->linesize[0]; ++ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); + break; + + case AV_PIX_FMT_YUV420P: + if (avbuf->num_planes > 1) + break; +- frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1; +- frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1; +- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; +- frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2); ++ frame->linesize[1] = frame->linesize[0] / 2; ++ frame->linesize[2] = frame->linesize[1]; ++ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); ++ frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2; + break; + + default: +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index cbbd0551a7..5404fb1e94 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -55,16 +55,6 @@ static inline AVCodecContext *logger(V4L2Context *ctx) + return ctx_to_m2mctx(ctx)->avctx; + } + +-static inline unsigned int v4l2_get_width(struct 
v4l2_format *fmt) +-{ +- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; +-} +- +-static inline unsigned int v4l2_get_height(struct v4l2_format *fmt) +-{ +- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; +-} +- + static AVRational v4l2_get_sar(V4L2Context *ctx) + { + struct AVRational sar = { 0, 1 }; +@@ -96,8 +86,8 @@ static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2 + if (ret) + av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n", + ctx->name, +- v4l2_get_width(fmt1), v4l2_get_height(fmt1), +- v4l2_get_width(fmt2), v4l2_get_height(fmt2)); ++ ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1), ++ ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2)); + + return ret; + } +@@ -195,8 +185,8 @@ static int do_source_change(V4L2m2mContext * const s) + + reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); + if (reinit) { +- s->capture.height = v4l2_get_height(&cap_fmt); +- s->capture.width = v4l2_get_width(&cap_fmt); ++ s->capture.height = ff_v4l2_get_format_height(&cap_fmt); ++ s->capture.width = ff_v4l2_get_format_width(&cap_fmt); + } + s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); + +@@ -973,8 +963,8 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers + av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name, + V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat), + req.count, +- v4l2_get_width(&ctx->format), +- v4l2_get_height(&ctx->format), ++ ff_v4l2_get_format_width(&ctx->format), ++ ff_v4l2_get_format_height(&ctx->format), + V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage, + V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? 
ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline); + +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 24a9c94864..8f054f2f50 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -160,4 +160,16 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); + */ + int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); + ++ ++static inline unsigned int ff_v4l2_get_format_width(struct v4l2_format *fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; ++} ++ ++static inline unsigned int ff_v4l2_get_format_height(struct v4l2_format *fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; ++} ++ ++ + #endif /* AVCODEC_V4L2_M2M_H */ + +From 1dec95facf260599f9297a9bebc6227fddfebb07 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 10 Jun 2021 19:23:44 +0100 +Subject: [PATCH 020/113] Report buffer overflows in v4l2m2m + +--- + libavcodec/v4l2_buffers.c | 14 ++++++++++---- + libavcodec/v4l2_context.c | 5 ++++- + 2 files changed, 14 insertions(+), 5 deletions(-) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index f4c11ca8d0..de31f7ced9 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -364,6 +364,7 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) + static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset) + { + unsigned int bytesused, length; ++ int rv = 0; + + if (plane >= out->num_planes) + return AVERROR(EINVAL); +@@ -371,11 +372,16 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i + length = out->plane_info[plane].length; + bytesused = FFMIN(size+offset, length); + +- memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset)); ++ if (size > length - offset) { ++ size = length - offset; ++ rv = AVERROR(ENOMEM); ++ } ++ ++ 
memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size); + + set_buf_length(out, plane, bytesused, length); + +- return 0; ++ return rv; + } + + static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf) +@@ -630,7 +636,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, + } + + ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen); +- if (ret) ++ if (ret && ret != AVERROR(ENOMEM)) + return ret; + + v4l2_set_pts(out, pkt->pts, no_rescale_pts); +@@ -638,7 +644,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, + if (pkt->flags & AV_PKT_FLAG_KEY) + out->flags = V4L2_BUF_FLAG_KEYFRAME; + +- return 0; ++ return ret; + } + + int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 5404fb1e94..5f89b5047c 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -824,7 +824,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, + return AVERROR(EAGAIN); + + ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts); +- if (ret) ++ if (ret == AVERROR(ENOMEM)) ++ av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", ++ __func__, pkt->size, avbuf->planes[0].length); ++ else if (ret) + return ret; + + return ff_v4l2_buffer_enqueue(avbuf); + +From 6b1946de881162e8d2e256b029a75f1e5804332f Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 14 Jun 2021 11:55:16 +0100 +Subject: [PATCH 021/113] Increase V4L2 H264 stateful coded buffer size + +Try to set a min size of frame size / 2 for bitbuffers passed to V4l2. +This fixes a few streams that have large I-frames. You would hope +Annex-A gave useful minCR so an appropriate size could be calculated +but it doesn't really. It gives good guidance for bits required over +time but the instantaneous limits are very weak so it is possible +that even this won't be enough. 
The correct long term solution would +be to have resizable dmabufs but that is a greter rewrite than seems +sensible now. +--- + libavcodec/v4l2_context.c | 24 +++++++++++++++++++++++- + libavcodec/v4l2_context.h | 6 ++++++ + libavcodec/v4l2_m2m_dec.c | 24 ++++++++++++++++++++++++ + 3 files changed, 53 insertions(+), 1 deletion(-) + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 5f89b5047c..fc8325a18d 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -902,7 +902,29 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) + + int ff_v4l2_context_set_format(V4L2Context* ctx) + { +- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); ++ int ret; ++ ++ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); ++ if (ret != 0) ++ return ret; ++ ++ // Check returned size against min size and if smaller have another go ++ // Only worry about plane[0] as this is meant to enforce limits for ++ // encoded streams where we might know a bit more about the shape ++ // than the driver ++ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { ++ if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage) ++ return 0; ++ ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size; ++ } ++ else { ++ if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage) ++ return 0; ++ ctx->format.fmt.pix.sizeimage = ctx->min_buf_size; ++ } ++ ++ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); ++ return ret; + } + + void ff_v4l2_context_release(V4L2Context* ctx) +diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h +index 59009d11d1..37b0431400 100644 +--- a/libavcodec/v4l2_context.h ++++ b/libavcodec/v4l2_context.h +@@ -75,6 +75,12 @@ typedef struct V4L2Context { + AVRational sample_aspect_ratio; + struct v4l2_rect selection; + ++ /** ++ * If the default size of buffer is less than this then try to ++ * set to this. 
++ */ ++ uint32_t min_buf_size; ++ + /** + * Indexed array of pointers to V4L2Buffers + */ +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index d258fed28b..b37b005d3f 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -450,6 +450,27 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + } + #endif + ++static uint32_t max_coded_size(const AVCodecContext * const avctx) ++{ ++ uint32_t wxh = avctx->coded_width * avctx->coded_height; ++ uint32_t size; ++ ++ // Currently the only thing we try to set our own limits for is H264 ++ if (avctx->codec_id != AV_CODEC_ID_H264) ++ return 0; ++ ++ size = wxh * 3 / 2; ++ // H.264 Annex A table A-1 gives minCR which is either 2 or 4 ++ // unfortunately that doesn't yield an actually useful limit ++ // and it should be noted that frame 0 is special cased to allow ++ // a bigger number which really isn't helpful for us. So just pick ++ // frame_size / 2 ++ size /= 2; ++ // Add 64k to allow for any overheads and/or encoder hopefulness ++ // with small WxH ++ return size + (1 << 16); ++} ++ + static av_cold int v4l2_decode_init(AVCodecContext *avctx) + { + V4L2Context *capture, *output; +@@ -460,6 +481,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + ++ av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level); + ret = ff_v4l2_m2m_create_context(priv, &s); + if (ret < 0) + return ret; +@@ -476,9 +498,11 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + + output->av_codec_id = avctx->codec_id; + output->av_pix_fmt = AV_PIX_FMT_NONE; ++ output->min_buf_size = max_coded_size(avctx); + + capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; + capture->av_pix_fmt = avctx->pix_fmt; ++ capture->min_buf_size = 0; + + /* the client requests the codec to generate DRM frames: + * - data[0] will therefore point to the returned AVDRMFrameDescriptor + +From 2a241b27bf5862a2faadecff08c2d936bb40b087 Mon Sep 17 
00:00:00 2001 +From: John Cox +Date: Mon, 28 Jun 2021 12:13:35 +0100 +Subject: [PATCH 022/113] Fix raw video s.t. it respects any remaining cropping + +This fixes the long standing CONFWIN_A conformance test failure for drm. +--- + libavcodec/rawenc.c | 32 ++++++++--- + libavutil/hwcontext_drm.c | 112 ++++++++++++++++++++++++++++++++++++-- + 2 files changed, 130 insertions(+), 14 deletions(-) + +diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c +index 0cd8eaffee..f80d402ce3 100644 +--- a/libavcodec/rawenc.c ++++ b/libavcodec/rawenc.c +@@ -124,32 +124,41 @@ static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + + + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, +- const AVFrame *frame, int *got_packet) ++ const AVFrame *src_frame, int *got_packet) + { + int ret; ++ AVFrame * frame = NULL; + + #if CONFIG_SAND +- if (av_rpi_is_sand_frame(frame)) { +- ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : +- av_rpi_is_sand16_frame(frame) ? raw_sand16_as_yuv420(avctx, pkt, frame) : +- av_rpi_is_sand30_frame(frame) ? raw_sand30_as_yuv420(avctx, pkt, frame) : -1; ++ if (av_rpi_is_sand_frame(src_frame)) { ++ ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) : ++ av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) : ++ av_rpi_is_sand30_frame(src_frame) ? 
raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1; + *got_packet = (ret == 0); + return ret; + } + #endif + ++ if ((frame = av_frame_clone(src_frame)) == NULL) { ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } ++ ++ if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0) ++ goto fail; ++ + ret = av_image_get_buffer_size(frame->format, + frame->width, frame->height, 1); + if (ret < 0) +- return ret; ++ goto fail; + + if ((ret = ff_get_encode_buffer(avctx, pkt, ret, 0)) < 0) +- return ret; ++ goto fail; + if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, + (const uint8_t **)frame->data, frame->linesize, + frame->format, + frame->width, frame->height, 1)) < 0) +- return ret; ++ goto fail; + + if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 && + frame->format == AV_PIX_FMT_YUYV422) { +@@ -165,8 +174,15 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + AV_WB64(&pkt->data[8 * x], v << 48 | v >> 16); + } + } ++ pkt->flags |= AV_PKT_FLAG_KEY; ++ av_frame_free(&frame); + *got_packet = 1; + return 0; ++ ++fail: ++ av_frame_free(&frame); ++ *got_packet = 0; ++ return ret; + } + + const FFCodec ff_rawvideo_encoder = { +diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c +index 7a9fdbd263..baf18920fa 100644 +--- a/libavutil/hwcontext_drm.c ++++ b/libavutil/hwcontext_drm.c +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + /* This was introduced in version 4.6. And may not exist all without an + * optional package. 
So to prevent a hard dependency on needing the Linux +@@ -31,6 +32,7 @@ + #endif + + #include ++#include + #include + + #include "avassert.h" +@@ -38,7 +40,9 @@ + #include "hwcontext_drm.h" + #include "hwcontext_internal.h" + #include "imgutils.h" +- ++#if CONFIG_SAND ++#include "libavutil/rpi_sand_fns.h" ++#endif + + static void drm_device_free(AVHWDeviceContext *hwdev) + { +@@ -53,6 +57,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device, + AVDRMDeviceContext *hwctx = hwdev->hwctx; + drmVersionPtr version; + ++ if (device == NULL) { ++ hwctx->fd = -1; ++ return 0; ++ } ++ + hwctx->fd = open(device, O_RDWR); + if (hwctx->fd < 0) + return AVERROR(errno); +@@ -139,6 +148,8 @@ static int drm_map_frame(AVHWFramesContext *hwfc, + if (flags & AV_HWFRAME_MAP_WRITE) + mmap_prot |= PROT_WRITE; + ++ if (dst->format == AV_PIX_FMT_NONE) ++ dst->format = hwfc->sw_format; + #if HAVE_LINUX_DMA_BUF_H + if (flags & AV_HWFRAME_MAP_READ) + map->sync_flags |= DMA_BUF_SYNC_READ; +@@ -185,6 +196,23 @@ static int drm_map_frame(AVHWFramesContext *hwfc, + + dst->width = src->width; + dst->height = src->height; ++ dst->crop_top = src->crop_top; ++ dst->crop_bottom = src->crop_bottom; ++ dst->crop_left = src->crop_left; ++ dst->crop_right = src->crop_right; ++ ++#if CONFIG_SAND ++ // Rework for sand frames ++ if (av_rpi_is_sand_frame(dst)) { ++ // As it stands the sand formats hold stride2 in linesize[3] ++ // linesize[0] & [1] contain stride1 which is always 128 for everything we do ++ // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1] ++ dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier); ++ dst->linesize[0] = 128; ++ dst->linesize[1] = 128; ++ // *** Are we sure src->height is actually what we want ??? 
++ } ++#endif + + err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, + &drm_unmap_frame, map); +@@ -212,7 +240,15 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, + if (!pix_fmts) + return AVERROR(ENOMEM); + +- pix_fmts[0] = ctx->sw_format; ++ // **** Offer native sand too ???? ++ pix_fmts[0] = ++#if CONFIG_SAND ++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? ++ AV_PIX_FMT_YUV420P : ++ ctx->sw_format == AV_PIX_FMT_RPI4_10 ? ++ AV_PIX_FMT_YUV420P10LE : ++#endif ++ ctx->sw_format; + pix_fmts[1] = AV_PIX_FMT_NONE; + + *formats = pix_fmts; +@@ -231,18 +267,79 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, + map = av_frame_alloc(); + if (!map) + return AVERROR(ENOMEM); +- map->format = dst->format; + ++ // Map to default ++ map->format = AV_PIX_FMT_NONE; + err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ); + if (err) + goto fail; + +- map->width = dst->width; +- map->height = dst->height; ++#if 0 ++ av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__, ++ hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE, ++ map->width, map->height, ++ map->linesize[0], ++ map->linesize[1], ++ map->linesize[2], ++ map->linesize[3], ++ dst->width, dst->height, ++ dst->linesize[0], ++ dst->linesize[1], ++ dst->linesize[2]); ++#endif ++#if CONFIG_SAND ++ if (av_rpi_is_sand_frame(map)) { ++ // Preserve crop - later ffmpeg code assumes that we have in that it ++ // overwrites any crop that we create with the old values ++ const unsigned int w = FFMIN(dst->width, map->width); ++ const unsigned int h = FFMIN(dst->height, map->height); ++ ++ if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) { ++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], ++ map->data[0], ++ 128, stride2, ++ 0, 0, w, h); ++ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ 
map->data[1], ++ 128, stride2, ++ 0, 0, w / 2, h / 2); ++ } ++ else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) { ++ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], ++ map->data[0], ++ 128, stride2, ++ 0, 0, w, h); ++ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ map->data[1], ++ 128, stride2, ++ 0, 0, w / 2, h / 2); ++ } ++ else ++ { ++ av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); ++ err = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ dst->width = w; ++ dst->height = h; ++ } ++ else ++#endif ++ { ++ // Kludge mapped h/w s.t. frame_copy works ++ map->width = dst->width; ++ map->height = dst->height; ++ err = av_frame_copy(dst, map); ++ } + +- err = av_frame_copy(dst, map); + if (err) ++ { ++ av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__); + goto fail; ++ } + + err = 0; + fail: +@@ -257,7 +354,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc, + int err; + + if (src->width > hwfc->width || src->height > hwfc->height) ++ { ++ av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height); + return AVERROR(EINVAL); ++ } + + map = av_frame_alloc(); + if (!map) + +From c29e1648b05579d84323a5bd5ce8987228115e1c Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Fri, 13 Aug 2021 15:38:28 +0100 +Subject: [PATCH 023/113] Set frame interlace from V4L2 buffer field + +--- + libavcodec/v4l2_buffers.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index de31f7ced9..97b8eb1db3 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -222,6 +222,16 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) + return AVCOL_TRC_UNSPECIFIED; + } + ++static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf) ++{ ++ return 
V4L2_FIELD_IS_INTERLACED(buf->buf.field); ++} ++ ++static int v4l2_buf_is_top_first(const V4L2Buffer * const buf) ++{ ++ return buf->buf.field == V4L2_FIELD_INTERLACED_TB; ++} ++ + static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) + { + AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; +@@ -576,6 +586,8 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_resc + frame->color_trc = v4l2_get_color_trc(avbuf); + frame->pts = v4l2_get_pts(avbuf, no_rescale_pts); + frame->pkt_dts = AV_NOPTS_VALUE; ++ frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf); ++ frame->top_field_first = v4l2_buf_is_top_first(avbuf); + + /* these values are updated also during re-init in v4l2_process_driver_event */ + frame->height = ctx->height; + +From ad065684c270f1e44d53188f89affd913e338696 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Fri, 13 Aug 2021 16:11:53 +0100 +Subject: [PATCH 024/113] Fix V4L2 stateful to avoid crash if flush before + start + +--- + libavcodec/v4l2_context.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index fc8325a18d..d6243f6b80 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -713,6 +713,10 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) + static void flush_all_buffers_status(V4L2Context* const ctx) + { + int i; ++ ++ if (!ctx->bufrefs) ++ return; ++ + for (i = 0; i < ctx->num_buffers; ++i) { + struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; + if (buf->status == V4L2BUF_IN_DRIVER) + +From 3769927973cd27b00743800c8680d33d5982a4fe Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 9 Sep 2021 17:44:13 +0100 +Subject: [PATCH 025/113] Copy properties from frame to v4l2 buffer + +Now copies all the properties in ff_v4l2_buffer_avframe_to_buf that +ff_v4l2_buffer_buf_to_avframe copies +--- + libavcodec/v4l2_buffers.c | 126 ++++++++++++++++++++++++++++++++++++++ + 1 file changed, 126 
insertions(+) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 97b8eb1db3..126d2a17f4 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -128,6 +128,105 @@ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) + return AVCOL_PRI_UNSPECIFIED; + } + ++static void v4l2_set_color(V4L2Buffer *buf, ++ const enum AVColorPrimaries avcp, ++ const enum AVColorSpace avcs, ++ const enum AVColorTransferCharacteristic avxc) ++{ ++ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT; ++ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT; ++ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT; ++ ++ switch (avcp) { ++ case AVCOL_PRI_BT709: ++ cs = V4L2_COLORSPACE_REC709; ++ ycbcr = V4L2_YCBCR_ENC_709; ++ break; ++ case AVCOL_PRI_BT470M: ++ cs = V4L2_COLORSPACE_470_SYSTEM_M; ++ ycbcr = V4L2_YCBCR_ENC_601; ++ break; ++ case AVCOL_PRI_BT470BG: ++ cs = V4L2_COLORSPACE_470_SYSTEM_BG; ++ break; ++ case AVCOL_PRI_SMPTE170M: ++ cs = V4L2_COLORSPACE_SMPTE170M; ++ break; ++ case AVCOL_PRI_SMPTE240M: ++ cs = V4L2_COLORSPACE_SMPTE240M; ++ break; ++ case AVCOL_PRI_BT2020: ++ cs = V4L2_COLORSPACE_BT2020; ++ break; ++ case AVCOL_PRI_SMPTE428: ++ case AVCOL_PRI_SMPTE431: ++ case AVCOL_PRI_SMPTE432: ++ case AVCOL_PRI_EBU3213: ++ case AVCOL_PRI_RESERVED: ++ case AVCOL_PRI_FILM: ++ case AVCOL_PRI_UNSPECIFIED: ++ default: ++ break; ++ } ++ ++ switch (avcs) { ++ case AVCOL_SPC_RGB: ++ cs = V4L2_COLORSPACE_SRGB; ++ break; ++ case AVCOL_SPC_BT709: ++ cs = V4L2_COLORSPACE_REC709; ++ break; ++ case AVCOL_SPC_FCC: ++ cs = V4L2_COLORSPACE_470_SYSTEM_M; ++ break; ++ case AVCOL_SPC_BT470BG: ++ cs = V4L2_COLORSPACE_470_SYSTEM_BG; ++ break; ++ case AVCOL_SPC_SMPTE170M: ++ cs = V4L2_COLORSPACE_SMPTE170M; ++ break; ++ case AVCOL_SPC_SMPTE240M: ++ cs = V4L2_COLORSPACE_SMPTE240M; ++ break; ++ case AVCOL_SPC_BT2020_CL: ++ cs = V4L2_COLORSPACE_BT2020; ++ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM; ++ break; ++ case AVCOL_SPC_BT2020_NCL: ++ cs = 
V4L2_COLORSPACE_BT2020; ++ break; ++ default: ++ break; ++ } ++ ++ switch (xfer) { ++ case AVCOL_TRC_BT709: ++ xfer = V4L2_XFER_FUNC_709; ++ break; ++ case AVCOL_TRC_IEC61966_2_1: ++ xfer = V4L2_XFER_FUNC_SRGB; ++ break; ++ case AVCOL_TRC_SMPTE240M: ++ xfer = V4L2_XFER_FUNC_SMPTE240M; ++ break; ++ case AVCOL_TRC_SMPTE2084: ++ xfer = V4L2_XFER_FUNC_SMPTE2084; ++ break; ++ default: ++ break; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { ++ buf->context->format.fmt.pix_mp.colorspace = cs; ++ buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr; ++ buf->context->format.fmt.pix_mp.xfer_func = xfer; ++ } else { ++ buf->context->format.fmt.pix.colorspace = cs; ++ buf->context->format.fmt.pix.ycbcr_enc = ycbcr; ++ buf->context->format.fmt.pix.xfer_func = xfer; ++ } ++} ++ + static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) + { + enum v4l2_quantization qt; +@@ -146,6 +245,20 @@ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) + return AVCOL_RANGE_UNSPECIFIED; + } + ++static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr) ++{ ++ const enum v4l2_quantization q = ++ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE : ++ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE : ++ V4L2_QUANTIZATION_DEFAULT; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { ++ buf->context->format.fmt.pix_mp.quantization = q; ++ } else { ++ buf->context->format.fmt.pix.quantization = q; ++ } ++} ++ + static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf) + { + enum v4l2_ycbcr_encoding ycbcr; +@@ -232,6 +345,12 @@ static int v4l2_buf_is_top_first(const V4L2Buffer * const buf) + return buf->buf.field == V4L2_FIELD_INTERLACED_TB; + } + ++static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff) ++{ ++ buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE : ++ is_tff ? 
V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT; ++} ++ + static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) + { + AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; +@@ -561,7 +680,14 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + + int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + { ++ out->buf.flags = frame->key_frame ? (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME) : (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME); ++ // Beware that colour info is held in format rather than the actual ++ // v4l2 buffer struct so this may not be as useful as you might hope ++ v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); ++ v4l2_set_color_range(out, frame->color_range); ++ // PTS & interlace are buffer vars + v4l2_set_pts(out, frame->pts, 0); ++ v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); + + return v4l2_buffer_swframe_to_buf(frame, out); + } + +From c2c1b11af72cd8bba8140922ff4adab1132f5c3a Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 17 Nov 2021 16:49:01 +0000 +Subject: [PATCH 026/113] ffmpeg: Do not inc DTS on no decode output + +V4L2 H264 decode has long latency and sometimes spits out a long stream +of output without input. In this case incrementing DTS is wrong. There +may be cases where the condition as written is correct so only "fix" in +the cases which cause problems +--- + fftools/ffmpeg.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c +index e98f0b8149..517324df3a 100644 +--- a/fftools/ffmpeg.c ++++ b/fftools/ffmpeg.c +@@ -2420,7 +2420,12 @@ static int process_input_packet(InputStream *ist, const AVPacket *pkt, int no_eo + case AVMEDIA_TYPE_VIDEO: + ret = decode_video (ist, repeating ? 
NULL : avpkt, &got_output, &duration_pts, !pkt, + &decode_failed); +- if (!repeating || !pkt || got_output) { ++ // Pi: Do not inc dts if no_cvt_hw set ++ // V4L2 H264 decode has long latency and sometimes spits out a long ++ // stream of output without input. In this case incrementing DTS is wrong. ++ // There may be cases where the condition as written is correct so only ++ // "fix" in the cases which cause problems ++ if (!repeating || !pkt || (got_output && !no_cvt_hw)) { + if (pkt && pkt->duration) { + duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q); + } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) { + +From 4aec8304fb35312e36766efa6844cb4be3b03f50 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 17 Nov 2021 17:32:59 +0000 +Subject: [PATCH 027/113] v4l2_m2m_dec: Adjust timebase if H264 + +Adjust AVCodecContext time_base if H264 in the same way that the +software decoder does. +--- + libavcodec/v4l2_m2m_dec.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index b37b005d3f..5c4c199a79 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -481,6 +481,16 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + ++ if (avctx->codec_id == AV_CODEC_ID_H264) { ++ if (avctx->ticks_per_frame == 1) { ++ if(avctx->time_base.den < INT_MAX/2) { ++ avctx->time_base.den *= 2; ++ } else ++ avctx->time_base.num /= 2; ++ } ++ avctx->ticks_per_frame = 2; ++ } ++ + av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level); + ret = ff_v4l2_m2m_create_context(priv, &s); + if (ret < 0) + +From 818e77a19bbaa4c8d51d68338f801ae4c236fd81 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 17 Nov 2021 17:38:27 +0000 +Subject: [PATCH 028/113] v4l2_m2m_dec: Produce best guess PTSs if none + supplied + +Filter scheduling gets confused by missing PTSs and makes poor 
guesses +more often than not. Try to generate plausible timestamps where we are +missing them. +--- + libavcodec/v4l2_m2m.h | 12 ++++++++ + libavcodec/v4l2_m2m_dec.c | 64 +++++++++++++++++++++++++++++++++++++-- + 2 files changed, 74 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 8f054f2f50..82feb0afdb 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -52,6 +52,16 @@ typedef struct V4L2m2mTrackEl { + int64_t track_pts; + } V4L2m2mTrackEl; + ++typedef struct pts_stats_s ++{ ++ void * logctx; ++ const char * name; // For debug ++ unsigned int last_count; ++ unsigned int last_interval; ++ int64_t last_pts; ++ int64_t guess; ++} pts_stats_t; ++ + typedef struct V4L2m2mContext { + char devname[PATH_MAX]; + int fd; +@@ -91,6 +101,8 @@ typedef struct V4L2m2mContext { + unsigned int track_no; + V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; + ++ pts_stats_t pts_stat; ++ + /* req pkt */ + int req_pkt; + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 5c4c199a79..eb19ec422f 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -42,6 +42,62 @@ + #include "v4l2_m2m.h" + #include "v4l2_fmt.h" + ++// Pick 64 for max last count - that is >1sec at 60fps ++#define STATS_LAST_COUNT_MAX 64 ++#define STATS_INTERVAL_MAX (1 << 30) ++ ++static int64_t pts_stats_guess(const pts_stats_t * const stats) ++{ ++ if (stats->last_pts == AV_NOPTS_VALUE || ++ stats->last_interval == 0 || ++ stats->last_count >= STATS_LAST_COUNT_MAX) ++ return AV_NOPTS_VALUE; ++ return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval; ++} ++ ++static void pts_stats_add(pts_stats_t * const stats, int64_t pts) ++{ ++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { ++ if (stats->last_count < STATS_LAST_COUNT_MAX) ++ ++stats->last_count; ++ return; ++ } ++ ++ if (stats->last_pts != AV_NOPTS_VALUE) { ++ const int64_t interval = pts - stats->last_pts; ++ ++ if 
(interval < 0 || interval >= STATS_INTERVAL_MAX || ++ stats->last_count >= STATS_LAST_COUNT_MAX) { ++ if (stats->last_interval != 0) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", ++ __func__, stats->name, interval, stats->last_count); ++ stats->last_interval = 0; ++ } ++ else { ++ const int64_t frame_time = interval / (int64_t)stats->last_count; ++ ++ if (frame_time != stats->last_interval) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", ++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); ++ stats->last_interval = frame_time; ++ } ++ } ++ ++ stats->last_pts = pts; ++ stats->last_count = 1; ++} ++ ++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) ++{ ++ *stats = (pts_stats_t){ ++ .logctx = logctx, ++ .name = name, ++ .last_count = 1, ++ .last_interval = 0, ++ .last_pts = AV_NOPTS_VALUE ++ }; ++} ++ + static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) + { + int ret; +@@ -244,9 +300,11 @@ xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *cons + return -1; + } + +- frame->best_effort_timestamp = frame->pts; ++ pts_stats_add(&s->pts_stat, frame->pts); ++ ++ frame->best_effort_timestamp = pts_stats_guess(&s->pts_stat); + frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? 
+- av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 ", DTS=%" PRId64 "\n", frame->pts, frame->pkt_dts); ++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts); + return 0; + } + +@@ -496,6 +554,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + if (ret < 0) + return ret; + ++ pts_stats_init(&s->pts_stat, avctx, "decoder"); ++ + capture = &s->capture; + output = &s->output; + + +From 95182698a54c9656e004479023bcc3bef5b4c6fa Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 17 Nov 2021 17:59:27 +0000 +Subject: [PATCH 029/113] v4l2_m2m_dec: Try harder to get an initial frame + +If the input Q is full then wait on a short timeout for a capture frame +rather than stuffing yet still another frame into the input if we could +do that first. This attempts to restrict the sometimes daft initial +buffering that ends up confusing the rest of the system. +--- + libavcodec/v4l2_context.c | 2 +- + libavcodec/v4l2_m2m_dec.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index d6243f6b80..3c01e95ea1 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -381,7 +381,7 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) + start: + if (is_capture) { + /* no need to listen to requests for more input while draining */ +- if (ctx_to_m2mctx(ctx)->draining) ++ if (ctx_to_m2mctx(ctx)->draining || timeout > 0) + pfd.events = POLLIN | POLLRDNORM | POLLPRI; + } else { + pfd.events = POLLOUT | POLLWRNORM; +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index eb19ec422f..f05f2927e6 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -442,7 +442,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + // when discarding + // This returns AVERROR(EAGAIN) if there isn't a frame ready yet + // but there is room 
in the input Q +- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, -1, 1); ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1, 1); + + if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) + av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", + +From e979ff9809197ad46c299031c4ee39facdadef31 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 17 Nov 2021 18:04:56 +0000 +Subject: [PATCH 030/113] Add a V4L2 M2M deinterlace filter + +Add a V4L2 deinterlace filter that will accept DRMPRIME frames. + +Multiple people have contributed to this: +Jernej Skrabec +Alex Bee +popcornmix +John Cox + +There is an unknown delay through the filter of typically one or three +fields which translates to 1 or 2 frames. Frames that are delayed are +lost at end of stream as the V4L2 filter has no flush control. +--- + libavcodec/v4l2_context.c | 4 +- + libavfilter/Makefile | 1 + + libavfilter/allfilters.c | 1 + + libavfilter/vf_deinterlace_v4l2m2m.c | 1269 ++++++++++++++++++++++++++ + 4 files changed, 1273 insertions(+), 2 deletions(-) + create mode 100644 libavfilter/vf_deinterlace_v4l2m2m.c + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 3c01e95ea1..964c17ac0e 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -498,10 +498,10 @@ dequeue: + return NULL; + } + --ctx->q_count; +- av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d\n", ++ av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d field=%d\n", + ctx->name, buf.index, + buf.timestamp.tv_sec, buf.timestamp.tv_usec, +- ctx->q_count, ++ctx->dq_count); ++ ctx->q_count, ++ctx->dq_count, buf.field); + + avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; + avbuf->status = V4L2BUF_AVAILABLE; +diff --git a/libavfilter/Makefile b/libavfilter/Makefile +index 6db336e74a..394e61a0c2 100644 +--- a/libavfilter/Makefile 
++++ b/libavfilter/Makefile +@@ -254,6 +254,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) += vf_neighbor.o + OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o + OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_deinterlace_qsv.o + OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o ++OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o + OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o + OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o + OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o +diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c +index 4bf3a5cfd8..9b17c37eb3 100644 +--- a/libavfilter/allfilters.c ++++ b/libavfilter/allfilters.c +@@ -242,6 +242,7 @@ extern const AVFilter ff_vf_derain; + extern const AVFilter ff_vf_deshake; + extern const AVFilter ff_vf_deshake_opencl; + extern const AVFilter ff_vf_despill; ++extern const AVFilter ff_vf_deinterlace_v4l2m2m; + extern const AVFilter ff_vf_detelecine; + extern const AVFilter ff_vf_dilation; + extern const AVFilter ff_vf_dilation_opencl; +diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c +new file mode 100644 +index 0000000000..1a933b7e0a +--- /dev/null ++++ b/libavfilter/vf_deinterlace_v4l2m2m.c +@@ -0,0 +1,1269 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * deinterlace video filter - V4L2 M2M ++ */ ++ ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "libavutil/avassert.h" ++#include "libavutil/avstring.h" ++#include "libavutil/common.h" ++#include "libavutil/hwcontext.h" ++#include "libavutil/hwcontext_drm.h" ++#include "libavutil/internal.h" ++#include "libavutil/mathematics.h" ++#include "libavutil/opt.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/time.h" ++ ++#define FF_INTERNAL_FIELDS 1 ++#include "framequeue.h" ++#include "filters.h" ++#include "avfilter.h" ++#include "formats.h" ++#include "internal.h" ++#include "video.h" ++ ++typedef struct V4L2Queue V4L2Queue; ++typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; ++ ++typedef struct V4L2PlaneInfo { ++ int bytesperline; ++ size_t length; ++} V4L2PlaneInfo; ++ ++typedef struct V4L2Buffer { ++ int enqueued; ++ int reenqueue; ++ int fd; ++ struct v4l2_buffer buffer; ++ AVFrame frame; ++ struct v4l2_plane planes[VIDEO_MAX_PLANES]; ++ int num_planes; ++ V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES]; ++ AVDRMFrameDescriptor drm_frame; ++ V4L2Queue *q; ++} V4L2Buffer; ++ ++typedef struct V4L2Queue { ++ struct v4l2_format format; ++ int num_buffers; ++ V4L2Buffer *buffers; ++ DeintV4L2M2MContextShared *ctx; ++} V4L2Queue; ++ ++typedef struct pts_stats_s ++{ ++ void * logctx; ++ const char * name; // For debug ++ unsigned int last_count; ++ unsigned int last_interval; ++ int64_t last_pts; ++} pts_stats_t; ++ ++#define PTS_TRACK_SIZE 32 ++typedef struct pts_track_el_s ++{ ++ uint32_t n; ++ unsigned int interval; ++ AVFrame * props; ++} pts_track_el_t; ++ ++typedef struct pts_track_s ++{ ++ uint32_t n; ++ 
uint32_t last_n; ++ int got_2; ++ void * logctx; ++ pts_stats_t stats; ++ pts_track_el_t a[PTS_TRACK_SIZE]; ++} pts_track_t; ++ ++typedef struct DeintV4L2M2MContextShared { ++ void * logctx; // For logging - will be NULL when done ++ ++ int fd; ++ int done; + int width; + int height; -+} test; ++ int orig_width; ++ int orig_height; ++ atomic_uint refcount; + -+typedef struct matrix { -+ size_t width; -+ size_t height; -+ float d[]; -+} matrix; ++ AVBufferRef *hw_frames_ctx; + -+static const matrix T8 = { 8, 8, { -+ 12, 12, 12, 12, 12, 12, 12, 12, -+ 16, 15, 9, 4, -4, -9, -15, -16, -+ 16, 6, -6, -16, -16, -6, 6, 16, -+ 15, -4, -16, -9, 9, 16, 4, -15, -+ 12, -12, -12, 12, 12, -12, -12, 12, -+ 9, -16, 4, 15, -15, -4, 16, -9, -+ 6, -16, 16, -6, -6, 16, -16, 6, -+ 4, -9, 15, -16, 16, -15, 9, -4 -+} }; ++ unsigned int field_order; + -+static const matrix T4 = { 4, 4, { -+ 17, 17, 17, 17, -+ 22, 10, -10, -22, -+ 17, -17, -17, 17, -+ 10, -22, 22, -10 -+} }; ++ pts_track_t track; + -+static const matrix T8t = { 8, 8, { -+ 12, 16, 16, 15, 12, 9, 6, 4, -+ 12, 15, 6, -4, -12, -16, -16, -9, -+ 12, 9, -6, -16, -12, 4, 16, 15, -+ 12, 4, -16, -9, 12, 15, -6, -16, -+ 12, -4, -16, 9, 12, -15, -6, 16, -+ 12, -9, -6, 16, -12, -4, 16, -15, -+ 12, -15, 6, 4, -12, 16, -16, 9, -+ 12, -16, 16, -15, 12, -9, 6, -4 -+} }; ++ V4L2Queue output; ++ V4L2Queue capture; ++} DeintV4L2M2MContextShared; + -+static const matrix T4t = { 4, 4, { -+ 17, 22, 17, 10, -+ 17, 10, -17, -22, -+ 17, -10, -17, 22, -+ 17, -22, 17, -10 -+} }; ++typedef struct DeintV4L2M2MContext { ++ const AVClass *class; + -+static matrix *new_matrix(size_t width, size_t height) ++ DeintV4L2M2MContextShared *shared; ++} DeintV4L2M2MContext; ++ ++static unsigned int pts_stats_interval(const pts_stats_t * const stats) +{ -+ matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float)); -+ if (out == NULL) { -+ fprintf(stderr, "Memory allocation failure\n"); -+ exit(EXIT_FAILURE); ++ return stats->last_interval; ++} ++ 
++// Pick 64 for max last count - that is >1sec at 60fps ++#define STATS_LAST_COUNT_MAX 64 ++#define STATS_INTERVAL_MAX (1 << 30) ++static void pts_stats_add(pts_stats_t * const stats, int64_t pts) ++{ ++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { ++ if (stats->last_count < STATS_LAST_COUNT_MAX) ++ ++stats->last_count; ++ return; + } -+ out->width = width; -+ out->height = height; -+ return out; -+} + -+static matrix *multiply(const matrix *a, const matrix *b) -+{ -+ matrix *out; -+ if (a->width != b->height) { -+ fprintf(stderr, "Incompatible multiplication\n"); -+ exit(EXIT_FAILURE); ++ if (stats->last_pts != AV_NOPTS_VALUE) { ++ const int64_t interval = pts - stats->last_pts; ++ ++ if (interval < 0 || interval >= STATS_INTERVAL_MAX || ++ stats->last_count >= STATS_LAST_COUNT_MAX) { ++ if (stats->last_interval != 0) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", ++ __func__, stats->name, interval, stats->last_count); ++ stats->last_interval = 0; ++ } ++ else { ++ const int64_t frame_time = interval / (int64_t)stats->last_count; ++ ++ if (frame_time != stats->last_interval) ++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", ++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); ++ stats->last_interval = frame_time; ++ } + } -+ out = new_matrix(b->width, a->height); -+ for (int j = 0; j < out->height; ++j) -+ for (int i = 0; i < out->width; ++i) { -+ float sum = 0; -+ for (int k = 0; k < a->width; ++k) -+ sum += a->d[j * a->width + k] * b->d[k * b->width + i]; -+ out->d[j * out->width + i] = sum; -+ } -+ return out; ++ ++ stats->last_pts = pts; ++ stats->last_count = 1; +} + -+static void normalise(matrix *a) ++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) +{ -+ for (int j = 0; j < a->height; ++j) -+ for (int i = 0; i < a->width; ++i) { -+ float *p = a->d + j * a->width + i; -+ *p *= 64; -+ if 
(a->height == 4) -+ *p /= (const unsigned[]) { 289, 292, 289, 292 } [j]; -+ else -+ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j]; -+ if (a->width == 4) -+ *p /= (const unsigned[]) { 289, 292, 289, 292 } [i]; -+ else -+ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i]; -+ } ++ *stats = (pts_stats_t){ ++ .logctx = logctx, ++ .name = name, ++ .last_count = 1, ++ .last_interval = 0, ++ .last_pts = AV_NOPTS_VALUE ++ }; +} + -+static void divide_and_round_nearest(matrix *a, float by) ++static inline uint32_t pts_track_next_n(pts_track_t * const trk) +{ -+ for (int j = 0; j < a->height; ++j) -+ for (int i = 0; i < a->width; ++i) { -+ float *p = a->d + j * a->width + i; -+ *p = rintf(*p / by); -+ } ++ if (++trk->n == 0) ++ trk->n = 1; ++ return trk->n; +} + -+static void tweak(matrix *a) ++static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst) +{ -+ for (int j = 4; j < a->height; ++j) -+ for (int i = 0; i < a->width; ++i) { -+ float *p = a->d + j * a->width + i; -+ *p += 1; -+ } ++ uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000); ++ pts_track_el_t * t; ++ ++ // As a first guess assume that n==0 means last frame ++ if (n == 0) { ++ n = trk->last_n; ++ if (n == 0) ++ goto fail; ++ } ++ ++ t = trk->a + (n & (PTS_TRACK_SIZE - 1)); ++ ++ if (t->n != n) { ++ av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n); ++ goto fail; ++ } ++ ++ // 1st frame is simple - just believe it ++ if (n != trk->last_n) { ++ trk->last_n = n; ++ trk->got_2 = 0; ++ return av_frame_copy_props(dst, t->props); ++ } ++ ++ // Only believe in a single interpolated frame ++ if (trk->got_2) ++ goto fail; ++ trk->got_2 = 1; ++ ++ av_frame_copy_props(dst, t->props); ++ ++ ++ // If we can't guess - don't ++ if (t->interval == 0) { ++ dst->best_effort_timestamp = AV_NOPTS_VALUE; ++ dst->pts = AV_NOPTS_VALUE; ++ dst->pkt_dts = AV_NOPTS_VALUE; ++ } ++ else { ++ 
if (dst->best_effort_timestamp != AV_NOPTS_VALUE) ++ dst->best_effort_timestamp += t->interval / 2; ++ if (dst->pts != AV_NOPTS_VALUE) ++ dst->pts += t->interval / 2; ++ if (dst->pkt_dts != AV_NOPTS_VALUE) ++ dst->pkt_dts += t->interval / 2; ++ } ++ ++ return 0; ++ ++fail: ++ trk->last_n = 0; ++ trk->got_2 = 0; ++ dst->pts = AV_NOPTS_VALUE; ++ dst->pkt_dts = AV_NOPTS_VALUE; ++ return 0; +} + -+/* The VC-1 spec places restrictions on the values permitted at three -+ * different stages: -+ * - D: the input coefficients in frequency domain -+ * - E: the intermediate coefficients, inverse-transformed only horizontally -+ * - R: the fully inverse-transformed coefficients ++static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src) ++{ ++ const uint32_t n = pts_track_next_n(trk); ++ pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1)); ++ ++ pts_stats_add(&trk->stats, src->pts); ++ ++ t->n = n; ++ t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last ++ av_frame_unref(t->props); ++ av_frame_copy_props(t->props, src); ++ ++ // We now know what the previous interval was, rather than having to guess, ++ // so set it. There is a better than decent chance that this is before ++ // we use it. 
++ if (t->interval != 0) { ++ pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1)); ++ prev_t->interval = t->interval; ++ } ++ ++ // In case deinterlace interpolates frames use every other usec ++ return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2}; ++} ++ ++static void pts_track_uninit(pts_track_t * const trk) ++{ ++ unsigned int i; ++ for (i = 0; i != PTS_TRACK_SIZE; ++i) { ++ trk->a[i].n = 0; ++ av_frame_free(&trk->a[i].props); ++ } ++} ++ ++static int pts_track_init(pts_track_t * const trk, void *logctx) ++{ ++ unsigned int i; ++ trk->n = 1; ++ pts_stats_init(&trk->stats, logctx, "track"); ++ for (i = 0; i != PTS_TRACK_SIZE; ++i) { ++ trk->a[i].n = 0; ++ if ((trk->a[i].props = av_frame_alloc()) == NULL) { ++ pts_track_uninit(trk); ++ return AVERROR(ENOMEM); ++ } ++ } ++ return 0; ++} ++ ++static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) ++{ ++ struct v4l2_capability cap; ++ int ret; ++ ++ memset(&cap, 0, sizeof(cap)); ++ ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap); ++ if (ret < 0) ++ return ret; ++ ++ if (!(cap.capabilities & V4L2_CAP_STREAMING)) ++ return AVERROR(EINVAL); ++ ++ if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { ++ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ ++ return 0; ++ } ++ ++ if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { ++ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; ++ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ ++ return 0; ++ } ++ ++ return AVERROR(EINVAL); ++} ++ ++static int deint_v4l2m2m_try_format(V4L2Queue *queue) ++{ ++ struct v4l2_format *fmt = &queue->format; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ int ret, field; ++ ++ ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt); ++ if (ret) ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret); ++ ++ if (V4L2_TYPE_IS_OUTPUT(fmt->type)) ++ field = V4L2_FIELD_INTERLACED_TB; ++ else ++ field = 
V4L2_FIELD_NONE; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420; ++ fmt->fmt.pix_mp.field = field; ++ fmt->fmt.pix_mp.width = ctx->width; ++ fmt->fmt.pix_mp.height = ctx->height; ++ } else { ++ fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420; ++ fmt->fmt.pix.field = field; ++ fmt->fmt.pix.width = ctx->width; ++ fmt->fmt.pix.height = ctx->height; ++ } ++ ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, ++ fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, ++ fmt->fmt.pix_mp.pixelformat, ++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); ++ ++ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt); ++ if (ret) ++ return AVERROR(EINVAL); ++ ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, ++ fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, ++ fmt->fmt.pix_mp.pixelformat, ++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 || ++ fmt->fmt.pix_mp.field != field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); ++ ++ return AVERROR(EINVAL); ++ } ++ } else { ++ if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 || ++ fmt->fmt.pix.field != field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); ++ ++ return AVERROR(EINVAL); ++ } ++ } ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, int pitch, int ysize) ++{ ++ struct v4l2_format *fmt = &queue->format; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ int ret; ++ ++ struct v4l2_selection sel = { ++ .type = fmt->type, ++ .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS, ++ }; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.field = field; ++ fmt->fmt.pix_mp.width = width; ++ fmt->fmt.pix_mp.height = ysize / pitch; ++ fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch; ++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1); ++ } else { ++ fmt->fmt.pix.field = field; ++ fmt->fmt.pix.width = width; ++ fmt->fmt.pix.height = height; ++ fmt->fmt.pix.sizeimage = 0; ++ fmt->fmt.pix.bytesperline = 0; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); ++ if (ret) ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); ++ ++ ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel); ++ if (ret) ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_SELECTION failed: %d\n", ret); ++ ++ sel.r.width = width; ++ sel.r.height = height; ++ sel.r.left = 0; ++ sel.r.top = 0; ++ sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, ++ sel.flags = V4L2_SEL_FLAG_LE; ++ ++ ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel); ++ if (ret) ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_SELECTION failed: %d\n", ret); ++ ++ return ret; ++} ++ ++static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) ++{ ++ int ret; ++ ++ ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0); ++ if (ctx->fd < 0) ++ return AVERROR(errno); ++ ++ ret = deint_v4l2m2m_prepare_context(ctx); ++ if (ret) ++ goto fail; ++ ++ ret = deint_v4l2m2m_try_format(&ctx->capture); ++ if (ret) ++ goto fail; ++ ++ ret = deint_v4l2m2m_try_format(&ctx->output); ++ if (ret) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ close(ctx->fd); ++ ctx->fd = -1; ++ ++ return ret; ++} ++ ++static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx) ++{ ++ int ret = AVERROR(EINVAL); ++ struct dirent *entry; ++ char node[PATH_MAX]; ++ DIR *dirp; ++ ++ dirp = opendir("/dev"); ++ if (!dirp) ++ return AVERROR(errno); ++ ++ for (entry = readdir(dirp); entry; entry = readdir(dirp)) { 
++ ++ if (strncmp(entry->d_name, "video", 5)) ++ continue; ++ ++ snprintf(node, sizeof(node), "/dev/%s", entry->d_name); ++ av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node); ++ ret = deint_v4l2m2m_probe_device(ctx, node); ++ if (!ret) ++ break; ++ } ++ ++ closedir(dirp); ++ ++ if (ret) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n"); ++ ctx->fd = -1; ++ ++ return ret; ++ } ++ ++ av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node); ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) ++{ ++ int ret; ++ ++ ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ buf->enqueued = 1; ++ ++ return 0; ++} ++ ++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) ++{ ++ struct v4l2_exportbuffer expbuf; ++ int i, ret; ++ ++ for (i = 0; i < avbuf->num_planes; i++) { ++ memset(&expbuf, 0, sizeof(expbuf)); ++ ++ expbuf.index = avbuf->buffer.index; ++ expbuf.type = avbuf->buffer.type; ++ expbuf.plane = i; ++ ++ ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ avbuf->fd = expbuf.fd; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) { ++ /* drm frame */ ++ avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; ++ avbuf->drm_frame.objects[i].fd = expbuf.fd; ++ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ } else { ++ /* drm frame */ ++ avbuf->drm_frame.objects[0].size = avbuf->buffer.length; ++ avbuf->drm_frame.objects[0].fd = expbuf.fd; ++ avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ } ++ } ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) ++{ ++ struct v4l2_format *fmt = &queue->format; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ struct v4l2_requestbuffers req; ++ int ret, i, j, multiplanar; ++ uint32_t memory; ++ ++ memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
++ V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; ++ ++ multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type); ++ ++ memset(&req, 0, sizeof(req)); ++ req.count = queue->num_buffers; ++ req.memory = memory; ++ req.type = fmt->type; ++ ++ ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req); ++ if (ret < 0) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno)); ++ ++ return AVERROR(errno); ++ } ++ ++ queue->num_buffers = req.count; ++ queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer)); ++ if (!queue->buffers) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n"); ++ ++ return AVERROR(ENOMEM); ++ } ++ ++ for (i = 0; i < queue->num_buffers; i++) { ++ V4L2Buffer *buf = &queue->buffers[i]; ++ ++ buf->enqueued = 0; ++ buf->fd = -1; ++ buf->q = queue; ++ ++ buf->buffer.type = fmt->type; ++ buf->buffer.memory = memory; ++ buf->buffer.index = i; ++ ++ if (multiplanar) { ++ buf->buffer.length = VIDEO_MAX_PLANES; ++ buf->buffer.m.planes = buf->planes; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); ++ if (ret < 0) { ++ ret = AVERROR(errno); ++ ++ goto fail; ++ } ++ ++ if (multiplanar) ++ buf->num_planes = buf->buffer.length; ++ else ++ buf->num_planes = 1; ++ ++ for (j = 0; j < buf->num_planes; j++) { ++ V4L2PlaneInfo *info = &buf->plane_info[j]; ++ ++ if (multiplanar) { ++ info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline; ++ info->length = buf->buffer.m.planes[j].length; ++ } else { ++ info->bytesperline = fmt->fmt.pix.bytesperline; ++ info->length = buf->buffer.length; ++ } ++ } ++ ++ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { ++ ret = deint_v4l2m2m_enqueue_buffer(buf); ++ if (ret) ++ goto fail; ++ ++ ret = v4l2_buffer_export_drm(buf); ++ if (ret) ++ goto fail; ++ } ++ } ++ ++ return 0; ++ ++fail: ++ for (i = 0; i < queue->num_buffers; i++) ++ if (queue->buffers[i].fd >= 0) ++ close(queue->buffers[i].fd); ++ av_free(queue->buffers); ++ queue->buffers = NULL; ++ ++ return ret; ++} ++ ++static int 
deint_v4l2m2m_streamon(V4L2Queue *queue) ++{ ++ DeintV4L2M2MContextShared * const ctx = queue->ctx; ++ int type = queue->format.type; ++ int ret; ++ ++ ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type); ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ return 0; ++} ++ ++static int deint_v4l2m2m_streamoff(V4L2Queue *queue) ++{ ++ DeintV4L2M2MContextShared * const ctx = queue->ctx; ++ int type = queue->format.type; ++ int ret; ++ ++ ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type); ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ return 0; ++} ++ ++// timeout in ms ++static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout) ++{ ++ struct v4l2_plane planes[VIDEO_MAX_PLANES]; ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ struct v4l2_buffer buf = { 0 }; ++ V4L2Buffer* avbuf = NULL; ++ struct pollfd pfd; ++ short events; ++ int ret; ++ ++ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) ++ events = POLLOUT | POLLWRNORM; ++ else ++ events = POLLIN | POLLRDNORM; ++ ++ pfd.events = events; ++ pfd.fd = ctx->fd; ++ ++ for (;;) { ++ ret = poll(&pfd, 1, timeout); ++ if (ret > 0) ++ break; ++ if (errno == EINTR) ++ continue; ++ return NULL; ++ } ++ ++ if (pfd.revents & POLLERR) ++ return NULL; ++ ++ if (pfd.revents & events) { ++ memset(&buf, 0, sizeof(buf)); ++ buf.memory = V4L2_MEMORY_MMAP; ++ buf.type = queue->format.type; ++ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { ++ memset(planes, 0, sizeof(planes)); ++ buf.length = VIDEO_MAX_PLANES; ++ buf.m.planes = planes; ++ } ++ ++ ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf); ++ if (ret) { ++ if (errno != EAGAIN) ++ av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n", ++ av_err2str(AVERROR(errno))); ++ return NULL; ++ } ++ ++ avbuf = &queue->buffers[buf.index]; ++ avbuf->enqueued = 0; ++ 
avbuf->buffer = buf; ++ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { ++ memcpy(avbuf->planes, planes, sizeof(planes)); ++ avbuf->buffer.m.planes = avbuf->planes; ++ } ++ return avbuf; ++ } ++ ++ return NULL; ++} ++ ++static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue) ++{ ++ int i; ++ V4L2Buffer *buf = NULL; ++ ++ for (i = 0; i < queue->num_buffers; i++) ++ if (!queue->buffers[i].enqueued) { ++ buf = &queue->buffers[i]; ++ break; ++ } ++ return buf; ++} ++ ++static void deint_v4l2m2m_unref_queued(V4L2Queue *queue) ++{ ++ int i; ++ V4L2Buffer *buf = NULL; ++ ++ if (!queue || !queue->buffers) ++ return; ++ for (i = 0; i < queue->num_buffers; i++) { ++ buf = &queue->buffers[i]; ++ if (queue->buffers[i].enqueued) ++ av_frame_unref(&buf->frame); ++ } ++} ++ ++static void recycle_q(V4L2Queue * const queue) ++{ ++ V4L2Buffer* avbuf; ++ while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) { ++ av_frame_unref(&avbuf->frame); ++ } ++} ++ ++static int count_enqueued(V4L2Queue *queue) ++{ ++ int i; ++ int n = 0; ++ ++ if (queue->buffers == NULL) ++ return 0; ++ ++ for (i = 0; i < queue->num_buffers; i++) ++ if (queue->buffers[i].enqueued) ++ ++n; ++ return n; ++} ++ ++static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame) ++{ ++ DeintV4L2M2MContextShared *const ctx = queue->ctx; ++ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0]; ++ V4L2Buffer *buf; ++ int i; ++ ++ if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) ++ recycle_q(queue); ++ ++ buf = deint_v4l2m2m_find_free_buf(queue); ++ if (!buf) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0); ++ return AVERROR(EAGAIN); ++ } ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) ++ for (i = 0; i < drm_desc->nb_objects; i++) ++ buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd; ++ else ++ buf->buffer.m.fd = drm_desc->objects[0].fd; ++ ++ buf->buffer.field = !frame->interlaced_frame ? 
V4L2_FIELD_NONE : ++ frame->top_field_first ? V4L2_FIELD_INTERLACED_TB : ++ V4L2_FIELD_INTERLACED_BT; ++ ++ if (ctx->field_order != buf->buffer.field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field); ++ ctx->field_order = buf->buffer.field; ++ } ++ ++ buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame); ++ ++ buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd; ++ ++ av_frame_move_ref(&buf->frame, frame); ++ ++ return deint_v4l2m2m_enqueue_buffer(buf); ++} ++ ++static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) ++{ ++ if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { ++ V4L2Queue *capture = &ctx->capture; ++ V4L2Queue *output = &ctx->output; ++ int i; ++ ++ av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); ++ ++ if (ctx->fd >= 0) { ++ deint_v4l2m2m_streamoff(capture); ++ deint_v4l2m2m_streamoff(output); ++ } ++ ++ if (capture->buffers) ++ for (i = 0; i < capture->num_buffers; i++) { ++ capture->buffers[i].q = NULL; ++ if (capture->buffers[i].fd >= 0) ++ close(capture->buffers[i].fd); ++ } ++ ++ deint_v4l2m2m_unref_queued(output); ++ ++ av_buffer_unref(&ctx->hw_frames_ctx); ++ ++ if (capture->buffers) ++ av_free(capture->buffers); ++ ++ if (output->buffers) ++ av_free(output->buffers); ++ ++ if (ctx->fd >= 0) { ++ close(ctx->fd); ++ ctx->fd = -1; ++ } ++ ++ av_free(ctx); ++ } ++} ++ ++static void v4l2_free_buffer(void *opaque, uint8_t *unused) ++{ ++ V4L2Buffer *buf = opaque; ++ DeintV4L2M2MContextShared *ctx = buf->q->ctx; ++ ++ if (!ctx->done) ++ deint_v4l2m2m_enqueue_buffer(buf); ++ ++ deint_v4l2m2m_destroy_context(ctx); ++} ++ ++static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) ++{ ++ int av_pix_fmt = AV_PIX_FMT_YUV420P; ++ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; ++ AVDRMLayerDescriptor *layer; ++ ++ /* fill the DRM frame descriptor */ ++ drm_desc->nb_objects = avbuf->num_planes; ++ drm_desc->nb_layers = 1; ++ ++ 
layer = &drm_desc->layers[0]; ++ layer->nb_planes = avbuf->num_planes; ++ ++ for (int i = 0; i < avbuf->num_planes; i++) { ++ layer->planes[i].object_index = i; ++ layer->planes[i].offset = 0; ++ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; ++ } ++ ++ switch (av_pix_fmt) { ++ case AV_PIX_FMT_YUYV422: ++ ++ layer->format = DRM_FORMAT_YUYV; ++ layer->nb_planes = 1; ++ ++ break; ++ ++ case AV_PIX_FMT_NV12: ++ case AV_PIX_FMT_NV21: ++ ++ layer->format = av_pix_fmt == AV_PIX_FMT_NV12 ? ++ DRM_FORMAT_NV12 : DRM_FORMAT_NV21; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 2; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; ++ break; ++ ++ case AV_PIX_FMT_YUV420P: ++ ++ layer->format = DRM_FORMAT_YUV420; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 3; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ ++ layer->planes[2].object_index = 0; ++ layer->planes[2].offset = layer->planes[1].offset + ++ ((avbuf->plane_info[0].bytesperline * ++ height) >> 2); ++ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ break; ++ ++ default: ++ drm_desc->nb_layers = 0; ++ break; ++ } ++ ++ return (uint8_t *) drm_desc; ++} ++ ++// timeout in ms ++static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout) ++{ ++ DeintV4L2M2MContextShared *ctx = queue->ctx; ++ V4L2Buffer* avbuf; ++ ++ av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ++ avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); ++ if (!avbuf) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout); ++ return AVERROR(EAGAIN); ++ } ++ ++ // Fill in PTS and anciliary info from src frame ++ // we will want to 
overwrite some fields as only the pts/dts ++ // fields are updated with new timing in this fn ++ pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); ++ ++ frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame, ++ sizeof(avbuf->drm_frame), v4l2_free_buffer, ++ avbuf, AV_BUFFER_FLAG_READONLY); ++ if (!frame->buf[0]) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0); ++ return AVERROR(ENOMEM); ++ } ++ ++ atomic_fetch_add(&ctx->refcount, 1); ++ ++ frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ if (ctx->hw_frames_ctx) ++ frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); ++ frame->height = ctx->height; ++ frame->width = ctx->width; ++ ++ // Not interlaced now ++ frame->interlaced_frame = 0; ++ frame->top_field_first = 0; ++ // Pkt duration halved ++ frame->pkt_duration /= 2; ++ ++ if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n"); ++ frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM; ++ } ++ ++ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts); ++ return 0; ++} ++ ++static int deint_v4l2m2m_config_props(AVFilterLink *outlink) ++{ ++ AVFilterLink *inlink = outlink->src->inputs[0]; ++ AVFilterContext *avctx = outlink->src; ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ int ret; ++ ++ ctx->height = avctx->inputs[0]->h; ++ ctx->width = avctx->inputs[0]->w; ++ ++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height); ++ ++ outlink->time_base = inlink->time_base; ++ outlink->w = inlink->w; ++ outlink->h = inlink->h; ++ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; ++ outlink->format = inlink->format; ++ outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate ++ ++ ret = deint_v4l2m2m_find_device(ctx); ++ if (ret) ++ return ret; ++ ++ 
if (inlink->hw_frames_ctx) { ++ ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx); ++ if (!ctx->hw_frames_ctx) ++ return AVERROR(ENOMEM); ++ } ++ return 0; ++} ++ ++static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) ++{ ++ AVFilterContext *avctx = link->dst; ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ V4L2Queue *capture = &ctx->capture; ++ V4L2Queue *output = &ctx->output; ++ int ret; ++ ++ av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n", ++ __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); ++ av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__, ++ avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); ++ ++ if (ctx->field_order == V4L2_FIELD_ANY) { ++ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0]; ++ ctx->orig_width = drm_desc->layers[0].planes[0].pitch; ++ ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; ++ ++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height, ++ drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); ++ ++ if (in->top_field_first) ++ ctx->field_order = V4L2_FIELD_INTERLACED_TB; ++ else ++ ctx->field_order = V4L2_FIELD_INTERLACED_BT; ++ ++ ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_allocate_buffers(capture); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_streamon(capture); ++ if (ret) ++ 
return ret; ++ ++ ret = deint_v4l2m2m_allocate_buffers(output); ++ if (ret) ++ return ret; ++ ++ ret = deint_v4l2m2m_streamon(output); ++ if (ret) ++ return ret; ++ } ++ ++ ret = deint_v4l2m2m_enqueue_frame(output, in); ++ ++ av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret)); ++ return ret; ++} ++ ++static int deint_v4l2m2m_activate(AVFilterContext *avctx) ++{ ++ DeintV4L2M2MContext * const priv = avctx->priv; ++ DeintV4L2M2MContextShared *const s = priv->shared; ++ AVFilterLink * const outlink = avctx->outputs[0]; ++ AVFilterLink * const inlink = avctx->inputs[0]; ++ int n = 0; ++ int cn = 99; ++ int instatus = 0; ++ int64_t inpts = 0; ++ int did_something = 0; ++ ++ av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ++ FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx); ++ ++ ff_inlink_acknowledge_status(inlink, &instatus, &inpts); ++ ++ if (!ff_outlink_frame_wanted(outlink)) { ++ av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__); ++ } ++ else if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! ++ { ++ AVFrame * frame = av_frame_alloc(); ++ int rv; ++ ++again: ++ recycle_q(&s->output); ++ n = count_enqueued(&s->output); ++ ++ if (frame == NULL) { ++ av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__); ++ return AVERROR(ENOMEM); ++ } ++ ++ rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 
300 : 0); ++ if (rv != 0) { ++ av_frame_free(&frame); ++ if (rv != AVERROR(EAGAIN)) { ++ av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv)); ++ return rv; ++ } ++ } ++ else { ++ frame->interlaced_frame = 0; ++ // frame is always consumed by filter_frame - even on error despite ++ // a somewhat confusing comment in the header ++ rv = ff_filter_frame(outlink, frame); ++ ++ if (instatus != 0) { ++ av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__); ++ goto again; ++ } ++ ++ av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv)); ++ did_something = 1; ++ } ++ ++ cn = count_enqueued(&s->capture); ++ } ++ ++ if (instatus != 0) { ++ ff_outlink_set_status(outlink, instatus, inpts); ++ av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus)); ++ return 0; ++ } ++ ++ { ++ AVFrame * frame; ++ int rv; ++ ++ recycle_q(&s->output); ++ n = count_enqueued(&s->output); ++ ++ while (n < 6) { ++ if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { ++ av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); ++ return rv; ++ } ++ ++ if (frame == NULL) { ++ av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); ++ break; ++ } ++ ++ deint_v4l2m2m_filter_frame(inlink, frame); ++ av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); ++ ++n; ++ } ++ } ++ ++ if (n < 6) { ++ ff_inlink_request_frame(inlink); ++ did_something = 1; ++ av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); ++ } ++ ++ if (n > 4 && ff_outlink_frame_wanted(outlink)) { ++ ff_filter_set_ready(avctx, 1); ++ did_something = 1; ++ av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__); ++ } ++ ++ av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn); ++ return did_something ? 
0 : FFERROR_NOT_READY; ++} ++ ++static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) ++{ ++ DeintV4L2M2MContext * const priv = avctx->priv; ++ DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); ++ ++ if (!ctx) { ++ av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0); ++ return AVERROR(ENOMEM); ++ } ++ priv->shared = ctx; ++ ctx->logctx = priv; ++ ctx->fd = -1; ++ ctx->output.ctx = ctx; ++ ctx->output.num_buffers = 8; ++ ctx->capture.ctx = ctx; ++ ctx->capture.num_buffers = 12; ++ ctx->done = 0; ++ ctx->field_order = V4L2_FIELD_ANY; ++ ++ pts_track_init(&ctx->track, priv); ++ ++ atomic_init(&ctx->refcount, 1); ++ ++ return 0; ++} ++ ++static void deint_v4l2m2m_uninit(AVFilterContext *avctx) ++{ ++ DeintV4L2M2MContext *priv = avctx->priv; ++ DeintV4L2M2MContextShared *ctx = priv->shared; ++ ++ ctx->done = 1; ++ ctx->logctx = NULL; // Log to NULL works, log to missing crashes ++ pts_track_uninit(&ctx->track); ++ deint_v4l2m2m_destroy_context(ctx); ++} ++ ++static const AVOption deinterlace_v4l2m2m_options[] = { ++ { NULL }, ++}; ++ ++AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); ++ ++static const AVFilterPad deint_v4l2m2m_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ }, ++}; ++ ++static const AVFilterPad deint_v4l2m2m_outputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .config_props = deint_v4l2m2m_config_props, ++ }, ++}; ++ ++AVFilter ff_vf_deinterlace_v4l2m2m = { ++ .name = "deinterlace_v4l2m2m", ++ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"), ++ .priv_size = sizeof(DeintV4L2M2MContext), ++ .init = &deint_v4l2m2m_init, ++ .uninit = &deint_v4l2m2m_uninit, ++ FILTER_INPUTS(deint_v4l2m2m_inputs), ++ FILTER_OUTPUTS(deint_v4l2m2m_outputs), ++ FILTER_SINGLE_SAMPLEFMT(AV_PIX_FMT_DRM_PRIME), ++ .priv_class = &deinterlace_v4l2m2m_class, ++ .activate = deint_v4l2m2m_activate, ++}; + +From 6f07e022369785213d78b1f211e47adab47f4df2 Mon Sep 17 
00:00:00 2001 +From: John Cox +Date: Thu, 2 Dec 2021 17:49:55 +0000 +Subject: [PATCH 031/113] Put no_pts_rescale in context which makes more sense + than an arg + +--- + libavcodec/v4l2_buffers.c | 28 ++++++++++++++-------------- + libavcodec/v4l2_buffers.h | 5 ++--- + libavcodec/v4l2_context.c | 8 ++++---- + libavcodec/v4l2_context.h | 13 +++++++++---- + libavcodec/v4l2_m2m_dec.c | 9 +++++---- + 5 files changed, 34 insertions(+), 29 deletions(-) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 126d2a17f4..22da6bd722 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -39,7 +39,7 @@ + #define USEC_PER_SEC 1000000 + static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; + +-static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf) ++static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf) + { + return V4L2_TYPE_IS_OUTPUT(buf->context->type) ? + container_of(buf->context, V4L2m2mContext, output) : +@@ -51,34 +51,34 @@ static inline AVCodecContext *logger(V4L2Buffer *buf) + return buf_to_m2mctx(buf)->avctx; + } + +-static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) ++static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf) + { +- V4L2m2mContext *s = buf_to_m2mctx(avbuf); ++ const V4L2m2mContext *s = buf_to_m2mctx(avbuf); + const AVRational tb = s->avctx->pkt_timebase.num ? + s->avctx->pkt_timebase : + s->avctx->time_base; + return tb.num && tb.den ? tb : v4l2_timebase; + } + +-static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts, int no_rescale) ++static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts) + { + /* convert pts to v4l2 timebase */ + const int64_t v4l2_pts = +- no_rescale ? pts : ++ out->context->no_pts_rescale ? pts : + pts == AV_NOPTS_VALUE ? 
0 : + av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); + out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; + out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; + } + +-static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf, int no_rescale) ++static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf) + { + /* convert pts back to encoder timebase */ + const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + + avbuf->buf.timestamp.tv_usec; + + return +- no_rescale ? v4l2_pts : ++ avbuf->context->no_pts_rescale ? v4l2_pts : + v4l2_pts == 0 ? AV_NOPTS_VALUE : + av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); + } +@@ -686,13 +686,13 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); + v4l2_set_color_range(out, frame->color_range); + // PTS & interlace are buffer vars +- v4l2_set_pts(out, frame->pts, 0); ++ v4l2_set_pts(out, frame->pts); + v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); + + return v4l2_buffer_swframe_to_buf(frame, out); + } + +-int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_rescale_pts) ++int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) + { + int ret; + V4L2Context * const ctx = avbuf->context; +@@ -710,7 +710,7 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_resc + frame->colorspace = v4l2_get_color_space(avbuf); + frame->color_range = v4l2_get_color_range(avbuf); + frame->color_trc = v4l2_get_color_trc(avbuf); +- frame->pts = v4l2_get_pts(avbuf, no_rescale_pts); ++ frame->pts = v4l2_get_pts(avbuf); + frame->pkt_dts = AV_NOPTS_VALUE; + frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf); + frame->top_field_first = v4l2_buf_is_top_first(avbuf); +@@ -757,13 +757,13 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) + pkt->flags |= AV_PKT_FLAG_CORRUPT; + } + +- 
pkt->dts = pkt->pts = v4l2_get_pts(avbuf, 0); ++ pkt->dts = pkt->pts = v4l2_get_pts(avbuf); + + return 0; + } + + int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, +- const void *extdata, size_t extlen, int no_rescale_pts) ++ const void *extdata, size_t extlen) + { + int ret; + +@@ -777,7 +777,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, + if (ret && ret != AVERROR(ENOMEM)) + return ret; + +- v4l2_set_pts(out, pkt->pts, no_rescale_pts); ++ v4l2_set_pts(out, pkt->pts); + + if (pkt->flags & AV_PKT_FLAG_KEY) + out->flags = V4L2_BUF_FLAG_KEYFRAME; +@@ -787,7 +787,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, + + int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) + { +- return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); ++ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0); + } + + +diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h +index 111526aee3..641e0e147b 100644 +--- a/libavcodec/v4l2_buffers.h ++++ b/libavcodec/v4l2_buffers.h +@@ -83,12 +83,11 @@ typedef struct V4L2Buffer { + * + * @param[in] frame The AVFRame to push the information to + * @param[in] buf The V4L2Buffer to get the information from +- * @param[in] no_rescale_pts If non-zero do not rescale PTS + * + * @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect, + * AVERROR(ENOMEM) if the AVBufferRef can't be created. 
+ */ +-int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf, int no_rescale_pts); ++int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf); + + /** + * Extracts the data from a V4L2Buffer to an AVPacket +@@ -113,7 +112,7 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); + int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); + + int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, +- const void *extdata, size_t extlen, int no_rescale_pts); ++ const void *extdata, size_t extlen); + + /** + * Extracts the data from an AVFrame to a V4L2Buffer +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 964c17ac0e..9de5838256 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -808,7 +808,7 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) + } + + int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, +- const void * extdata, size_t extlen, int no_rescale_pts) ++ const void * extdata, size_t extlen) + { + V4L2m2mContext *s = ctx_to_m2mctx(ctx); + V4L2Buffer* avbuf; +@@ -827,7 +827,7 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, + if (!avbuf) + return AVERROR(EAGAIN); + +- ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts); ++ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen); + if (ret == AVERROR(ENOMEM)) + av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", + __func__, pkt->size, avbuf->planes[0].length); +@@ -837,7 +837,7 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, + return ff_v4l2_buffer_enqueue(avbuf); + } + +-int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, int no_rescale_pts) ++int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) + { + V4L2Buffer *avbuf; + +@@ -854,7 +854,7 @@ int 
ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, + return AVERROR(EAGAIN); + } + +- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf, no_rescale_pts); ++ return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); + } + + int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) +diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h +index 37b0431400..4cc164886c 100644 +--- a/libavcodec/v4l2_context.h ++++ b/libavcodec/v4l2_context.h +@@ -102,6 +102,13 @@ typedef struct V4L2Context { + */ + int done; + ++ /** ++ * PTS rescale not wanted ++ * If the PTS is just a dummy frame count then rescale is ++ * actively harmful ++ */ ++ int no_pts_rescale; ++ + AVBufferRef *frames_ref; + int q_count; + int dq_count; +@@ -172,12 +179,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); + * @param[in] ctx The V4L2Context to dequeue from. + * @param[inout] f The AVFrame to dequeue to. + * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) +- * @param[in] no_rescale_pts (0 rescale pts, 1 use pts as +- * timestamp directly) + * + * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. + */ +-int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int no_rescale_pts); ++int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); + + /** + * Enqueues a buffer to a V4L2Context from an AVPacket +@@ -189,7 +194,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int + * @param[in] pkt A pointer to an AVPacket. + * @return 0 in case of success, a negative error otherwise. 
+ */ +-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size, int no_rescale_pts); ++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size); + + /** + * Enqueues a buffer to a V4L2Context from an AVFrame +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index f05f2927e6..d341f14a3d 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -360,7 +360,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const + if (!s->draining) { + // Calling enqueue with an empty pkt starts drain + av_assert0(s->buf_pkt.size == 0); +- ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0, 1); ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); + if (ret) { + av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret); + return ret; +@@ -381,8 +381,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const + return ret; + + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, +- avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size, +- 1); ++ avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size); + + if (ret == AVERROR(EAGAIN)) { + // Out of input buffers - keep packet +@@ -442,7 +441,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + // when discarding + // This returns AVERROR(EAGAIN) if there isn't a frame ready yet + // but there is room in the input Q +- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1, 1); ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 
100 : -1); + + if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) + av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", +@@ -569,10 +568,12 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + output->av_codec_id = avctx->codec_id; + output->av_pix_fmt = AV_PIX_FMT_NONE; + output->min_buf_size = max_coded_size(avctx); ++ output->no_pts_rescale = 1; + + capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; + capture->av_pix_fmt = avctx->pix_fmt; + capture->min_buf_size = 0; ++ capture->no_pts_rescale = 1; + + /* the client requests the codec to generate DRM frames: + * - data[0] will therefore point to the returned AVDRMFrameDescriptor + +From 57ce5bd1dce97117cce25c215265582c6236784b Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 8 Dec 2021 15:00:37 +0000 +Subject: [PATCH 032/113] Use bitbuf min size for all streams + +--- + libavcodec/v4l2_m2m_dec.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index d341f14a3d..422a89edec 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -507,15 +507,12 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + } + #endif + ++// This heuristic is for H264 but use for everything + static uint32_t max_coded_size(const AVCodecContext * const avctx) + { + uint32_t wxh = avctx->coded_width * avctx->coded_height; + uint32_t size; + +- // Currently the only thing we try to set our own limits for is H264 +- if (avctx->codec_id != AV_CODEC_ID_H264) +- return 0; +- + size = wxh * 3 / 2; + // H.264 Annex A table A-1 gives minCR which is either 2 or 4 + // unfortunately that doesn't yield an actually useful limit + +From 71a71bd52921753815acf5ad5c4e3288597f1cb4 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Fri, 3 Dec 2021 12:54:18 +0000 +Subject: [PATCH 033/113] Track pending frames in v4l2 stateful + +Track which frames are pending decode in the v4l2 stateful decoder. 
+This relies on DTS & PTS having some relationship to reality, so +any use of this code must cope with the results being wrong. + +Also moves the xlat state vars out of the main context and into their +own structure. +--- + libavcodec/v4l2_m2m.h | 15 ++++-- + libavcodec/v4l2_m2m_dec.c | 100 +++++++++++++++++++++++++++++--------- + 2 files changed, 89 insertions(+), 26 deletions(-) + +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 82feb0afdb..3f86809623 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -44,8 +44,10 @@ + #define FF_V4L2_M2M_TRACK_SIZE 128 + typedef struct V4L2m2mTrackEl { + int discard; // If we see this buffer its been flushed, so discard ++ int pending; + int pkt_size; + int64_t pts; ++ int64_t dts; + int64_t reordered_opaque; + int64_t pkt_pos; + int64_t pkt_duration; +@@ -62,6 +64,14 @@ typedef struct pts_stats_s + int64_t guess; + } pts_stats_t; + ++typedef struct xlat_track_s { ++ unsigned int track_no; ++ int64_t last_pts; ++ int64_t last_pkt_dts; ++ int64_t last_opaque; ++ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; ++} xlat_track_t; ++ + typedef struct V4L2m2mContext { + char devname[PATH_MAX]; + int fd; +@@ -96,10 +106,7 @@ typedef struct V4L2m2mContext { + int output_drm; + + /* Frame tracking */ +- int64_t last_pkt_dts; +- int64_t last_opaque; +- unsigned int track_no; +- V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; ++ xlat_track_t xlat; + + pts_stats_t pts_stat; + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 422a89edec..859fecdb77 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -242,22 +242,24 @@ static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts + // buffer of all the things we want preserved (including the original PTS) + // indexed by the tracking no. 
+ static void +-xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt) ++xlat_pts_in(AVCodecContext *const avctx, xlat_track_t *const x, AVPacket *const avpkt) + { + int64_t track_pts; + + // Avoid 0 +- if (++s->track_no == 0) +- s->track_no = 1; ++ if (++x->track_no == 0) ++ x->track_no = 1; + +- track_pts = track_to_pts(avctx, s->track_no); ++ track_pts = track_to_pts(avctx, x->track_no); + +- av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no); +- s->last_pkt_dts = avpkt->dts; +- s->track_els[s->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ ++ av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); ++ x->last_pkt_dts = avpkt->dts; ++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ + .discard = 0, ++ .pending = 1, + .pkt_size = avpkt->size, + .pts = avpkt->pts, ++ .dts = avpkt->dts, + .reordered_opaque = avctx->reordered_opaque, + .pkt_pos = avpkt->pos, + .pkt_duration = avpkt->duration, +@@ -268,31 +270,36 @@ xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *cons + + // Returns -1 if we should discard the frame + static int +-xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame) ++xlat_pts_out(AVCodecContext *const avctx, ++ xlat_track_t * const x, ++ pts_stats_t * const ps, ++ AVFrame *const frame) + { + unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; +- const V4L2m2mTrackEl *const t = s->track_els + n; ++ V4L2m2mTrackEl *const t = x->track_els + n; + if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) + { + av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); + frame->pts = AV_NOPTS_VALUE; +- frame->pkt_dts = s->last_pkt_dts; +- frame->reordered_opaque = 
s->last_opaque; ++ frame->pkt_dts = x->last_pkt_dts; ++ frame->reordered_opaque = x->last_opaque; + frame->pkt_pos = -1; + frame->pkt_duration = 0; + frame->pkt_size = -1; + } + else if (!t->discard) + { +- frame->pts = t->pts; +- frame->pkt_dts = s->last_pkt_dts; ++ frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE; ++ frame->pkt_dts = x->last_pkt_dts; + frame->reordered_opaque = t->reordered_opaque; + frame->pkt_pos = t->pkt_pos; + frame->pkt_duration = t->pkt_duration; + frame->pkt_size = t->pkt_size; + +- s->last_opaque = s->track_els[n].reordered_opaque; +- s->track_els[n].pts = AV_NOPTS_VALUE; // If we hit this again deny accurate knowledge of PTS ++ x->last_opaque = x->track_els[n].reordered_opaque; ++ if (frame->pts != AV_NOPTS_VALUE) ++ x->last_pts = frame->pts; ++ t->pending = 0; + } + else + { +@@ -300,14 +307,62 @@ xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *cons + return -1; + } + +- pts_stats_add(&s->pts_stat, frame->pts); ++ pts_stats_add(ps, frame->pts); + +- frame->best_effort_timestamp = pts_stats_guess(&s->pts_stat); ++ frame->best_effort_timestamp = pts_stats_guess(ps); + frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? 
+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts); + return 0; + } + ++static void ++xlat_flush(xlat_track_t * const x) ++{ ++ unsigned int i; ++ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) { ++ x->track_els[i].pending = 0; ++ x->track_els[i].discard = 1; ++ } ++ x->last_pts = AV_NOPTS_VALUE; ++} ++ ++static void ++xlat_init(xlat_track_t * const x) ++{ ++ memset(x, 0, sizeof(*x)); ++ x->last_pts = AV_NOPTS_VALUE; ++} ++ ++static int ++xlat_pending(const xlat_track_t * const x) ++{ ++ unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; ++ unsigned int i; ++ int r = 0; ++ int64_t now = AV_NOPTS_VALUE; ++ ++ for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) { ++ const V4L2m2mTrackEl * const t = x->track_els + n; ++ ++ if (!t->pending) ++ continue; ++ ++ if (now == AV_NOPTS_VALUE) ++ now = t->dts; ++ ++ if (t->pts == AV_NOPTS_VALUE || ++ ((now == AV_NOPTS_VALUE || t->pts <= now) && ++ (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts))) ++ ++r; ++ } ++ ++ // If we never get any ideas about PTS vs DTS allow a lot more buffer ++ if (now == AV_NOPTS_VALUE) ++ r -= 16; ++ ++ return r; ++} ++ + static inline int stream_started(const V4L2m2mContext * const s) { + return s->capture.streamon && s->output.streamon; + } +@@ -374,7 +429,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const + return ret; + } + +- xlat_pts_in(avctx, s, &s->buf_pkt); ++ xlat_pts_in(avctx, &s->xlat, &s->buf_pkt); + } + + if ((ret = check_output_streamon(avctx, s)) != 0) +@@ -417,6 +472,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + int dst_rv = 1; // Non-zero (done), non-negative (error) number + + do { ++ av_log(avctx, AV_LOG_INFO, "Pending=%d\n", xlat_pending(&s->xlat)); + src_rv = try_enqueue_src(avctx, s); + + // If we got a frame last time and we have nothing to enqueue then +@@ -451,7 +507,7 @@ static int 
v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + s->draining, s->capture.done, dst_rv); + + // Go again if we got a frame that we need to discard +- } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame)); ++ } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame)); + } + + // Continue trying to enqueue packets if either +@@ -550,6 +606,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + if (ret < 0) + return ret; + ++ xlat_init(&s->xlat); + pts_stats_init(&s->pts_stat, avctx, "decoder"); + + capture = &s->capture; +@@ -632,7 +689,7 @@ static void v4l2_decode_flush(AVCodecContext *avctx) + V4L2m2mContext * const s = priv->context; + V4L2Context * const output = &s->output; + V4L2Context * const capture = &s->capture; +- int ret, i; ++ int ret; + + av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); + +@@ -646,8 +703,7 @@ static void v4l2_decode_flush(AVCodecContext *avctx) + + // V4L2 makes no guarantees about whether decoded frames are flushed or not + // so mark all frames we are tracking to be discarded if they appear +- for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) +- s->track_els[i].discard = 1; ++ xlat_flush(&s->xlat); + + // resend extradata + s->extdata_sent = 0; + +From 7931bfc9d3e4f30dcc3ab00557c3d0cb2d504516 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 15 Dec 2021 17:58:21 +0000 +Subject: [PATCH 034/113] Use pending tracking to reduce v4l2 latency + +If there are more than 5 pending decodes outstanding then add a small +timeout to the capture poll to reduce the rate at which frames are +added. 
+--- + libavcodec/v4l2_m2m_dec.c | 58 ++++++++++++++++++++++++--------------- + 1 file changed, 36 insertions(+), 22 deletions(-) + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 859fecdb77..4d2ef92cbe 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -370,16 +370,19 @@ static inline int stream_started(const V4L2m2mContext * const s) { + #define NQ_OK 0 + #define NQ_Q_FULL 1 + #define NQ_SRC_EMPTY 2 +-#define NQ_DRAINING 3 +-#define NQ_DEAD 4 ++#define NQ_NONE 3 ++#define NQ_DRAINING 4 ++#define NQ_DEAD 5 + + #define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) ++#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE) + + // AVERROR_EOF Flushing an already flushed stream + // -ve Error (all errors except EOF are unexpected) + // NQ_OK (0) OK + // NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now) + // NQ_SRC_EMPTY Src empty (do not retry) ++// NQ_NONE Enqueue not attempted + // NQ_DRAINING At EOS, dQ dest until EOS there too + // NQ_DEAD Not running (do not retry, do not attempt capture dQ) + +@@ -468,23 +471,28 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const + static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + { + V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; +- int src_rv; ++ int src_rv = NQ_NONE; + int dst_rv = 1; // Non-zero (done), non-negative (error) number ++ unsigned int i = 0; + + do { +- av_log(avctx, AV_LOG_INFO, "Pending=%d\n", xlat_pending(&s->xlat)); +- src_rv = try_enqueue_src(avctx, s); +- +- // If we got a frame last time and we have nothing to enqueue then +- // return now. 
rv will be AVERROR(EAGAIN) indicating that we want more input +- // This should mean that once decode starts we enter a stable state where +- // we alternately ask for input and produce output +- if (s->req_pkt && src_rv == NQ_SRC_EMPTY) +- break; +- +- if (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) { +- av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue fail"); +- src_rv = NQ_SRC_EMPTY; // If we can't enqueue pretend that there is nothing to enqueue ++ const int pending = xlat_pending(&s->xlat); ++ const int prefer_dq = (pending > 5); ++ ++ // Enqueue another pkt for decode if ++ // (a) We don't have a lot of stuff in the buffer already OR ++ // (b) ... we (think we) do but we've failed to get a frame already OR ++ // (c) We've dequeued a lot of frames without asking for input ++ if (!prefer_dq || i != 0 || s->req_pkt > 2) { ++ src_rv = try_enqueue_src(avctx, s); ++ ++ // If we got a frame last time or we've already tried to get a frame and ++ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) ++ // indicating that we want more input. ++ // This should mean that once decode starts we enter a stable state where ++ // we alternately ask for input and produce output ++ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) ++ break; + } + + // Try to get a new frame if +@@ -495,9 +503,9 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + // Dequeue frame will unref any previous contents of frame + // if it returns success so we don't need an explicit unref + // when discarding +- // This returns AVERROR(EAGAIN) if there isn't a frame ready yet +- // but there is room in the input Q +- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1); ++ // This returns AVERROR(EAGAIN) on timeout or if ++ // there is room in the input Q and timeout == -1 ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, prefer_dq ? 
5 : -1); + + if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) + av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", +@@ -510,10 +518,16 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame)); + } + ++ ++i; ++ if (i >= 256) { ++ av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i); ++ src_rv = AVERROR(EIO); ++ } ++ + // Continue trying to enqueue packets if either + // (a) we succeeded last time OR +- // (b) enqueue failed due to input Q full AND there is now room +- } while (src_rv == NQ_OK || (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) ); ++ // (b) we didn't ret a frame and we can retry the input ++ } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv))); + + // Ensure that the frame contains nothing if we aren't returning a frame + // (might happen when discarding) +@@ -521,7 +535,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + av_frame_unref(frame); + + // If we got a frame this time ask for a pkt next time +- s->req_pkt = (dst_rv == 0); ++ s->req_pkt = (dst_rv == 0) ? 
s->req_pkt + 1 : 0; + + #if 0 + if (dst_rv == 0) + +From 4425e0ed76c02fcd6dcf84e29c8e28a5aa7bbaf5 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 15 Dec 2021 12:23:54 +0000 +Subject: [PATCH 035/113] Allow logger() to take const ctx + +--- + libavcodec/v4l2_buffers.c | 2 +- + libavcodec/v4l2_context.c | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 22da6bd722..39c0094aec 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -46,7 +46,7 @@ static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf) + container_of(buf->context, V4L2m2mContext, capture); + } + +-static inline AVCodecContext *logger(V4L2Buffer *buf) ++static inline AVCodecContext *logger(const V4L2Buffer * const buf) + { + return buf_to_m2mctx(buf)->avctx; + } +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 9de5838256..9833e903c2 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -43,14 +43,14 @@ struct v4l2_format_update { + int update_avfmt; + }; + +-static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx) ++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) + { + return V4L2_TYPE_IS_OUTPUT(ctx->type) ? 
+ container_of(ctx, V4L2m2mContext, output) : + container_of(ctx, V4L2m2mContext, capture); + } + +-static inline AVCodecContext *logger(V4L2Context *ctx) ++static inline AVCodecContext *logger(const V4L2Context *ctx) + { + return ctx_to_m2mctx(ctx)->avctx; + } + +From be9d93d6adcb4d4286e39adde9f192eca7812d52 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 15 Dec 2021 13:00:27 +0000 +Subject: [PATCH 036/113] Track numbere of bufs qed with an atomic + +Safer and faster than counting status +--- + libavcodec/v4l2_buffers.c | 6 +++--- + libavcodec/v4l2_context.c | 3 ++- + libavcodec/v4l2_context.h | 3 +-- + 3 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 39c0094aec..2cf7be6632 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -922,6 +922,7 @@ fail: + int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) + { + int ret; ++ int qc; + + avbuf->buf.flags = avbuf->flags; + +@@ -941,11 +942,10 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) + return AVERROR(err); + } + +- ++avbuf->context->q_count; ++ qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1; + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", + avbuf->context->name, avbuf->buf.index, +- avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, +- avbuf->context->q_count); ++ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc); + + avbuf->status = V4L2BUF_IN_DRIVER; + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 9833e903c2..59cc7f0e76 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -599,7 +599,7 @@ static int v4l2_release_buffers(V4L2Context* ctx) + " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... 
)\n"); + } + } +- ctx->q_count = 0; ++ atomic_store(&ctx->q_count, 0); + + return ret; + } +@@ -1019,6 +1019,7 @@ int ff_v4l2_context_init(V4L2Context* ctx) + } + + ff_mutex_init(&ctx->lock, NULL); ++ atomic_init(&ctx->q_count, 0); + + if (s->output_drm) { + AVHWFramesContext *hwframes; +diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h +index 4cc164886c..a4176448d5 100644 +--- a/libavcodec/v4l2_context.h ++++ b/libavcodec/v4l2_context.h +@@ -110,8 +110,7 @@ typedef struct V4L2Context { + int no_pts_rescale; + + AVBufferRef *frames_ref; +- int q_count; +- int dq_count; ++ atomic_int q_count; + struct ff_weak_link_master *wl_master; + + AVMutex lock; + +From c9b50f79590acfc352d7023dba4263acfe7b1f29 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 9 Dec 2021 12:01:25 +0000 +Subject: [PATCH 037/113] Clear pkt_buf on flush + +--- + libavcodec/v4l2_m2m_dec.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 4d2ef92cbe..94a32a5eee 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -715,6 +715,9 @@ static void v4l2_decode_flush(AVCodecContext *avctx) + if (ret < 0) + av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); + ++ // Clear any buffered input packet ++ av_packet_unref(&s->buf_pkt); ++ + // V4L2 makes no guarantees about whether decoded frames are flushed or not + // so mark all frames we are tracking to be discarded if they appear + xlat_flush(&s->xlat); + +From bdf67cec94f343dbd6dc7708ee96ea645ae7d9cb Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 15 Dec 2021 12:52:56 +0000 +Subject: [PATCH 038/113] Rework v4l2 buffer dequeue + +--- + libavcodec/v4l2_context.c | 543 ++++++++++++++++++-------------------- + libavcodec/v4l2_context.h | 2 + + libavcodec/v4l2_m2m.c | 1 - + libavcodec/v4l2_m2m.h | 16 +- + libavcodec/v4l2_m2m_dec.c | 138 ++++------ + 5 files changed, 327 insertions(+), 373 deletions(-) + +diff --git 
a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 59cc7f0e76..fe21218710 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -73,19 +73,27 @@ static AVRational v4l2_get_sar(V4L2Context *ctx) + return sar; + } + +-static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2) ++static inline int ctx_buffers_alloced(const V4L2Context * const ctx) + { +- struct v4l2_format *fmt1 = &ctx->format; +- int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? +- fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || +- fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height +- : +- fmt1->fmt.pix.width != fmt2->fmt.pix.width || +- fmt1->fmt.pix.height != fmt2->fmt.pix.height; ++ return ctx->bufrefs != NULL; ++} ++ ++// Width/Height changed or we don't have an alloc in the first place? ++static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) ++{ ++ const struct v4l2_format *fmt1 = &ctx->format; ++ int ret = !ctx_buffers_alloced(ctx) || ++ (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? 
++ fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || ++ fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height ++ : ++ fmt1->fmt.pix.width != fmt2->fmt.pix.width || ++ fmt1->fmt.pix.height != fmt2->fmt.pix.height); + + if (ret) +- av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n", ++ av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n", + ctx->name, ++ ctx_buffers_alloced(ctx), + ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1), + ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2)); + +@@ -167,10 +175,8 @@ static int do_source_change(V4L2m2mContext * const s) + + int ret; + int reinit; +- int full_reinit; + struct v4l2_format cap_fmt = s->capture.format; + +- s->resize_pending = 0; + s->capture.done = 0; + + ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); +@@ -179,15 +185,21 @@ static int do_source_change(V4L2m2mContext * const s) + return 0; + } + +- s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); +- + get_default_selection(&s->capture, &s->capture.selection); + +- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); ++ reinit = ctx_resolution_changed(&s->capture, &cap_fmt); ++ s->capture.format = cap_fmt; + if (reinit) { + s->capture.height = ff_v4l2_get_format_height(&cap_fmt); + s->capture.width = ff_v4l2_get_format_width(&cap_fmt); + } ++ ++ // If we don't support selection (or it is bust) and we obviously have HD then kludge ++ if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) && ++ (s->capture.height == 1088 && s->capture.width == 1920)) { ++ s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080}; ++ } ++ + s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); + + av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n", +@@ -195,11 +207,11 @@ static int do_source_change(V4L2m2mContext * const s) + s->capture.selection.width, s->capture.selection.height, + s->capture.selection.left, 
s->capture.selection.top); + +- s->reinit = 1; +- + if (reinit) { + if (avctx) +- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); ++ ret = ff_set_dimensions(s->avctx, ++ s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width, ++ s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height); + if (ret < 0) + av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n"); + +@@ -208,11 +220,22 @@ static int do_source_change(V4L2m2mContext * const s) + av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); + return AVERROR(EINVAL); + } ++ ++ // Update pixel format - should only actually do something on initial change ++ s->capture.av_pix_fmt = ++ ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); ++ if (s->output_drm) { ++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; ++ avctx->sw_pix_fmt = s->capture.av_pix_fmt; ++ } ++ else ++ avctx->pix_fmt = s->capture.av_pix_fmt; ++ + goto reinit_run; + } + + /* Buffers are OK so just stream off to ack */ +- av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only\n", __func__); ++ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__); + + ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); + if (ret) +@@ -225,54 +248,6 @@ reinit_run: + return 1; + } + +-static int ctx_done(V4L2Context * const ctx) +-{ +- int rv = 0; +- V4L2m2mContext * const s = ctx_to_m2mctx(ctx); +- +- ctx->done = 1; +- +- if (s->resize_pending && !V4L2_TYPE_IS_OUTPUT(ctx->type)) +- rv = do_source_change(s); +- +- return rv; +-} +- +-/** +- * handle resolution change event and end of stream event +- * returns 1 if reinit was successful, negative if it failed +- * returns 0 if reinit was not executed +- */ +-static int v4l2_handle_event(V4L2Context *ctx) +-{ +- V4L2m2mContext * const s = ctx_to_m2mctx(ctx); +- struct v4l2_event evt = { 0 }; +- int ret; +- +- ret = ioctl(s->fd, VIDIOC_DQEVENT, 
&evt); +- if (ret < 0) { +- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name); +- return 0; +- } +- +- av_log(logger(ctx), AV_LOG_INFO, "Dq event %d\n", evt.type); +- +- if (evt.type == V4L2_EVENT_EOS) { +-// ctx->done = 1; +- av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name); +- return 0; +- } +- +- if (evt.type != V4L2_EVENT_SOURCE_CHANGE) +- return 0; +- +- s->resize_pending = 1; +- if (!ctx->done) +- return 0; +- +- return do_source_change(s); +-} +- + static int v4l2_stop_decode(V4L2Context *ctx) + { + struct v4l2_decoder_cmd cmd = { +@@ -313,243 +288,252 @@ static int v4l2_stop_encode(V4L2Context *ctx) + return 0; + } + +-static int count_in_driver(const V4L2Context * const ctx) ++// DQ a buffer ++// Amalgamates all the various ways there are of signalling EOS/Event to ++// generate a consistant EPIPE. ++// ++// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped) ++// ++// Returns: ++// 0 Success ++// AVERROR(EPIPE) Nothing more to read ++// * AVERROR(..) 
++ ++ static int ++dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf) + { +- int i; +- int n = 0; ++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx); ++ AVCodecContext * const avctx = m->avctx; ++ V4L2Buffer * avbuf; ++ const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type); + +- if (!ctx->bufrefs) +- return -1; +- +- for (i = 0; i < ctx->num_buffers; ++i) { +- V4L2Buffer *const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; +- if (avbuf->status == V4L2BUF_IN_DRIVER) +- ++n; +- } +- return n; +-} ++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; + +-static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) +-{ +- V4L2m2mContext * const s = ctx_to_m2mctx(ctx); +- const int is_capture = !V4L2_TYPE_IS_OUTPUT(ctx->type); +- struct v4l2_plane planes[VIDEO_MAX_PLANES]; +- struct v4l2_buffer buf = { 0 }; +- V4L2Buffer *avbuf; +- struct pollfd pfd = { +- .events = POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */ +- .fd = ctx_to_m2mctx(ctx)->fd, ++ struct v4l2_buffer buf = { ++ .type = ctx->type, ++ .memory = V4L2_MEMORY_MMAP, + }; +- int i, ret; +- int no_rx_means_done = 0; +- +- if (is_capture && ctx->bufrefs) { +- for (i = 0; i < ctx->num_buffers; i++) { +- avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; +- if (avbuf->status == V4L2BUF_IN_DRIVER) +- break; +- } +- if (i == ctx->num_buffers) +- av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to " +- "userspace. 
Increase num_capture_buffers " +- "to prevent device deadlock or dropped " +- "packets/frames.\n", i); ++ ++ *ppavbuf = NULL; ++ ++ if (ctx->flag_last) ++ return AVERROR(EPIPE); ++ ++ if (is_mp) { ++ buf.length = VIDEO_MAX_PLANES; ++ buf.m.planes = planes; + } + +-#if 0 +- // I think this is true but pointless +- // we will get some other form of EOF signal +- +- /* if we are draining and there are no more capture buffers queued in the driver we are done */ +- if (is_capture && ctx_to_m2mctx(ctx)->draining) { +- for (i = 0; i < ctx->num_buffers; i++) { +- /* capture buffer initialization happens during decode hence +- * detection happens at runtime +- */ +- if (!ctx->bufrefs) +- break; +- +- avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; +- if (avbuf->status == V4L2BUF_IN_DRIVER) +- goto start; ++ while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) { ++ const int err = errno; ++ av_assert0(AVERROR(err) < 0); ++ if (err != EINTR) { ++ av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", ++ ctx->name, av_err2str(AVERROR(err))); ++ ++ if (err == EPIPE) ++ ctx->flag_last = 1; ++ ++ return AVERROR(err); + } +- ctx->done = 1; +- return NULL; + } +-#endif +- +-start: +- if (is_capture) { +- /* no need to listen to requests for more input while draining */ +- if (ctx_to_m2mctx(ctx)->draining || timeout > 0) +- pfd.events = POLLIN | POLLRDNORM | POLLPRI; +- } else { +- pfd.events = POLLOUT | POLLWRNORM; ++ atomic_fetch_sub(&ctx->q_count, 1); ++ ++ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; ++ avbuf->status = V4L2BUF_AVAILABLE; ++ avbuf->buf = buf; ++ if (is_mp) { ++ memcpy(avbuf->planes, planes, sizeof(planes)); ++ avbuf->buf.m.planes = avbuf->planes; + } +- no_rx_means_done = s->resize_pending && is_capture; + +- for (;;) { +- // If we have a resize pending then all buffers should be Qed +- // With a resize pending we should be in drain but evidence suggests +- // that not all decoders do this so poll to clear +- int t2 = no_rx_means_done ? 0 : timeout < 0 ? 
3000 : timeout; +- const int e = pfd.events; +- +- ret = poll(&pfd, 1, t2); ++ if (V4L2_TYPE_IS_CAPTURE(ctx->type)) { ++ // Zero length cap buffer return == EOS ++ if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n"); + +- if (ret > 0) +- break; ++ // Must reQ so we don't leak ++ // May not matter if the next thing we do is release all the ++ // buffers but better to be tidy. ++ ff_v4l2_buffer_enqueue(avbuf); + +- if (ret < 0) { +- int err = errno; +- if (err == EINTR) +- continue; +- av_log(logger(ctx), AV_LOG_ERROR, "=== poll error %d (%s): events=%#x, cap buffers=%d\n", +- err, strerror(err), +- e, count_in_driver(ctx)); +- return NULL; ++ ctx->flag_last = 1; ++ return AVERROR(EPIPE); + } + +- // ret == 0 (timeout) +- if (no_rx_means_done) { +- av_log(logger(ctx), AV_LOG_DEBUG, "Ctx done on timeout\n"); +- ret = ctx_done(ctx); +- if (ret > 0) +- goto start; +- } +- if (timeout == -1) +- av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));; +- return NULL; ++#ifdef V4L2_BUF_FLAG_LAST ++ // If flag_last set then this contains data but is the last frame ++ // so remember that but return OK ++ if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0) ++ ctx->flag_last = 1; ++#endif + } + +- /* 0. 
handle errors */ +- if (pfd.revents & POLLERR) { +- /* if we are trying to get free buffers but none have been queued yet +- no need to raise a warning */ +- if (timeout == 0) { +- for (i = 0; i < ctx->num_buffers; i++) { +- avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; +- if (avbuf->status != V4L2BUF_AVAILABLE) +- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); +- } +- } +- else +- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); ++ *ppavbuf = avbuf; ++ return 0; ++} + +- return NULL; +- } ++/** ++ * handle resolution change event and end of stream event ++ * Expects to be called after the stream has stopped + * -+ * To fully cater for the ranges specified requires various intermediate -+ * values to be held to 17-bit precision; yet these conditions do not appear -+ * to be utilised in real-world streams. At least some assembly -+ * implementations have chosen to restrict these values to 16-bit precision, -+ * to accelerate the decoding of real-world streams at the cost of strict -+ * adherence to the spec. To avoid our test marking these as failures, -+ * reduce our random inputs. ++ * returns 1 if reinit was successful, negative if it failed ++ * returns 0 if reinit was not executed + */ -+#define ATTENUATION 4 -+ -+static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height) ++static int ++get_event(V4L2m2mContext * const m) +{ -+ matrix *raw, *tmp, *D, *E, *R; -+ raw = new_matrix(width, height); -+ for (int i = 0; i < width * height; ++i) -+ raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION; -+ tmp = multiply(height == 8 ? &T8 : &T4, raw); -+ D = multiply(tmp, width == 8 ? 
&T8t : &T4t); -+ normalise(D); -+ divide_and_round_nearest(D, 1); -+ for (int i = 0; i < width * height; ++i) { -+ if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) { -+ /* Rare, so simply try again */ -+ av_free(raw); -+ av_free(tmp); -+ av_free(D); -+ return generate_inverse_quantized_transform_coefficients(width, height); -+ } ++ AVCodecContext * const avctx = m->avctx; ++ struct v4l2_event evt = { 0 }; + +- /* 1. handle resolution changes */ +- if (pfd.revents & POLLPRI) { +- ret = v4l2_handle_event(ctx); +- if (ret < 0) { +- /* if re-init failed, abort */ +- ctx->done = 1; +- return NULL; ++ while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) { ++ const int rv = AVERROR(errno); ++ if (rv == AVERROR(EINTR)) ++ continue; ++ if (rv == AVERROR(EAGAIN)) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n"); ++ return AVERROR_EOF; + } +- if (ret > 0) +- goto start; ++ av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv)); ++ return rv; + } -+ E = multiply(D, width == 8 ? &T8 : &T4); -+ divide_and_round_nearest(E, 8); -+ for (int i = 0; i < width * height; ++i) -+ if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) { -+ /* Rare, so simply try again */ -+ av_free(raw); -+ av_free(tmp); -+ av_free(D); -+ av_free(E); -+ return generate_inverse_quantized_transform_coefficients(width, height); -+ } -+ R = multiply(height == 8 ? 
&T8t : &T4t, E); -+ tweak(R); -+ divide_and_round_nearest(R, 128); -+ for (int i = 0; i < width * height; ++i) -+ if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) { -+ /* Rare, so simply try again */ -+ av_free(raw); -+ av_free(tmp); -+ av_free(D); -+ av_free(E); -+ av_free(R); -+ return generate_inverse_quantized_transform_coefficients(width, height); -+ } -+ av_free(raw); -+ av_free(tmp); -+ av_free(E); -+ av_free(R); -+ return D; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type); ++ ++ if (evt.type == V4L2_EVENT_EOS) { ++ av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n"); ++ return AVERROR_EOF; + } + +- /* 2. dequeue the buffer */ +- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { ++ if (evt.type == V4L2_EVENT_SOURCE_CHANGE) ++ return do_source_change(m); + +- if (is_capture) { +- /* there is a capture buffer ready */ +- if (pfd.revents & (POLLIN | POLLRDNORM)) +- goto dequeue; ++ return 0; +} + +- // CAPTURE Q drained +- if (no_rx_means_done) { +- if (ctx_done(ctx) > 0) +- goto start; +- return NULL; +- } + +- /* the driver is ready to accept more input; instead of waiting for the capture +- * buffer to complete we return NULL so input can proceed (we are single threaded) +- */ +- if (pfd.revents & (POLLOUT | POLLWRNORM)) +- return NULL; ++// Get a buffer ++// If output then just gets the buffer in the expected way ++// If capture then runs the capture state m/c to deal with res change etc. 
++// If return value == 0 then *ppavbuf != NULL + -+#define RANDOMIZE_BUFFER16(name, size) \ -+ do { \ -+ int i; \ -+ for (i = 0; i < size; ++i) { \ -+ uint16_t r = rnd(); \ -+ AV_WN16A(name##0 + i, r); \ -+ AV_WN16A(name##1 + i, r); \ -+ } \ -+ } while (0) -+ -+#define RANDOMIZE_BUFFER8(name, size) \ -+ do { \ -+ int i; \ -+ for (i = 0; i < size; ++i) { \ -+ uint8_t r = rnd(); \ -+ name##0[i] = r; \ -+ name##1[i] = r; \ -+ } \ -+ } while (0) -+ -+#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size) \ -+ do { \ -+ uint8_t *p##0 = name##0, *p##1 = name##1; \ -+ int i = (size); \ -+ while (i-- > 0) { \ -+ int x = 0x80 | (rnd() & 0x7F); \ -+ x >>= rnd() % 9; \ -+ if (rnd() & 1) \ -+ x = -x; \ -+ *p##1++ = *p##0++ = 0x80 + x; \ -+ } \ -+ } while (0) -+ -+static void check_inv_trans_inplace(void) ++static int ++get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout) +{ -+ /* Inverse transform input coefficients are stored in a 16-bit buffer -+ * with row stride of 8 coefficients irrespective of transform size. -+ * vc1_inv_trans_8x8 differs from the others in two ways: coefficients -+ * are stored in column-major order, and the outputs are written back -+ * to the input buffer, so we oversize it slightly to catch overruns. 
*/ -+ LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]); -+ LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]); ++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx); ++ AVCodecContext * const avctx = m->avctx; ++ const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type); + -+ VC1DSPContext h; ++ const unsigned int poll_cap = (POLLIN | POLLRDNORM); ++ const unsigned int poll_out = (POLLOUT | POLLWRNORM); ++ const unsigned int poll_event = POLLPRI; + -+ ff_vc1dsp_init(&h); ++ *ppavbuf = NULL; + -+ if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) { -+ matrix *coeffs; -+ declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *); -+ RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8); -+ coeffs = generate_inverse_quantized_transform_coefficients(8, 8); -+ for (int j = 0; j < 8; ++j) -+ for (int i = 0; i < 8; ++i) { -+ int idx = 8 + i * 8 + j; -+ inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i]; -+ } -+ call_ref(inv_trans_in0 + 8); -+ call_new(inv_trans_in1 + 8); -+ if (memcmp(inv_trans_in0, inv_trans_in1, 10 * 8 * sizeof (int16_t))) -+ fail(); -+ bench_new(inv_trans_in1 + 8); -+ av_free(coeffs); -+ } -+} ++ for (;;) { ++ struct pollfd pfd = { ++ .fd = m->fd, ++ // If capture && stream not started then assume we are waiting for the initial event ++ .events = !is_cap ? poll_out : ++ !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap : ++ poll_event, ++ }; ++ int ret; + -+static void check_inv_trans_adding(void) -+{ -+ /* Inverse transform input coefficients are stored in a 16-bit buffer -+ * with row stride of 8 coefficients irrespective of transform size. 
*/ -+ LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]); -+ LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]); ++ if (ctx->done) { ++ av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name); ++ return AVERROR_EOF; + } + +-dequeue: +- memset(&buf, 0, sizeof(buf)); +- buf.memory = V4L2_MEMORY_MMAP; +- buf.type = ctx->type; +- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { +- memset(planes, 0, sizeof(planes)); +- buf.length = VIDEO_MAX_PLANES; +- buf.m.planes = planes; ++ // If capture && timeout == -1 then also wait for rx buffer free ++ if (is_cap && timeout == -1 && m->output.streamon && !m->draining) ++ pfd.events |= poll_out; + -+ /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and -+ * added with saturation to an array of unsigned 8-bit values. Oversize -+ * this by 8 samples left and right and one row above and below. */ -+ LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]); -+ LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]); ++ // If nothing Qed all we will get is POLLERR - avoid that ++ if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) || ++ (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || ++ (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { ++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); ++ return AVERROR(EAGAIN); + } + +- while ((ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf)) == -1) { +- const int err = errno; +- if (err == EINTR) ++ // Timeout kludged s.t. "forever" eventually gives up & produces logging ++ // If waiting for an event when we have seen a last_frame then we expect ++ // it to be ready already so force a short timeout ++ ret = poll(&pfd, 1, ++ ff_v4l2_ctx_eos(ctx) ? 10 : ++ timeout == -1 ? 3000 : timeout); ++ if (ret < 0) { ++ ret = AVERROR(errno); // Remember errno before logging etc. 
++ av_assert0(ret < 0); ++ } + -+ VC1DSPContext h; ++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n", ++ ctx->name, ret, timeout, pfd.events, pfd.revents); + -+ const test tests[] = { -+ VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4) -+ VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8) -+ VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4) -+ VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8) -+ VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4) -+ VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8) -+ VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4) -+ }; ++ if (ret < 0) { ++ if (ret == AVERROR(EINTR)) + continue; +- if (err != EAGAIN) { +- // EPIPE on CAPTURE can be used instead of BUF_FLAG_LAST +- if (err != EPIPE || !is_capture) +- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", +- ctx->name, av_err2str(AVERROR(err))); +- if (ctx_done(ctx) > 0) +- goto start; ++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret)); ++ return ret; ++ } + -+ ff_vc1dsp_init(&h); -+ -+ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { -+ void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset); -+ if (check_func(func, "vc1dsp.%s", tests[t].name)) { -+ matrix *coeffs; -+ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *); -+ RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8); -+ RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24); -+ coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height); -+ for (int j = 0; j < tests[t].height; ++j) -+ for (int i = 0; i < tests[t].width; ++i) { -+ int idx = j * 8 + i; -+ inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i]; ++ if (ret == 0) { ++ if (timeout == -1) ++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events); ++ if (ff_v4l2_ctx_eos(ctx)) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name); ++ ret 
= get_event(m); ++ if (ret < 0) { ++ ctx->done = 1; ++ return ret; + } -+ call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0); -+ call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1); -+ if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24)) -+ fail(); -+ bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1 + 8); -+ av_free(coeffs); + } +- return NULL; ++ return AVERROR(EAGAIN); + } +- --ctx->q_count; +- av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d field=%d\n", +- ctx->name, buf.index, +- buf.timestamp.tv_sec, buf.timestamp.tv_usec, +- ctx->q_count, ++ctx->dq_count, buf.field); +- +- avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; +- avbuf->status = V4L2BUF_AVAILABLE; +- avbuf->buf = buf; +- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { +- memcpy(avbuf->planes, planes, sizeof(planes)); +- avbuf->buf.m.planes = avbuf->planes; ++ ++ if ((pfd.revents & POLLERR) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); ++ return AVERROR_UNKNOWN; + } + +- if (ctx_to_m2mctx(ctx)->draining && is_capture) { +- int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? +- buf.m.planes[0].bytesused : buf.bytesused; +- if (bytesused == 0) { +- av_log(logger(ctx), AV_LOG_DEBUG, "Buffer empty - reQ\n"); ++ if ((pfd.revents & poll_event) != 0) { ++ ret = get_event(m); ++ if (ret < 0) { ++ ctx->done = 1; ++ return ret; ++ } ++ continue; ++ } + +- // Must reQ so we don't leak +- // May not matter if the next thing we do is release all the +- // buffers but better to be tidy. 
+- ff_v4l2_buffer_enqueue(avbuf); ++ if ((pfd.revents & poll_cap) != 0) { ++ ret = dq_buf(ctx, ppavbuf); ++ if (ret == AVERROR(EPIPE)) ++ continue; ++ return ret; ++ } + +- if (ctx_done(ctx) > 0) +- goto start; +- return NULL; +- } +-#ifdef V4L2_BUF_FLAG_LAST +- if (buf.flags & V4L2_BUF_FLAG_LAST) { +- av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n"); +- avbuf->status = V4L2BUF_IN_USE; // Avoid flushing this buffer +- ctx_done(ctx); +- } +-#endif ++ if ((pfd.revents & poll_out) != 0) { ++ if (is_cap) ++ return AVERROR(EAGAIN); ++ return dq_buf(ctx, ppavbuf); + } + +- return avbuf; ++ av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents); ++ return AVERROR_UNKNOWN; + } +- +- return NULL; + } + + static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) + { +- int timeout = 0; /* return when no more buffers to dequeue */ + int i; + + /* get back as many output buffers as possible */ + if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- do { +- } while (v4l2_dequeue_v4l2buf(ctx, timeout)); ++ V4L2Buffer * avbuf; ++ do { ++ get_qbuf(ctx, &avbuf, 0); ++ } while (avbuf); + } + + for (i = 0; i < ctx->num_buffers; i++) { +@@ -722,7 +706,7 @@ static void flush_all_buffers_status(V4L2Context* const ctx) + if (buf->status == V4L2BUF_IN_DRIVER) + buf->status = V4L2BUF_AVAILABLE; + } +- ctx->q_count = 0; ++ atomic_store(&ctx->q_count, 0); + } + + static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) +@@ -755,6 +739,10 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) + int ret; + AVCodecContext * const avctx = logger(ctx); + ++ // Avoid doing anything if there is nothing we can do ++ if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon) ++ return 0; ++ + ff_mutex_lock(&ctx->lock); + + if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) +@@ -777,6 +765,9 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) + cmd, (cmd == VIDIOC_STREAMON) ? 
"ON" : "OFF"); + } + ++ // Both stream off & on effectively clear flag_last ++ ctx->flag_last = 0; ++ + ff_mutex_unlock(&ctx->lock); + + return ret; +@@ -840,19 +831,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, + int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) + { + V4L2Buffer *avbuf; ++ int rv; + +- /* +- * timeout=-1 blocks until: +- * 1. decoded frame available +- * 2. an input buffer is ready to be dequeued +- */ +- avbuf = v4l2_dequeue_v4l2buf(ctx, timeout); +- if (!avbuf) { +- if (ctx->done) +- return AVERROR_EOF; +- +- return AVERROR(EAGAIN); +- } ++ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) ++ return rv; + + return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); + } +@@ -860,19 +842,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) + int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) + { + V4L2Buffer *avbuf; ++ int rv; + +- /* +- * blocks until: +- * 1. encoded packet available +- * 2. an input buffer ready to be dequeued +- */ +- avbuf = v4l2_dequeue_v4l2buf(ctx, -1); +- if (!avbuf) { +- if (ctx->done) +- return AVERROR_EOF; +- +- return AVERROR(EAGAIN); +- } ++ if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) ++ return rv; + + return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); + } +@@ -956,6 +929,8 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers + int ret; + int i; + ++ av_assert0(ctx->bufrefs == NULL); ++ + memset(&req, 0, sizeof(req)); + req.count = req_buffers; + req.memory = V4L2_MEMORY_MMAP; +@@ -1033,8 +1008,8 @@ int ff_v4l2_context_init(V4L2Context* ctx) + hwframes = (AVHWFramesContext*)ctx->frames_ref->data; + hwframes->format = AV_PIX_FMT_DRM_PRIME; + hwframes->sw_format = ctx->av_pix_fmt; +- hwframes->width = ctx->width; +- hwframes->height = ctx->height; ++ hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width; ++ hwframes->height = ctx->height != 0 ? 
ctx->height : s->avctx->height; + ret = av_hwframe_ctx_init(ctx->frames_ref); + if (ret < 0) + goto fail_unref_hwframes; +diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h +index a4176448d5..565858a1ed 100644 +--- a/libavcodec/v4l2_context.h ++++ b/libavcodec/v4l2_context.h +@@ -102,6 +102,8 @@ typedef struct V4L2Context { + */ + int done; + ++ int flag_last; ++ + /** + * PTS rescale not wanted + * If the PTS is just a dummy frame count then rescale is +diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c +index 50a192933b..5b38ad3598 100644 +--- a/libavcodec/v4l2_m2m.c ++++ b/libavcodec/v4l2_m2m.c +@@ -233,7 +233,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) + + /* 5. complete reinit */ + s->draining = 0; +- s->reinit = 0; + + return 0; + } +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 3f86809623..d71f6b721c 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -84,8 +84,6 @@ typedef struct V4L2m2mContext { + AVCodecContext *avctx; + sem_t refsync; + atomic_uint refcount; +- int reinit; +- int resize_pending; + + /* null frame/packet received */ + int draining; +@@ -180,15 +178,25 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); + int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); + + +-static inline unsigned int ff_v4l2_get_format_width(struct v4l2_format *fmt) ++static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt) + { + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; + } + +-static inline unsigned int ff_v4l2_get_format_height(struct v4l2_format *fmt) ++static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt) + { + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; + } + ++static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; ++} ++ ++static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx) ++{ ++ return ctx->flag_last; ++} ++ + + #endif /* AVCODEC_V4L2_M2M_H */ +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 94a32a5eee..d09b9f6a6d 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -113,9 +113,6 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co + if (ret < 0) + av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n"); + +- if (!s->capture.streamon || ret < 0) +- return ret; +- + ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); + if (ret < 0) + av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); +@@ -127,69 +124,12 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co + + static int v4l2_try_start(AVCodecContext *avctx) + { +- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; +- V4L2Context *const capture = &s->capture; +- struct v4l2_selection selection = { 0 }; ++ V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context; + int ret; + + /* 1. start the output process */ + if ((ret = check_output_streamon(avctx, s)) != 0) + return ret; +- +- if (capture->streamon) +- return 0; +- +- /* 2. get the capture format */ +- capture->format.type = capture->type; +- ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format); +- if (ret) { +- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n"); +- return ret; +- } +- +- /* 2.1 update the AVCodecContext */ +- capture->av_pix_fmt = +- ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); +- if (s->output_drm) { +- avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; +- avctx->sw_pix_fmt = capture->av_pix_fmt; +- } +- else +- avctx->pix_fmt = capture->av_pix_fmt; +- +- /* 3. 
set the crop parameters */ +-#if 1 +- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; +- selection.target = V4L2_SEL_TGT_CROP_DEFAULT; +- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); +- av_log(avctx, AV_LOG_INFO, "Post G selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); +-#else +- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; +- selection.r.height = avctx->coded_height; +- selection.r.width = avctx->coded_width; +- av_log(avctx, AV_LOG_INFO, "Try selection %dx%d\n", avctx->coded_width, avctx->coded_height); +- ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection); +- av_log(avctx, AV_LOG_INFO, "Post S selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height); +- if (1) { +- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); +- if (ret) { +- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n"); +- } else { +- av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height); +- /* update the size of the resulting frame */ +- capture->height = selection.r.height; +- capture->width = selection.r.width; +- } +- } +-#endif +- +- /* 5. 
start the capture process */ +- ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); +- if (ret) { +- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n"); +- return ret; +- } +- + return 0; + } + +@@ -364,7 +304,7 @@ xlat_pending(const xlat_track_t * const x) + } + + static inline int stream_started(const V4L2m2mContext * const s) { +- return s->capture.streamon && s->output.streamon; ++ return s->output.streamon; + } + + #define NQ_OK 0 +@@ -377,6 +317,9 @@ static inline int stream_started(const V4L2m2mContext * const s) { + #define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) + #define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE) + ++// do_not_get If true then no new packet will be got but status will ++// be set appropriately ++ + // AVERROR_EOF Flushing an already flushed stream + // -ve Error (all errors except EOF are unexpected) + // NQ_OK (0) OK +@@ -386,14 +329,14 @@ static inline int stream_started(const V4L2m2mContext * const s) { + // NQ_DRAINING At EOS, dQ dest until EOS there too + // NQ_DEAD Not running (do not retry, do not attempt capture dQ) + +-static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s) ++static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get) + { + int ret; + + // If we don't already have a coded packet - get a new one + // We will already have a coded pkt if the output Q was full last time we + // tried to Q it +- if (!s->buf_pkt.size) { ++ if (!s->buf_pkt.size && !do_not_get) { + ret = ff_decode_get_packet(avctx, &s->buf_pkt); + + if (ret == AVERROR(EAGAIN)) { +@@ -435,6 +378,17 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const + xlat_pts_in(avctx, &s->xlat, &s->buf_pkt); + } + ++ if (s->draining) { ++ if (s->buf_pkt.size) { ++ av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n"); ++ av_packet_unref(&s->buf_pkt); ++ } ++ return 
NQ_DRAINING; ++ } ++ ++ if (!s->buf_pkt.size) ++ return NQ_NONE; ++ + if ((ret = check_output_streamon(avctx, s)) != 0) + return ret; + +@@ -471,7 +425,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const + static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + { + V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; +- int src_rv = NQ_NONE; ++ int src_rv; + int dst_rv = 1; // Non-zero (done), non-negative (error) number + unsigned int i = 0; + +@@ -483,31 +437,40 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + // (a) We don't have a lot of stuff in the buffer already OR + // (b) ... we (think we) do but we've failed to get a frame already OR + // (c) We've dequeued a lot of frames without asking for input +- if (!prefer_dq || i != 0 || s->req_pkt > 2) { +- src_rv = try_enqueue_src(avctx, s); +- +- // If we got a frame last time or we've already tried to get a frame and +- // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) +- // indicating that we want more input. +- // This should mean that once decode starts we enter a stable state where +- // we alternately ask for input and produce output +- if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) +- break; +- } ++ src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2)); ++ ++ // If we got a frame last time or we've already tried to get a frame and ++ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) ++ // indicating that we want more input. 
++ // This should mean that once decode starts we enter a stable state where ++ // we alternately ask for input and produce output ++ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) ++ break; + + // Try to get a new frame if + // (a) we haven't already got one AND + // (b) enqueue returned a status indicating that decode should be attempted + if (dst_rv != 0 && TRY_DQ(src_rv)) { ++ // Pick a timeout depending on state ++ const int t = ++ src_rv == NQ_DRAINING ? 300 : ++ prefer_dq ? 5 : ++ src_rv == NQ_Q_FULL ? -1 : 0; ++ + do { + // Dequeue frame will unref any previous contents of frame + // if it returns success so we don't need an explicit unref + // when discarding + // This returns AVERROR(EAGAIN) on timeout or if + // there is room in the input Q and timeout == -1 +- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, prefer_dq ? 5 : -1); ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + +- if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) ++ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { ++ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); ++ dst_rv = AVERROR_EOF; ++ s->capture.done = 1; ++ } ++ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) + av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", + s->draining, s->capture.done); + else if (dst_rv && dst_rv != AVERROR(EAGAIN)) +@@ -630,8 +593,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + * by the v4l2 driver; this event will trigger a full pipeline reconfig and + * the proper values will be retrieved from the kernel driver. 
+ */ +- output->height = capture->height = avctx->coded_height; +- output->width = capture->width = avctx->coded_width; ++// output->height = capture->height = avctx->coded_height; ++// output->width = capture->width = avctx->coded_width; ++ output->height = capture->height = 0; ++ output->width = capture->width = 0; + + output->av_codec_id = avctx->codec_id; + output->av_pix_fmt = AV_PIX_FMT_NONE; +@@ -703,7 +668,6 @@ static void v4l2_decode_flush(AVCodecContext *avctx) + V4L2m2mContext * const s = priv->context; + V4L2Context * const output = &s->output; + V4L2Context * const capture = &s->capture; +- int ret; + + av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); + +@@ -711,13 +675,19 @@ static void v4l2_decode_flush(AVCodecContext *avctx) + // states like EOS processing so don't try to optimize out (having got it + // wrong once) + +- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); +- if (ret < 0) +- av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); ++ ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); + + // Clear any buffered input packet + av_packet_unref(&s->buf_pkt); + ++ // Clear a pending EOS ++ if (ff_v4l2_ctx_eos(capture)) { ++ // Arguably we could delay this but this is easy and doesn't require ++ // thought or extra vars ++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF); ++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); ++ } ++ + // V4L2 makes no guarantees about whether decoded frames are flushed or not + // so mark all frames we are tracking to be discarded if they appear + xlat_flush(&s->xlat); + +From e44ac2d67a09e89678c0b6a13329295f0755c22d Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 9 Dec 2021 18:51:00 +0000 +Subject: [PATCH 039/113] Honor result of ff_get_format if possible + +--- + libavcodec/v4l2_m2m_dec.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 
d09b9f6a6d..db87cc5d72 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -615,15 +615,19 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + * check the v4l2_get_drm_frame function. + */ + ++ avctx->sw_pix_fmt = avctx->pix_fmt; + gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); + av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n", + avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); + +- s->output_drm = 0; + if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { + avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; + s->output_drm = 1; + } ++ else { ++ capture->av_pix_fmt = gf_pix_fmt; ++ s->output_drm = 0; ++ } + + s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); + if (!s->device_ref) { + +From 2e5821bb8fb333bd971878796f721544f9c26df0 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 14 Dec 2021 16:11:10 +0000 +Subject: [PATCH 040/113] Add an always-reinit quirk + +--- + libavcodec/v4l2_context.c | 7 +++++-- + libavcodec/v4l2_m2m.h | 5 +++++ + libavcodec/v4l2_m2m_dec.c | 33 ++++++++++++++++++++++++++++++++- + 3 files changed, 42 insertions(+), 3 deletions(-) + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index fe21218710..6e2e5f24ad 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -188,6 +188,9 @@ static int do_source_change(V4L2m2mContext * const s) + get_default_selection(&s->capture, &s->capture.selection); + + reinit = ctx_resolution_changed(&s->capture, &cap_fmt); ++ if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0) ++ reinit = 1; ++ + s->capture.format = cap_fmt; + if (reinit) { + s->capture.height = ff_v4l2_get_format_height(&cap_fmt); +@@ -202,10 +205,10 @@ static int do_source_change(V4L2m2mContext * const s) + + s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); + +- av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d 
@ %d,%d\n", ++ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d, reinit=%d\n", + s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, + s->capture.selection.width, s->capture.selection.height, +- s->capture.selection.left, s->capture.selection.top); ++ s->capture.selection.left, s->capture.selection.top, reinit); + + if (reinit) { + if (avctx) +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index d71f6b721c..f1923bb26d 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -113,6 +113,11 @@ typedef struct V4L2m2mContext { + + /* Ext data sent */ + int extdata_sent; ++ ++#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 ++ /* Quirks */ ++ unsigned int quirks; ++ + } V4L2m2mContext; + + typedef struct V4L2m2mPriv { +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index db87cc5d72..f39f3e7ee2 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -540,6 +540,34 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + } + #endif + ++static int ++get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ struct v4l2_capability cap; ++ ++ memset(&cap, 0, sizeof(cap)); ++ while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) { ++ int err = errno; ++ if (err == EINTR) ++ continue; ++ av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err)); ++ return AVERROR(err); ++ } ++ ++ // Could be made table driven if we have a few more but right now there ++ // seems no point ++ ++ // Meson (amlogic) always gives a resolution changed event after output ++ // streamon and userspace must (re)allocate capture buffers and streamon ++ // capture to clear the event even if the capture buffers were the right ++ // size in the first place. 
++ if (strcmp(cap.driver, "meson-vdec") == 0) ++ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks); ++ return 0; ++} ++ + // This heuristic is for H264 but use for everything + static uint32_t max_coded_size(const AVCodecContext * const avctx) + { +@@ -646,7 +674,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + return ret; + } + +- return v4l2_prepare_decoder(s); ++ if ((ret = v4l2_prepare_decoder(s)) < 0) ++ return ret; ++ ++ return get_quirks(avctx, s); + } + + static av_cold int v4l2_decode_close(AVCodecContext *avctx) + +From 69d3d78fb855652b5816833ec619ef90c9bb33b3 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 4 Jan 2022 16:58:31 +0000 +Subject: [PATCH 041/113] v4l2_buffers: rework flags for keyframe + +Previously flags could become confused and keyframe info could be lost. +This fixes that and removes the duplicate flags field in V4L2Buffer. +--- + libavcodec/v4l2_buffers.c | 15 ++++++++++----- + libavcodec/v4l2_buffers.h | 1 - + libavcodec/v4l2_context.c | 18 +++++++++++++++++- + 3 files changed, 27 insertions(+), 7 deletions(-) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 2cf7be6632..62d1c26053 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -680,7 +680,9 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + + int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + { +- out->buf.flags = frame->key_frame ? (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME) : (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME); ++ out->buf.flags = frame->key_frame ? 
++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : ++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); + // Beware that colour info is held in format rather than the actual + // v4l2 buffer struct so this may not be as useful as you might hope + v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); +@@ -706,6 +708,10 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) + + /* 2. get frame information */ + frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME); ++ frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I : ++ (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P : ++ (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? AV_PICTURE_TYPE_B : ++ AV_PICTURE_TYPE_NONE; + frame->color_primaries = v4l2_get_color_primaries(avbuf); + frame->colorspace = v4l2_get_color_space(avbuf); + frame->color_range = v4l2_get_color_range(avbuf); +@@ -779,8 +785,9 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, + + v4l2_set_pts(out, pkt->pts); + +- if (pkt->flags & AV_PKT_FLAG_KEY) +- out->flags = V4L2_BUF_FLAG_KEYFRAME; ++ out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ? 
++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : ++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); + + return ret; + } +@@ -924,8 +931,6 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) + int ret; + int qc; + +- avbuf->buf.flags = avbuf->flags; +- + if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) { + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", + avbuf->context->name, avbuf->buf.index, +diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h +index 641e0e147b..3b7ca4d99e 100644 +--- a/libavcodec/v4l2_buffers.h ++++ b/libavcodec/v4l2_buffers.h +@@ -73,7 +73,6 @@ typedef struct V4L2Buffer { + struct v4l2_buffer buf; + struct v4l2_plane planes[VIDEO_MAX_PLANES]; + +- int flags; + enum V4L2Buffer_status status; + + } V4L2Buffer; +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 6e2e5f24ad..d8a86e8261 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -527,6 +527,22 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout + } + } + ++// Clear out flags and timestamps that should should be set by the user ++// Returns the passed avbuf ++static V4L2Buffer * ++clean_v4l2_buffer(V4L2Buffer * const avbuf) ++{ ++ struct v4l2_buffer *const buf = &avbuf->buf; ++ ++ buf->flags = 0; ++ buf->field = V4L2_FIELD_ANY; ++ buf->timestamp = (struct timeval){0}; ++ buf->timecode = (struct v4l2_timecode){0}; ++ buf->sequence = 0; ++ ++ return avbuf; ++} ++ + static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) + { + int i; +@@ -542,7 +558,7 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) + for (i = 0; i < ctx->num_buffers; i++) { + V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; + if (avbuf->status == V4L2BUF_AVAILABLE) +- return avbuf; ++ return clean_v4l2_buffer(avbuf); + } + + return NULL; + +From 95cdef2adcd0623098b34a2f7ce489c0af4a819a Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 22 Mar 2022 
11:44:30 +0000 +Subject: [PATCH 042/113] v4l2m2m: Rework decode to wait for missing buffer, + add dynamic pending + +Previously receive_frame exited with EAGAIN if no capture buffer +availble in the Q. Now it waits in the hope that another thread will +post one. + +The prefer dQ logic is now dynamic to help with cases where PTS/DTS +lies. If it looks like we are never getting a frame then the +threshold is increased. It then slowly decays over time to cope with +false alarms. +--- + libavcodec/v4l2_buffers.c | 6 +++-- + libavcodec/v4l2_context.c | 7 +++-- + libavcodec/v4l2_context.h | 3 +++ + libavcodec/v4l2_m2m.h | 2 ++ + libavcodec/v4l2_m2m_dec.c | 57 +++++++++++++++++++++++++++++++++++++-- + 5 files changed, 69 insertions(+), 6 deletions(-) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 62d1c26053..8c4f18dbed 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -947,12 +947,14 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) + return AVERROR(err); + } + ++ // Lock not wanted - if called from buffer free then lock already obtained + qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1; ++ avbuf->status = V4L2BUF_IN_DRIVER; ++ pthread_cond_broadcast(&avbuf->context->cond); ++ + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", + avbuf->context->name, avbuf->buf.index, + avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc); + +- avbuf->status = V4L2BUF_IN_DRIVER; +- + return 0; + } +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index d8a86e8261..1aff16c1de 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -300,6 +300,7 @@ static int v4l2_stop_encode(V4L2Context *ctx) + // Returns: + // 0 Success + // AVERROR(EPIPE) Nothing more to read ++// AVERROR(ENOSPC) No buffers in Q to put result in + // * AVERROR(..) 
+ + static int +@@ -457,7 +458,7 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout + (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || + (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { + av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); +- return AVERROR(EAGAIN); ++ return AVERROR(ENOSPC); + } + + // Timeout kludged s.t. "forever" eventually gives up & produces logging +@@ -864,7 +865,7 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) + int rv; + + if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) +- return rv; ++ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC + + return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); + } +@@ -938,6 +939,7 @@ void ff_v4l2_context_release(V4L2Context* ctx) + av_buffer_unref(&ctx->frames_ref); + + ff_mutex_destroy(&ctx->lock); ++ pthread_cond_destroy(&ctx->cond); + } + + +@@ -1013,6 +1015,7 @@ int ff_v4l2_context_init(V4L2Context* ctx) + } + + ff_mutex_init(&ctx->lock, NULL); ++ pthread_cond_init(&ctx->cond, NULL); + atomic_init(&ctx->q_count, 0); + + if (s->output_drm) { +diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h +index 565858a1ed..0efff58f18 100644 +--- a/libavcodec/v4l2_context.h ++++ b/libavcodec/v4l2_context.h +@@ -116,6 +116,7 @@ typedef struct V4L2Context { + struct ff_weak_link_master *wl_master; + + AVMutex lock; ++ pthread_cond_t cond; + } V4L2Context; + + /** +@@ -182,6 +183,8 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); + * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) + * + * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. 
++ * AVERROR(ENOSPC) if no buffer availible to put ++ * the frame in + */ + int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); + +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index f1923bb26d..9a20447030 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -105,6 +105,8 @@ typedef struct V4L2m2mContext { + + /* Frame tracking */ + xlat_track_t xlat; ++ int pending_hw; ++ int pending_n; + + pts_stats_t pts_stat; + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index f39f3e7ee2..ab3584238d 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -251,7 +251,8 @@ xlat_pts_out(AVCodecContext *const avctx, + + frame->best_effort_timestamp = pts_stats_guess(ps); + frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? +- av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts); ++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", ++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); + return 0; + } + +@@ -422,6 +423,36 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const + return ret; + } + ++static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) ++{ ++ int rv = 0; ++ ++ ff_mutex_lock(&ctx->lock); ++ ++ while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) { ++ if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) { ++ rv = AVERROR(errno); ++ av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv)); ++ break; ++ } ++ } ++ ++ ff_mutex_unlock(&ctx->lock); ++ return rv; ++} ++ ++// Number of frames over what xlat_pending returns that we keep *16 ++// This is a min value - if it appears to be too small the threshold should ++// adjust dynamically. 
++#define PENDING_HW_MIN (3 * 16) ++// Offset to use when setting dynamically ++// Set to %16 == 15 to avoid the threshold changing immediately as we relax ++#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1) ++// Number of consecutive times we've failed to get a frame when we prefer it ++// before we increase the prefer threshold (5ms * N = max expected decode ++// time) ++#define PENDING_N_THRESHOLD 6 ++ + static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + { + V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; +@@ -431,7 +462,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + + do { + const int pending = xlat_pending(&s->xlat); +- const int prefer_dq = (pending > 5); ++ const int prefer_dq = (pending > s->pending_hw / 16); + + // Enqueue another pkt for decode if + // (a) We don't have a lot of stuff in the buffer already OR +@@ -465,6 +496,27 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + // there is room in the input Q and timeout == -1 + dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + ++ // Failure due to no buffer in Q? 
++ if (dst_rv == AVERROR(ENOSPC)) { ++ // Wait & retry ++ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); ++ } ++ } ++ ++ // Adjust dynamic pending threshold ++ if (dst_rv == 0) { ++ if (--s->pending_hw < PENDING_HW_MIN) ++ s->pending_hw = PENDING_HW_MIN; ++ s->pending_n = 0; ++ } ++ else if (dst_rv == AVERROR(EAGAIN)) { ++ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { ++ s->pending_hw = pending * 16 + PENDING_HW_OFFSET; ++ s->pending_n = 0; ++ } ++ } ++ + if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { + av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); + dst_rv = AVERROR_EOF; +@@ -613,6 +665,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + + xlat_init(&s->xlat); + pts_stats_init(&s->pts_stat, avctx, "decoder"); ++ s->pending_hw = PENDING_HW_MIN; + + capture = &s->capture; + output = &s->output; + +From 123466d58bdd4529fea70395f8201c78bfc5bc8a Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Fri, 25 Mar 2022 15:37:58 +0000 +Subject: [PATCH 043/113] v4l2_m2m2_dec: Avoid loop if unable to resize buffers + +If source change signals a buffer size that cannot be honored give up +rather than looping indefinitely. This happens on Pi if (say) a +2560x1440 h264 stream is presented to the decode. 
+--- + libavcodec/v4l2_context.c | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 1aff16c1de..e4c848d6da 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -205,8 +205,9 @@ static int do_source_change(V4L2m2mContext * const s) + + s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); + +- av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d, reinit=%d\n", ++ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n", + s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den, ++ s->capture.width, s->capture.height, + s->capture.selection.width, s->capture.selection.height, + s->capture.selection.left, s->capture.selection.top, reinit); + +@@ -224,9 +225,17 @@ static int do_source_change(V4L2m2mContext * const s) + return AVERROR(EINVAL); + } + ++ if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) || ++ s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) { ++ av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n", ++ s->capture.width, s->capture.height, ++ ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format)); ++ return AVERROR(EINVAL); ++ } ++ + // Update pixel format - should only actually do something on initial change + s->capture.av_pix_fmt = +- ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); ++ ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); + if (s->output_drm) { + avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; + avctx->sw_pix_fmt = s->capture.av_pix_fmt; + +From 38f2aeca1d38db698686a41f87147d97e18e88db Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Fri, 25 Mar 2022 18:14:40 +0000 +Subject: [PATCH 044/113] v4l2dec: Improve size/format validation on init 
+ +--- + libavcodec/v4l2_m2m_dec.c | 84 ++++++++++++++++++++++++++++++++-- + libavcodec/v4l2_request_hevc.c | 11 +++++ + 2 files changed, 92 insertions(+), 3 deletions(-) + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index ab3584238d..f598072b94 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -592,6 +592,76 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + } + #endif + ++static int ++check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ unsigned int i; ++ const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format); ++ const uint32_t w = avctx->coded_width; ++ const uint32_t h = avctx->coded_height; ++ ++ if (w == 0 || h == 0 || fcc == 0) { ++ av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc)); ++ return 0; ++ } ++ ++ for (i = 0;; ++i) { ++ struct v4l2_frmsizeenum fs = { ++ .index = i, ++ .pixel_format = fcc, ++ }; ++ ++ while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) { ++ const int err = AVERROR(errno); ++ if (err == AVERROR(EINTR)) ++ continue; ++ if (i == 0 && err == AVERROR(ENOTTY)) { ++ av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n"); ++ return 0; ++ } ++ if (err != AVERROR(EINVAL)) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err)); ++ return err; ++ } ++ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in frame size enums\n", ++ w, h, av_fourcc2str(fcc)); ++ return err; ++ } ++ ++ switch (fs.type) { ++ case V4L2_FRMSIZE_TYPE_DISCRETE: ++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i, ++ fs.discrete.width,fs.discrete.height); ++ if (w == fs.discrete.width && h == fs.discrete.height) ++ return 0; ++ break; ++ case V4L2_FRMSIZE_TYPE_STEPWISE: ++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, ++ fs.stepwise.min_width, fs.stepwise.min_height, ++ 
fs.stepwise.max_width, fs.stepwise.max_height, ++ fs.stepwise.step_width,fs.stepwise.step_height); ++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && ++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height && ++ (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 && ++ (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0) ++ return 0; ++ break; ++ case V4L2_FRMSIZE_TYPE_CONTINUOUS: ++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, ++ fs.stepwise.min_width, fs.stepwise.min_height, ++ fs.stepwise.max_width, fs.stepwise.max_height, ++ fs.stepwise.step_width,fs.stepwise.step_height); ++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && ++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height) ++ return 0; ++ break; ++ default: ++ av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d", fs.type); ++ return AVERROR(EINVAL); + } + } +} + -+static void check_loop_filter(void) + static int + get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) + { +@@ -698,8 +768,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + + avctx->sw_pix_fmt = avctx->pix_fmt; + gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); +- av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n", +- avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); ++ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", ++ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), ++ avctx->coded_width, avctx->coded_height, ++ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); + + if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { + avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; +@@ -730,7 +802,13 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + if ((ret = v4l2_prepare_decoder(s)) < 0) + return ret; + +- return get_quirks(avctx, s); 
++ if ((ret = get_quirks(avctx, s)) != 0) ++ return ret; ++ ++ if ((ret = check_size(avctx, s)) != 0) ++ return ret; ++ ++ return 0; + } + + static av_cold int v4l2_decode_close(AVCodecContext *avctx) +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +index b0a5930844..76ab0916cd 100644 +--- a/libavcodec/v4l2_request_hevc.c ++++ b/libavcodec/v4l2_request_hevc.c +@@ -147,6 +147,17 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) + + av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); + ++ // Give up immediately if this is something that we have no code to deal with ++ if (h->ps.sps->chroma_format_idc != 1) { ++ av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc); ++ return AVERROR_PATCHWELCOME; ++ } ++ if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) || ++ h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) { ++ av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma); ++ return AVERROR_PATCHWELCOME; ++ } ++ + if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) { + av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n"); + return (AVERROR(-ret)); + +From 1c64aa3f328d594db17b90d837c2435b5ab1f460 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 13 Apr 2022 16:05:56 +0000 +Subject: [PATCH 045/113] v4l2 stateless hevc: Add another API variation for + linux 5.18 + +This is probably going to be a short lived variation and may end up +being reverted if no release using it ever ends up in the wild. 
+--- + libavcodec/Makefile | 2 +- + libavcodec/hevc-ctrls-v3.h | 255 +++++++++++++++++++++++++++++++++ + libavcodec/v4l2_req_hevc_v3.c | 3 + + libavcodec/v4l2_req_hevc_vx.c | 17 +++ + libavcodec/v4l2_req_media.c | 15 +- + libavcodec/v4l2_req_media.h | 3 + + libavcodec/v4l2_request_hevc.c | 6 +- + libavcodec/v4l2_request_hevc.h | 1 + + 8 files changed, 295 insertions(+), 7 deletions(-) + create mode 100644 libavcodec/hevc-ctrls-v3.h + create mode 100644 libavcodec/v4l2_req_hevc_v3.c + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index a40c46bf93..09962a810b 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -976,7 +976,7 @@ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o + OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o + OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ +- v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o ++ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o + OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o + OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o + OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o +diff --git a/libavcodec/hevc-ctrls-v3.h b/libavcodec/hevc-ctrls-v3.h +new file mode 100644 +index 0000000000..4e35bd583d +--- /dev/null ++++ b/libavcodec/hevc-ctrls-v3.h +@@ -0,0 +1,255 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the HEVC state controls for use with stateless HEVC ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _HEVC_CTRLS_H_ ++#define _HEVC_CTRLS_H_ ++ ++#include ++ ++/* The pixel format isn't stable at the moment and will likely be renamed. 
*/ ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++ ++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) ++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) ++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 ++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 ++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 ++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 ++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 ++ ++enum v4l2_mpeg_video_hevc_decode_mode { ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_mpeg_video_hevc_start_code { ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) ++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/* The controls are not stable at the moment and will likely be reworked. 
*/ ++struct v4l2_ctrl_hevc_sps { ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ __u8 sps_max_sub_layers_minus1; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define 
V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) ++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) ++ ++struct v4l2_ctrl_hevc_pps { ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ ++ __u8 num_extra_slice_header_bits; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ ++ __u8 padding[4]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 flags; ++ __u8 field_pic; ++ __u16 pic_order_cnt[2]; ++ __u8 padding[2]; ++}; ++ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 padding[6]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define 
V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __u16 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ __u8 padding[5]; ++ ++ __u32 entry_point_offset_minus1[256]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 ++ ++struct v4l2_ctrl_hevc_decode_params { ++ __s32 pic_order_cnt_val; ++ __u8 num_active_dpb_entries; ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 num_poc_st_curr_before; ++ __u8 num_poc_st_curr_after; ++ __u8 num_poc_lt_curr; ++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u64 flags; ++}; ++ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ ++#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) ++/* ++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - ++ * the number of data (in bits) to skip in the ++ * slice segment header. ++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" ++ * to before syntax element "slice_temporal_mvp_enabled_flag". ++ * If IDR, the skipped bits are just "pic_output_flag" ++ * (separate_colour_plane_flag is not supported). 
++ */ ++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) ++ ++#endif +diff --git a/libavcodec/v4l2_req_hevc_v3.c b/libavcodec/v4l2_req_hevc_v3.c +new file mode 100644 +index 0000000000..dcc8d95632 +--- /dev/null ++++ b/libavcodec/v4l2_req_hevc_v3.c +@@ -0,0 +1,3 @@ ++#define HEVC_CTRLS_VERSION 3 ++#include "v4l2_req_hevc_vx.c" ++ +diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c +index 0ae03b10c4..611fa21cc3 100644 +--- a/libavcodec/v4l2_req_hevc_vx.c ++++ b/libavcodec/v4l2_req_hevc_vx.c +@@ -16,6 +16,8 @@ + + #elif HEVC_CTRLS_VERSION == 2 + #include "hevc-ctrls-v2.h" ++#elif HEVC_CTRLS_VERSION == 3 ++#include "hevc-ctrls-v3.h" + #else + #error Unknown HEVC_CTRLS_VERSION + #endif +@@ -147,6 +149,7 @@ static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_t + } + } + ++#if HEVC_CTRLS_VERSION <= 2 + static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) + { + const HEVCFrame *frame; +@@ -172,6 +175,7 @@ static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) + + return 0; + } ++#endif + + static unsigned int + get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, +@@ -247,7 +251,12 @@ fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const + struct v4l2_hevc_dpb_entry * const entry = entries + n++; + + entry->timestamp = frame_capture_dpb(frame->frame); ++#if HEVC_CTRLS_VERSION <= 2 + entry->rps = find_frame_rps_type(h, entry->timestamp); ++#else ++ entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 : ++ V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE; ++#endif + entry->field_pic = frame->frame->interlaced_frame; + + /* TODO: Interleaved: Get the POC for each field. 
*/ +@@ -1011,6 +1020,14 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + }; + const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc); + ++#if HEVC_CTRLS_VERSION == 2 ++ if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0)) ++ return AVERROR(EINVAL); ++#elif HEVC_CTRLS_VERSION == 3 ++ if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0)) ++ return AVERROR(EINVAL); ++#endif ++ + if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) { + av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION); + return AVERROR(EINVAL); +diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c +index eb00ecb406..980b306b8a 100644 +--- a/libavcodec/v4l2_req_media.c ++++ b/libavcodec/v4l2_req_media.c +@@ -604,6 +604,7 @@ struct mediabufs_ctl { + + struct v4l2_format src_fmt; + struct v4l2_format dst_fmt; ++ struct v4l2_capability capability; + }; + + static int qe_v4l2_queue(struct qent_base *const be, +@@ -1498,20 +1499,24 @@ void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc) + mediabufs_ctl_delete(mbc); + } + ++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc) +{ -+ /* Deblocking filter buffers are big enough to hold a 16x16 block, -+ * plus 16 columns left and 4 rows above to hold filter inputs -+ * (depending on whether v or h neighbouring block edge, oversized -+ * horizontally to maintain 16-byte alignment) plus 16 columns and -+ * 4 rows below to catch write overflows */ -+ LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]); -+ LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]); ++ return mbc->capability.version; ++} + -+ VC1DSPContext h; + static int set_capabilities(struct mediabufs_ctl *const mbc) + { +- struct v4l2_capability capability = { 0 }; + uint32_t caps; + +- if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &capability)) { ++ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) { + int err = errno; + 
request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err)); + return -err; + } + +- caps = (capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? +- capability.device_caps : +- capability.capabilities; ++ caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? ++ mbc->capability.device_caps : ++ mbc->capability.capabilities; + + if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) { + mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; +diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h +index 2f826cfb14..0307a831de 100644 +--- a/libavcodec/v4l2_req_media.h ++++ b/libavcodec/v4l2_req_media.h +@@ -142,6 +142,9 @@ MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw, + struct dmabufs_ctl * const dbsc, + unsigned int n); + ++#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c)) ++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc); + -+ const test tests[] = { -+ VC1DSP_TEST(vc1_v_loop_filter4) -+ VC1DSP_TEST(vc1_h_loop_filter4) -+ VC1DSP_TEST(vc1_v_loop_filter8) -+ VC1DSP_TEST(vc1_h_loop_filter8) -+ VC1DSP_TEST(vc1_v_loop_filter16) -+ VC1DSP_TEST(vc1_h_loop_filter16) + struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, + const char *vpath, struct pollqueue *const pq); + void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc); +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +index 76ab0916cd..20e4e0ab15 100644 +--- a/libavcodec/v4l2_request_hevc.c ++++ b/libavcodec/v4l2_request_hevc.c +@@ -210,7 +210,11 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) + goto fail4; + } + +- if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { ++ if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); ++ ctx->fns = &V2(ff_v4l2_req_hevc, 3); ++ } ++ else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 
probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 2); + } +diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h +index f14f594564..ed48d62e2d 100644 +--- a/libavcodec/v4l2_request_hevc.h ++++ b/libavcodec/v4l2_request_hevc.h +@@ -98,5 +98,6 @@ typedef struct v4l2_req_decode_fns { + + extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); + extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); ++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3); + + #endif + +From 920f91ffbcd3b8f4834ff6d903df7212a958486d Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 3 May 2022 12:44:42 +0000 +Subject: [PATCH 046/113] Remove V4l2 frame size check for meson-vdec + +--- + libavcodec/v4l2_m2m.h | 3 ++- + libavcodec/v4l2_m2m_dec.c | 10 +++++++--- + 2 files changed, 9 insertions(+), 4 deletions(-) + +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 9a20447030..6bd5e8eda7 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -116,7 +116,8 @@ typedef struct V4L2m2mContext { + /* Ext data sent */ + int extdata_sent; + +-#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 ++#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 ++#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2 + /* Quirks */ + unsigned int quirks; + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index f598072b94..8328a78930 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -604,6 +604,10 @@ check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) + av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc)); + return 0; + } ++ if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) { ++ av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc)); ++ return 0; ++ } + + for (i = 0;; ++i) { + struct v4l2_frmsizeenum fs = { +@@ -623,8 +627,8 @@ check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) + av_log(avctx, 
AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err)); + return err; + } +- av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in frame size enums\n", +- w, h, av_fourcc2str(fcc)); ++ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n", ++ w, h, av_fourcc2str(fcc), i); + return err; + } + +@@ -684,7 +688,7 @@ get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) + // capture to clear the event even if the capture buffers were the right + // size in the first place. + if (strcmp(cap.driver, "meson-vdec") == 0) +- s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS; ++ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN; + + av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks); + return 0; + +From 6d0e620abd7501c0c365e6f94b8937b55e7adf8f Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 23 May 2022 18:05:20 +0100 +Subject: [PATCH 047/113] v4l2m2m_dec: Make some error rturns a bit more robust + +--- + libavcodec/v4l2_context.c | 5 ++--- + libavcodec/v4l2_m2m_dec.c | 23 ++++++++++++++--------- + 2 files changed, 16 insertions(+), 12 deletions(-) + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index e4c848d6da..c0d257e5d3 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -765,7 +765,7 @@ static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) + int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) + { + int type = ctx->type; +- int ret; ++ int ret = 0; + AVCodecContext * const avctx = logger(ctx); + + // Avoid doing anything if there is nothing we can do +@@ -777,8 +777,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) + if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) + stuff_all_buffers(avctx, ctx); + +- ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); +- if (ret < 0) { ++ if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) { + const int err = 
errno; + av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name, + cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err); +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 8328a78930..00d2d46d5d 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -110,16 +110,21 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co + return 0; + + ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); +- if (ret < 0) +- av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n"); +- +- ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); +- if (ret < 0) +- av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); +- else +- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n"); ++ if (ret != 0) { ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret)); ++ return ret; ++ } + +- return ret; ++ // STREAMON should do implicit START so this just for those that don't. ++ // It is optional so don't worry if it fails ++ if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) { ++ ret = AVERROR(errno); ++ av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret)); ++ } ++ else { ++ av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n"); ++ } ++ return 0; + } + + static int v4l2_try_start(AVCodecContext *avctx) + +From e1a8efe55dd299dfecd9c0ada3b497d7c6f50cd8 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 24 May 2022 17:02:58 +0000 +Subject: [PATCH 048/113] v4l2m2m_dec: Support in-pkt AV_PKT_DATA_NEW_EXTRADATA + +Support packet side-data containing AV_PKT_DATA_NEW_EXTRADATA. Should +also detect and complain about unexpected streams of empty packets. + +This functionality untested as I haven't yet found anything that creates +NEW_EXTRADATA side data. 
+--- + libavcodec/v4l2_m2m.c | 1 + + libavcodec/v4l2_m2m.h | 3 +++ + libavcodec/v4l2_m2m_dec.c | 49 ++++++++++++++++++++++++++++++++++++--- + 3 files changed, 50 insertions(+), 3 deletions(-) + +diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c +index 5b38ad3598..728932fadc 100644 +--- a/libavcodec/v4l2_m2m.c ++++ b/libavcodec/v4l2_m2m.c +@@ -249,6 +249,7 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context) + av_frame_unref(s->frame); + av_frame_free(&s->frame); + av_packet_unref(&s->buf_pkt); ++ av_freep(&s->extdata_data); + + av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n"); + +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 6bd5e8eda7..19d618698d 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -115,6 +115,9 @@ typedef struct V4L2m2mContext { + + /* Ext data sent */ + int extdata_sent; ++ /* Ext data sent in packet - overrides ctx */ ++ uint8_t * extdata_data; ++ size_t extdata_size; + + #define FF_V4L2_QUIRK_REINIT_ALWAYS 1 + #define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2 +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 00d2d46d5d..2f20fc9ad8 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -343,7 +343,46 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const + // We will already have a coded pkt if the output Q was full last time we + // tried to Q it + if (!s->buf_pkt.size && !do_not_get) { +- ret = ff_decode_get_packet(avctx, &s->buf_pkt); ++ unsigned int i; ++ ++ for (i = 0; i < 256; ++i) { ++ uint8_t * side_data; ++ size_t side_size; ++ ++ ret = ff_decode_get_packet(avctx, &s->buf_pkt); ++ if (ret != 0) ++ break; ++ ++ // New extradata is the only side-data we undertand ++ side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); ++ if (side_data) { ++ av_log(avctx, AV_LOG_DEBUG, "New extradata\n"); ++ av_freep(&s->extdata_data); ++ if ((s->extdata_data = 
av_malloc(side_size ? side_size : 1)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %zd bytes of extra data\n", side_size); ++ return AVERROR(ENOMEM); ++ } ++ memcpy(s->extdata_data, side_data, side_size); ++ s->extdata_size = side_size; ++ s->extdata_sent = 0; ++ } ++ ++ if (s->buf_pkt.size != 0) ++ break; ++ ++ if (s->buf_pkt.side_data_elems == 0) { ++ av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n"); ++ ret = AVERROR_EOF; ++ break; ++ } ++ ++ // Retry a side-data only pkt ++ } ++ // If i >= 256 something has gone wrong ++ if (i >= 256) { ++ av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n"); ++ return AVERROR(EIO); ++ } + + if (ret == AVERROR(EAGAIN)) { + if (!stream_started(s)) { +@@ -398,8 +437,12 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const + if ((ret = check_output_streamon(avctx, s)) != 0) + return ret; + +- ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, +- avctx->extradata, s->extdata_sent ? 
0 : avctx->extradata_size); ++ if (s->extdata_sent) ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); ++ else if (s->extdata_data) ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); ++ else ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size); + + if (ret == AVERROR(EAGAIN)) { + // Out of input buffers - keep packet + +From 0797fef446b16f03214043e93e7dadaace02e6e6 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 24 May 2022 20:02:48 +0000 +Subject: [PATCH 049/113] v4l2m2m_dec: Catch repeated Q fulls + +--- + libavcodec/v4l2_m2m_dec.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 2f20fc9ad8..4765fe0d5e 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -504,13 +504,14 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) + static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + { + V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; +- int src_rv; ++ int src_rv = NQ_OK; + int dst_rv = 1; // Non-zero (done), non-negative (error) number + unsigned int i = 0; + + do { + const int pending = xlat_pending(&s->xlat); + const int prefer_dq = (pending > s->pending_hw / 16); ++ const int last_src_rv = src_rv; + + // Enqueue another pkt for decode if + // (a) We don't have a lot of stuff in the buffer already OR +@@ -526,6 +527,11 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) + break; + ++ if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) { ++ av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n"); ++ break; ++ } ++ + // Try to get a new frame if + // (a) we haven't already got one AND + // (b) enqueue returned a status indicating that decode should be attempted + +From 
8c842b220b9d55ff37f7149f198d6dfba532558f Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 25 May 2022 15:22:12 +0000 +Subject: [PATCH 050/113] Remove requirement for epoxy & libudev config options + +--- + configure | 26 +++++++++++++++++--------- + pi-util/conf_native.sh | 2 -- + 2 files changed, 17 insertions(+), 11 deletions(-) + +diff --git a/configure b/configure +index 8424d451fb..a8e7ee5dab 100755 +--- a/configure ++++ b/configure +@@ -205,6 +205,7 @@ External library support: + --disable-bzlib disable bzlib [autodetect] + --disable-coreimage disable Apple CoreImage framework [autodetect] + --enable-chromaprint enable audio fingerprinting with chromaprint [no] ++ --disable-epoxy disable epoxy [autodetect] + --enable-frei0r enable frei0r video filtering [no] + --enable-gcrypt enable gcrypt, needed for rtmp(t)e support + if openssl, librtmp or gmp is not used [no] +@@ -281,7 +282,7 @@ External library support: + if openssl, gnutls or mbedtls is not used [no] + --enable-libtwolame enable MP2 encoding via libtwolame [no] + --enable-libuavs3d enable AVS3 decoding via libuavs3d [no] +- --enable-libudev enable libudev [no] ++ --disable-libudev disable libudev [autodetect] + --enable-libv4l2 enable libv4l2/v4l-utils [no] + --enable-libvidstab enable video stabilization using vid.stab [no] + --enable-libvmaf enable vmaf filter via libvmaf [no] +@@ -1759,7 +1760,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST=" + avfoundation + bzlib + coreimage ++ epoxy + iconv ++ libudev + libxcb + libxcb_shm + libxcb_shape +@@ -1831,7 +1834,6 @@ EXTERNAL_LIBRARY_LIST=" + libdav1d + libdc1394 + libdrm +- epoxy + libflite + libfontconfig + libfreetype +@@ -1875,7 +1877,6 @@ EXTERNAL_LIBRARY_LIST=" + libtheora + libtwolame + libuavs3d +- libudev + libv4l2 + libvmaf + libvorbis +@@ -3557,9 +3558,8 @@ v4l2_indev_suggest="libv4l2" + v4l2_outdev_deps="libdrm" + v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" + v4l2_outdev_suggest="libv4l2" +-vout_drm_outdev_deps="libdrm vout_drm" 
+-vout_egl_outdev_deps="xlib" +-vout_egl_outdev_select="epoxy" ++vout_drm_outdev_deps="libdrm" ++vout_egl_outdev_deps="xlib epoxy" + vfwcap_indev_deps="vfw32 vfwcap_defines" + xcbgrab_indev_deps="libxcb" + xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" +@@ -6316,6 +6316,12 @@ if enabled xlib; then + disable xlib + fi + ++enabled libudev && ++ check_pkg_config libudev libudev libudev.h udev_new ++ ++enabled epoxy && ++ check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version ++ + check_headers direct.h + check_headers dirent.h + check_headers dxgidebug.h +@@ -6561,7 +6567,6 @@ enabled libdav1d && require_pkg_config libdav1d "dav1d >= 0.5.0" "dav1d + enabled libdavs2 && require_pkg_config libdavs2 "davs2 >= 1.6.0" davs2.h davs2_decoder_open + enabled libdc1394 && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new + enabled libdrm && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion +-enabled epoxy && require_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version + enabled libfdk_aac && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen || + { require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac && + warn "using libfdk without pkg-config"; } } +@@ -6655,7 +6660,6 @@ enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame + { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || + die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; } + enabled libuavs3d && require_pkg_config libuavs3d "uavs3d >= 1.1.41" uavs3d.h uavs3d_decode +-enabled libudev && require_pkg_config libudev libudev libudev.h udev_new + enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl + enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit + enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 2.0.0" libvmaf.h vmaf_init +@@ -6760,9 +6764,13 @@ enabled rkmpp && { require_pkg_config 
rkmpp rockchip_mpp rockchip/r + enabled v4l2_request && { enabled libdrm || + die "ERROR: v4l2-request requires --enable-libdrm"; } && + { enabled libudev || +- die "ERROR: v4l2-request requires --enable-libudev"; } ++ die "ERROR: v4l2-request requires libudev"; } + enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init + ++enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; } ++ ++enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } && ++ { enabled xlib || die "ERROR: vout_egl requires xlib"; } + + if enabled gcrypt; then + GCRYPT_CONFIG="${cross_prefix}libgcrypt-config" +diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh +index 65576846e8..37cea71756 100755 +--- a/pi-util/conf_native.sh ++++ b/pi-util/conf_native.sh +@@ -91,8 +91,6 @@ $FFSRC/configure \ + --disable-thumb\ + --enable-v4l2-request\ + --enable-libdrm\ +- --enable-epoxy\ +- --enable-libudev\ + --enable-vout-egl\ + --enable-vout-drm\ + $SHARED_LIBS\ + +From c01515bd877dcdcdef86babbe93bdd847236d886 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Fri, 27 May 2022 09:36:51 +0000 +Subject: [PATCH 051/113] hevc: If hwaccel avoid creation of s/w only vars + +--- + libavcodec/hevc_refs.c | 35 +++++++++++++++++++++-------------- + libavcodec/hevcdec.c | 42 +++++++++++++++++++++++++++++------------- + 2 files changed, 50 insertions(+), 27 deletions(-) + +diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c +index 6a70c817b0..829943f910 100644 +--- a/libavcodec/hevc_refs.c ++++ b/libavcodec/hevc_refs.c +@@ -98,18 +98,22 @@ static HEVCFrame *alloc_frame(HEVCContext *s) + if (!frame->rpl_buf) + goto fail; + +- frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); +- if (!frame->tab_mvf_buf) +- goto fail; +- frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; ++ if (s->tab_mvf_pool) { ++ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); ++ if (!frame->tab_mvf_buf) ++ goto 
fail; ++ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; ++ } + +- frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); +- if (!frame->rpl_tab_buf) +- goto fail; +- frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; +- frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; +- for (j = 0; j < frame->ctb_count; j++) +- frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; ++ if (s->rpl_tab_pool) { ++ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); ++ if (!frame->rpl_tab_buf) ++ goto fail; ++ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; ++ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; ++ for (j = 0; j < frame->ctb_count; j++) ++ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; ++ } + + frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; + frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); +@@ -284,14 +288,17 @@ static int init_slice_rpl(HEVCContext *s) + int ctb_count = frame->ctb_count; + int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; + int i; ++ RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; + + if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab)) + return AVERROR_INVALIDDATA; + +- for (i = ctb_addr_ts; i < ctb_count; i++) +- frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; ++ if (frame->rpl_tab) { ++ for (i = ctb_addr_ts; i < ctb_count; i++) ++ frame->rpl_tab[i] = tab; ++ } + +- frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts]; ++ frame->refPicList = tab->refPicList; + + return 0; + } +diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c +index a2c43b888b..cd13de2603 100644 +--- a/libavcodec/hevcdec.c ++++ b/libavcodec/hevcdec.c +@@ -524,6 +524,16 @@ static int 
set_sps(HEVCContext *s, const HEVCSPS *sps, + if (!sps) + return 0; + ++ // If hwaccel then we don't need all the s/w decode helper arrays ++ if (s->avctx->hwaccel) { ++ export_stream_params(s, sps); ++ ++ s->avctx->pix_fmt = pix_fmt; ++ s->ps.sps = sps; ++ s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; ++ return 0; ++ } ++ + ret = pic_arrays_init(s, sps); + if (ret < 0) + goto fail; +@@ -3028,11 +3038,13 @@ static int hevc_frame_start(HEVCContext *s) + ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1); + int ret; + +- memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); +- memset(s->vertical_bs, 0, s->bs_width * s->bs_height); +- memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); +- memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); +- memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); ++ if (s->horizontal_bs) { ++ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); ++ memset(s->vertical_bs, 0, s->bs_width * s->bs_height); ++ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); ++ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); ++ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); ++ } + + s->is_decoded = 0; + s->first_nal_type = s->nal_unit_type; +@@ -3580,15 +3592,19 @@ static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src) + dst->needs_fg = 1; + } + +- dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); +- if (!dst->tab_mvf_buf) +- goto fail; +- dst->tab_mvf = src->tab_mvf; ++ if (src->tab_mvf_buf) { ++ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); ++ if (!dst->tab_mvf_buf) ++ goto fail; ++ dst->tab_mvf = src->tab_mvf; ++ } + +- dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); +- if (!dst->rpl_tab_buf) +- goto fail; +- dst->rpl_tab = src->rpl_tab; ++ if (src->rpl_tab_buf) { ++ dst->rpl_tab_buf = 
av_buffer_ref(src->rpl_tab_buf); ++ if (!dst->rpl_tab_buf) ++ goto fail; ++ dst->rpl_tab = src->rpl_tab; ++ } + + dst->rpl_buf = av_buffer_ref(src->rpl_buf); + if (!dst->rpl_buf) + +From 0d7f7c86342a306c9d547c97e9602537ffa3cd0f Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 30 May 2022 17:51:44 +0100 +Subject: [PATCH 052/113] rpi_sand: Add SAND30->NV12 conversion + +C code only. Reworks the hwcontext_drm conversion to use the +rpi_sand_fns generic frame convert fn rather than calling the +individual conversion functions directly. This keeps all teh stride and +size logic in a single place. +--- + libavutil/hwcontext_drm.c | 46 ++++++++------------ + libavutil/rpi_sand_fns.c | 89 +++++++++++++++++++++++++++++++++++++++ + libavutil/rpi_sand_fns.h | 5 +++ + 3 files changed, 111 insertions(+), 29 deletions(-) + +diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c +index baf18920fa..137a952d2c 100644 +--- a/libavutil/hwcontext_drm.c ++++ b/libavutil/hwcontext_drm.c +@@ -234,14 +234,14 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, + enum AVHWFrameTransferDirection dir, + enum AVPixelFormat **formats) + { +- enum AVPixelFormat *pix_fmts; ++ enum AVPixelFormat *p; + +- pix_fmts = av_malloc_array(2, sizeof(*pix_fmts)); +- if (!pix_fmts) ++ p = *formats = av_malloc_array(3, sizeof(*p)); ++ if (!p) + return AVERROR(ENOMEM); + + // **** Offer native sand too ???? +- pix_fmts[0] = ++ *p++ = + #if CONFIG_SAND + ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? 
+ AV_PIX_FMT_YUV420P : +@@ -249,9 +249,14 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, + AV_PIX_FMT_YUV420P10LE : + #endif + ctx->sw_format; +- pix_fmts[1] = AV_PIX_FMT_NONE; + +- *formats = pix_fmts; ++#if CONFIG_SAND ++ if (ctx->sw_format == AV_PIX_FMT_RPI4_10 || ++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128) ++ *p++ = AV_PIX_FMT_NV12; ++#endif ++ ++ *p = AV_PIX_FMT_NONE; + return 0; + } + +@@ -294,29 +299,12 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, + const unsigned int w = FFMIN(dst->width, map->width); + const unsigned int h = FFMIN(dst->height, map->height); + +- if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) { +- av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], +- map->data[0], +- 128, stride2, +- 0, 0, w, h); +- av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], +- dst->data[2], dst->linesize[2], +- map->data[1], +- 128, stride2, +- 0, 0, w / 2, h / 2); +- } +- else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) { +- av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], +- map->data[0], +- 128, stride2, +- 0, 0, w, h); +- av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], +- dst->data[2], dst->linesize[2], +- map->data[1], +- 128, stride2, +- 0, 0, w / 2, h / 2); +- } +- else ++ map->crop_top = 0; ++ map->crop_bottom = 0; ++ map->crop_left = 0; ++ map->crop_right = 0; ++ ++ if (av_rpi_sand_to_planar_frame(dst, map) != 0) + { + av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); + err = AVERROR(EINVAL); +diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c +index 1f543e9357..256c3d532f 100644 +--- a/libavutil/rpi_sand_fns.c ++++ b/libavutil/rpi_sand_fns.c +@@ -229,6 +229,75 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_ + } + } + ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// single 
lose bottom 2 bits truncation ++// _x & _w in pixels, strides in bytes ++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word ++ const unsigned int xskip0 = _x - (x0 >> 2) * 3; ++ const unsigned int x1 = ((_x + _w) / 3) * 4; ++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; ++ const unsigned int mask = stride1 - 1; ++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; ++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words ++ ++#if HAVE_SAND_ASM && 0 ++ if (_x == 0) { ++ ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if (x0 == x1) { ++ // ******************* ++ // Partial single word xfer ++ return; ++ } ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) ++ { ++ unsigned int x = x0; ++ const uint32_t * p = (const uint32_t *)p0; ++ uint8_t * d = dst; ++ ++ if (xskip0 != 0) { ++ const uint32_t p3 = *p++; ++ ++ if (xskip0 == 1) ++ *d++ = (p3 >> 12) & 0xff; ++ *d++ = (p3 >> 22) & 0xff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ while (x != x1) { ++ const uint32_t p3 = *p++; ++ *d++ = (p3 >> 2) & 0xff; ++ *d++ = (p3 >> 12) & 0xff; ++ *d++ = (p3 >> 22) & 0xff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ if (xrem1 != 0) { ++ const uint32_t p3 = *p; ++ ++ *d++ = (p3 >> 2) & 0xff; ++ if (xrem1 == 2) ++ *d++ = (p3 >> 12) & 0xff; ++ } ++ } ++} ++ ++ + + // w/h in pixels + void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, +@@ -310,6 +379,16 @@ int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) + av_rpi_sand_frame_stride1(src), 
av_rpi_sand_frame_stride2(src), + x/2, y/2, w/2, h/2); + break; ++ case AV_PIX_FMT_NV12: ++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w, h/2); ++ break; + default: + return -1; + } +@@ -344,6 +423,16 @@ int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x/2, y/2, w/2, h/2); + break; ++ case AV_PIX_FMT_NV12: ++ av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w, h/2); ++ break; + default: + return -1; + } +diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h +index 634b55e800..462ccb8abd 100644 +--- a/libavutil/rpi_sand_fns.h ++++ b/libavutil/rpi_sand_fns.h +@@ -85,6 +85,11 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_ + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + ++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); + + // w/h in pixels + void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, + +From bc6fdb0674a89cb9f83538beff47cf54e78a4502 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 1 Jun 2022 17:49:26 +0000 +Subject: [PATCH 053/113] rpi_sand: Add SAND30->NV12 asm for Armv7 & Armv8 + +Also reworks the previous Armv8 SAND30->Y16 function 
in a slightly more +efficient way that makes it look more like the Armv7 version. +--- + libavutil/aarch64/rpi_sand_neon.S | 549 ++++++++++++++++++------------ + libavutil/aarch64/rpi_sand_neon.h | 4 + + libavutil/arm/rpi_sand_neon.S | 239 ++++++++++--- + libavutil/arm/rpi_sand_neon.h | 11 + + libavutil/rpi_sand_fns.c | 2 +- + 5 files changed, 541 insertions(+), 264 deletions(-) + +diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S +index cdcf71ee67..2f07d9674c 100644 +--- a/libavutil/aarch64/rpi_sand_neon.S ++++ b/libavutil/aarch64/rpi_sand_neon.S +@@ -248,228 +248,6 @@ incomplete_block_loop_end_c8: + ret + endfunc + +-//void ff_rpi_sand30_lines_to_planar_y16( +-// uint8_t * dest, // [x0] +-// unsigned int dst_stride, // [w1] -> assumed to be equal to _w +-// const uint8_t * src, // [x2] +-// unsigned int src_stride1, // [w3] -> 128 +-// unsigned int src_stride2, // [w4] +-// unsigned int _x, // [w5] +-// unsigned int y, // [w6] +-// unsigned int _w, // [w7] +-// unsigned int h); // [sp, #0] +- +-function ff_rpi_sand30_lines_to_planar_y16, export=1 +- stp x19, x20, [sp, #-48]! 
+- stp x21, x22, [sp, #16] +- stp x23, x24, [sp, #32] +- +- // w6 = argument h +- ldr w6, [sp, #48] +- +- // slice_inc = ((stride2 - 1) * stride1) +- mov w5, w4 +- sub w5, w5, #1 +- lsl w5, w5, #7 +- +- // total number of bytes per row = (width / 3) * 4 +- mov w8, w7 +- mov w9, #3 +- udiv w8, w8, w9 +- lsl w8, w8, #2 +- +- // number of full 128 byte blocks to be processed +- mov w9, #96 +- udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96 +- +- // w10 = number of full integers to process (4 bytes) +- // w11 = remaning zero to two 10bit values still to copy over +- mov w12, #96 +- mul w12, w9, w12 +- sub w12, w7, w12 // width - blocks*96 = remaining points per row +- mov w11, #3 +- udiv w10, w12, w11 // full integers to process = w12 / 3 +- mul w11, w10, w11 // #integers *3 +- sub w11, w12, w11 // remaining 0-2 points = remaining points - integers*3 +- +- // increase w9 by one if w10+w11 is not zero, and decrease the row count by one +- // this is to efficiently copy incomplete blocks at the end of the rows +- // the last row is handled explicitly to avoid writing out of bounds +- add w22, w10, w11 +- cmp w22, #0 +- cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise +- add w9, w9, w22 +- sub w6, w6, #1 +- +- // store the number of bytes in w20 which we copy too much for every row +- // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values) +- mov w20, #96*2 +- mul w20, w20, w9 +- sub w20, w1, w20 +- +- mov w23, #0 // flag to check whether the last line had already been processed +- +- // bitmask to clear the uppper 6bits of the result values +- mov x19, #0x03ff03ff03ff03ff +- dup v22.2d, x19 +- +- // row counter = 0 +- eor w12, w12, w12 +-row_loop_y16: +- cmp w12, w6 // jump to row_loop_y16_fin if we processed all rows +- bge row_loop_y16_fin +- +- mov x13, x2 // row src +- eor w14, w14, w14 // full block counter +-block_loop_y16: +- cmp w14, w9 +- bge block_loop_y16_fin +- +- // load 64 bytes +- ld1 { v0.4s, v1.4s, v2.4s, v3.4s 
}, [x13], #64 +- +- // process v0 and v1 +- xtn v16.4h, v0.4s +- ushr v0.4s, v0.4s, #10 +- xtn v17.4h, v0.4s +- ushr v0.4s, v0.4s, #10 +- xtn v18.4h, v0.4s +- +- xtn2 v16.8h, v1.4s +- and v16.16b, v16.16b, v22.16b +- ushr v1.4s, v1.4s, #10 +- xtn2 v17.8h, v1.4s +- and v17.16b, v17.16b, v22.16b +- ushr v1.4s, v1.4s, #10 +- xtn2 v18.8h, v1.4s +- and v18.16b, v18.16b, v22.16b +- +- st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 +- +- // process v2 and v3 +- xtn v23.4h, v2.4s +- ushr v2.4s, v2.4s, #10 +- xtn v24.4h, v2.4s +- ushr v2.4s, v2.4s, #10 +- xtn v25.4h, v2.4s +- +- xtn2 v23.8h, v3.4s +- and v23.16b, v23.16b, v22.16b +- ushr v3.4s, v3.4s, #10 +- xtn2 v24.8h, v3.4s +- and v24.16b, v24.16b, v22.16b +- ushr v3.4s, v3.4s, #10 +- xtn2 v25.8h, v3.4s +- and v25.16b, v25.16b, v22.16b +- +- st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 +- +- // load the second half of the block -> 64 bytes into registers v4-v7 +- ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13], #64 +- +- // process v4 and v5 +- xtn v16.4h, v4.4s +- ushr v4.4s, v4.4s, #10 +- xtn v17.4h, v4.4s +- ushr v4.4s, v4.4s, #10 +- xtn v18.4h, v4.4s +- +- xtn2 v16.8h, v5.4s +- and v16.16b, v16.16b, v22.16b +- ushr v5.4s, v5.4s, #10 +- xtn2 v17.8h, v5.4s +- and v17.16b, v17.16b, v22.16b +- ushr v5.4s, v5.4s, #10 +- xtn2 v18.8h, v5.4s +- and v18.16b, v18.16b, v22.16b +- +- st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 +- +- // v6 and v7 +- xtn v23.4h, v6.4s +- ushr v6.4s, v6.4s, #10 +- xtn v24.4h, v6.4s +- ushr v6.4s, v6.4s, #10 +- xtn v25.4h, v6.4s +- +- xtn2 v23.8h, v7.4s +- and v23.16b, v23.16b, v22.16b +- ushr v7.4s, v7.4s, #10 +- xtn2 v24.8h, v7.4s +- and v24.16b, v24.16b, v22.16b +- ushr v7.4s, v7.4s, #10 +- xtn2 v25.8h, v7.4s +- and v25.16b, v25.16b, v22.16b +- +- st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 +- +- add x13, x13, x5 // row src += slice_inc +- add w14, w14, #1 +- b block_loop_y16 +-block_loop_y16_fin: +- +- +- +- +- add x2, x2, #128 // src += stride1 (start of the next row) +- add x0, x0, w20, sxtw // subtract the 
bytes we copied too much from dst +- add w12, w12, #1 +- b row_loop_y16 +-row_loop_y16_fin: +- +- // check whether we have incomplete blocks at the end of every row +- // in that case decrease row block count by one +- // change height back to it's original value (meaning increase it by 1) +- // and jump back to another iteration of row_loop_y16 +- +- cmp w23, #1 +- beq row_loop_y16_fin2 // don't continue here if we already processed the last row +- add w6, w6, #1 // increase height to the original value +- sub w9, w9, w22 // block count - 1 or 0, depending on the remaining bytes count +- mov w23, #1 +- b row_loop_y16 +-row_loop_y16_fin2: +- +- sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr to far ahead, therefore readd the diference +- +- // now we've got to handle the last block in the last row +- eor w12, w12, w12 // w12 = 0 = counter +-integer_loop_y16: +- cmp w12, w10 +- bge integer_loop_y16_fin +- ldr w14, [x13], #4 +- and w15, w14, #0x3ff +- strh w15, [x0], #2 +- lsr w14, w14, #10 +- and w15, w14, #0x3ff +- strh w15, [x0], #2 +- lsr w14, w14, #10 +- and w15, w14, #0x3ff +- strh w15, [x0], #2 +- add w12, w12, #1 +- b integer_loop_y16 +-integer_loop_y16_fin: +- +-final_values_y16: +- // remaining point count = w11 +- ldr w14, [x13], #4 +- cmp w11, #0 +- beq final_values_y16_fin +- and w15, w14, #0x3ff +- strh w15, [x0], #2 +- cmp w11, #1 +- beq final_values_y16_fin +- lsr w14, w14, #10 +- and w15, w14, #0x3ff +- strh w15, [x0], #2 +-final_values_y16_fin: +- +- ldp x23, x24, [sp, #32] +- ldp x21, x22, [sp, #16] +- ldp x19, x20, [sp], #48 +- ret +-endfunc +- + //void ff_rpi_sand30_lines_to_planar_c16( + // uint8_t * dst_u, // [x0] + // unsigned int dst_stride_u, // [w1] == _w*2 +@@ -674,3 +452,330 @@ endfunc + // unsigned int _w, + // unsigned int h); + ++// void ff_rpi_sand30_lines_to_planar_y8( ++// uint8_t * dest, : x0 ++// unsigned int dst_stride, : w1 ++// const uint8_t * src, : x2 ++// unsigned int src_stride1, : w3, always 
128 ++// unsigned int src_stride2, : w4 ++// unsigned int _x, : w5 ++// unsigned int y, : w6 ++// unsigned int _w, : w7 ++// unsigned int h); : [sp, #0] ++// ++// Assumes that we are starting on a stripe boundary and that overreading ++// within the stripe is OK. However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_y16, export=1 ++ lsl w4, w4, #7 ++ sub w4, w4, #64 ++ sub w1, w1, w7, lsl #1 ++ uxtw x6, w6 ++ add x8, x2, x6, lsl #7 ++ ldr w6, [sp, #0] ++ ++10: ++ mov x2, x8 ++ mov w5, w7 ++1: ++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 ++ ++ subs w5, w5, #96 ++ ++ // v0, v1 ++ ++ shrn v18.4h, v0.4s, #14 ++ xtn v16.4h, v0.4s ++ shrn v17.4h, v0.4s, #10 ++ ++ shrn2 v18.8h, v1.4s, #14 ++ xtn2 v16.8h, v1.4s ++ shrn2 v17.8h, v1.4s, #10 ++ ++ ushr v18.8h, v18.8h, #6 ++ bic v16.8h, #0xfc, lsl #8 ++ bic v17.8h, #0xfc, lsl #8 ++ ++ // v2, v3 ++ ++ shrn v21.4h, v2.4s, #14 ++ xtn v19.4h, v2.4s ++ shrn v20.4h, v2.4s, #10 ++ ++ shrn2 v21.8h, v3.4s, #14 ++ xtn2 v19.8h, v3.4s ++ shrn2 v20.8h, v3.4s, #10 ++ ++ ushr v21.8h, v21.8h, #6 ++ bic v19.8h, #0xfc, lsl #8 ++ bic v20.8h, #0xfc, lsl #8 ++ ++ // v4, v5 ++ ++ shrn v24.4h, v4.4s, #14 ++ xtn v22.4h, v4.4s ++ shrn v23.4h, v4.4s, #10 ++ ++ shrn2 v24.8h, v5.4s, #14 ++ xtn2 v22.8h, v5.4s ++ shrn2 v23.8h, v5.4s, #10 ++ ++ ushr v24.8h, v24.8h, #6 ++ bic v22.8h, #0xfc, lsl #8 ++ bic v23.8h, #0xfc, lsl #8 ++ ++ // v6, v7 ++ ++ shrn v27.4h, v6.4s, #14 ++ xtn v25.4h, v6.4s ++ shrn v26.4h, v6.4s, #10 ++ ++ shrn2 v27.8h, v7.4s, #14 ++ xtn2 v25.8h, v7.4s ++ shrn2 v26.8h, v7.4s, #10 ++ ++ ushr v27.8h, v27.8h, #6 ++ bic v25.8h, #0xfc, lsl #8 ++ bic v26.8h, #0xfc, lsl #8 ++ ++ blt 2f ++ ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 ++ st3 {v22.8h, v23.8h, v24.8h}, [x0], #48 ++ st3 {v25.8h, v26.8h, v27.8h}, [x0], #48 ++ ++ bne 1b ++ ++11: ++ subs w6, w6, #1 ++ add x0, x0, w1, uxtw ++ add x8, x8, #128 ++ bne 10b ++ ++ ret ++
++// Partial final write ++2: ++ cmp w5, #48-96 ++ blt 1f ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48 ++ beq 11b ++ mov v16.16b, v22.16b ++ mov v17.16b, v23.16b ++ sub w5, w5, #48 ++ mov v18.16b, v24.16b ++ mov v19.16b, v25.16b ++ mov v20.16b, v26.16b ++ mov v21.16b, v27.16b ++1: ++ cmp w5, #24-96 ++ blt 1f ++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48 ++ beq 11b ++ mov v16.16b, v19.16b ++ mov v17.16b, v20.16b ++ sub w5, w5, #24 ++ mov v18.16b, v21.16b ++1: ++ cmp w5, #12-96 ++ blt 1f ++ st3 {v16.4h, v17.4h, v18.4h}, [x0], #24 ++ beq 11b ++ mov v16.2d[0], v16.2d[1] ++ sub w5, w5, #12 ++ mov v17.2d[0], v17.2d[1] ++ mov v18.2d[0], v18.2d[1] ++1: ++ cmp w5, #6-96 ++ blt 1f ++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 ++ st3 {v16.h, v17.h, v18.h}[1], [x0], #6 ++ beq 11b ++ mov v16.2s[0], v16.2s[1] ++ sub w5, w5, #6 ++ mov v17.2s[0], v17.2s[1] ++ mov v18.2s[0], v18.2s[1] ++1: ++ cmp w5, #3-96 ++ blt 1f ++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6 ++ beq 11b ++ mov v16.4h[0], v16.4h[1] ++ sub w5, w5, #3 ++ mov v17.4h[0], v17.4h[1] ++1: ++ cmp w5, #2-96 ++ blt 1f ++ st2 {v16.h, v17.h}[0], [x0], #4 ++ b 11b ++1: ++ st1 {v16.h}[0], [x0], #2 ++ b 11b ++ ++endfunc ++ ++// void ff_rpi_sand30_lines_to_planar_y8( ++// uint8_t * dest, : x0 ++// unsigned int dst_stride, : w1 ++// const uint8_t * src, : x2 ++// unsigned int src_stride1, : w3, always 128 ++// unsigned int src_stride2, : w4 ++// unsigned int _x, : w5 ++// unsigned int y, : w6 ++// unsigned int _w, : w7 ++// unsigned int h); : [sp, #0] ++// ++// Assumes that we are starting on a stripe boundary and that overreading ++// within the stripe is OK. 
However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_y8, export=1 ++ lsl w4, w4, #7 ++ sub w4, w4, #64 ++ sub w1, w1, w7 ++ uxtw x6, w6 ++ add x8, x2, x6, lsl #7 ++ ldr w6, [sp, #0] ++ ++10: ++ mov x2, x8 ++ mov w5, w7 ++1: ++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4 ++ ++ subs w5, w5, #96 ++ ++ // v0, v1 ++ ++ shrn v18.4h, v0.4s, #16 ++ xtn v16.4h, v0.4s ++ shrn v17.4h, v0.4s, #12 ++ ++ shrn2 v18.8h, v1.4s, #16 ++ xtn2 v16.8h, v1.4s ++ shrn2 v17.8h, v1.4s, #12 ++ ++ shrn v18.8b, v18.8h, #6 ++ shrn v16.8b, v16.8h, #2 ++ xtn v17.8b, v17.8h ++ ++ // v2, v3 ++ ++ shrn v21.4h, v2.4s, #16 ++ xtn v19.4h, v2.4s ++ shrn v20.4h, v2.4s, #12 ++ ++ shrn2 v21.8h, v3.4s, #16 ++ xtn2 v19.8h, v3.4s ++ shrn2 v20.8h, v3.4s, #12 ++ ++ shrn2 v18.16b, v21.8h, #6 ++ shrn2 v16.16b, v19.8h, #2 ++ xtn2 v17.16b, v20.8h ++ ++ // v4, v5 ++ ++ shrn v24.4h, v4.4s, #16 ++ xtn v22.4h, v4.4s ++ shrn v23.4h, v4.4s, #12 ++ ++ shrn2 v24.8h, v5.4s, #16 ++ xtn2 v22.8h, v5.4s ++ shrn2 v23.8h, v5.4s, #12 ++ ++ shrn v21.8b, v24.8h, #6 ++ shrn v19.8b, v22.8h, #2 ++ xtn v20.8b, v23.8h ++ ++ // v6, v7 ++ ++ shrn v27.4h, v6.4s, #16 ++ xtn v25.4h, v6.4s ++ shrn v26.4h, v6.4s, #12 ++ ++ shrn2 v27.8h, v7.4s, #16 ++ xtn2 v25.8h, v7.4s ++ shrn2 v26.8h, v7.4s, #12 ++ ++ shrn2 v21.16b, v27.8h, #6 ++ shrn2 v19.16b, v25.8h, #2 ++ xtn2 v20.16b, v26.8h ++ ++ blt 2f ++ ++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 ++ st3 {v19.16b, v20.16b, v21.16b}, [x0], #48 ++ ++ bne 1b ++ ++11: ++ subs w6, w6, #1 ++ add x0, x0, w1, uxtw ++ add x8, x8, #128 ++ bne 10b ++ ++ ret ++ ++// Partial final write ++2: ++ cmp w5, #48-96 ++ blt 1f ++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48 ++ beq 11b ++ mov v16.16b, v19.16b // pixels 48-95 were packed into v19-v21 above ++ mov v17.16b, v20.16b ++ sub w5, w5, #48 ++ mov v18.16b, v21.16b ++1: ++ cmp w5, #24-96 ++ blt 1f ++ st3 {v16.8b, v17.8b, v18.8b}, [x0], #24 ++ beq 11b ++ mov v16.2d[0], v16.2d[1] ++ sub w5, w5, #24 ++ mov v17.2d[0], v17.2d[1] ++ mov
v18.2d[0], v18.2d[1] ++1: ++ cmp w5, #12-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[2], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[3], [x0], #3 ++ beq 11b ++ mov v16.2s[0], v16.2s[1] ++ sub w5, w5, #12 ++ mov v17.2s[0], v17.2s[1] ++ mov v18.2s[0], v18.2s[1] ++1: ++ cmp w5, #6-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3 ++ beq 11b ++ mov v16.4h[0], v16.4h[1] ++ sub w5, w5, #6 ++ mov v17.4h[0], v17.4h[1] ++ mov v18.4h[0], v18.4h[1] ++1: ++ cmp w5, #3-96 ++ blt 1f ++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3 ++ beq 11b ++ mov v16.8b[0], v16.8b[1] ++ sub w5, w5, #3 ++ mov v17.8b[0], v17.8b[1] ++1: ++ cmp w5, #2-96 ++ blt 1f ++ st2 {v16.b, v17.b}[0], [x0], #2 ++ b 11b ++1: ++ st1 {v16.b}[0], [x0], #1 ++ b 11b ++ ++endfunc ++ +diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h +index b3aa481ea4..2a56135bc3 100644 +--- a/libavutil/aarch64/rpi_sand_neon.h ++++ b/libavutil/aarch64/rpi_sand_neon.h +@@ -49,6 +49,10 @@ void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_ + uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1, + unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); + ++void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, ++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, ++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); ++ + #ifdef __cplusplus + } + #endif +diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S +index 80890fe985..60e697f681 100644 +--- a/libavutil/arm/rpi_sand_neon.S ++++ b/libavutil/arm/rpi_sand_neon.S +@@ -360,7 +360,6 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1 + ldr r6, [sp, #36] + ldr r7, [sp, #32] @ y + mov r12, #48 +- vmov.u16 q15, #0x3ff + sub r3, #1 + lsl r3, #7 + sub r1, r1, r6, lsl 
#1 +@@ -376,37 +375,33 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1 + vldm r2!, {q10-q13} + add lr, #64 + +- vshr.u32 q14, q10, #20 @ Cannot vshrn.u32 #20! ++ vshrn.u32 d4 , q10, #14 @ Cannot vshrn.u32 #20! + ands lr, #127 + vshrn.u32 d2, q10, #10 + vmovn.u32 d0, q10 +- vmovn.u32 d4, q14 + +- vshr.u32 q14, q11, #20 ++ vshrn.u32 d5, q11, #14 + it eq + addeq r2, r3 + vshrn.u32 d3, q11, #10 + vmovn.u32 d1, q11 +- vmovn.u32 d5, q14 + + subs r5, #48 +- vand q0, q15 +- vand q1, q15 +- vand q2, q15 ++ vshr.u16 q2, #6 ++ vbic.u16 q0, #0xfc00 ++ vbic.u16 q1, #0xfc00 + +- vshr.u32 q14, q12, #20 ++ vshrn.u32 d20, q12, #14 + vshrn.u32 d18, q12, #10 + vmovn.u32 d16, q12 +- vmovn.u32 d20, q14 + +- vshr.u32 q14, q13, #20 ++ vshrn.u32 d21, q13, #14 + vshrn.u32 d19, q13, #10 + vmovn.u32 d17, q13 +- vmovn.u32 d21, q14 + +- vand q8, q15 +- vand q9, q15 +- vand q10, q15 ++ vshr.u16 q10, #6 ++ vbic.u16 q8, #0xfc00 ++ vbic.u16 q9 , #0xfc00 + blt 2f + + vst3.16 {d0, d2, d4}, [r0], r12 +@@ -499,7 +494,6 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1 + ldr r7, [sp, #48] + ldr r9, [sp, #52] + mov r12, #48 +- vmov.u16 q15, #0x3ff + sub r8, #1 + lsl r8, #7 + add r5, r5, r7, lsl #7 +@@ -515,48 +509,44 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1 + add lr, #64 + + @ N.B. 
unpack [0,1,2] -> (reg order) 1, 0, 2 +- vshr.u32 q14, q0, #20 +- vshrn.u32 d16, q0, #10 ++ vshrn.u32 d20, q0, #14 + vmovn.u32 d18, q0 ++ vshrn.u32 d0, q0, #10 + ands lr, #127 +- vmovn.u32 d20, q14 + +- vshr.u32 q14, q1, #20 +- vshrn.u32 d17, q1, #10 ++ vshrn.u32 d21, q1, #14 + vmovn.u32 d19, q1 +- vmovn.u32 d21, q14 ++ vshrn.u32 d1, q1, #10 + +- vshr.u32 q14, q2, #20 + vshrn.u32 d22, q2, #10 +- vmovn.u32 d24, q2 +- vmovn.u32 d26, q14 ++ vmovn.u32 d2, q2 ++ vshrn.u32 d4, q2, #14 + +- vshr.u32 q14, q3, #20 +- vshrn.u32 d23, q3, #10 +- vmovn.u32 d25, q3 + add r10, r0, #24 +- vmovn.u32 d27, q14 ++ vshrn.u32 d23, q3, #10 ++ vmovn.u32 d3, q3 ++ vshrn.u32 d5, q3, #14 + + it eq + addeq r4, r8 +- vuzp.16 q8, q11 +- vuzp.16 q9, q12 +- vuzp.16 q10, q13 ++ vuzp.16 q0, q11 ++ vuzp.16 q9, q1 ++ vuzp.16 q10, q2 + +- @ q8 V0, V3,.. -> q0 ++ @ q0 V0, V3,.. + @ q9 U0, U3... + @ q10 U1, U4... + @ q11 U2, U5,.. +- @ q12 V1, V4,.. -> q1 +- @ q13 V2, V5,.. -> q2 ++ @ q1 V1, V4, ++ @ q2 V2, V5,.. + + subs r6, #24 +- vand q11, q15 +- vand q9, q15 +- vand q10, q15 +- vand q0, q8, q15 +- vand q1, q12, q15 +- vand q2, q13, q15 ++ vbic.u16 q11, #0xfc00 ++ vbic.u16 q9, #0xfc00 ++ vshr.u16 q10, #6 ++ vshr.u16 q2, #6 ++ vbic.u16 q0, #0xfc00 ++ vbic.u16 q1, #0xfc00 + + blt 2f + +@@ -765,4 +755,171 @@ function ff_rpi_sand30_lines_to_planar_p010, export=1 + endfunc + + ++@ void ff_rpi_sand30_lines_to_planar_y8( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. 
However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_y8, export=1 ++ push {r4-r8, lr} @ +24 ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ mov r12, #48 ++ lsl r3, #7 ++ sub r1, r1, r6 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++1: ++ vldm r2, {q8-q15} ++ ++ subs r5, #96 ++ ++ vmovn.u32 d0, q8 ++ vshrn.u32 d2, q8, #12 ++ vshrn.u32 d4, q8, #16 @ Cannot vshrn.u32 #20! ++ ++ add r2, r3 ++ ++ vmovn.u32 d1, q9 ++ vshrn.u32 d3, q9, #12 ++ vshrn.u32 d5, q9, #16 ++ ++ pld [r2, #0] ++ ++ vshrn.u16 d0, q0, #2 ++ vmovn.u16 d1, q1 ++ vshrn.u16 d2, q2, #6 ++ ++ vmovn.u32 d16, q10 ++ vshrn.u32 d18, q10, #12 ++ vshrn.u32 d20, q10, #16 ++ ++ vmovn.u32 d17, q11 ++ vshrn.u32 d19, q11, #12 ++ vshrn.u32 d21, q11, #16 ++ ++ pld [r2, #64] ++ ++ vshrn.u16 d4, q8, #2 ++ vmovn.u16 d5, q9 ++ vshrn.u16 d6, q10, #6 ++ ++ vmovn.u32 d16, q12 ++ vshrn.u32 d18, q12, #12 ++ vshrn.u32 d20, q12, #16 ++ ++ vmovn.u32 d17, q13 ++ vshrn.u32 d19, q13, #12 ++ vshrn.u32 d21, q13, #16 ++ ++ vshrn.u16 d16, q8, #2 ++ vmovn.u16 d17, q9 ++ vshrn.u16 d18, q10, #6 ++ ++ vmovn.u32 d20, q14 ++ vshrn.u32 d22, q14, #12 ++ vshrn.u32 d24, q14, #16 ++ ++ vmovn.u32 d21, q15 ++ vshrn.u32 d23, q15, #12 ++ vshrn.u32 d25, q15, #16 ++ ++ vshrn.u16 d20, q10, #2 ++ vmovn.u16 d21, q11 ++ vshrn.u16 d22, q12, #6 ++ ++ blt 2f ++ ++ vst3.8 {d0, d1, d2}, [r0], r12 ++ vst3.8 {d4, d5, d6}, [r4], r12 ++ vst3.8 {d16, d17, d18}, [r0], r12 ++ vst3.8 {d20, d21, d22}, [r4], r12 ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #48-96 ++ blt 1f ++ vst3.8 {d0, d1, d2}, [r0], r12 ++ vst3.8 {d4, d5, d6}, [r4], r12 ++ beq 11b ++ vmov q0, q8 ++ vmov q2, q10 ++ sub r5, #48 ++ vmov d2, d18 ++ vmov d6, d22 ++1: ++ cmp r5, #24-96 ++ blt 1f ++ vst3.8 {d0, d1, d2}, [r0]!
++ beq 11b ++ vmov q0, q2 ++ sub r5, #24 ++ vmov d2, d6 ++1: ++ cmp r5, #12-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! ++ vst3.8 {d0[2], d1[2], d2[2]}, [r0]! ++ vst3.8 {d0[3], d1[3], d2[3]}, [r0]! ++ beq 11b ++ vmov s0, s1 ++ sub r5, #12 ++ vmov s2, s3 ++ vmov s4, s5 ++1: ++ cmp r5, #6-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]! ++ @ No extra r0 adjustment: the two post-indexed stores already advanced it by 6 ++ beq 11b ++ vshr.u32 d0, #16 ++ sub r5, #6 ++ vshr.u32 d1, #16 ++ vshr.u32 d2, #16 ++1: ++ cmp r5, #3-96 ++ blt 1f ++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! ++ beq 11b ++ sub r5, #3 ++ vshr.u32 d0, #8 ++ vshr.u32 d1, #8 ++1: ++ cmp r5, #2-96 ++ blt 1f ++ vst2.8 {d0[0], d1[0]}, [r0]! ++ b 11b ++1: ++ vst1.8 {d0[0]}, [r0]! ++ b 11b ++ ++endfunc ++ + +diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h +index 447f367bea..d457c10870 100644 +--- a/libavutil/arm/rpi_sand_neon.h ++++ b/libavutil/arm/rpi_sand_neon.h +@@ -95,5 +95,16 @@ void ff_rpi_sand30_lines_to_planar_p010( + unsigned int _w, // [sp, #12] -> r6 (cur r5) + unsigned int h); // [sp, #16] -> r7 + ++void ff_rpi_sand30_lines_to_planar_y8( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, #8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ + #endif // AVUTIL_ARM_SAND_NEON_H + +diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c +index 256c3d532f..b6071e2928 100644 +--- a/libavutil/rpi_sand_fns.c ++++ b/libavutil/rpi_sand_fns.c +@@ -247,7 +247,7 @@ void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, + const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; + const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; //
RHS of a stripe to LHS of next in words + +-#if HAVE_SAND_ASM && 0 ++#if HAVE_SAND_ASM + if (_x == 0) { + ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); + return; + +From 34163b056f9425a5e75440dc045459e451aafc0b Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 7 Jun 2022 14:46:12 +0000 +Subject: [PATCH 054/113] v4l2_m2m_enc: Add the ability to encode DRM_PRIME + frames + +--- + libavcodec/v4l2_buffers.c | 100 +++++++++++--- + libavcodec/v4l2_buffers.h | 20 ++- + libavcodec/v4l2_context.c | 212 +++++++++++++++++++++++++--- + libavcodec/v4l2_context.h | 15 +- + libavcodec/v4l2_m2m.c | 37 +++-- + libavcodec/v4l2_m2m.h | 3 + + libavcodec/v4l2_m2m_dec.c | 171 ++++++----------------- + libavcodec/v4l2_m2m_enc.c | 283 +++++++++++++++++++++++++++++++++++++- + 8 files changed, 643 insertions(+), 198 deletions(-) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 8c4f18dbed..9ef2f40e39 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -29,6 +29,8 @@ + #include + #include + #include "libavcodec/avcodec.h" ++#include "libavcodec/internal.h" ++#include "libavutil/avassert.h" + #include "libavutil/pixdesc.h" + #include "libavutil/hwcontext.h" + #include "v4l2_context.h" +@@ -60,27 +62,39 @@ static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf) + return tb.num && tb.den ? tb : v4l2_timebase; + } + ++static inline struct timeval tv_from_int(const int64_t t) ++{ ++ return (struct timeval){ ++ .tv_usec = t % USEC_PER_SEC, ++ .tv_sec = t / USEC_PER_SEC ++ }; ++} ++ ++static inline int64_t int_from_tv(const struct timeval t) ++{ ++ return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec; ++} ++ + static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts) + { + /* convert pts to v4l2 timebase */ + const int64_t v4l2_pts = +- out->context->no_pts_rescale ? pts : + pts == AV_NOPTS_VALUE ? 
0 : + av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); +- out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; +- out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; ++ out->buf.timestamp = tv_from_int(v4l2_pts); + } + + static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf) + { ++ const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp); ++ return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE; ++#if 0 + /* convert pts back to encoder timebase */ +- const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + +- avbuf->buf.timestamp.tv_usec; +- + return + avbuf->context->no_pts_rescale ? v4l2_pts : + v4l2_pts == 0 ? AV_NOPTS_VALUE : + av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); ++#endif + } + + static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) +@@ -435,7 +449,7 @@ static void v4l2_free_bufref(void *opaque, uint8_t *data) + + ff_mutex_lock(&ctx->lock); + +- avbuf->status = V4L2BUF_AVAILABLE; ++ ff_v4l2_buffer_set_avail(avbuf); + + if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) { + av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name); +@@ -599,6 +613,38 @@ static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes) + return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); + } + ++static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out) ++{ ++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; ++ ++ if (frame->format != AV_PIX_FMT_DRM_PRIME || !src) ++ return AVERROR(EINVAL); ++ ++ av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { ++ // Only currently cope with single buffer types ++ if (out->buf.length != 1) ++ return AVERROR_PATCHWELCOME; ++ if (src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ out->planes[0].m.fd = src->objects[0].fd; ++ } ++ else { ++ if (src->nb_objects != 1) ++ 
return AVERROR(EINVAL); ++ ++ out->buf.m.fd = src->objects[0].fd; ++ } ++ ++ // No need to copy src AVDescriptor and if we did then we may confuse ++ // fd close on free ++ out->ref_buf = av_buffer_ref(frame->buf[0]); ++ ++ return 0; ++} ++ + static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + { + int i; +@@ -678,7 +724,7 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + * + ******************************************************************************/ + +-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) ++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts) + { + out->buf.flags = frame->key_frame ? + (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : +@@ -688,10 +734,15 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) + v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); + v4l2_set_color_range(out, frame->color_range); + // PTS & interlace are buffer vars +- v4l2_set_pts(out, frame->pts); ++ if (track_ts) ++ out->buf.timestamp = tv_from_int(track_ts); ++ else ++ v4l2_set_pts(out, frame->pts); + v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); + +- return v4l2_buffer_swframe_to_buf(frame, out); ++ return frame->format == AV_PIX_FMT_DRM_PRIME ? ++ v4l2_buffer_primeframe_to_buf(frame, out) : ++ v4l2_buffer_swframe_to_buf(frame, out); + } + + int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) +@@ -754,6 +805,7 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) + + pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? 
avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; + pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset; ++ pkt->flags = 0; + + if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) + pkt->flags |= AV_PKT_FLAG_KEY; +@@ -768,8 +820,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) + return 0; + } + +-int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, +- const void *extdata, size_t extlen) ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, ++ const void *extdata, size_t extlen, ++ const int64_t timestamp) + { + int ret; + +@@ -783,7 +836,10 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, + if (ret && ret != AVERROR(ENOMEM)) + return ret; + +- v4l2_set_pts(out, pkt->pts); ++ if (timestamp) ++ out->buf.timestamp = tv_from_int(timestamp); ++ else ++ v4l2_set_pts(out, pkt->pts); + + out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ? + (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : +@@ -794,7 +850,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, + + int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) + { +- return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0); ++ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); + } + + +@@ -814,13 +870,15 @@ static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) + close(avbuf->drm_frame.objects[i].fd); + } + ++ av_buffer_unref(&avbuf->ref_buf); ++ + ff_weak_link_unref(&avbuf->context_wl); + + av_free(avbuf); + } + + +-int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx) ++int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem) + { + int ret, i; + V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); +@@ -837,7 +895,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct + } + + avbuf->context = ctx; +- 
avbuf->buf.memory = V4L2_MEMORY_MMAP; ++ avbuf->buf.memory = mem; + avbuf->buf.type = ctx->type; + avbuf->buf.index = index; + +@@ -867,6 +925,8 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct + avbuf->num_planes = 1; + + for (i = 0; i < avbuf->num_planes; i++) { ++ const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP && ++ (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm); + + avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? + ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline : +@@ -875,21 +935,17 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; + +- if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || +- !buf_to_m2mctx(avbuf)->output_drm) { ++ if (want_mmap) + avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, + PROT_READ | PROT_WRITE, MAP_SHARED, + buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); +- } + } else { + avbuf->plane_info[i].length = avbuf->buf.length; + +- if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || +- !buf_to_m2mctx(avbuf)->output_drm) { ++ if (want_mmap) + avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, + PROT_READ | PROT_WRITE, MAP_SHARED, + buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); +- } + } + + if (avbuf->plane_info[i].mm_addr == MAP_FAILED) { +diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h +index 3b7ca4d99e..1ac32c5989 100644 +--- a/libavcodec/v4l2_buffers.h ++++ b/libavcodec/v4l2_buffers.h +@@ -59,6 +59,10 @@ typedef struct V4L2Buffer { + + /* DRM descriptor */ + AVDRMFrameDescriptor drm_frame; ++ /* For DRM_PRIME encode - need to keep a ref to the source buffer till we ++ * are done ++ */ ++ AVBufferRef * ref_buf; + + /* keep track of the mmap address and mmap length */ + struct 
V4L2Plane_info { +@@ -110,8 +114,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); + */ + int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); + +-int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, +- const void *extdata, size_t extlen); ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out, ++ const void *extdata, size_t extlen, ++ const int64_t timestamp); + + /** + * Extracts the data from an AVFrame to a V4L2Buffer +@@ -121,7 +126,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, + * + * @returns 0 in case of success, a negative AVERROR code otherwise + */ +-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); ++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts); + + /** + * Initializes a V4L2Buffer +@@ -131,7 +136,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); + * + * @returns 0 in case of success, a negative AVERROR code otherwise + */ +-int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx); ++int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem); + + /** + * Enqueues a V4L2Buffer +@@ -142,5 +147,12 @@ int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context + */ + int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf); + ++static inline void ++ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf) ++{ ++ avbuf->status = V4L2BUF_AVAILABLE; ++ av_buffer_unref(&avbuf->ref_buf); ++} ++ + + #endif // AVCODEC_V4L2_BUFFERS_H +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index c0d257e5d3..3ed5234be4 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -43,6 +43,160 @@ struct v4l2_format_update { + int update_avfmt; + }; + ++ ++static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) ++{ 
++ return (int64_t)n; ++} ++ ++static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) ++{ ++ return (unsigned int)pts; ++} ++ ++// FFmpeg requires us to propagate a number of vars from the coded pkt into ++// the decoded frame. The only thing that tracks like that in V4L2 stateful ++// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no ++// guarantees about PTS being unique or specified for every frame so replace ++// the supplied PTS with a simple incrementing number and keep a circular ++// buffer of all the things we want preserved (including the original PTS) ++// indexed by the tracking no. ++static int64_t ++xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt) ++{ ++ int64_t track_pts; ++ ++ // Avoid 0 ++ if (++x->track_no == 0) ++ x->track_no = 1; ++ ++ track_pts = track_to_pts(avctx, x->track_no); ++ ++ av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); ++ x->last_pkt_dts = avpkt->dts; ++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ ++ .discard = 0, ++ .pending = 1, ++ .pkt_size = avpkt->size, ++ .pts = avpkt->pts, ++ .dts = avpkt->dts, ++ .reordered_opaque = avctx->reordered_opaque, ++ .pkt_pos = avpkt->pos, ++ .pkt_duration = avpkt->duration, ++ .track_pts = track_pts ++ }; ++ return track_pts; ++} ++ ++static int64_t ++xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame) ++{ ++ int64_t track_pts; ++ ++ // Avoid 0 ++ if (++x->track_no == 0) ++ x->track_no = 1; ++ ++ track_pts = track_to_pts(avctx, x->track_no); ++ ++ av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no); ++ x->last_pkt_dts = frame->pkt_dts; ++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ ++ .discard = 0, ++ .pending = 1, ++ .pkt_size = 0, ++ .pts = 
frame->pts, ++ .dts = AV_NOPTS_VALUE, ++ .reordered_opaque = frame->reordered_opaque, ++ .pkt_pos = frame->pkt_pos, ++ .pkt_duration = frame->pkt_duration, ++ .track_pts = track_pts ++ }; ++ return track_pts; ++} ++ ++ ++// Returns -1 if we should discard the frame ++static int ++xlat_pts_frame_out(AVCodecContext *const avctx, ++ xlat_track_t * const x, ++ AVFrame *const frame) ++{ ++ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; ++ V4L2m2mTrackEl *const t = x->track_els + n; ++ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) ++ { ++ av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, ++ "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ frame->pts = AV_NOPTS_VALUE; ++ frame->pkt_dts = x->last_pkt_dts; ++ frame->reordered_opaque = x->last_opaque; ++ frame->pkt_pos = -1; ++ frame->pkt_duration = 0; ++ frame->pkt_size = -1; ++ } ++ else if (!t->discard) ++ { ++ frame->pts = t->pending ? 
t->pts : AV_NOPTS_VALUE; ++ frame->pkt_dts = x->last_pkt_dts; ++ frame->reordered_opaque = t->reordered_opaque; ++ frame->pkt_pos = t->pkt_pos; ++ frame->pkt_duration = t->pkt_duration; ++ frame->pkt_size = t->pkt_size; ++ ++ x->last_opaque = x->track_els[n].reordered_opaque; ++ if (frame->pts != AV_NOPTS_VALUE) ++ x->last_pts = frame->pts; ++ t->pending = 0; ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ return -1; ++ } ++ ++ av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", ++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); ++ return 0; ++} ++ ++// Returns -1 if we should discard the frame ++static int ++xlat_pts_pkt_out(AVCodecContext *const avctx, ++ xlat_track_t * const x, ++ AVPacket *const pkt) ++{ ++ unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE; ++ V4L2m2mTrackEl *const t = x->track_els + n; ++ if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts) ++ { ++ av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, ++ "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); ++ pkt->pts = AV_NOPTS_VALUE; ++ } ++ else if (!t->discard) ++ { ++ pkt->pts = t->pending ? t->pts : AV_NOPTS_VALUE; ++ ++ x->last_opaque = x->track_els[n].reordered_opaque; ++ if (pkt->pts != AV_NOPTS_VALUE) ++ x->last_pts = pkt->pts; ++ t->pending = 0; ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts); ++ return -1; ++ } ++ ++ // * Would like something much better than this...xlat(offset + out_count)? 
++ pkt->dts = pkt->pts; ++ av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n", ++ pkt->pts, t->track_pts, n); ++ return 0; ++} ++ ++ + static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) + { + return V4L2_TYPE_IS_OUTPUT(ctx->type) ? +@@ -353,12 +507,14 @@ dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf) + atomic_fetch_sub(&ctx->q_count, 1); + + avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; +- avbuf->status = V4L2BUF_AVAILABLE; ++ ff_v4l2_buffer_set_avail(avbuf); + avbuf->buf = buf; + if (is_mp) { + memcpy(avbuf->planes, planes, sizeof(planes)); + avbuf->buf.m.planes = avbuf->planes; + } ++ // Done with any attached buffer ++ av_buffer_unref(&avbuf->ref_buf); + + if (V4L2_TYPE_IS_CAPTURE(ctx->type)) { + // Zero length cap buffer return == EOS +@@ -733,7 +889,7 @@ static void flush_all_buffers_status(V4L2Context* const ctx) + for (i = 0; i < ctx->num_buffers; ++i) { + struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; + if (buf->status == V4L2BUF_IN_DRIVER) +- buf->status = V4L2BUF_AVAILABLE; ++ ff_v4l2_buffer_set_avail(buf); + } + atomic_store(&ctx->q_count, 0); + } +@@ -787,6 +943,8 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) + { + if (cmd == VIDIOC_STREAMOFF) + flush_all_buffers_status(ctx); ++ else ++ ctx->first_buf = 1; + + ctx->streamon = (cmd == VIDIOC_STREAMON); + av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, +@@ -803,14 +961,16 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) + + int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) + { +- V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ V4L2m2mContext *const s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; ++ int64_t track_ts; + V4L2Buffer* avbuf; + int ret; + + if (!frame) { + ret = v4l2_stop_encode(ctx); + if (ret) +- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name); ++ av_log(avctx, AV_LOG_ERROR, "%s 
stop_encode\n", ctx->name); + s->draining= 1; + return 0; + } +@@ -819,7 +979,9 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) + if (!avbuf) + return AVERROR(EAGAIN); + +- ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf); ++ track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame); ++ ++ ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts); + if (ret) + return ret; + +@@ -830,14 +992,16 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, + const void * extdata, size_t extlen) + { + V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; + V4L2Buffer* avbuf; + int ret; ++ int64_t track_ts; + + if (!pkt->size) { + ret = v4l2_stop_decode(ctx); + // Log but otherwise ignore stop failure + if (ret) +- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); ++ av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); + s->draining = 1; + return 0; + } +@@ -846,7 +1010,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, + if (!avbuf) + return AVERROR(EAGAIN); + +- ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen); ++ track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt); ++ ++ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts); + if (ret == AVERROR(ENOMEM)) + av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", + __func__, pkt->size, avbuf->planes[0].length); +@@ -858,24 +1024,36 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, + + int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) + { ++ V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; + V4L2Buffer *avbuf; + int rv; + +- if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) +- return rv; ++ do { ++ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) ++ return rv; ++ if ((rv = 
ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0) ++ return rv; ++ } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0); + +- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); ++ return 0; + } + + int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) + { ++ V4L2m2mContext *s = ctx_to_m2mctx(ctx); ++ AVCodecContext *const avctx = s->avctx; + V4L2Buffer *avbuf; + int rv; + +- if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) +- return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC ++ do { ++ if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) ++ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC ++ if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0) ++ return rv; ++ } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0); + +- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); ++ return 0; + } + + int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) +@@ -951,7 +1129,7 @@ void ff_v4l2_context_release(V4L2Context* ctx) + } + + +-static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers) ++static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem) + { + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); + struct v4l2_requestbuffers req; +@@ -962,7 +1140,7 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers + + memset(&req, 0, sizeof(req)); + req.count = req_buffers; +- req.memory = V4L2_MEMORY_MMAP; ++ req.memory = mem; + req.type = ctx->type; + while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) { + if (errno != EINTR) { +@@ -986,7 +1164,7 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers + } + + for (i = 0; i < ctx->num_buffers; i++) { +- ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx); ++ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem); + if (ret) { + av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] 
initialization (%s)\n", ctx->name, i, av_err2str(ret)); + goto fail_release; +@@ -1052,7 +1230,7 @@ int ff_v4l2_context_init(V4L2Context* ctx) + goto fail_unref_hwframes; + } + +- ret = create_buffers(ctx, ctx->num_buffers); ++ ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem); + if (ret < 0) + goto fail_unref_hwframes; + +diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h +index 0efff58f18..21265f1bd7 100644 +--- a/libavcodec/v4l2_context.h ++++ b/libavcodec/v4l2_context.h +@@ -91,11 +91,19 @@ typedef struct V4L2Context { + */ + int num_buffers; + ++ /** ++ * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF ++ */ ++ enum v4l2_memory buf_mem; ++ + /** + * Whether the stream has been started (VIDIOC_STREAMON has been sent). + */ + int streamon; + ++ /* 1st buffer after stream on */ ++ int first_buf; ++ + /** + * Either no more buffers available or an unrecoverable error was notified + * by the V4L2 kernel driver: once set the context has to be exited. +@@ -105,11 +113,10 @@ typedef struct V4L2Context { + int flag_last; + + /** +- * PTS rescale not wanted +- * If the PTS is just a dummy frame count then rescale is +- * actively harmful ++ * If NZ then when Qing frame/pkt use this rather than the ++ * "real" PTS + */ +- int no_pts_rescale; ++ uint64_t track_ts; + + AVBufferRef *frames_ref; + atomic_int q_count; +diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c +index 728932fadc..e29df41729 100644 +--- a/libavcodec/v4l2_m2m.c ++++ b/libavcodec/v4l2_m2m.c +@@ -35,6 +35,14 @@ + #include "v4l2_fmt.h" + #include "v4l2_m2m.h" + ++static void ++xlat_init(xlat_track_t * const x) ++{ ++ memset(x, 0, sizeof(*x)); ++ x->last_pts = AV_NOPTS_VALUE; ++} ++ ++ + static inline int v4l2_splane_video(struct v4l2_capability *cap) + { + if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) && +@@ -67,7 +75,9 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) + + s->capture.done = s->output.done = 0; + 
s->capture.name = "capture"; ++ s->capture.buf_mem = V4L2_MEMORY_MMAP; + s->output.name = "output"; ++ s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; + atomic_init(&s->refcount, 0); + sem_init(&s->refsync, 0, 0); + +@@ -332,35 +342,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv) + return v4l2_configure_contexts(s); + } + +-int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s) ++int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps) + { +- *s = av_mallocz(sizeof(V4L2m2mContext)); +- if (!*s) ++ V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext)); ++ ++ *pps = NULL; ++ if (!s) + return AVERROR(ENOMEM); + +- priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext), ++ priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s), + &v4l2_m2m_destroy_context, NULL, 0); + if (!priv->context_ref) { +- av_freep(s); ++ av_free(s); + return AVERROR(ENOMEM); + } + + /* assign the context */ +- priv->context = *s; +- (*s)->priv = priv; ++ priv->context = s; ++ s->priv = priv; + + /* populate it */ +- priv->context->capture.num_buffers = priv->num_capture_buffers; +- priv->context->output.num_buffers = priv->num_output_buffers; +- priv->context->self_ref = priv->context_ref; +- priv->context->fd = -1; ++ s->capture.num_buffers = priv->num_capture_buffers; ++ s->output.num_buffers = priv->num_output_buffers; ++ s->self_ref = priv->context_ref; ++ s->fd = -1; ++ xlat_init(&s->xlat); + + priv->context->frame = av_frame_alloc(); + if (!priv->context->frame) { + av_buffer_unref(&priv->context_ref); +- *s = NULL; /* freed when unreferencing context_ref */ + return AVERROR(ENOMEM); + } + ++ *pps = s; + return 0; + } +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 19d618698d..d6cdaf65e1 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -103,6 +103,9 @@ typedef struct V4L2m2mContext { + /* generate DRM frames */ + int output_drm; + ++ /* input frames are 
drmprime */ ++ int input_drm; ++ + /* Frame tracking */ + xlat_track_t xlat; + int pending_hw; +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 4765fe0d5e..e61464b499 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -169,96 +169,17 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) + return 0; + } + +-static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) +-{ +- return (int64_t)n; +-} +- +-static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) +-{ +- return (unsigned int)pts; +-} +- +-// FFmpeg requires us to propagate a number of vars from the coded pkt into +-// the decoded frame. The only thing that tracks like that in V4L2 stateful +-// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no +-// guarantees about PTS being unique or specified for every frame so replace +-// the supplied PTS with a simple incrementing number and keep a circular +-// buffer of all the things we want preserved (including the original PTS) +-// indexed by the tracking no. 
+ static void +-xlat_pts_in(AVCodecContext *const avctx, xlat_track_t *const x, AVPacket *const avpkt) +-{ +- int64_t track_pts; +- +- // Avoid 0 +- if (++x->track_no == 0) +- x->track_no = 1; +- +- track_pts = track_to_pts(avctx, x->track_no); +- +- av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); +- x->last_pkt_dts = avpkt->dts; +- x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ +- .discard = 0, +- .pending = 1, +- .pkt_size = avpkt->size, +- .pts = avpkt->pts, +- .dts = avpkt->dts, +- .reordered_opaque = avctx->reordered_opaque, +- .pkt_pos = avpkt->pos, +- .pkt_duration = avpkt->duration, +- .track_pts = track_pts +- }; +- avpkt->pts = track_pts; +-} +- +-// Returns -1 if we should discard the frame +-static int +-xlat_pts_out(AVCodecContext *const avctx, +- xlat_track_t * const x, ++set_best_effort_pts(AVCodecContext *const avctx, + pts_stats_t * const ps, + AVFrame *const frame) + { +- unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; +- V4L2m2mTrackEl *const t = x->track_els + n; +- if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) +- { +- av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); +- frame->pts = AV_NOPTS_VALUE; +- frame->pkt_dts = x->last_pkt_dts; +- frame->reordered_opaque = x->last_opaque; +- frame->pkt_pos = -1; +- frame->pkt_duration = 0; +- frame->pkt_size = -1; +- } +- else if (!t->discard) +- { +- frame->pts = t->pending ? 
t->pts : AV_NOPTS_VALUE; +- frame->pkt_dts = x->last_pkt_dts; +- frame->reordered_opaque = t->reordered_opaque; +- frame->pkt_pos = t->pkt_pos; +- frame->pkt_duration = t->pkt_duration; +- frame->pkt_size = t->pkt_size; +- +- x->last_opaque = x->track_els[n].reordered_opaque; +- if (frame->pts != AV_NOPTS_VALUE) +- x->last_pts = frame->pts; +- t->pending = 0; +- } +- else +- { +- av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); +- return -1; +- } +- + pts_stats_add(ps, frame->pts); + + frame->best_effort_timestamp = pts_stats_guess(ps); + frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? +- av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", +- frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); +- return 0; ++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", ++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts); + } + + static void +@@ -272,13 +193,6 @@ xlat_flush(xlat_track_t * const x) + x->last_pts = AV_NOPTS_VALUE; + } + +-static void +-xlat_init(xlat_track_t * const x) +-{ +- memset(x, 0, sizeof(*x)); +- x->last_pts = AV_NOPTS_VALUE; +-} +- + static int + xlat_pending(const xlat_track_t * const x) + { +@@ -419,8 +333,6 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const + av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); + return ret; + } +- +- xlat_pts_in(avctx, &s->xlat, &s->buf_pkt); + } + + if (s->draining) { +@@ -542,49 +454,47 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + prefer_dq ? 5 : + src_rv == NQ_Q_FULL ? 
-1 : 0; + +- do { +- // Dequeue frame will unref any previous contents of frame +- // if it returns success so we don't need an explicit unref +- // when discarding +- // This returns AVERROR(EAGAIN) on timeout or if +- // there is room in the input Q and timeout == -1 +- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); +- +- // Failure due to no buffer in Q? +- if (dst_rv == AVERROR(ENOSPC)) { +- // Wait & retry +- if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { +- dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); +- } ++ // Dequeue frame will unref any previous contents of frame ++ // if it returns success so we don't need an explicit unref ++ // when discarding ++ // This returns AVERROR(EAGAIN) on timeout or if ++ // there is room in the input Q and timeout == -1 ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); ++ ++ // Failure due to no buffer in Q? ++ if (dst_rv == AVERROR(ENOSPC)) { ++ // Wait & retry ++ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + } ++ } ++ ++ // Adjust dynamic pending threshold ++ if (dst_rv == 0) { ++ if (--s->pending_hw < PENDING_HW_MIN) ++ s->pending_hw = PENDING_HW_MIN; ++ s->pending_n = 0; + +- // Adjust dynamic pending threshold +- if (dst_rv == 0) { +- if (--s->pending_hw < PENDING_HW_MIN) +- s->pending_hw = PENDING_HW_MIN; ++ set_best_effort_pts(avctx, &s->pts_stat, frame); ++ } ++ else if (dst_rv == AVERROR(EAGAIN)) { ++ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { ++ s->pending_hw = pending * 16 + PENDING_HW_OFFSET; + s->pending_n = 0; + } +- else if (dst_rv == AVERROR(EAGAIN)) { +- if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { +- s->pending_hw = pending * 16 + PENDING_HW_OFFSET; +- s->pending_n = 0; +- } +- } ++ } + +- if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { +- av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); +- dst_rv = AVERROR_EOF; +- 
s->capture.done = 1; +- } +- else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) +- av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", +- s->draining, s->capture.done); +- else if (dst_rv && dst_rv != AVERROR(EAGAIN)) +- av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", +- s->draining, s->capture.done, dst_rv); +- +- // Go again if we got a frame that we need to discard +- } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame)); ++ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { ++ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); ++ dst_rv = AVERROR_EOF; ++ s->capture.done = 1; ++ } ++ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) ++ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", ++ s->draining, s->capture.done); ++ else if (dst_rv && dst_rv != AVERROR(EAGAIN)) ++ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", ++ s->draining, s->capture.done, dst_rv); + } + + ++i; +@@ -791,7 +701,6 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + if (ret < 0) + return ret; + +- xlat_init(&s->xlat); + pts_stats_init(&s->pts_stat, avctx, "decoder"); + s->pending_hw = PENDING_HW_MIN; + +@@ -810,12 +719,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + output->av_codec_id = avctx->codec_id; + output->av_pix_fmt = AV_PIX_FMT_NONE; + output->min_buf_size = max_coded_size(avctx); +- output->no_pts_rescale = 1; + + capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; + capture->av_pix_fmt = avctx->pix_fmt; + capture->min_buf_size = 0; +- capture->no_pts_rescale = 1; + + /* the client requests the codec to generate DRM frames: + * - data[0] will therefore point to the returned AVDRMFrameDescriptor +diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c +index 20f81df750..db6014d8e3 100644 +--- a/libavcodec/v4l2_m2m_enc.c ++++ 
b/libavcodec/v4l2_m2m_enc.c +@@ -24,6 +24,8 @@ + #include + #include + #include ++#include ++ + #include "encode.h" + #include "libavcodec/avcodec.h" + #include "libavutil/pixdesc.h" +@@ -38,6 +40,34 @@ + #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x + #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x + ++// P030 should be defined in drm_fourcc.h and hopefully will be sometime ++// in the future but until then... ++#ifndef DRM_FORMAT_P030 ++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') ++#endif ++ ++#ifndef DRM_FORMAT_NV15 ++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') ++#endif ++ ++#ifndef DRM_FORMAT_NV20 ++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') ++#endif ++ ++#ifndef V4L2_CID_CODEC_BASE ++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE ++#endif ++ ++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined ++// in videodev2.h hopefully will be sometime in the future but until then... ++#ifndef V4L2_PIX_FMT_NV12_10_COL128 ++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') ++#endif ++ ++#ifndef V4L2_PIX_FMT_NV12_COL128 ++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ ++#endif ++ + static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den) + { + struct v4l2_streamparm parm = { 0 }; +@@ -148,15 +178,14 @@ static inline int v4l2_mpeg4_profile_from_ff(int p) + static int v4l2_check_b_frame_support(V4L2m2mContext *s) + { + if (s->avctx->max_b_frames) +- av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n"); ++ av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames); + +- v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0); ++ v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1); + v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0); + if 
(s->avctx->max_b_frames == 0) + return 0; + + avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding"); +- + return AVERROR_PATCHWELCOME; + } + +@@ -271,13 +300,184 @@ static int v4l2_prepare_encoder(V4L2m2mContext *s) + return 0; + } + ++static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame) ++{ ++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; ++ ++ const uint32_t drm_fmt = src->layers[0].format; ++ // Treat INVALID as LINEAR ++ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ? ++ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier; ++ uint32_t pix_fmt = 0; ++ uint32_t w = 0; ++ uint32_t h = 0; ++ uint32_t bpl = src->layers[0].planes[0].pitch; ++ ++ // We really don't expect multiple layers ++ // All formats that we currently cope with are single object ++ ++ if (src->nb_layers != 1 || src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ switch (drm_fmt) { ++ case DRM_FORMAT_YUV420: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 3) ++ break; ++ pix_fmt = V4L2_PIX_FMT_YUV420; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++ break; ++ ++ case DRM_FORMAT_NV12: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_COL128; ++ w = bpl; ++ h = src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++ break; ++ ++ case DRM_FORMAT_P030: ++ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128; ++ w = bpl / 2; // Matching lie to how we construct this ++ h = 
src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++ break; ++ ++ default: ++ break; ++ } ++ ++ if (!pix_fmt) ++ return AVERROR(EINVAL); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { ++ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp; ++ ++ pix->width = w; ++ pix->height = h; ++ pix->pixelformat = pix_fmt; ++ pix->plane_fmt[0].bytesperline = bpl; ++ pix->num_planes = 1; ++ } ++ else { ++ struct v4l2_pix_format *const pix = &format->fmt.pix; ++ ++ pix->width = w; ++ pix->height = h; ++ pix->pixelformat = pix_fmt; ++ pix->bytesperline = bpl; ++ } ++ ++ return 0; ++} ++ ++// Do we have similar enough formats to be usable? ++static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b) ++{ ++ if (a->type != b->type) ++ return 0; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) { ++ const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp; ++ const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp; ++ unsigned int i; ++ if (pa->pixelformat != pb->pixelformat || ++ pa->num_planes != pb->num_planes) ++ return 0; ++ for (i = 0; i != pa->num_planes; ++i) { ++ if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline) ++ return 0; ++ } ++ } ++ else { ++ const struct v4l2_pix_format *const pa = &a->fmt.pix; ++ const struct v4l2_pix_format *const pb = &b->fmt.pix; ++ if (pa->pixelformat != pb->pixelformat || ++ pa->bytesperline != pb->bytesperline) ++ return 0; ++ } ++ return 1; ++} ++ ++ + static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) + { + V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; + V4L2Context *const output = &s->output; + ++ // Signal EOF if needed ++ if (!frame) { ++ return ff_v4l2_context_enqueue_frame(output, frame); ++ } ++ ++ if (s->input_drm && !output->streamon) { ++ int rv; ++ struct v4l2_format req_format = {.type = output->format.type}; ++ ++ // Set format when we first get a buffer ++ if ((rv = 
avdrm_to_v4l2(&req_format, frame)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n"); ++ return rv; ++ } ++ ++ ff_v4l2_context_release(output); ++ ++ output->format = req_format; ++ ++ if ((rv = ff_v4l2_context_set_format(output)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n"); ++ return rv; ++ } ++ ++ if (!fmt_eq(&req_format, &output->format)) { ++ av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ output->selection.top = frame->crop_top; ++ output->selection.left = frame->crop_left; ++ output->selection.width = av_frame_cropped_width(frame); ++ output->selection.height = av_frame_cropped_height(frame); ++ ++ if ((rv = ff_v4l2_context_init(output)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n"); ++ return rv; ++ } ++ ++ { ++ struct v4l2_selection selection = { ++ .type = V4L2_BUF_TYPE_VIDEO_OUTPUT, ++ .target = V4L2_SEL_TGT_CROP, ++ .r = output->selection ++ }; ++ if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n", ++ selection.r.width, selection.r.height, selection.r.left, selection.r.top, ++ av_err2str(AVERROR(errno))); ++ } ++ av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n", ++ selection.r.width, selection.r.height, selection.r.left, selection.r.top); ++ } ++ } ++ + #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME +- if (frame && frame->pict_type == AV_PICTURE_TYPE_I) ++ if (frame->pict_type == AV_PICTURE_TYPE_I) + v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1); + #endif + +@@ -328,7 +528,70 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) + } + + dequeue: +- return ff_v4l2_context_dequeue_packet(capture, avpkt); ++ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) ++ return ret; ++ ++ if (capture->first_buf == 1) { ++ uint8_t * data; ++ const int len = 
avpkt->size; ++ ++ // 1st buffer after streamon should be SPS/PPS ++ capture->first_buf = 2; ++ ++ // Clear both possible stores so there is no chance of confusion ++ av_freep(&s->extdata_data); ++ s->extdata_size = 0; ++ av_freep(&avctx->extradata); ++ avctx->extradata_size = 0; ++ ++ if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL) ++ memcpy(data, avpkt->data, len); ++ ++ av_packet_unref(avpkt); ++ ++ if (data == NULL) ++ return AVERROR(ENOMEM); ++ ++ // We need to copy the header, but keep local if not global ++ if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) { ++ avctx->extradata = data; ++ avctx->extradata_size = len; ++ } ++ else { ++ s->extdata_data = data; ++ s->extdata_size = len; ++ } ++ ++ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) ++ return ret; ++ } ++ ++ // First frame must be key so mark as such even if encoder forgot ++ if (capture->first_buf == 2) ++ avpkt->flags |= AV_PKT_FLAG_KEY; ++ ++ // Add SPS/PPS to the start of every key frame if non-global headers ++ if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) { ++ const size_t newlen = s->extdata_size + avpkt->size; ++ AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE); ++ ++ if (buf == NULL) { ++ av_packet_unref(avpkt); ++ return AVERROR(ENOMEM); ++ } ++ ++ memcpy(buf->data, s->extdata_data, s->extdata_size); ++ memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size); ++ ++ av_buffer_unref(&avpkt->buf); ++ avpkt->buf = buf; ++ avpkt->data = buf->data; ++ avpkt->size = newlen; ++ } ++ ++// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret); ++ capture->first_buf = 0; ++ return 0; + } + + static av_cold int v4l2_encode_init(AVCodecContext *avctx) +@@ -340,6 +603,8 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) + uint32_t v4l2_fmt_output; + int ret; + ++ av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, 
avctx->pix_fmt, avctx->sw_pix_fmt); ++ + ret = ff_v4l2_m2m_create_context(priv, &s); + if (ret < 0) + return ret; +@@ -347,13 +612,17 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) + capture = &s->capture; + output = &s->output; + ++ s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME); ++ + /* common settings output/capture */ + output->height = capture->height = avctx->height; + output->width = capture->width = avctx->width; + + /* output context */ + output->av_codec_id = AV_CODEC_ID_RAWVIDEO; +- output->av_pix_fmt = avctx->pix_fmt; ++ output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt : ++ avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? avctx->sw_pix_fmt : ++ AV_PIX_FMT_YUV420P; + + /* capture context */ + capture->av_codec_id = avctx->codec_id; +@@ -372,7 +641,7 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx) + v4l2_fmt_output = output->format.fmt.pix.pixelformat; + + pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO); +- if (pix_fmt_output != avctx->pix_fmt) { ++ if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) { + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output); + av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name); + return AVERROR(EINVAL); + +From 1fa3b2c14f3d237de415b1dd924ffc9b0c6f2ced Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 8 Jun 2022 16:13:31 +0000 +Subject: [PATCH 055/113] v4l2_m2m_dec: Use DTS for best effort PTS if PTS is + always NO_PTS + +If we do have DTS but don't have PTS then assume PTS=DTS. +Also get rid of last_dts from tracking as its info wasn't actually +useful in any way. 
+--- + libavcodec/v4l2_context.c | 6 ++---- + libavcodec/v4l2_m2m.h | 1 - + libavcodec/v4l2_m2m_dec.c | 8 +++++++- + 3 files changed, 9 insertions(+), 6 deletions(-) + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 3ed5234be4..0225f6ba64 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -73,7 +73,6 @@ xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPack + track_pts = track_to_pts(avctx, x->track_no); + + av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); +- x->last_pkt_dts = avpkt->dts; + x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ + .discard = 0, + .pending = 1, +@@ -100,7 +99,6 @@ xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFr + track_pts = track_to_pts(avctx, x->track_no); + + av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no); +- x->last_pkt_dts = frame->pkt_dts; + x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ + .discard = 0, + .pending = 1, +@@ -129,7 +127,7 @@ xlat_pts_frame_out(AVCodecContext *const avctx, + av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING, + "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); + frame->pts = AV_NOPTS_VALUE; +- frame->pkt_dts = x->last_pkt_dts; ++ frame->pkt_dts = AV_NOPTS_VALUE; + frame->reordered_opaque = x->last_opaque; + frame->pkt_pos = -1; + frame->pkt_duration = 0; +@@ -138,7 +136,7 @@ xlat_pts_frame_out(AVCodecContext *const avctx, + else if (!t->discard) + { + frame->pts = t->pending ? 
t->pts : AV_NOPTS_VALUE; +- frame->pkt_dts = x->last_pkt_dts; ++ frame->pkt_dts = t->dts; + frame->reordered_opaque = t->reordered_opaque; + frame->pkt_pos = t->pkt_pos; + frame->pkt_duration = t->pkt_duration; +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index d6cdaf65e1..ee72beb052 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -67,7 +67,6 @@ typedef struct pts_stats_s + typedef struct xlat_track_s { + unsigned int track_no; + int64_t last_pts; +- int64_t last_pkt_dts; + int64_t last_opaque; + V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; + } xlat_track_t; +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index e61464b499..bb809be41e 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -177,7 +177,13 @@ set_best_effort_pts(AVCodecContext *const avctx, + pts_stats_add(ps, frame->pts); + + frame->best_effort_timestamp = pts_stats_guess(ps); +- frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? ++ // If we can't guess from just PTS - try DTS ++ if (frame->best_effort_timestamp == AV_NOPTS_VALUE) ++ frame->best_effort_timestamp = frame->pkt_dts; ++ ++ // We can't emulate what s/w does in a useful manner and using the ++ // "correct" answer seems to just confuse things. ++ frame->pkt_dts = frame->pts; + av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", + frame->pts, frame->best_effort_timestamp, frame->pkt_dts); + } + +From 07fbbc149d1aa6fc6e151a7f65505be873c27982 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 30 Jun 2022 15:59:23 +0000 +Subject: [PATCH 056/113] v4l2: Update H265 request for current API + +This works with v9 of the H265 patch set which hopefully will be the +last one. Hevc controls extracted from patched v4l2-controls into +hevc-ctrls-v4 - if HEVC controls found in the system v4l2-controls then +those will be used instead. 
+--- + libavcodec/Makefile | 2 +- + libavcodec/hevc-ctrls-v4.h | 515 +++++++++++++++++++++++++++++++++ + libavcodec/v4l2_req_hevc_v4.c | 3 + + libavcodec/v4l2_req_hevc_vx.c | 81 ++++-- + libavcodec/v4l2_request_hevc.c | 6 +- + libavcodec/v4l2_request_hevc.h | 1 + + 6 files changed, 583 insertions(+), 25 deletions(-) + create mode 100644 libavcodec/hevc-ctrls-v4.h + create mode 100644 libavcodec/v4l2_req_hevc_v4.c + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 09962a810b..a8951ddcbe 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -976,7 +976,7 @@ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o + OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o + OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ +- v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o ++ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o v4l2_req_hevc_v4.o + OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o + OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o + OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o +diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h +new file mode 100644 +index 0000000000..7e05f6e7c3 +--- /dev/null ++++ b/libavcodec/hevc-ctrls-v4.h +@@ -0,0 +1,515 @@ ++/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */ ++/* ++ * Video for Linux Two controls header file ++ * ++ * Copyright (C) 1999-2012 the contributors ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * Alternatively you can redistribute this file under the terms of the ++ * BSD license as stated below: ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * 3. The names of its contributors may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED ++ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * The contents of this header was split off from videodev2.h. 
All control ++ * definitions should be added to this header, which is included by ++ * videodev2.h. ++ */ ++ ++#ifndef AVCODEC_HEVC_CTRLS_V4_H ++#define AVCODEC_HEVC_CTRLS_V4_H ++ ++#include ++#include ++ ++#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) ++#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401) ++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402) ++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403) ++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404) ++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405) ++#define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_STATELESS_BASE + 406) ++#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407) ++ ++enum v4l2_stateless_hevc_decode_mode { ++ V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_stateless_hevc_start_code { ++ V4L2_STATELESS_HEVC_START_CODE_NONE, ++ V4L2_STATELESS_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) ++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/** ++ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. 
H.265: Sequence parameter set ++ * ++ * @video_parameter_set_id: specifies the value of the ++ * vps_video_parameter_set_id of the active VPS ++ * @seq_parameter_set_id: provides an identifier for the SPS for ++ * reference by other syntax elements ++ * @pic_width_in_luma_samples: specifies the width of each decoded picture ++ * in units of luma samples ++ * @pic_height_in_luma_samples: specifies the height of each decoded picture ++ * in units of luma samples ++ * @bit_depth_luma_minus8: this value plus 8specifies the bit depth of the ++ * samples of the luma array ++ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the ++ * samples of the chroma arrays ++ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of ++ * the variable MaxPicOrderCntLsb ++ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum ++ * required size of the decoded picture ++ * buffer for the codec video sequence ++ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures ++ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the ++ * value of SpsMaxLatencyPictures array ++ * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum ++ * luma coding block size ++ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between ++ * the maximum and minimum luma ++ * coding block size ++ * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma ++ * transform block size ++ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between ++ * the maximum and minimum luma ++ * transform block size ++ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy ++ * depth for transform units of ++ * coding units coded in inter ++ * prediction mode ++ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy ++ * depth for transform units of ++ * coding units coded in intra ++ * prediction mode ++ * 
@pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of ++ * bits used to represent each of PCM sample ++ * values of the luma component ++ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number ++ * of bits used to represent each of PCM ++ * sample values of the chroma components ++ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the ++ * minimum size of coding blocks ++ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between ++ * the maximum and minimum size of ++ * coding blocks ++ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set() ++ * syntax structures included in the SPS ++ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term ++ * reference pictures that are specified in the SPS ++ * @chroma_format_idc: specifies the chroma sampling ++ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number ++ * of temporal sub-layers ++ * @reserved: padding field. Should be zeroed by applications. 
++ * @flags: see V4L2_HEVC_SPS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_sps { ++ __u8 video_parameter_set_id; ++ __u8 seq_parameter_set_id; ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ __u8 sps_max_sub_layers_minus1; ++ ++ __u8 reserved[6]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL 
<< 13) ++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) ++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) ++ ++/** ++ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set ++ * ++ * @pic_parameter_set_id: identifies the PPS for reference by other ++ * syntax elements ++ * @num_extra_slice_header_bits: specifies the number of extra slice header ++ * bits that are present in the slice header RBSP ++ * for coded pictures referring to the PPS. ++ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the ++ * inferred value of num_ref_idx_l0_active_minus1 ++ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the ++ * inferred value of num_ref_idx_l1_active_minus1 ++ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for ++ * each slice referring to the PPS ++ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding ++ * tree block size and the minimum luma coding block ++ * size of coding units that convey cu_qp_delta_abs ++ * and cu_qp_delta_sign_flag ++ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb ++ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr ++ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns ++ * partitioning the picture ++ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning ++ * the picture ++ * @column_width_minus1: this value plus 1 specifies the width of the each tile column in ++ * units of coding tree blocks ++ * 
@row_height_minus1: this value plus 1 specifies the height of the each tile row in ++ * units of coding tree blocks ++ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for ++ * beta divided by 2 ++ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC ++ * divided by 2 ++ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of ++ * the variable Log2ParMrgLevel ++ * @reserved: padding field. Should be zeroed by applications. ++ * @flags: see V4L2_HEVC_PPS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_pps { ++ __u8 pic_parameter_set_id; ++ __u8 num_extra_slice_header_bits; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ __u8 reserved; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 ++ ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6 ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7 ++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10 ++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11 ++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++/** ++ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry ++ * ++ 
* @timestamp: timestamp of the V4L2 capture buffer to use as reference. ++ * @flags: long term flag for the reference frame ++ * @field_pic: whether the reference is a field picture or a frame. ++ * @reserved: padding field. Should be zeroed by applications. ++ * @pic_order_cnt_val: the picture order count of the current picture. ++ */ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 flags; ++ __u8 field_pic; ++ __u16 reserved; ++ __s32 pic_order_cnt_val; ++}; ++ ++/** ++ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters ++ * ++ * @delta_luma_weight_l0: the difference of the weighting factor applied ++ * to the luma prediction value for list 0 ++ * @luma_offset_l0: the additive offset applied to the luma prediction value ++ * for list 0 ++ * @delta_chroma_weight_l0: the difference of the weighting factor applied ++ * to the chroma prediction values for list 0 ++ * @chroma_offset_l0: the difference of the additive offset applied to ++ * the chroma prediction values for list 0 ++ * @delta_luma_weight_l1: the difference of the weighting factor applied ++ * to the luma prediction value for list 1 ++ * @luma_offset_l1: the additive offset applied to the luma prediction value ++ * for list 1 ++ * @delta_chroma_weight_l1: the difference of the weighting factor applied ++ * to the chroma prediction values for list 1 ++ * @chroma_offset_l1: the difference of the additive offset applied to ++ * the chroma prediction values for list 1 ++ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for ++ * all luma weighting factors ++ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm ++ * of the denominator for all chroma ++ * weighting factors ++ */ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 
chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++/** ++ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters ++ * ++ * This control is a dynamically sized 1-dimensional array, ++ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it. ++ * ++ * @bit_size: size (in bits) of the current slice data ++ * @data_byte_offset: offset (in bytes) to the video data in the current slice data ++ * @num_entry_point_offsets: specifies the number of entry point offset syntax ++ * elements in the slice header. 
++ * @nal_unit_type: specifies the coding type of the slice (B, P or I) ++ * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit ++ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{} ++ * @colour_plane_id: specifies the colour plane associated with the current slice ++ * @slice_pic_order_cnt: specifies the picture order count ++ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum ++ * reference index for reference picture list 0 ++ * that may be used to decode the slice ++ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum ++ * reference index for reference picture list 1 ++ * that may be used to decode the slice ++ * @collocated_ref_idx: specifies the reference index of the collocated picture used ++ * for temporal motion vector prediction ++ * @five_minus_max_num_merge_cand: specifies the maximum number of merging ++ * motion vector prediction candidates supported in ++ * the slice subtracted from 5 ++ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding ++ * blocks in the slice ++ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset ++ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset ++ * @slice_act_y_qp_offset: screen content extension parameters ++ * @slice_act_cb_qp_offset: screen content extension parameters ++ * @slice_act_cr_qp_offset: screen content extension parameters ++ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2 ++ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2 ++ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or ++ * more fields ++ * @reserved0: padding field. Should be zeroed by applications. 
++ * @slice_segment_addr: specifies the address of the first coding tree block in ++ * the slice segment ++ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB ++ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB ++ * @short_term_ref_pic_set_size: specifies the size of short-term reference ++ * pictures set included in the SPS ++ * @long_term_ref_pic_set_size: specifies the size of long-term reference ++ * pictures set include in the SPS ++ * @pred_weight_table: the prediction weight coefficients for inter-picture ++ * prediction ++ * @reserved1: padding field. Should be zeroed by applications. ++ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_byte_offset; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __s32 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ __u8 reserved0[3]; ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u16 short_term_ref_pic_set_size; ++ __u16 long_term_ref_pic_set_size; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u8 reserved1[2]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 ++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 ++ ++/** ++ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters ++ * ++ * @pic_order_cnt_val: picture order count ++ * @short_term_ref_pic_set_size: specifies the size of short-term reference ++ * pictures set included in the SPS of the first slice ++ * @long_term_ref_pic_set_size: specifies the size of long-term reference ++ * pictures set include in the SPS of the first slice ++ * @num_active_dpb_entries: the number of entries in dpb ++ * @num_poc_st_curr_before: the number of reference pictures in the short-term ++ * set that come before the current frame ++ * @num_poc_st_curr_after: the number of reference pictures in the short-term ++ * set that come after the current frame ++ * @num_poc_lt_curr: the number of reference pictures in the long-term set ++ * @poc_st_curr_before: provides the index of the short term before references ++ * in DPB array ++ * @poc_st_curr_after: provides the index of the short term after references ++ * in DPB array ++ * @poc_lt_curr: provides the index of the long term references in DPB array ++ * @reserved: padding field. Should be zeroed by applications. 
++ * @dpb: the decoded picture buffer, for meta-data about reference frames ++ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{} ++ */ ++struct v4l2_ctrl_hevc_decode_params { ++ __s32 pic_order_cnt_val; ++ __u16 short_term_ref_pic_set_size; ++ __u16 long_term_ref_pic_set_size; ++ __u8 num_active_dpb_entries; ++ __u8 num_poc_st_curr_before; ++ __u8 num_poc_st_curr_after; ++ __u8 num_poc_lt_curr; ++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 reserved[4]; ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u64 flags; ++}; ++ ++/** ++ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters ++ * ++ * @scaling_list_4x4: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_8x8: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_16x16: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_32x32: scaling list is used for the scaling process for ++ * transform coefficients. The values on each scaling ++ * list are expected in raster scan order ++ * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process ++ * for transform coefficients. The values on each ++ * scaling list are expected in raster scan order. ++ * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process ++ * for transform coefficients. The values on each ++ * scaling list are expected in raster scan order. 
++ */ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++#endif +diff --git a/libavcodec/v4l2_req_hevc_v4.c b/libavcodec/v4l2_req_hevc_v4.c +new file mode 100644 +index 0000000000..c35579d8e0 +--- /dev/null ++++ b/libavcodec/v4l2_req_hevc_v4.c +@@ -0,0 +1,3 @@ ++#define HEVC_CTRLS_VERSION 4 ++#include "v4l2_req_hevc_vx.c" ++ +diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c +index 611fa21cc3..761c5b2dc7 100644 +--- a/libavcodec/v4l2_req_hevc_vx.c ++++ b/libavcodec/v4l2_req_hevc_vx.c +@@ -6,8 +6,6 @@ + #include "internal.h" + #include "thread.h" + +-#include "v4l2_request_hevc.h" +- + #if HEVC_CTRLS_VERSION == 1 + #include "hevc-ctrls-v1.h" + +@@ -18,10 +16,37 @@ + #include "hevc-ctrls-v2.h" + #elif HEVC_CTRLS_VERSION == 3 + #include "hevc-ctrls-v3.h" ++#elif HEVC_CTRLS_VERSION == 4 ++#include ++#if !defined(V4L2_CID_STATELESS_HEVC_SPS) ++#include "hevc-ctrls-v4.h" ++#endif + #else + #error Unknown HEVC_CTRLS_VERSION + #endif + ++#ifndef V4L2_CID_STATELESS_HEVC_SPS ++#define V4L2_CID_STATELESS_HEVC_SPS V4L2_CID_MPEG_VIDEO_HEVC_SPS ++#define V4L2_CID_STATELESS_HEVC_PPS V4L2_CID_MPEG_VIDEO_HEVC_PPS ++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS ++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX ++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS ++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE ++#define V4L2_CID_STATELESS_HEVC_START_CODE V4L2_CID_MPEG_VIDEO_HEVC_START_CODE ++ ++#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED ++#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED ++#define 
V4L2_STATELESS_HEVC_START_CODE_NONE V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE ++#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B ++#endif ++ ++// Should be in videodev2 but we might not have a good enough one ++#ifndef V4L2_PIX_FMT_HEVC_SLICE ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++#endif ++ ++#include "v4l2_request_hevc.h" ++ + #include "libavutil/hwcontext_drm.h" + + #include +@@ -259,9 +284,13 @@ fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const + #endif + entry->field_pic = frame->frame->interlaced_frame; + ++#if HEVC_CTRLS_VERSION <= 3 + /* TODO: Interleaved: Get the POC for each field. */ + entry->pic_order_cnt[0] = frame->poc; + entry->pic_order_cnt[1] = frame->poc; ++#else ++ entry->pic_order_cnt_val = frame->poc; ++#endif + } + } + return n; +@@ -287,8 +316,11 @@ static void fill_slice_params(const HEVCContext * const h, + + *slice_params = (struct v4l2_ctrl_hevc_slice_params) { + .bit_size = bit_size, ++#if HEVC_CTRLS_VERSION <= 3 + .data_bit_offset = bit_offset, +- ++#else ++ .data_byte_offset = bit_offset / 8 + 1, ++#endif + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ + .slice_segment_addr = sh->slice_segment_addr, + +@@ -376,8 +408,10 @@ static void fill_slice_params(const HEVCContext * const h, + av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); + } + ++#if HEVC_CTRLS_VERSION <= 3 + for (i = 0; i < slice_params->num_entry_point_offsets; i++) + slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; ++#endif + } + + #if HEVC_CTRLS_VERSION >= 2 +@@ -761,30 +795,30 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, + + struct v4l2_ext_control control[] = { + { +- .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, ++ .id = V4L2_CID_STATELESS_HEVC_SPS, + .ptr = &controls->sps, + .size = sizeof(controls->sps), + }, + { +- .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, ++ .id = V4L2_CID_STATELESS_HEVC_PPS, + .ptr = &controls->pps, + .size = sizeof(controls->pps), + }, + #if HEVC_CTRLS_VERSION >= 2 + { +- .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS, ++ .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS, + .ptr = dec, + .size = sizeof(*dec), + }, + #endif + { +- .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, ++ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, + .ptr = slices + slice_no, + .size = sizeof(*slices) * slice_count, + }, + // Optional + { +- .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, ++ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX, + .ptr = &controls->scaling_matrix, + .size = sizeof(controls->scaling_matrix), + }, +@@ -1000,12 +1034,12 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + + // Check for var slice array + struct v4l2_query_ext_ctrl qc[] = { +- { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS }, +- { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS }, +- { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS }, +- { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX }, ++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS }, ++ { .id = V4L2_CID_STATELESS_HEVC_SPS }, ++ { .id = 
V4L2_CID_STATELESS_HEVC_PPS }, ++ { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX }, + #if HEVC_CTRLS_VERSION >= 2 +- { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS }, + #endif + }; + // Order & size must match! +@@ -1042,12 +1076,13 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + + fill_sps(&ctrl_sps, sps); + +- if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { ++ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { + av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n"); + return AVERROR(EINVAL); + } + + ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0; ++ av_log(avctx, AV_LOG_INFO, "%s SPS muti-slice\n", ctx->multi_slice ? "Has" : "No"); + return 0; + } + +@@ -1058,29 +1093,29 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + int ret; + + struct v4l2_query_ext_ctrl querys[] = { +- { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, +- { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, +- { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, }, + }; + + struct v4l2_ext_control ctrls[] = { +- { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, +- { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, + }; + + mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); + + ctx->decode_mode = querys[0].default_value; + +- if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && +- ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { ++ if (ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && ++ ctx->decode_mode != 
V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { + av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); + return AVERROR(EINVAL); + } + + ctx->start_code = querys[1].default_value; +- if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && +- ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { ++ if (ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_NONE && ++ ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { + av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); + return AVERROR(EINVAL); + } +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +index 20e4e0ab15..cd79aad563 100644 +--- a/libavcodec/v4l2_request_hevc.c ++++ b/libavcodec/v4l2_request_hevc.c +@@ -210,7 +210,11 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) + goto fail4; + } + +- if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { ++ if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); ++ ctx->fns = &V2(ff_v4l2_req_hevc, 4); ++ } ++ else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 3); + } +diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h +index ed48d62e2d..d4adb3f812 100644 +--- a/libavcodec/v4l2_request_hevc.h ++++ b/libavcodec/v4l2_request_hevc.h +@@ -99,5 +99,6 @@ typedef struct v4l2_req_decode_fns { + extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); + extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); + extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3); ++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4); + + #endif + +From e5cfb02fc431551da622c3ccaaf16d25afecf9e5 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Sun, 3 Jul 2022 13:40:41 +0000 +Subject: [PATCH 057/113] v4l2_req: Observe limit on size of 
slice_array + +This in fact provides some minor simplifications by combing the +multi-slice and single-slice paths. + +(cherry picked from commit 7631e6d1a66fca9048605c214f3464c90d37932c) +--- + libavcodec/v4l2_req_hevc_vx.c | 39 ++++++++++++++-------------------- + libavcodec/v4l2_request_hevc.h | 5 +---- + 2 files changed, 17 insertions(+), 27 deletions(-) + +diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c +index 761c5b2dc7..9d08d13d9e 100644 +--- a/libavcodec/v4l2_req_hevc_vx.c ++++ b/libavcodec/v4l2_req_hevc_vx.c +@@ -840,18 +840,21 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * + int bcount = get_bits_count(&h->HEVClc->gb); + uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; + ++ const unsigned int n = rd->num_slices; ++ const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices; ++ + int rv; + struct slice_info * si; + + if ((rv = slice_add(rd)) != 0) + return rv; + +- si = rd->slices + rd->num_slices - 1; ++ si = rd->slices + n; + si->ptr = buffer; + si->len = size; + +- if (ctx->multi_slice && rd->num_slices > 1) { +- struct slice_info *const si0 = rd->slices; ++ if (n != block_start) { ++ struct slice_info *const si0 = rd->slices + block_start; + const size_t offset = (buffer - si0->ptr); + boff += offset * 8; + size += offset; +@@ -859,11 +862,11 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * + } + + #if HEVC_CTRLS_VERSION >= 2 +- if (rd->num_slices == 1) ++ if (n == 0) + fill_decode_params(h, &rd->dec); +- fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff); ++ fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff); + #else +- fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff); ++ fill_slice_params(h, rd->slice_params + n, size * 8, boff); + #endif + + return 0; +@@ -997,18 +1000,11 @@ static int 
v4l2_request_hevc_end_frame(AVCodecContext *avctx) + } + + // Send as slices +- if (ctx->multi_slice) +- { +- if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0) ++ for (i = 0; i < rd->num_slices; i += ctx->max_slices) { ++ const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices); ++ if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0) + goto fail; + } +- else +- { +- for (i = 0; i != rd->num_slices; ++i) { +- if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0) +- goto fail; +- } +- } + + // Set the drm_prime desriptor + drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs)); +@@ -1081,8 +1077,6 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + return AVERROR(EINVAL); + } + +- ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0; +- av_log(avctx, AV_LOG_INFO, "%s SPS muti-slice\n", ctx->multi_slice ? "Has" : "No"); + return 0; + } + +@@ -1120,11 +1114,10 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + return AVERROR(EINVAL); + } + +- ctx->max_slices = querys[2].elems; +- if (ctx->max_slices > MAX_SLICES) { +- av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices); +- return AVERROR(EINVAL); +- } ++ ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) || ++ querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ? 
++ 1 : querys[2].dims[0]; ++ av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices); + + ctrls[0].value = ctx->decode_mode; + ctrls[1].value = ctx->start_code; +diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h +index d4adb3f812..0029e23309 100644 +--- a/libavcodec/v4l2_request_hevc.h ++++ b/libavcodec/v4l2_request_hevc.h +@@ -46,8 +46,6 @@ + #define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800 + #endif + +-#define MAX_SLICES 128 +- + #define VCAT(name, version) name##_v##version + #define V2(n,v) VCAT(n, v) + #define V(n) V2(n, HEVC_CTRLS_VERSION) +@@ -64,10 +62,9 @@ typedef struct V4L2RequestContextHEVC { + + unsigned int timestamp; // ?? maybe uint64_t + +- int multi_slice; + int decode_mode; + int start_code; +- int max_slices; ++ unsigned int max_slices; + + req_decode_q decode_q; + + +From 73ba75fbab29c26cd8b362b5f450d4e0beb4ed1f Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 4 Jul 2022 14:43:20 +0100 +Subject: [PATCH 058/113] v4l2_req: Add entry point offsets array control + +--- + libavcodec/v4l2_req_hevc_vx.c | 88 +++++++++++++++++++++++++++------- + libavcodec/v4l2_request_hevc.h | 3 +- + 2 files changed, 72 insertions(+), 19 deletions(-) + +diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c +index 9d08d13d9e..43ef6631ed 100644 +--- a/libavcodec/v4l2_req_hevc_vx.c ++++ b/libavcodec/v4l2_req_hevc_vx.c +@@ -82,11 +82,16 @@ typedef struct V4L2MediaReqDescriptor { + struct v4l2_ctrl_hevc_slice_params * slice_params; + struct slice_info * slices; + ++ size_t num_offsets; ++ size_t alloced_offsets; ++ uint32_t *offsets; ++ + } V4L2MediaReqDescriptor; + + struct slice_info { + const uint8_t * ptr; + size_t len; // bytes ++ size_t n_offsets; + }; + + // Handy container for accumulating controls before setting +@@ -245,7 +250,7 @@ static int slice_add(V4L2MediaReqDescriptor * const rd) + if (rd->num_slices >= rd->alloced_slices) { + struct v4l2_ctrl_hevc_slice_params * p2; + struct slice_info * 
s2; +- size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2; ++ size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2; + + p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2)); + if (p2 == NULL) +@@ -263,6 +268,23 @@ static int slice_add(V4L2MediaReqDescriptor * const rd) + return 0; + } + ++static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets) ++{ ++ if (rd->num_offsets + n > rd->alloced_offsets) { ++ size_t n2 = rd->alloced_slices == 0 ? 128 : rd->alloced_slices * 2; ++ void * p2; ++ while (rd->num_offsets + n > n2) ++ n2 *= 2; ++ if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL) ++ return AVERROR(ENOMEM); ++ rd->offsets = p2; ++ rd->alloced_offsets = n2; ++ } ++ for (size_t i = 0; i != n; ++i) ++ rd->offsets[rd->num_offsets++] = offsets[i] - 1; ++ return 0; ++} ++ + static unsigned int + fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries) + { +@@ -403,12 +425,12 @@ static void fill_slice_params(const HEVCContext * const h, + fill_pred_table(h, &slice_params->pred_weight_table); + + slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; ++#if HEVC_CTRLS_VERSION <= 3 + if (slice_params->num_entry_point_offsets > 256) { + slice_params->num_entry_point_offsets = 256; + av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); + } + +-#if HEVC_CTRLS_VERSION <= 3 + for (i = 0; i < slice_params->num_entry_point_offsets; i++) + slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; + #endif +@@ -787,13 +809,17 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, + #if HEVC_CTRLS_VERSION >= 2 + struct v4l2_ctrl_hevc_decode_params * const dec, + #endif +- struct v4l2_ctrl_hevc_slice_params * const slices, +- const unsigned int slice_no, +- const unsigned int slice_count) ++ struct 
v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count, ++ void * const offsets, const size_t offset_count) + { + int rv; ++#if HEVC_CTRLS_VERSION >= 2 ++ unsigned int n = 4; ++#else ++ unsigned int n = 3; ++#endif + +- struct v4l2_ext_control control[] = { ++ struct v4l2_ext_control control[6] = { + { + .id = V4L2_CID_STATELESS_HEVC_SPS, + .ptr = &controls->sps, +@@ -813,21 +839,28 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, + #endif + { + .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, +- .ptr = slices + slice_no, ++ .ptr = slices, + .size = sizeof(*slices) * slice_count, + }, +- // Optional +- { + }; + -+ ff_vc1dsp_init(&h); ++ if (controls->has_scaling) ++ control[n++] = (struct v4l2_ext_control) { + .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX, + .ptr = &controls->scaling_matrix, + .size = sizeof(controls->scaling_matrix), +- }, +- }; ++ }; + -+ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { -+ void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset); -+ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int); -+ if (check_func(func, "vc1dsp.%s", tests[t].name)) { -+ for (int count = 1000; count > 0; --count) { -+ int pq = rnd() % 31 + 1; -+ RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48); -+ call_ref(filter_buf0 + 4 * 48 + 16, 48, pq); -+ call_new(filter_buf1 + 4 * 48 + 16, 48, pq); -+ if (memcmp(filter_buf0, filter_buf1, 24 * 48)) -+ fail(); ++#if HEVC_CTRLS_VERSION >= 4 ++ if (offsets) ++ control[n++] = (struct v4l2_ext_control) { ++ .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, ++ .ptr = offsets, ++ .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count, ++ }; ++#endif + +- rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, +- controls->has_scaling ? 
+- FF_ARRAY_ELEMS(control) : +- FF_ARRAY_ELEMS(control) - 1); ++ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n); + + return rv; + } +@@ -852,6 +885,7 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * + si = rd->slices + n; + si->ptr = buffer; + si->len = size; ++ si->n_offsets = rd->num_offsets; + + if (n != block_start) { + struct slice_info *const si0 = rd->slices + block_start; +@@ -868,6 +902,9 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * + #else + fill_slice_params(h, rd->slice_params + n, size * 8, boff); + #endif ++ if (ctx->max_offsets != 0 && ++ (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0) ++ return rv; + + return 0; + } +@@ -893,10 +930,13 @@ static int send_slice(AVCodecContext * const avctx, + { + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + ++ const int is_last = (j == rd->num_slices); + struct slice_info *const si = rd->slices + i; + struct media_request * req = NULL; + struct qent_src * src = NULL; + MediaBufsStatus stat; ++ void * offsets = rd->offsets + rd->slices[i].n_offsets; ++ size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets; + + if ((req = media_request_get(ctx->mpool)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__); +@@ -908,8 +948,8 @@ static int send_slice(AVCodecContext * const avctx, + #if HEVC_CTRLS_VERSION >= 2 + &rd->dec, + #endif +- rd->slice_params, +- i, j - i)) { ++ rd->slice_params + i, j - i, ++ offsets, n_offsets)) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__); + goto fail1; + } +@@ -935,7 +975,7 @@ static int send_slice(AVCodecContext * const avctx, + + stat = mediabufs_start_request(ctx->mbufs, &req, &src, + i == 0 ? 
rd->qe_dst : NULL, +- j == rd->num_slices); ++ is_last); + + if (stat != MEDIABUFS_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__); +@@ -1090,6 +1130,9 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, + { .id = V4L2_CID_STATELESS_HEVC_START_CODE, }, + { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, }, ++#if HEVC_CTRLS_VERSION >= 4 ++ { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, }, ++#endif + }; + + struct v4l2_ext_control ctrls[] = { +@@ -1119,6 +1162,14 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + 1 : querys[2].dims[0]; + av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices); + ++#if HEVC_CTRLS_VERSION >= 4 ++ ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ? ++ 0 : querys[3].dims[0]; ++ av_log(avctx, AV_LOG_INFO, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); ++#else ++ ctx->max_offsets = 0; ++#endif ++ + ctrls[0].value = ctx->decode_mode; + ctrls[1].value = ctx->start_code; + +@@ -1141,6 +1192,7 @@ static void v4l2_req_frame_free(void *opaque, uint8_t *data) + + av_freep(&rd->slices); + av_freep(&rd->slice_params); ++ av_freep(&rd->offsets); + + av_free(rd); + } +diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h +index 0029e23309..99c90064ea 100644 +--- a/libavcodec/v4l2_request_hevc.h ++++ b/libavcodec/v4l2_request_hevc.h +@@ -64,7 +64,8 @@ typedef struct V4L2RequestContextHEVC { + + int decode_mode; + int start_code; +- unsigned int max_slices; ++ unsigned int max_slices; // 0 => not wanted (frame mode) ++ unsigned int max_offsets; // 0 => not wanted + + req_decode_q decode_q; + + +From 9cc5b21fa37450d7ab43163637cbf26718ffe92d Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 4 Jul 2022 16:22:54 +0100 +Subject: [PATCH 059/113] v4l2_req: Support Annex B + +--- + libavcodec/v4l2_req_hevc_vx.c | 61 
+++++++++++++++++++++++------------ + 1 file changed, 41 insertions(+), 20 deletions(-) + +diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c +index 43ef6631ed..5e0db9850a 100644 +--- a/libavcodec/v4l2_req_hevc_vx.c ++++ b/libavcodec/v4l2_req_hevc_vx.c +@@ -879,6 +879,18 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * + int rv; + struct slice_info * si; + ++ // This looks dodgy but we know that FFmpeg has parsed this from a buffer ++ // that contains the entire frame including the start code ++ if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { ++ buffer -= 3; ++ size += 3; ++ boff += 24; ++ if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) { ++ av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n", ++ buffer[0], buffer[1], buffer[2]); ++ } ++ } ++ + if ((rv = slice_add(rd)) != 0) + return rv; + +@@ -969,10 +981,6 @@ static int send_slice(AVCodecContext * const avctx, + goto fail2; + } + +-#warning ANNEX_B start code +-// if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { +-// } +- + stat = mediabufs_start_request(ctx->mbufs, &req, &src, + i == 0 ? 
rd->qe_dst : NULL, + is_last); +@@ -1120,6 +1128,12 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + return 0; + } + ++static inline int ++ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v) ++{ ++ return v >= c->minimum && v <= c->maximum; ++} ++ + // Final init + static int + set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) +@@ -1142,21 +1156,6 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + + mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); + +- ctx->decode_mode = querys[0].default_value; +- +- if (ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && +- ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { +- av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); +- return AVERROR(EINVAL); +- } +- +- ctx->start_code = querys[1].default_value; +- if (ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_NONE && +- ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) { +- av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); +- return AVERROR(EINVAL); +- } +- + ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) || + querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ? + 1 : querys[2].dims[0]; +@@ -1165,11 +1164,33 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + #if HEVC_CTRLS_VERSION >= 4 + ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ? 
+ 0 : querys[3].dims[0]; +- av_log(avctx, AV_LOG_INFO, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); ++ av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets); + #else + ctx->max_offsets = 0; + #endif + ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; ++ ++ if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ++ { ++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED; ++ ++ // Prefer NONE as it doesn't require the slightly dodgy look ++ // backwards in our raw buffer ++ if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; ++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; ++ else { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); ++ return AVERROR(EINVAL); ++ } ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__); ++ } ++ + ctrls[0].value = ctx->decode_mode; + ctrls[1].value = ctx->start_code; + + +From 6dc0b1d851ef47fb198d32eb3088cfaa6b792d08 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 4 Jul 2022 18:24:03 +0100 +Subject: [PATCH 060/113] v4l2_req: Add frame mode decode + +--- + libavcodec/v4l2_req_hevc_vx.c | 69 +++++++++++++++++++++++------------ + 1 file changed, 46 insertions(+), 23 deletions(-) + +diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c +index 5e0db9850a..ada53d0d44 100644 +--- a/libavcodec/v4l2_req_hevc_vx.c ++++ b/libavcodec/v4l2_req_hevc_vx.c +@@ -814,9 +814,9 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, + { + int rv; + #if HEVC_CTRLS_VERSION >= 2 +- unsigned int n = 4; +-#else + unsigned int n = 3; ++#else ++ unsigned int n = 2; + #endif + + struct v4l2_ext_control control[6] = { +@@ -837,12 +837,14 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const 
mreq, + .size = sizeof(*dec), + }, + #endif +- { ++ }; ++ ++ if (slices) ++ control[n++] = (struct v4l2_ext_control) { + .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, + .ptr = slices, + .size = sizeof(*slices) * slice_count, +- }, +- }; ++ }; + + if (controls->has_scaling) + control[n++] = (struct v4l2_ext_control) { +@@ -865,6 +867,8 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, + return rv; + } + ++// This only works because we started out from a single coded frame buffer ++// that will remain intact until after end_frame + static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) + { + const HEVCContext * const h = avctx->priv_data; +@@ -891,6 +895,17 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t * + } + } + ++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) { ++ if (rd->slices == NULL) { ++ if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL) ++ return AVERROR(ENOMEM); ++ rd->slices->ptr = buffer; ++ rd->num_slices = 1; ++ } ++ rd->slices->len = buffer - rd->slices->ptr + size; ++ return 0; ++ } ++ + if ((rv = slice_add(rd)) != 0) + return rv; + +@@ -1169,28 +1184,36 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + ctx->max_offsets = 0; + #endif + +- ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; +- +- if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) +- { ++ if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED || ++ querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) ++ ctx->decode_mode = querys[0].default_value; ++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)) ++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED; ++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) + ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED; +- +- // 
Prefer NONE as it doesn't require the slightly dodgy look +- // backwards in our raw buffer +- if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) +- ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; +- else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) +- ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; +- else { +- av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); +- return AVERROR(EINVAL); +- } +- } +- else +- { ++ else { + av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__); ++ return AVERROR(EINVAL); + } + ++ if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE || ++ querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) ++ ctx->start_code = querys[1].default_value; ++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B; ++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; ++ else { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__); ++ return AVERROR(EINVAL); ++ } ++ ++ // If we are in slice mode & START_CODE_NONE supported then pick that ++ // as it doesn't require the slightly dodgy look backwards in our raw buffer ++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED && ++ ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE)) ++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE; ++ + ctrls[0].value = ctx->decode_mode; + ctrls[1].value = ctx->start_code; + + +From eb9fe927c6582fba732e8d655072b2ce03099679 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 5 Jul 2022 12:54:22 +0000 +Subject: [PATCH 061/113] v4l2_req: Fix probe for frame based decode + +--- + libavcodec/v4l2_req_hevc_vx.c | 33 +++++++++++++++++++++++---------- + 1 file changed, 23 insertions(+), 10 deletions(-) + +diff --git a/libavcodec/v4l2_req_hevc_vx.c 
b/libavcodec/v4l2_req_hevc_vx.c +index ada53d0d44..5d083016f8 100644 +--- a/libavcodec/v4l2_req_hevc_vx.c ++++ b/libavcodec/v4l2_req_hevc_vx.c +@@ -1082,6 +1082,12 @@ fail: + return rv; + } + ++static inline int ++ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v) ++{ ++ return v >= c->minimum && v <= c->maximum; ++} ++ + // Initial check & init + static int + probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) +@@ -1094,6 +1100,7 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + // Check for var slice array + struct v4l2_query_ext_ctrl qc[] = { + { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS }, ++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, }, + { .id = V4L2_CID_STATELESS_HEVC_SPS }, + { .id = V4L2_CID_STATELESS_HEVC_PPS }, + { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX }, +@@ -1104,6 +1111,7 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + // Order & size must match! + static const size_t ctrl_sizes[] = { + sizeof(struct v4l2_ctrl_hevc_slice_params), ++ sizeof(int32_t), + sizeof(struct v4l2_ctrl_hevc_sps), + sizeof(struct v4l2_ctrl_hevc_pps), + sizeof(struct v4l2_ctrl_hevc_scaling_matrix), +@@ -1121,11 +1129,22 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + return AVERROR(EINVAL); + #endif + +- if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) { +- av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION); ++ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls); ++ i = 0; ++#if HEVC_CTRLS_VERSION >= 4 ++ // Skip slice check if no slice mode ++ if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) ++ i = 1; ++#else ++ // Fail frame mode silently for anything prior to V4 ++ if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED)) + return AVERROR(EINVAL); +- } +- for (i = 0; i != noof_ctrls; ++i) { ++#endif ++ for (; i != noof_ctrls; 
++i) { ++ if (qc[i].type == 0) { ++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id); ++ return AVERROR(EINVAL); ++ } + if (ctrl_sizes[i] != (size_t)qc[i].elem_size) { + av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n", + HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size); +@@ -1143,12 +1162,6 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + return 0; + } + +-static inline int +-ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v) +-{ +- return v >= c->minimum && v <= c->maximum; +-} +- + // Final init + static int + set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) + +From d01df5c9199285c04b8b31aec41ea0ae2ecaaecf Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 26 Jul 2022 15:46:14 +0000 +Subject: [PATCH 062/113] vf_deinterlace_v4l2m2m: Support NV12 through + deinterlace + +Supports NV12 (though not yet NV12M) through deinterlace. +Also improves error handling such that attempting to deinterlace an +unsupported drm format causes an error. +No longer leaks frame structures. 
+--- + libavfilter/vf_deinterlace_v4l2m2m.c | 160 ++++++++++++++++++--------- + 1 file changed, 107 insertions(+), 53 deletions(-) + +diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c +index 1a933b7e0a..1a3bef5bcb 100644 +--- a/libavfilter/vf_deinterlace_v4l2m2m.c ++++ b/libavfilter/vf_deinterlace_v4l2m2m.c +@@ -373,14 +373,16 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue) + fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { +- if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 || ++ if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 && ++ fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) || + fmt->fmt.pix_mp.field != field) { + av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); + + return AVERROR(EINVAL); + } + } else { +- if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 || ++ if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 && ++ fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12) || + fmt->fmt.pix.field != field) { + av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); + +@@ -391,7 +393,7 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue) + return 0; + } + +-static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, int pitch, int ysize) ++static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize) + { + struct v4l2_format *fmt = &queue->format; + DeintV4L2M2MContextShared *ctx = queue->ctx; +@@ -402,13 +404,16 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, + .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS, + }; + ++ // This works for most single object 4:2:0 types + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.pixelformat = pixelformat; + fmt->fmt.pix_mp.field = field; + fmt->fmt.pix_mp.width = width; + fmt->fmt.pix_mp.height = ysize / pitch; + fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch; + fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1); + } else { ++ fmt->fmt.pix.pixelformat = pixelformat; + fmt->fmt.pix.field = field; + fmt->fmt.pix.width = width; + fmt->fmt.pix.height = height; +@@ -417,12 +422,22 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, + } + + ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); +- if (ret) ++ if (ret) { ++ ret = AVERROR(errno); + av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); ++ return ret; ++ } ++ ++ if (pixelformat != fmt->fmt.pix.pixelformat) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat)); ++ return AVERROR(EINVAL); ++ } + + ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel); +- if (ret) +- av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_SELECTION failed: %d\n", ret); ++ if (ret) { ++ ret = AVERROR(errno); ++ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret); ++ } + + sel.r.width = width; + sel.r.height = height; +@@ -432,10 +447,12 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, + sel.flags = V4L2_SEL_FLAG_LE; + + ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel); +- if (ret) +- av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_SELECTION failed: %d\n", ret); ++ if (ret) { ++ ret = AVERROR(errno); ++ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret); ++ } + +- return ret; ++ return 0; + } + + static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) +@@ -517,10 +534,25 @@ static int 
deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) + return 0; + } + +-static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) ++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat) + { + struct v4l2_exportbuffer expbuf; + int i, ret; ++ uint64_t mod = DRM_FORMAT_MOD_LINEAR; ++ uint32_t fmt = 0; ++ ++ switch (pixelformat) { ++ case V4L2_PIX_FMT_NV12: ++ fmt = DRM_FORMAT_NV12; ++ break; ++ case V4L2_PIX_FMT_YUV420: ++ fmt = DRM_FORMAT_YUV420; ++ break; ++ default: ++ return AVERROR(EINVAL); ++ } ++ ++ avbuf->drm_frame.layers[0].format = fmt; + + for (i = 0; i < avbuf->num_planes; i++) { + memset(&expbuf, 0, sizeof(expbuf)); +@@ -539,12 +571,12 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) + /* drm frame */ + avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; + avbuf->drm_frame.objects[i].fd = expbuf.fd; +- avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ avbuf->drm_frame.objects[i].format_modifier = mod; + } else { + /* drm frame */ + avbuf->drm_frame.objects[0].size = avbuf->buffer.length; + avbuf->drm_frame.objects[0].fd = expbuf.fd; +- avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ avbuf->drm_frame.objects[0].format_modifier = mod; + } + } + +@@ -629,7 +661,7 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) + if (ret) + goto fail; + +- ret = v4l2_buffer_export_drm(buf); ++ ret = v4l2_buffer_export_drm(buf, multiplanar ? 
fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat); + if (ret) + goto fail; + } +@@ -878,7 +910,6 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused) + + static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) + { +- int av_pix_fmt = AV_PIX_FMT_YUV420P; + AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; + AVDRMLayerDescriptor *layer; + +@@ -895,20 +926,13 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) + layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; + } + +- switch (av_pix_fmt) { +- case AV_PIX_FMT_YUYV422: +- +- layer->format = DRM_FORMAT_YUYV; ++ switch (layer->format) { ++ case DRM_FORMAT_YUYV: + layer->nb_planes = 1; +- + break; + +- case AV_PIX_FMT_NV12: +- case AV_PIX_FMT_NV21: +- +- layer->format = av_pix_fmt == AV_PIX_FMT_NV12 ? +- DRM_FORMAT_NV12 : DRM_FORMAT_NV21; +- ++ case DRM_FORMAT_NV12: ++ case DRM_FORMAT_NV21: + if (avbuf->num_planes > 1) + break; + +@@ -920,10 +944,7 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) + layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; + break; + +- case AV_PIX_FMT_YUV420P: +- +- layer->format = DRM_FORMAT_YUV420; +- ++ case DRM_FORMAT_YUV420: + if (avbuf->num_planes > 1) + break; + +@@ -1032,6 +1053,26 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink) + return 0; + } + ++static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc) ++{ ++ const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR || ++ drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID); ++ ++ switch (drm_desc->layers[0].format) { ++ case DRM_FORMAT_YUV420: ++ if (is_linear) ++ return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0; ++ break; ++ case DRM_FORMAT_NV12: ++ if (is_linear) ++ return drm_desc->nb_objects == 1 ? 
V4L2_PIX_FMT_NV12 : 0; ++ break; ++ default: ++ break; ++ } ++ return 0; ++} ++ + static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) + { + AVFilterContext *avctx = link->dst; +@@ -1047,23 +1088,27 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) + avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); + + if (ctx->field_order == V4L2_FIELD_ANY) { +- AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0]; ++ const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0]; ++ const uint32_t pixelformat = desc_pixelformat(drm_desc); ++ ++ if (pixelformat == 0) { ++ av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n", ++ av_fourcc2str(drm_desc->layers[0].format), ++ drm_desc->nb_objects, drm_desc->objects[0].format_modifier); ++ return AVERROR(EINVAL); ++ } ++ + ctx->orig_width = drm_desc->layers[0].planes[0].pitch; + ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; + + av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height, + drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); + +- if (in->top_field_first) +- ctx->field_order = V4L2_FIELD_INTERLACED_TB; +- else +- ctx->field_order = V4L2_FIELD_INTERLACED_BT; +- +- ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); ++ ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); + if (ret) + return ret; + +- ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); ++ ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, 
drm_desc->layers[0].planes[1].offset); + if (ret) + return ret; + +@@ -1082,6 +1127,12 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) + ret = deint_v4l2m2m_streamon(output); + if (ret) + return ret; ++ ++ if (in->top_field_first) ++ ctx->field_order = V4L2_FIELD_INTERLACED_TB; ++ else ++ ctx->field_order = V4L2_FIELD_INTERLACED_BT; ++ + } + + ret = deint_v4l2m2m_enqueue_frame(output, in); +@@ -1157,28 +1208,31 @@ again: + return 0; + } + +- { ++ recycle_q(&s->output); ++ n = count_enqueued(&s->output); ++ ++ while (n < 6) { + AVFrame * frame; + int rv; + +- recycle_q(&s->output); +- n = count_enqueued(&s->output); ++ if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { ++ av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); ++ return rv; ++ } + +- while (n < 6) { +- if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { +- av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); +- return rv; +- } ++ if (frame == NULL) { ++ av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); ++ break; ++ } + +- if (frame == NULL) { +- av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); +- break; +- } ++ rv = deint_v4l2m2m_filter_frame(inlink, frame); ++ av_frame_free(&frame); + +- deint_v4l2m2m_filter_frame(inlink, frame); +- av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); +- ++n; +- } ++ if (rv != 0) ++ return rv; ++ ++ av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); ++ ++n; + } + + if (n < 6) { + +From 234480a99c88f30b201dc16c9ffc8d7f22212be3 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Fri, 19 Aug 2022 15:29:11 +0000 +Subject: [PATCH 063/113] v4l2_req: Enable use of MMAP for buffer alloc + +Use MMAP rather than DMABUF if either the dmabuf device can't be opened +or create_buf doesn't set the capability. 
+--- + libavcodec/v4l2_req_dmabufs.c | 22 +++ + libavcodec/v4l2_req_dmabufs.h | 3 + + libavcodec/v4l2_req_media.c | 263 ++++++++++++++++++++++++++++----- + libavcodec/v4l2_req_media.h | 21 ++- + libavcodec/v4l2_request_hevc.c | 42 +++++- + 5 files changed, 307 insertions(+), 44 deletions(-) + +diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c +index ae6c648369..c4bbed18c6 100644 +--- a/libavcodec/v4l2_req_dmabufs.c ++++ b/libavcodec/v4l2_req_dmabufs.c +@@ -36,6 +36,26 @@ static unsigned int total_bufs = 0; + static size_t total_size = 0; + #endif + ++struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size) ++{ ++ struct dmabuf_h *dh; ++ ++ if (mapptr == MAP_FAILED) ++ return NULL; ++ ++ dh = malloc(sizeof(*dh)); ++ if (!dh) ++ return NULL; ++ ++ *dh = (struct dmabuf_h) { ++ .fd = -1, ++ .size = size, ++ .mapptr = mapptr ++ }; ++ ++ return dh; ++} ++ + struct dmabuf_h * dmabuf_import(int fd, size_t size) + { + struct dmabuf_h *dh; +@@ -122,6 +142,8 @@ int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags) + struct dma_buf_sync sync = { + .flags = flags + }; ++ if (dh->fd == -1) ++ return 0; + while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) { + const int err = errno; + if (errno == EINTR) +diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h +index cfb17e801d..c1d3d8c8d7 100644 +--- a/libavcodec/v4l2_req_dmabufs.h ++++ b/libavcodec/v4l2_req_dmabufs.h +@@ -18,6 +18,9 @@ static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t s + } + /* Create from existing fd - dups(fd) */ + struct dmabuf_h * dmabuf_import(int fd, size_t size); ++/* Import an MMAP - return NULL if mapptr = MAP_FAIL */ ++struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size); ++ + void * dmabuf_map(struct dmabuf_h * const dh); + + /* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */ +diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c +index 980b306b8a..910ac77bb6 100644 +--- 
a/libavcodec/v4l2_req_media.c ++++ b/libavcodec/v4l2_req_media.c +@@ -33,9 +33,11 @@ + #include + #include + #include ++#include + #include + #include + #include ++#include + + #include + +@@ -95,6 +97,32 @@ struct media_request { + struct polltask * pt; + }; + ++static inline enum v4l2_memory ++mediabufs_memory_to_v4l2(const enum mediabufs_memory m) ++{ ++ return (enum v4l2_memory)m; ++} ++ ++const char * ++mediabufs_memory_name(const enum mediabufs_memory m) ++{ ++ switch (m) { ++ case MEDIABUFS_MEMORY_UNSET: ++ return "Unset"; ++ case MEDIABUFS_MEMORY_MMAP: ++ return "MMap"; ++ case MEDIABUFS_MEMORY_USERPTR: ++ return "UserPtr"; ++ case MEDIABUFS_MEMORY_OVERLAY: ++ return "Overlay"; ++ case MEDIABUFS_MEMORY_DMABUF: ++ return "DMABuf"; ++ default: ++ break; ++ } ++ return "Unknown"; ++} ++ + + static inline int do_trywait(sem_t *const sem) + { +@@ -115,14 +143,14 @@ static inline int do_wait(sem_t *const sem) + } + + static int request_buffers(int video_fd, unsigned int type, +- enum v4l2_memory memory, unsigned int buffers_count) ++ enum mediabufs_memory memory, unsigned int buffers_count) + { + struct v4l2_requestbuffers buffers; + int rc; + + memset(&buffers, 0, sizeof(buffers)); + buffers.type = type; +- buffers.memory = memory; ++ buffers.memory = mediabufs_memory_to_v4l2(memory); + buffers.count = buffers_count; + + rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers); +@@ -324,6 +352,7 @@ struct qent_base { + struct qent_base *next; + struct qent_base *prev; + enum qent_status status; ++ enum mediabufs_memory memtype; + uint32_t index; + struct dmabuf_h *dh[VIDEO_MAX_PLANES]; + struct timeval timestamp; +@@ -348,9 +377,9 @@ struct qe_list_head { + }; + + struct buf_pool { ++ enum mediabufs_memory memtype; + pthread_mutex_t lock; + sem_t free_sem; +- enum v4l2_buf_type buf_type; + struct qe_list_head free; + struct qe_list_head inuse; + }; +@@ -367,9 +396,10 @@ static inline struct qent_src *base_to_src(struct qent_base *be) + } + + +-#define 
QENT_BASE_INITIALIZER {\ ++#define QENT_BASE_INITIALIZER(mtype) {\ + .ref_count = ATOMIC_VAR_INIT(0),\ + .status = QENT_NEW,\ ++ .memtype = (mtype),\ + .index = INDEX_UNSET\ + } + +@@ -390,13 +420,13 @@ static void qe_src_free(struct qent_src *const be_src) + free(be_src); + } + +-static struct qent_src * qe_src_new(void) ++static struct qent_src * qe_src_new(enum mediabufs_memory mtype) + { + struct qent_src *const be_src = malloc(sizeof(*be_src)); + if (!be_src) + return NULL; + *be_src = (struct qent_src){ +- .base = QENT_BASE_INITIALIZER ++ .base = QENT_BASE_INITIALIZER(mtype) + }; + return be_src; + } +@@ -413,13 +443,13 @@ static void qe_dst_free(struct qent_dst *const be_dst) + free(be_dst); + } + +-static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl) ++static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl, const enum mediabufs_memory memtype) + { + struct qent_dst *const be_dst = malloc(sizeof(*be_dst)); + if (!be_dst) + return NULL; + *be_dst = (struct qent_dst){ +- .base = QENT_BASE_INITIALIZER, ++ .base = QENT_BASE_INITIALIZER(memtype), + .lock = PTHREAD_MUTEX_INITIALIZER, + .cond = PTHREAD_COND_INITIALIZER, + .mbc_wl = ff_weak_link_ref(wl) +@@ -553,14 +583,14 @@ static struct qent_base *queue_tryget_free(struct buf_pool *const bp) + return buf; + } + +-static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd) ++static struct qent_base * queue_find_extract_index(struct buf_pool *const bp, const unsigned int index) + { + struct qent_base *be; + + pthread_mutex_lock(&bp->lock); + /* Expect 1st in Q, but allow anywhere */ + for (be = bp->inuse.head; be; be = be->next) { +- if (dmabuf_fd(be->dh[0]) == fd) { ++ if (be->index == index) { + bq_extract_inuse(bp, be); + break; + } +@@ -602,6 +632,8 @@ struct mediabufs_ctl { + struct pollqueue * pq; + struct ff_weak_link_master * this_wlm; + ++ enum mediabufs_memory src_memtype; ++ enum mediabufs_memory dst_memtype; + struct v4l2_format 
src_fmt; + struct v4l2_format dst_fmt; + struct v4l2_capability capability; +@@ -614,7 +646,7 @@ static int qe_v4l2_queue(struct qent_base *const be, + { + struct v4l2_buffer buffer = { + .type = fmt->type, +- .memory = V4L2_MEMORY_DMABUF, ++ .memory = mediabufs_memory_to_v4l2(be->memtype), + .index = be->index + }; + struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; +@@ -628,7 +660,10 @@ static int qe_v4l2_queue(struct qent_base *const be, + /* *** Really need a pixdesc rather than a format so we can fill in data_offset */ + planes[i].length = dmabuf_size(be->dh[i]); + planes[i].bytesused = dmabuf_len(be->dh[i]); +- planes[i].m.fd = dmabuf_fd(be->dh[i]); ++ if (be->memtype == MEDIABUFS_MEMORY_DMABUF) ++ planes[i].m.fd = dmabuf_fd(be->dh[i]); ++ else ++ planes[i].m.mem_offset = 0; + } + buffer.m.planes = planes; + buffer.length = i; +@@ -639,7 +674,10 @@ static int qe_v4l2_queue(struct qent_base *const be, + + buffer.bytesused = dmabuf_len(be->dh[0]); + buffer.length = dmabuf_size(be->dh[0]); +- buffer.m.fd = dmabuf_fd(be->dh[0]); ++ if (be->memtype == MEDIABUFS_MEMORY_DMABUF) ++ buffer.m.fd = dmabuf_fd(be->dh[0]); ++ else ++ buffer.m.offset = 0; + } + + if (!is_dst && mreq) { +@@ -668,14 +706,13 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp, + const int vfd, + const struct v4l2_format * const f) + { +- int fd; + struct qent_base *be; + int rc; + const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type); + struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; + struct v4l2_buffer buffer = { + .type = f->type, +- .memory = V4L2_MEMORY_DMABUF ++ .memory = mediabufs_memory_to_v4l2(bp->memtype) + }; + if (mp) { + buffer.length = f->fmt.pix_mp.num_planes; +@@ -690,10 +727,9 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp, + return NULL; + } + +- fd = mp ? 
planes[0].m.fd : buffer.m.fd; +- be = queue_find_extract_fd(bp, fd); ++ be = queue_find_extract_index(bp, buffer.index); + if (!be) { +- request_log("Failed to find fd %d in Q\n", fd); ++ request_log("Failed to find index %d in Q\n", buffer.index); + return NULL; + } + +@@ -1104,7 +1140,7 @@ static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, stru + + struct v4l2_create_buffers cbuf = { + .count = n, +- .memory = V4L2_MEMORY_DMABUF, ++ .memory = mediabufs_memory_to_v4l2(mbc->dst->memtype), + .format = mbc->dst_fmt, + }; + +@@ -1125,12 +1161,97 @@ static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, stru + return cbuf.count; + } + ++static MediaBufsStatus ++qe_import_from_buf(struct mediabufs_ctl *const mbc, struct qent_base * const be, const struct v4l2_format *const fmt, ++ const unsigned int n, const bool x_dmabuf) ++{ ++ struct v4l2_buffer buf = { ++ .index = n, ++ .type = fmt->type, ++ }; ++ struct v4l2_plane planes[VIDEO_MAX_PLANES]; ++ int ret; ++ ++ if (be->dh[0]) ++ return 0; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ memset(planes, 0, sizeof(planes)); ++ buf.m.planes = planes; ++ buf.length = VIDEO_MAX_PLANES; ++ } ++ ++ if ((ret = ioctl(mbc->vfd, VIDIOC_QUERYBUF, &buf)) != 0) { ++ request_err(mbc->dc, "VIDIOC_QUERYBUF failed"); ++ return MEDIABUFS_ERROR_OPERATION_FAILED; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) ++ { ++ unsigned int i; ++ for (i = 0; i != buf.length; ++i) { ++ if (x_dmabuf) { ++ struct v4l2_exportbuffer xbuf = { ++ .type = buf.type, ++ .index = buf.index, ++ .plane = i, ++ .flags = O_RDWR, // *** Arguably O_RDONLY would be fine ++ }; ++ if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0) ++ be->dh[i] = dmabuf_import(xbuf.fd, planes[i].length); ++ } ++ else { ++ be->dh[i] = dmabuf_import_mmap( ++ mmap(NULL, planes[i].length, ++ PROT_READ | PROT_WRITE, ++ MAP_SHARED | MAP_POPULATE, ++ mbc->vfd, planes[i].m.mem_offset), ++ planes[i].length); ++ } ++ /* On failure tidy up and die 
*/ ++ if (!be->dh[i]) { ++ while (i--) { ++ dmabuf_free(be->dh[i]); ++ be->dh[i] = NULL; ++ } ++ return MEDIABUFS_ERROR_OPERATION_FAILED; + } + } -+ for (int j = 0; j < 24; ++j) -+ for (int i = 0; i < 48; ++i) -+ filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4); -+ if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name)) -+ bench_new(filter_buf1 + 4 * 48 + 16, 48, 1); -+ if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name)) -+ bench_new(filter_buf1 + 4 * 48 + 16, 48, 31); ++ } ++ else ++ { ++ if (x_dmabuf) { ++ struct v4l2_exportbuffer xbuf = { ++ .type = buf.type, ++ .index = buf.index, ++ .flags = O_RDWR, // *** Arguably O_RDONLY would be fine ++ }; ++ if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0) ++ be->dh[0] = dmabuf_import(xbuf.fd, buf.length); ++ } ++ else { ++ be->dh[0] = dmabuf_import_mmap( ++ mmap(NULL, buf.length, ++ PROT_READ | PROT_WRITE, ++ MAP_SHARED | MAP_POPULATE, ++ mbc->vfd, buf.m.offset), ++ buf.length); ++ } ++ /* On failure tidy up and die */ ++ if (!be->dh[0]) { ++ return MEDIABUFS_ERROR_OPERATION_FAILED; ++ } ++ } ++ ++ return 0; ++} ++ + struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc) + { + struct qent_dst * be_dst; + + if (mbc == NULL) { +- be_dst = qe_dst_new(NULL); ++ be_dst = qe_dst_new(NULL, MEDIABUFS_MEMORY_DMABUF); + if (be_dst) + be_dst->base.status = QENT_IMPORT; + return be_dst; +@@ -1144,7 +1265,7 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struc + else { + be_dst = base_to_dst(queue_tryget_free(mbc->dst)); + if (!be_dst) { +- be_dst = qe_dst_new(mbc->this_wlm); ++ be_dst = qe_dst_new(mbc->this_wlm, mbc->dst->memtype); + if (!be_dst) + return NULL; + +@@ -1155,12 +1276,21 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struc + } + } + +- if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) { +- /* Given how create buf works we can't uncreate it on alloc failure +- * all we can do is 
put it on the free Q +- */ +- queue_put_free(mbc->dst, &be_dst->base); +- return NULL; ++ if (mbc->dst->memtype == MEDIABUFS_MEMORY_MMAP) { ++ if (qe_import_from_buf(mbc, &be_dst->base, &mbc->dst_fmt, be_dst->base.index, true)) { ++ request_err(mbc->dc, "Failed to export as dmabuf\n"); ++ queue_put_free(mbc->dst, &be_dst->base); ++ return NULL; ++ } ++ } ++ else { ++ if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) { ++ /* Given how create buf works we can't uncreate it on alloc failure ++ * all we can do is put it on the free Q ++ */ ++ queue_put_free(mbc->dst, &be_dst->base); ++ return NULL; ++ } + } + + be_dst->base.status = QENT_PENDING; +@@ -1208,7 +1338,7 @@ MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc, + + // ** This is a mess if we get partial alloc but without any way to remove + // individual V4L2 Q members we are somewhat stuffed +-MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed) ++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype) + { + unsigned int i; + int a = 0; +@@ -1218,10 +1348,12 @@ MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, cons + if (n > 32) + return MEDIABUFS_ERROR_ALLOCATION_FAILED; + ++ mbc->dst->memtype = memtype; ++ + // Create qents first as it is hard to get rid of the V4L2 buffers on error + for (qc = 0; qc != n; ++qc) + { +- if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL) ++ if ((qes[qc] = qe_dst_new(mbc->this_wlm, mbc->dst->memtype)) == NULL) + goto fail; + } + +@@ -1260,19 +1392,61 @@ void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src * + queue_put_free(mbc->src, &qe_src->base); + } + ++static MediaBufsStatus ++chk_memory_type(struct mediabufs_ctl *const mbc, ++ const struct v4l2_format * const f, ++ const enum mediabufs_memory m) ++{ ++ struct v4l2_create_buffers cbuf = 
{ ++ .count = 0, ++ .memory = V4L2_MEMORY_MMAP, ++ .format = *f ++ }; ++ ++ if (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf) != 0) ++ return MEDIABUFS_ERROR_OPERATION_FAILED; ++ ++ switch (m) { ++ case MEDIABUFS_MEMORY_DMABUF: ++ // 0 = Unknown but assume not in that case ++ if ((cbuf.capabilities & V4L2_BUF_CAP_SUPPORTS_DMABUF) == 0) ++ return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY; ++ break; ++ case MEDIABUFS_MEMORY_MMAP: ++ break; ++ default: ++ return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY; ++ } ++ ++ return MEDIABUFS_STATUS_SUCCESS; ++} ++ ++MediaBufsStatus ++mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype) ++{ ++ return chk_memory_type(mbc, &mbc->src_fmt, memtype); ++} ++ ++MediaBufsStatus ++mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype) ++{ ++ return chk_memory_type(mbc, &mbc->dst_fmt, memtype); ++} ++ + /* src format must have been set up before this */ + MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc, + struct dmabufs_ctl * const dbsc, +- unsigned int n) ++ unsigned int n, const enum mediabufs_memory memtype) + { + unsigned int i; + struct v4l2_requestbuffers req = { + .count = n, + .type = mbc->src_fmt.type, +- .memory = V4L2_MEMORY_DMABUF ++ .memory = mediabufs_memory_to_v4l2(memtype) + }; + + bq_free_all_free_src(mbc->src); ++ + while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) { + if (errno != EINTR) { + request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__); +@@ -1286,21 +1460,36 @@ MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc, + } + + for (i = 0; i != n; ++i) { +- struct qent_src *const be_src = qe_src_new(); ++ struct qent_src *const be_src = qe_src_new(memtype); + if (!be_src) { + request_err(mbc->dc, "Failed to create src be %d\n", i); + goto fail; + } +- if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) { +- qe_src_free(be_src); ++ switch (memtype) { ++ case MEDIABUFS_MEMORY_MMAP: ++ 
if (qe_import_from_buf(mbc, &be_src->base, &mbc->src_fmt, i, false)) { ++ qe_src_free(be_src); ++ goto fail; ++ } ++ be_src->fixed_size = 1; ++ break; ++ case MEDIABUFS_MEMORY_DMABUF: ++ if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) { ++ qe_src_free(be_src); ++ goto fail; ++ } ++ be_src->fixed_size = !mediabufs_src_resizable(mbc); ++ break; ++ default: ++ request_err(mbc->dc, "Unexpected memorty type\n"); + goto fail; + } + be_src->base.index = i; +- be_src->fixed_size = !mediabufs_src_resizable(mbc); + + queue_put_free(mbc->src, &be_src->base); + } + ++ mbc->src->memtype = memtype; + return MEDIABUFS_STATUS_SUCCESS; + + fail: +@@ -1437,9 +1626,13 @@ int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ + + int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc) + { ++#if 1 ++ return 0; ++#else + // Single planar OUTPUT can only take exact size buffers + // Multiplanar will take larger than negotiated + return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type); ++#endif + } + + static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc) +diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h +index 0307a831de..890947b2e2 100644 +--- a/libavcodec/v4l2_req_media.h ++++ b/libavcodec/v4l2_req_media.h +@@ -43,6 +43,7 @@ typedef enum media_buf_status { + MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE, + MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT, + MEDIABUFS_ERROR_ALLOCATION_FAILED, ++ MEDIABUFS_ERROR_UNSUPPORTED_MEMORY, + } MediaBufsStatus; + + struct media_pool * media_pool_new(const char * const media_path, +@@ -70,6 +71,15 @@ struct qent_dst; + struct dmabuf_h; + struct dmabufs_ctl; + ++// 1-1 mammping to V4L2 type - just defined separetely to avoid some include versioning difficulties ++enum mediabufs_memory { ++ MEDIABUFS_MEMORY_UNSET = 0, ++ MEDIABUFS_MEMORY_MMAP = 1, ++ MEDIABUFS_MEMORY_USERPTR = 2, ++ MEDIABUFS_MEMORY_OVERLAY = 3, ++ MEDIABUFS_MEMORY_DMABUF = 4, ++}; ++ + int qent_src_params_set(struct 
qent_src *const be, const struct timeval * timestamp); + struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst); + +@@ -93,6 +103,8 @@ MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst, + unsigned int plane, + int fd, size_t size); + ++const char * mediabufs_memory_name(const enum mediabufs_memory m); ++ + MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc, + struct media_request **const pmreq, + struct qent_src **const psrc_be, +@@ -106,7 +118,7 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, + // Create dst slots without alloc + // If fixed true then qent_alloc will only get slots from this pool and will + // block until a qent has been unrefed +-MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed); ++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype); + + MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc); + MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc); +@@ -140,7 +152,12 @@ MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, + + MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw, + struct dmabufs_ctl * const dbsc, +- unsigned int n); ++ unsigned int n, ++ const enum mediabufs_memory memtype); ++ ++// Want to have appropriate formats set first ++MediaBufsStatus mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype); ++MediaBufsStatus mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype); + + #define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c)) + unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc); +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +index cd79aad563..5cf17dd5e3 
100644 +--- a/libavcodec/v4l2_request_hevc.c ++++ b/libavcodec/v4l2_request_hevc.c +@@ -144,6 +144,8 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) + const struct decdev * decdev; + const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes + size_t src_size; ++ enum mediabufs_memory src_memtype; ++ enum mediabufs_memory dst_memtype; + + av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); + +@@ -174,8 +176,14 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) + decdev_media_path(decdev), decdev_video_path(decdev)); + + if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) { +- av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n"); +- goto fail0; ++ av_log(avctx, AV_LOG_DEBUG, "Unable to open dmabufs - try mmap buffers\n"); ++ src_memtype = MEDIABUFS_MEMORY_MMAP; ++ dst_memtype = MEDIABUFS_MEMORY_MMAP; ++ } ++ else { ++ av_log(avctx, AV_LOG_DEBUG, "Dmabufs opened - try dmabuf buffers\n"); ++ src_memtype = MEDIABUFS_MEMORY_DMABUF; ++ dst_memtype = MEDIABUFS_MEMORY_DMABUF; + } + + if ((ctx->pq = pollqueue_new()) == NULL) { +@@ -196,8 +204,9 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) + // Ask for an initial bitbuf size of max size / 4 + // We will realloc if we need more + // Must use sps->h/w as avctx contains cropped size ++retry_src_memtype: + src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8); +- if (mediabufs_src_resizable(ctx->mbufs)) ++ if (src_memtype == MEDIABUFS_MEMORY_DMABUF && mediabufs_src_resizable(ctx->mbufs)) + src_size /= 4; + // Kludge for conformance tests which break Annex A limits + else if (src_size < 0x40000) +@@ -210,6 +219,15 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) + goto fail4; + } + ++ if (mediabufs_src_chk_memtype(ctx->mbufs, src_memtype)) { ++ if (src_memtype == MEDIABUFS_MEMORY_DMABUF) { ++ src_memtype = MEDIABUFS_MEMORY_MMAP; ++ goto retry_src_memtype; ++ } ++ av_log(avctx, AV_LOG_ERROR, "Failed to 
get src memory type\n"); ++ goto fail4; ++ } ++ + if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 4); +@@ -238,7 +256,7 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) + goto fail4; + } + +- if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) { ++ if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6, src_memtype)) { + av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n"); + goto fail4; + } +@@ -250,8 +268,17 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) + sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering, + avctx->thread_count, avctx->extra_hw_frames); + ++ if (mediabufs_dst_chk_memtype(ctx->mbufs, dst_memtype)) { ++ if (dst_memtype != MEDIABUFS_MEMORY_DMABUF) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to get dst memory type\n"); ++ goto fail4; ++ } ++ av_log(avctx, AV_LOG_DEBUG, "Dst DMABUF not supported - trying mmap\n"); ++ dst_memtype = MEDIABUFS_MEMORY_MMAP; ++ } ++ + // extra_hw_frames is -1 if unset +- if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) { ++ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0), dst_memtype)) { + av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n"); + goto fail4; + } +@@ -277,9 +304,10 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) + // Set our s/w format + avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format; + +- av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n", ++ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s\n", + ctx->fns->name, +- decdev_media_path(decdev), decdev_video_path(decdev)); ++ decdev_media_path(decdev), decdev_video_path(decdev), ++ mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype)); + + return 0; + + +From ea0d413ee122f53e6697919884c92a3505c6ae52 Mon Sep 17 
00:00:00 2001 +From: John Cox +Date: Mon, 22 Aug 2022 12:35:40 +0000 +Subject: [PATCH 064/113] Set buffer lengths on DQ + +--- + libavcodec/v4l2_req_media.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c +index 910ac77bb6..1a9944774a 100644 +--- a/libavcodec/v4l2_req_media.c ++++ b/libavcodec/v4l2_req_media.c +@@ -733,6 +733,14 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp, + return NULL; + } + ++ if (mp) { ++ unsigned int i; ++ for (i = 0; i != buffer.length; ++i) ++ dmabuf_len_set(be->dh[i], V4L2_TYPE_IS_CAPTURE(f->type) ? planes[i].bytesused : 0); ++ } ++ else ++ dmabuf_len_set(be->dh[0], V4L2_TYPE_IS_CAPTURE(f->type) ? buffer.length : 0); ++ + be->timestamp = buffer.timestamp; + be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE; + return be; + +From f15e0e5b59b559e4ffb9c3e6673b1815900f0ff3 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 22 Aug 2022 17:11:24 +0000 +Subject: [PATCH 065/113] Fix compile if videodev2.h defines V4L2 HEVC request + API + +If videodev2.h does define the HEVC request API it is really hard to +set old variations of the controls so if it does then we only compile +against the system includes and remove the back compatability. 
+--- + configure | 9 +++++++++ + libavcodec/Makefile | 4 ++-- + libavcodec/hevc-ctrls-v4.h | 2 ++ + libavcodec/v4l2_req_hevc_vx.c | 5 ----- + libavcodec/v4l2_request_hevc.c | 6 ++++-- + 5 files changed, 17 insertions(+), 9 deletions(-) + +diff --git a/configure b/configure +index a8e7ee5dab..f555f60dc8 100755 +--- a/configure ++++ b/configure +@@ -1957,6 +1957,7 @@ FEATURE_LIST=" + swscale_alpha + vout_drm + vout_egl ++ v4l2_req_hevc_vx + " + + # this list should be kept in linking order +@@ -6853,6 +6854,14 @@ fi + + check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns + check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" ++disable v4l2_req_hevc_vx ++if enabled hevc_v4l2request_hwaccel; then ++ enable v4l2_req_hevc_vx ++fi ++if enabled hevc_v4l2_request; then ++ disable v4l2_req_hevc_vx ++fi ++ + check_headers sys/videoio.h + test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index a8951ddcbe..928756850f 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -975,8 +975,8 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o + OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o +-OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ +- v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o v4l2_req_hevc_v4.o ++OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o v4l2_req_hevc_v4.o ++OBJS-$(CONFIG_V4L2_REQ_HEVC_VX) += v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o + OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o + OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o + OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o +diff --git a/libavcodec/hevc-ctrls-v4.h 
b/libavcodec/hevc-ctrls-v4.h +index 7e05f6e7c3..7829d82084 100644 +--- a/libavcodec/hevc-ctrls-v4.h ++++ b/libavcodec/hevc-ctrls-v4.h +@@ -53,6 +53,8 @@ + #include + #include + ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++ + #define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) + #define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401) + #define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402) +diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c +index 5d083016f8..e1bd5c6a1f 100644 +--- a/libavcodec/v4l2_req_hevc_vx.c ++++ b/libavcodec/v4l2_req_hevc_vx.c +@@ -40,11 +40,6 @@ + #define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B + #endif + +-// Should be in videodev2 but we might not have a good enough one +-#ifndef V4L2_PIX_FMT_HEVC_SLICE +-#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ +-#endif +- + #include "v4l2_request_hevc.h" + + #include "libavutil/hwcontext_drm.h" +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +index 5cf17dd5e3..614a1b4d99 100644 +--- a/libavcodec/v4l2_request_hevc.c ++++ b/libavcodec/v4l2_request_hevc.c +@@ -17,7 +17,7 @@ + */ + + +- ++#include "config.h" + #include "decode.h" + #include "hevcdec.h" + #include "hwconfig.h" +@@ -142,7 +142,7 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx) + const HEVCSPS * const sps = h->ps.sps; + int ret; + const struct decdev * decdev; +- const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes ++ const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 4).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes + size_t src_size; + enum mediabufs_memory src_memtype; + enum mediabufs_memory dst_memtype; +@@ -232,6 +232,7 @@ retry_src_memtype: + av_log(avctx, 
AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 4); + } ++#if CONFIG_V4L2_REQ_HEVC_VX + else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 3); +@@ -244,6 +245,7 @@ retry_src_memtype: + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 1); + } ++#endif + else { + av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n"); + ret = AVERROR(EINVAL); + +From 6b41732aaec86b0ade91003419b89035940b32fe Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 12 Sep 2022 17:59:22 +0100 +Subject: [PATCH 066/113] v4l2_m2m_enc: Send headers in in pkt side_data + +If GLOBAL_HEADERS are requested then we can't provide them at init time +so send as NEW_EXTRADATA side data in a similar way to some AV1 +encoders. +--- + libavcodec/v4l2_m2m_enc.c | 33 +++++++++++++++++++++++---------- + 1 file changed, 23 insertions(+), 10 deletions(-) + +diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c +index db6014d8e3..c98f2129dc 100644 +--- a/libavcodec/v4l2_m2m_enc.c ++++ b/libavcodec/v4l2_m2m_enc.c +@@ -544,14 +544,12 @@ dequeue: + av_freep(&avctx->extradata); + avctx->extradata_size = 0; + +- if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL) +- memcpy(data, avpkt->data, len); ++ if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL) ++ goto fail_no_mem; + ++ memcpy(data, avpkt->data, len); + av_packet_unref(avpkt); + +- if (data == NULL) +- return AVERROR(ENOMEM); +- + // We need to copy the header, but keep local if not global + if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) { + avctx->extradata = data; +@@ -567,18 +565,28 @@ dequeue: + } + + // First frame must be key so mark as such even if encoder forgot +- if (capture->first_buf == 2) ++ if (capture->first_buf == 2) { + avpkt->flags |= AV_PKT_FLAG_KEY; + ++ 
// Add any extradata to the 1st packet we emit as we cannot create it at init ++ if (avctx->extradata_size > 0 && avctx->extradata) { ++ void * const side = av_packet_new_side_data(avpkt, ++ AV_PKT_DATA_NEW_EXTRADATA, ++ avctx->extradata_size); ++ if (!side) ++ goto fail_no_mem; ++ ++ memcpy(side, avctx->extradata, avctx->extradata_size); ++ } ++ } ++ + // Add SPS/PPS to the start of every key frame if non-global headers + if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) { + const size_t newlen = s->extdata_size + avpkt->size; + AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE); + +- if (buf == NULL) { +- av_packet_unref(avpkt); +- return AVERROR(ENOMEM); +- } ++ if (buf == NULL) ++ goto fail_no_mem; + + memcpy(buf->data, s->extdata_data, s->extdata_size); + memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size); +@@ -592,6 +600,11 @@ dequeue: + // av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret); + capture->first_buf = 0; + return 0; ++ ++fail_no_mem: ++ ret = AVERROR(ENOMEM); ++ av_packet_unref(avpkt); ++ return ret; + } + + static av_cold int v4l2_encode_init(AVCodecContext *avctx) + +From b8ea626474687c921372b04b138d6b1ed94fb67e Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 14 Sep 2022 15:44:10 +0000 +Subject: [PATCH 067/113] matroskaenc: Allow H264 SPS/PPS headers in packet + sidedata + +--- + libavformat/matroskaenc.c | 26 ++++++++++++++++++++++---- + 1 file changed, 22 insertions(+), 4 deletions(-) + +diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c +index 1256bdfe36..fae9509442 100644 +--- a/libavformat/matroskaenc.c ++++ b/libavformat/matroskaenc.c +@@ -75,6 +75,10 @@ + + #define IS_WEBM(mkv) (CONFIG_WEBM_MUXER && CONFIG_MATROSKA_MUXER ? 
\ + ((mkv)->mode == MODE_WEBM) : CONFIG_WEBM_MUXER) ++ ++/* Reserved size for H264 headers if not extant at init time */ ++#define MAX_H264_HEADER_SIZE 1024 ++ + #define IS_SEEKABLE(pb, mkv) (((pb)->seekable & AVIO_SEEKABLE_NORMAL) && \ + !(mkv)->is_live) + +@@ -1119,8 +1123,12 @@ static int mkv_assemble_native_codecprivate(AVFormatContext *s, AVIOContext *dyn + case AV_CODEC_ID_WAVPACK: + return put_wv_codecpriv(dyn_cp, extradata, extradata_size); + case AV_CODEC_ID_H264: +- return ff_isom_write_avcc(dyn_cp, extradata, +- extradata_size); ++ if (par->extradata_size) ++ return ff_isom_write_avcc(dyn_cp, extradata, ++ extradata_size); ++ else ++ *size_to_reserve = MAX_H264_HEADER_SIZE; ++ break; + case AV_CODEC_ID_HEVC: + return ff_isom_write_hvcc(dyn_cp, extradata, + extradata_size, 0); +@@ -2726,8 +2734,8 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt) + } + break; + #endif +- // FIXME: Remove the following once libaom starts propagating proper extradata during init() +- // See https://bugs.chromium.org/p/aomedia/issues/detail?id=2208 ++ // FIXME: Remove the following once libaom starts propagating extradata during init() ++ // See https://bugs.chromium.org/p/aomedia/issues/detail?id=2012 + case AV_CODEC_ID_AV1: + if (side_data_size && mkv->track.bc && !par->extradata_size) { + // If the reserved space doesn't suffice, only write +@@ -2739,6 +2747,16 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt) + } else if (!par->extradata_size) + return AVERROR_INVALIDDATA; + break; ++ // H264 V4L2 has a similar issue ++ case AV_CODEC_ID_H264: ++ if (side_data_size && mkv->track.bc && !par->extradata_size) { ++ ret = mkv_update_codecprivate(s, mkv, side_data, side_data_size, ++ par, mkv->track.bc, track, 0); ++ if (ret < 0) ++ return ret; ++ } else if (!par->extradata_size) ++ return AVERROR_INVALIDDATA; ++ break; + default: + if (side_data_size) + av_log(s, AV_LOG_DEBUG, "Ignoring new extradata in a packet for 
stream %d.\n", pkt->stream_index); + +From aa58b31d5d68abcf5989c747e1d1cde5223813e3 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 14 Sep 2022 15:55:15 +0000 +Subject: [PATCH 068/113] movenc: Allow H264 SPS/PPS headers in packet sidedata + +--- + libavformat/movenc.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/libavformat/movenc.c b/libavformat/movenc.c +index 5608afde42..0f2e236eaa 100644 +--- a/libavformat/movenc.c ++++ b/libavformat/movenc.c +@@ -6318,6 +6318,7 @@ static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt) + if (trk->par->codec_id == AV_CODEC_ID_MP4ALS || + trk->par->codec_id == AV_CODEC_ID_AAC || + trk->par->codec_id == AV_CODEC_ID_AV1 || ++ trk->par->codec_id == AV_CODEC_ID_H264 || + trk->par->codec_id == AV_CODEC_ID_FLAC) { + size_t side_size; + uint8_t *side = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); + +From 840df546f21f95cc80eeb3f224d53c6dc19f34b9 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 26 Sep 2022 12:45:05 +0100 +Subject: [PATCH 069/113] Allow ffmpeg to select codec internal hwfmts if + no_cvt_hw + +This allows the selection of DRM_PRIME from v4l2m2m without forcing it +in the decoder. 
+ +Not utterly sure this is the right method for 5.1 but it does work +--- + fftools/ffmpeg.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c +index 517324df3a..697b8a71cf 100644 +--- a/fftools/ffmpeg.c ++++ b/fftools/ffmpeg.c +@@ -2571,12 +2571,15 @@ static enum AVPixelFormat get_format(AVCodecContext *s, const enum AVPixelFormat + break; + + if (ist->hwaccel_id == HWACCEL_GENERIC || +- ist->hwaccel_id == HWACCEL_AUTO) { ++ ist->hwaccel_id == HWACCEL_AUTO || ++ no_cvt_hw) { + for (i = 0;; i++) { + config = avcodec_get_hw_config(s->codec, i); + if (!config) + break; +- if (!(config->methods & ++ if (no_cvt_hw && (config->methods & AV_CODEC_HW_CONFIG_METHOD_INTERNAL)) ++ av_log(s, AV_LOG_DEBUG, "no_cvt_hw so trying pix_fmt %d with codec internal hwaccel\n", *p); ++ else if (!(config->methods & + AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX)) + continue; + if (config->pix_fmt == *p) + +From 21b9406798673c6d6275fc7b2f96975b7733e49b Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 1 Sep 2022 11:42:41 +0000 +Subject: [PATCH 070/113] vf_deinterlace_v4l2m2m: Add a v4l2m2m scaler + +The logic for running an isp based scaler is pretty much identical to +that for the deinterlacer so add to the deinterlacer. This requires +some rework of the setup code to avoid assumptions that are true for +deinterlace but not scale but the reworked code requires few switches +based on operation. 
+--- + libavfilter/allfilters.c | 1 + + libavfilter/vf_deinterlace_v4l2m2m.c | 1123 ++++++++++++++++++++------ + 2 files changed, 877 insertions(+), 247 deletions(-) + +diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c +index 9b17c37eb3..7c651f5a9d 100644 +--- a/libavfilter/allfilters.c ++++ b/libavfilter/allfilters.c +@@ -415,6 +415,7 @@ extern const AVFilter ff_vf_scale; + extern const AVFilter ff_vf_scale_cuda; + extern const AVFilter ff_vf_scale_npp; + extern const AVFilter ff_vf_scale_qsv; ++extern const AVFilter ff_vf_scale_v4l2m2m; + extern const AVFilter ff_vf_scale_vaapi; + extern const AVFilter ff_vf_scale_vulkan; + extern const AVFilter ff_vf_scale2ref; +diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c +index 1a3bef5bcb..2df39ec0f1 100644 +--- a/libavfilter/vf_deinterlace_v4l2m2m.c ++++ b/libavfilter/vf_deinterlace_v4l2m2m.c +@@ -52,31 +52,36 @@ + #include "avfilter.h" + #include "formats.h" + #include "internal.h" ++#include "scale_eval.h" + #include "video.h" + ++#ifndef DRM_FORMAT_P030 ++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */ ++#endif ++ + typedef struct V4L2Queue V4L2Queue; + typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; + +-typedef struct V4L2PlaneInfo { +- int bytesperline; +- size_t length; +-} V4L2PlaneInfo; ++typedef enum filter_type_v4l2_e ++{ ++ FILTER_V4L2_DEINTERLACE = 1, ++ FILTER_V4L2_SCALE, ++} filter_type_v4l2_t; + + typedef struct V4L2Buffer { + int enqueued; + int reenqueue; +- int fd; + struct v4l2_buffer buffer; + AVFrame frame; + struct v4l2_plane planes[VIDEO_MAX_PLANES]; + int num_planes; +- V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES]; + AVDRMFrameDescriptor drm_frame; + V4L2Queue *q; + } V4L2Buffer; + + typedef struct V4L2Queue { + struct v4l2_format format; ++ struct v4l2_selection sel; + int num_buffers; + V4L2Buffer *buffers; + DeintV4L2M2MContextShared *ctx; +@@ -111,11 
+116,18 @@ typedef struct pts_track_s + + typedef struct DeintV4L2M2MContextShared { + void * logctx; // For logging - will be NULL when done ++ filter_type_v4l2_t filter_type; + + int fd; + int done; + int width; + int height; ++ ++ // from options ++ int output_width; ++ int output_height; ++ enum AVPixelFormat output_format; ++ + int orig_width; + int orig_height; + atomic_uint refcount; +@@ -134,8 +146,60 @@ typedef struct DeintV4L2M2MContext { + const AVClass *class; + + DeintV4L2M2MContextShared *shared; ++ ++ char * w_expr; ++ char * h_expr; ++ char * output_format_string;; ++ ++ int force_original_aspect_ratio; ++ int force_divisible_by; ++ ++ char *colour_primaries_string; ++ char *colour_transfer_string; ++ char *colour_matrix_string; ++ int colour_range; ++ char *chroma_location_string; ++ ++ enum AVColorPrimaries colour_primaries; ++ enum AVColorTransferCharacteristic colour_transfer; ++ enum AVColorSpace colour_matrix; ++ enum AVChromaLocation chroma_location; + } DeintV4L2M2MContext; + ++// These just list the ones we know we can cope with ++static uint32_t ++fmt_av_to_v4l2(const enum AVPixelFormat avfmt) ++{ ++ switch (avfmt) { ++ case AV_PIX_FMT_YUV420P: ++ return V4L2_PIX_FMT_YUV420; ++ case AV_PIX_FMT_NV12: ++ return V4L2_PIX_FMT_NV12; ++ case AV_PIX_FMT_RPI4_8: ++ case AV_PIX_FMT_SAND128: ++ return V4L2_PIX_FMT_NV12_COL128; ++ default: ++ break; ++ } ++ return 0; ++} ++ ++static enum AVPixelFormat ++fmt_v4l2_to_av(const uint32_t pixfmt) ++{ ++ switch (pixfmt) { ++ case V4L2_PIX_FMT_YUV420: ++ return AV_PIX_FMT_YUV420P; ++ case V4L2_PIX_FMT_NV12: ++ return AV_PIX_FMT_NV12; ++ case V4L2_PIX_FMT_NV12_COL128: ++ return AV_PIX_FMT_RPI4_8; ++ default: ++ break; ++ } ++ return AV_PIX_FMT_NONE; ++} ++ + static unsigned int pts_stats_interval(const pts_stats_t * const stats) + { + return stats->last_interval; +@@ -301,6 +365,39 @@ static int pts_track_init(pts_track_t * const trk, void *logctx) + return 0; + } + ++static inline uint32_t ++fmt_bpl(const 
struct v4l2_format * const fmt, const unsigned int plane_n) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.plane_fmt[plane_n].bytesperline : fmt->fmt.pix.bytesperline; ++} ++ ++static inline uint32_t ++fmt_height(const struct v4l2_format * const fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; ++} ++ ++static inline uint32_t ++fmt_width(const struct v4l2_format * const fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; ++} ++ ++static inline uint32_t ++fmt_pixelformat(const struct v4l2_format * const fmt) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; ++} ++ ++static void ++init_format(V4L2Queue * const q, const uint32_t format_type) ++{ ++ memset(&q->format, 0, sizeof(q->format)); ++ memset(&q->sel, 0, sizeof(q->sel)); ++ q->format.type = format_type; ++ q->sel.type = format_type; ++} ++ + static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) + { + struct v4l2_capability cap; +@@ -311,80 +408,99 @@ static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) + if (ret < 0) + return ret; + +- if (!(cap.capabilities & V4L2_CAP_STREAMING)) ++ if (ctx->filter_type == FILTER_V4L2_SCALE && ++ strcmp("bcm2835-codec-isp", cap.card) != 0) ++ { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Not ISP\n"); + return AVERROR(EINVAL); ++ } + +- if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { +- ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; +- ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; +- +- return 0; ++ if (!(cap.capabilities & V4L2_CAP_STREAMING)) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "No streaming\n"); ++ return AVERROR(EINVAL); + } + + if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { +- ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; +- ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; +- +- return 0; ++ 
init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE); ++ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE); ++ } ++ else if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { ++ init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE); ++ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT); ++ } ++ else { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Not M2M\n"); ++ return AVERROR(EINVAL); + } + +- return AVERROR(EINVAL); ++ return 0; + } + +-static int deint_v4l2m2m_try_format(V4L2Queue *queue) ++// Just use for probe - doesn't modify q format ++static int deint_v4l2m2m_try_format(V4L2Queue *queue, const uint32_t width, const uint32_t height, const enum AVPixelFormat avfmt) + { +- struct v4l2_format *fmt = &queue->format; ++ struct v4l2_format fmt = {.type = queue->format.type}; + DeintV4L2M2MContextShared *ctx = queue->ctx; + int ret, field; ++ // Pick YUV to test with if not otherwise specified ++ uint32_t pixelformat = avfmt == AV_PIX_FMT_NONE ? V4L2_PIX_FMT_YUV420 : fmt_av_to_v4l2(avfmt); ++ enum AVPixelFormat r_avfmt; ++ + +- ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt); ++ ret = ioctl(ctx->fd, VIDIOC_G_FMT, &fmt); + if (ret) + av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret); + +- if (V4L2_TYPE_IS_OUTPUT(fmt->type)) ++ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && V4L2_TYPE_IS_OUTPUT(fmt.type)) + field = V4L2_FIELD_INTERLACED_TB; + else + field = V4L2_FIELD_NONE; + +- if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { +- fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420; +- fmt->fmt.pix_mp.field = field; +- fmt->fmt.pix_mp.width = ctx->width; +- fmt->fmt.pix_mp.height = ctx->height; ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { ++ fmt.fmt.pix_mp.pixelformat = pixelformat; ++ fmt.fmt.pix_mp.field = field; ++ fmt.fmt.pix_mp.width = width; ++ fmt.fmt.pix_mp.height = height; + } else { +- fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420; +- fmt->fmt.pix.field = field; +- fmt->fmt.pix.width = ctx->width; +- fmt->fmt.pix.height = ctx->height; ++ 
fmt.fmt.pix.pixelformat = pixelformat; ++ fmt.fmt.pix.field = field; ++ fmt.fmt.pix.width = width; ++ fmt.fmt.pix.height = height; + } + +- av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, +- fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, +- fmt->fmt.pix_mp.pixelformat, +- fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); ++ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, ++ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height, ++ fmt.fmt.pix_mp.pixelformat, ++ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline); + +- ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt); ++ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, &fmt); + if (ret) + return AVERROR(EINVAL); + +- av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, +- fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, +- fmt->fmt.pix_mp.pixelformat, +- fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); ++ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, ++ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height, ++ fmt.fmt.pix_mp.pixelformat, ++ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline); + +- if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { +- if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 && +- fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) || +- fmt->fmt.pix_mp.field != field) { +- av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); ++ r_avfmt = fmt_v4l2_to_av(fmt_pixelformat(&fmt)); ++ if (r_avfmt != avfmt && avfmt != AV_PIX_FMT_NONE) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Unable to set format %s on %s port\n", av_get_pix_fmt_name(avfmt), 
V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src"); ++ return AVERROR(EINVAL); ++ } ++ if (r_avfmt == AV_PIX_FMT_NONE) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "No supported format on %s port\n", V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src"); ++ return AVERROR(EINVAL); ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) { ++ if (fmt.fmt.pix_mp.field != field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type); + + return AVERROR(EINVAL); + } + } else { +- if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 && +- fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12) || +- fmt->fmt.pix.field != field) { +- av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); ++ if (fmt.fmt.pix.field != field) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type); + + return AVERROR(EINVAL); + } +@@ -393,68 +509,410 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue) + return 0; + } + +-static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize) ++static int ++do_s_fmt(V4L2Queue * const q) + { +- struct v4l2_format *fmt = &queue->format; +- DeintV4L2M2MContextShared *ctx = queue->ctx; ++ DeintV4L2M2MContextShared * const ctx = q->ctx; ++ const uint32_t pixelformat = fmt_pixelformat(&q->format); + int ret; + +- struct v4l2_selection sel = { +- .type = fmt->type, +- .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS, +- }; +- +- // This works for most single object 4:2:0 types +- if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { +- fmt->fmt.pix_mp.pixelformat = pixelformat; +- fmt->fmt.pix_mp.field = field; +- fmt->fmt.pix_mp.width = width; +- fmt->fmt.pix_mp.height = ysize / pitch; +- fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch; +- fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1); +- } else { +- fmt->fmt.pix.pixelformat = pixelformat; +- fmt->fmt.pix.field = field; +- fmt->fmt.pix.width = width; +- fmt->fmt.pix.height = height; +- fmt->fmt.pix.sizeimage = 0; +- fmt->fmt.pix.bytesperline = 0; +- } +- +- ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); ++ ret = ioctl(ctx->fd, VIDIOC_S_FMT, &q->format); + if (ret) { + ret = AVERROR(errno); +- av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); ++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %s\n", av_err2str(ret)); + return ret; + } + +- if (pixelformat != fmt->fmt.pix.pixelformat) { +- av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat)); ++ if (pixelformat != fmt_pixelformat(&q->format)) { ++ av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt_pixelformat(&q->format))); + return AVERROR(EINVAL); + } + +- ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel); ++ q->sel.target = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, ++ q->sel.flags = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? 
V4L2_SEL_FLAG_LE : V4L2_SEL_FLAG_GE; ++ ++ ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &q->sel); + if (ret) { + ret = AVERROR(errno); +- av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret); ++ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %s\n", av_err2str(ret)); + } + +- sel.r.width = width; +- sel.r.height = height; +- sel.r.left = 0; +- sel.r.top = 0; +- sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, +- sel.flags = V4L2_SEL_FLAG_LE; ++ return 0; ++} + +- ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel); +- if (ret) { +- ret = AVERROR(errno); +- av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret); ++static void ++set_fmt_color(struct v4l2_format *const fmt, ++ const enum AVColorPrimaries avcp, ++ const enum AVColorSpace avcs, ++ const enum AVColorTransferCharacteristic avxc) ++{ ++ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT; ++ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT; ++ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT; ++ ++ switch (avcp) { ++ case AVCOL_PRI_BT709: ++ cs = V4L2_COLORSPACE_REC709; ++ ycbcr = V4L2_YCBCR_ENC_709; ++ break; ++ case AVCOL_PRI_BT470M: ++ cs = V4L2_COLORSPACE_470_SYSTEM_M; ++ ycbcr = V4L2_YCBCR_ENC_601; ++ break; ++ case AVCOL_PRI_BT470BG: ++ cs = V4L2_COLORSPACE_470_SYSTEM_BG; ++ break; ++ case AVCOL_PRI_SMPTE170M: ++ cs = V4L2_COLORSPACE_SMPTE170M; ++ break; ++ case AVCOL_PRI_SMPTE240M: ++ cs = V4L2_COLORSPACE_SMPTE240M; ++ break; ++ case AVCOL_PRI_BT2020: ++ cs = V4L2_COLORSPACE_BT2020; ++ break; ++ case AVCOL_PRI_SMPTE428: ++ case AVCOL_PRI_SMPTE431: ++ case AVCOL_PRI_SMPTE432: ++ case AVCOL_PRI_EBU3213: ++ case AVCOL_PRI_RESERVED: ++ case AVCOL_PRI_FILM: ++ case AVCOL_PRI_UNSPECIFIED: ++ default: ++ break; ++ } ++ ++ switch (avcs) { ++ case AVCOL_SPC_RGB: ++ cs = V4L2_COLORSPACE_SRGB; ++ break; ++ case AVCOL_SPC_BT709: ++ cs = V4L2_COLORSPACE_REC709; ++ break; ++ case AVCOL_SPC_FCC: ++ cs = 
V4L2_COLORSPACE_470_SYSTEM_M; ++ break; ++ case AVCOL_SPC_BT470BG: ++ cs = V4L2_COLORSPACE_470_SYSTEM_BG; ++ break; ++ case AVCOL_SPC_SMPTE170M: ++ cs = V4L2_COLORSPACE_SMPTE170M; ++ break; ++ case AVCOL_SPC_SMPTE240M: ++ cs = V4L2_COLORSPACE_SMPTE240M; ++ break; ++ case AVCOL_SPC_BT2020_CL: ++ cs = V4L2_COLORSPACE_BT2020; ++ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM; ++ break; ++ case AVCOL_SPC_BT2020_NCL: ++ cs = V4L2_COLORSPACE_BT2020; ++ break; ++ default: ++ break; ++ } ++ ++ switch (xfer) { ++ case AVCOL_TRC_BT709: ++ xfer = V4L2_XFER_FUNC_709; ++ break; ++ case AVCOL_TRC_IEC61966_2_1: ++ xfer = V4L2_XFER_FUNC_SRGB; ++ break; ++ case AVCOL_TRC_SMPTE240M: ++ xfer = V4L2_XFER_FUNC_SMPTE240M; ++ break; ++ case AVCOL_TRC_SMPTE2084: ++ xfer = V4L2_XFER_FUNC_SMPTE2084; ++ break; ++ default: ++ break; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.colorspace = cs; ++ fmt->fmt.pix_mp.ycbcr_enc = ycbcr; ++ fmt->fmt.pix_mp.xfer_func = xfer; ++ } else { ++ fmt->fmt.pix.colorspace = cs; ++ fmt->fmt.pix.ycbcr_enc = ycbcr; ++ fmt->fmt.pix.xfer_func = xfer; + } +} + -+#define TEST_UNESCAPE \ -+ do { \ -+ for (int count = 100; count > 0; --count) { \ -+ escaped_offset = rnd() & 7; \ -+ unescaped_offset = rnd() & 7; \ -+ escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7); \ -+ RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE); \ -+ len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \ -+ len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \ -+ if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE)) \ -+ fail(); \ -+ } \ ++static void ++set_fmt_color_range(struct v4l2_format *const fmt, const enum AVColorRange avcr) ++{ ++ const enum v4l2_quantization q = ++ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE : ++ avcr == AVCOL_RANGE_JPEG ? 
V4L2_QUANTIZATION_FULL_RANGE : ++ V4L2_QUANTIZATION_DEFAULT; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.quantization = q; ++ } else { ++ fmt->fmt.pix.quantization = q; ++ } ++} ++ ++static enum AVColorPrimaries get_color_primaries(const struct v4l2_format *const fmt) ++{ ++ enum v4l2_ycbcr_encoding ycbcr; ++ enum v4l2_colorspace cs; ++ ++ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? ++ fmt->fmt.pix_mp.colorspace : ++ fmt->fmt.pix.colorspace; ++ ++ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? ++ fmt->fmt.pix_mp.ycbcr_enc: ++ fmt->fmt.pix.ycbcr_enc; ++ ++ switch(ycbcr) { ++ case V4L2_YCBCR_ENC_XV709: ++ case V4L2_YCBCR_ENC_709: return AVCOL_PRI_BT709; ++ case V4L2_YCBCR_ENC_XV601: ++ case V4L2_YCBCR_ENC_601:return AVCOL_PRI_BT470M; ++ default: ++ break; ++ } ++ ++ switch(cs) { ++ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_PRI_BT470BG; ++ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_PRI_SMPTE170M; ++ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_PRI_SMPTE240M; ++ case V4L2_COLORSPACE_BT2020: return AVCOL_PRI_BT2020; ++ default: ++ break; ++ } ++ ++ return AVCOL_PRI_UNSPECIFIED; ++} ++ ++static enum AVColorSpace get_color_space(const struct v4l2_format *const fmt) ++{ ++ enum v4l2_ycbcr_encoding ycbcr; ++ enum v4l2_colorspace cs; ++ ++ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? ++ fmt->fmt.pix_mp.colorspace : ++ fmt->fmt.pix.colorspace; ++ ++ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
++ fmt->fmt.pix_mp.ycbcr_enc: ++ fmt->fmt.pix.ycbcr_enc; ++ ++ switch(cs) { ++ case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB; ++ case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709; ++ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC; ++ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG; ++ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M; ++ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M; ++ case V4L2_COLORSPACE_BT2020: ++ if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM) ++ return AVCOL_SPC_BT2020_CL; ++ else ++ return AVCOL_SPC_BT2020_NCL; ++ default: ++ break; ++ } ++ ++ return AVCOL_SPC_UNSPECIFIED; ++} ++ ++static enum AVColorTransferCharacteristic get_color_trc(const struct v4l2_format *const fmt) ++{ ++ enum v4l2_ycbcr_encoding ycbcr; ++ enum v4l2_xfer_func xfer; ++ enum v4l2_colorspace cs; ++ ++ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? ++ fmt->fmt.pix_mp.colorspace : ++ fmt->fmt.pix.colorspace; ++ ++ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? ++ fmt->fmt.pix_mp.ycbcr_enc: ++ fmt->fmt.pix.ycbcr_enc; ++ ++ xfer = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? ++ fmt->fmt.pix_mp.xfer_func: ++ fmt->fmt.pix.xfer_func; ++ ++ switch (xfer) { ++ case V4L2_XFER_FUNC_709: return AVCOL_TRC_BT709; ++ case V4L2_XFER_FUNC_SRGB: return AVCOL_TRC_IEC61966_2_1; ++ default: ++ break; ++ } ++ ++ switch (cs) { ++ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_TRC_GAMMA22; ++ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_TRC_GAMMA28; ++ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_TRC_SMPTE170M; ++ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_TRC_SMPTE240M; ++ default: ++ break; ++ } ++ ++ switch (ycbcr) { ++ case V4L2_YCBCR_ENC_XV709: ++ case V4L2_YCBCR_ENC_XV601: return AVCOL_TRC_BT1361_ECG; ++ default: ++ break; ++ } ++ ++ return AVCOL_TRC_UNSPECIFIED; ++} ++ ++static enum AVColorRange get_color_range(const struct v4l2_format *const fmt) ++{ ++ enum v4l2_quantization qt; ++ ++ qt = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
++ fmt->fmt.pix_mp.quantization : ++ fmt->fmt.pix.quantization; ++ ++ switch (qt) { ++ case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG; ++ case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG; ++ default: ++ break; ++ } ++ ++ return AVCOL_RANGE_UNSPECIFIED; ++} ++ ++static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) ++{ ++ struct v4l2_format *const format = &q->format; ++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0]; ++ ++ const uint32_t drm_fmt = src->layers[0].format; ++ // Treat INVALID as LINEAR ++ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ? ++ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier; ++ uint32_t pix_fmt = 0; ++ uint32_t w = 0; ++ uint32_t h = 0; ++ uint32_t bpl = src->layers[0].planes[0].pitch; ++ ++ // We really don't expect multiple layers ++ // All formats that we currently cope with are single object ++ ++ if (src->nb_layers != 1 || src->nb_objects != 1) ++ return AVERROR(EINVAL); ++ ++ switch (drm_fmt) { ++ case DRM_FORMAT_YUV420: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 3) ++ break; ++ pix_fmt = V4L2_PIX_FMT_YUV420; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++ break; ++ ++ case DRM_FORMAT_NV12: ++ if (mod == DRM_FORMAT_MOD_LINEAR) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12; ++ h = src->layers[0].planes[1].offset / bpl; ++ w = bpl; ++ } ++ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_COL128; ++ w = bpl; ++ h = src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++ break; ++ ++ case DRM_FORMAT_P030: ++ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { ++ if (src->layers[0].nb_planes != 2) ++ break; ++ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128; ++ w = bpl / 2; // Matching lie to 
how we construct this ++ h = src->layers[0].planes[1].offset / 128; ++ bpl = fourcc_mod_broadcom_param(mod); ++ } ++ break; ++ ++ default: ++ break; ++ } ++ ++ if (!pix_fmt) ++ return AVERROR(EINVAL); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { ++ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp; ++ ++ pix->width = w; ++ pix->height = h; ++ pix->pixelformat = pix_fmt; ++ pix->plane_fmt[0].bytesperline = bpl; ++ pix->num_planes = 1; ++ } ++ else { ++ struct v4l2_pix_format *const pix = &format->fmt.pix; ++ ++ pix->width = w; ++ pix->height = h; ++ pix->pixelformat = pix_fmt; ++ pix->bytesperline = bpl; + } + ++ set_fmt_color(format, frame->color_primaries, frame->colorspace, frame->color_trc); ++ set_fmt_color_range(format, frame->color_range); ++ ++ q->sel.r.width = frame->width - (frame->crop_left + frame->crop_right); ++ q->sel.r.height = frame->height - (frame->crop_top + frame->crop_bottom); ++ q->sel.r.left = frame->crop_left; ++ q->sel.r.top = frame->crop_top; ++ + return 0; + } + ++ ++static int set_dst_format(DeintV4L2M2MContext * const priv, V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height) ++{ ++ struct v4l2_format * const fmt = &queue->format; ++ struct v4l2_selection *const sel = &queue->sel; ++ ++ memset(&fmt->fmt, 0, sizeof(fmt->fmt)); ++ ++ // Align w/h to 16 here in case there are alignment requirements at the next ++ // stage of the filter chain (also RPi deinterlace setup is bust and this ++ // fixes it) ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { ++ fmt->fmt.pix_mp.pixelformat = pixelformat; ++ fmt->fmt.pix_mp.field = field; ++ fmt->fmt.pix_mp.width = FFALIGN(width, 16); ++ fmt->fmt.pix_mp.height = FFALIGN(height, 16); ++ } else { ++ fmt->fmt.pix.pixelformat = pixelformat; ++ fmt->fmt.pix.field = field; ++ fmt->fmt.pix.width = FFALIGN(width, 16); ++ fmt->fmt.pix.height = FFALIGN(height, 16); ++ } ++ ++ set_fmt_color(fmt, priv->colour_primaries, priv->colour_matrix, priv->colour_transfer); ++ 
set_fmt_color_range(fmt, priv->colour_range); ++ ++ sel->r.width = width; ++ sel->r.height = height; ++ sel->r.left = 0; ++ sel->r.top = 0; ++ ++ return do_s_fmt(queue); ++} ++ + static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) + { + int ret; +@@ -464,16 +922,22 @@ static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node + return AVERROR(errno); + + ret = deint_v4l2m2m_prepare_context(ctx); +- if (ret) ++ if (ret) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to prepare context\n"); + goto fail; ++ } + +- ret = deint_v4l2m2m_try_format(&ctx->capture); +- if (ret) ++ ret = deint_v4l2m2m_try_format(&ctx->capture, ctx->output_width, ctx->output_height, ctx->output_format); ++ if (ret) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try dst format\n"); + goto fail; ++ } + +- ret = deint_v4l2m2m_try_format(&ctx->output); +- if (ret) ++ ret = deint_v4l2m2m_try_format(&ctx->output, ctx->width, ctx->height, AV_PIX_FMT_NONE); ++ if (ret) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try src format\n"); + goto fail; ++ } + + return 0; + +@@ -534,26 +998,118 @@ static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) + return 0; + } + +-static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat) ++static void ++drm_frame_init(AVDRMFrameDescriptor * const d) ++{ ++ unsigned int i; ++ for (i = 0; i != AV_DRM_MAX_PLANES; ++i) { ++ d->objects[i].fd = -1; ++ } ++} ++ ++static void ++drm_frame_uninit(AVDRMFrameDescriptor * const d) ++{ ++ unsigned int i; ++ for (i = 0; i != d->nb_objects; ++i) { ++ if (d->objects[i].fd != -1) { ++ close(d->objects[i].fd); ++ d->objects[i].fd = -1; ++ } ++ } ++} ++ ++static void ++avbufs_delete(V4L2Buffer** ppavbufs, const unsigned int n) ++{ ++ unsigned int i; ++ V4L2Buffer* const avbufs = *ppavbufs; ++ ++ if (avbufs == NULL) ++ return; ++ *ppavbufs = NULL; ++ ++ for (i = 0; i != n; ++i) { ++ V4L2Buffer* const avbuf = avbufs + i; ++ 
drm_frame_uninit(&avbuf->drm_frame); ++ } ++ ++ av_free(avbufs); ++} ++ ++static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) + { + struct v4l2_exportbuffer expbuf; + int i, ret; + uint64_t mod = DRM_FORMAT_MOD_LINEAR; +- uint32_t fmt = 0; + +- switch (pixelformat) { +- case V4L2_PIX_FMT_NV12: +- fmt = DRM_FORMAT_NV12; +- break; +- case V4L2_PIX_FMT_YUV420: +- fmt = DRM_FORMAT_YUV420; +- break; +- default: +- return AVERROR(EINVAL); ++ AVDRMFrameDescriptor * const drm_desc = &avbuf->drm_frame; ++ AVDRMLayerDescriptor * const layer = &drm_desc->layers[0]; ++ const struct v4l2_format *const fmt = &q->format; ++ const uint32_t height = fmt_height(fmt); ++ const uint32_t width = fmt_width(fmt); ++ ptrdiff_t bpl0; ++ ++ /* fill the DRM frame descriptor */ ++ drm_desc->nb_layers = 1; ++ layer->nb_planes = avbuf->num_planes; ++ ++ for (int i = 0; i < avbuf->num_planes; i++) { ++ layer->planes[i].object_index = i; ++ layer->planes[i].offset = 0; ++ layer->planes[i].pitch = fmt_bpl(fmt, i); + } ++ bpl0 = layer->planes[0].pitch; ++ ++ switch (fmt_pixelformat(fmt)) { ++ ++ case V4L2_PIX_FMT_NV12_COL128: ++ mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0); ++ layer->format = V4L2_PIX_FMT_NV12; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 2; ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = height * 128; ++ layer->planes[0].pitch = width; ++ layer->planes[1].pitch = width; ++ break; + +- avbuf->drm_frame.layers[0].format = fmt; ++ case DRM_FORMAT_NV12: ++ layer->format = V4L2_PIX_FMT_NV12; + ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 2; ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = bpl0 * height; ++ layer->planes[1].pitch = bpl0; ++ break; ++ ++ case V4L2_PIX_FMT_YUV420: ++ layer->format = DRM_FORMAT_YUV420; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 3; ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = bpl0 * height; ++ 
layer->planes[1].pitch = bpl0 / 2; ++ layer->planes[2].object_index = 0; ++ layer->planes[2].offset = layer->planes[1].offset + ((bpl0 * height) / 4); ++ layer->planes[2].pitch = bpl0 / 2; ++ break; ++ ++ default: ++ drm_desc->nb_layers = 0; ++ return AVERROR(EINVAL); ++ } ++ ++ drm_desc->nb_objects = 0; + for (i = 0; i < avbuf->num_planes; i++) { + memset(&expbuf, 0, sizeof(expbuf)); + +@@ -565,19 +1121,11 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat) + if (ret < 0) + return AVERROR(errno); + +- avbuf->fd = expbuf.fd; +- +- if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) { +- /* drm frame */ +- avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; +- avbuf->drm_frame.objects[i].fd = expbuf.fd; +- avbuf->drm_frame.objects[i].format_modifier = mod; +- } else { +- /* drm frame */ +- avbuf->drm_frame.objects[0].size = avbuf->buffer.length; +- avbuf->drm_frame.objects[0].fd = expbuf.fd; +- avbuf->drm_frame.objects[0].format_modifier = mod; +- } ++ drm_desc->objects[i].size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type) ? ++ avbuf->buffer.m.planes[i].length : avbuf->buffer.length; ++ drm_desc->objects[i].fd = expbuf.fd; ++ drm_desc->objects[i].format_modifier = mod; ++ drm_desc->nb_objects = i + 1; + } + + return 0; +@@ -588,7 +1136,7 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) + struct v4l2_format *fmt = &queue->format; + DeintV4L2M2MContextShared *ctx = queue->ctx; + struct v4l2_requestbuffers req; +- int ret, i, j, multiplanar; ++ int ret, i, multiplanar; + uint32_t memory; + + memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
+@@ -617,10 +1165,9 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) + } + + for (i = 0; i < queue->num_buffers; i++) { +- V4L2Buffer *buf = &queue->buffers[i]; ++ V4L2Buffer * const buf = &queue->buffers[i]; + + buf->enqueued = 0; +- buf->fd = -1; + buf->q = queue; + + buf->buffer.type = fmt->type; +@@ -632,6 +1179,12 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) + buf->buffer.m.planes = buf->planes; + } + ++ drm_frame_init(&buf->drm_frame); ++ } ++ ++ for (i = 0; i < queue->num_buffers; i++) { ++ V4L2Buffer * const buf = &queue->buffers[i]; ++ + ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); + if (ret < 0) { + ret = AVERROR(errno); +@@ -639,29 +1192,14 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) + goto fail; + } + +- if (multiplanar) +- buf->num_planes = buf->buffer.length; +- else +- buf->num_planes = 1; +- +- for (j = 0; j < buf->num_planes; j++) { +- V4L2PlaneInfo *info = &buf->plane_info[j]; +- +- if (multiplanar) { +- info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline; +- info->length = buf->buffer.m.planes[j].length; +- } else { +- info->bytesperline = fmt->fmt.pix.bytesperline; +- info->length = buf->buffer.length; +- } +- } ++ buf->num_planes = multiplanar ? buf->buffer.length : 1; + + if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { + ret = deint_v4l2m2m_enqueue_buffer(buf); + if (ret) + goto fail; + +- ret = v4l2_buffer_export_drm(buf, multiplanar ? 
fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat); ++ ret = v4l2_buffer_export_drm(queue, buf); + if (ret) + goto fail; + } +@@ -670,12 +1208,8 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) + return 0; + + fail: +- for (i = 0; i < queue->num_buffers; i++) +- if (queue->buffers[i].fd >= 0) +- close(queue->buffers[i].fd); +- av_free(queue->buffers); +- queue->buffers = NULL; +- ++ avbufs_delete(&queue->buffers, queue->num_buffers); ++ queue->num_buffers = 0; + return ret; + } + +@@ -862,7 +1396,6 @@ static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) + if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { + V4L2Queue *capture = &ctx->capture; + V4L2Queue *output = &ctx->output; +- int i; + + av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); + +@@ -871,12 +1404,7 @@ static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) + deint_v4l2m2m_streamoff(output); + } + +- if (capture->buffers) +- for (i = 0; i < capture->num_buffers; i++) { +- capture->buffers[i].q = NULL; +- if (capture->buffers[i].fd >= 0) +- close(capture->buffers[i].fd); +- } ++ avbufs_delete(&capture->buffers, capture->num_buffers); + + deint_v4l2m2m_unref_queued(output); + +@@ -908,73 +1436,15 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused) + deint_v4l2m2m_destroy_context(ctx); + } + +-static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) +-{ +- AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; +- AVDRMLayerDescriptor *layer; +- +- /* fill the DRM frame descriptor */ +- drm_desc->nb_objects = avbuf->num_planes; +- drm_desc->nb_layers = 1; +- +- layer = &drm_desc->layers[0]; +- layer->nb_planes = avbuf->num_planes; +- +- for (int i = 0; i < avbuf->num_planes; i++) { +- layer->planes[i].object_index = i; +- layer->planes[i].offset = 0; +- layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; +- } +- +- switch (layer->format) { +- case DRM_FORMAT_YUYV: +- layer->nb_planes = 1; +- break; +- +- case 
DRM_FORMAT_NV12: +- case DRM_FORMAT_NV21: +- if (avbuf->num_planes > 1) +- break; +- +- layer->nb_planes = 2; +- +- layer->planes[1].object_index = 0; +- layer->planes[1].offset = avbuf->plane_info[0].bytesperline * +- height; +- layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; +- break; +- +- case DRM_FORMAT_YUV420: +- if (avbuf->num_planes > 1) +- break; +- +- layer->nb_planes = 3; +- +- layer->planes[1].object_index = 0; +- layer->planes[1].offset = avbuf->plane_info[0].bytesperline * +- height; +- layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; +- +- layer->planes[2].object_index = 0; +- layer->planes[2].offset = layer->planes[1].offset + +- ((avbuf->plane_info[0].bytesperline * +- height) >> 2); +- layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; +- break; +- +- default: +- drm_desc->nb_layers = 0; +- break; +- } +- +- return (uint8_t *) drm_desc; +-} +- + // timeout in ms + static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout) + { + DeintV4L2M2MContextShared *ctx = queue->ctx; + V4L2Buffer* avbuf; ++ enum AVColorPrimaries color_primaries; ++ enum AVColorSpace colorspace; ++ enum AVColorTransferCharacteristic color_trc; ++ enum AVColorRange color_range; + + av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); + +@@ -985,8 +1455,6 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim + } + + // Fill in PTS and anciliary info from src frame +- // we will want to overwrite some fields as only the pts/dts +- // fields are updated with new timing in this fn + pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); + + frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame, +@@ -999,18 +1467,36 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim + + atomic_fetch_add(&ctx->refcount, 1); + +- frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height); ++ frame->data[0] = (uint8_t 
*)&avbuf->drm_frame; + frame->format = AV_PIX_FMT_DRM_PRIME; + if (ctx->hw_frames_ctx) + frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); +- frame->height = ctx->height; +- frame->width = ctx->width; +- +- // Not interlaced now +- frame->interlaced_frame = 0; +- frame->top_field_first = 0; +- // Pkt duration halved +- frame->pkt_duration /= 2; ++ frame->height = ctx->output_height; ++ frame->width = ctx->output_width; ++ ++ color_primaries = get_color_primaries(&ctx->capture.format); ++ colorspace = get_color_space(&ctx->capture.format); ++ color_trc = get_color_trc(&ctx->capture.format); ++ color_range = get_color_range(&ctx->capture.format); ++ ++ // If the color parameters are unspecified by V4L2 then leave alone as they ++ // will have been copied from src ++ if (color_primaries != AVCOL_PRI_UNSPECIFIED) ++ frame->color_primaries = color_primaries; ++ if (colorspace != AVCOL_SPC_UNSPECIFIED) ++ frame->colorspace = colorspace; ++ if (color_trc != AVCOL_TRC_UNSPECIFIED) ++ frame->color_trc = color_trc; ++ if (color_range != AVCOL_RANGE_UNSPECIFIED) ++ frame->color_range = color_range; ++ ++ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE) { ++ // Not interlaced now ++ frame->interlaced_frame = 0; // *** Fill in from dst buffer? 
++ frame->top_field_first = 0; ++ // Pkt duration halved ++ frame->pkt_duration /= 2; ++ } + + if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { + av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n"); +@@ -1032,15 +1518,34 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink) + ctx->height = avctx->inputs[0]->h; + ctx->width = avctx->inputs[0]->w; + +- av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height); ++ if (ctx->filter_type == FILTER_V4L2_SCALE) { ++ if ((ret = ff_scale_eval_dimensions(priv, ++ priv->w_expr, priv->h_expr, ++ inlink, outlink, ++ &ctx->output_width, &ctx->output_height)) < 0) ++ return ret; ++ ++ ff_scale_adjust_dimensions(inlink, &ctx->output_width, &ctx->output_height, ++ priv->force_original_aspect_ratio, priv->force_divisible_by); ++ } ++ else { ++ ctx->output_width = ctx->width; ++ ctx->output_height = ctx->height; ++ } ++ ++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d\n", __func__, ctx->width, ctx->height, ctx->output_width, ctx->output_height); + + outlink->time_base = inlink->time_base; +- outlink->w = inlink->w; +- outlink->h = inlink->h; +- outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; ++ outlink->w = ctx->output_width; ++ outlink->h = ctx->output_height; + outlink->format = inlink->format; + outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate + ++ if (inlink->sample_aspect_ratio.num) ++ outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio); ++ else ++ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; ++ + ret = deint_v4l2m2m_find_device(ctx); + if (ret) + return ret; +@@ -1055,18 +1560,19 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink) + + static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc) + { +- const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR || +- drm_desc->objects[0].format_modifier == 
DRM_FORMAT_MOD_INVALID); ++ const uint64_t mod = drm_desc->objects[0].format_modifier; ++ const int is_linear = (mod == DRM_FORMAT_MOD_LINEAR || mod == DRM_FORMAT_MOD_INVALID); ++ ++ // Only currently support single object things ++ if (drm_desc->nb_objects != 1) ++ return 0; + + switch (drm_desc->layers[0].format) { + case DRM_FORMAT_YUV420: +- if (is_linear) +- return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0; +- break; ++ return is_linear ? V4L2_PIX_FMT_YUV420 : 0; + case DRM_FORMAT_NV12: +- if (is_linear) +- return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_NV12 : 0; +- break; ++ return is_linear ? V4L2_PIX_FMT_NV12 : ++ fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 : 0; + default: + break; + } +@@ -1089,7 +1595,7 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) + + if (ctx->field_order == V4L2_FIELD_ANY) { + const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0]; +- const uint32_t pixelformat = desc_pixelformat(drm_desc); ++ uint32_t pixelformat = desc_pixelformat(drm_desc); + + if (pixelformat == 0) { + av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n", +@@ -1104,29 +1610,49 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) + av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height, + drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); + +- ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); +- if (ret) ++ if ((ret = set_src_fmt(output, in)) != 0) { ++ av_log(avctx, AV_LOG_WARNING, "Unknown input DRM format: %s mod: %#" PRIx64 "\n", ++ av_fourcc2str(drm_desc->layers[0].format), drm_desc->objects[0].format_modifier); ++ return ret; ++ } ++ ++ ret = do_s_fmt(output); ++ if (ret) { ++ av_log(avctx, AV_LOG_WARNING, "Failed to set source 
format\n"); + return ret; ++ } + +- ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); +- if (ret) ++ if (ctx->output_format != AV_PIX_FMT_NONE) ++ pixelformat = fmt_av_to_v4l2(ctx->output_format); ++ ret = set_dst_format(priv, capture, pixelformat, V4L2_FIELD_NONE, ctx->output_width, ctx->output_height); ++ if (ret) { ++ av_log(avctx, AV_LOG_WARNING, "Failed to set destination format\n"); + return ret; ++ } + + ret = deint_v4l2m2m_allocate_buffers(capture); +- if (ret) ++ if (ret) { ++ av_log(avctx, AV_LOG_WARNING, "Failed to allocate destination buffers\n"); + return ret; ++ } + + ret = deint_v4l2m2m_streamon(capture); +- if (ret) ++ if (ret) { ++ av_log(avctx, AV_LOG_WARNING, "Failed set destination streamon: %s\n", av_err2str(ret)); + return ret; ++ } + + ret = deint_v4l2m2m_allocate_buffers(output); +- if (ret) ++ if (ret) { ++ av_log(avctx, AV_LOG_WARNING, "Failed to allocate src buffers\n"); + return ret; ++ } + + ret = deint_v4l2m2m_streamon(output); +- if (ret) ++ if (ret) { ++ av_log(avctx, AV_LOG_WARNING, "Failed set src streamon: %s\n", av_err2str(ret)); + return ret; ++ } + + if (in->top_field_first) + ctx->field_order = V4L2_FIELD_INTERLACED_TB; +@@ -1251,7 +1777,7 @@ again: + return did_something ? 
0 : FFERROR_NOT_READY; + } + +-static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) ++static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filter_type_v4l2_t filter_type) + { + DeintV4L2M2MContext * const priv = avctx->priv; + DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); +@@ -1262,6 +1788,7 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) + } + priv->shared = ctx; + ctx->logctx = priv; ++ ctx->filter_type = filter_type; + ctx->fd = -1; + ctx->output.ctx = ctx; + ctx->output.num_buffers = 8; +@@ -1274,9 +1801,52 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) + + atomic_init(&ctx->refcount, 1); + ++ if (priv->output_format_string) { ++ ctx->output_format = av_get_pix_fmt(priv->output_format_string); ++ if (ctx->output_format == AV_PIX_FMT_NONE) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid ffmpeg output format '%s'.\n", priv->output_format_string); ++ return AVERROR(EINVAL); ++ } ++ if (fmt_av_to_v4l2(ctx->output_format) == 0) { ++ av_log(avctx, AV_LOG_ERROR, "Unsupported output format for V4L2: %s.\n", av_get_pix_fmt_name(ctx->output_format)); ++ return AVERROR(EINVAL); ++ } ++ } else { ++ // Use the input format once that is configured. 
++ ctx->output_format = AV_PIX_FMT_NONE; ++ } ++ ++#define STRING_OPTION(var_name, func_name, default_value) do { \ ++ if (priv->var_name ## _string) { \ ++ int var = av_ ## func_name ## _from_name(priv->var_name ## _string); \ ++ if (var < 0) { \ ++ av_log(avctx, AV_LOG_ERROR, "Invalid %s.\n", #var_name); \ ++ return AVERROR(EINVAL); \ ++ } \ ++ priv->var_name = var; \ ++ } else { \ ++ priv->var_name = default_value; \ ++ } \ + } while (0) + -+static void check_unescape(void) ++ STRING_OPTION(colour_primaries, color_primaries, AVCOL_PRI_UNSPECIFIED); ++ STRING_OPTION(colour_transfer, color_transfer, AVCOL_TRC_UNSPECIFIED); ++ STRING_OPTION(colour_matrix, color_space, AVCOL_SPC_UNSPECIFIED); ++ STRING_OPTION(chroma_location, chroma_location, AVCHROMA_LOC_UNSPECIFIED); ++ + return 0; + } + ++static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) +{ -+ /* This appears to be a typical length of buffer in use */ -+#define LOG2_UNESCAPE_BUF_SIZE 17 -+#define UNESCAPE_BUF_SIZE (1u<priv; +@@ -1294,6 +1864,51 @@ static const AVOption deinterlace_v4l2m2m_options[] = { + + AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); + ++#define OFFSET(x) offsetof(DeintV4L2M2MContext, x) ++#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) + -+ if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) { -+ int len0, len1, escaped_offset, unescaped_offset, escaped_len; -+ declare_func_emms(AV_CPU_FLAG_MMX, int, const uint8_t *, int, uint8_t *); ++static const AVOption scale_v4l2m2m_options[] = { ++ { "w", "Output video width", ++ OFFSET(w_expr), AV_OPT_TYPE_STRING, {.str = "iw"}, .flags = FLAGS }, ++ { "h", "Output video height", ++ OFFSET(h_expr), AV_OPT_TYPE_STRING, {.str = "ih"}, .flags = FLAGS }, ++ { "format", "Output video format (software format of hardware frames)", ++ OFFSET(output_format_string), AV_OPT_TYPE_STRING, .flags = FLAGS }, ++ // These colour properties match the ones of the same name in vf_scale. 
++ { "out_color_matrix", "Output colour matrix coefficient set", ++ OFFSET(colour_matrix_string), AV_OPT_TYPE_STRING, { .str = NULL }, .flags = FLAGS }, ++ { "out_range", "Output colour range", ++ OFFSET(colour_range), AV_OPT_TYPE_INT, { .i64 = AVCOL_RANGE_UNSPECIFIED }, ++ AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_JPEG, FLAGS, "range" }, ++ { "full", "Full range", ++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, ++ { "limited", "Limited range", ++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, ++ { "jpeg", "Full range", ++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, ++ { "mpeg", "Limited range", ++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, ++ { "tv", "Limited range", ++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" }, ++ { "pc", "Full range", ++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" }, ++ // These colour properties match the ones in the VAAPI scaler ++ { "out_color_primaries", "Output colour primaries", ++ OFFSET(colour_primaries_string), AV_OPT_TYPE_STRING, ++ { .str = NULL }, .flags = FLAGS }, ++ { "out_color_transfer", "Output colour transfer characteristics", ++ OFFSET(colour_transfer_string), AV_OPT_TYPE_STRING, ++ { .str = NULL }, .flags = FLAGS }, ++ { "out_chroma_location", "Output chroma sample location", ++ OFFSET(chroma_location_string), AV_OPT_TYPE_STRING, ++ { .str = NULL }, .flags = FLAGS }, ++ { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" }, ++ { "force_divisible_by", "enforce that the output resolution is divisible by a defined integer when force_original_aspect_ratio is used", OFFSET(force_divisible_by), AV_OPT_TYPE_INT, { .i64 = 1}, 1, 256, FLAGS }, ++ { NULL }, ++}; + -+ /* Test data which consists of escapes sequences packed as 
tightly as possible */ -+ for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x) -+ escaped1[x] = escaped0[x] = 3 * (x % 3 == 0); -+ TEST_UNESCAPE; ++AVFILTER_DEFINE_CLASS(scale_v4l2m2m); + -+ /* Test random data */ -+ RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE); -+ TEST_UNESCAPE; + static const AVFilterPad deint_v4l2m2m_inputs[] = { + { + .name = "default", +@@ -1321,3 +1936,17 @@ AVFilter ff_vf_deinterlace_v4l2m2m = { + .priv_class = &deinterlace_v4l2m2m_class, + .activate = deint_v4l2m2m_activate, + }; + -+ /* Test data with escape sequences at random intervals */ -+ for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) { -+ int gap, gap_msb; -+ escaped1[x+0] = escaped0[x+0] = 0; -+ escaped1[x+1] = escaped0[x+1] = 0; -+ escaped1[x+2] = escaped0[x+2] = 3; -+ escaped1[x+3] = escaped0[x+3] = rnd() & 3; -+ gap_msb = 2u << (rnd() % 8); -+ gap = (rnd() &~ -gap_msb) | gap_msb; -+ x += gap; ++AVFilter ff_vf_scale_v4l2m2m = { ++ .name = "scale_v4l2m2m", ++ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M scaler"), ++ .priv_size = sizeof(DeintV4L2M2MContext), ++ .init = &scale_v4l2m2m_init, ++ .uninit = &deint_v4l2m2m_uninit, ++ FILTER_INPUTS(deint_v4l2m2m_inputs), ++ FILTER_OUTPUTS(deint_v4l2m2m_outputs), ++ FILTER_SINGLE_SAMPLEFMT(AV_PIX_FMT_DRM_PRIME), ++ .priv_class = &scale_v4l2m2m_class, ++ .activate = deint_v4l2m2m_activate, ++}; ++ + +From 84d071e7f04ac67ee2071c4512a635f0c4388b40 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 22 Sep 2022 14:54:46 +0000 +Subject: [PATCH 071/113] v4l2_m2m: Adjust buffer allocation based on min/max + controls + +Clip requested buffer count to min/max declared by driver. +If 0 buffers requested then set to min+2. +This allows encode to keep its src buffer count down to a plausible +minimum which helps with flow control. 
+--- + libavcodec/v4l2_context.c | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 0225f6ba64..5754a9fda7 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -1187,6 +1187,7 @@ fail_release: + + int ff_v4l2_context_init(V4L2Context* ctx) + { ++ struct v4l2_queryctrl qctrl; + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); + int ret; + +@@ -1228,6 +1229,24 @@ int ff_v4l2_context_init(V4L2Context* ctx) + goto fail_unref_hwframes; + } + ++ memset(&qctrl, 0, sizeof(qctrl)); ++ qctrl.id = V4L2_CID_MIN_BUFFERS_FOR_OUTPUT; ++ if (ioctl(s->fd, VIDIOC_QUERYCTRL, &qctrl) != 0) { ++ ret = AVERROR(errno); ++ if (ret != AVERROR(EINVAL)) { ++ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_QUERCTRL failed: %s\n", ctx->name, av_err2str(ret)); ++ goto fail_unref_hwframes; + } -+ TEST_UNESCAPE; ++ // Control unsupported - set default if wanted ++ if (ctx->num_buffers < 2) ++ ctx->num_buffers = 4; ++ } ++ else { ++ if (ctx->num_buffers < 2) ++ ctx->num_buffers = qctrl.minimum + 2; ++ ctx->num_buffers = av_clip(ctx->num_buffers, qctrl.minimum, qctrl.maximum); ++ } + -+ /* Test data which is known to contain no escape sequences */ -+ memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE); -+ memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE); -+ TEST_UNESCAPE; + ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem); + if (ret < 0) + goto fail_unref_hwframes; + +From 35d480e28ddaa7a68d06f00a52deb97f53e251d8 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 22 Sep 2022 15:00:12 +0000 +Subject: [PATCH 072/113] v4l2_m2m_dec: If src Q is full then wait indefinitely + for buffer + +If it is not possible to add another buffer to the src Q then alawys +wait indefinitely for either an output frame or the Q to have space. 
+ +This has issues if the reason that the Q is stalled is due to dst buffer +exhaustion and buffers cannot be returned async by another thread but +the current scheme confuses ffmpegs pipeline scheduling. +--- + libavcodec/v4l2_m2m_dec.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index bb809be41e..e67e06313f 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -456,9 +456,9 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + if (dst_rv != 0 && TRY_DQ(src_rv)) { + // Pick a timeout depending on state + const int t = ++ src_rv == NQ_Q_FULL ? -1 : + src_rv == NQ_DRAINING ? 300 : +- prefer_dq ? 5 : +- src_rv == NQ_Q_FULL ? -1 : 0; ++ prefer_dq ? 5 : 0; + + // Dequeue frame will unref any previous contents of frame + // if it returns success so we don't need an explicit unref + +From 9bfc5748f3b1325c598af5426f689ccc291087ec Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 22 Sep 2022 15:12:27 +0000 +Subject: [PATCH 073/113] vf_deinterlace_v4l2m2m: Add Q name to structure for + debug + +--- + libavfilter/vf_deinterlace_v4l2m2m.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c +index 2df39ec0f1..4edecc02bf 100644 +--- a/libavfilter/vf_deinterlace_v4l2m2m.c ++++ b/libavfilter/vf_deinterlace_v4l2m2m.c +@@ -84,6 +84,7 @@ typedef struct V4L2Queue { + struct v4l2_selection sel; + int num_buffers; + V4L2Buffer *buffers; ++ const char * name; + DeintV4L2M2MContextShared *ctx; + } V4L2Queue; + +@@ -1792,8 +1793,10 @@ static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filt + ctx->fd = -1; + ctx->output.ctx = ctx; + ctx->output.num_buffers = 8; ++ ctx->output.name = "OUTPUT"; + ctx->capture.ctx = ctx; + ctx->capture.num_buffers = 12; ++ ctx->capture.name = "CAPTURE"; + ctx->done = 0; + ctx->field_order = V4L2_FIELD_ANY; + + +From 
0a3064022e113153f6eaba307b7d982b1c350df7 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 22 Sep 2022 16:08:42 +0000 +Subject: [PATCH 074/113] v4l2_m2m_enc: Set src buffer count to min+2 by + default + +Set output.num_buffers to 0 by default which will then be set to min+2 +by the allocation code. This fixes an issue where the deinterlacer had +fewer dest buffer than the encoder has src buffers and so ran dry +creating deadlock in the ffmpeg filter chain. +--- + libavcodec/v4l2_m2m_enc.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c +index c98f2129dc..bc0c2d4245 100644 +--- a/libavcodec/v4l2_m2m_enc.c ++++ b/libavcodec/v4l2_m2m_enc.c +@@ -672,9 +672,10 @@ static av_cold int v4l2_encode_close(AVCodecContext *avctx) + #define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM + + #define V4L_M2M_CAPTURE_OPTS \ +- V4L_M2M_DEFAULT_OPTS,\ ++ { "num_output_buffers", "Number of buffers in the output context",\ ++ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },\ + { "num_capture_buffers", "Number of buffers in the capture context", \ +- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 4 }, 4, INT_MAX, FLAGS } ++ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 8 }, 8, INT_MAX, FLAGS } + + static const AVOption mpeg4_options[] = { + V4L_M2M_CAPTURE_OPTS, + +From 738e263692a00f5a9c4ba37e17a0950d1fdeed6f Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 22 Sep 2022 16:13:57 +0000 +Subject: [PATCH 075/113] vf_deinterlace_m2m: For deinterlace set outlink FR to + twice inlink + +We used to set the outlink framerate to unknown but it turns out that +ffmpegs filter pipeline copes with that badly. Otherwise leave at 0,0 +which will copy FR from inlink to outlink. 
+--- + libavfilter/vf_deinterlace_v4l2m2m.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c +index 4edecc02bf..c52dae1c44 100644 +--- a/libavfilter/vf_deinterlace_v4l2m2m.c ++++ b/libavfilter/vf_deinterlace_v4l2m2m.c +@@ -1534,13 +1534,16 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink) + ctx->output_height = ctx->height; + } + +- av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d\n", __func__, ctx->width, ctx->height, ctx->output_width, ctx->output_height); ++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d FR: %d/%d->%d/%d\n", __func__, ++ ctx->width, ctx->height, ctx->output_width, ctx->output_height, ++ inlink->frame_rate.num, inlink->frame_rate.den, outlink->frame_rate.num, outlink->frame_rate.den); + + outlink->time_base = inlink->time_base; + outlink->w = ctx->output_width; + outlink->h = ctx->output_height; + outlink->format = inlink->format; +- outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate ++ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && inlink->frame_rate.den != 0) ++ outlink->frame_rate = (AVRational){inlink->frame_rate.num * 2, inlink->frame_rate.den}; + + if (inlink->sample_aspect_ratio.num) + outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio); + +From 898b4ef41bfe2a2638f15850f752f04f9a9993bb Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Fri, 23 Sep 2022 11:30:56 +0000 +Subject: [PATCH 076/113] v4l2m2m: Add ff_v4l2_dq_all to drain all buffers from + a Q + +Useful for where (encode) we might have drmprime buffers that we want to +return to the source ASAP. 
+--- + libavcodec/v4l2_context.c | 17 +++++++++++------ + libavcodec/v4l2_context.h | 2 ++ + 2 files changed, 13 insertions(+), 6 deletions(-) + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 5754a9fda7..eaaec44666 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -707,17 +707,22 @@ clean_v4l2_buffer(V4L2Buffer * const avbuf) + return avbuf; + } + ++void ++ff_v4l2_dq_all(V4L2Context *const ctx) ++{ ++ V4L2Buffer * avbuf; ++ do { ++ get_qbuf(ctx, &avbuf, 0); ++ } while (avbuf); ++} + -+ /* Benchmark the no-escape-sequences case */ -+ bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1); + static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) + { + int i; + + /* get back as many output buffers as possible */ +- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- V4L2Buffer * avbuf; +- do { +- get_qbuf(ctx, &avbuf, 0); +- } while (avbuf); +- } ++ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) ++ ff_v4l2_dq_all(ctx); + + for (i = 0; i < ctx->num_buffers; i++) { + V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data; +diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h +index 21265f1bd7..523c53e97d 100644 +--- a/libavcodec/v4l2_context.h ++++ b/libavcodec/v4l2_context.h +@@ -218,4 +218,6 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const + */ + int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f); + ++void ff_v4l2_dq_all(V4L2Context *const ctx); ++ + #endif // AVCODEC_V4L2_CONTEXT_H + +From 6d3e970647486212d614e5d77a4140fbffa12544 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Fri, 23 Sep 2022 11:38:36 +0000 +Subject: [PATCH 077/113] v4l2_m2m_enc: DQ output more frequently + +Ensure that we DQ any released src buffers on every op to avoid deadlock +with source. + +There is a plausible argument that this patch is inelegant and the drain +should be integrated into dq_buf, but that is a further reaching delta. 
+--- + libavcodec/v4l2_m2m_enc.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c +index bc0c2d4245..d8ee7fd2f2 100644 +--- a/libavcodec/v4l2_m2m_enc.c ++++ b/libavcodec/v4l2_m2m_enc.c +@@ -421,6 +421,8 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) + V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; + V4L2Context *const output = &s->output; + ++ ff_v4l2_dq_all(output); ++ + // Signal EOF if needed + if (!frame) { + return ff_v4l2_context_enqueue_frame(output, frame); +@@ -492,6 +494,8 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) + AVFrame *frame = s->frame; + int ret; + ++ ff_v4l2_dq_all(output); ++ + if (s->draining) + goto dequeue; + +@@ -528,7 +532,9 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) + } + + dequeue: +- if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) ++ ret = ff_v4l2_context_dequeue_packet(capture, avpkt); ++ ff_v4l2_dq_all(output); ++ if (ret) + return ret; + + if (capture->first_buf == 1) { +@@ -560,7 +566,9 @@ dequeue: + s->extdata_size = len; + } + +- if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0) ++ ret = ff_v4l2_context_dequeue_packet(capture, avpkt); ++ ff_v4l2_dq_all(output); ++ if (ret) + return ret; + } + + +From 236301167a8103afaffbdf1ad43616c1af8383c4 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 26 Sep 2022 18:20:00 +0100 +Subject: [PATCH 078/113] conf_native: Remove --enable-rpi from all builds + +--- + pi-util/conf_native.sh | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh +index 37cea71756..f22d531ca4 100755 +--- a/pi-util/conf_native.sh ++++ b/pi-util/conf_native.sh +@@ -54,9 +54,9 @@ if [ $MMAL ]; then + RPI_LIBDIRS="-L$RPI_OPT_VC/lib" + RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000" + RPI_EXTRALIBS="-Wl,--start-group 
-lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group" +- RPIOPTS="--enable-mmal --enable-rpi" ++ RPIOPTS="--enable-mmal" + else +- RPIOPTS="--disable-mmal --enable-sand" ++ RPIOPTS="--disable-mmal" + fi + + C=`lsb_release -sc` +@@ -89,6 +89,7 @@ $FFSRC/configure \ + $MCOPTS\ + --disable-stripping\ + --disable-thumb\ ++ --enable-sand\ + --enable-v4l2-request\ + --enable-libdrm\ + --enable-vout-egl\ + +From 7c875c257b3d0ff913ced3ae2bc6b6a4c150b7cc Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 29 Sep 2022 19:48:08 +0000 +Subject: [PATCH 079/113] v4l2_m2m_dec: Deal correctly with avcC H264 data in + extradata + +Decoders expect AnnexB style headers, mkv and similar formats have +somewhat oddly wrapped extradata. Convert to annex-b style before use. +--- + libavcodec/v4l2_m2m.h | 2 +- + libavcodec/v4l2_m2m_dec.c | 177 ++++++++++++++++++++++++++++++++++++-- + 2 files changed, 169 insertions(+), 10 deletions(-) + +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index ee72beb052..babf101d65 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -118,7 +118,7 @@ typedef struct V4L2m2mContext { + /* Ext data sent */ + int extdata_sent; + /* Ext data sent in packet - overrides ctx */ +- uint8_t * extdata_data; ++ void * extdata_data; + size_t extdata_size; + + #define FF_V4L2_QUIRK_REINIT_ALWAYS 1 +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index e67e06313f..6ffd28e76d 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -46,6 +46,71 @@ + #define STATS_LAST_COUNT_MAX 64 + #define STATS_INTERVAL_MAX (1 << 30) + ++#ifndef FF_API_BUFFER_SIZE_T ++#define FF_API_BUFFER_SIZE_T 1 ++#endif ++ ++#define DUMP_FAILED_EXTRADATA 0 ++ ++#if DUMP_FAILED_EXTRADATA ++static inline char hex1(unsigned int x) ++{ ++ x &= 0xf; ++ return x <= 9 ? 
'0' + x : 'a' + x - 10; ++} ++ ++static inline char * hex2(char * s, unsigned int x) ++{ ++ *s++ = hex1(x >> 4); ++ *s++ = hex1(x); ++ return s; ++} ++ ++static inline char * hex4(char * s, unsigned int x) ++{ ++ s = hex2(s, x >> 8); ++ s = hex2(s, x); ++ return s; ++} ++ ++static inline char * dash2(char * s) ++{ ++ *s++ = '-'; ++ *s++ = '-'; ++ return s; ++} ++ ++static void ++data16(char * s, const unsigned int offset, const uint8_t * m, const size_t len) ++{ ++ size_t i; ++ s = hex4(s, offset); ++ m += offset; ++ for (i = 0; i != 8; ++i) { ++ *s++ = ' '; ++ s = len > i + offset ? hex2(s, *m++) : dash2(s); ++ } ++ *s++ = ' '; ++ *s++ = ':'; ++ for (; i != 16; ++i) { ++ *s++ = ' '; ++ s = len > i + offset ? hex2(s, *m++) : dash2(s); ++ } ++ *s++ = 0; ++} ++ ++static void ++log_dump(void * logctx, int lvl, const void * const data, const size_t len) ++{ ++ size_t i; ++ for (i = 0; i < len; i += 16) { ++ char buf[80]; ++ data16(buf, i, data, len); ++ av_log(logctx, lvl, "%s\n", buf); + } +} ++#endif + -+void checkasm_check_vc1dsp(void) + static int64_t pts_stats_guess(const pts_stats_t * const stats) + { + if (stats->last_pts == AV_NOPTS_VALUE || +@@ -98,6 +163,98 @@ static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char + }; + } + ++// If abdata == NULL then this just counts space required ++// Unpacks avcC if detected ++static int ++h264_xd_copy(const uint8_t * const extradata, const int extrasize, uint8_t * abdata) +{ -+ check_inv_trans_inplace(); -+ check_inv_trans_adding(); -+ report("inv_trans"); ++ const uint8_t * const xdend = extradata + extrasize; ++ const uint8_t * p = extradata; ++ uint8_t * d = abdata; ++ unsigned int n; ++ unsigned int len; ++ const unsigned int hdrlen = 4; ++ unsigned int need_pps = 1; + -+ check_loop_filter(); -+ report("loop_filter"); ++ if (extrasize < 8) ++ return AVERROR(EINVAL); + -+ check_unescape(); -+ report("unescape_buffer"); ++ if (p[0] == 0 && p[1] == 0) { ++ // Assume a couple of leading zeros 
are good enough to indicate NAL ++ if (abdata) ++ memcpy(d, p, extrasize); ++ return extrasize; ++ } ++ ++ // avcC starts with a 1 ++ if (p[0] != 1) ++ return AVERROR(EINVAL); ++ ++ p += 5; ++ n = *p++ & 0x1f; ++ ++doxps: ++ while (n--) { ++ if (xdend - p < 2) ++ return AVERROR(EINVAL); ++ len = (p[0] << 8) | p[1]; ++ p += 2; ++ if (xdend - p < (ptrdiff_t)len) ++ return AVERROR(EINVAL); ++ if (abdata) { ++ d[0] = 0; ++ d[1] = 0; ++ d[2] = 0; ++ d[3] = 1; ++ memcpy(d + 4, p, len); ++ } ++ d += len + hdrlen; ++ p += len; ++ } ++ if (need_pps) { ++ need_pps = 0; ++ if (p >= xdend) ++ return AVERROR(EINVAL); ++ n = *p++; ++ goto doxps; ++ } ++ ++ return d - abdata; +} -diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak -index 07f1d8238e..aa5f45ec8f 100644 ---- a/tests/fate/checkasm.mak -+++ b/tests/fate/checkasm.mak -@@ -16,6 +16,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp \ - fate-checkasm-hevc_add_res \ - fate-checkasm-hevc_idct \ - fate-checkasm-hevc_sao \ -+ fate-checkasm-idctdsp \ - fate-checkasm-jpeg2000dsp \ - fate-checkasm-llviddsp \ - fate-checkasm-llviddspenc \ -@@ -27,6 +28,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp \ - fate-checkasm-sw_scale \ - fate-checkasm-v210dec \ - fate-checkasm-v210enc \ -+ fate-checkasm-vc1dsp \ - fate-checkasm-vf_blend \ - fate-checkasm-vf_colorspace \ - fate-checkasm-vf_eq \ ++ ++static int ++copy_extradata(AVCodecContext * const avctx, ++ const void * const src_data, const int src_len, ++ void ** const pdst_data, size_t * const pdst_len) ++{ ++ int len; ++ ++ *pdst_len = 0; ++ av_freep(pdst_data); ++ ++ if (avctx->codec_id == AV_CODEC_ID_H264) ++ len = h264_xd_copy(src_data, src_len, NULL); ++ else ++ len = src_len < 0 ? 
AVERROR(EINVAL) : src_len; ++ ++ // Zero length is OK but we swant to stop - -ve is error val ++ if (len <= 0) ++ return len; ++ ++ if ((*pdst_data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL) ++ return AVERROR(ENOMEM); ++ ++ if (avctx->codec_id == AV_CODEC_ID_H264) ++ h264_xd_copy(src_data, src_len, *pdst_data); ++ else ++ memcpy(*pdst_data, src_data, len); ++ *pdst_len = len; ++ ++ return 0; ++} ++ ++ ++ + static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) + { + int ret; +@@ -277,13 +434,8 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const + side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); + if (side_data) { + av_log(avctx, AV_LOG_DEBUG, "New extradata\n"); +- av_freep(&s->extdata_data); +- if ((s->extdata_data = av_malloc(side_size ? side_size : 1)) == NULL) { +- av_log(avctx, AV_LOG_ERROR, "Failed to alloc %zd bytes of extra data\n", side_size); +- return AVERROR(ENOMEM); +- } +- memcpy(s->extdata_data, side_data, side_size); +- s->extdata_size = side_size; ++ if ((ret = copy_extradata(avctx, side_data, (int)side_size, &s->extdata_data, &s->extdata_size)) < 0) ++ av_log(avctx, AV_LOG_WARNING, "Failed to copy new extra data: %s\n", av_err2str(ret)); + s->extdata_sent = 0; + } + +@@ -359,8 +511,6 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); + else if (s->extdata_data) + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); +- else +- ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size); + + if (ret == AVERROR(EAGAIN)) { + // Out of input buffers - keep packet +@@ -770,6 +920,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + return ret; + } + ++ if (avctx->extradata && ++ (ret = copy_extradata(avctx, avctx->extradata, 
avctx->extradata_size, &s->extdata_data, &s->extdata_size)) != 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to copy extradata from context: %s\n", av_err2str(ret)); ++#if DUMP_FAILED_EXTRADATA ++ log_dump(avctx, AV_LOG_INFO, avctx->extradata, avctx->extradata_size); ++#endif ++ return ret; ++ } ++ + if ((ret = v4l2_prepare_decoder(s)) < 0) + return ret; + + +From 62f6924639cee07142ebc2567c693fb1304ac44c Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Fri, 30 Sep 2022 14:20:23 +0000 +Subject: [PATCH 080/113] v4l2_request_hevc: Fix up + V4L2_CID_CODEC_STATELESS_BASE if missing + +--- + libavcodec/hevc-ctrls-v4.h | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h +index 7829d82084..c02fdbe5a8 100644 +--- a/libavcodec/hevc-ctrls-v4.h ++++ b/libavcodec/hevc-ctrls-v4.h +@@ -53,6 +53,13 @@ + #include + #include + ++#ifndef V4L2_CTRL_CLASS_CODEC_STATELESS ++#define V4L2_CTRL_CLASS_CODEC_STATELESS 0x00a40000 /* Stateless codecs controls */ ++#endif ++#ifndef V4L2_CID_CODEC_STATELESS_BASE ++#define V4L2_CID_CODEC_STATELESS_BASE (V4L2_CTRL_CLASS_CODEC_STATELESS | 0x900) ++#endif ++ + #define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ + + #define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400) + +From 76ad115bb301a2b4bd27135557a4e0b4653e8d5f Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Sat, 1 Oct 2022 13:40:57 +0000 +Subject: [PATCH 081/113] vf_deinterlace_v4l2m2m: Fix compile on m/c without + V4L2 SAND + +--- + libavfilter/vf_deinterlace_v4l2m2m.c | 33 +++++++++++++++++++++++----- + 1 file changed, 28 insertions(+), 5 deletions(-) + +diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c +index c52dae1c44..716789f988 100644 +--- a/libavfilter/vf_deinterlace_v4l2m2m.c ++++ b/libavfilter/vf_deinterlace_v4l2m2m.c +@@ -35,6 +35,8 @@ + #include + #include + ++#include "config.h" ++ + #include "libavutil/avassert.h" + 
#include "libavutil/avstring.h" + #include "libavutil/common.h" +@@ -59,6 +61,16 @@ + #define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */ + #endif + ++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined ++// in drm_fourcc.h hopefully will be sometime in the future but until then... ++#ifndef V4L2_PIX_FMT_NV12_10_COL128 ++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') ++#endif ++ ++#ifndef V4L2_PIX_FMT_NV12_COL128 ++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ ++#endif ++ + typedef struct V4L2Queue V4L2Queue; + typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; + +@@ -176,9 +188,11 @@ fmt_av_to_v4l2(const enum AVPixelFormat avfmt) + return V4L2_PIX_FMT_YUV420; + case AV_PIX_FMT_NV12: + return V4L2_PIX_FMT_NV12; ++#if CONFIG_SAND + case AV_PIX_FMT_RPI4_8: + case AV_PIX_FMT_SAND128: + return V4L2_PIX_FMT_NV12_COL128; ++#endif + default: + break; + } +@@ -193,8 +207,10 @@ fmt_v4l2_to_av(const uint32_t pixfmt) + return AV_PIX_FMT_YUV420P; + case V4L2_PIX_FMT_NV12: + return AV_PIX_FMT_NV12; ++#if CONFIG_SAND + case V4L2_PIX_FMT_NV12_COL128: + return AV_PIX_FMT_RPI4_8; ++#endif + default: + break; + } +@@ -823,6 +839,7 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) + h = src->layers[0].planes[1].offset / bpl; + w = bpl; + } ++#if CONFIG_SAND + else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { + if (src->layers[0].nb_planes != 2) + break; +@@ -831,9 +848,11 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) + h = src->layers[0].planes[1].offset / 128; + bpl = fourcc_mod_broadcom_param(mod); + } ++#endif + break; + + case DRM_FORMAT_P030: ++#if CONFIG_SAND + if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) { + if (src->layers[0].nb_planes != 2) + break; +@@ -842,6 +861,7 @@ static int 
set_src_fmt(V4L2Queue * const q, const AVFrame * const frame) + h = src->layers[0].planes[1].offset / 128; + bpl = fourcc_mod_broadcom_param(mod); + } ++#endif + break; + + default: +@@ -1048,7 +1068,6 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) + AVDRMLayerDescriptor * const layer = &drm_desc->layers[0]; + const struct v4l2_format *const fmt = &q->format; + const uint32_t height = fmt_height(fmt); +- const uint32_t width = fmt_width(fmt); + ptrdiff_t bpl0; + + /* fill the DRM frame descriptor */ +@@ -1063,7 +1082,7 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) + bpl0 = layer->planes[0].pitch; + + switch (fmt_pixelformat(fmt)) { +- ++#if CONFIG_SAND + case V4L2_PIX_FMT_NV12_COL128: + mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0); + layer->format = V4L2_PIX_FMT_NV12; +@@ -1074,9 +1093,10 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf) + layer->nb_planes = 2; + layer->planes[1].object_index = 0; + layer->planes[1].offset = height * 128; +- layer->planes[0].pitch = width; +- layer->planes[1].pitch = width; ++ layer->planes[0].pitch = fmt_width(fmt); ++ layer->planes[1].pitch = layer->planes[0].pitch; + break; ++#endif + + case DRM_FORMAT_NV12: + layer->format = V4L2_PIX_FMT_NV12; +@@ -1576,7 +1596,10 @@ static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc) + return is_linear ? V4L2_PIX_FMT_YUV420 : 0; + case DRM_FORMAT_NV12: + return is_linear ? V4L2_PIX_FMT_NV12 : +- fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 : 0; ++#if CONFIG_SAND ++ fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? 
V4L2_PIX_FMT_NV12_COL128 : +#endif + 0; + default: + break; + } + +From 7539260084d21d63a5dfd989891cc34a8d2e42c8 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Sun, 2 Oct 2022 12:36:43 +0000 +Subject: [PATCH 082/113] configure: Fix v4l2_req_hevc_vx setup; set after deps + fixups + +--- + configure | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +diff --git a/configure b/configure +index f555f60dc8..c51b342cd2 100755 +--- a/configure ++++ b/configure +@@ -6855,12 +6855,6 @@ fi + check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns + check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" + disable v4l2_req_hevc_vx +-if enabled hevc_v4l2request_hwaccel; then +- enable v4l2_req_hevc_vx +-fi +-if enabled hevc_v4l2_request; then +- disable v4l2_req_hevc_vx +-fi + + check_headers sys/videoio.h + test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete +@@ -7350,6 +7344,9 @@ check_deps $CONFIG_LIST \ + + enabled threads && ! enabled pthreads && ! enabled atomics_native && die "non pthread threading without atomics not supported, try adding --enable-pthreads or --cpu=i486 or higher if you are on x86" + ++# Sub-feature of hevc_v4l2request_hwaccel - can only be set once deps are done ++enabled hevc_v4l2request_hwaccel && disabled hevc_v4l2_request && enable v4l2_req_hevc_vx ++ + case $target_os in + haiku) + disable memalign + +From d8ffedb056a1c5f5e6f6737d20fecc1aa0b3ed93 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Sat, 1 Oct 2022 12:39:45 +0000 +Subject: [PATCH 083/113] vf_deinterlace_v4l2m2m: Ensure we get consistent + final frames + +On getting EOS at the input of the filter do not simply drop everything +in transit on the floor but attempt to retrieve everything possible from +the capture Q before on-signalling EOS.
+If we know that we expect 1 frame in to always produce 1 frame out then +match CAPTURE frame to the last OUTPUT frame Qed (scale) +If frames out have an unknown relation to source frames (deinterlace) try +an encode stop and wait for the last frame marker to emerge from CAPTURE +--- + libavfilter/vf_deinterlace_v4l2m2m.c | 172 +++++++++++++++++++++++---- + 1 file changed, 148 insertions(+), 24 deletions(-) + +diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c +index 716789f988..ce875c2c61 100644 +--- a/libavfilter/vf_deinterlace_v4l2m2m.c ++++ b/libavfilter/vf_deinterlace_v4l2m2m.c +@@ -94,6 +94,7 @@ typedef struct V4L2Buffer { + typedef struct V4L2Queue { + struct v4l2_format format; + struct v4l2_selection sel; ++ int eos; + int num_buffers; + V4L2Buffer *buffers; + const char * name; +@@ -127,20 +128,41 @@ typedef struct pts_track_s + pts_track_el_t a[PTS_TRACK_SIZE]; + } pts_track_t; + ++typedef enum drain_state_e ++{ ++ DRAIN_NONE = 0, // Not draining ++ DRAIN_TIMEOUT, // Drain until normal timeout setup yields no frame ++ DRAIN_LAST, // Drain with long timeout last_frame in received on output expected ++ DRAIN_EOS, // Drain with long timeout EOS expected ++ DRAIN_DONE // Drained ++} drain_state_t; ++ + typedef struct DeintV4L2M2MContextShared { + void * logctx; // For logging - will be NULL when done + filter_type_v4l2_t filter_type; + + int fd; +- int done; ++ int done; // fd closed - awating all refs dropped + int width; + int height; + ++ int drain; // EOS received (inlink status) ++ drain_state_t drain_state; ++ int64_t drain_pts; // PTS associated with inline status ++ ++ unsigned int frames_rx; ++ unsigned int frames_tx; ++ + // from options + int output_width; + int output_height; + enum AVPixelFormat output_format; + ++ int has_enc_stop; ++ // We expect to get exactly the same number of frames out as we put in ++ // We can drain by matching input to output ++ int one_to_one; ++ + int orig_width; + int 
orig_height; + atomic_uint refcount; +@@ -179,6 +201,12 @@ typedef struct DeintV4L2M2MContext { + enum AVChromaLocation chroma_location; + } DeintV4L2M2MContext; + ++ ++static inline int drain_frame_expected(const drain_state_t d) ++{ ++ return d == DRAIN_EOS || d == DRAIN_LAST; ++} ++ + // These just list the ones we know we can cope with + static uint32_t + fmt_av_to_v4l2(const enum AVPixelFormat avfmt) +@@ -334,6 +362,13 @@ fail: + return 0; + } + ++// We are only ever expecting in-order frames so nothing more clever is required ++static unsigned int ++pts_track_count(const pts_track_t * const trk) ++{ ++ return (trk->n - trk->last_n) & (PTS_TRACK_SIZE - 1); ++} ++ + static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src) + { + const uint32_t n = pts_track_next_n(trk); +@@ -406,6 +441,12 @@ fmt_pixelformat(const struct v4l2_format * const fmt) + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; + } + ++static inline uint32_t ++buf_bytesused0(const struct v4l2_buffer * const buf) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(buf->type) ? buf->m.planes[0].bytesused : buf->bytesused; ++} ++ + static void + init_format(V4L2Queue * const q, const uint32_t format_type) + { +@@ -1469,12 +1510,24 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim + + av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); + ++ if (queue->eos) { ++ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: EOS\n", __func__); ++ return AVERROR_EOF; ++ } ++ + avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); + if (!avbuf) { + av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout); + return AVERROR(EAGAIN); + } + ++ if (V4L2_TYPE_IS_CAPTURE(avbuf->buffer.type)) { ++ if ((avbuf->buffer.flags & V4L2_BUF_FLAG_LAST) != 0) ++ queue->eos = 1; ++ if (buf_bytesused0(&avbuf->buffer) == 0) ++ return queue->eos ? 
AVERROR_EOF : AVERROR(EINVAL); ++ } ++ + // Fill in PTS and anciliary info from src frame + pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); + +@@ -1686,6 +1739,20 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) + else + ctx->field_order = V4L2_FIELD_INTERLACED_BT; + ++ { ++ struct v4l2_encoder_cmd ecmd = { ++ .cmd = V4L2_ENC_CMD_STOP ++ }; ++ ctx->has_enc_stop = 0; ++ if (ioctl(ctx->fd, VIDIOC_TRY_ENCODER_CMD, &ecmd) == 0) { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop succeeded\n"); ++ ctx->has_enc_stop = 1; ++ } ++ else { ++ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop fail: %s\n", av_err2str(AVERROR(errno))); ++ } ++ ++ } + } + + ret = deint_v4l2m2m_enqueue_frame(output, in); +@@ -1694,6 +1761,41 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) + return ret; + } + ++static int ++ack_inlink(AVFilterContext * const avctx, DeintV4L2M2MContextShared *const s, ++ AVFilterLink * const inlink) ++{ ++ int instatus; ++ int64_t inpts; ++ ++ if (ff_inlink_acknowledge_status(inlink, &instatus, &inpts) <= 0) ++ return 0; ++ ++ s->drain = instatus; ++ s->drain_pts = inpts; ++ s->drain_state = DRAIN_TIMEOUT; ++ ++ if (s->field_order == V4L2_FIELD_ANY) { // Not yet started ++ s->drain_state = DRAIN_DONE; ++ } ++ else if (s->one_to_one) { ++ s->drain_state = DRAIN_LAST; ++ } ++ else if (s->has_enc_stop) { ++ struct v4l2_encoder_cmd ecmd = { ++ .cmd = V4L2_ENC_CMD_STOP ++ }; ++ if (ioctl(s->fd, VIDIOC_ENCODER_CMD, &ecmd) == 0) { ++ av_log(avctx->priv, AV_LOG_DEBUG, "Do Encode stop\n"); ++ s->drain_state = DRAIN_EOS; ++ } ++ else { ++ av_log(avctx->priv, AV_LOG_WARNING, "Encode stop fail: %s\n", av_err2str(AVERROR(errno))); ++ } ++ } ++ return 1; ++} ++ + static int deint_v4l2m2m_activate(AVFilterContext *avctx) + { + DeintV4L2M2MContext * const priv = avctx->priv; +@@ -1702,15 +1804,13 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx) + AVFilterLink * const inlink = avctx->inputs[0]; 
+ int n = 0; + int cn = 99; +- int instatus = 0; +- int64_t inpts = 0; + int did_something = 0; + + av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__); + + FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx); + +- ff_inlink_acknowledge_status(inlink, &instatus, &inpts); ++ ack_inlink(avctx, s, inlink); + + if (!ff_outlink_frame_wanted(outlink)) { + av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__); +@@ -1720,7 +1820,6 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx) + AVFrame * frame = av_frame_alloc(); + int rv; + +-again: + recycle_q(&s->output); + n = count_enqueued(&s->output); + +@@ -1729,10 +1828,21 @@ again: + return AVERROR(ENOMEM); + } + +- rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 300 : 0); ++ rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, ++ drain_frame_expected(s->drain_state) || n > 4 ? 300 : 0); + if (rv != 0) { + av_frame_free(&frame); +- if (rv != AVERROR(EAGAIN)) { ++ if (rv == AVERROR_EOF) { ++ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ EOF\n", __func__); ++ s->drain_state = DRAIN_DONE; ++ } ++ else if (rv == AVERROR(EAGAIN)) { ++ if (s->drain_state != DRAIN_NONE) { ++ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ empty - drain done\n", __func__); ++ s->drain_state = DRAIN_DONE; ++ } ++ } ++ else { + av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv)); + return rv; + } +@@ -1742,29 +1852,30 @@ again: + // frame is always consumed by filter_frame - even on error despite + // a somewhat confusing comment in the header + rv = ff_filter_frame(outlink, frame); +- +- if (instatus != 0) { +- av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__); +- goto again; +- } ++ ++s->frames_tx; + + av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv)); + did_something = 1; ++ ++ if (s->drain_state != DRAIN_NONE && pts_track_count(&s->track) == 0) { ++ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ last - drain done\n", __func__); ++ s->drain_state = DRAIN_DONE; ++ } + } + + cn = 
count_enqueued(&s->capture); + } + +- if (instatus != 0) { +- ff_outlink_set_status(outlink, instatus, inpts); +- av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus)); ++ if (s->drain_state == DRAIN_DONE) { ++ ff_outlink_set_status(outlink, s->drain, s->drain_pts); ++ av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(s->drain)); + return 0; + } + + recycle_q(&s->output); + n = count_enqueued(&s->output); + +- while (n < 6) { ++ while (n < 6 && !s->drain) { + AVFrame * frame; + int rv; + +@@ -1775,8 +1886,13 @@ again: + + if (frame == NULL) { + av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); ++ if (!ack_inlink(avctx, s, inlink)) { ++ ff_inlink_request_frame(inlink); ++ av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); ++ } + break; + } ++ ++s->frames_rx; + + rv = deint_v4l2m2m_filter_frame(inlink, frame); + av_frame_free(&frame); +@@ -1785,16 +1901,11 @@ again: + return rv; + + av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); +- ++n; +- } +- +- if (n < 6) { +- ff_inlink_request_frame(inlink); + did_something = 1; +- av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); ++ ++n; + } + +- if (n > 4 && ff_outlink_frame_wanted(outlink)) { ++ if ((n > 4 || s->drain) && ff_outlink_frame_wanted(outlink)) { + ff_filter_set_ready(avctx, 1); + did_something = 1; + av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__); +@@ -1873,7 +1984,18 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) + + static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx) + { +- return common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE); ++ int rv; ++ DeintV4L2M2MContext * priv; ++ DeintV4L2M2MContextShared * ctx; ++ ++ if ((rv = common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE)) != 0) ++ return rv; ++ ++ priv = avctx->priv; ++ ctx = priv->shared; ++ ++ ctx->one_to_one = 1; ++ return 0; + } + + static void deint_v4l2m2m_uninit(AVFilterContext *avctx) +@@ -1881,6 +2003,8 @@ static void 
deint_v4l2m2m_uninit(AVFilterContext *avctx) + DeintV4L2M2MContext *priv = avctx->priv; + DeintV4L2M2MContextShared *ctx = priv->shared; + ++ av_log(priv, AV_LOG_VERBOSE, "Frames Rx: %u, Frames Tx: %u\n", ++ ctx->frames_rx, ctx->frames_tx); + ctx->done = 1; + ctx->logctx = NULL; // Log to NULL works, log to missing crashes + pts_track_uninit(&ctx->track); + +From 6a37ed2a2d181e63e6a73c13fe20302818c21427 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 5 Oct 2022 16:12:02 +0000 +Subject: [PATCH 084/113] v4l2_m2m_dec: Rework decode pending heuristic + +The old code measured the length of the entire Q in the decoder and +attempted to dynamically guess an appropriate length. This was prone to +failure when the guesswork became confused. +The new code attempts to measure the Q length before insertion into decode +which, after all, is what we actually care about. It does this by +asserting that the decoder must have consumed all packets that came +before the one associated with the most recent CAPTURE frame. This +avoids all need for reorder buffer size guesswork. +--- + libavcodec/v4l2_m2m.h | 2 - + libavcodec/v4l2_m2m_dec.c | 77 +++++++++++++++++---------------------- + 2 files changed, 34 insertions(+), 45 deletions(-) + +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index babf101d65..26a7161042 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -107,8 +107,6 @@ typedef struct V4L2m2mContext { + + /* Frame tracking */ + xlat_track_t xlat; +- int pending_hw; +- int pending_n; + + pts_stats_t pts_stat; + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 6ffd28e76d..de2d39de9a 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -349,41 +349,54 @@ static void + xlat_flush(xlat_track_t * const x) + { + unsigned int i; ++ // Do not reset track_no - this ensures that any frames left in the decoder ++ // that turn up later get discarded. 
++ ++ x->last_pts = AV_NOPTS_VALUE; ++ x->last_opaque = 0; + for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) { + x->track_els[i].pending = 0; + x->track_els[i].discard = 1; + } +- x->last_pts = AV_NOPTS_VALUE; ++} ++ ++static void ++xlat_init(xlat_track_t * const x) ++{ ++ memset(x, 0, sizeof(*x)); ++ xlat_flush(x); + } + + static int + xlat_pending(const xlat_track_t * const x) + { + unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; +- unsigned int i; +- int r = 0; +- int64_t now = AV_NOPTS_VALUE; ++ int i; ++ const int64_t now = x->last_pts; + +- for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) { ++ for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) { + const V4L2m2mTrackEl * const t = x->track_els + n; + ++ // Discard only set on never-set or flushed entries ++ // So if we get here we've never successfully decoded a frame so allow ++ // more frames into the buffer before stalling ++ if (t->discard) ++ return i - 16; ++ ++ // If we've got this frame out then everything before this point ++ // must have entered the decoder + if (!t->pending) +- continue; ++ break; + ++ // If we've never seen a pts all we can do is count frames + if (now == AV_NOPTS_VALUE) +- now = t->dts; ++ continue; + +- if (t->pts == AV_NOPTS_VALUE || +- ((now == AV_NOPTS_VALUE || t->pts <= now) && +- (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts))) +- ++r; ++ if (t->dts != AV_NOPTS_VALUE && now >= t->dts) ++ break; + } + +- // If we never get any ideas about PTS vs DTS allow a lot more buffer +- if (now == AV_NOPTS_VALUE) +- r -= 16; +- +- return r; ++ return i; + } + + static inline int stream_started(const V4L2m2mContext * const s) { +@@ -557,18 +570,6 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) + return rv; + } + +-// Number of frames over what xlat_pending returns that we keep *16 +-// This is a min value - if it appears to be too small the threshold should +-// adjust dynamically. 
+-#define PENDING_HW_MIN (3 * 16) +-// Offset to use when setting dynamically +-// Set to %16 == 15 to avoid the threshold changing immediately as we relax +-#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1) +-// Number of consecutive times we've failed to get a frame when we prefer it +-// before we increase the prefer threshold (5ms * N = max expected decode +-// time) +-#define PENDING_N_THRESHOLD 6 +- + static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + { + V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; +@@ -578,9 +579,11 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + + do { + const int pending = xlat_pending(&s->xlat); +- const int prefer_dq = (pending > s->pending_hw / 16); ++ const int prefer_dq = (pending > 3); + const int last_src_rv = src_rv; + ++ av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt); ++ + // Enqueue another pkt for decode if + // (a) We don't have a lot of stuff in the buffer already OR + // (b) ... 
we (think we) do but we've failed to get a frame already OR +@@ -625,20 +628,8 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + } + } + +- // Adjust dynamic pending threshold +- if (dst_rv == 0) { +- if (--s->pending_hw < PENDING_HW_MIN) +- s->pending_hw = PENDING_HW_MIN; +- s->pending_n = 0; +- ++ if (dst_rv == 0) + set_best_effort_pts(avctx, &s->pts_stat, frame); +- } +- else if (dst_rv == AVERROR(EAGAIN)) { +- if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { +- s->pending_hw = pending * 16 + PENDING_HW_OFFSET; +- s->pending_n = 0; +- } +- } + + if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { + av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); +@@ -857,8 +848,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + if (ret < 0) + return ret; + ++ xlat_init(&s->xlat); + pts_stats_init(&s->pts_stat, avctx, "decoder"); +- s->pending_hw = PENDING_HW_MIN; + + capture = &s->capture; + output = &s->output; + +From 105c1cbdf43c288573620276ae8a90c2177fb428 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Fri, 21 Oct 2022 13:48:07 +0000 +Subject: [PATCH 085/113] pthread_frame: Fix MT hwaccel. Recent change broke + it. + +Revert the effects of 35aa7e70e7ec350319e7634a30d8d8aa1e6ecdda if the +hwaccel is marked MT_SAFE. 
+--- + libavcodec/pthread_frame.c | 48 ++++++++++++++++++++++++++++---------- + 1 file changed, 36 insertions(+), 12 deletions(-) + +diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c +index d98b885b0e..1169ca945c 100644 +--- a/libavcodec/pthread_frame.c ++++ b/libavcodec/pthread_frame.c +@@ -244,7 +244,7 @@ FF_ENABLE_DEPRECATION_WARNINGS + p->hwaccel_serializing = 0; + pthread_mutex_unlock(&p->parent->hwaccel_mutex); + } +- av_assert0(!avctx->hwaccel); ++ av_assert0(!avctx->hwaccel || (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)); + + if (p->async_serializing) { + p->async_serializing = 0; +@@ -332,6 +332,12 @@ FF_ENABLE_DEPRECATION_WARNINGS + } + + dst->hwaccel_flags = src->hwaccel_flags; ++ if (src->hwaccel && ++ (src->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { ++ dst->hwaccel = src->hwaccel; ++ dst->hwaccel_context = src->hwaccel_context; ++ dst->internal->hwaccel_priv_data = src->internal->hwaccel_priv_data; ++ } + + err = av_buffer_replace(&dst->internal->pool, src->internal->pool); + if (err < 0) +@@ -462,10 +468,13 @@ static int submit_packet(PerThreadContext *p, AVCodecContext *user_avctx, + } + + /* transfer the stashed hwaccel state, if any */ +- av_assert0(!p->avctx->hwaccel); +- FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel); +- FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context); +- FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); ++ av_assert0(!p->avctx->hwaccel || (p->avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)); ++ if (p->avctx->hwaccel && ++ !(p->avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { ++ FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel); ++ FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context); ++ FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); ++ } + + av_packet_unref(p->avpkt); + ret = av_packet_ref(p->avpkt, avpkt); +@@ -676,9 +685,12 @@ void 
ff_thread_finish_setup(AVCodecContext *avctx) { + * this is done here so that this worker thread can wipe its own hwaccel + * state after decoding, without requiring synchronization */ + av_assert0(!p->parent->stash_hwaccel); +- p->parent->stash_hwaccel = avctx->hwaccel; +- p->parent->stash_hwaccel_context = avctx->hwaccel_context; +- p->parent->stash_hwaccel_priv = avctx->internal->hwaccel_priv_data; ++ if (avctx->hwaccel && ++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { ++ p->parent->stash_hwaccel = avctx->hwaccel; ++ p->parent->stash_hwaccel_context = avctx->hwaccel_context; ++ p->parent->stash_hwaccel_priv = avctx->internal->hwaccel_priv_data; ++ } + + pthread_mutex_lock(&p->progress_mutex); + if(atomic_load(&p->state) == STATE_SETUP_FINISHED){ +@@ -733,6 +745,15 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count) + + park_frame_worker_threads(fctx, thread_count); + ++ if (fctx->prev_thread && ++ avctx->hwaccel && (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && ++ avctx->internal->hwaccel_priv_data != ++ fctx->prev_thread->avctx->internal->hwaccel_priv_data) { ++ if (update_context_from_thread(avctx, fctx->prev_thread->avctx, 1) < 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to update user thread.\n"); ++ } ++ } ++ + for (i = 0; i < thread_count; i++) { + PerThreadContext *p = &fctx->threads[i]; + AVCodecContext *ctx = p->avctx; +@@ -781,10 +802,13 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count) + + /* if we have stashed hwaccel state, move it to the user-facing context, + * so it will be freed in avcodec_close() */ +- av_assert0(!avctx->hwaccel); +- FFSWAP(const AVHWAccel*, avctx->hwaccel, fctx->stash_hwaccel); +- FFSWAP(void*, avctx->hwaccel_context, fctx->stash_hwaccel_context); +- FFSWAP(void*, avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); ++ av_assert0(!avctx->hwaccel || (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)); ++ if (avctx->hwaccel && ++ 
!(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { ++ FFSWAP(const AVHWAccel*, avctx->hwaccel, fctx->stash_hwaccel); ++ FFSWAP(void*, avctx->hwaccel_context, fctx->stash_hwaccel_context); ++ FFSWAP(void*, avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); ++ } + + av_freep(&avctx->internal->thread_ctx); + } + +From e00ca15321f0cbe2a63475e3c611ee15fb3ee4f3 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 18 Oct 2022 13:18:27 +0000 +Subject: [PATCH 086/113] v4l2_req: Add swfmt to init logging + +(cherry picked from commit dfa03b702baaf2952bcd2bbf8badcc2f9c961ddf) +--- + libavcodec/v4l2_request_hevc.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +index 614a1b4d99..767ecb036a 100644 +--- a/libavcodec/v4l2_request_hevc.c ++++ b/libavcodec/v4l2_request_hevc.c +@@ -26,6 +26,7 @@ + #include "v4l2_request_hevc.h" + + #include "libavutil/hwcontext_drm.h" ++#include "libavutil/pixdesc.h" + + #include "v4l2_req_devscan.h" + #include "v4l2_req_dmabufs.h" +@@ -306,10 +307,11 @@ retry_src_memtype: + // Set our s/w format + avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format; + +- av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s\n", ++ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s; swfmt=%s\n", + ctx->fns->name, + decdev_media_path(decdev), decdev_video_path(decdev), +- mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype)); ++ mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype), ++ av_get_pix_fmt_name(avctx->sw_pix_fmt)); + + return 0; + + +From e5ff4d9a8c7fffc587444bfbb9bd8578b1cb5d1a Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 18 Oct 2022 13:39:54 +0000 +Subject: [PATCH 087/113] v4l2_m2m: Avoid polling on a queue that is streamoff + +(cherry picked from commit b2658bc56d3034a17db7f39597fc7d71bfe9a43b) +--- + 
libavcodec/v4l2_context.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index eaaec44666..3e527f666d 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -578,6 +578,11 @@ get_event(V4L2m2mContext * const m) + return 0; + } + ++static inline int ++dq_ok(const V4L2Context * const c) ++{ ++ return c->streamon && atomic_load(&c->q_count) != 0; ++} + + // Get a buffer + // If output then just gets the buffer in the expected way +@@ -613,13 +618,13 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout + } + + // If capture && timeout == -1 then also wait for rx buffer free +- if (is_cap && timeout == -1 && m->output.streamon && !m->draining) ++ if (is_cap && timeout == -1 && dq_ok(&m->output) && !m->draining) + pfd.events |= poll_out; + + // If nothing Qed all we will get is POLLERR - avoid that +- if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) || +- (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || +- (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { ++ if ((pfd.events == poll_out && !dq_ok(&m->output)) || ++ (pfd.events == poll_cap && !dq_ok(&m->capture)) || ++ (pfd.events == (poll_cap | poll_out) && !dq_ok(&m->capture) && !dq_ok(&m->output))) { + av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); + return AVERROR(ENOSPC); + } + +From e75dba3f588461affa4a54a856418107efa036b0 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 18 Oct 2022 14:07:04 +0000 +Subject: [PATCH 088/113] v4l2_m2m: Add function to get number of queued + buffers + +(cherry picked from commit f9ac6485c00b4531dcff354222aef450b29728f4) +--- + libavcodec/v4l2_context.h | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h +index 523c53e97d..8e4f681643 100644 +--- 
a/libavcodec/v4l2_context.h ++++ b/libavcodec/v4l2_context.h +@@ -220,4 +220,15 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f); + + void ff_v4l2_dq_all(V4L2Context *const ctx); + ++/** ++ * Returns the number of buffers currently queued ++ * ++ * @param[in] ctx The V4L2Context to evaluate ++ */ ++static inline int ++ff_v4l2_context_q_count(const V4L2Context* const ctx) ++{ ++ return atomic_load(&ctx->q_count); ++} ++ + #endif // AVCODEC_V4L2_CONTEXT_H + +From 3d886c89bfbcb13e230181bb611c1c2d9e235543 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 18 Oct 2022 14:48:20 +0000 +Subject: [PATCH 089/113] v4l2_m2m: Add timeouts to dq_all and dequeue_packet + +Add timeouts and use them to have better flow control in encode + +(cherry picked from commit c6173cad7f21697e12887982bda796de9719bb32) +--- + libavcodec/v4l2_context.c | 16 +++++++++++----- + libavcodec/v4l2_context.h | 15 +++++++++++++-- + libavcodec/v4l2_m2m_enc.c | 28 +++++++++++++++++++--------- + 3 files changed, 43 insertions(+), 16 deletions(-) + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 3e527f666d..042c6a976c 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -712,13 +712,19 @@ clean_v4l2_buffer(V4L2Buffer * const avbuf) + return avbuf; + } + +-void +-ff_v4l2_dq_all(V4L2Context *const ctx) ++int ++ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1) + { + V4L2Buffer * avbuf; ++ if (timeout1 != 0) { ++ int rv = get_qbuf(ctx, &avbuf, timeout1); ++ if (rv != 0) ++ return rv; ++ } + do { + get_qbuf(ctx, &avbuf, 0); + } while (avbuf); ++ return 0; + } + + static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) +@@ -727,7 +733,7 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) + + /* get back as many output buffers as possible */ + if (V4L2_TYPE_IS_OUTPUT(ctx->type)) +- ff_v4l2_dq_all(ctx); ++ ff_v4l2_dq_all(ctx, 0); + + for (i = 0; i < ctx->num_buffers; i++) { + V4L2Buffer * const avbuf = (V4L2Buffer 
*)ctx->bufrefs[i]->data; +@@ -1047,7 +1053,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) + return 0; + } + +-int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) ++int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout) + { + V4L2m2mContext *s = ctx_to_m2mctx(ctx); + AVCodecContext *const avctx = s->avctx; +@@ -1055,7 +1061,7 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) + int rv; + + do { +- if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) ++ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) + return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC + if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0) + return rv; +diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h +index 8e4f681643..5afed3e6ec 100644 +--- a/libavcodec/v4l2_context.h ++++ b/libavcodec/v4l2_context.h +@@ -179,7 +179,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd); + * @param[inout] pkt The AVPacket to dequeue to. + * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. + */ +-int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); ++int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout); + + /** + * Dequeues a buffer from a V4L2Context to an AVFrame. +@@ -218,7 +218,18 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const + */ + int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f); + +-void ff_v4l2_dq_all(V4L2Context *const ctx); ++/** ++ * Dequeue all buffers on this queue ++ * ++ * Used to recycle output buffers ++ * ++ * @param[in] ctx The V4L2Context to dequeue from. ++ * @param[in] timeout1 A timeout on dequeuing the 1st buffer, ++ * all others have a timeout of zero ++ * @return AVERROR(EAGAIN) if timeout1 non-zero then the return ++ * of the first dequeue operation, 0 otherwise. 
++ */ ++int ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1); + + /** + * Returns the number of buffers currently queued +diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c +index d8ee7fd2f2..40bbe499f0 100644 +--- a/libavcodec/v4l2_m2m_enc.c ++++ b/libavcodec/v4l2_m2m_enc.c +@@ -420,16 +420,24 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) + { + V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; + V4L2Context *const output = &s->output; ++ int rv; ++ int needs_slot = ff_v4l2_context_q_count(output) == output->num_buffers; + +- ff_v4l2_dq_all(output); ++ av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot); + +- // Signal EOF if needed ++ // Signal EOF if needed (doesn't need q slot) + if (!frame) { + return ff_v4l2_context_enqueue_frame(output, frame); + } + ++ if ((rv = ff_v4l2_dq_all(output, needs_slot? 500 : 0)) != 0) { ++ // We should be able to return AVERROR(EAGAIN) to indicate buffer ++ // exhaustion, but ffmpeg currently treats that as fatal. ++ av_log(avctx, AV_LOG_WARNING, "Failed to get buffer for src frame: %s\n", av_err2str(rv)); ++ return rv; ++ } ++ + if (s->input_drm && !output->streamon) { +- int rv; + struct v4l2_format req_format = {.type = output->format.type}; + + // Set format when we first get a buffer +@@ -494,7 +502,9 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) + AVFrame *frame = s->frame; + int ret; + +- ff_v4l2_dq_all(output); ++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ++ ff_v4l2_dq_all(output, 0); + + if (s->draining) + goto dequeue; +@@ -532,10 +542,10 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) + } + + dequeue: +- ret = ff_v4l2_context_dequeue_packet(capture, avpkt); +- ff_v4l2_dq_all(output); ++ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, s->draining ? 300 : 0); ++ ff_v4l2_dq_all(output, 0); + if (ret) +- return ret; ++ return (s->draining && ret == AVERROR(EAGAIN)) ? 
AVERROR_EOF : ret; + + if (capture->first_buf == 1) { + uint8_t * data; +@@ -566,8 +576,8 @@ dequeue: + s->extdata_size = len; + } + +- ret = ff_v4l2_context_dequeue_packet(capture, avpkt); +- ff_v4l2_dq_all(output); ++ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, 0); ++ ff_v4l2_dq_all(output, 0); + if (ret) + return ret; + } + +From 41810850e382858c2c74453c472c3c4278e69286 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 18 Oct 2022 14:23:32 +0000 +Subject: [PATCH 090/113] v4l2_m2m_enc: Improve debug trace + +(cherry picked from commit 113e89daffb329a0cd3d920abd483a4025664bf5) +--- + libavcodec/v4l2_m2m_enc.c | 13 ++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c +index 40bbe499f0..2c81031336 100644 +--- a/libavcodec/v4l2_m2m_enc.c ++++ b/libavcodec/v4l2_m2m_enc.c +@@ -427,6 +427,7 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) + + // Signal EOF if needed (doesn't need q slot) + if (!frame) { ++ av_log(avctx, AV_LOG_TRACE, "--- %s: EOS\n", __func__); + return ff_v4l2_context_enqueue_frame(output, frame); + } + +@@ -491,7 +492,12 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) + v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1); + #endif + +- return ff_v4l2_context_enqueue_frame(output, frame); ++ rv = ff_v4l2_context_enqueue_frame(output, frame); ++ if (rv) { ++ av_log(avctx, AV_LOG_ERROR, "Enqueue frame failed: %s\n", av_err2str(rv)); ++ } ++ ++ return rv; + } + + static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) +@@ -502,7 +508,8 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) + AVFrame *frame = s->frame; + int ret; + +- av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ av_log(avctx, AV_LOG_TRACE, "<<< %s: qlen out %d cap %d\n", __func__, ++ ff_v4l2_context_q_count(output), ff_v4l2_context_q_count(capture)); + + ff_v4l2_dq_all(output, 0); + 
+@@ -615,11 +622,11 @@ dequeue: + avpkt->size = newlen; + } + +-// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret); + capture->first_buf = 0; + return 0; + + fail_no_mem: ++ av_log(avctx, AV_LOG_ERROR, "Rx pkt failed: No memory\n"); + ret = AVERROR(ENOMEM); + av_packet_unref(avpkt); + return ret; + +From e0b4608b940bb16b8326fa6680c22bf5513cceac Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 18 Oct 2022 13:22:36 +0000 +Subject: [PATCH 091/113] v4l2_m2m_enc: Copy dest packets to memory if short of + v4l2 buffers + +(cherry picked from commit aa4ebbda400b42db952fc713b26927fc8636b0e5) +--- + libavcodec/v4l2_m2m_enc.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c +index 2c81031336..3773297543 100644 +--- a/libavcodec/v4l2_m2m_enc.c ++++ b/libavcodec/v4l2_m2m_enc.c +@@ -621,6 +621,22 @@ dequeue: + avpkt->data = buf->data; + avpkt->size = newlen; + } ++ else if (ff_v4l2_context_q_count(capture) < 2) { ++ // Avoid running out of capture buffers ++ // In most cases the buffers will be returned quickly in which case ++ // we don't copy and can use the v4l2 buffers directly but sometimes ++ // ffmpeg seems to hold onto all of them for a long time (.mkv ++ // creation?) so avoid deadlock in those cases. 
++ AVBufferRef * const buf = av_buffer_alloc(avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE); ++ if (buf == NULL) ++ goto fail_no_mem; ++ ++ memcpy(buf->data, avpkt->data, avpkt->size); ++ av_buffer_unref(&avpkt->buf); // Will recycle the V4L2 buffer ++ ++ avpkt->buf = buf; ++ avpkt->data = buf->data; ++ } + + capture->first_buf = 0; + return 0; + +From 4b1a4ef2e33c2d4c463bf052843715ee35604625 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 19 Oct 2022 11:00:16 +0000 +Subject: [PATCH 092/113] v4l2_m2m_dec: Fix pts_best_effort guessing for + initial pts + +(cherry picked from commit 1af32e5c87586a0f7e76cdf19a012ddbcf3eac67) +--- + libavcodec/v4l2_m2m_dec.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index de2d39de9a..73ce427052 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -113,6 +113,8 @@ log_dump(void * logctx, int lvl, const void * const data, const size_t len) + + static int64_t pts_stats_guess(const pts_stats_t * const stats) + { ++ if (stats->last_count <= 1) ++ return stats->last_pts; + if (stats->last_pts == AV_NOPTS_VALUE || + stats->last_interval == 0 || + stats->last_count >= STATS_LAST_COUNT_MAX) + +From 519f5e1936caab8436e105d1732e61f8eb538f45 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 19 Oct 2022 14:47:04 +0000 +Subject: [PATCH 093/113] v4l2_m2m_enc: Wait for frame or space in src Q in + rx_pkt + +If receive_packet we should ensure that there is space in the source Q +if we return EAGAIN so wait for either an output packet or space if +the source Q is currently full. 
+ +(cherry picked from commit 82f0c55782a67a8cc665d937647706c2a75f5548) +--- + libavcodec/v4l2_m2m_enc.c | 22 +++++++++++++++++++--- + 1 file changed, 19 insertions(+), 3 deletions(-) + +diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c +index 3773297543..684acf9dc7 100644 +--- a/libavcodec/v4l2_m2m_enc.c ++++ b/libavcodec/v4l2_m2m_enc.c +@@ -415,13 +415,17 @@ static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * + return 1; + } + ++static inline int q_full(const V4L2Context *const output) ++{ ++ return ff_v4l2_context_q_count(output) == output->num_buffers; ++} + + static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame) + { + V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; + V4L2Context *const output = &s->output; + int rv; +- int needs_slot = ff_v4l2_context_q_count(output) == output->num_buffers; ++ const int needs_slot = q_full(output); + + av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot); + +@@ -549,8 +553,20 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) + } + + dequeue: +- ret = ff_v4l2_context_dequeue_packet(capture, avpkt, s->draining ? 300 : 0); +- ff_v4l2_dq_all(output, 0); ++ // Dequeue a frame ++ for (;;) { ++ int t = q_full(output) ? -1 : s->draining ? 300 : 0; ++ int rv2; ++ ++ // If output is full wait for either a packet or output to become not full ++ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, t); ++ ++ // If output was full retry packet dequeue ++ t = (ret != AVERROR(EAGAIN) || t != -1) ? 0 : 300; ++ rv2 = ff_v4l2_dq_all(output, t); ++ if (t == 0 || rv2 != 0) ++ break; ++ } + if (ret) + return (s->draining && ret == AVERROR(EAGAIN)) ? 
AVERROR_EOF : ret; + + +From 6039d4edf2ee1b2e037666b896d924347583dd75 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 19 Oct 2022 14:54:29 +0000 +Subject: [PATCH 094/113] vf_deinterlace_v4l2m2m: Print dts rather that NOPTS + in trace + +(cherry picked from commit e9b468f35f0c6ad9bfe96f5a05e449afa8ae074a) +--- + libavfilter/vf_deinterlace_v4l2m2m.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c +index ce875c2c61..7c6751b69c 100644 +--- a/libavfilter/vf_deinterlace_v4l2m2m.c ++++ b/libavfilter/vf_deinterlace_v4l2m2m.c +@@ -1668,8 +1668,8 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) + V4L2Queue *output = &ctx->output; + int ret; + +- av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n", +- __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); ++ av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" dts: %"PRId64" field :%d interlaced: %d aspect:%d/%d\n", ++ __func__, in->pts, in->pkt_dts, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); + av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__, + avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); + + +From ff3a6c6b3085192be00feb501e0a692dda941f07 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 19 Oct 2022 14:55:21 +0000 +Subject: [PATCH 095/113] vf_deinterlace_v4l2m2m: Ignore "wanted" when + processing input + +If we gate send a frame to the outlink on its frame_wanted flag then we +will sometimes stall as the flag may not get set by ffmpeg's filter +processing. So stuff the output whether or not it wants it which works +much better. 
+ +(cherry picked from commit 808254cc04e5e6574cbab9af254b6c2f3d4142e3) +--- + libavfilter/vf_deinterlace_v4l2m2m.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c +index 7c6751b69c..a173a291f8 100644 +--- a/libavfilter/vf_deinterlace_v4l2m2m.c ++++ b/libavfilter/vf_deinterlace_v4l2m2m.c +@@ -1812,10 +1812,7 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx) + + ack_inlink(avctx, s, inlink); + +- if (!ff_outlink_frame_wanted(outlink)) { +- av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__); +- } +- else if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! ++ if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! + { + AVFrame * frame = av_frame_alloc(); + int rv; + +From 7ba15c81a923b2e63974e36034492597559cb07b Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 19 Oct 2022 15:00:43 +0000 +Subject: [PATCH 096/113] conf_native: Add --enable-gpl + +(cherry picked from commit bab9bf4a2e39391940d88af2ce5d70236ac21f15) +--- + pi-util/conf_native.sh | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh +index f22d531ca4..082d9b5832 100755 +--- a/pi-util/conf_native.sh ++++ b/pi-util/conf_native.sh +@@ -94,6 +94,7 @@ $FFSRC/configure \ + --enable-libdrm\ + --enable-vout-egl\ + --enable-vout-drm\ ++ --enable-gpl\ + $SHARED_LIBS\ + $RPIOPTS\ + --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ + +From 577cd088426ecffbce2ab88c40869f690a57ead0 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 15 Nov 2022 13:33:00 +0000 +Subject: [PATCH 097/113] egl_vout: Make formatting consistent - no code + changes + +--- + libavdevice/egl_vout.c | 741 ++++++++++++++++++++--------------------- + 1 file changed, 369 insertions(+), 372 deletions(-) + +diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c +index 0195c9d026..8fd3c58bac 100644 +--- a/libavdevice/egl_vout.c ++++ 
b/libavdevice/egl_vout.c +@@ -48,20 +48,20 @@ + #define TRACE_ALL 0 + + struct egl_setup { +- int conId; +- +- Display *dpy; +- EGLDisplay egl_dpy; +- EGLContext ctx; +- EGLSurface surf; +- Window win; +- +- uint32_t crtcId; +- int crtcIdx; +- uint32_t planeId; +- struct { +- int x, y, width, height; +- } compose; ++ int conId; ++ ++ Display *dpy; ++ EGLDisplay egl_dpy; ++ EGLContext ctx; ++ EGLSurface surf; ++ Window win; ++ ++ uint32_t crtcId; ++ int crtcIdx; ++ uint32_t planeId; ++ struct { ++ int x, y, width, height; ++ } compose; + }; + + typedef struct egl_aux_s { +@@ -70,8 +70,7 @@ typedef struct egl_aux_s { + + } egl_aux_t; + +-typedef struct egl_display_env_s +-{ ++typedef struct egl_display_env_s { + AVClass *class; + + struct egl_setup setup; +@@ -89,8 +88,8 @@ typedef struct egl_display_env_s + sem_t display_start_sem; + sem_t q_sem; + int q_terminate; +- AVFrame * q_this; +- AVFrame * q_next; ++ AVFrame *q_this; ++ AVFrame *q_next; + + } egl_display_env_t; + +@@ -99,45 +98,44 @@ typedef struct egl_display_env_s + * Remove window border/decorations. + */ + static void +-no_border( Display *dpy, Window w) ++no_border(Display *dpy, Window w) + { +- static const unsigned MWM_HINTS_DECORATIONS = (1 << 1); +- static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5; +- +- typedef struct +- { +- unsigned long flags; +- unsigned long functions; +- unsigned long decorations; +- long inputMode; +- unsigned long status; +- } PropMotifWmHints; +- +- PropMotifWmHints motif_hints; +- Atom prop, proptype; +- unsigned long flags = 0; +- +- /* setup the property */ +- motif_hints.flags = MWM_HINTS_DECORATIONS; +- motif_hints.decorations = flags; +- +- /* get the atom for the property */ +- prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True ); +- if (!prop) { +- /* something went wrong! 
*/ +- return; +- } +- +- /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */ +- proptype = prop; +- +- XChangeProperty( dpy, w, /* display, window */ ++ static const unsigned MWM_HINTS_DECORATIONS = (1 << 1); ++ static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5; ++ ++ typedef struct { ++ unsigned long flags; ++ unsigned long functions; ++ unsigned long decorations; ++ long inputMode; ++ unsigned long status; ++ } PropMotifWmHints; ++ ++ PropMotifWmHints motif_hints; ++ Atom prop, proptype; ++ unsigned long flags = 0; ++ ++ /* setup the property */ ++ motif_hints.flags = MWM_HINTS_DECORATIONS; ++ motif_hints.decorations = flags; ++ ++ /* get the atom for the property */ ++ prop = XInternAtom(dpy, "_MOTIF_WM_HINTS", True); ++ if (!prop) { ++ /* something went wrong! */ ++ return; ++ } ++ ++ /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */ ++ proptype = prop; ++ ++ XChangeProperty(dpy, w, /* display, window */ + prop, proptype, /* property, type */ + 32, /* format: 32-bit datums */ + PropModeReplace, /* mode */ +- (unsigned char *) &motif_hints, /* data */ ++ (unsigned char *)&motif_hints, /* data */ + PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */ +- ); ++ ); + } + + +@@ -146,247 +144,247 @@ no_border( Display *dpy, Window w) + * Return the window and context handles. + */ + static int +-make_window(struct AVFormatContext * const s, +- egl_display_env_t * const de, ++make_window(struct AVFormatContext *const s, ++ egl_display_env_t *const de, + Display *dpy, EGLDisplay egl_dpy, const char *name, + Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet) + { +- int scrnum = DefaultScreen( dpy ); +- XSetWindowAttributes attr; +- unsigned long mask; +- Window root = RootWindow( dpy, scrnum ); +- Window win; +- EGLContext ctx; +- const int fullscreen = de->fullscreen; +- EGLConfig config; +- int x = de->window_x; +- int y = de->window_y; +- int width = de->window_width ? de->window_width : 1280; +- int height = de->window_height ? 
de->window_height : 720; +- +- +- if (fullscreen) { +- int scrnum = DefaultScreen(dpy); +- +- x = 0; y = 0; +- width = DisplayWidth(dpy, scrnum); +- height = DisplayHeight(dpy, scrnum); +- } +- +- { +- EGLint num_configs; +- static const EGLint attribs[] = { +- EGL_RED_SIZE, 1, +- EGL_GREEN_SIZE, 1, +- EGL_BLUE_SIZE, 1, +- EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, +- EGL_NONE +- }; +- +- if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) { +- av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n"); +- return -1; +- } +- } +- +- { +- EGLint vid; +- if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) { +- av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n"); +- return -1; +- } +- +- { +- XVisualInfo visTemplate = { +- .visualid = vid, +- }; +- int num_visuals; +- XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask, +- &visTemplate, &num_visuals); +- +- /* window attributes */ +- attr.background_pixel = 0; +- attr.border_pixel = 0; +- attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone); +- attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask; +- /* XXX this is a bad way to get a borderless window! 
*/ +- mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask; +- +- win = XCreateWindow( dpy, root, x, y, width, height, +- 0, visinfo->depth, InputOutput, +- visinfo->visual, mask, &attr ); +- XFree(visinfo); +- } +- } +- +- if (fullscreen) +- no_border(dpy, win); +- +- /* set hints and properties */ +- { +- XSizeHints sizehints; +- sizehints.x = x; +- sizehints.y = y; +- sizehints.width = width; +- sizehints.height = height; +- sizehints.flags = USSize | USPosition; +- XSetNormalHints(dpy, win, &sizehints); +- XSetStandardProperties(dpy, win, name, name, +- None, (char **)NULL, 0, &sizehints); +- } +- +- eglBindAPI(EGL_OPENGL_ES_API); +- +- { +- static const EGLint ctx_attribs[] = { +- EGL_CONTEXT_CLIENT_VERSION, 2, +- EGL_NONE +- }; +- ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs ); +- if (!ctx) { +- av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); +- return -1; +- } +- } +- +- +- XMapWindow(dpy, win); +- +- { +- EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); +- if (!surf) { +- av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); +- return -1; +- } +- +- if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) { +- av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); +- return -1; +- } +- +- *winRet = win; +- *ctxRet = ctx; +- *surfRet = surf; +- } +- +- return 0; ++ int scrnum = DefaultScreen(dpy); ++ XSetWindowAttributes attr; ++ unsigned long mask; ++ Window root = RootWindow(dpy, scrnum); ++ Window win; ++ EGLContext ctx; ++ const int fullscreen = de->fullscreen; ++ EGLConfig config; ++ int x = de->window_x; ++ int y = de->window_y; ++ int width = de->window_width ? de->window_width : 1280; ++ int height = de->window_height ? 
de->window_height : 720; ++ ++ ++ if (fullscreen) { ++ int scrnum = DefaultScreen(dpy); ++ ++ x = 0; y = 0; ++ width = DisplayWidth(dpy, scrnum); ++ height = DisplayHeight(dpy, scrnum); ++ } ++ ++ { ++ EGLint num_configs; ++ static const EGLint attribs[] = { ++ EGL_RED_SIZE, 1, ++ EGL_GREEN_SIZE, 1, ++ EGL_BLUE_SIZE, 1, ++ EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, ++ EGL_NONE ++ }; ++ ++ if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) { ++ av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n"); ++ return -1; ++ } ++ } ++ ++ { ++ EGLint vid; ++ if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) { ++ av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n"); ++ return -1; ++ } ++ ++ { ++ XVisualInfo visTemplate = { ++ .visualid = vid, ++ }; ++ int num_visuals; ++ XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask, ++ &visTemplate, &num_visuals); ++ ++ /* window attributes */ ++ attr.background_pixel = 0; ++ attr.border_pixel = 0; ++ attr.colormap = XCreateColormap(dpy, root, visinfo->visual, AllocNone); ++ attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask; ++ /* XXX this is a bad way to get a borderless window! 
*/ ++ mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask; ++ ++ win = XCreateWindow(dpy, root, x, y, width, height, ++ 0, visinfo->depth, InputOutput, ++ visinfo->visual, mask, &attr); ++ XFree(visinfo); ++ } ++ } ++ ++ if (fullscreen) ++ no_border(dpy, win); ++ ++ /* set hints and properties */ ++ { ++ XSizeHints sizehints; ++ sizehints.x = x; ++ sizehints.y = y; ++ sizehints.width = width; ++ sizehints.height = height; ++ sizehints.flags = USSize | USPosition; ++ XSetNormalHints(dpy, win, &sizehints); ++ XSetStandardProperties(dpy, win, name, name, ++ None, (char **)NULL, 0, &sizehints); ++ } ++ ++ eglBindAPI(EGL_OPENGL_ES_API); ++ ++ { ++ static const EGLint ctx_attribs[] = { ++ EGL_CONTEXT_CLIENT_VERSION, 2, ++ EGL_NONE ++ }; ++ ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs); ++ if (!ctx) { ++ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); ++ return -1; ++ } ++ } ++ ++ ++ XMapWindow(dpy, win); ++ ++ { ++ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); ++ if (!surf) { ++ av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); ++ return -1; ++ } ++ ++ if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) { ++ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); ++ return -1; ++ } ++ ++ *winRet = win; ++ *ctxRet = ctx; ++ *surfRet = surf; ++ } ++ ++ return 0; + } + + static GLint +-compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source) ++compile_shader(struct AVFormatContext *const avctx, GLenum target, const char *source) + { +- GLuint s = glCreateShader(target); ++ GLuint s = glCreateShader(target); + +- if (s == 0) { +- av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n"); +- return 0; +- } ++ if (s == 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n"); ++ return 0; ++ } + +- glShaderSource(s, 1, (const GLchar **) &source, NULL); +- glCompileShader(s); ++ glShaderSource(s, 1, (const GLchar **)&source, NULL); ++ 
glCompileShader(s); + +- { +- GLint ok; +- glGetShaderiv(s, GL_COMPILE_STATUS, &ok); ++ { ++ GLint ok; ++ glGetShaderiv(s, GL_COMPILE_STATUS, &ok); + +- if (!ok) { +- GLchar *info; +- GLint size; ++ if (!ok) { ++ GLchar *info; ++ GLint size; + +- glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size); +- info = malloc(size); ++ glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size); ++ info = malloc(size); + +- glGetShaderInfoLog(s, size, NULL, info); +- av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source); ++ glGetShaderInfoLog(s, size, NULL, info); ++ av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source); + +- return 0; +- } +- } ++ return 0; ++ } ++ } + +- return s; ++ return s; + } + +-static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs) ++static GLuint link_program(struct AVFormatContext *const s, GLint vs, GLint fs) + { +- GLuint prog = glCreateProgram(); +- +- if (prog == 0) { +- av_log(s, AV_LOG_ERROR, "Failed to create program\n"); +- return 0; +- } +- +- glAttachShader(prog, vs); +- glAttachShader(prog, fs); +- glLinkProgram(prog); +- +- { +- GLint ok; +- glGetProgramiv(prog, GL_LINK_STATUS, &ok); +- if (!ok) { +- /* Some drivers return a size of 1 for an empty log. This is the size +- * of a log that contains only a terminating NUL character. +- */ +- GLint size; +- GLchar *info = NULL; +- glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size); +- if (size > 1) { +- info = malloc(size); +- glGetProgramInfoLog(prog, size, NULL, info); +- } ++ GLuint prog = glCreateProgram(); + +- av_log(s, AV_LOG_ERROR, "Failed to link: %s\n", +- (info != NULL) ? 
info : ""); +- return 0; +- } +- } ++ if (prog == 0) { ++ av_log(s, AV_LOG_ERROR, "Failed to create program\n"); ++ return 0; ++ } ++ ++ glAttachShader(prog, vs); ++ glAttachShader(prog, fs); ++ glLinkProgram(prog); ++ ++ { ++ GLint ok; ++ glGetProgramiv(prog, GL_LINK_STATUS, &ok); ++ if (!ok) { ++ /* Some drivers return a size of 1 for an empty log. This is the size ++ * of a log that contains only a terminating NUL character. ++ */ ++ GLint size; ++ GLchar *info = NULL; ++ glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size); ++ if (size > 1) { ++ info = malloc(size); ++ glGetProgramInfoLog(prog, size, NULL, info); ++ } + +- return prog; ++ av_log(s, AV_LOG_ERROR, "Failed to link: %s\n", ++ (info != NULL) ? info : ""); ++ return 0; ++ } ++ } ++ ++ return prog; + } + + static int +-gl_setup(struct AVFormatContext * const s) ++gl_setup(struct AVFormatContext *const s) + { +- const char *vs = +- "attribute vec4 pos;\n" +- "varying vec2 texcoord;\n" +- "\n" +- "void main() {\n" +- " gl_Position = pos;\n" +- " texcoord.x = (pos.x + 1.0) / 2.0;\n" +- " texcoord.y = (-pos.y + 1.0) / 2.0;\n" +- "}\n"; +- const char *fs = +- "#extension GL_OES_EGL_image_external : enable\n" +- "precision mediump float;\n" +- "uniform samplerExternalOES s;\n" +- "varying vec2 texcoord;\n" +- "void main() {\n" +- " gl_FragColor = texture2D(s, texcoord);\n" +- "}\n"; +- +- GLuint vs_s; +- GLuint fs_s; +- GLuint prog; +- +- if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) || +- !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) || +- !(prog = link_program(s, vs_s, fs_s))) +- return -1; +- +- glUseProgram(prog); +- +- { +- static const float verts[] = { +- -1, -1, +- 1, -1, +- 1, 1, +- -1, 1, +- }; +- glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts); +- } +- +- glEnableVertexAttribArray(0); +- return 0; ++ const char *vs = ++ "attribute vec4 pos;\n" ++ "varying vec2 texcoord;\n" ++ "\n" ++ "void main() {\n" ++ " gl_Position = pos;\n" ++ " texcoord.x = (pos.x + 1.0) / 2.0;\n" ++ " 
texcoord.y = (-pos.y + 1.0) / 2.0;\n" ++ "}\n"; ++ const char *fs = ++ "#extension GL_OES_EGL_image_external : enable\n" ++ "precision mediump float;\n" ++ "uniform samplerExternalOES s;\n" ++ "varying vec2 texcoord;\n" ++ "void main() {\n" ++ " gl_FragColor = texture2D(s, texcoord);\n" ++ "}\n"; ++ ++ GLuint vs_s; ++ GLuint fs_s; ++ GLuint prog; ++ ++ if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) || ++ !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) || ++ !(prog = link_program(s, vs_s, fs_s))) ++ return -1; ++ ++ glUseProgram(prog); ++ ++ { ++ static const float verts[] = { ++ -1, -1, ++ 1, -1, ++ 1, 1, ++ -1, 1, ++ }; ++ glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts); ++ } ++ ++ glEnableVertexAttribArray(0); ++ return 0; + } + + static int egl_vout_write_trailer(AVFormatContext *s) +@@ -400,12 +398,12 @@ static int egl_vout_write_trailer(AVFormatContext *s) + + static int egl_vout_write_header(AVFormatContext *s) + { +- const AVCodecParameters * const par = s->streams[0]->codecpar; ++ const AVCodecParameters *const par = s->streams[0]->codecpar; + + #if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s\n", __func__); + #endif +- if ( s->nb_streams > 1 ++ if (s->nb_streams > 1 + || par->codec_type != AVMEDIA_TYPE_VIDEO + || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { + av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); +@@ -416,10 +414,10 @@ static int egl_vout_write_header(AVFormatContext *s) + } + + +-static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame) ++static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVFrame *const frame) + { +- const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; +- egl_aux_t * da = NULL; ++ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor *)frame->data[0]; ++ egl_aux_t *da = NULL; + unsigned int i; + + #if TRACE_ALL +@@ -440,26 +438,26 @@ static int do_display(AVFormatContext * const s, 
egl_display_env_t * const de, A + + if (da->texture == 0) { + EGLint attribs[50]; +- EGLint * a = attribs; ++ EGLint *a = attribs; + int i, j; + static const EGLint anames[] = { +- EGL_DMA_BUF_PLANE0_FD_EXT, +- EGL_DMA_BUF_PLANE0_OFFSET_EXT, +- EGL_DMA_BUF_PLANE0_PITCH_EXT, +- EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT, +- EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, +- EGL_DMA_BUF_PLANE1_FD_EXT, +- EGL_DMA_BUF_PLANE1_OFFSET_EXT, +- EGL_DMA_BUF_PLANE1_PITCH_EXT, +- EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT, +- EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT, +- EGL_DMA_BUF_PLANE2_FD_EXT, +- EGL_DMA_BUF_PLANE2_OFFSET_EXT, +- EGL_DMA_BUF_PLANE2_PITCH_EXT, +- EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT, +- EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT, ++ EGL_DMA_BUF_PLANE0_FD_EXT, ++ EGL_DMA_BUF_PLANE0_OFFSET_EXT, ++ EGL_DMA_BUF_PLANE0_PITCH_EXT, ++ EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT, ++ EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, ++ EGL_DMA_BUF_PLANE1_FD_EXT, ++ EGL_DMA_BUF_PLANE1_OFFSET_EXT, ++ EGL_DMA_BUF_PLANE1_PITCH_EXT, ++ EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT, ++ EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT, ++ EGL_DMA_BUF_PLANE2_FD_EXT, ++ EGL_DMA_BUF_PLANE2_OFFSET_EXT, ++ EGL_DMA_BUF_PLANE2_PITCH_EXT, ++ EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT, ++ EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT, + }; +- const EGLint * b = anames; ++ const EGLint *b = anames; + + *a++ = EGL_WIDTH; + *a++ = av_frame_cropped_width(frame); +@@ -470,8 +468,8 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A + + for (i = 0; i < desc->nb_layers; ++i) { + for (j = 0; j < desc->layers[i].nb_planes; ++j) { +- const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; +- const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; ++ const AVDRMPlaneDescriptor *const p = desc->layers[i].planes + j; ++ const AVDRMObjectDescriptor *const obj = desc->objects + p->object_index; + *a++ = *b++; + *a++ = obj->fd; + *a++ = *b++; +@@ -479,13 +477,13 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A + 
*a++ = *b++; + *a++ = p->pitch; + if (obj->format_modifier == 0) { +- b += 2; ++ b += 2; + } + else { +- *a++ = *b++; +- *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF); +- *a++ = *b++; +- *a++ = (EGLint)(obj->format_modifier >> 32); ++ *a++ = *b++; ++ *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF); ++ *a++ = *b++; ++ *a++ = (EGLint)(obj->format_modifier >> 32); + } + } + } +@@ -494,26 +492,26 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A + + #if TRACE_ALL + for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) { +- av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]); ++ av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]); + } + #endif + { +- const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy, +- EGL_NO_CONTEXT, +- EGL_LINUX_DMA_BUF_EXT, +- NULL, attribs); +- if (!image) { +- av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd); +- return -1; +- } +- +- glGenTextures(1, &da->texture); +- glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); +- glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR); +- glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR); +- glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image); +- +- eglDestroyImageKHR(de->setup.egl_dpy, image); ++ const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy, ++ EGL_NO_CONTEXT, ++ EGL_LINUX_DMA_BUF_EXT, ++ NULL, attribs); ++ if (!image) { ++ av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd); ++ return -1; ++ } ++ ++ glGenTextures(1, &da->texture); ++ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); ++ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR); ++ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR); ++ glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image); ++ ++ eglDestroyImageKHR(de->setup.egl_dpy, image); + } + + da->fd = desc->objects[0].fd; +@@ -540,7 +538,7 @@ static int 
do_display(AVFormatContext * const s, egl_display_env_t * const de, A + (long long)modifiers[1], + (long long)modifiers[2], + (long long)modifiers[3] +- ); ++ ); + #endif + } + +@@ -558,55 +556,55 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A + return 0; + } + +-static void * display_thread(void * v) ++static void* display_thread(void *v) + { +- AVFormatContext * const s = v; +- egl_display_env_t * const de = s->priv_data; ++ AVFormatContext *const s = v; ++ egl_display_env_t *const de = s->priv_data; + + #if TRACE_ALL + av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); + #endif + { +- EGLint egl_major, egl_minor; +- +- de->setup.dpy = XOpenDisplay(NULL); +- if (!de->setup.dpy) { +- av_log(s, AV_LOG_ERROR, "Couldn't open X display\n"); +- goto fail; +- } +- +- de->setup.egl_dpy = eglGetDisplay(de->setup.dpy); +- if (!de->setup.egl_dpy) { +- av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n"); +- goto fail; +- } +- +- if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) { +- av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n"); +- goto fail; +- } +- +- av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor); +- +- if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) { +- av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n"); +- goto fail; +- } ++ EGLint egl_major, egl_minor; ++ ++ de->setup.dpy = XOpenDisplay(NULL); ++ if (!de->setup.dpy) { ++ av_log(s, AV_LOG_ERROR, "Couldn't open X display\n"); ++ goto fail; ++ } ++ ++ de->setup.egl_dpy = eglGetDisplay(de->setup.dpy); ++ if (!de->setup.egl_dpy) { ++ av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n"); ++ goto fail; ++ } ++ ++ if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) { ++ av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n"); ++ goto fail; ++ } ++ ++ av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor); ++ ++ if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) { ++ 
av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n"); ++ goto fail; ++ } + } + + if (!de->window_width || !de->window_height) { +- de->window_width = 1280; +- de->window_height = 720; ++ de->window_width = 1280; ++ de->window_height = 720; + } + if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout", + &de->setup.win, &de->setup.ctx, &de->setup.surf)) { +- av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__); +- goto fail; ++ av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__); ++ goto fail; + } + + if (gl_setup(s)) { +- av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__); +- goto fail; ++ av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__); ++ goto fail; + } + + #if TRACE_ALL +@@ -615,7 +613,7 @@ static void * display_thread(void * v) + sem_post(&de->display_start_sem); + + for (;;) { +- AVFrame * frame; ++ AVFrame *frame; + + while (sem_wait(&de->q_sem) != 0) { + av_assert0(errno == EINTR); +@@ -653,9 +651,9 @@ fail: + + static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) + { +- const AVFrame * const src_frame = (AVFrame *)pkt->data; +- AVFrame * frame; +- egl_display_env_t * const de = s->priv_data; ++ const AVFrame *const src_frame = (AVFrame *)pkt->data; ++ AVFrame *frame; ++ egl_display_env_t *const de = s->priv_data; + + #if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s\n", __func__); +@@ -668,8 +666,7 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) + else if (src_frame->format == AV_PIX_FMT_VAAPI) { + frame = av_frame_alloc(); + frame->format = AV_PIX_FMT_DRM_PRIME; +- if (av_hwframe_map(frame, src_frame, 0) != 0) +- { ++ if (av_hwframe_map(frame, src_frame, 0) != 0) { + av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); + av_frame_free(&frame); + return AVERROR(EINVAL); +@@ -682,12 +679,12 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) + + // Really hacky sync + while (de->show_all && de->q_next) { +- 
usleep(3000); ++ usleep(3000); + } + + pthread_mutex_lock(&de->q_lock); + { +- AVFrame * const t = de->q_next; ++ AVFrame *const t = de->q_next; + de->q_next = frame; + frame = t; + } +@@ -702,7 +699,7 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) + } + + static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, +- unsigned flags) ++ unsigned flags) + { + av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags); + return AVERROR_PATCHWELCOME; +@@ -713,7 +710,7 @@ static int egl_vout_control_message(AVFormatContext *s, int type, void *data, si + #if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); + #endif +- switch(type) { ++ switch (type) { + case AV_APP_TO_DEV_WINDOW_REPAINT: + return 0; + default: +@@ -723,14 +720,14 @@ static int egl_vout_control_message(AVFormatContext *s, int type, void *data, si + } + + // deinit is called if init fails so no need to clean up explicity here +-static int egl_vout_init(struct AVFormatContext * s) ++static int egl_vout_init(struct AVFormatContext *s) + { +- egl_display_env_t * const de = s->priv_data; ++ egl_display_env_t *const de = s->priv_data; + unsigned int i; + + av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); + +- de->setup = (struct egl_setup){0}; ++ de->setup = (struct egl_setup) { 0 }; + + for (i = 0; i != 32; ++i) { + de->aux[i].fd = -1; +@@ -744,8 +741,8 @@ static int egl_vout_init(struct AVFormatContext * s) + + sem_wait(&de->display_start_sem); + if (de->q_terminate) { +- av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__); +- return -1; ++ av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__); ++ return -1; + } + + av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__); +@@ -753,9 +750,9 @@ static int egl_vout_init(struct AVFormatContext * s) + return 0; + } + +-static void egl_vout_deinit(struct AVFormatContext * s) ++static void egl_vout_deinit(struct AVFormatContext *s) + { +- 
egl_display_env_t * const de = s->priv_data; ++ egl_display_env_t *const de = s->priv_data; + + av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); + +@@ -773,11 +770,11 @@ static void egl_vout_deinit(struct AVFormatContext * s) + + #define OFFSET(x) offsetof(egl_display_env_t, x) + static const AVOption options[] = { +- { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, +- { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, +- { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, +- { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, +- { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, { .str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, + { NULL } + + }; + +From 06d2761f87ee240f2604f19659ad00ee34c728b4 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 12 Dec 2022 16:49:43 +0000 +Subject: [PATCH 098/113] v4l2m2m: reorganise get_raw_format for loop logic + +--- + libavcodec/v4l2_context.c | 16 +++++----------- + 1 file changed, 5 insertions(+), 11 deletions(-) +
+diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 042c6a976c..fcd5fdf359 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -828,28 +828,22 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) + return 0; + } + +- for (;;) { ++ for (;; ++fdesc.index) { + ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc); + if (ret) + return AVERROR(EINVAL); + + if (priv->pix_fmt != AV_PIX_FMT_NONE) { +- if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) { +- fdesc.index++; ++ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) + continue; +- } + } + + pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); + ret = v4l2_try_raw_format(ctx, pixfmt); +- if (ret){ +- fdesc.index++; +- continue; ++ if (ret == 0) { ++ *p = pixfmt; ++ return 0; + } +- +- *p = pixfmt; +- +- return 0; + } + + return AVERROR(EINVAL); + +From 01a67c1059c5003c29d5177f1067156c95a82e58 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 12 Dec 2022 17:49:12 +0000 +Subject: [PATCH 099/113] drm_vout: Set zpos on the plane we pick to ensure it + is at the front + +--- + libavdevice/drm_vout.c | 38 +++++++++++++++++++++++++++++++++----- + 1 file changed, 33 insertions(+), 5 deletions(-) + +diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c +index 15ed1b8825..8369dc6411 100644 +--- a/libavdevice/drm_vout.c ++++ b/libavdevice/drm_vout.c +@@ -115,9 +115,11 @@ static int find_plane(struct AVFormatContext * const avctx, + { + drmModePlaneResPtr planes; + drmModePlanePtr plane; ++ drmModeObjectPropertiesPtr props = NULL; ++ drmModePropertyPtr prop = NULL; + unsigned int i; + unsigned int j; +- int ret = 0; ++ int ret = -1; + + planes = drmModeGetPlaneResources(drmfd); + if (!planes) +@@ -154,11 +156,37 @@ static int find_plane(struct AVFormatContext * const avctx, + break; + } + +- if (i == planes->count_planes) +- ret = -1; ++ if (i == planes->count_planes) { ++ ret = 
-1; ++ goto fail; ++ } + +- drmModeFreePlaneResources(planes); +- return ret; ++ props = drmModeObjectGetProperties(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE); ++ if (!props) ++ goto fail; ++ for (i = 0; i != props->count_props; ++i) { ++ if (prop) ++ drmModeFreeProperty(prop); ++ prop = drmModeGetProperty(drmfd, props->props[i]); ++ if (!prop) ++ goto fail; ++ if (strcmp("zpos", prop->name) == 0) { ++ if (drmModeObjectSetProperty(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE, props->props[i], prop->values[1]) == 0) ++ av_log(avctx, AV_LOG_DEBUG, "ZPOS set to %d\n", (int)prop->values[1]); ++ else ++ av_log(avctx, AV_LOG_WARNING, "Failed to set ZPOS on DRM plane\n"); ++ break; ++ } ++ } ++ ++ ret = 0; ++fail: ++ if (props) ++ drmModeFreeObjectProperties(props); ++ if (prop) ++ drmModeFreeProperty(prop); ++ drmModeFreePlaneResources(planes); ++ return ret; + } + + static void da_uninit(drm_display_env_t * const de, drm_aux_t * da) + +From a92f752c7f6c5ba8d10d0d28bd5e00b01e1f20ef Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 12 Dec 2022 17:51:46 +0000 +Subject: [PATCH 100/113] drm_vout: Only set modifier flag and pass modifiers + if there are some + +--- + libavdevice/drm_vout.c | 17 ++++++++++++----- + 1 file changed, 12 insertions(+), 5 deletions(-) + +diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c +index 8369dc6411..ccb9484668 100644 +--- a/libavdevice/drm_vout.c ++++ b/libavdevice/drm_vout.c +@@ -34,6 +34,7 @@ + + #include + #include ++#include + + #define TRACE_ALL 0 + +@@ -249,6 +250,7 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A + uint32_t offsets[4] = {0}; + uint64_t modifiers[4] = {0}; + uint32_t bo_handles[4] = {0}; ++ int has_mods = 0; + int i, j, n; + + da->frame = frame; +@@ -258,6 +260,9 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A + av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR); + return -1; + } ++ if 
(desc->objects[i].format_modifier != DRM_FORMAT_MOD_LINEAR && ++ desc->objects[i].format_modifier != DRM_FORMAT_MOD_INVALID) ++ has_mods = 1; + } + + n = 0; +@@ -299,11 +304,13 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A + #endif + + if (drmModeAddFB2WithModifiers(de->drm_fd, +- av_frame_cropped_width(frame), +- av_frame_cropped_height(frame), +- desc->layers[0].format, bo_handles, +- pitches, offsets, modifiers, +- &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) { ++ av_frame_cropped_width(frame), ++ av_frame_cropped_height(frame), ++ desc->layers[0].format, bo_handles, ++ pitches, offsets, ++ has_mods ? modifiers : NULL, ++ &da->fb_handle, ++ has_mods ? DRM_MODE_FB_MODIFIERS : 0) != 0) { + av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR); + return -1; + } + +From fb14b3bd5f5377118bd36c7fa59c97a323c76937 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 12 Dec 2022 17:52:58 +0000 +Subject: [PATCH 101/113] drm_vout: Fix typo in error message + +--- + libavdevice/drm_vout.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c +index ccb9484668..0510834942 100644 +--- a/libavdevice/drm_vout.c ++++ b/libavdevice/drm_vout.c +@@ -596,7 +596,7 @@ static int drm_vout_init(struct AVFormatContext * s) + sem_init(&de->q_sem_out, 0, 0); + if (pthread_create(&de->q_thread, NULL, display_thread, s)) { + rv = AVERROR(errno); +- av_log(s, AV_LOG_ERROR, "Failed to creatye display thread: %s\n", av_err2str(rv)); ++ av_log(s, AV_LOG_ERROR, "Failed to create display thread: %s\n", av_err2str(rv)); + goto fail_close; + } + + +From 0bf60f487e3f61f7d2f464f0fa682ecb8f942daf Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 12 Dec 2022 18:00:41 +0000 +Subject: [PATCH 102/113] drm_vout: Add option to name the drm_module to use + +--- + libavdevice/drm_vout.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git 
a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c +index 0510834942..07dc656777 100644 +--- a/libavdevice/drm_vout.c ++++ b/libavdevice/drm_vout.c +@@ -70,7 +70,9 @@ typedef struct drm_display_env_s + uint32_t con_id; + struct drm_setup setup; + enum AVPixelFormat avfmt; ++ + int show_all; ++ const char * drm_module; + + unsigned int ano; + drm_aux_t aux[AUX_SIZE]; +@@ -569,7 +571,6 @@ static int drm_vout_init(struct AVFormatContext * s) + { + drm_display_env_t * const de = s->priv_data; + int rv; +- const char * drm_module = DRM_MODULE; + + av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__); + +@@ -578,10 +579,10 @@ static int drm_vout_init(struct AVFormatContext * s) + de->setup = (struct drm_setup){0}; + de->q_terminate = 0; + +- if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0) ++ if ((de->drm_fd = drmOpen(de->drm_module, NULL)) < 0) + { + rv = AVERROR(errno); +- av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv)); ++ av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", de->drm_module, av_err2str(rv)); + return rv; + } + +@@ -641,6 +642,7 @@ static void drm_vout_deinit(struct AVFormatContext * s) + #define OFFSET(x) offsetof(drm_display_env_t, x) + static const AVOption options[] = { + { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "drm_module", "drm_module name to use, default=" DRM_MODULE, OFFSET(drm_module), AV_OPT_TYPE_STRING, { .str = DRM_MODULE }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, + { NULL } + }; + + +From b44057a48d104b374eed0ac0f0d635b024af6930 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 13 Dec 2022 13:01:00 +0000 +Subject: [PATCH 103/113] dmabufs: Rework to allow for non-CMA backends + +--- + libavcodec/v4l2_req_dmabufs.c | 161 ++++++++++++++++++++++++---------- + 1 file changed, 116 insertions(+), 45 deletions(-) + +diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c +index c4bbed18c6..1c3a5e861f 100644 +--- 
a/libavcodec/v4l2_req_dmabufs.c ++++ b/libavcodec/v4l2_req_dmabufs.c +@@ -1,3 +1,4 @@ ++#include + #include + #include + #include +@@ -19,9 +20,21 @@ + + #define TRACE_ALLOC 0 + ++struct dmabufs_ctl; ++struct dmabuf_h; ++ ++struct dmabuf_fns { ++ int (*buf_alloc)(struct dmabufs_ctl * dbsc, struct dmabuf_h * dh, size_t size); ++ void (*buf_free)(struct dmabuf_h * dh); ++ int (*ctl_new)(struct dmabufs_ctl * dbsc); ++ void (*ctl_free)(struct dmabufs_ctl * dbsc); ++}; ++ + struct dmabufs_ctl { + int fd; + size_t page_size; ++ void * v; ++ const struct dmabuf_fns * fns; + }; + + struct dmabuf_h { +@@ -29,6 +42,8 @@ struct dmabuf_h { + size_t size; + size_t len; + void * mapptr; ++ void * v; ++ const struct dmabuf_fns * fns; + }; + + #if TRACE_ALLOC +@@ -88,15 +103,8 @@ struct dmabuf_h * dmabuf_import(int fd, size_t size) + struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size) + { + struct dmabuf_h * dh; +- struct dma_heap_allocation_data data = { +- .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1), +- .fd = 0, +- .fd_flags = O_RDWR, +- .heap_flags = 0 +- }; +- + if (old != NULL) { +- if (old->size == data.len) { ++ if (old->size >= size) { + return old; + } + dmabuf_free(old); +@@ -106,24 +114,16 @@ struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * ol + (dh = malloc(sizeof(*dh))) == NULL) + return NULL; + +- while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) { +- int err = errno; +- request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n", +- (uint64_t)data.len, +- dbsc->fd, +- err, +- strerror(err)); +- if (err == EINTR) +- continue; +- goto fail; +- } +- + *dh = (struct dmabuf_h){ +- .fd = data.fd, +- .size = (size_t)data.len, +- .mapptr = MAP_FAILED ++ .fd = -1, ++ .mapptr = MAP_FAILED, ++ .fns = dbsc->fns + }; + ++ if (dh->fns->buf_alloc(dbsc, dh, size) != 0) ++ goto fail; ++ ++ + #if TRACE_ALLOC + ++total_bufs; + total_size += dh->size; +@@ -220,8 +220,6 @@ void 
dmabuf_len_set(struct dmabuf_h * const dh, const size_t len) + dh->len = len; + } + +- +- + void dmabuf_free(struct dmabuf_h * dh) + { + if (!dh) +@@ -233,20 +231,63 @@ void dmabuf_free(struct dmabuf_h * dh) + request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); + #endif + +- if (dh->mapptr != MAP_FAILED) ++ dh->fns->buf_free(dh); ++ ++ if (dh->mapptr != MAP_FAILED && dh->mapptr != NULL) + munmap(dh->mapptr, dh->size); +- while (close(dh->fd) == -1 && errno == EINTR) +- /* loop */; ++ if (dh->fd != -1) ++ while (close(dh->fd) == -1 && errno == EINTR) ++ /* loop */; + free(dh); + } + +-struct dmabufs_ctl * dmabufs_ctl_new(void) ++static struct dmabufs_ctl * dmabufs_ctl_new2(const struct dmabuf_fns * const fns) + { +- struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc)); ++ struct dmabufs_ctl * dbsc = calloc(1, sizeof(*dbsc)); + + if (!dbsc) + return NULL; + ++ dbsc->fd = -1; ++ dbsc->fns = fns; ++ dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE); ++ ++ if (fns->ctl_new(dbsc) != 0) ++ goto fail; ++ ++ return dbsc; ++ ++fail: ++ free(dbsc); ++ return NULL; ++} ++ ++static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc) ++{ ++ request_debug(NULL, "Free dmabuf ctl\n"); ++ ++ dbsc->fns->ctl_free(dbsc); ++ ++ free(dbsc); ++} ++ ++void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) ++{ ++ struct dmabufs_ctl * const dbsc = *pDbsc; ++ ++ if (!dbsc) ++ return; ++ *pDbsc = NULL; ++ ++ dmabufs_ctl_free(dbsc); ++} ++ ++//----------------------------------------------------------------------------- ++// ++// Alloc dmabuf via CMA ++ ++static int ctl_cma_new(struct dmabufs_ctl * dbsc) ++{ + while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 && + errno == EINTR) + /* Loop */; +@@ -258,31 +299,61 @@ struct dmabufs_ctl * dmabufs_ctl_new(void) + if (dbsc->fd == -1) { + request_log("Unable to open either %s or %s\n", + DMABUF_NAME1, DMABUF_NAME2); +- goto fail; ++ return -1; + } + } ++ return 0; ++} + +- dbsc->page_size = 
(size_t)sysconf(_SC_PAGE_SIZE); +- +- return dbsc; ++static void ctl_cma_free(struct dmabufs_ctl * dbsc) ++{ ++ if (dbsc->fd != -1) ++ while (close(dbsc->fd) == -1 && errno == EINTR) ++ /* loop */; + +-fail: +- free(dbsc); +- return NULL; + } + +-void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) ++static int buf_cma_alloc(struct dmabufs_ctl * const dbsc, struct dmabuf_h * dh, size_t size) + { +- struct dmabufs_ctl * const dbsc = *pDbsc; ++ struct dma_heap_allocation_data data = { ++ .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1), ++ .fd = 0, ++ .fd_flags = O_RDWR, ++ .heap_flags = 0 ++ }; + +- if (!dbsc) +- return; +- *pDbsc = NULL; ++ while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) { ++ int err = errno; ++ request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n", ++ (uint64_t)data.len, ++ dbsc->fd, ++ err, ++ strerror(err)); ++ if (err == EINTR) ++ continue; ++ return -err; ++ } + +- while (close(dbsc->fd) == -1 && errno == EINTR) +- /* loop */; ++ dh->fd = data.fd; ++ dh->size = (size_t)data.len; ++ return 0; ++} + +- free(dbsc); ++static void buf_cma_free(struct dmabuf_h * dh) ++{ ++ // Nothing needed + } + ++static const struct dmabuf_fns dmabuf_cma_fns = { ++ .buf_alloc = buf_cma_alloc, ++ .buf_free = buf_cma_free, ++ .ctl_new = ctl_cma_new, ++ .ctl_free = ctl_cma_free, ++}; ++ ++struct dmabufs_ctl * dmabufs_ctl_new(void) ++{ ++ request_debug(NULL, "Dmabufs using CMA\n");; ++ return dmabufs_ctl_new2(&dmabuf_cma_fns); ++} + + +From 108360ef02f6c16872147cb1d8b8b4fe906a6f67 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 13 Dec 2022 13:07:58 +0000 +Subject: [PATCH 104/113] dmabufs: Use unref rather than delete on dmabufs_ctl + +--- + libavcodec/v4l2_req_dmabufs.c | 12 +++++++++++- + libavcodec/v4l2_req_dmabufs.h | 3 ++- + libavcodec/v4l2_request_hevc.c | 4 ++-- + 3 files changed, 15 insertions(+), 4 deletions(-) + +diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c +index
1c3a5e861f..acc0366e76 100644 +--- a/libavcodec/v4l2_req_dmabufs.c ++++ b/libavcodec/v4l2_req_dmabufs.c +@@ -31,6 +31,7 @@ struct dmabuf_fns { + }; + + struct dmabufs_ctl { ++ atomic_int ref_count; + int fd; + size_t page_size; + void * v; +@@ -271,7 +272,7 @@ static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc) + free(dbsc); + } + +-void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) ++void dmabufs_ctl_unref(struct dmabufs_ctl ** const pDbsc) + { + struct dmabufs_ctl * const dbsc = *pDbsc; + +@@ -279,9 +280,18 @@ void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) + return; + *pDbsc = NULL; + ++ if (atomic_fetch_sub(&dbsc->ref_count, 1) != 0) ++ return; ++ + dmabufs_ctl_free(dbsc); + } + ++struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc) ++{ ++ atomic_fetch_add(&dbsc->ref_count, 1); ++ return dbsc; ++} ++ + //----------------------------------------------------------------------------- + // + // Alloc dmabuf via CMA +diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h +index c1d3d8c8d7..381ba2708d 100644 +--- a/libavcodec/v4l2_req_dmabufs.h ++++ b/libavcodec/v4l2_req_dmabufs.h +@@ -7,7 +7,8 @@ struct dmabufs_ctl; + struct dmabuf_h; + + struct dmabufs_ctl * dmabufs_ctl_new(void); +-void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc); ++void dmabufs_ctl_unref(struct dmabufs_ctl ** const pdbsc); ++struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc); + + // Need not preserve old contents + // On NULL return old buffer is freed +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +index 767ecb036a..db7ed13b6d 100644 +--- a/libavcodec/v4l2_request_hevc.c ++++ b/libavcodec/v4l2_request_hevc.c +@@ -105,7 +105,7 @@ static int v4l2_request_hevc_uninit(AVCodecContext *avctx) + mediabufs_ctl_unref(&ctx->mbufs); + media_pool_delete(&ctx->mpool); + pollqueue_unref(&ctx->pq); +- dmabufs_ctl_delete(&ctx->dbufs); ++ dmabufs_ctl_unref(&ctx->dbufs); + 
devscan_delete(&ctx->devscan); + + decode_q_uninit(&ctx->decode_q); +@@ -324,7 +324,7 @@ fail3: + fail2: + pollqueue_unref(&ctx->pq); + fail1: +- dmabufs_ctl_delete(&ctx->dbufs); ++ dmabufs_ctl_unref(&ctx->dbufs); + fail0: + devscan_delete(&ctx->devscan); + return ret; + +From 067f436e1d053af11bb6c8d6ba42dede164247c0 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 13 Dec 2022 14:21:40 +0000 +Subject: [PATCH 105/113] egl_vout: Remove redundant & completely broken debug + +--- + libavdevice/egl_vout.c | 25 ------------------------- + 1 file changed, 25 deletions(-) + +diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c +index 8fd3c58bac..c2659e96dc 100644 +--- a/libavdevice/egl_vout.c ++++ b/libavdevice/egl_vout.c +@@ -515,31 +515,6 @@ static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVF + } + + da->fd = desc->objects[0].fd; +- +-#if 0 +- av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," +- " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", +- av_frame_cropped_width(frame), +- av_frame_cropped_height(frame), +- desc->layers[0].format, +- bo_plane_handles[0], +- bo_plane_handles[1], +- bo_plane_handles[2], +- bo_plane_handles[3], +- pitches[0], +- pitches[1], +- pitches[2], +- pitches[3], +- offsets[0], +- offsets[1], +- offsets[2], +- offsets[3], +- (long long)modifiers[0], +- (long long)modifiers[1], +- (long long)modifiers[2], +- (long long)modifiers[3] +- ); +-#endif + } + + glClearColor(0.5, 0.5, 0.5, 0.5); + +From ba04b91800e5a17ff1cee5023d94e304037cd7d5 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 13 Dec 2022 16:12:12 +0000 +Subject: [PATCH 106/113] v4l2m2m: Use offset from querybuf rather than always + 0 + +--- + libavcodec/v4l2_buffers.c | 4 +++- + libavcodec/v4l2_buffers.h | 3 ++- + 2 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 9ef2f40e39..5ca58ea593 100644 +--- a/libavcodec/v4l2_buffers.c ++++ 
b/libavcodec/v4l2_buffers.c +@@ -379,7 +379,7 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) + + for (int i = 0; i < avbuf->num_planes; i++) { + layer->planes[i].object_index = i; +- layer->planes[i].offset = 0; ++ layer->planes[i].offset = avbuf->plane_info[i].offset; + layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; + } + +@@ -934,6 +934,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct + + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; ++ avbuf->plane_info[i].offset = avbuf->buf.m.planes[i].data_offset; + + if (want_mmap) + avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, +@@ -941,6 +942,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct + buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); + } else { + avbuf->plane_info[i].length = avbuf->buf.length; ++ avbuf->plane_info[i].offset = 0; + + if (want_mmap) + avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, +diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h +index 1ac32c5989..d91d5d1dd0 100644 +--- a/libavcodec/v4l2_buffers.h ++++ b/libavcodec/v4l2_buffers.h +@@ -66,7 +66,8 @@ typedef struct V4L2Buffer { + + /* keep track of the mmap address and mmap length */ + struct V4L2Plane_info { +- int bytesperline; ++ size_t bytesperline; ++ size_t offset; + void * mm_addr; + size_t length; + } plane_info[VIDEO_MAX_PLANES]; + +From e7b15fa9f59f25e074a1f59ed5d16e158bdf8a76 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 13 Dec 2022 17:57:27 +0000 +Subject: [PATCH 107/113] v4l2m2m: Fix crash if init errors out before setting + avctx + +--- + libavcodec/v4l2_m2m.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c +index e29df41729..cc4503dcae 100644 +--- a/libavcodec/v4l2_m2m.c ++++ b/libavcodec/v4l2_m2m.c +@@ -276,7 +276,7 @@ int 
ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) + + av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n"); + +- if (av_codec_is_decoder(s->avctx->codec)) ++ if (s->avctx && av_codec_is_decoder(s->avctx->codec)) + av_packet_unref(&s->buf_pkt); + + if (s->fd >= 0) { + +From 536e74b776084d66d33730114c96cfc17866180a Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 13 Dec 2022 18:10:30 +0000 +Subject: [PATCH 108/113] v4l2_buffers: Add and use ctx_to_m2mctx + error debug + +--- + libavcodec/v4l2_buffers.c | 22 +++++++++++++++------- + 1 file changed, 15 insertions(+), 7 deletions(-) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 5ca58ea593..e28ef2d1e8 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -41,11 +41,16 @@ + #define USEC_PER_SEC 1000000 + static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; + ++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) ++{ ++ return V4L2_TYPE_IS_OUTPUT(ctx->type) ? ++ container_of(ctx, V4L2m2mContext, output) : ++ container_of(ctx, V4L2m2mContext, capture); ++} ++ + static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf) + { +- return V4L2_TYPE_IS_OUTPUT(buf->context->type) ? 
+- container_of(buf->context, V4L2m2mContext, output) : +- container_of(buf->context, V4L2m2mContext, capture); ++ return ctx_to_m2mctx(buf->context); + } + + static inline AVCodecContext *logger(const V4L2Buffer * const buf) +@@ -883,6 +888,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct + int ret, i; + V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); + AVBufferRef * bufref; ++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx); + + *pbufref = NULL; + if (avbuf == NULL) +@@ -910,7 +916,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct + avbuf->buf.m.planes = avbuf->planes; + } + +- ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf); ++ ret = ioctl(s->fd, VIDIOC_QUERYBUF, &avbuf->buf); + if (ret < 0) + goto fail; + +@@ -969,10 +975,12 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct + } + + if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- if (buf_to_m2mctx(avbuf)->output_drm) { ++ if (s->output_drm) { + ret = v4l2_buffer_export_drm(avbuf); +- if (ret) +- goto fail; ++ if (ret) { ++ av_log(logger(avbuf), AV_LOG_ERROR, "Failed to get exported drm handles\n"); ++ goto fail; ++ } + } + } + + +From c691e8309d798048c52c8c32427c50cfcbe65055 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 13 Dec 2022 18:53:22 +0000 +Subject: [PATCH 109/113] v4l2m2m: Add ability to use cma alloced dmabufs as + well as v4l2 mmap + +--- + libavcodec/Makefile | 2 +- + libavcodec/v4l2_buffers.c | 65 ++++++++++++++++++++++++++------------- + libavcodec/v4l2_buffers.h | 2 ++ + libavcodec/v4l2_m2m.c | 6 +++- + libavcodec/v4l2_m2m.h | 4 +++ + libavcodec/v4l2_m2m_dec.c | 16 ++++++++++ + 6 files changed, 71 insertions(+), 24 deletions(-) + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 928756850f..daf7456d73 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -162,7 +162,7 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o + OBJS-$(CONFIG_VP56DSP) += vp56dsp.o + 
OBJS-$(CONFIG_VP8DSP) += vp8dsp.o + OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\ +- weak_link.o ++ weak_link.o v4l2_req_dmabufs.o + OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\ + v4l2_req_devscan.o weak_link.o + OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index e28ef2d1e8..8d80d19788 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -36,6 +36,7 @@ + #include "v4l2_context.h" + #include "v4l2_buffers.h" + #include "v4l2_m2m.h" ++#include "v4l2_req_dmabufs.h" + #include "weak_link.h" + + #define USEC_PER_SEC 1000000 +@@ -477,33 +478,46 @@ static void v4l2_free_bufref(void *opaque, uint8_t *data) + av_buffer_unref(&bufref); + } + ++static inline uint32_t ff_v4l2_buf_len(const struct v4l2_buffer * b, unsigned int i) ++{ ++ return V4L2_TYPE_IS_MULTIPLANAR(b->type) ? b->m.planes[i].length : b->length; ++} ++ + static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) + { +- struct v4l2_exportbuffer expbuf; + int i, ret; ++ const V4L2m2mContext * const s = buf_to_m2mctx(avbuf); + + for (i = 0; i < avbuf->num_planes; i++) { +- memset(&expbuf, 0, sizeof(expbuf)); +- +- expbuf.index = avbuf->buf.index; +- expbuf.type = avbuf->buf.type; +- expbuf.plane = i; ++ int dma_fd = -1; ++ const uint32_t blen = ff_v4l2_buf_len(&avbuf->buf, i); ++ ++ if (s->db_ctl != NULL) { ++ if ((avbuf->dmabuf[i] = dmabuf_alloc(s->db_ctl, blen)) == NULL) ++ return AVERROR(ENOMEM); ++ dma_fd = dmabuf_fd(avbuf->dmabuf[i]); ++ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) ++ avbuf->buf.m.planes[i].m.fd = dma_fd; ++ else ++ avbuf->buf.m.fd = dma_fd; ++ } ++ else { ++ struct v4l2_exportbuffer expbuf; ++ memset(&expbuf, 0, sizeof(expbuf)); + +- ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf); +- if (ret < 0) +- return AVERROR(errno); ++ expbuf.index = avbuf->buf.index; ++ expbuf.type = avbuf->buf.type; ++ expbuf.plane = i; + +- 
if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) { +- /* drm frame */ +- avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; +- avbuf->drm_frame.objects[i].fd = expbuf.fd; +- avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; +- } else { +- /* drm frame */ +- avbuf->drm_frame.objects[0].size = avbuf->buf.length; +- avbuf->drm_frame.objects[0].fd = expbuf.fd; +- avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ ret = ioctl(s->fd, VIDIOC_EXPBUF, &expbuf); ++ if (ret < 0) ++ return AVERROR(errno); ++ dma_fd = expbuf.fd; + } ++ ++ avbuf->drm_frame.objects[i].size = blen; ++ avbuf->drm_frame.objects[i].fd = dma_fd; ++ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; + } + + return 0; +@@ -870,9 +884,16 @@ static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) + munmap(p->mm_addr, p->length); + } + +- for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { +- if (avbuf->drm_frame.objects[i].fd != -1) +- close(avbuf->drm_frame.objects[i].fd); ++ if (avbuf->dmabuf[0] == NULL) { ++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { ++ if (avbuf->drm_frame.objects[i].fd != -1) ++ close(avbuf->drm_frame.objects[i].fd); ++ } ++ } ++ else { ++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->dmabuf); ++i) { ++ dmabuf_free(avbuf->dmabuf[i]); ++ } + } + + av_buffer_unref(&avbuf->ref_buf); +diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h +index d91d5d1dd0..444ad94b14 100644 +--- a/libavcodec/v4l2_buffers.h ++++ b/libavcodec/v4l2_buffers.h +@@ -46,6 +46,7 @@ enum V4L2Buffer_status { + */ + struct V4L2Context; + struct ff_weak_link_client; ++struct dmabuf_h; + + typedef struct V4L2Buffer { + /* each buffer needs to have a reference to its context +@@ -80,6 +81,7 @@ typedef struct V4L2Buffer { + + enum V4L2Buffer_status status; + ++ struct dmabuf_h * dmabuf[VIDEO_MAX_PLANES]; // If externally alloced dmabufs - stash other info here + } V4L2Buffer; + + /** +diff --git 
a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c +index cc4503dcae..cb95031610 100644 +--- a/libavcodec/v4l2_m2m.c ++++ b/libavcodec/v4l2_m2m.c +@@ -34,6 +34,7 @@ + #include "v4l2_context.h" + #include "v4l2_fmt.h" + #include "v4l2_m2m.h" ++#include "v4l2_req_dmabufs.h" + + static void + xlat_init(xlat_track_t * const x) +@@ -75,7 +76,7 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) + + s->capture.done = s->output.done = 0; + s->capture.name = "capture"; +- s->capture.buf_mem = V4L2_MEMORY_MMAP; ++ s->capture.buf_mem = s->db_ctl != NULL ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; + s->output.name = "output"; + s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; + atomic_init(&s->refcount, 0); +@@ -94,12 +95,14 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe) + if (v4l2_mplane_video(&cap)) { + s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ s->output.format.type = s->output.type; + return 0; + } + + if (v4l2_splane_video(&cap)) { + s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ s->output.format.type = s->output.type; + return 0; + } + +@@ -291,6 +294,7 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) + + ff_v4l2_context_release(&s->output); + ++ dmabufs_ctl_unref(&s->db_ctl); + close(s->fd); + s->fd = -1; + +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 26a7161042..0f41f94694 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -71,6 +71,8 @@ typedef struct xlat_track_s { + V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; + } xlat_track_t; + ++struct dmabufs_ctl; ++ + typedef struct V4L2m2mContext { + char devname[PATH_MAX]; + int fd; +@@ -124,6 +126,7 @@ typedef struct V4L2m2mContext { + /* Quirks */ + unsigned int quirks; + ++ struct dmabufs_ctl * db_ctl; + } V4L2m2mContext; + + typedef struct V4L2m2mPriv { +@@ -134,6 +137,7 @@ typedef struct V4L2m2mPriv { + + int 
num_output_buffers; + int num_capture_buffers; ++ const char * dmabuf_alloc; + enum AVPixelFormat pix_fmt; + } V4L2m2mPriv; + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 73ce427052..663cb60a60 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -41,6 +41,7 @@ + #include "v4l2_context.h" + #include "v4l2_m2m.h" + #include "v4l2_fmt.h" ++#include "v4l2_req_dmabufs.h" + + // Pick 64 for max last count - that is >1sec at 60fps + #define STATS_LAST_COUNT_MAX 64 +@@ -896,6 +897,20 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + s->output_drm = 0; + } + ++ s->db_ctl = NULL; ++ if (priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) { ++ if (strcmp(priv->dmabuf_alloc, "cma") == 0) ++ s->db_ctl = dmabufs_ctl_new(); ++ else { ++ av_log(avctx, AV_LOG_ERROR, "Unknown dmabuf alloc method: '%s'\n", priv->dmabuf_alloc); ++ return AVERROR(EINVAL); ++ } ++ if (!s->db_ctl) { ++ av_log(avctx, AV_LOG_ERROR, "Can't open dmabuf provider '%s'\n", priv->dmabuf_alloc); ++ return AVERROR(ENOMEM); ++ } ++ } ++ + s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); + if (!s->device_ref) { + ret = AVERROR(ENOMEM); +@@ -1000,6 +1015,7 @@ static const AVOption options[] = { + { "num_capture_buffers", "Number of buffers in the capture context", + OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS }, + { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, ++ { "dmabuf_alloc", "Dmabuf alloc method", OFFSET(dmabuf_alloc), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, + { NULL}, + }; + + +From a6fb8bbf969abb06e18ac5b3fd5ad4b24791f228 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Tue, 13 Dec 2022 19:05:47 +0000 +Subject: [PATCH 110/113] testfilt: Skeleton of hw filter test code + +--- + pi-util/testfilt.py | 83 +++++++++++++++++++++++++++++++++++++++++++++ + 1 
file changed, 83 insertions(+) + create mode 100755 pi-util/testfilt.py + +diff --git a/pi-util/testfilt.py b/pi-util/testfilt.py +new file mode 100755 +index 0000000000..b322dac0c2 +--- /dev/null ++++ b/pi-util/testfilt.py +@@ -0,0 +1,83 @@ ++#!/usr/bin/env python3 ++ ++import string ++import os ++import subprocess ++import re ++import argparse ++import sys ++import csv ++from stat import * ++ ++class validator: ++ def __init__(self): ++ self.ok = False ++ ++ def isok(self): ++ return self.ok ++ ++ def setok(self): ++ self.ok = True ++ ++class valid_regex(validator): ++ def __init__(self, regex): ++ super().__init__() ++ self.regex = re.compile(regex) ++ ++ def scanline(self, line): ++ if self.isok() or self.regex.search(line): ++ self.setok() ++ ++ ++def validate(validators, flog): ++ for line in flog: ++ for v in validators: ++ v.scanline(line) ++ ++ ok = True ++ for v in validators: ++ if not v.isok(): ++ ok = False ++ # complain ++ print("Test failed") ++ ++ if ok: ++ print("OK") ++ return ok ++ ++def runtest(name, ffmpeg, args, suffix, validators): ++ log_root = os.path.join("/tmp", "testfilt", name) ++ ofilename = os.path.join(log_root, name + suffix) ++ ++ if not os.path.exists(log_root): ++ os.makedirs(log_root) ++ ++ try: ++ os.remove(ofilename) ++ except: ++ pass ++ ++ flog = open(os.path.join(log_root, name + ".log"), "wb") ++ ffargs = [ffmpeg] + args + [ofilename] ++ ++ subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT, text=False) ++ flog.close ++ ++ flog = open(os.path.join(log_root, name + ".log"), "rt") ++ return validate(validators, flog) ++ ++def sayok(log_root, flog): ++ print("Woohoo") ++ return True ++ ++if __name__ == '__main__': ++ ++ argp = argparse.ArgumentParser(description="FFmpeg filter tester") ++ argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name") ++ args = argp.parse_args() ++ ++ runtest("ATest", args.ffmpeg, ["-v", "verbose", "-no_cvt_hw", "-an", "-c:v", "h264_v4l2m2m", "-i", ++ 
"/home/johncox/server/TestMedia/Sony/jellyfish-10-mbps-hd-h264.mkv", ++# "/home/jc/rpi/streams/jellyfish-3-mbps-hd-h264.mkv", ++ "-c:v", "h264_v4l2m2m", "-b:v", "2M"], ".mkv", ++ [valid_regex(r'Output stream #0:0 \(video\): 900 frames encoded; 900 packets muxed')]) + +From f3e75d79420faff8244d53cf60b555df4d1549f3 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Thu, 5 Jan 2023 14:39:30 +0000 +Subject: [PATCH 111/113] pixfmt: Add a #define to indicate presence of SAND + formats + +--- + libavutil/pixfmt.h | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h +index b0dae0fe83..c917e5ac62 100644 +--- a/libavutil/pixfmt.h ++++ b/libavutil/pixfmt.h +@@ -350,6 +350,8 @@ enum AVPixelFormat { + AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian + AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian + // RPI - not on ifdef so can be got at by calling progs ++// #define so code that uses this can know it is there ++#define AVUTIL_HAVE_PIX_FMT_SAND 1 + AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding + AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + +From 1f8906af1d2a493b1b594e7329cdef219e1660f8 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Wed, 11 Jan 2023 16:30:37 +0000 +Subject: [PATCH 112/113] v4l2_m2m_dec: Fix initial pkt send if no extradata + +--- + libavcodec/v4l2_m2m_dec.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 663cb60a60..1668945cff 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -240,7 +240,7 @@ copy_extradata(AVCodecContext * const avctx, + else + len = src_len < 0 ? 
AVERROR(EINVAL) : src_len; + +- // Zero length is OK but we swant to stop - -ve is error val ++ // Zero length is OK but we want to stop - -ve is error val + if (len <= 0) + return len; + +@@ -525,7 +525,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const + + if (s->extdata_sent) + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); +- else if (s->extdata_data) ++ else + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); + + if (ret == AVERROR(EAGAIN)) { + +From 1beaf80a9f23aa703c50d1882117ccf1de8a3eb4 Mon Sep 17 00:00:00 2001 +From: John Cox +Date: Mon, 16 Jan 2023 16:05:09 +0000 +Subject: [PATCH 113/113] v4l2m2m_dec: Make capture timeout long once pending + count > 31 + +For some applications (ffmpeg command line) the current heuristic of adding +a short timeout and preferring DQ over Q once we think we have buffers +Qed in V4L2 is insufficient to prevent arbitrary buffer growth. +Unfortunately the current method of guessing the number of Qed buffers isn't +reliable enough to allow for a long timeout with only a few few buffers +believed pending so only do so once the number of buffers believed pending +exceeds plausible inaccuracies caused by buffer reordering. + +The limit could be optimised by codec or apparent latency but a simple +number should reduce the unexpected consequences. 
+--- + libavcodec/v4l2_m2m.h | 3 ++- + libavcodec/v4l2_m2m_dec.c | 18 ++++++++++++++---- + 2 files changed, 16 insertions(+), 5 deletions(-) + +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 0f41f94694..ded1478a49 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -66,7 +66,7 @@ typedef struct pts_stats_s + + typedef struct xlat_track_s { + unsigned int track_no; +- int64_t last_pts; ++ int64_t last_pts; // Last valid PTS decoded + int64_t last_opaque; + V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; + } xlat_track_t; +@@ -88,6 +88,7 @@ typedef struct V4L2m2mContext { + + /* null frame/packet received */ + int draining; ++ int running; + AVPacket buf_pkt; + + /* Reference to a frame. Only used during encoding */ +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 1668945cff..32d5548707 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -582,7 +582,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + + do { + const int pending = xlat_pending(&s->xlat); +- const int prefer_dq = (pending > 3); ++ const int prefer_dq = (pending > 4); + const int last_src_rv = src_rv; + + av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt); +@@ -611,10 +611,14 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + // (b) enqueue returned a status indicating that decode should be attempted + if (dst_rv != 0 && TRY_DQ(src_rv)) { + // Pick a timeout depending on state ++ // The pending count isn't completely reliable so it is good enough ++ // hint that we want a frame but not good enough to require it in ++ // all cases; however if it has got > 31 that exceeds its margin of ++ // error so require a frame to prevent ridiculous levels of latency + const int t = + src_rv == NQ_Q_FULL ? -1 : + src_rv == NQ_DRAINING ? 300 : +- prefer_dq ? 5 : 0; ++ prefer_dq ? (s->running && pending > 31 ? 
100 : 5) : 0; + + // Dequeue frame will unref any previous contents of frame + // if it returns success so we don't need an explicit unref +@@ -631,8 +635,13 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + } + } + +- if (dst_rv == 0) ++ if (dst_rv == 0) { + set_best_effort_pts(avctx, &s->pts_stat, frame); ++ if (!s->running) { ++ s->running = 1; ++ av_log(avctx, AV_LOG_VERBOSE, "Decode running\n"); ++ } ++ } + + if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { + av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); +@@ -998,7 +1007,8 @@ static void v4l2_decode_flush(AVCodecContext *avctx) + + // resend extradata + s->extdata_sent = 0; +- // clear EOS status vars ++ // clear status vars ++ s->running = 0; + s->draining = 0; + output->done = 0; + capture->done = 0; From c57e7bc2833c31d6e89d194505ebf7e6d384786b Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Thu, 16 Feb 2023 10:01:55 +0100 Subject: [PATCH 11/12] ffmpeg: drop kodi patch Signed-off-by: Matthias Reichl --- packages/multimedia/ffmpeg/package.mk | 2 +- .../ffmpeg/patches/kodi/ffmpeg-001-kodi.patch | 215 ------------------ 2 files changed, 1 insertion(+), 216 deletions(-) delete mode 100644 packages/multimedia/ffmpeg/patches/kodi/ffmpeg-001-kodi.patch diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index 38bdf869f5..5080b803d0 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -10,7 +10,7 @@ PKG_SITE="https://ffmpeg.org" PKG_URL="http://ffmpeg.org/releases/ffmpeg-${PKG_VERSION}.tar.xz" PKG_DEPENDS_TARGET="toolchain zlib bzip2 openssl speex" PKG_LONGDESC="FFmpeg is a complete, cross-platform solution to record, convert and stream audio and video." 
-PKG_PATCH_DIRS="kodi libreelec" +PKG_PATCH_DIRS="libreelec" case "${PROJECT}" in Amlogic) diff --git a/packages/multimedia/ffmpeg/patches/kodi/ffmpeg-001-kodi.patch b/packages/multimedia/ffmpeg/patches/kodi/ffmpeg-001-kodi.patch deleted file mode 100644 index c1aefeac04..0000000000 --- a/packages/multimedia/ffmpeg/patches/kodi/ffmpeg-001-kodi.patch +++ /dev/null @@ -1,215 +0,0 @@ -From c4b5aa630053c59eac2c2fe52071cd26c570107a Mon Sep 17 00:00:00 2001 -From: marc -Date: Mon, 18 Feb 2013 17:18:18 +0000 -Subject: [PATCH 1/6] dxva-h264: Fix an AMD driver issue with playback of - streams that don't start with an I-Frame - ---- - libavcodec/dxva2_h264.c | 8 ++++++++ - libavcodec/h264_slice.c | 1 + - libavcodec/h264dec.c | 1 + - libavcodec/h264dec.h | 2 ++ - 4 files changed, 12 insertions(+) - -diff --git a/libavcodec/dxva2_h264.c b/libavcodec/dxva2_h264.c -index 6300b1418d..9e53355fae 100644 ---- a/libavcodec/dxva2_h264.c -+++ b/libavcodec/dxva2_h264.c -@@ -506,6 +506,14 @@ static int dxva2_h264_end_frame(AVCodecContext *avctx) - - if (ctx_pic->slice_count <= 0 || ctx_pic->bitstream_size <= 0) - return -1; -+ -+ // Wait for an I-frame before start decoding. 
Workaround for ATI UVD and UVD+ GPUs -+ if (!h->got_first_iframe) { -+ if (!(ctx_pic->pp.wBitFields & (1 << 15))) -+ return -1; -+ h->got_first_iframe = 1; -+ } -+ - ret = ff_dxva2_common_end_frame(avctx, h->cur_pic_ptr->f, - &ctx_pic->pp, sizeof(ctx_pic->pp), - &ctx_pic->qm, sizeof(ctx_pic->qm), -diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c -index d56722a5c2..a94a5a1784 100644 ---- a/libavcodec/h264_slice.c -+++ b/libavcodec/h264_slice.c -@@ -971,6 +971,7 @@ static int h264_slice_header_init(H264Context *h) - - h->first_field = 0; - h->prev_interlaced_frame = 1; -+ h->got_first_iframe = 0; - - init_scan_tables(h); - ret = ff_h264_alloc_tables(h); -diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c -index 2a5b53ea56..8689b462d1 100644 ---- a/libavcodec/h264dec.c -+++ b/libavcodec/h264dec.c -@@ -448,6 +448,7 @@ void ff_h264_flush_change(H264Context *h) - - h->next_outputed_poc = INT_MIN; - h->prev_interlaced_frame = 1; -+ h->got_first_iframe = 0; - idr(h); - - h->poc.prev_frame_num = -1; -diff --git a/libavcodec/h264dec.h b/libavcodec/h264dec.h -index 9a1ec1bace..eab495cdd0 100644 ---- a/libavcodec/h264dec.h -+++ b/libavcodec/h264dec.h -@@ -532,6 +532,8 @@ typedef struct H264Context { - * slices) anymore */ - int setup_finished; - -+ int got_first_iframe; -+ - int cur_chroma_format_idc; - int cur_bit_depth_luma; - int16_t slice_row[MAX_SLICES]; ///< to detect when MAX_SLICES is too low - -From df996a4c35b85b61c73fa7cabc587299ed6b3957 Mon Sep 17 00:00:00 2001 -From: Rechi -Date: Tue, 21 Nov 2017 08:16:53 +0100 -Subject: [PATCH 2/6] use Kodi as extra version - ---- - Makefile | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/Makefile b/Makefile -index 61f79e27ae..7ab083fd70 100644 ---- a/Makefile -+++ b/Makefile -@@ -137,7 +137,7 @@ GIT_LOG = $(SRC_PATH)/.git/logs/HEAD - .version: M=@ - - libavutil/ffversion.h .version: -- $(M)$(VERSION_SH) $(SRC_PATH) libavutil/ffversion.h $(EXTRA_VERSION) -+ $(M)$(VERSION_SH) $(SRC_PATH) 
libavutil/ffversion.h Kodi - $(Q)touch .version - - # force version.sh to run whenever version might have changed - -From e02f5681f78b18d886af512d0ad5d553faa8968d Mon Sep 17 00:00:00 2001 -From: Rechi -Date: Tue, 21 Nov 2017 08:16:53 +0100 -Subject: [PATCH 3/6] common.mak: never ignore an error if strip doesn't - succeed - ---- - ffbuild/common.mak | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/ffbuild/common.mak b/ffbuild/common.mak -index f52473453e..85f1d43bb8 100644 ---- a/ffbuild/common.mak -+++ b/ffbuild/common.mak -@@ -101,7 +101,7 @@ COMPILE_LASX = $(call COMPILE,CC,LASXFLAGS) - - %.o: %.asm - $(COMPILE_X86ASM) -- -$(if $(ASMSTRIPFLAGS), $(STRIP) $(ASMSTRIPFLAGS) $@) -+ $(if $(STRIP), $(if $(ASMSTRIPFLAGS), $(STRIP) $(ASMSTRIPFLAGS) $@)) - - %.o: %.rc - $(WINDRES) $(IFLAGS) $(foreach ARG,$(CC_DEPFLAGS),--preprocessor-arg "$(ARG)") -o $@ $< - -From 43ad570ce0dd168bdcc206244594302f922d95f2 Mon Sep 17 00:00:00 2001 -From: wsnipex -Date: Tue, 21 Nov 2017 08:16:53 +0100 -Subject: [PATCH 4/6] only check for a git rev if the src tree is in a git repo - -fixes the version string when building from the kodi depends src tree ---- - ffbuild/version.sh | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/ffbuild/version.sh b/ffbuild/version.sh -index edc4dd33c5..239a138ca7 100755 ---- a/ffbuild/version.sh -+++ b/ffbuild/version.sh -@@ -2,6 +2,7 @@ - - # Usage: version.sh - -+if [ -d $1/.git ]; then # only check for a git rev, if the src tree is in a git repo - # check for git short hash - if ! 
test "$revision"; then - if (cd "$1" && grep git RELEASE 2> /dev/null >/dev/null) ; then -@@ -27,6 +28,7 @@ if [ -z "$revision" ]; then - git_hash="${srcdir##*-}";; - esac - fi -+fi - - # no revision number found - test "$revision" || revision=$(cd "$1" && cat RELEASE 2> /dev/null) - -From fec4c2031296c414685477c75e6a4bdff105a3ac Mon Sep 17 00:00:00 2001 -From: Anton Fedchin -Date: Fri, 11 Jan 2019 10:47:43 +0100 -Subject: [PATCH 5/6] after 153b36f there is a possibility to crash when trying - to get index of a surface which points to nirvana. - -it may occurs when a stream starts with non i-frame. ---- - libavcodec/dxva2.c | 10 ++++++---- - 1 file changed, 6 insertions(+), 4 deletions(-) - -diff --git a/libavcodec/dxva2.c b/libavcodec/dxva2.c -index 568d686f39..735127f83a 100644 ---- a/libavcodec/dxva2.c -+++ b/libavcodec/dxva2.c -@@ -777,16 +777,18 @@ unsigned ff_dxva2_get_surface_index(const AVCodecContext *avctx, - #if CONFIG_D3D11VA - if (avctx->pix_fmt == AV_PIX_FMT_D3D11) - return (intptr_t)frame->data[1]; -- if (avctx->pix_fmt == AV_PIX_FMT_D3D11VA_VLD) { -+ if (avctx->pix_fmt == AV_PIX_FMT_D3D11VA_VLD && surface) { - D3D11_VIDEO_DECODER_OUTPUT_VIEW_DESC viewDesc; - ID3D11VideoDecoderOutputView_GetDesc((ID3D11VideoDecoderOutputView*) surface, &viewDesc); - return viewDesc.Texture2D.ArraySlice; - } - #endif - #if CONFIG_DXVA2 -- for (i = 0; i < DXVA_CONTEXT_COUNT(avctx, ctx); i++) { -- if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD && ctx->dxva2.surface[i] == surface) -- return i; -+ if (avctx->pix_fmt == AV_PIX_FMT_DXVA2_VLD) { -+ for (i = 0; i < DXVA_CONTEXT_COUNT(avctx, ctx); i++) { -+ if (ctx->dxva2.surface[i] == surface) -+ return i; -+ } - } - #endif - - -From ca8882fcaf5da0192772733a7ba832765df7c391 Mon Sep 17 00:00:00 2001 -From: Rainer Hochecker -Date: Sat, 26 Jan 2019 19:48:35 +0100 -Subject: [PATCH 6/6] avcodec/vaapi_h264: skip decode if pic has no slices - -This fixes / workarounds https://bugs.freedesktop.org/show_bug.cgi?id=105368. 
-It was hit frequently when watching h264 channels received via DVB-X. -Corresponding kodi bug: https://github.com/xbmc/xbmc/issues/15704 ---- - libavcodec/vaapi_h264.c | 5 +++++ - 1 file changed, 5 insertions(+) - -diff --git a/libavcodec/vaapi_h264.c b/libavcodec/vaapi_h264.c -index 9332aa6f31..d4494beebf 100644 ---- a/libavcodec/vaapi_h264.c -+++ b/libavcodec/vaapi_h264.c -@@ -314,6 +314,11 @@ static int vaapi_h264_end_frame(AVCodecContext *avctx) - H264SliceContext *sl = &h->slice_ctx[0]; - int ret; - -+ if (pic->nb_slices == 0) { -+ ret = AVERROR_INVALIDDATA; -+ goto finish; -+ } -+ - ret = ff_vaapi_decode_issue(avctx, pic); - if (ret < 0) - goto finish; From 4257240e8cc1782fa7da4b1e7e39dd26f1f7c3ae Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Thu, 16 Feb 2023 10:02:28 +0100 Subject: [PATCH 12/12] tools/ffmpeg/gen-patches.sh: drop kodi patch generation Signed-off-by: Matthias Reichl --- tools/ffmpeg/gen-patches.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/ffmpeg/gen-patches.sh b/tools/ffmpeg/gen-patches.sh index 5ce4e33615..f6d296e5e0 100755 --- a/tools/ffmpeg/gen-patches.sh +++ b/tools/ffmpeg/gen-patches.sh @@ -7,7 +7,7 @@ FFMPEG_VERSION="n5.1.2" KODI_FFMPEG_REPO="https://github.com/xbmc/FFmpeg" KODI_FFMPEG_VERSION="5.1.2-Nexus-Alpha3" -ALL_FEATURE_SETS="v4l2-drmprime v4l2-request libreelec rpi kodi vf-deinterlace-v4l2m2m" +ALL_FEATURE_SETS="v4l2-drmprime v4l2-request libreelec rpi vf-deinterlace-v4l2m2m" if [ $# -eq 0 ]; then echo "usage: $0 all|featureset [githash]" @@ -40,11 +40,6 @@ create_patch() { REPO="https://github.com/jc-kynesim/rpi-ffmpeg" REFSPEC="dev/5.1.2/rpi_import_1" ;; - kodi) - REPO="${KODI_FFMPEG_REPO}" - REFSPEC="${KODI_FFMPEG_VERSION}" - REFTYPE="tag" - ;; *) echo "illegal feature set ${FEATURE_SET}" exit 1