diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index b3fb4b36ac..325c99a41e 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -65,7 +65,7 @@ index 18d80ee87a..9e621d09c1 100755 vaguedenoiser_filter_deps="gpl" vidstabdetect_filter_deps="libvidstab" diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c -index 3ee31473dc..312864d737 100644 +index 3ee31473dc..6875200380 100644 --- a/fftools/ffmpeg.c +++ b/fftools/ffmpeg.c @@ -24,6 +24,12 @@ @@ -337,15 +337,15 @@ index 3ee31473dc..312864d737 100644 + if (de->conn != NULL) { + mmal_connection_destroy(de->conn); + } ++ if (de->rpi_pool != NULL) { ++ mmal_port_pool_destroy(de->display->input[0], de->rpi_pool); ++ } + if (de->isp != NULL) { + mmal_component_destroy(de->isp); + } + if (de->display != NULL) { + mmal_component_destroy(de->display); + } -+ if (de->rpi_pool != NULL) { -+ mmal_port_pool_destroy(de->display->input[0], de->rpi_pool); -+ } + + av_free(de); + } @@ -388,7 +388,7 @@ index 3ee31473dc..312864d737 100644 } void remove_avoptions(AVDictionary **a, AVDictionary *b) -@@ -1052,6 +1321,15 @@ static void do_video_out(OutputFile *of, +@@ -1052,6 +1321,17 @@ static void do_video_out(OutputFile *of, if (ost->source_index >= 0) ist = input_streams[ost->source_index]; @@ -396,7 +396,9 @@ index 3ee31473dc..312864d737 100644 + if (next_picture && ist != NULL) + { + if (rpi_display_env == NULL) -+ rpi_display_env = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); ++ rpi_display_env = display_init(next_picture->format, 0, 0, ++ next_picture->width - next_picture->crop_right, ++ next_picture->height - next_picture->crop_bottom); + display_frame(ist->dec_ctx, rpi_display_env, next_picture); + } +#endif @@ -404,7 +406,7 @@ index 3ee31473dc..312864d737 100644 frame_rate = av_buffersink_get_frame_rate(filter); if (frame_rate.num > 0 && frame_rate.den > 0) duration = 1/(av_q2d(frame_rate) * av_q2d(enc->time_base)); -@@ -2165,8 +2443,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) +@@ -2165,8 +2445,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) ifilter->channel_layout != frame->channel_layout; break; case AVMEDIA_TYPE_VIDEO: @@ -415,7 +417,7 @@ index 3ee31473dc..312864d737 100644 break; } -@@ -2896,6 +3174,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) +@@ -2896,6 +3176,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ist->dec_ctx->opaque = ist; ist->dec_ctx->get_format = get_format; ist->dec_ctx->get_buffer2 = get_buffer; @@ -4944,10 +4946,10 @@ index 0000000000..7dfcc2751a + diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S new file mode 100644 -index 0000000000..b56dc8ccc5 +index 0000000000..12ffc5708a --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S -@@ -0,0 +1,2156 @@ +@@ -0,0 +1,2199 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi + * 2017 John Cox (for Raspberry Pi) @@ -4974,45 +4976,72 @@ index 0000000000..b56dc8ccc5 + +.set EDGE_SRC_STRIDE, 160 + -+.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128 -+ vshr.u8 q12, q8, #3 -+ vadd.s8 q8, \Q_K128 -+ vshr.u8 q13, q9, #3 -+ vadd.s8 q9, \Q_K128 ++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4 ++ vshr.u8 q12, q8, #3 ++ \I1 ++ vadd.i8 q8, \Q_K128 ++ \I2 ++ vshr.u8 q13, q9, #3 ++ \I3 ++ vadd.i8 q9, \Q_K128 ++ \I4 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 + -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT0, d25 -+ vtbl.8 d26, \XLAT1, d26 -+ vtbl.8 d27, \XLAT1, d27 ++ vqadd.s8 q8, q12 ++ vshr.u8 q12, q10, #3 ++ vadd.i8 q10, \Q_K128 ++ vqadd.s8 q9, q13 ++ vshr.u8 q13, q11, #3 ++ vadd.i8 q11, \Q_K128 + -+ vqadd.s8 q8, q12 -+ vshr.u8 q12, q10, #3 -+ vadd.s8 q10, \Q_K128 -+ vqadd.s8 q9, q13 -+ vshr.u8 q13, q11, #3 -+ vadd.s8 q11, \Q_K128 -+ -+ vsub.s8 q8, \Q_K128 -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT0, d25 -+ vsub.s8 q9, \Q_K128 -+ vtbl.8 d26, \XLAT1, d26 -+ vtbl.8 d27, \XLAT1, d27 -+ vqadd.s8 q10, q12 -+ vqadd.s8 q11, q13 -+ vsub.s8 q10, \Q_K128 -+ vsub.s8 q11, \Q_K128 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vqadd.s8 q10, q12 ++ vsub.i8 q8, \Q_K128 ++ vqadd.s8 q11, q13 ++ vsub.i8 q9, \Q_K128 ++ vsub.i8 q10, \Q_K128 ++ vsub.i8 q11, \Q_K128 +.endm + -+.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128 -+ vshr.u8 q12, q8, #3 -+ vadd.s8 q8, \Q_K128 -+ -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT1, d25 -+ -+ vqadd.s8 q8, q12 -+ vsub.s8 q8, \Q_K128 ++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4 ++ \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vadd.i8 q12, q8, \Q_K128 ++ vshr.u8 q8, #3 ++ vtbl.8 d16, \XLAT0, d16 ++ vtbl.8 d17, \XLAT1, d17 ++ vqadd.s8 q12, q8 ++ bmi 2f ++1: \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vsub.i8 q13, q12, \Q_K128 ++ vadd.i8 q12, q8, \Q_K128 ++ vshr.u8 q8, #3 ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++ vtbl.8 d16, \XLAT0, d16 ++ vtbl.8 d17, \XLAT1, d17 ++ vqadd.s8 q12, q8 ++ bpl 1b ++2: vsub.i8 q13, q12, \Q_K128 ++ \S1 ++ \S2 ++ \S3 ++ \S4 +.endm + + @@ -5028,13 +5057,15 @@ index 0000000000..b56dc8ccc5 +.endm + +@ Clobbers q12, q13 -+.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth ++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2 + vshrn.i16 d24, \Q0, #(\bit_depth - 5) + vshrn.i16 d25, \Q1, #(\bit_depth - 5) + vshrn.i16 d26, \Q2, #(\bit_depth - 5) -+ vshrn.i16 d27, \Q3, #(\bit_depth - 5) ++ \I1 + vtbl.8 d24, \XLAT0, d24 ++ vshrn.i16 d27, \Q3, #(\bit_depth - 5) + vtbl.8 d25, \XLAT1, d25 ++ \I2 + vtbl.8 d26, \XLAT0, d26 + vtbl.8 d27, \XLAT1, d27 + vaddw.s8 \Q0, d24 @@ -5044,18 +5075,48 @@ index 0000000000..b56dc8ccc5 + clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX +.endm + -+@ Clobbers q12 -+.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth -+ vshrn.i16 d24, \Q0, #(\bit_depth - 5) -+ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++@ Clobbers q10, q11, q12 ++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4 ++ \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vshrn.i16 d24, \Q0, #\bit_depth - 5 ++ vshrn.i16 d25, \Q1, #\bit_depth - 5 + vtbl.8 d24, \XLAT0, d24 + vtbl.8 d25, \XLAT1, d25 -+ vaddw.s8 \Q0, d24 -+ vaddw.s8 \Q1, d25 -+ vmax.s16 \Q0, \Q_MIN -+ vmax.s16 \Q1, \Q_MIN -+ vmin.s16 \Q0, \Q_MAX -+ vmin.s16 \Q1, \Q_MAX ++ vaddw.s8 q10, \Q0, d24 ++ vaddw.s8 q11, \Q1, d25 ++ bmi 2f ++1: \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vmax.s16 q10, \Q_MIN ++ vmax.s16 q11, \Q_MIN ++ vshrn.i16 d24, \Q0, #\bit_depth - 5 ++ vshrn.i16 d25, \Q1, #\bit_depth - 5 ++ vmin.s16 q10, \Q_MAX ++ vmin.s16 q11, \Q_MAX ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vaddw.s8 q10, \Q0, d24 ++ vaddw.s8 q11, \Q1, d25 ++ bpl 1b ++2: vmax.s16 q10, \Q_MIN ++ vmax.s16 q11, \Q_MIN ++ vmin.s16 q10, \Q_MAX ++ vmin.s16 q11, \Q_MAX ++ \S1 ++ \S2 ++ \S3 ++ \S4 +.endm + + @@ -5072,85 +5133,59 @@ index 0000000000..b56dc8ccc5 +@ +@ It also loads other common regs + ++@ Beware that the offset read here overrreads by 6 bytes so source must be sized appropriately +function band_load_y ++ ldr ip, [sp, #16] @ &sao_offset_val[0] ++ ldr r4, [sp, #20] @ sao_left_class ++ vmov.i64 d4, #0 + vmov.i64 q0, #0 -+ ldr r12, [sp, #8] @ &sao_offset_val[0] -+ add r12, #2 @ 1st interesting val is [1] -+ vld1.16 {d16}, [r12] @ Unaligned -+ vmov.i64 q1, #0 -+ ldr r12, [sp, #12] @ sao_left_class -+ -+ mov r4, sp -+ sub sp, #32 -+ and sp, #~63 @ Align stack so we can wrap with a simple AND -+ vst1.8 {q0, q1}, [sp, :256] @ Put zero array on stack -+ add r12, sp -+ vst1.8 {d16[0]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[2]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[4]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[6]}, [r12] -+ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array -+ mov sp, r4 -+ -+ ldr r12, [sp, #20] @ height + pld [r1] -+ -+ sub r12, #1 ++ vld2.8 {q8}, [ip] ++ sub ip, sp, #8*5 ++ vmov.i64 q1, #0 ++ add r4, ip, r4 ++ vpush {d0-d4} @ Put zero array on stack ++ vshr.u64 d16, d16, #8 @ 1st interesting val is [1] ++ ldr ip, [ip, #8*5 + 28] @ height ++ vst1.32 {d16[0]}, [r4] + add r4, r1, r3 ++ vpop {d0-d4} @ Pop modified array ++ sub ip, ip, #1 ++ vorr d0, d0, d4 + bx lr +endfunc + -+ ++@ Beware that offset reads here overrread by 6 bytes so source must be sized appropriately +function band_load_c -+ vmov.i64 q2, #0 -+ ldr r12, [sp, #8] @ &sao_offset_val1[0] -+ add r12, #2 @ 1st interesting val is [1] -+ vld1.16 {d16}, [r12] @ Unaligned -+ vmov.i64 q3, #0 -+ ldr r12, [sp, #12] @ sao_left_class -+ -+ mov r4, sp @ Remember SP -+ sub sp, #32 -+ and sp, #~63 @ Align stack so we can wrap with a simple AND -+ -+ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack -+ add r12, sp -+ vst1.8 {d16[0]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[2]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[4]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[6]}, [r12] -+ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array -+ -+ @ And again for the 2nd set -+ ldr r12, [r4, #16] @ &sao_offset_val2[0] -+ add r12, #2 @ 1st interesting val is [1] -+ vld1.16 {d16}, [r12] @ Unaligned -+ ldr r12, [r4, #20] @ sao_left_class2 -+ -+ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack (again) -+ add r12, sp -+ vst1.8 {d16[0]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[2]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[4]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[6]}, [r12] -+ vld1.8 {q2, q3}, [sp, :256] @ Pop modified array -+ -+ mov sp, r4 -+ -+ ldr r12, [sp, #28] @ height ++ ldr ip, [sp, #16] @ &sao_offset_val1[0] ++ ldr r4, [sp, #20] @ sao_left_class1 ++ vmov.i64 d24, #0 ++ vmov.i64 q10, #0 + pld [r1] -+ -+ subs r12, #1 ++ vld2.8 {q8}, [ip] ++ sub ip, sp, #8*5 ++ vmov.i64 q11, #0 ++ add r4, ip, r4 ++ ldr ip, [sp, #24] @ &sao_offset_val2[0] ++ vpush {d20-d24} @ Put zero array on stack ++ vld2.8 {q9}, [ip] ++ vshr.u64 d16, d16, #8 @ 1st interesting val is [1] ++ ldr ip, [sp, #8*5 + 28] @ sao_left_class2 ++ vst1.32 {d16[0]}, [r4] ++ add ip, sp, ip ++ vshr.u64 d18, d18, #8 @ 1st interesting val is [1] ++ vldmia sp, {d0-d3} @ Load modified array ++ vldr d16, [sp, #8*4] + add r4, r1, r3 ++ vstmia sp, {d20-d24} @ Put zero array on stack (again) ++ vst1.32 {d18[0]}, [ip] ++ vorr d0, d0, d16 ++ vldmia sp, {d4-d7} @ Load modified array ++ vldr d18, [sp, #8*4] ++ ldr ip, [sp, #8*5 + 36] @ height ++ add sp, sp, #8*5 ++ vorr d4, d4, d18 ++ sub ip, ip, #1 + bx lr +endfunc + @@ -5166,24 +5201,21 @@ index 0000000000..b56dc8ccc5 +@ int height) [sp, #12] + +function ff_hevc_rpi_sao_band_64_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_y ++ push {r4-r6, lr} + vmov.u8 q15, #128 ++ bl band_load_y + -+1: subs r12, #1 -+ vldm r1, {q8-q11} -+ pld [r4] -+ add r1, r3 -+ -+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 -+ -+ it ne -+ addne r4, r3 -+ vstm r0, {q8-q11} ++1: vldmia r1, {q8-q11} ++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \ ++ "pld [r4]", \ ++ "subs ip, #1", \ ++ "it ne; addne r4, r3", \ ++ "add r1, r3" ++ vstmia r0, {q8-q11} + add r0, r2 + bpl 1b + -+ pop {r4, pc} ++ pop {r4-r6, pc} +endfunc + +@ ff_hevc_rpi_sao_band_32_neon_8 ( @@ -5197,21 +5229,25 @@ index 0000000000..b56dc8ccc5 +@ int height) [sp, #12] + +function ff_hevc_rpi_sao_band_32_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_y ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 + vmov.u8 q15, #128 ++ bl band_load_y + -+1: subs r12, #2 -+ vld1.8 { q8, q9 }, [r1, :128], r3 -+ vld1.8 {q10, q11}, [r1, :128], r3 ++1: vld1.8 { q8, q9 }, [r1, :128], r3 ++ subs ip, #2 ++ vld1.8 {q10, q11}, [r6, :128], r3 + -+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 + + vst1.8 { q8, q9 }, [r0, :128], r2 -+ vst1.8 {q10, q11}, [r0, :128], r2 ++ vst1.8 {q10, q11}, [r5, :128], r2 + bpl 1b + -+ pop {r4, pc} ++ pop {r4-r6, pc} +endfunc + +@ ff_hevc_rpi_sao_band_16_neon_8 ( @@ -5225,25 +5261,29 @@ index 0000000000..b56dc8ccc5 +@ int height) [sp, #12] + +function ff_hevc_rpi_sao_band_16_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_y ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 + vmov.u8 q15, #128 ++ bl band_load_y + -+1: subs r12, #4 -+ vld1.8 { q8}, [r1, :128], r3 -+ vld1.8 { q9}, [r1, :128], r3 ++1: vld1.8 { q8}, [r1, :128], r3 ++ subs ip, #4 ++ vld1.8 { q9}, [r6, :128], r3 + vld1.8 {q10}, [r1, :128], r3 -+ vld1.8 {q11}, [r1, :128], r3 ++ vld1.8 {q11}, [r6, :128], r3 + -+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 + + vst1.8 { q8}, [r0, :128], r2 -+ vst1.8 { q9}, [r0, :128], r2 ++ vst1.8 { q9}, [r5, :128], r2 + vst1.8 {q10}, [r0, :128], r2 -+ vst1.8 {q11}, [r0, :128], r2 ++ vst1.8 {q11}, [r5, :128], r2 + bpl 1b + -+ pop {r4, pc} ++ pop {r4-r6, pc} +endfunc + +@ ff_hevc_rpi_sao_band_8_neon_8 ( @@ -5257,39 +5297,38 @@ index 0000000000..b56dc8ccc5 +@ int height) [sp, #12] + +function ff_hevc_rpi_sao_band_8_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_y -+ ldr lr, [sp, #16] @ width ++ ldr ip, [sp, #8] @ width ++ push {r4-r6, lr} + vmov.u8 q15, #128 -+ cmp lr, #8 ++ cmp ip, #8 ++ bl band_load_y ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 + blt 4f + -+1: subs r12, #2 -+ vld1.8 {d16}, [r1, :64], r3 -+ vld1.8 {d17}, [r1, :64], r3 -+ -+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 -+ -+ vst1.8 {d16}, [r0, :64], r2 -+ vst1.8 {d17}, [r0, :64], r2 -+ bpl 1b -+ pop {r4, pc} -+ ++ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ ++ "vld1.8 {d16}, [r1, :64], r3", \ ++ "subs ip, #2", \ ++ "vld1.8 {d17}, [r6, :64], r3", \ ++ "", \ ++ "", \ ++ "vst1.8 {d26}, [r0, :64], r2", \ ++ "vst1.8 {d27}, [r5, :64], r2" ++ pop {r4-r6, pc} +4: -+1: subs r12, #4 -+ vld1.32 {d16[0]}, [r1, :32], r3 -+ vld1.32 {d16[1]}, [r1, :32], r3 -+ vld1.32 {d17[0]}, [r1, :32], r3 -+ vld1.32 {d17[1]}, [r1, :32], r3 -+ -+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 -+ -+ vst1.32 {d16[0]}, [r0, :32], r2 -+ vst1.32 {d16[1]}, [r0, :32], r2 -+ vst1.32 {d17[0]}, [r0, :32], r2 -+ vst1.32 {d17[1]}, [r0, :32], r2 -+ bpl 1b -+ pop {r4, pc} ++ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ ++ "vld1.32 {d16[0]}, [r1, :32], r3", \ ++ "subs ip, #4", \ ++ "vld1.32 {d16[1]}, [r6, :32], r3", \ ++ "vld1.32 {d17[0]}, [r1, :32], r3", \ ++ "vld1.32 {d17[1]}, [r6, :32], r3", \ ++ "vst1.32 {d26[0]}, [r0, :32], r2", \ ++ "vst1.32 {d26[1]}, [r5, :32], r2", \ ++ "vst1.32 {d27[0]}, [r0, :32], r2", \ ++ "vst1.32 {d27[1]}, [r5, :32], r2" ++ pop {r4-r6, pc} +endfunc + +@ ff_hevc_rpi_sao_band_c_32_neon_8( @@ -5305,31 +5344,25 @@ index 0000000000..b56dc8ccc5 +@ int height sp[20] + +function ff_hevc_rpi_sao_band_c_32_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_c ++ push {r4-r6, lr} ++ add r5, r0, #32 ++ add r6, r1, #32 ++ vmov.u8 q15, #128 ++ bl band_load_c + -+ vmov.i8 q15, #128 -+ sub r3, #32 -+ sub r2, #32 ++1: vld2.8 { q8, q9 }, [r1, :128], r3 ++ subs ip, #1 ++ vld2.8 {q10, q11}, [r6, :128], r3 + -+1: subs r12, #1 -+ vld2.8 { q8, q9 }, [r1, :128]! -+ vld2.8 {q10, q11}, [r1, :128], r3 -+ -+ pld [r4] -+ -+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 -+ -+ vst2.8 { q8, q9 }, [r0, :128]! -+ vst2.8 {q10, q11}, [r0, :128], r2 -+ -+ itt ne -+ addne r4, r3 -+ addne r4, #32 ++ sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \ ++ "pld [r4]", \ ++ "it ne; addne r4, r3" + ++ vst2.8 { q8, q9 }, [r0, :128], r2 ++ vst2.8 {q10, q11}, [r5, :128], r2 + bpl 1b + -+ pop {r4, pc} ++ pop {r4-r6, pc} +endfunc + +@ ff_hevc_rpi_sao_band_c_16_neon_8( @@ -5345,21 +5378,25 @@ index 0000000000..b56dc8ccc5 +@ int height sp[20] + +function ff_hevc_rpi_sao_band_c_16_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_c -+ vmov.i8 q15, #128 ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ vmov.u8 q15, #128 ++ bl band_load_c + -+1: subs r12, #2 -+ vld2.8 { q8, q9 }, [r1, :128], r3 -+ vld2.8 {q10, q11}, [r1, :128], r3 ++1: vld2.8 { q8, q9 }, [r1, :128], r3 ++ subs ip, #2 ++ vld2.8 {q10, q11}, [r6, :128], r3 + -+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ sao_band_64b_8 {d0-d3}, {d4-d7}, q15 + + vst2.8 { q8, q9 }, [r0, :128], r2 -+ vst2.8 {q10, q11}, [r0, :128], r2 -+ ++ vst2.8 {q10, q11}, [r5, :128], r2 + bpl 1b -+ pop {r4, pc} ++ ++ pop {r4-r6, pc} +endfunc + +@ ff_hevc_rpi_sao_band_c_8_neon_8( @@ -5375,35 +5412,36 @@ index 0000000000..b56dc8ccc5 +@ int height sp[20] + +function ff_hevc_rpi_sao_band_c_8_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_c -+ ldr lr, [sp, #16] @ width ++ ldr ip, [sp, #16] @ width ++ push {r4-r6, lr} + vmov.u8 q15, #128 -+ cmp lr, #8 ++ cmp ip, #8 ++ bl band_load_c + blt 4f + -+1: subs r12, #1 -+ vld2.8 {d16, d17}, [r1, :128], r3 -+ -+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 -+ -+ vst2.8 {d16, d17}, [r0, :128], r2 -+ bpl 1b -+ pop {r4, pc} -+ ++ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ ++ "vld2.8 {d16-d17}, [r1, :128], r3", \ ++ "subs ip, #1", \ ++ "", \ ++ "", \ ++ "", \ ++ "vst2.8 {d26-d27}, [r0, :128], r2" ++ pop {r4-r6, pc} +4: -+1: subs r12, #1 -+ vld1.8 {d16}, [r1, :64], r3 -+ vld1.8 {d17}, [r1, :64], r3 -+ vuzp.8 d16, d17 -+ -+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 -+ -+ vzip.8 d16, d17 -+ vst1.8 {d16}, [r0, :64], r2 -+ vst1.8 {d17}, [r0, :64], r2 -+ bpl 1b -+ pop {r4, pc} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ ++ "vld1.8 {d16}, [r1, :64], r3", \ ++ "subs ip, #2", \ ++ "vld1.8 {d17}, [r6, :64], r3", \ ++ "vuzp.8 d16, d17", \ ++ "", \ ++ "vzip.8 d26, d27", \ ++ "vst1.8 {d26}, [r0, :64], r2", \ ++ "vst1.8 {d27}, [r5, :64], r2" ++ pop {r4-r6, pc} +endfunc + + @@ -5418,24 +5456,23 @@ index 0000000000..b56dc8ccc5 +@ int height) [sp, #12] + +.macro band_64_16 bit_depth -+ push {r4, lr} -+ movw lr, #(1 << \bit_depth) - 1 ++ push {r4-r6, lr} + vmov.i64 q2, #0 -+ vdup.i16 q3, lr ++ vmov.i16 q3, #(1 << \bit_depth) - 1 + bl band_load_y + vpush {q4-q7} + -+1: subs r12, #1 -+ vldm r1, {q4-q11} -+ add r1, r3 -+ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth -+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++1: vldm r1, {q4-q11} ++ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ ++ "subs ip, #1", \ ++ "add r1, r3" ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth + vstm r0, {q4-q11} + add r0, r2 + bpl 1b + + vpop {q4-q7} -+ pop {r4, pc} ++ pop {r4-r6, pc} +.endm + +function ff_hevc_rpi_sao_band_64_neon_10, export=1 @@ -5453,21 +5490,20 @@ index 0000000000..b56dc8ccc5 +@ int height) [sp, #12] + +.macro band_32_16 bit_depth -+ push {r4, lr} -+ movw lr, #(1 << \bit_depth) - 1 ++ push {r4-r6, lr} + vmov.i64 q2, #0 -+ vdup.i16 q3, lr ++ vmov.i16 q3, #(1 << \bit_depth) - 1 + bl band_load_y + -+1: subs r12, #1 -+ vldm r1, {q8-q11} -+ add r1, r3 -+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++1: vldm r1, {q8-q11} ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ ++ "subs ip, #1", \ ++ "add r1, r3" + vstm r0, {q8-q11} + add r0, r2 + bpl 1b + -+ pop {r4, pc} ++ pop {r4-r6, pc} +.endm + +function ff_hevc_rpi_sao_band_32_neon_10, export=1 @@ -5485,21 +5521,24 @@ index 0000000000..b56dc8ccc5 +@ int height) [sp, #12] + +.macro band_16_16 bit_depth -+ push {r4, lr} -+ movw lr, #(1 << \bit_depth) - 1 ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 + vmov.i64 q14, #0 -+ vdup.i16 q15, lr ++ vmov.i16 q15, #(1 << \bit_depth) - 1 + bl band_load_y + -+1: subs r12, #2 -+ vld1.16 { q8, q9 }, [r1, :128], r3 -+ vld1.16 {q10, q11}, [r1, :128], r3 -+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++1: vld1.16 { q8, q9 }, [r1, :128], r3 ++ subs r12, #2 ++ vld1.16 {q10, q11}, [r6, :128], r3 ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth + vst1.16 { q8, q9 }, [r0, :128], r2 -+ vst1.16 {q10, q11}, [r0, :128], r2 ++ vst1.16 {q10, q11}, [r5, :128], r2 + bpl 1b + -+ pop {r4, pc} ++ pop {r4-r6, pc} +.endm + +function ff_hevc_rpi_sao_band_16_neon_10, export=1 @@ -5517,37 +5556,39 @@ index 0000000000..b56dc8ccc5 +@ int height) [sp, #12] + +.macro band_8_16 bit_depth -+ push {r4, lr} -+ movw lr, #(1 << \bit_depth) - 1 ++ ldr ip, [sp, #8] @ width ++ push {r4-r6, lr} + vmov.i64 q14, #0 -+ vdup.i16 q15, lr ++ cmp ip, #8 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 + bl band_load_y -+ ldr lr, [sp, #16] -+ cmp lr, #8 ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 + blt 4f + -+1: subs r12, #2 -+ vld1.16 { q8}, [r1, :128], r3 -+ vld1.16 { q9}, [r1, :128], r3 -+ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth -+ vst1.16 { q8}, [r0, :128], r2 -+ vst1.16 { q9}, [r0, :128], r2 -+ bpl 1b -+ pop {r4, pc} -+ ++ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ ++ "vld1.16 {q8}, [r1, :128], r3", \ ++ "subs ip, #2", \ ++ "vld1.16 {q9}, [r6, :128], r3", \ ++ "", \ ++ "", \ ++ "vst1.16 {q10}, [r0, :128], r2", \ ++ "vst1.16 {q11}, [r5, :128], r2" ++ pop {r4-r6, pc} +4: -+1: subs r12, #4 -+ vld1.16 {d16}, [r1, :64], r3 -+ vld1.16 {d17}, [r1, :64], r3 -+ vld1.16 {d18}, [r1, :64], r3 -+ vld1.16 {d19}, [r1, :64], r3 -+ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth -+ vst1.16 {d16}, [r0, :64], r2 -+ vst1.16 {d17}, [r0, :64], r2 -+ vst1.16 {d18}, [r0, :64], r2 -+ vst1.16 {d19}, [r0, :64], r2 -+ bpl 1b -+ pop {r4, pc} ++ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ ++ "vld1.16 {d16}, [r1, :64], r3", \ ++ "subs ip, #4", \ ++ "vld1.16 {d17}, [r6, :64], r3", \ ++ "vld1.16 {d18}, [r1, :64], r3", \ ++ "vld1.16 {d19}, [r6, :64], r3", \ ++ "vst1.16 {d20}, [r0, :64], r2", \ ++ "vst1.16 {d21}, [r5, :64], r2", \ ++ "vst1.16 {d22}, [r0, :64], r2", \ ++ "vst1.16 {d23}, [r5, :64], r2" ++ pop {r4-r6, pc} +.endm + +function ff_hevc_rpi_sao_band_8_neon_10, export=1 @@ -5568,39 +5609,37 @@ index 0000000000..b56dc8ccc5 +@ int height sp[20] + +.macro band_c_32_16 bit_depth -+ push {r4, lr} -+ bl band_load_c -+ vpush {q4-q7} -+ movw lr, #(1 << \bit_depth) - 1 ++ push {r4-r6, lr} ++ add r5, r0, #32 ++ add r6, r1, #32 ++ sub r2, #64 ++ sub r3, #64 + vmov.i64 q14, #0 -+ vdup.i16 q15, lr -+ sub r2, #96 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_c ++ mov lr, #64 ++ vpush {q4-q7} + -+1: subs r12, #1 ++1: vld2.16 { q4, q5 }, [r1, :128], lr ++ subs ip, #1 ++ vld2.16 { q6, q7 }, [r6, :128], lr ++ vld2.16 { q8, q9 }, [r1, :128], r3 ++ vld2.16 {q10, q11}, [r6, :128], r3 + -+ vld2.16 { q4, q5 }, [r1, :128]! -+ vld2.16 { q6, q7 }, [r1, :128]! -+ vld2.16 { q8, q9 }, [r1, :128]! -+ vld2.16 {q10, q11}, [r1, :128], r3 ++ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ ++ "pld [r4]", \ ++ "it ne; addne r4, r3" ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth + -+ pld [r4] -+ sub r1, #96 -+ -+ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth -+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth -+ -+ it ne -+ addne r4, r3 -+ -+ vst2.16 { q4, q5 }, [r0, :128]! -+ vst2.16 { q6, q7 }, [r0, :128]! -+ vst2.16 { q8, q9 }, [r0, :128]! -+ vst2.16 {q10, q11}, [r0, :128], r2 ++ vst2.16 { q4, q5 }, [r0, :128], lr ++ vst2.16 { q6, q7 }, [r5, :128], lr ++ vst2.16 { q8, q9 }, [r0, :128], r2 ++ vst2.16 {q10, q11}, [r5, :128], r2 + + bpl 1b + + vpop {q4-q7} -+ pop {r4, pc} ++ pop {r4-r6, pc} +.endm + +function ff_hevc_rpi_sao_band_c_32_neon_10, export=1 @@ -5621,26 +5660,25 @@ index 0000000000..b56dc8ccc5 +@ int height sp[20] + +.macro band_c_16_16 bit_depth -+ push {r4, lr} -+ bl band_load_c -+ movw lr, #(1 << \bit_depth) - 1 ++ push {r4-r6, lr} ++ add r5, r0, #32 ++ add r6, r1, #32 + vmov.i64 q14, #0 -+ vdup.i16 q15, lr -+ sub r2, #32 -+ sub r3, #32 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_c + -+1: subs r12, #1 ++1: vld2.16 { q8, q9 }, [r1, :128], r3 ++ subs ip, #1 ++ vld2.16 {q10, q11}, [r6, :128], r3 + -+ vld2.16 { q8, q9 }, [r1, :128]! -+ vld2.16 {q10, q11}, [r1, :128], r3 ++ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth + -+ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth -+ -+ vst2.16 { q8, q9 }, [r0, :128]! -+ vst2.16 {q10, q11}, [r0, :128], r2 ++ vst2.16 { q8, q9 }, [r0, :128], r2 ++ vst2.16 {q10, q11}, [r5, :128], r2 + + bpl 1b -+ pop {r4, pc} ++ pop {r4-r6, pc} +.endm + +function ff_hevc_rpi_sao_band_c_16_neon_10, export=1 @@ -5661,37 +5699,36 @@ index 0000000000..b56dc8ccc5 +@ int height sp[20] + +.macro band_c_8_16 bit_depth -+ push {r4, lr} -+ bl band_load_c -+ movw lr, #(1 << \bit_depth) - 1 ++ ldr ip, [sp, #16] @ width ++ push {r4-r6, lr} + vmov.i64 q14, #0 -+ vdup.i16 q15, lr -+ ldr lr, [sp, #24] @ width -+ cmp lr, #8 ++ cmp ip, #8 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_c + blt 4f + -+1: subs r12, #1 -+ vld2.16 { q8, q9 }, [r1, :128], r3 -+ -+ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth -+ -+ vst2.16 { q8, q9 }, [r0, :128], r2 -+ -+ bpl 1b -+ pop {r4, pc} -+ ++ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ ++ "vld2.16 {q8,q9}, [r1, :128], r3", \ ++ "subs ip, #1", \ ++ "", \ ++ "", \ ++ "", \ ++ "vst2.16 {q10,q11}, [r0, :128], r2" ++ pop {r4-r6, pc} +4: -+1: subs r12, #2 -+ vld2.16 {d16, d17}, [r1, :128], r3 -+ vld2.16 {d18, d19}, [r1, :128], r3 -+ -+ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth -+ -+ vst2.16 {d16, d17}, [r0, :128], r2 -+ vst2.16 {d18, d19}, [r0, :128], r2 -+ -+ bpl 1b -+ pop {r4, pc} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ ++ "vld2.16 {d16,d18}, [r1, :128], r3", \ ++ "subs ip, #2", \ ++ "vld2.16 {d17,d19}, [r6, :128], r3", \ ++ "", \ ++ "", \ ++ "vst2.16 {d20,d22}, [r0, :128], r2", \ ++ "vst2.16 {d21,d23}, [r5, :128], r2" ++ pop {r4-r6, pc} +.endm + +function ff_hevc_rpi_sao_band_c_8_neon_10, export=1 @@ -6286,8 +6323,10 @@ index 0000000000..b56dc8ccc5 + vmov.64 q0, q4 + pld [r1, r3] + vmov.64 q1, q5 ++ it le + pople {lr} + vmov.64 q2, q6 ++ it le + bxle lr + vmov.64 q3, q7 + add r0, r0, r2 @@ -6323,6 +6362,7 @@ index 0000000000..b56dc8ccc5 + vmov q1, q9 + vst1.8 {q2-q3}, [r0, :256], r2 + vmov q2, q10 ++ it le + bxle r6 + vmov q3, q11 + b 1b @@ -6342,6 +6382,7 @@ index 0000000000..b56dc8ccc5 + subs r12, #1 + // copy c to a + vmov.64 q0, q1 ++ it le + bxle r6 + // copy b to c + vmov.64 q1, q2 @@ -6450,8 +6491,10 @@ index 0000000000..b56dc8ccc5 + vldr d25, [r6, #-8] + vstmia r0, {q0-q3} + vext.8 q3, q6, q7, #16 - \pb ++ it le + pople {lr} + vext.8 q2, q5, q6, #16 - \pb ++ it le + bxle lr + vext.8 q1, q4, q5, #16 - \pb + add r6, r6, r3 @@ -6647,8 +6690,10 @@ index 0000000000..b56dc8ccc5 + vldr d24, [r6, #64] + vstmia r0, {q0-q3} + vext.8 q0, q4, q5, #\pb ++ it le + pople {lr} + vext.8 q1, q5, q6, #\pb ++ it le + bxle lr + vext.8 q2, q6, q7, #\pb + add r6, r6, r3 @@ -9750,10 +9795,10 @@ index 0000000000..0aee673d8b +#endif /* AVCODEC_RPI_HEVC_DATA_H */ diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c new file mode 100644 -index 0000000000..a1d6d56b04 +index 0000000000..5ae479dd0b --- /dev/null +++ b/libavcodec/rpi_hevc_filter.c -@@ -0,0 +1,1067 @@ +@@ -0,0 +1,1069 @@ +/* + * HEVC video decoder + * @@ -10709,7 +10754,8 @@ index 0000000000..a1d6d56b04 +// flushes and invalidates all pixel rows in [start,end-1] +static void ff_hevc_rpi_flush_buffer_lines(HEVCRpiContext *s, int start, int end, int flush_luma, int flush_chroma) +{ -+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf); + rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, + 0, start, s->ps.sps->width, end - start, ctx_vshift(s, 1), flush_luma, flush_chroma); + rpi_cache_flush_finish(rfe); @@ -10749,7 +10795,8 @@ index 0000000000..a1d6d56b04 + + // Call VPU + { -+ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); ++ vpu_qpu_job_env_t qvbuf; ++ const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf); + vpu_qpu_job_add_vpu(vqj, vpu_get_fn(s->ps.sps->bit_depth), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands + vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id); + vpu_qpu_job_finish(vqj); @@ -11780,10 +11827,10 @@ index 0000000000..4b4d032a16 +#endif /* AVCODEC_RPI_HEVC_PARSE_H */ diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c new file mode 100644 -index 0000000000..d28ae0ec92 +index 0000000000..93fc26de88 --- /dev/null +++ b/libavcodec/rpi_hevc_ps.c -@@ -0,0 +1,1756 @@ +@@ -0,0 +1,1765 @@ +/* + * HEVC Parameter Set decoding + * @@ -13349,6 +13396,15 @@ index 0000000000..d28ae0ec92 + pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1; + + pps->pic_init_qp_minus26 = get_se_golomb(gb); ++ if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + sps->qp_bd_offset)) { ++ av_log(avctx, AV_LOG_ERROR, ++ "init_qp_minus26 %d is outside the valid range " ++ "[%d, %d].\n", ++ pps->pic_init_qp_minus26, ++ -(26 + sps->qp_bd_offset), 25); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } + + pps->constrained_intra_pred_flag = get_bits1(gb); + pps->transform_skip_enabled_flag = get_bits1(gb); @@ -13542,10 +13598,10 @@ index 0000000000..d28ae0ec92 +} diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h new file mode 100644 -index 0000000000..989f8953b4 +index 0000000000..96c3739c4f --- /dev/null +++ b/libavcodec/rpi_hevc_ps.h -@@ -0,0 +1,446 @@ +@@ -0,0 +1,447 @@ +/* + * HEVC parameter set parsing + * @@ -13650,6 +13706,7 @@ index 0000000000..989f8953b4 + int * offset; + int * size; + int num_entry_point_offsets; ++ int offsets_allocated; + + int8_t slice_qp; + @@ -26424,10 +26481,10 @@ index 0000000000..56d5206827 +}; diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c new file mode 100644 -index 0000000000..0ad64f9f19 +index 0000000000..3cee92a11f --- /dev/null +++ b/libavcodec/rpi_hevcdec.c -@@ -0,0 +1,5679 @@ +@@ -0,0 +1,5705 @@ +/* + * HEVC video Decoder + * @@ -27222,6 +27279,38 @@ index 0000000000..0ad64f9f19 + * Section 5.7 + */ + ++// Realloc the entry point arrays ++static int alloc_entry_points(RpiSliceHeader * const sh, const int n) ++{ ++ if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0) ++ { ++ // Round up alloc to multiple of 32 ++ int a = (n + 31) & ~31; ++ ++ // We don't care about the previous contents so probably fastest to simply discard ++ av_freep(&sh->entry_point_offset); ++ av_freep(&sh->offset); ++ av_freep(&sh->size); ++ ++ if (a != 0) ++ { ++ sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned)); ++ sh->offset = av_malloc_array(a, sizeof(int)); ++ sh->size = av_malloc_array(a, sizeof(int)); ++ ++ if (!sh->entry_point_offset || !sh->offset || !sh->size) { ++ sh->num_entry_point_offsets = 0; ++ sh->offsets_allocated = 0; ++ return AVERROR(ENOMEM); ++ } ++ } ++ ++ sh->offsets_allocated = a; ++ } ++ ++ return 0; ++} ++ +/* free everything allocated by pic_arrays_init() */ +static void pic_arrays_free(HEVCRpiContext *s) +{ @@ -27255,9 +27344,7 @@ index 0000000000..0ad64f9f19 + av_freep(&s->horizontal_bs); + av_freep(&s->vertical_bs); + -+ av_freep(&s->sh.entry_point_offset); -+ av_freep(&s->sh.size); -+ av_freep(&s->sh.offset); ++ alloc_entry_points(&s->sh, 0); + + av_buffer_pool_uninit(&s->tab_mvf_pool); + av_buffer_pool_uninit(&s->rpl_tab_pool); @@ -27328,7 +27415,7 @@ index 0000000000..0ad64f9f19 + s->bs_width = (width >> 2) + 1; + s->bs_height = (height >> 2) + 1; + -+ s->sao = av_mallocz_array(ctb_count, sizeof(*s->sao)); ++ s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly + s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock)); + if (!s->sao || !s->deblock) + goto fail; @@ -28048,17 +28135,12 @@ index 0000000000..0ad64f9f19 + return AVERROR_INVALIDDATA; + } + -+ av_freep(&sh->entry_point_offset); -+ av_freep(&sh->offset); -+ av_freep(&sh->size); -+ sh->entry_point_offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(unsigned)); -+ sh->offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(int)); -+ sh->size = av_malloc_array(sh->num_entry_point_offsets, sizeof(int)); -+ if (!sh->entry_point_offset || !sh->offset || !sh->size) { -+ sh->num_entry_point_offsets = 0; ++ if ((ret = alloc_entry_points(sh, sh->num_entry_point_offsets)) < 0) ++ { + av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n"); -+ return AVERROR(ENOMEM); ++ return ret; + } ++ + for (i = 0; i < sh->num_entry_point_offsets; i++) { + uint32_t val_minus1 = get_bits_long(gb, offset_len); + if (val_minus1 > (1 << 28)) @@ -28071,13 +28153,7 @@ index 0000000000..0ad64f9f19 + } + sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size + } -+ if (s->threads_number > 1 && (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)) { -+ s->enable_parallel_tiles = 0; // TODO: you can enable tiles in parallel here -+ s->threads_number = 1; -+ } else -+ s->enable_parallel_tiles = 0; -+ } else -+ s->enable_parallel_tiles = 0; ++ } + } + + if (s->ps.pps->slice_header_extension_present_flag) { @@ -28251,23 +28327,23 @@ index 0000000000..0ad64f9f19 + if (s->ps.pps->cu_qp_delta_enabled_flag && !lc->tu.is_cu_qp_delta_coded) { + lc->tu.cu_qp_delta = ff_hevc_rpi_cu_qp_delta_abs(lc); + if (lc->tu.cu_qp_delta != 0) ++ { + if (ff_hevc_rpi_cu_qp_delta_sign_flag(lc) == 1) + lc->tu.cu_qp_delta = -lc->tu.cu_qp_delta; -+ lc->tu.is_cu_qp_delta_coded = 1; + -+// Was: -+// if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset / 2) || -+// if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset / 2) || -+// 2016 standard says: -+ if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset) || -+ lc->tu.cu_qp_delta > 25) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "The cu_qp_delta %d is outside the valid range " -+ "[%d, %d].\n", -+ lc->tu.cu_qp_delta, -+ -(26 + s->ps.sps->qp_bd_offset), 25); -+ return AVERROR_INVALIDDATA; ++ if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset/2) || ++ lc->tu.cu_qp_delta > (25 + s->ps.sps->qp_bd_offset/2)) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "The cu_qp_delta %d is outside the valid range " ++ "[%d, %d].\n", ++ lc->tu.cu_qp_delta, ++ -(26 + s->ps.sps->qp_bd_offset/2), ++ (25 + s->ps.sps->qp_bd_offset/2)); ++ return AVERROR_INVALIDDATA; ++ } + } ++ lc->tu.is_cu_qp_delta_coded = 1; + + ff_hevc_rpi_set_qPy(s, lc, cb_xBase, cb_yBase, log2_cb_size); + } @@ -29854,11 +29930,15 @@ index 0000000000..0ad64f9f19 + const unsigned int yb = (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0 ? + bound_b : y - ctb_size; + -+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ xl, yt, bound_r - xl, yb - yt, -+ ctx_vshift(s, 1), 1, 1); -+ rpi_cache_flush_finish(rfe); ++ if (yb > yt && bound_r > xl) ++ { ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ xl, yt, bound_r - xl, yb - yt, ++ ctx_vshift(s, 1), 1, 1); ++ rpi_cache_flush_finish(rfe); ++ } + } + + // Signal @@ -30145,9 +30225,10 @@ index 0000000000..0ad64f9f19 + +static void flush_frame(HEVCRpiContext *s,AVFrame *frame) +{ -+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+ rpi_cache_flush_finish(rfe); ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); ++ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++ rpi_cache_flush_finish(rfe); +} + +static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) @@ -30181,8 +30262,10 @@ index 0000000000..0ad64f9f19 + const HEVCRpiContext * const s = s0; + vpu_qpu_wait_h sync_y; + int pred_y, pred_c; -+ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); -+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); ++ vpu_qpu_job_env_t qvbuf; ++ const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf); ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf); + + { + const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; @@ -30314,6 +30397,7 @@ index 0000000000..0ad64f9f19 + av_freep(&jb->intra.cmds); + rpi_free_inter_pred(&jb->chroma_ip); + rpi_free_inter_pred(&jb->luma_ip); ++ av_free(jb); +} + +static void jbg_delete(HEVCRpiJobGlobal * const jbg) @@ -31766,7 +31850,6 @@ index 0000000000..0ad64f9f19 + ff_hevc_rpi_progress_kill_state(s->progress_states + i); + } + job_lc_kill(s->HEVClc); -+ av_rpi_zc_uninit(avctx); + + av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] + av_freep(&s->sao_pixel_buffer_v[0]); @@ -31787,10 +31870,6 @@ index 0000000000..0ad64f9f19 + s->ps.pps = NULL; + s->ps.vps = NULL; + -+ av_freep(&s->sh.entry_point_offset); -+ av_freep(&s->sh.offset); -+ av_freep(&s->sh.size); -+ + for (i = 1; i < s->threads_number; i++) { + if (s->sList[i] != NULL) { + av_freep(&s->sList[i]); @@ -31805,6 +31884,11 @@ index 0000000000..0ad64f9f19 + + ff_h2645_packet_uninit(&s->pkt); + ++ // This must be after we free off the DPB ++ // * If the outer code is still holding any frames hopefully it will ++ // have its own ref to zc ++ av_rpi_zc_uninit(avctx); ++ + return 0; +} + @@ -32001,7 +32085,6 @@ index 0000000000..0ad64f9f19 + + hevc_init_worker(s); + -+ s->enable_parallel_tiles = 0; + s->sei.picture_timing.picture_struct = 0; + s->eos = 1; + @@ -32109,10 +32192,10 @@ index 0000000000..0ad64f9f19 + diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h new file mode 100644 -index 0000000000..2201017cb3 +index 0000000000..fcbf102fa1 --- /dev/null +++ b/libavcodec/rpi_hevcdec.h -@@ -0,0 +1,1061 @@ +@@ -0,0 +1,1060 @@ +/* + * HEVC video decoder + * @@ -32932,7 +33015,6 @@ index 0000000000..2201017cb3 + uint16_t seq_decode; + uint16_t seq_output; + -+ int enable_parallel_tiles; + atomic_int wpp_err; + + const uint8_t *data; @@ -37322,10 +37404,10 @@ index 0000000000..b3168788d2 +#endif diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c new file mode 100644 -index 0000000000..3dfc35fa5c +index 0000000000..4a8b328236 --- /dev/null +++ b/libavcodec/rpi_qpu.c -@@ -0,0 +1,939 @@ +@@ -0,0 +1,920 @@ +#include +#include +#include @@ -37347,12 +37429,6 @@ index 0000000000..3dfc35fa5c +#include "rpi_hevc_transform10.h" +#include "libavutil/rpi_sand_fns.h" + -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" -+#include "interface/vmcs_host/vc_vchi_gpuserv.h" -+#pragma GCC diagnostic pop -+ +// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No) +#define RPI_TRACE_TIME_VPU_QPU_WAIT 0 + @@ -37420,16 +37496,7 @@ index 0000000000..3dfc35fa5c + short transMatrix2even[16*16*2]; +}; + -+#define CFE_ENTS_PER_A 8 -+// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices -+// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70 -+// allow 128 -+#define CFE_ENT_COUNT 128 -+#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) -+ +struct rpi_cache_flush_env_s { -+// unsigned int n; -+// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; + struct vcsm_user_clean_invalid2_s v; +}; + @@ -37777,23 +37844,18 @@ index 0000000000..3dfc35fa5c +// +// Cache flush functions + -+#define CACHE_EL_MAX 16 ++#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s)) + -+rpi_cache_flush_env_t * rpi_cache_flush_init() ++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf) +{ -+ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) + -+ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX); -+ if (rfe == NULL) -+ return NULL; -+ ++ rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf; + rfe->v.op_count = 0; + return rfe; +} + +void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) +{ -+ if (rfe != NULL) -+ free(rfe); ++ // Nothing needed +} + +int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe) @@ -37814,7 +37876,6 @@ index 0000000000..3dfc35fa5c +{ + int rc = rpi_cache_flush_execute(rfe);; + -+ free(rfe); + return rc; +} + @@ -37944,7 +38005,8 @@ index 0000000000..3dfc35fa5c +// Call this to clean and invalidate a region of memory +void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode) +{ -+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); + rpi_cache_flush_add_gm_ptr(rfe, p, mode); + rpi_cache_flush_finish(rfe); +} @@ -38054,26 +38116,22 @@ index 0000000000..3dfc35fa5c +#define VPU_QPU_MASK_QPU 1 +#define VPU_QPU_MASK_VPU 2 + -+#define VPU_QPU_JOB_MAX 4 -+struct vpu_qpu_job_env_s -+{ -+ unsigned int n; -+ unsigned int mask; -+ struct gpu_job_s j[VPU_QPU_JOB_MAX]; -+}; -+ +typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; + -+vpu_qpu_job_env_t * vpu_qpu_job_new(void) ++vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf) +{ -+ vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t)); ++// vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t)); ++ vpu_qpu_job_env_t * vqj = buf; ++// memset(vqj, 0, sizeof(*vqj)); ++ vqj->n = 0; ++ vqj->mask = 0; + return vqj; +} + +void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj) +{ -+ memset(vqj, 0, sizeof(*vqj)); -+ free(vqj); ++// memset(vqj, 0, sizeof(*vqj)); ++// free(vqj); +} + +static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj) @@ -38091,6 +38149,8 @@ index 0000000000..3dfc35fa5c + vqj->mask |= VPU_QPU_MASK_VPU; + + j->command = EXECUTE_VPU; ++ j->callback.func = 0; ++ j->callback.cookie = NULL; + // The bottom two bits of the execute address contain no-flush flags + // b0 will flush the VPU I-cache if unset so we nearly always want that set + // as we never reload code @@ -38113,6 +38173,9 @@ index 0000000000..3dfc35fa5c + vqj->mask |= VPU_QPU_MASK_QPU; + + j->command = EXECUTE_QPU; ++ j->callback.func = 0; ++ j->callback.cookie = NULL; ++ + j->u.q.jobs = n; +#if RPI_TRACE_QPU_PROFILE_ALL + j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS; @@ -38267,13 +38330,21 @@ index 0000000000..3dfc35fa5c + diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h new file mode 100644 -index 0000000000..9389047f8e +index 0000000000..1aac6babae --- /dev/null +++ b/libavcodec/rpi_qpu.h -@@ -0,0 +1,208 @@ +@@ -0,0 +1,227 @@ +#ifndef RPI_QPU_H +#define RPI_QPU_H + ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" ++#pragma GCC diagnostic ignored "-Wstrict-prototypes" ++#include "interface/vmcs_host/vc_vchi_gpuserv.h" ++#pragma GCC diagnostic pop ++ ++ +#define RPI_ONE_BUF 1 + +typedef struct gpu_mem_ptr_s { @@ -38399,7 +38470,9 @@ index 0000000000..9389047f8e +struct rpi_cache_flush_env_s; +typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t; + -+rpi_cache_flush_env_t * rpi_cache_flush_init(void); ++typedef struct {uint32_t t[33];} rpi_cache_buf_t; ++ ++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf); +// Free env without flushing +void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe); +// Do the accumulated flush & clear but do not free the env @@ -38457,7 +38530,16 @@ index 0000000000..9389047f8e +struct vpu_qpu_job_env_s; +typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h; + -+vpu_qpu_job_h vpu_qpu_job_new(void); ++#define VPU_QPU_JOB_MAX 4 ++struct vpu_qpu_job_env_s ++{ ++ unsigned int n; ++ unsigned int mask; ++ struct gpu_job_s j[VPU_QPU_JOB_MAX]; ++}; ++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; ++ ++vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf); +void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); +void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, + const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);