From 1f1c5a778150b52eab5be8939f87a128a6fff6ac Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Sat, 2 Jun 2018 14:51:29 +0100 Subject: [PATCH] ffmpeg: hevc: Fix performance regression + latest ben optimisations --- ...mpeg-99.1003-pfcd_hevc_optimisations.patch | 3839 ++++++++++++----- 1 file changed, 2782 insertions(+), 1057 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index 32c0f1f17b..5300c1252b 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -19,7 +19,7 @@ index 0e57cb0b4c..b2e3374fea 100644 /ffplay /ffprobe diff --git a/configure b/configure -index dee507cb6a..9a93189107 100755 +index dee507cb6a..0ee9efe1e7 100755 --- a/configure +++ b/configure @@ -318,6 +318,7 @@ External library support: @@ -30,6 +30,15 @@ index dee507cb6a..9a93189107 100755 --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] --disable-nvenc disable Nvidia video encoding code [autodetect] --enable-omx enable OpenMAX IL code [no] +@@ -1036,7 +1037,7 @@ EOF + + check_insn(){ + log check_insn "$@" +- check_inline_asm ${1}_inline "$2" ++ check_inline_asm ${1}_inline "\"$2\"" + check_as ${1}_external "$2" + } + @@ -1776,6 +1777,7 @@ FEATURE_LIST=" gray hardcoded_tables @@ -582,7 +591,7 @@ index 4d4ef530e4..fba8776c9f 100644 { const AVCodec *p, *experimental = NULL; diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index e656011c3c..70c3f026b8 100644 +index e656011c3c..f8801dfab6 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ @@ -594,7 +603,7 @@ index e656011c3c..70c3f026b8 100644 OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o -@@ -136,10 +138,23 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ +@@ -136,10 +138,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ @@ -613,6 +622,7 @@ index e656011c3c..70c3f026b8 100644 + arm/rpi_hevcpred_init_neon.o \ + arm/rpi_hevcpred_intra_angular_neon.o \ + arm/rpi_hevcpred_intra_dc_neon.o \ ++ arm/rpi_hevcpred_intra_filter_neon.o \ + arm/rpi_hevcpred_intra_hv_neon.o \ + arm/rpi_hevcpred_intra_planar_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o @@ -1680,10 +1690,10 @@ index 0000000000..0211e447a8 + diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S new file mode 100644 -index 0000000000..1bdf2ab09f +index 0000000000..3bbfb443bf --- /dev/null +++ b/libavcodec/arm/rpi_hevc_misc_neon.S -@@ -0,0 +1,26 @@ +@@ -0,0 +1,226 @@ +#include "libavutil/arm/asm.S" +#include "neon.S" + @@ -1710,6 +1720,206 @@ index 0000000000..1bdf2ab09f + +2: vst1.8 {q0,q1}, [r0:256] + bx lr ++endfunc ++ ++@ PIC jump tables are more expensive than absolute for A32 code ++.set jent_pic, CONFIG_PIC || CONFIG_THUMB ++ ++@ Jump table entry - if in neon mode the bottom bit must be set ++@ ? 
There is probably a real asm instruction to do this but I haven't found it ++.macro jent lab ++.if jent_pic ++T .short ((0 + \lab) - (0 + 98b)) / 2 ++A .short (0 + \lab) - (4 + 98b) ++.else ++T .word 1 + \lab ++A .word \lab ++.endif ++.endm ++ ++.macro cpy_compound val, p1, p2 ++.if \p1 + \p2 != \val ++.error "Bad addition! \p1 + \p2 != \val" ++.endif ++.if \val <= 64 ++@ As max we deal with 128 vals above 64 will never be recursed to ++100\val\(): ++ push {r11, lr} ++.endif ++\val\(): ++ push {r0-r3} ++ bl 100\p1\()b ++ pop {r0-r3} ++ add r0, #\p1 ++ add r2, #\p1 ++ b \p2\()b ++.endm ++ ++@ ff_hevc_cpy_blks8x4_neon( ++@ dst [r0] ++@ dst_stride [r1] ++@ src [r2] ++@ src_stride [r3] ++@ width [sp, #0] (bytes) ++@ height) [sp, #4] ++@ ++@ Power of 2 widths are directly coded, all others are done in stripes ++@ We expect the vast majority of calls to be power of 2 ++@ ++@ Currently has min width of 8, but we could make that 4 without issue ++@ Min height is 4 ++ ++function ff_hevc_rpi_cpy_blks8x4_neon, export=1 ++ ldr r12, [sp, #0] ++ push {r11, lr} ++ sub r12, #1 ++A adr lr, 98f ++ ubfx r12, r12, #3, #4 ++ ldr r11, [sp, #(8 + 4)] ++.if jent_pic ++A lsl r12, #1 ++A ldrsh lr, [lr, r12] ++A add pc, lr ++T tbh [pc, r12, lsl #1] ++.else ++ @ A32 only, Thumb is always PIC ++ ldr pc, [lr, r12, lsl #2] ++.endif ++ ++98: ++ jent 8f ++ jent 16f ++ jent 24f ++ jent 32f ++ jent 40f ++ jent 48f ++ jent 56f ++ jent 64f ++ jent 72f ++ jent 80f ++ jent 88f ++ jent 96f ++ jent 104f ++ jent 112f ++ jent 120f ++ jent 128f ++ ++1008: ++ push {r11, lr} ++8: ++ add lr, r2, r3 ++ lsl r3, #1 ++ add r12, r0, r1 ++ lsl r1, #1 ++1: ++ vld1.32 {d0 }, [r2], r3 ++ vld1.32 {d1 }, [lr], r3 ++ vld1.32 {d2 }, [r2], r3 ++ vld1.32 {d3 }, [lr], r3 ++ subs r11, #4 ++ vst1.32 {d0 }, [r0], r1 ++ vst1.32 {d1 }, [r12], r1 ++ vst1.32 {d2 }, [r0], r1 ++ vst1.32 {d3 }, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++10016: ++ push {r11, lr} ++16: ++ add lr, r2, r3 ++ lsl r3, #1 ++ add r12, r0, r1 ++ lsl r1, #1 ++1: ++ vld1.32 {q0 }, [r2], r3 ++ vld1.32 {q1 }, [lr], r3 ++ vld1.32 {q2 }, [r2], r3 ++ vld1.32 {q3 }, [lr], r3 ++ subs r11, #4 ++ vst1.32 {q0 }, [r0], r1 ++ vst1.32 {q1 }, [r12], r1 ++ vst1.32 {q2 }, [r0], r1 ++ vst1.32 {q3 }, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++cpy_compound 24, 16, 8 ++ ++10032: ++ push {r11, lr} ++32: ++ add lr, r2, r3 ++ lsl r3, #1 ++ add r12, r0, r1 ++ lsl r1, #1 ++1: ++ vld1.32 {q8, q9 }, [r2], r3 ++ vld1.32 {q10, q11}, [lr], r3 ++ vld1.32 {q12, q13}, [r2], r3 ++ vld1.32 {q14, q15}, [lr], r3 ++ subs r11, #4 ++ vst1.32 {q8, q9 }, [r0], r1 ++ vst1.32 {q10, q11}, [r12], r1 ++ vst1.32 {q12, q13}, [r0], r1 ++ vst1.32 {q14, q15}, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++cpy_compound 40, 32, 8 ++cpy_compound 48, 32, 16 ++cpy_compound 56, 32, 24 ++ ++10064: ++ push {r11, lr} ++64: ++ add lr, r2, #32 ++ add r12, r0, #32 ++1: ++ vld1.32 {q8, q9 }, [r2], r3 ++ vld1.32 {q10, q11}, [lr], r3 ++ vld1.32 {q12, q13}, [r2], r3 ++ vld1.32 {q14, q15}, [lr], r3 ++ subs r11, #2 ++ vst1.32 {q8, q9 }, [r0], r1 ++ vst1.32 {q10, q11}, [r12], r1 ++ vst1.32 {q12, q13}, [r0], r1 ++ vst1.32 {q14, q15}, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++cpy_compound 72, 64, 8 ++cpy_compound 80, 64, 16 ++cpy_compound 88, 64, 24 ++cpy_compound 96, 64, 32 ++cpy_compound 104, 64, 40 ++cpy_compound 112, 64, 48 ++cpy_compound 120, 64, 56 ++ ++128: ++ push {r4, r5} ++ @ We could do this with fewer registers if we jump around but I ++ @ have a primative urge to load sequentially ++ mov r4, #64 ++ add lr, r2, #32 ++ add r12, r0, #32 ++ sub r3, r4 ++ sub 
r1, r4 ++1: ++ vld1.32 {q8, q9 }, [r2], r4 ++ vld1.32 {q10, q11}, [lr], r4 ++ vld1.32 {q12, q13}, [r2], r3 ++ vld1.32 {q14, q15}, [lr], r3 ++ subs r11, #1 ++ vst1.32 {q8, q9 }, [r0], r4 ++ vst1.32 {q10, q11}, [r12], r4 ++ vst1.32 {q12, q13}, [r0], r1 ++ vst1.32 {q14, q15}, [r12], r1 ++ bgt 1b ++ pop {r4, r5, r11, pc} ++ ++endfunc ++ diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h new file mode 100644 index 0000000000..62b9326532 @@ -1744,10 +1954,10 @@ index 0000000000..62b9326532 +#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */ diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S new file mode 100644 -index 0000000000..f75c82671e +index 0000000000..98512d21dc --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S -@@ -0,0 +1,1593 @@ +@@ -0,0 +1,1625 @@ +/* + * Copyright (c) 2014 Seppo Tomperi + * @@ -2472,6 +2682,7 @@ index 0000000000..f75c82671e +function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1 + sub r12, r0, r1 + cmp r2, #0 ++ it eq + bxeq lr + vld1.8 {d26,d27}, [r0] + lsl r1, #1 @@ -2484,10 +2695,14 @@ index 0000000000..f75c82671e + "sub r12, r0, r1, asr #1" + + lsls r3, #29 @ b2 -> N, b3 -> C ++ it pl + vstrpl d26, [r0, #0] ++ it cc + vstrcc d27, [r0, #8] + lsls r3, #2 @ b0 -> N, b1 -> C ++ it pl + vstrpl d18, [r12, #0] ++ it cc + vstrcc d19, [r12, #8] + bx lr + @@ -2506,6 +2721,7 @@ index 0000000000..f75c82671e +.macro m_filter_h_uv_16 bit_depth + sub r12, r0, r1 + cmp r2, #0 ++ it eq + bxeq lr + vld1.16 {q12, q13}, [r0] + lsl r1, #1 @@ -2527,13 +2743,17 @@ index 0000000000..f75c82671e + @ Which means we need to break this apart in an ugly fashion +1: + lsls r3, #29 @ b2 -> N, b3 -> C ++ itt pl + vstrpl d24, [r0, #0] + vstrpl d25, [r0, #8] ++ itt cc + vstrcc d26, [r0, #16] + vstrcc d27, [r0, #24] + lsls r3, #2 @ b0 -> N, b1 -> C ++ itt pl + vstrpl d20, [r12, #0] + vstrpl d21, [r12, #8] ++ itt cc + vstrcc d22, [r12, #16] + vstrcc d23, [r12, #24] + bx lr @@ -2554,6 +2774,7 @@ index 0000000000..f75c82671e + +function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1 + cmp r2, #0 ++ it eq + bxeq lr + push {lr} + vld2.16 {d16[0], d18[0]}, [r3], r1 @@ -2610,6 +2831,7 @@ index 0000000000..f75c82671e +@ Either split or partial +1: + lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ ittt cs + addcs r0, r0, r1, lsl #1 + addcs r2, r2, r1, lsl #1 + bcs 1f @@ -2619,6 +2841,7 @@ index 0000000000..f75c82671e + vst1.16 {d21[1]}, [r0], r1 + vst1.16 {d21[0]}, [r2], r1 +1: ++ ittt mi + addmi r3, r3, r1, lsl #1 + addmi ip, ip, r1, lsl #1 + bmi 1f @@ -2700,6 +2923,7 @@ index 0000000000..f75c82671e + +.macro m_filter_v_uv2_16 bit_depth + cmp r2, #0 ++ it eq + bxeq lr + push {lr} + vld2.32 {d16[0], d18[0]}, [r3], r1 @@ -2756,6 +2980,7 @@ index 0000000000..f75c82671e +@ Either split or partial +1: + lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ ittt cs + addcs r0, r0, r1, lsl #1 + addcs r2, r2, r1, lsl #1 + bcs 1f @@ -2765,6 +2990,7 @@ index 0000000000..f75c82671e + vst1.32 {d28[1]}, [r0], r1 + vst1.32 {d28[0]}, [r2], r1 +1: ++ ittt mi + addmi r3, r3, r1, lsl #1 + addmi ip, ip, r1, lsl #1 + bmi 1f @@ -2938,9 +3164,11 @@ index 0000000000..f75c82671e + vmovl.s16 q5, d29 + teq lr, #1 + vmovl.s16 q14, d30 -+ lslne v1, lr, #1 ++ it ne ++ lslne v1, lr, #1 + vmovl.s16 q15, d31 -+ rsbne v2, v1, #32 ++ it ne ++ rsbne v2, v1, #32 + vbif q0, q1, q4 + vbif q2, q3, q14 + vbif q1, q0, q5 @@ -3022,13 +3250,21 @@ index 0000000000..f75c82671e + lsl ip, v2 + lsl lr, v2 
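++ @ Note: T32 data processing has no register-shifted-register operand,
++ @ so each A32 "orr rd, rn, rm, lsr rs" below has to be split into a
++ @ separate shift and orr for the Thumb build.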
+ ldr v2, [sp, #6*8 + 10*4 + 1*4] -+ orr a2, ip, a2, lsr v1 ++T lsr a2, v1 ++T orr a2, ip, a2 ++A orr a2, ip, a2, lsr v1 + lsl ip, v1, #1 -+ orr v8, lr, v8, lsr v1 ++T lsr v8, v1 ++T orr v8, lr, v8 ++A orr v8, lr, v8, lsr v1 + lsl lr, v1, #2 -+ orr a2, v8, a2, lsr ip ++T lsr a2, ip ++T orr a2, v8, a2 ++A orr a2, v8, a2, lsr ip + ldr v1, [sp, #6*8 + 10*4] -+ orr v7, a2, v7, lsr lr ++T lsr v7, lr ++T orr v7, a2, v7 ++A orr v7, a2, v7, lsr lr + bhi 1b + + vpop {d8-d13} @@ -3094,11 +3330,12 @@ index 0000000000..f75c82671e + vtst.16 d22, d16, d18 + vadd.i16 d30, d16, d17 + vswp d2, d3 -+ ldr lr, [sp] ++ ldr lr, [sp] + vmovl.s16 q10, d20 -+ teq lr, #1 ++ teq lr, #1 + vmovl.s16 q11, d22 -+ lslne v1, lr, #1 ++ it ne ++ lslne v1, lr, #1 + vbif d0, d1, d20 + vbif d4, d6, d20 + vbif d3, d2, d21 @@ -3124,7 +3361,8 @@ index 0000000000..f75c82671e + vshrn.i32 d7, q11, #8 + vmovn.i32 d3, q10 + vand q0, q3, q1 -+ rsbne v2, v1, #32 ++ it ne ++ rsbne v2, v1, #32 + vrev16.8 q3, q3 + vand q0, q3 + vsra.u64 d30, #32 @@ -3141,6 +3379,7 @@ index 0000000000..f75c82671e + cmp a1, #2 + vmov.u16 a1, d0[1] + vmov.u16 a2, d0[0] ++ it eq + orreq a1, a2, a1, lsl #2 + pop {a2,v1-v8,pc} +10: @@ -3153,7 +3392,10 @@ index 0000000000..f75c82671e + pkhbt a1, a1, a1, lsl #16 + lsr a2, v2 + lsr a1, v2 -+ orreq a1, a2, a1, lsl v1 ++T itt eq ++T lsleq a1, v1 ++T orreq a1, a2, a1 ++A orreq a1, a2, a1, lsl v1 + pop {a2,v1-v8,pc} +endfunc + @@ -3570,10 +3812,10 @@ index 0000000000..109fa98c29 +} diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c new file mode 100644 -index 0000000000..ce7e6091f1 +index 0000000000..8a94a644a4 --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c -@@ -0,0 +1,465 @@ +@@ -0,0 +1,467 @@ +/* + * Copyright (c) 2014 Seppo Tomperi + * @@ -3808,6 +4050,7 @@ index 0000000000..ce7e6091f1 +uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, + int in_inc); ++void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height); + + +static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) @@ -4038,6 +4281,7 @@ index 0000000000..ce7e6091f1 + assert(offsetof(MvField, ref_idx) == 8); + assert(offsetof(MvField, pred_flag) == 10); + c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; ++ c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon; +} diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S new file mode 100644 @@ -7682,10 +7926,10 @@ index 0000000000..80724d4cf3 + diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c new file mode 100644 -index 0000000000..8c267a0368 +index 0000000000..21e7700174 --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_init_neon.c -@@ -0,0 +1,188 @@ +@@ -0,0 +1,210 @@ +/* + * Copyright (c) 2018 John Cox (for Raspberry Pi) + * @@ -7708,6 +7952,15 @@ index 0000000000..8c267a0368 + +#include "rpi_hevcpred_arm.h" + ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16; ++intra_filter_fn_t 
ff_hevc_rpi_intra_filter_4_neon_32; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32; ++ +void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); @@ -7788,6 +8041,12 @@ index 0000000000..8c267a0368 + switch (bit_depth) + { + case 8: ++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8; ++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8; ++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8 ++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16; ++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16; ++ + c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8; + c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8; + c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8; @@ -7829,6 +8088,13 @@ index 0000000000..8c267a0368 + c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8; + break; + case 10: ++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16; ++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16; ++ c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16; ++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32; ++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32; ++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32; ++ + c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10; + c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10; + c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10; @@ -7876,10 +8142,10 @@ index 0000000000..8c267a0368 + diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S new file mode 100644 -index 0000000000..1a2d413ea2 +index 0000000000..8063a1521e --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S -@@ -0,0 +1,2352 @@ +@@ -0,0 +1,2373 @@ +/* + * Copyright (c) 2018 John Cox (for Raspberry Pi) + * @@ -8005,8 +8271,11 @@ index 0000000000..1a2d413ea2 + @ r2=left (variable), r1=up (const) + adds r8, r7 + vmov d24, d16 ++T itee mi + ldrbmi r12, [r2, #-1]! -+ ldrbpl r12, [r1, r8, asr #8] ++T asrpl r12, r8, #8 ++T ldrbpl r12, [r1, r12] ++A ldrbpl r12, [r1, r8, asr #8] + vext.8 d16, d16, d16, #7 + sub r6, #32 + vmov.8 d16[0], r12 @@ -8028,7 +8297,11 @@ index 0000000000..1a2d413ea2 + bne 2b + b store_tran_8x8_8 @ This will return + -+ ++.macro ADRT reg, val ++@ adr in T32 has enough range but not in A32 ++A adrl \reg, \val ++T adr \reg, \val ++.endm + +@ ff_hevc_rpi_pred_angular_4_neon_8 +@ uint8_t *_src, [r0] @@ -8040,8 +8313,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_4_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + @@ -8101,8 +8374,11 @@ index 0000000000..1a2d413ea2 + @ r2=left (variable), r1=up (const) + adds r8, r7 + vmov d24, d16 ++T itee mi + ldrbmi r12, [r2, #-1]! 
-+ ldrbpl r12, [r1, r8, asr #8] ++T asrpl r12, r8, #8 ++T ldrbpl r12, [r1, r12] ++A ldrbpl r12, [r1, r8, asr #8] + vext.8 d16, d16, d16, #7 + sub r6, #32 + vmov.8 d16[0], r12 @@ -8135,7 +8411,9 @@ index 0000000000..1a2d413ea2 + ble 1f + + @ For other widths we may want different logic -+ ldrb r12, [r2, r8, asr #8] ++T asr r12, r8, #8 ++T ldrb r12, [r2, r12] ++A ldrb r12, [r2, r8, asr #8] + + vmov d24, d16 + add r8, r7 @@ -8197,8 +8475,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_8_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + @@ -8241,7 +8519,9 @@ index 0000000000..1a2d413ea2 + ble 1f + + @ For other widths we may want different logic -+ ldrb r12, [r2, r8, asr #8] ++T asr r12, r8, #8 ++T ldrb r12, [r2, r12] ++A ldrb r12, [r2, r8, asr #8] + + vmov d24, d16 + add r8, r7 @@ -8301,8 +8581,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_16_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + @@ -8370,7 +8650,9 @@ index 0000000000..1a2d413ea2 + ble 1f + + @ For other widths we may want different logic -+ ldrb r12, [r2, r8, asr #8] ++T asr r12, r8, #8 ++T ldrb r12, [r2, r12] ++A ldrb r12, [r2, r8, asr #8] + + vmov q12, q8 + add r8, r7 @@ -8441,8 +8723,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_32_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r10, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + @@ -8513,7 +8795,9 @@ index 0000000000..1a2d413ea2 + ble 1f + + @ For other widths we may want different logic -+ ldrb r12, [r2, r8, asr #8] ++T asr r12, r8, #8 ++T ldrb r12, [r2, r12] ++A ldrb r12, [r2, r8, asr #8] + + vmov q12, q8 + add r8, r7 @@ -8641,6 +8925,7 @@ index 0000000000..1a2d413ea2 + @ Use r2 for both up and left, we only ever go from left->up so + @ we assume that we are left and thenm overwrite with up if wanted + sub r2, #2 ++ it pl + addpl r2, r1, r8, asr #7 + vext.16 d16, d16, d16, #3 + @ We get *2 by >> 7 rather than 8, but that means we need to lose bit 0 @@ -8673,8 +8958,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + lsl r3, #1 @@ -8779,8 +9064,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + lsl r3, #1 @@ -8918,8 +9203,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1 + ldr r12, [sp, #0] + push {r4-r10, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 + lsl r3, #1 @@ -9145,6 +9430,7 @@ index 0000000000..1a2d413ea2 + @ Use r2 for both up and left, we only ever go from left->up so + @ we assume that we are left and thenm overwrite with up if wanted + sub r2, #2 ++ it pl + 
addpl r2, r1, r8, asr #7 + vext.16 d16, d16, d16, #3 + @ We get *2 by >> 7 rather than 8, but that means we need to lose bit 0 @@ -9178,8 +9464,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_4_neon_10, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #1 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -9282,8 +9568,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_8_neon_10, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #1 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -9409,8 +9695,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_16_neon_10, export=1 + ldr r12, [sp, #0] + push {r4-r10, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #1 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -9556,8 +9842,8 @@ index 0000000000..1a2d413ea2 + ldr r12, [sp, #0] + push {r4-r10, lr} + vpush {q4 } -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #1 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -9791,6 +10077,7 @@ index 0000000000..1a2d413ea2 + @ Use r2 for both up and left, we only ever go from left->up so + @ we assume that we are left and thenm overwrite with up if wanted + sub r2, #4 ++ it pl + addpl r2, r1, r8, asr #6 + vext.32 q8, q8, #3 + @ We get *4 by >> 6 rather than 8, but that means we need to lose bits 0 & 1 @@ -9825,8 +10112,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -9931,8 +10218,8 @@ index 0000000000..1a2d413ea2 +function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1 + ldr r12, [sp, #0] + push {r4-r8, lr} -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -10071,8 +10358,8 @@ index 0000000000..1a2d413ea2 + ldr r12, [sp, #0] + push {r4-r10, lr} + vpush {q4 } -+ adrl r4, angle_2 - 2 -+ adrl r7, inv_angle - 11*2 ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 + lsl r3, #2 + ldrsb r4, [r4, r12] + add r7, r7, r12, lsl #1 @@ -10234,10 +10521,10 @@ index 0000000000..1a2d413ea2 + diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S new file mode 100644 -index 0000000000..af7ba1f45e +index 0000000000..75a1789c25 --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S -@@ -0,0 +1,682 @@ +@@ -0,0 +1,695 @@ +/* + * Copyright (c) 2017 John Cox (for Raspberry Pi) + * @@ -10284,7 +10571,7 @@ index 0000000000..af7ba1f45e + vmov.i64 d7, #0xffff + vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
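++ @ Reference sketch (in C) of the filtered-DC output built below - the
++ @ HEVC DC intra case for small luma blocks; documentation only, not
++ @ part of the build:
++ @   dc = (sum(top[0..3]) + sum(left[0..3]) + 4) >> 3;
++ @   dst[0][0] = (top[0] + left[0] + 2*dc + 2) >> 2;
++ @   dst[0][x] = (top[x] + 3*dc + 2) >> 2;   // x = 1..3
++ @   dst[y][0] = (left[y] + 3*dc + 2) >> 2;  // y = 1..3
++ @   dst[y][x] = dc;                         // everything else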
+ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) -+ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 @@ -10359,7 +10646,7 @@ index 0000000000..af7ba1f45e + + @ Average the els of top & left + vld1.8 {d0}, [r1] -+ mov r1, #2 ++ mov r1, #2 + vld1.8 {d16}, [r2] + vmov.i16 q2, #3 + vmov.i64 d7, #0xffff @@ -10367,7 +10654,7 @@ index 0000000000..af7ba1f45e + vmovl.u8 q0, d0 + vadd.i16 d6, d2, d3 @ d6 has 4 vals + vmov.16 d4[0], r1 @ 2, 3, 3, 3... -+ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 @@ -10418,23 +10705,30 @@ index 0000000000..af7ba1f45e +function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1 + + @ Average the els of top & left -+ vld1.8 {q8 }, [r1] -+ vld1.8 {q12}, [r2] -+ vaddl.u8 q0, d16, d17 -+ vaddl.u8 q2, d24, d25 -+ vadd.i16 q0, q2 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ mov r1, #4 -+ vpadd.i32 d0, d0 @ This add U & V separately -+ lsl r3, #1 @ pels -+ vrshrn.u16 d0, q0, #4 -+ vdup.u16 q0, d0[0] @ Dup results ++ vld1.8 {q0}, [r1] ++ mov r1, #8 ++ vld1.8 {q1}, [r2] ++T lsl r3, #1 ++ vaddl.u8 q0, d0, d1 ++A add r2, r0, r3, lsl #1 ++A lsl r3, #2 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vaddl.u8 q1, d2, d3 ++ vadd.i16 q1, q0 ++ vadd.i16 d3, d2 @ d3 has 2 val pairs ++ vpadd.i32 d2, d3, d3 @ This add U & V separately ++ vpadd.i32 d3, d3, d3 ++ vrshrn.u16 d0, q1, #4 ++ vrshrn.u16 d1, q1, #4 + + @ Store +1: -+ vst1.8 {q0 }, [r0], r3 -+ subs r1, #1 -+ vst1.8 {q0 }, [r0], r3 ++ vst1.8 {q0}, [r0], r3 ++ subs r1, #4 ++ vst1.8 {q0}, [r2], r3 ++ vst1.8 {q0}, [r0], r3 ++ vst1.8 {q0}, [r2], r3 + bne 1b + + bx lr @@ -10450,52 +10744,55 @@ index 0000000000..af7ba1f45e +function ff_hevc_rpi_pred_dc_16_neon_8, export=1 + + @ Average the els of top & left -+ vld1.8 { q8}, [r1] -+ vld1.8 {q12}, [r2] -+ vaddl.u8 q0, d16, d24 -+ vaddl.u8 q2, d17, d25 -+ vmov.u16 r1, d0[0] @ r1 = top[0] + left[0] -+ vadd.i16 q0, q2 -+ vadd.i16 d0, d1 @ d0 has 4 vals -+ vpadd.i16 d0, d0 @ 2 (top & bottom the same) -+ vpadd.i16 d0, d0 @ 1 (all the same) -+ vrshr.u16 d0, #5 -+ -+ vmov.i64 d31, #0xff ++ vld1.8 {q8}, [r1] ++ mov r1, #2 ++ vld1.8 {q9}, [r2] ++ vaddl.u8 q10, d16, d17 ++ vaddl.u8 q11, d16, d18 ++ vaddl.u8 q0, d18, d19 ++ vmov.i16 q1, #3 ++ vadd.i16 q10, q0 ++ vmovl.u8 q0, d18 ++ vadd.i16 d20, d21 ++ vmov.i16 d2[0], r1 @ 2, 3, 3, 3... 
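++ @ Lane 0 of q1 is 2 rather than 3: the vmla ops below then add 2*dc
++ @ to the top[0]+left[0] lane and 3*dc to every other lane in one go.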
+ + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left + @ top_line[0] is extra special -+ @ (top[0] + left[0] + dc * 2) ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 + -+ vmov.u16 r12, d0[0] @ dc -+ add r2, r12, r12, lsl #1 @ dc*3 -+ add r1, r1, r12, lsl #1 @ top[0] + left[0] + dc*2 -+ -+ vdup.u16 q3, r2 -+ vaddw.u8 q1, q3, d16 -+ vaddw.u8 q2, q3, d17 -+ vmov.u16 d2[0], r1 -+ vrshrn.u16 d2, q1, #2 -+ vrshrn.u16 d3, q2, #2 -+ -+ @ Construct lhs pels -+ vaddw.u8 q2, q3, d24 -+ vaddw.u8 q3, q3, d25 -+ vrshrn.u16 d4, q2, #2 -+ vrshrn.u16 d5, q3, #2 ++ vmovl.u8 q2, d16 ++ vmovl.u8 q9, d19 ++ vpadd.i16 d20, d20 @ 2 (top & bottom of vector the same) ++ vmov.i64 d7, #0xffff ++ vmovl.u8 q8, d17 ++ vbit d4, d22, d7 @ q2 = top[0]+left[0], top[1..7] ++ vmov.i64 d7, #0xff ++ vpadd.i16 d20, d20 @ 1 (all the same) ++ vrshr.u16 d21, d20, #5 ++ vrshr.u16 d20, d20, #5 ++ vmla.i16 q0, q10, d2[1] ++ vmla.i16 q9, q10, d2[1] ++ vmla.i16 q2, q10, q1 ++ vmla.i16 q8, q10, d2[1] ++ vdup.8 q1, d20[0] ++ vrshrn.i16 d0, q0, #2 ++ vrshrn.i16 d1, q9, #2 ++ vrshrn.i16 d4, q2, #2 ++ vrshrn.i16 d5, q8, #2 ++ vext.8 q0, q0, q0, #1 + + @ Store top line -+ vst1.8 { q1}, [r0], r3 -+ -+ mov r1, #15 -+ vdup.u8 q0, d0[0] ++ vst1.8 {q2}, [r0], r3 + ++ @ Store the rest ++ mov r1, #15 +1: -+ vext.8 q2, q2, #1 -+ vbit d0, d4, d31 -+ subs r1, #1 -+ vst1.8 { q0}, [r0], r3 ++ vbit d2, d0, d7 ++ vext.8 q0, q0, q0, #1 ++ subs r1, #1 ++ vst1.8 {q1}, [r0], r3 + bne 1b + + bx lr @@ -10511,33 +10808,34 @@ index 0000000000..af7ba1f45e +function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1 + + @ Average the els of top & left -+ vld1.8 { q8, q9}, [r1] -+ vld1.8 {q12,q13}, [r2] -+ vaddl.u8 q0, d16, d17 -+ vaddl.u8 q1, d18, d19 -+ vaddl.u8 q2, d24, d25 -+ vaddl.u8 q3, d26, d27 -+ vadd.i16 q0, q1 -+ vadd.i16 q2, q3 -+ vadd.i16 q0, q2 -+ lsl r3, #1 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ mov r1, #4 -+ vpadd.i32 d0, d0 @ This add U & V separately -+ add r2, r0, r3 -+ vmov d1, d0 -+ lsl r3, #1 -+ vrshrn.u16 d0, q0, #5 -+ vmov d1, d0 @ Dup results -+ vmov q1, q0 ++ vld1.8 {q0-q1}, [r1] ++ mov r1, #16 ++ vld1.8 {q2-q3}, [r2] ++T lsl r3, #1 ++ vaddl.u8 q0, d0, d1 ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vaddl.u8 q1, d2, d3 ++A lsl r3, #2 ++T lsl r3, #1 ++ vaddl.u8 q2, d4, d5 ++ vaddl.u8 q3, d6, d7 ++ vadd.i16 q0, q1 ++ vadd.i16 q2, q3 ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d4, d0, d0 @ This adds U & V separately ++ vpadd.i32 d5, d0, d0 ++ vrshrn.u16 d0, q2, #5 ++ vrshrn.u16 d1, q2, #5 ++ vrshrn.u16 d2, q2, #5 ++ vrshrn.u16 d3, q2, #5 + + @ Store +1: -+ vst1.8 { q0, q1}, [r0], r3 -+ vst1.8 { q0, q1}, [r2], r3 -+ subs r1, #1 -+ vst1.8 { q0, q1}, [r0], r3 -+ vst1.8 { q0, q1}, [r2], r3 ++ vst1.8 {q0-q1}, [r0], r3 ++ subs r1, #2 ++ vst1.8 {q0-q1}, [r2], r3 + bne 1b + + bx lr @@ -10553,32 +10851,32 @@ index 0000000000..af7ba1f45e +function ff_hevc_rpi_pred_dc_32_neon_8, export=1 + + @ Average the els of top & left -+ vld1.8 {q8, q9 }, [r1] -+ vld1.8 {q12, q13}, [r2] -+ vaddl.u8 q0, d16, d17 -+ vaddl.u8 q1, d18, d19 -+ vaddl.u8 q2, d24, d25 -+ vaddl.u8 q3, d26, d27 -+ vadd.i16 q0, q1 -+ vadd.i16 q2, q3 -+ vadd.i16 q0, q2 -+ vadd.i16 d0, d1 @ d0 has 4 vals -+ mov r1, #8 -+ vpadd.i16 d0, d0 @ 2 (top & bottom the same) -+ add r2, r0, r3 -+ vpadd.i16 d0, d0 @ 1 (all the same) -+ lsl r3, #1 -+ vrshrn.u16 d0, q0, #6 -+ vdup.u8 q1, d0[0] @ Dup results -+ vdup.u8 q0, d0[0] ++ vld1.8 {q0-q1}, [r1] ++ mov r1, #32 ++ vld1.8 {q2-q3}, [r2] ++ add r2, r0, r3 ++ vaddl.u8 q0, d0, d1 ++ lsl r3, #1 ++ vaddl.u8 q1, 
d2, d3 ++ vaddl.u8 q2, d4, d5 ++ vaddl.u8 q3, d6, d7 ++ vadd.i16 q0, q1 ++ vadd.i16 q2, q3 ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ vpadd.i16 d4, d0, d0 @ 1 (all the same) ++ vpadd.i16 d5, d0, d0 ++ vrshrn.u16 d0, q2, #6 ++ vrshrn.u16 d1, q2, #6 ++ vrshrn.u16 d2, q2, #6 ++ vrshrn.u16 d3, q2, #6 + + @ Store +1: -+ vst1.8 {q0, q1 }, [r0], r3 -+ vst1.8 {q0, q1 }, [r2], r3 -+ subs r1, #1 -+ vst1.8 {q0, q1 }, [r0], r3 -+ vst1.8 {q0, q1 }, [r2], r3 ++ vst1.8 {q0-q1}, [r0], r3 ++ subs r1, #2 ++ vst1.8 {q0-q1}, [r2], r3 + bne 1b + + bx lr @@ -10616,7 +10914,7 @@ index 0000000000..af7ba1f45e +T lsl r3, #1 + vmov.16 d4[0], r1 @ 2, 3, 3, 3... + vmov.i64 d7, #0xffff -+ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 @@ -10701,7 +10999,7 @@ index 0000000000..af7ba1f45e + vmov.i64 d7, #0xffff + vmov.16 d4[0], r1 @ 2, 3, 3, 3... + vadd.i16 d6, d2, d3 @ d6 has 4 vals -+ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 @@ -10748,24 +11046,30 @@ index 0000000000..af7ba1f45e +@ ptrdiff_t stride) [r3] (In pels - needs * 4) + +function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1 -+ vld1.8 { q8, q9 }, [r1] -+ vld1.8 {q12, q13}, [r2] -+ vadd.i16 q8, q9 -+ vadd.i16 q12, q13 -+ vadd.i16 q8, q12 -+ vadd.i16 d16, d17 @ d16 has 2 pairs -+ mov r1, #4 -+ vpadd.i32 d16, d16 -+ lsl r3, #2 @ stride in pels -+ vrshr.u16 d16, #4 -+ vdup.u32 q9, d16[0]; -+ vdup.u32 q8, d16[0]; ++ ++ @ Average the els of top & left ++ vld1.16 {q0-q1}, [r1] ++ mov r1, #8 ++ vld1.16 {q2-q3}, [r2] ++T lsl r3, #2 ++ vadd.i16 q1, q0 ++A add r2, r0, r3, lsl #2 ++A lsl r3, #3 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vadd.i16 q2, q3 ++ vadd.i16 q1, q2 ++ vadd.i16 d3, d2 @ d3 has 2 val pairs ++ vpadd.i32 d2, d3, d3 @ This add U & V separately ++ vpadd.i32 d3, d3, d3 ++ vrshr.u16 q0, q1, #4 ++ vrshr.u16 q1, q1, #4 + + @ Store +1: -+ vst1.16 {q8, q9 }, [r0], r3 -+ subs r1, #1 -+ vst1.16 {q8, q9 }, [r0], r3 ++ vst1.8 {q0-q1}, [r0], r3 ++ subs r1, #2 ++ vst1.8 {q0-q1}, [r2], r3 + bne 1b + + bx lr @@ -10781,55 +11085,57 @@ index 0000000000..af7ba1f45e +function ff_hevc_rpi_pred_dc_16_neon_10, export=1 + + @ Average the els of top & left -+ vld1.16 {q8, q9 }, [r1] -+ vld1.16 {q12, q13}, [r2] -+ lsl r3, #1 @ stride given in pels -+ vadd.u16 q0, q8, q12 -+ vadd.u16 q2, q9, q13 -+ vmov.u16 r1, d0[0] @ r1 = top[0] + left[0] -+ vadd.i16 q0, q2 -+ vadd.i16 d0, d1 @ d0 has 4 vals -+ vpadd.i16 d0, d0 @ 2 (top & bottom the same) -+ vpadd.i16 d0, d0 @ 1 (all the same) -+ vrshr.u16 d0, #5 -+ ++ vld1.16 {q8-q9}, [r1] ++ mov r1, #2 ++ vld1.16 {q10-q11}, [r2] ++ lsl r3, #1 @ stride given in pels ++ vadd.i16 q0, q8, q9 ++ vadd.i16 q1, q10, q11 ++ vmov.i16 q3, #3 ++ vadd.i16 q1, q0 ++ vadd.i16 d0, d16, d20 + vmov.i64 d31, #0xffff ++ vadd.i16 d3, d2 ++ vmov.16 d6[0], r1 @ 2, 3, 3, 3... 
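++ @ Same trick as the 8-bit version: lane 0 of q3 is 2, so the vmla
++ @ below adds 2*dc in the top[0]+left[0] lane and 3*dc elsewhere.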
+ + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + dc * 2) ++ @ as does left ++ @ topline[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 + -+ vmov.u16 r12, d0[0] @ dc -+ add r2, r12, r12, lsl #1 @ dc*3 -+ add r1, r1, r12, lsl #1 @ top[0] + left[0] + dc*2 -+ -+ vdup.u16 q3, r2 -+ vadd.u16 q8, q3 -+ vadd.u16 q9, q3 -+ vmov.u16 d16[0], r1 -+ vrshr.u16 q8, #2 -+ vrshr.u16 q9, #2 -+ -+ @ Construct lhs pels -+ vadd.u16 q12, q3 -+ vadd.u16 q13, q3 -+ vrshr.u16 q12, #2 -+ vrshr.u16 q13, #2 ++ vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7] ++ vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d3, d3 @ 1 (all the same) ++ vrshr.u16 d2, d3, #5 ++ vrshr.u16 d3, d3, #5 ++ vmov q0, q1 ++ vmla.i16 q10, q1, d6[1] ++ vmla.i16 q11, q1, d6[1] ++ vmla.i16 q8, q1, q3 ++ vmla.i16 q9, q1, d6[1] ++ vrshr.u16 q2, q10, #2 ++ vrshr.u16 q3, q11, #2 ++ vrshr.u16 q8, #2 ++ vrshr.u16 q9, #2 ++ vext.16 q2, q2, q2, #1 ++ mov r1, #7<<29 + + @ Store top line -+ vst1.16 {q8, q9 }, [r0], r3 -+ -+ mov r1, #15 -+ vdup.u16 q1, d0[0] -+ vdup.u16 q0, d0[0] ++ vst1.16 {q8-q9}, [r0], r3 + ++ @ Store the rest +1: -+ vext.16 q12, q13, #1 -+ vext.16 q13, q13, #1 -+ vbit d0, d24, d31 -+ subs r1, #1 -+ vst1.16 {q0, q1 }, [r0], r3 ++ vbit d0, d4, d31 ++ vext.16 q2, q2, q2, #1 ++ subs r1, #1<<29 ++ vst1.16 {q0-q1}, [r0], r3 ++ bne 1b ++1: ++ vbit d0, d6, d31 ++ vext.16 q3, q3, q3, #1 ++ subs r1, #1<<29 ++ vst1.16 {q0-q1}, [r0], r3 + bne 1b + + bx lr @@ -10845,33 +11151,30 @@ index 0000000000..af7ba1f45e +function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1 + + @ Average the els of top & left -+ vldm r1, { q8-q11} -+ vldm r2, {q12-q15} -+ vadd.i16 q8, q9 ++ vldm r1, {q0-q3} ++ vldm r2, {q8-q11} ++ vadd.i16 q0, q1 ++ mov r1, #16 ++ vadd.i16 q2, q3 ++ add r2, r0, #32 ++ vadd.i16 q8, q9 ++ lsl r3, #2 + vadd.i16 q10, q11 -+ vadd.i16 q12, q13 -+ vadd.i16 q14, q15 -+ vadd.i16 q8, q10 -+ vadd.i16 q12, q14 -+ vadd.i16 q8, q12 -+ vadd.i16 d16, d17 @ d16 has 2 pairs -+ mov r1, #8 -+ vpadd.i32 d16, d16 -+ lsl r3, #2 @ stride in pels -+ vrshr.u16 d16, #5 -+ vmov d17, d16 @ Dup results -+ vmov q9, q8 -+ vmov q10, q8 -+ vmov q11, q8 ++ vadd.u16 q0, q2 ++ vadd.u16 q8, q10 ++ vadd.i16 q0, q8 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d4, d0, d0 @ This adds U & V separately ++ vpadd.i32 d5, d0, d0 ++ vrshr.u16 q0, q2, #5 ++ vrshr.u16 q1, q2, #5 + + @ Store +1: -+ vstm r0, {q8-q11} -+ add r0, r3 -+ subs r1, #1 -+ vstm r0, {q8-q11} -+ add r0, r3 -+ bne 1b ++ vst1.16 {q0-q1}, [r0], r3 ++ subs r1, #1 ++ vst1.16 {q0-q1}, [r2], r3 ++ bne 1b + + bx lr +endfunc @@ -10887,39 +11190,920 @@ index 0000000000..af7ba1f45e + + @ Average the els of top & left + @ With 10 bits we are (just) safe from overflow in i16 -+ vldm r1, { q8-q11} -+ vldm r2, {q12-q15} -+ vadd.i16 q8, q9 ++ vldm r1, {q0-q3} ++ vldm r2, {q8-q11} ++ vadd.i16 q0, q1 ++ mov r1, #32 ++ vadd.i16 q2, q3 ++ add r2, r0, #32 ++ vadd.i16 q8, q9 ++ lsl r3, #1 + vadd.i16 q10, q11 -+ vadd.i16 q12, q13 -+ vadd.i16 q14, q15 -+ vadd.i16 q8, q10 -+ vadd.i16 q12, q14 -+ vadd.i16 q8, q12 -+ vadd.i16 d16, d17 @ d16 has 4 vals -+ mov r1, #16 -+ vpadd.i16 d16, d16 @ 2 (top & bottom the same) -+ lsl r3, #1 @ stride in pels -+ vpadd.i16 d16, d16 @ 1 (all the same) -+ vrshr.u16 d16, #6 -+ vmov d17, d16 @ Dup results -+ vmov q9, q8 -+ vmov q10, q8 -+ vmov q11, q8 ++ vadd.u16 q0, q2 ++ vadd.u16 q8, q10 ++ vadd.i16 q0, q8 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ vpadd.i16 
d4, d0, d0 @ 1 (all the same) ++ vpadd.i16 d5, d0, d0 ++ vrshr.u16 q0, q2, #6 ++ vrshr.u16 q1, q2, #6 + + @ Store +1: -+ vstm r0, { q8-q11} -+ add r0, r3 -+ subs r1, #1 -+ vstm r0, { q8-q11} -+ add r0, r3 -+ bne 1b ++ vst1.16 {q0-q1}, [r0], r3 ++ subs r1, #1 ++ vst1.16 {q0-q1}, [r2], r3 ++ bne 1b + + bx lr +endfunc + + +diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S +new file mode 100644 +index 0000000000..11773f918e +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S +@@ -0,0 +1,878 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ All functions have the call ++@ ++@ int ff_hevc_rpi_intra_filter_N_neon_PW( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++@ ++@ Assumptions: ++@ (that wouldn't apply to all frame layoouts but do apply to sand, so beware ++@ if reuseing this code) ++@ ++@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for ++@ N==4, but do for chroma N>=8. As we share Y/C fns that means we can ignore ++@ N==8,PW=8 (chroma always PW>8) but have to cope for larger ++@ ++@ We always have at least 64 pixel H frame width rounding - this lets us ++@ load UR widthout having to worry about exactly how many pixels are actually ++@ within the frame. As partial loads will only occur very occasionally this ++@ should be a win in nearly all cases. ++@ ++@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters ++@ so we do no maths on the contents ++@ ++@ No filtering in 32bit fns as they are chroma only ++ ++ ++.equ AVAIL_UR, 1 ++.equ AVAIL_U, 2 ++.equ AVAIL_UL, 4 ++.equ AVAIL_L, 8 ++.equ AVAIL_DL, 16 ++ ++.equ FILTER_LIGHT, 0x40 ++.equ FILTER_STRONG, 0x80 ++ ++.equ AVAIL_S_UR_N_U_C, 32 - 1 ++.equ AVAIL_S_U_N_UL_C, 32 - 2 ++.equ AVAIL_S_UL_N_L_C, 32 - 3 ++.equ AVAIL_S_L_N_DL_C, 32 - 4 ++ ++.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr ++ ++@ On entry ++@ r2 req ++@ r3 avail ++@ [sp, #sp_offset...] args ++@ ++@ On Exit: ++@ ++@ Extend values: ++@ d_l scalar contains value for L & DL ++@ d_ul scalar containing value for UL ++@ d_u scalar containing value for U ++@ d_ur scalar containing value for UR ++@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else... 
++@ This means that L-filter works even if nreq DL (we never filter ++@ req-DL without req-L, but we do filter req-L without req-DL) ++@ If UR avail then d_ur == a_ur so U-filter good too ++@ ++@ Data load pointers (only load if req & avail): ++@ r4 DL ++@ r10 L ++@ r6 U ++@ r5 UR ++@ ++@ Others: ++@ r2 req ++@ r7 req & avail ++@ r3 L + stride ++@ r8 DL + stride ++@ r9 stride * 2 ++@ cs Load U ++@ mi Load UR ++@ ++@ Clobbered: ++@ r12 ++ ++.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur ++ ++.equ src_l\@, \sp_offset + 0 ++.equ src_u\@, \sp_offset + 4 ++.equ src_ur\@, \sp_offset + 8 ++.equ stride\@, \sp_offset + 12 ++.equ pw\@, (1 << \pw_s) @ pel width in bytes ++.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes ++ ++@ r9 stride ++@ r7 = ab_ul, r6 = a_u, r5 = a_ur ++@ r4 = b_dl, r10 = b_l, r8 = b_u ++ ++ ldr r5, [sp, #src_ur\@] ++ lsl r12, r3, #AVAIL_S_U_DL_CPSR ++ ldr r10, [sp, #src_l\@] ++ ldr r9, [sp, #stride\@] ++ ldr r6, [sp, #src_u\@] ++ ++ @ This is quite a slow instruction but it replaces ++ @ a decent number of tests that yield a max of 2 flags/op ++ @ It is annoying we can't branch on Q! ++ @ If L navail (ne) then DL must be navail (pl) ++ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur ++ ++ mov r4, r5 ++ sub r7, r10, r9 ++ it vs ++ movvs r4, r6 ++ add r8, r6, #b_size\@ - pw\@ ++ it cs ++ movcs r4, r7 ++ ite ne ++ movne r10, r4 ++ addeq r4, r7, r9, lsl #\log2_s ++ it cc ++ movcc r7, r10 ++ it mi ++ addmi r4, r10, r9, lsl #\log2_s ++ vld1.\d_type {\d_ul}, [r7] ++ itt vc ++ movvc r8, r7 ++ movvc r6, r7 ++ vld1.\d_type {\d_l }, [r4] ++ tst r3, #AVAIL_UR ++ vld1.\d_type {\d_u }, [r6] ++ it eq ++ moveq r5, r8 ++ and r7, r2, r3 ++ add r8, r4, r9 ++ vld1.\d_type {\d_ur}, [r5] ++ lsls r12, r7, #AVAIL_S_UR_N_U_C ++ add r3, r10, r9 ++ lsl r9, #1 ++.endm ++ ++ ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_8( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 0 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_8, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[] ++ ++ it cs ++ vldrcs s2, [r6] ++ ite pl ++ vmovpl s3, s4 ++ vldrmi s3, [r5] ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ add r12, r0, #-pw ++ bpl 1f ++ ++ vld1.8 {d0[0]}, [r10], r9 ++ vld1.8 {d0[1]}, [r3], r9 ++ vld1.8 {d0[2]}, [r10] ++ vld1.8 {d0[3]}, [r3] ++1: ++ bcc 1f ++ vld1.8 {d0[4]}, [r4], r9 ++ vld1.8 {d0[5]}, [r8], r9 ++ vld1.8 {d0[6]}, [r4] ++ vld1.8 {d0[7]}, [r8] ++1: ++ vstr d1, [r1] @ Up ++ vst1.8 {d31[7]}, [r12] ++ vstr d0, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_16( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_16, export=1 ++ push 
{r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[] ++ ++ it cs ++ vldrcs d2, [r6] ++ it mi ++ vldrmi d3, [r5] ++ lsls r7, #AVAIL_S_L_N_DL_C ++ add r12, r0, #-pw ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10] ++ vld1.16 {d0[3]}, [r3] ++1: ++ bcc 1f ++ vld1.16 {d1[0]}, [r4], r9 ++ vld1.16 {d1[1]}, [r8], r9 ++ vld1.16 {d1[2]}, [r4] ++ vld1.16 {d1[3]}, [r8] ++1: ++ vst1.16 {q1}, [r1] @ Up ++ vst1.16 {d31[3]}, [r12] ++ vst1.16 {q0}, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_8( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 0 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++ ++function ff_hevc_rpi_intra_filter_8_neon_8, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[] ++ ++ it cs ++ vldrcs d4, [r6] ++ it mi ++ vldrmi d5, [r5] ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ bpl 1f ++ vld1.8 {d0[0]}, [r10], r9 ++ vld1.8 {d0[1]}, [r3], r9 ++ vld1.8 {d0[2]}, [r10], r9 ++ vld1.8 {d0[3]}, [r3], r9 ++ vld1.8 {d0[4]}, [r10], r9 ++ vld1.8 {d0[5]}, [r3], r9 ++ vld1.8 {d0[6]}, [r10] ++ vld1.8 {d0[7]}, [r3] ++1: ++ bcc 1f ++ vld1.8 {d1[0]}, [r4], r9 ++ vld1.8 {d1[1]}, [r8], r9 ++ vld1.8 {d1[2]}, [r4], r9 ++ vld1.8 {d1[3]}, [r8], r9 ++ vld1.8 {d1[4]}, [r4], r9 ++ vld1.8 {d1[5]}, [r8], r9 ++ vld1.8 {d1[6]}, [r4] ++ vld1.8 {d1[7]}, [r8] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ @ Luma light filter ++ vext.8 q8, q15, q2, #15 ++ vext.8 q12, q15, q0, #15 ++ vaddl.u8 q9, d17, d5 ++ vaddl.u8 q8, d16, d4 ++ vaddl.u8 q13, d25, d1 ++ vaddl.u8 q12, d24, d0 ++ vmov.u8 r3, d5[7] @ Save final pel ++ vmov.u8 r2, d1[7] @ Save final pel ++ ++ vext.16 q2, q8, q9, #1 ++ vext.16 q3, q9, q9, #1 ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q13, #1 ++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q2, q8 ++ vadd.u16 q3, q9 ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ ++ vrshrn.u16 d4, q2, #2 ++ vrshrn.u16 d5, q3, #2 ++ vrshrn.u16 d0, q0, #2 ++ vrshrn.u16 d1, q1, #2 ++ vrshr.u16 d30, #2 ++ vmov.u8 d5[7], r3 @ Restore final pel ++ vmov.u8 d1[7], r2 @ Restore final pel ++ vdup.u8 d31, d30[0] @ d31[3] = d30[0] ++ ++10: ++ vst1.8 {q2 }, [r1] @ Up ++ vst1.8 {d31[7]}, [r12] @ Up-left ++ vst1.8 {q0 }, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_16( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_8_neon_16, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]" ++ ++ it cs ++ vldmcs r6, {d4, d5} 
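++ @ The UR fetch below, sketched in C (reference only):
++ @   if (req & avail & AVAIL_UR) {
++ @       load 8 pels from src_ur;      // frame width rounding makes
++ @                                     // the over-read safe (see above)
++ @       if (top_right_size <= 4)
++ @           ur[4..7] = ur[3];         // replicate last valid pel
++ @   }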
++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #4 ++ vldm r5, {d6, d7} ++ bgt 1f ++ vdup.16 d7, d6[3] ++1: ++ lsls r12, r7, #AVAIL_S_L_N_DL_C ++ vdup.16 q1, d0[0] ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10], r9 ++ vld1.16 {d0[3]}, [r3], r9 ++ vld1.16 {d1[0]}, [r10], r9 ++ vld1.16 {d1[1]}, [r3], r9 ++ vld1.16 {d1[2]}, [r10] ++ vld1.16 {d1[3]}, [r3] ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ vld1.16 {d2[0]}, [r4], r9 ++ vld1.16 {d2[1]}, [r8], r9 ++ cmp r12, #p_size ++ vld1.16 {d2[2]}, [r4], r9 ++ vld1.16 {d2[3]}, [r8], r9 ++ blt 2f ++ vld1.16 {d3[0]}, [r4], r9 ++ vld1.16 {d3[1]}, [r8], r9 ++ vld1.16 {d3[2]}, [r4] ++ vld1.16 {d3[3]}, [r8] ++ b 1f ++2: ++ vdup.16 d3, d2[3] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ @ Luma light filter ++ vext.16 q9, q2, q3, #7 ++ vext.16 q8, q15, q2, #7 ++ vext.16 q13, q0, q1, #7 ++ vext.16 q12, q15, q0, #7 ++ vadd.u16 q9, q3 ++ vadd.u16 q8, q2 ++ vadd.u16 q13, q1 ++ vadd.u16 q12, q0 ++ vmov.u16 r3, d7[3] @ Save final pel ++ vmov.u16 r2, d3[3] @ Save final pel ++ ++ vext.16 q2, q8, q9, #1 ++ vext.16 q3, q9, q9, #1 ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q13, #1 ++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q2, q8 ++ vadd.u16 q3, q9 ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ ++ vrshr.u16 q2, #2 ++ vrshr.u16 q3, #2 ++ vrshr.u16 q0, #2 ++ vrshr.u16 q1, #2 ++ vrshr.u16 d30, #2 ++ vmov.u16 d7[3], r3 @ Restore final pel ++ vmov.u16 d3[3], r2 @ Restore final pel ++ vdup.u16 d31, d30[0] @ d31[3] = d30[0] ++ ++10: ++ vst1.16 {q2, q3}, [r1] @ Up ++ vst1.16 {d31[3]}, [r12] @ Up-left ++ vst1.16 {q0, q1}, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++@ int ff_hevc_rpi_intra_filter_16_neon_16( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 4 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_16_neon_16, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]" ++ ++ vdup.16 q9, d16[0] ++ vdup.16 q11, d20[0] ++ ++ it cs ++ vldmcs r6, {d16-d19} ++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #12 ++ @ Given chroma frame layout, if UR exists then it is always legit to ++ @ load all of it even if most of it is outside the frame. 
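++ @ Roughly, with n = top_right_size (in pels, a multiple of 4 here):
++ @   if (n <= 4)  pels 4..7   = pel 3;
++ @   if (n <= 8)  pels 8..11  = pel 7;
++ @   if (n <= 12) pels 12..15 = pel 11;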
++ vldm r5, {d20-d23} ++ bgt 1f ++ bge 4f ++ cmp r5, #8 ++ bge 3f ++ vdup.16 d21, d20[3] ++3: vdup.16 d22, d21[3] ++4: vdup.16 d23, d22[3] ++ ++1: ++ lsls r7, #AVAIL_S_L_N_DL_C ++ ldr r12, [sp, #dl_size] ++ vdup.16 q1, d0[0] ++ vdup.16 q2, d0[0] ++ vdup.16 q3, d0[0] ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10], r9 ++ vld1.16 {d0[3]}, [r3], r9 ++ vld1.16 {d1[0]}, [r10], r9 ++ vld1.16 {d1[1]}, [r3], r9 ++ vld1.16 {d1[2]}, [r10], r9 ++ vld1.16 {d1[3]}, [r3], r9 ++ vld1.16 {d2[0]}, [r10], r9 ++ vld1.16 {d2[1]}, [r3], r9 ++ vld1.16 {d2[2]}, [r10], r9 ++ vld1.16 {d2[3]}, [r3], r9 ++ vld1.16 {d3[0]}, [r10], r9 ++ vld1.16 {d3[1]}, [r3], r9 ++ vld1.16 {d3[2]}, [r10] ++ vld1.16 {d3[3]}, [r3] ++1: ++ bcc 1f ++ vld1.16 {d4[0]}, [r4], r9 ++ vld1.16 {d4[1]}, [r8], r9 ++ cmp r12, #4 ++ vld1.16 {d4[2]}, [r4], r9 ++ vld1.16 {d4[3]}, [r8], r9 ++ ble 2f ++ vld1.16 {d5[0]}, [r4], r9 ++ vld1.16 {d5[1]}, [r8], r9 ++ cmp r12, #12 ++ vld1.16 {d5[2]}, [r4], r9 ++ vld1.16 {d5[3]}, [r8], r9 ++ blt 3f ++ vld1.16 {d6[0]}, [r4], r9 ++ vld1.16 {d6[1]}, [r8], r9 ++ vld1.16 {d6[2]}, [r4], r9 ++ vld1.16 {d6[3]}, [r8], r9 ++ ble 4f ++ vld1.16 {d7[0]}, [r4], r9 ++ vld1.16 {d7[1]}, [r8], r9 ++ vld1.16 {d7[2]}, [r4] ++ vld1.16 {d7[3]}, [r8] ++ b 1f ++2: vdup.16 d5, d4[3] ++3: vdup.16 d6, d5[3] ++4: vdup.16 d7, d6[3] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ vpush {q5} ++ @ Luma light filter ++ @ Left ++ vext.16 q5, q2, q3, #7 ++ vext.16 q14, q1, q2, #7 ++ vext.16 q13, q0, q1, #7 ++ vext.16 q12, q15, q0, #7 ++ ++ vadd.u16 q5, q3 ++ vadd.u16 q14, q2 ++ vadd.u16 q13, q1 ++ vadd.u16 q12, q0 ++ vmov.u16 r2, d7[3] @ Save final pel ++ ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q14, #1 ++ vext.16 q2, q14, q5, #1 ++ vext.16 q3, q5, q5, #1 ++ ++ vmov d30, d24 @ d30[0] = l[0] + ul ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ vadd.u16 q2, q14 ++ vadd.u16 q3, q5 ++ ++ vrshr.u16 q0, #2 ++ vrshr.u16 q1, #2 ++ vrshr.u16 q2, #2 ++ vrshr.u16 q3, #2 ++ ++ @ Up ++ vext.16 q5, q10, q11, #7 ++ vext.16 q14, q9, q10, #7 ++ vext.16 q13, q8, q9, #7 ++ vext.16 q12, q15, q8, #7 ++ ++ vadd.u16 q5, q11 ++ vadd.u16 q14, q10 ++ vadd.u16 q13, q9 ++ vadd.u16 q12, q8 ++ vmov.u16 r3, d23[3] @ Save final pel ++ ++ vext.16 q8, q12, q13, #1 ++ vext.16 q9, q13, q14, #1 ++ vext.16 q10, q14, q5, #1 ++ vext.16 q11, q5, q5, #1 ++ ++ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q8, q12 ++ vadd.u16 q9, q13 ++ vadd.u16 q10, q14 ++ vadd.u16 q11, q5 ++ ++ vrshr.u16 q8, #2 ++ vrshr.u16 q9, #2 ++ vrshr.u16 q10, #2 ++ vrshr.u16 q11, #2 ++ ++ @ Misc ++ vrshr.u16 d30, #2 ++ vmov.u16 d7[3], r2 @ Restore final pel ++ vmov.u16 d23[3], r3 @ Restore final pel ++ vdup.u16 d31, d30[0] @ d31[3] = d30[0] ++ vpop {q5} ++ ++10: ++ vstm r1, {d16-d23} @ Up ++ vst1.16 {d31[3]}, [r12] @ Up-left ++ vstm r0, { d0-d7 } @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], 
"d4[],d5[]", "d6[],d7[]" ++ ++ it cs ++ vldmcs r6, {d4, d5} ++ it mi ++ vldmmi r5, {d6, d7} ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, d0[0] ++ add r12, r0, #-pw ++ bpl 1f ++ vld1.32 {d0[0]}, [r10], r9 ++ vld1.32 {d0[1]}, [r3], r9 ++ vld1.32 {d1[0]}, [r10] ++ vld1.32 {d1[1]}, [r3] ++1: ++ bcc 1f ++ vld1.32 {d2[0]}, [r4], r9 ++ vld1.32 {d2[1]}, [r8], r9 ++ vld1.32 {d3[0]}, [r4] ++ vld1.32 {d3[1]}, [r8] ++1: ++ vst1.32 {q2, q3 }, [r1] @ Up ++ vst1.32 {d31[1]}, [r12] ++ vst1.32 {q0, q1 }, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_8_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]" ++ ++ vdup.32 q9, d16[0] ++ vdup.32 q11, d20[0] ++ ++ it cs ++ vldmcs r6, {q8, q9 } ++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #p_size ++ vldm r5, {q10, q11} ++ bge 1f ++ vdup.32 q11, d21[1] ++1: ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, d0[0] ++ vdup.32 q2, d0[0] ++ vdup.32 q3, d0[0] ++ bpl 1f ++ vld1.32 {d0[0]}, [r10], r9 ++ vld1.32 {d0[1]}, [r3], r9 ++ vld1.32 {d1[0]}, [r10], r9 ++ vld1.32 {d1[1]}, [r3], r9 ++ vld1.32 {d2[0]}, [r10], r9 ++ vld1.32 {d2[1]}, [r3], r9 ++ vld1.32 {d3[0]}, [r10] ++ vld1.32 {d3[1]}, [r3] ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ vld1.32 {d4[0]}, [r4], r9 ++ vld1.32 {d4[1]}, [r8], r9 ++ cmp r12, #p_size ++ vld1.32 {d5[0]}, [r4], r9 ++ vld1.32 {d5[1]}, [r8], r9 ++ blt 2f ++ vld1.32 {d6[0]}, [r4], r9 ++ vld1.32 {d6[1]}, [r8], r9 ++ vld1.32 {d7[0]}, [r4] ++ vld1.32 {d7[1]}, [r8] ++ b 1f ++2: ++ vdup.32 q3, d5[1] ++1: ++ add r12, r0, #-pw ++ vstm r1, { q8-q11} @ Up ++ vst1.32 {d31[1]}, [r12] ++ vstm r0, { q0-q3 } @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_16_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (bytes) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 4 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_16_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1] ++ ++ @ Once we get this big we have run out of neon regs to store ++ @ everything at once so do in pieces ++ ++ @ Up (have) ++ it cs ++ vldmcs r6, { q0-q3 } ++ ldr r12, [sp, #ur_size] ++ it mi ++ vldmmi r5, { q8-q11} ++ it cs ++ vstmcs r1, { q0-q3 } ++ bpl 1f ++ cmp r12, #12 ++ add lr, r1, #(pw << log2_s) ++ bgt 2f ++ cmp r12, #8 ++ bge 3f ++ vdup.16 q9, d17[1] ++4: vdup.16 d10, d19[1] ++3: vdup.16 q11, d21[1] ++2: vstm lr, { q8-q11} 
++1: ++ ++ @ Left (have) ++ add lr, r0, #-pw ++ lsls r12, r7, #AVAIL_S_L_N_DL_C ++ vst1.32 {d30[1]}, [lr] @ UL ++ bpl 1f ++ vld1.32 { d0[0]}, [r10], r9 ++ vld1.32 { d0[1]}, [r3], r9 ++ vld1.32 { d1[0]}, [r10], r9 ++ vld1.32 { d1[1]}, [r3], r9 ++ vld1.32 { d2[0]}, [r10], r9 ++ vld1.32 { d2[1]}, [r3], r9 ++ vld1.32 { d3[0]}, [r10], r9 ++ vld1.32 { d3[1]}, [r3], r9 ++ vld1.32 { d4[0]}, [r10], r9 ++ vld1.32 { d4[1]}, [r3], r9 ++ vld1.32 { d5[0]}, [r10], r9 ++ vld1.32 { d5[1]}, [r3], r9 ++ vld1.32 { d6[0]}, [r10], r9 ++ vld1.32 { d6[1]}, [r3], r9 ++ vld1.32 { d7[0]}, [r10] ++ vld1.32 { d7[1]}, [r3] ++ vstm r0, { q0-q3 } ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ add lr, r0, #(pw << log2_s) ++ vld1.32 {d16[0]}, [r4], r9 ++ vld1.32 {d16[1]}, [r8], r9 ++ cmp r12, #4 ++ vld1.32 {d17[0]}, [r4], r9 ++ vld1.32 {d17[1]}, [r8], r9 ++ ble 2f ++ vld1.32 {d18[0]}, [r4], r9 ++ vld1.32 {d18[1]}, [r8], r9 ++ cmp r12, #12 ++ vld1.32 {d19[0]}, [r4], r9 ++ vld1.32 {d19[1]}, [r8], r9 ++ blt 3f ++ vld1.32 {d20[0]}, [r4], r9 ++ vld1.32 {d20[1]}, [r8], r9 ++ vld1.32 {d21[0]}, [r4], r9 ++ vld1.32 {d21[1]}, [r8], r9 ++ ble 4f ++ vld1.32 {d22[0]}, [r4], r9 ++ vld1.32 {d22[1]}, [r8], r9 ++ vld1.32 {d23[0]}, [r4] ++ vld1.32 {d23[1]}, [r8] ++ b 5f ++2: vdup.32 q9, d17[1] ++3: vdup.32 q10, d19[1] ++4: vdup.32 q11, d21[1] ++5: vstm lr, { q8-q11} ++1: ++ eors r7, r2 ++ beq 99f ++ ++ lsls r12, r7, #AVAIL_S_UR_N_U_C ++ vdup.32 q0, d31[0] ++ vdup.32 q1, d31[0] ++ vdup.32 q2, d31[0] ++ vdup.32 q3, d31[0] ++ add lr, r1, #(pw << log2_s) ++ vdup.32 q8, d31[1] ++ vdup.32 q9, d31[1] ++ vdup.32 q10, d31[1] ++ vdup.32 q11, d31[1] ++ it cs ++ vstmcs r1, { q0-q3 } ++ it mi ++ vstmmi lr, { q8-q11} ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q0, d30[0] ++ vdup.32 q1, d30[0] ++ vdup.32 q2, d30[0] ++ vdup.32 q3, d30[0] ++ add lr, r0, #(pw << log2_s) ++ it mi ++ vstmmi r0, { q0-q3 } ++ it cs ++ vstmcs lr, { q0-q3 } ++ ++99: ++ pop {r4-r10, pc} ++endfunc ++ ++ ++ ++ diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S new file mode 100644 index 0000000000..ccf13a081f @@ -12940,10 +14124,10 @@ index d181b74570..c52c450956 100644 if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c new file mode 100644 -index 0000000000..4891a79eb5 +index 0000000000..f053ebcc59 --- /dev/null +++ b/libavcodec/rpi_hevc_cabac.c -@@ -0,0 +1,2269 @@ +@@ -0,0 +1,2266 @@ +/* + * HEVC CABAC decoding + * @@ -13831,9 +15015,9 @@ index 0000000000..4891a79eb5 + int x_cb = x0 >> s->ps.sps->log2_min_cb_size; + int y_cb = y0 >> s->ps.sps->log2_min_cb_size; + -+ if (lc->ctb_left_flag || x0b) ++ if ((lc->ctb_avail & AVAIL_L) != 0 || x0b) + depth_left = s->tab_ct_depth[(y_cb) * s->ps.sps->min_cb_width + x_cb - 1]; -+ if (lc->ctb_up_flag || y0b) ++ if ((lc->ctb_avail & AVAIL_U) != 0 || y0b) + depth_top = s->tab_ct_depth[(y_cb - 1) * s->ps.sps->min_cb_width + x_cb]; + + inc += (depth_left > ct_depth); @@ -14371,7 +15555,6 @@ index 0000000000..4891a79eb5 + + // Rewrite as add residual - must rewrite all fields as different union member + pc->type = RPI_PRED_ADD_RESIDUAL_V; -+ pc->c_idx = c_idx; + pc->ta.buf = coeffs; + pc->ta.dst = dst; + pc->ta.stride = stride; @@ -14384,7 +15567,6 @@ index 0000000000..4891a79eb5 + + cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? 
c_idx : 0); + cmd->size = log2_trafo_size; -+ cmd->c_idx = c_idx; + cmd->ta.buf = coeffs; + cmd->ta.dst = dst; + cmd->ta.stride = stride; @@ -14440,7 +15622,6 @@ index 0000000000..4891a79eb5 + + cmd->type = RPI_PRED_ADD_DC + c_idx; + cmd->size = log2_trafo_size; -+ cmd->c_idx = c_idx; + cmd->dc.dst = dst; + cmd->dc.stride = stride; + cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; @@ -15215,7 +16396,7 @@ index 0000000000..4891a79eb5 +#endif diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h new file mode 100644 -index 0000000000..a360815a36 +index 0000000000..f6daf936ca --- /dev/null +++ b/libavcodec/rpi_hevc_cabac_fns.h @@ -0,0 +1,190 @@ @@ -15349,9 +16530,9 @@ index 0000000000..a360815a36 + const uint8_t * const skip_bits = s->skip_flag + y_cb * stride; + + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG + -+ ((!lc->ctb_left_flag && (x0 & ctb_mask) == 0) ? 0 : ++ (((lc->ctb_avail & AVAIL_L) == 0 && (x0 & ctb_mask) == 0) ? 0 : + (skip_bits[((x_cb - 1) >> 3)] >> ((x_cb - 1) & 7)) & 1) + -+ ((!lc->ctb_up_flag && (y0 & ctb_mask) == 0) ? 0 : ++ (((lc->ctb_avail & AVAIL_U) == 0 && (y0 & ctb_mask) == 0) ? 0 : + (skip_bits[(x_cb >> 3) - stride] >> (x_cb & 7)) & 1)); +} + @@ -15529,10 +16710,10 @@ index 0000000000..0aee673d8b +#endif /* AVCODEC_RPI_HEVC_DATA_H */ diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c new file mode 100644 -index 0000000000..4bfa000da4 +index 0000000000..05d447eaa5 --- /dev/null +++ b/libavcodec/rpi_hevc_filter.c -@@ -0,0 +1,1236 @@ +@@ -0,0 +1,1210 @@ +/* + * HEVC video decoder + * @@ -15653,28 +16834,6 @@ index 0000000000..4bfa000da4 + return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; +} + -+static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src) -+{ -+int i, j; -+ -+ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) { -+ for (i = 0; i < height; i++) { -+ for (j = 0; j < width; j+=8) -+ AV_COPY64U(dst+j, src+j); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } else { -+ for (i = 0; i < height; i++) { -+ for (j = 0; j < width; j+=16) -+ AV_COPY128(dst+j, src+j); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } -+} -+ +// "DSP" these? +static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) +{ @@ -15914,7 +17073,7 @@ index 0000000000..4bfa000da4 + [2*MAX_PB_SIZE*MAX_PB_SIZE]; + dst = dstbuf; + stride_dst = 2*MAX_PB_SIZE; -+ copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); ++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); + if (sliced && c_idx != 0) + { + s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, @@ -16028,10 +17187,7 @@ index 0000000000..4bfa000da4 + } + } + -+ copy_CTB(dst, -+ src, -+ width << sh, -+ height, stride_dst, stride_src); ++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); + + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); @@ -16063,7 +17219,6 @@ index 0000000000..4bfa000da4 + horiz_edge, + diag_edge); + } -+ // ??? Does this actually work for chroma ??? 
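+ // restore_tqb_pixels() re-copies the source pels of transquant-bypass
+ // and PCM blocks (where the loop filter is disabled) over the SAO
+ // output; it is called per-plane on the same src/dst pair as above.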
+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, + x, y, width, height, c_idx); + sao->type_idx[c_idx] = SAO_APPLIED; @@ -16771,10 +17926,10 @@ index 0000000000..4bfa000da4 + diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c new file mode 100644 -index 0000000000..93a6294c76 +index 0000000000..f283f01489 --- /dev/null +++ b/libavcodec/rpi_hevc_mvs.c -@@ -0,0 +1,759 @@ +@@ -0,0 +1,704 @@ +/* + * HEVC video decoder + * @@ -16816,43 +17971,6 @@ index 0000000000..93a6294c76 + { 3, 2, }, +}; + -+void ff_hevc_rpi_set_neighbour_available(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, -+ const int nPbW, const int nPbH) -+{ -+ int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size); -+ int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size); -+ -+ lc->na.cand_up = (lc->ctb_up_flag || y0b); -+ lc->na.cand_left = (lc->ctb_left_flag || x0b); -+ lc->na.cand_up_left = (!x0b && !y0b) ? lc->ctb_up_left_flag : lc->na.cand_left && lc->na.cand_up; -+ lc->na.cand_up_right = (x0 + nPbW) >= lc->end_of_ctb_x ? -+ (lc->ctb_up_right_flag && !y0b) : lc->na.cand_up; -+ lc->na.cand_bottom_left = ((y0 + nPbH) >= lc->end_of_ctb_y) ? 0 : lc->na.cand_left; -+} -+ -+/* -+ * 6.4.1 Derivation process for z-scan order block availability -+ */ -+static av_always_inline int z_scan_block_avail(const HEVCRpiContext * const s, const int xCurr, const int yCurr, -+ const int xN, const int yN) -+{ -+#define MIN_TB_ADDR_ZS(x, y) \ -+ s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)] -+ -+ int xCurr_ctb = xCurr >> s->ps.sps->log2_ctb_size; -+ int yCurr_ctb = yCurr >> s->ps.sps->log2_ctb_size; -+ int xN_ctb = xN >> s->ps.sps->log2_ctb_size; -+ int yN_ctb = yN >> s->ps.sps->log2_ctb_size; -+ if( yN_ctb < yCurr_ctb || xN_ctb < xCurr_ctb ) -+ return 1; -+ else { -+ int Curr = MIN_TB_ADDR_ZS((xCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask, -+ (yCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask); -+ int N = MIN_TB_ADDR_ZS((xN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask, -+ (yN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask); -+ return N <= Curr; -+ } -+} + +//check if the two luma locations belong to the same motion estimation region +static av_always_inline int is_diff_mer(const HEVCRpiContext * const s, int xN, int yN, int xP, int yP) @@ -17042,9 +18160,6 @@ index 0000000000..93a6294c76 +#define AVAILABLE(cand, v) \ + (cand && !(TAB_MVF_PU(v).pred_flag == PF_INTRA)) + -+#define PRED_BLOCK_AVAILABLE(v) \ -+ z_scan_block_avail(s, x0, y0, x ## v, y ## v) -+ +#define COMPARE_MV_REFIDX(a, b) \ + compare_mv_ref_idx(TAB_MVF_PU(a), TAB_MVF_PU(b)) + @@ -17053,7 +18168,7 @@ index 0000000000..93a6294c76 + */ +static void derive_spatial_merge_candidates(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, + int nPbW, int nPbH, -+ int log2_cb_size, ++ int log2_cb_size, const unsigned int avail, + int singleMCLFlag, int part_idx, + int merge_idx, + struct MvField mergecandlist[]) @@ -17062,13 +18177,6 @@ index 0000000000..93a6294c76 + const MvField * const tab_mvf = s->ref->tab_mvf; + + const int min_pu_width = s->ps.sps->min_pu_width; -+ -+ const int cand_bottom_left = lc->na.cand_bottom_left; -+ const int cand_left = lc->na.cand_left; -+ const int cand_up_left = lc->na.cand_up_left; -+ const int cand_up = lc->na.cand_up; -+ const int cand_up_right = lc->na.cand_up_right; -+ + const int xA1 = x0 - 1; + const int yA1 = y0 + nPbH - 1; + @@ -17106,7 +18214,7 @@ index 0000000000..93a6294c76 + is_diff_mer(s, xA1, yA1, 
x0, y0)) { + is_available_a1 = 0; + } else { -+ is_available_a1 = AVAILABLE(cand_left, A1); ++ is_available_a1 = AVAILABLE((avail & AVAIL_L) != 0, A1); + if (is_available_a1) { + mergecandlist[nb_merge_cand] = TAB_MVF_PU(A1); + if (merge_idx == 0) @@ -17122,7 +18230,7 @@ index 0000000000..93a6294c76 + is_diff_mer(s, xB1, yB1, x0, y0)) { + is_available_b1 = 0; + } else { -+ is_available_b1 = AVAILABLE(cand_up, B1); ++ is_available_b1 = AVAILABLE((avail & AVAIL_U) != 0, B1); + if (is_available_b1 && + !(is_available_a1 && COMPARE_MV_REFIDX(B1, A1))) { + mergecandlist[nb_merge_cand] = TAB_MVF_PU(B1); @@ -17133,8 +18241,7 @@ index 0000000000..93a6294c76 + } + + // above right spatial merge candidate -+ is_available_b0 = AVAILABLE(cand_up_right, B0) && -+ PRED_BLOCK_AVAILABLE(B0) && ++ is_available_b0 = AVAILABLE((avail & AVAIL_UR) != 0, B0) && + !is_diff_mer(s, xB0, yB0, x0, y0); + + if (is_available_b0 && @@ -17146,8 +18253,7 @@ index 0000000000..93a6294c76 + } + + // left bottom spatial merge candidate -+ is_available_a0 = AVAILABLE(cand_bottom_left, A0) && -+ PRED_BLOCK_AVAILABLE(A0) && ++ is_available_a0 = AVAILABLE((avail & AVAIL_DL) != 0, A0) && + !is_diff_mer(s, xA0, yA0, x0, y0); + + if (is_available_a0 && @@ -17159,7 +18265,7 @@ index 0000000000..93a6294c76 + } + + // above left spatial merge candidate -+ is_available_b2 = AVAILABLE(cand_up_left, B2) && ++ is_available_b2 = AVAILABLE((avail & AVAIL_UL) != 0, B2) && + !is_diff_mer(s, xB2, yB2, x0, y0); + + if (is_available_b2 && @@ -17261,8 +18367,8 @@ index 0000000000..93a6294c76 + part_idx = 0; + } + -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, nPbW, nPbH); + derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH), + singleMCLFlag, part_idx, + merge_idx, mergecand_list); + @@ -17344,8 +18450,9 @@ index 0000000000..93a6294c76 + (y ## v) >> s->ps.sps->log2_min_pu_size, \ + pred, &mx, ref_idx_curr, ref_idx) + -+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, int x0, int y0, int nPbW, -+ int nPbH, int log2_cb_size, int part_idx, ++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, ++ int x0, int y0, int nPbW, int nPbH, ++ int log2_cb_size, const unsigned int avail, int part_idx, + int merge_idx, MvField * const mv, + int mvp_lx_flag, int LX) +{ @@ -17375,11 +18482,6 @@ index 0000000000..93a6294c76 + int pred_flag_index_l0; + int pred_flag_index_l1; + -+ const int cand_bottom_left = lc->na.cand_bottom_left; -+ const int cand_left = lc->na.cand_left; -+ const int cand_up_left = lc->na.cand_up_left; -+ const int cand_up = lc->na.cand_up; -+ const int cand_up_right = lc->na.cand_up_right; + ref_idx_curr = LX; + ref_idx = mv->ref_idx[LX]; + pred_flag_index_l0 = LX; @@ -17389,14 +18491,13 @@ index 0000000000..93a6294c76 + xA0 = x0 - 1; + yA0 = y0 + nPbH; + -+ is_available_a0 = AVAILABLE(cand_bottom_left, A0) && -+ PRED_BLOCK_AVAILABLE(A0); ++ is_available_a0 = AVAILABLE((avail & AVAIL_DL) != 0, A0); + + //left spatial merge candidate + xA1 = x0 - 1; + yA1 = y0 + nPbH - 1; + -+ is_available_a1 = AVAILABLE(cand_left, A1); ++ is_available_a1 = AVAILABLE((avail & AVAIL_L), A1); + if (is_available_a0 || is_available_a1) + isScaledFlag_L0 = 1; + @@ -17443,18 +18544,17 @@ index 0000000000..93a6294c76 + xB0 = x0 + nPbW; + yB0 = y0 - 1; + -+ is_available_b0 = AVAILABLE(cand_up_right, B0) && -+ PRED_BLOCK_AVAILABLE(B0); ++ is_available_b0 = AVAILABLE((avail & AVAIL_UR) != 0, B0); + + // 
above spatial merge candidate + xB1 = x0 + nPbW - 1; + yB1 = y0 - 1; -+ is_available_b1 = AVAILABLE(cand_up, B1); ++ is_available_b1 = AVAILABLE((avail & AVAIL_U) != 0, B1); + + // above left spatial merge candidate + xB2 = x0 - 1; + yB2 = y0 - 1; -+ is_available_b2 = AVAILABLE(cand_up_left, B2); ++ is_available_b2 = AVAILABLE((avail & AVAIL_UL) != 0, B2); + + // above right spatial merge candidate + if (is_available_b0) { @@ -17726,10 +18826,10 @@ index 0000000000..4b4d032a16 +#endif /* AVCODEC_RPI_HEVC_PARSE_H */ diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c new file mode 100644 -index 0000000000..744e7cf248 +index 0000000000..4967b3f44c --- /dev/null +++ b/libavcodec/rpi_hevc_ps.c -@@ -0,0 +1,1957 @@ +@@ -0,0 +1,1934 @@ +/* + * HEVC Parameter Set decoding + * @@ -19107,7 +20207,6 @@ index 0000000000..744e7cf248 + av_freep(&pps->tile_size); + av_freep(&pps->tile_id); + av_freep(&pps->ctb_ts_flags); -+ av_freep(&pps->min_tb_addr_zs_tab); + + av_freep(&pps); +} @@ -19172,7 +20271,6 @@ index 0000000000..744e7cf248 +static inline int setup_pps(AVCodecContext * const avctx, + HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) +{ -+ int log2_diff; + int pic_area_in_ctbs; + int i, j, x, y, ctb_addr_rs, tile_id; + @@ -19276,9 +20374,8 @@ index 0000000000..744e7cf248 + pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size)); + pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts)); + pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags)); -+ pps->min_tb_addr_zs_tab = av_malloc_array((sps->tb_mask+2) * (sps->tb_mask+2), sizeof(*pps->min_tb_addr_zs_tab)); + if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs || -+ !pps->tile_id || !pps->min_tb_addr_zs_tab || pps->tile_pos_ts == NULL || pps->tile_size == NULL) { ++ !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) { + return AVERROR(ENOMEM); + } + @@ -19374,26 +20471,6 @@ index 0000000000..744e7cf248 + } + } + -+ log2_diff = sps->log2_ctb_size - sps->log2_min_tb_size; -+ pps->min_tb_addr_zs = &pps->min_tb_addr_zs_tab[1*(sps->tb_mask+2)+1]; -+ for (y = 0; y < sps->tb_mask+2; y++) { -+ pps->min_tb_addr_zs_tab[y*(sps->tb_mask+2)] = -1; -+ pps->min_tb_addr_zs_tab[y] = -1; -+ } -+ for (y = 0; y < sps->tb_mask+1; y++) { -+ for (x = 0; x < sps->tb_mask+1; x++) { -+ int tb_x = x >> log2_diff; -+ int tb_y = y >> log2_diff; -+ int rs = sps->ctb_width * tb_y + tb_x; -+ int val = pps->ctb_addr_rs_to_ts[rs] << (log2_diff * 2); -+ for (i = 0; i < log2_diff; i++) { -+ int m = 1 << i; -+ val += (m & x ? m * m : 0) + (m & y ? 
2 * m * m : 0); -+ } -+ pps->min_tb_addr_zs[y * (sps->tb_mask+2) + x] = val; -+ } -+ } -+ + return 0; +} + @@ -19689,10 +20766,10 @@ index 0000000000..744e7cf248 +} diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h new file mode 100644 -index 0000000000..00c1f14614 +index 0000000000..77af463e31 --- /dev/null +++ b/libavcodec/rpi_hevc_ps.h -@@ -0,0 +1,444 @@ +@@ -0,0 +1,442 @@ +/* + * HEVC parameter set parsing + * @@ -20099,8 +21176,6 @@ index 0000000000..00c1f14614 + uint16_t *tile_id; ///< TileId + uint16_t *tile_pos_ts; ///< TilePosRS + uint16_t *tile_size; ///< TileSize -+ int *min_tb_addr_zs; ///< MinTbAddrZS -+ int *min_tb_addr_zs_tab;///< MinTbAddrZS + uint8_t * ctb_ts_flags; + + uint8_t data[4096]; @@ -20108,14 +21183,14 @@ index 0000000000..00c1f14614 +} HEVCRpiPPS; + +typedef struct HEVCRpiParamSets { -+ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT]; -+ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT]; -+ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT]; -+ + /* currently active parameter sets */ + const HEVCRpiVPS *vps; + const HEVCRpiSPS *sps; + const HEVCRpiPPS *pps; ++ ++ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT]; ++ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT]; ++ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT]; +} HEVCRpiParamSets; + +int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx, @@ -25730,210 +26805,234 @@ index 0000000000..3caef20137 + diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h new file mode 100644 -index 0000000000..1c364492d0 +index 0000000000..18128f4311 --- /dev/null +++ b/libavcodec/rpi_hevc_transform10.h -@@ -0,0 +1,94 @@ +@@ -0,0 +1,106 @@ +static const unsigned char rpi_hevc_transform10 [] = { -+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 -+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 -+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 -+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 -+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 -+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 -+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030 -+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 -+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 -+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 -+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 -+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 -+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 -+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 -+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 -+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 -+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 -+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 -+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090 -+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 -+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 -+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 -+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 -+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 -+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 -+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 -+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 -+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 -+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 -+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 -+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 -+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 
00f8 -+0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 -+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 -+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 -+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 -+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 -+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 -+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 -+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 -+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 -+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 -+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 -+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 -+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 -+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 -+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 -+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 -+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 -+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 -+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 -+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 -+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 -+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 -+0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8 -+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 -+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 -+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 -+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 -+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 -+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 -+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 -+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 -+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 -+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 -+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 -+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 -+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 -+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 -+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 -+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 -+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 -+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 -+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 -+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 -+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 -+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 -+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 -+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 -+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 -+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 -+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 -+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 -+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 ++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xe8, // 0000 ++0x20, 
0x00, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0x88, // 0008 ++0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x40, 0xe8, // 0010 ++0x00, 0x02, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0xa8, // 0018 ++0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x00, 0x60, // 0020 ++0x03, 0xe8, 0x20, 0x00, 0x00, 0x00, 0x07, 0xe8, // 0028 ++0x00, 0x02, 0x00, 0x00, 0x08, 0xe8, 0x00, 0x04, // 0030 ++0x00, 0x00, 0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, // 0038 ++0x05, 0xe8, 0x00, 0x02, 0x00, 0x00, 0x39, 0xef, // 0040 ++0xc0, 0xfd, 0xff, 0xff, 0x2b, 0xef, 0x40, 0x00, // 0048 ++0x00, 0x00, 0x5b, 0x7a, 0x5b, 0x7c, 0x4a, 0xc3, // 0050 ++0x50, 0x17, 0x02, 0x6f, 0x02, 0x6a, 0x32, 0x18, // 0058 ++0x0a, 0x6a, 0x16, 0x40, 0x04, 0x18, 0x1a, 0x66, // 0060 ++0x80, 0x90, 0x33, 0x00, 0x0c, 0xf8, 0x00, 0x80, // 0068 ++0x00, 0x00, 0xc0, 0x08, 0x18, 0x00, 0x80, 0x90, // 0070 ++0x5e, 0x00, 0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, // 0078 ++0x20, 0x08, 0x10, 0x00, 0x4c, 0xfe, 0x30, 0xc0, // 0080 ++0x09, 0x04, 0x20, 0x08, 0x00, 0x00, 0x04, 0xfe, // 0088 ++0x00, 0x90, 0x80, 0x02, 0x00, 0x08, 0x02, 0x00, // 0090 ++0x80, 0x90, 0x4d, 0x00, 0x04, 0xff, 0x30, 0xc0, // 0098 ++0x80, 0x03, 0x20, 0x08, 0x14, 0x00, 0x4c, 0xfe, // 00a0 ++0x30, 0xc0, 0x06, 0x04, 0x20, 0x08, 0x00, 0x00, // 00a8 ++0x8c, 0xf8, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x30, // 00b0 ++0x04, 0x00, 0x80, 0x45, 0x71, 0x42, 0xf2, 0x8c, // 00b8 ++0xd1, 0xc0, 0x39, 0xef, 0x40, 0x02, 0x00, 0x00, // 00c0 ++0x00, 0x9e, 0x7f, 0x00, 0x29, 0x03, 0x00, 0xfe, // 00c8 ++0x00, 0x80, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, // 00d0 ++0xb6, 0x40, 0x8c, 0xf8, 0x20, 0x00, 0x00, 0x00, // 00d8 ++0x00, 0x30, 0x18, 0x00, 0x15, 0x40, 0x08, 0xf8, // 00e0 ++0x00, 0x80, 0x00, 0x00, 0xc0, 0x03, 0x14, 0x00, // 00e8 ++0x66, 0xed, 0xe0, 0xff, 0xff, 0xff, 0x88, 0xf8, // 00f0 ++0x20, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x18, 0x00, // 00f8 ++0x0c, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 0100 ++0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0x84, 0x6e, // 0108 ++0x09, 0x18, 0x69, 0xa0, 0x04, 0x5f, 0x1c, 0x8b, // 0110 ++0xf6, 0xc8, 0x45, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0118 ++0x63, 0x1f, 0xb6, 0x40, 0x04, 0xe8, 0x40, 0x00, // 0120 ++0x00, 0x00, 0x05, 0xe8, 0x00, 0x02, 0x00, 0x00, // 0128 ++0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0130 ++0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0138 ++0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0140 ++0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0148 ++0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0150 ++0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0158 ++0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0160 ++0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0168 ++0x5a, 0x00, 0x00, 0xf6, 0x00, 0x80, 0x00, 0x04, // 0170 ++0x20, 0xed, 0x00, 0x08, 0x00, 0x00, 0x04, 0xe8, // 0178 ++0x20, 0x00, 0x00, 0x00, 0x8e, 0xf8, 0x20, 0x00, // 0180 ++0x00, 0x00, 0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, // 0188 ++0x00, 0x80, 0x81, 0x03, 0x26, 0xed, 0xe0, 0xff, // 0190 ++0xff, 0xff, 0x88, 0xf0, 0x20, 0x00, 0x86, 0x03, // 0198 ++0x08, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 01a0 ++0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0xa4, 0x6e, // 01a8 ++0x7f, 0x90, 0xb9, 0xff, 0x65, 0xa0, 0x04, 0x07, // 01b0 ++0x18, 0x8b, 0xf5, 0xc8, 0x41, 0xe8, 0x20, 0x00, // 01b8 ++0x00, 0x00, 0x66, 0x1f, 0x5a, 0x00, 0xe1, 0x40, // 01c0 ++0xf2, 0x40, 0x4f, 0xc3, 0x50, 0x7f, 0x02, 0x6f, // 01c8 ++0x03, 0xe8, 0x80, 0x00, 0x00, 0x00, 0x07, 0xe8, // 01d0 ++0x00, 0x02, 0x00, 0x00, 0xe8, 0x00, 0x08, 0x6d, // 01d8 ++0xe8, 0xbf, 0x80, 0x01, 0x04, 0x18, 0x08, 0xed, // 01e0 ++0x20, 0x10, 0x00, 0x00, 0x89, 0x40, 0x1a, 0x40, // 01e8 ++0x02, 0x6a, 0x2e, 
0x18, 0xa1, 0x40, 0x98, 0x40, // 01f0 ++0xf2, 0x4a, 0x07, 0x1e, 0xff, 0x9f, 0xbb, 0xff, // 01f8 ++0x21, 0xed, 0x00, 0x08, 0x00, 0x00, 0x98, 0x40, // 0200 ++0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, 0x95, 0x60, // 0208 ++0x80, 0x90, 0x20, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0210 ++0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0218 ++0x80, 0x90, 0x18, 0x00, 0x04, 0xe8, 0x00, 0x02, // 0220 ++0x00, 0x00, 0x65, 0x60, 0x91, 0x40, 0xa8, 0x40, // 0228 ++0x80, 0x90, 0x10, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0230 ++0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0238 ++0x80, 0x90, 0x08, 0x00, 0x4a, 0xe8, 0x00, 0x08, // 0240 ++0x00, 0x00, 0xf2, 0x8c, 0xd5, 0xc0, 0x29, 0x03, // 0248 ++0xef, 0x03, 0x0c, 0xf8, 0x00, 0x80, 0x00, 0x00, // 0250 ++0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, 0x00, 0x84, // 0258 ++0x40, 0x00, 0xc0, 0xf8, 0x04, 0x00, 0x00, 0x60, // 0260 ++0xff, 0x9f, 0x65, 0xff, 0x00, 0xe8, 0x00, 0x04, // 0268 ++0x00, 0x00, 0xff, 0x9f, 0x70, 0xff, 0x04, 0xff, // 0270 ++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0278 ++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0280 ++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0288 ++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xe8, // 0290 ++0x40, 0x00, 0x00, 0x00, 0x8c, 0xf8, 0x2f, 0x00, // 0298 ++0x00, 0x00, 0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, // 02a0 ++0xf0, 0xcf, 0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, // 02a8 ++0x11, 0x13, 0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, // 02b0 ++0x20, 0xf7, 0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, // 02b8 ++0xf0, 0xce, 0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, // 02c0 ++0x15, 0x53, 0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, // 02c8 ++0x20, 0xf7, 0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, // 02d0 ++0xf0, 0xcd, 0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, // 02d8 ++0x19, 0x93, 0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, // 02e0 ++0x20, 0xf7, 0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, // 02e8 ++0xf0, 0xcc, 0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, // 02f0 ++0x1d, 0xd3, 0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, // 02f8 ++0x20, 0xf7, 0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, // 0300 ++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, // 0308 ++0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0310 ++0x14, 0x00, 0x00, 0xed, 0x20, 0x00, 0x00, 0x00, // 0318 ++0x8c, 0xf8, 0x2f, 0x00, 0x00, 0x00, 0xe0, 0x63, // 0320 ++0x00, 0x00, 0x6f, 0x03, 0x00, 0x00, 0x00, 0x00, // 0328 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0330 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0338 +}; diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h new file mode 100644 -index 0000000000..1128a2c054 +index 0000000000..3557348e30 --- /dev/null +++ b/libavcodec/rpi_hevc_transform8.h -@@ -0,0 +1,94 @@ +@@ -0,0 +1,106 @@ +static const unsigned char rpi_hevc_transform8 [] = { -+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 -+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 -+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 -+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 -+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 -+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 -+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030 -+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 -+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 -+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 -+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 -+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 -+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 -+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 -+0x4c, 0xfe, 
0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 -+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 -+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 -+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 -+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090 -+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 -+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 -+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 -+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 -+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 -+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 -+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 -+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 -+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 -+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 -+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 -+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 -+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 -+0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 -+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 -+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 -+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 -+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 -+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 -+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 -+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 -+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 -+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 -+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 -+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 -+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 -+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 -+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 -+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 -+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 -+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 -+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 -+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 -+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 -+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 -+0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8 -+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 -+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 -+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 -+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 -+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 -+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 -+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 -+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 -+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 -+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 -+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 -+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 -+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 -+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 -+0x20, 0xf7, 0x70, 0xcf, 
0x12, 0x23, 0x20, 0xf7, // 0258 -+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 -+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 -+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 -+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 -+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 -+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 -+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 -+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 -+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 -+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 -+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 -+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 -+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 -+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 ++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xe8, // 0000 ++0x20, 0x00, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0x88, // 0008 ++0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x40, 0xe8, // 0010 ++0x00, 0x02, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0xa8, // 0018 ++0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x00, 0x60, // 0020 ++0x03, 0xe8, 0x20, 0x00, 0x00, 0x00, 0x07, 0xe8, // 0028 ++0x00, 0x02, 0x00, 0x00, 0x08, 0xe8, 0x00, 0x04, // 0030 ++0x00, 0x00, 0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, // 0038 ++0x05, 0xe8, 0x00, 0x08, 0x00, 0x00, 0x39, 0xef, // 0040 ++0xc0, 0xfd, 0xff, 0xff, 0x2b, 0xef, 0x40, 0x00, // 0048 ++0x00, 0x00, 0x5b, 0x7a, 0x5b, 0x7c, 0x4a, 0xc3, // 0050 ++0x50, 0x17, 0x02, 0x6f, 0x02, 0x6a, 0x32, 0x18, // 0058 ++0x0a, 0x6a, 0x16, 0x40, 0x04, 0x18, 0x1a, 0x66, // 0060 ++0x80, 0x90, 0x33, 0x00, 0x0c, 0xf8, 0x00, 0x80, // 0068 ++0x00, 0x00, 0xc0, 0x08, 0x18, 0x00, 0x80, 0x90, // 0070 ++0x5e, 0x00, 0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, // 0078 ++0x20, 0x08, 0x10, 0x00, 0x4c, 0xfe, 0x30, 0xc0, // 0080 ++0x09, 0x04, 0x20, 0x08, 0x00, 0x00, 0x04, 0xfe, // 0088 ++0x00, 0x90, 0x80, 0x02, 0x00, 0x08, 0x02, 0x00, // 0090 ++0x80, 0x90, 0x4d, 0x00, 0x04, 0xff, 0x30, 0xc0, // 0098 ++0x80, 0x03, 0x20, 0x08, 0x14, 0x00, 0x4c, 0xfe, // 00a0 ++0x30, 0xc0, 0x04, 0x04, 0x20, 0x08, 0x00, 0x00, // 00a8 ++0x8c, 0xf8, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x30, // 00b0 ++0x04, 0x00, 0x80, 0x45, 0x71, 0x42, 0xf2, 0x8c, // 00b8 ++0xd1, 0xc0, 0x39, 0xef, 0x40, 0x02, 0x00, 0x00, // 00c0 ++0x00, 0x9e, 0x7f, 0x00, 0x29, 0x03, 0x00, 0xfe, // 00c8 ++0x00, 0x80, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, // 00d0 ++0xb6, 0x40, 0x8c, 0xf8, 0x20, 0x00, 0x00, 0x00, // 00d8 ++0x00, 0x30, 0x18, 0x00, 0x15, 0x40, 0x08, 0xf8, // 00e0 ++0x00, 0x80, 0x00, 0x00, 0xc0, 0x03, 0x14, 0x00, // 00e8 ++0x66, 0xed, 0xe0, 0xff, 0xff, 0xff, 0x88, 0xf8, // 00f0 ++0x20, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x18, 0x00, // 00f8 ++0x0c, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 0100 ++0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0x84, 0x6e, // 0108 ++0x09, 0x18, 0x69, 0xa0, 0x04, 0x5f, 0x1c, 0x8b, // 0110 ++0xf6, 0xc8, 0x45, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0118 ++0x63, 0x1f, 0xb6, 0x40, 0x04, 0xe8, 0x40, 0x00, // 0120 ++0x00, 0x00, 0x05, 0xe8, 0x00, 0x08, 0x00, 0x00, // 0128 ++0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0130 ++0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0138 ++0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0140 ++0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0148 ++0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0150 ++0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0158 ++0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 
0xe0, 0x03, // 0160 ++0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0168 ++0x5a, 0x00, 0x00, 0xf6, 0x00, 0x80, 0x00, 0x04, // 0170 ++0x20, 0xed, 0x00, 0x08, 0x00, 0x00, 0x04, 0xe8, // 0178 ++0x20, 0x00, 0x00, 0x00, 0x8e, 0xf8, 0x20, 0x00, // 0180 ++0x00, 0x00, 0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, // 0188 ++0x00, 0x80, 0x81, 0x03, 0x26, 0xed, 0xe0, 0xff, // 0190 ++0xff, 0xff, 0x88, 0xf0, 0x20, 0x00, 0x86, 0x03, // 0198 ++0x08, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 01a0 ++0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0xa4, 0x6e, // 01a8 ++0x7f, 0x90, 0xb9, 0xff, 0x65, 0xa0, 0x04, 0x07, // 01b0 ++0x18, 0x8b, 0xf5, 0xc8, 0x41, 0xe8, 0x20, 0x00, // 01b8 ++0x00, 0x00, 0x66, 0x1f, 0x5a, 0x00, 0xe1, 0x40, // 01c0 ++0xf2, 0x40, 0x4f, 0xc3, 0x50, 0x7f, 0x02, 0x6f, // 01c8 ++0x03, 0xe8, 0x80, 0x00, 0x00, 0x00, 0x07, 0xe8, // 01d0 ++0x00, 0x02, 0x00, 0x00, 0xe8, 0x00, 0x08, 0x6d, // 01d8 ++0xe8, 0xbf, 0x80, 0x01, 0x04, 0x18, 0x08, 0xed, // 01e0 ++0x20, 0x10, 0x00, 0x00, 0x89, 0x40, 0x1a, 0x40, // 01e8 ++0x02, 0x6a, 0x2e, 0x18, 0xa1, 0x40, 0x98, 0x40, // 01f0 ++0xf2, 0x4a, 0x07, 0x1e, 0xff, 0x9f, 0xbb, 0xff, // 01f8 ++0x21, 0xed, 0x00, 0x08, 0x00, 0x00, 0x98, 0x40, // 0200 ++0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, 0x95, 0x60, // 0208 ++0x80, 0x90, 0x20, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0210 ++0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0218 ++0x80, 0x90, 0x18, 0x00, 0x04, 0xe8, 0x00, 0x08, // 0220 ++0x00, 0x00, 0x45, 0x60, 0x91, 0x40, 0xa8, 0x40, // 0228 ++0x80, 0x90, 0x10, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0230 ++0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0238 ++0x80, 0x90, 0x08, 0x00, 0x4a, 0xe8, 0x00, 0x08, // 0240 ++0x00, 0x00, 0xf2, 0x8c, 0xd5, 0xc0, 0x29, 0x03, // 0248 ++0xef, 0x03, 0x0c, 0xf8, 0x00, 0x80, 0x00, 0x00, // 0250 ++0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, 0x00, 0x84, // 0258 ++0x40, 0x00, 0xc0, 0xf8, 0x04, 0x00, 0x00, 0x60, // 0260 ++0xff, 0x9f, 0x65, 0xff, 0x00, 0xe8, 0x00, 0x04, // 0268 ++0x00, 0x00, 0xff, 0x9f, 0x70, 0xff, 0x04, 0xff, // 0270 ++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0278 ++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0280 ++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0288 ++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xe8, // 0290 ++0x40, 0x00, 0x00, 0x00, 0x8c, 0xf8, 0x2f, 0x00, // 0298 ++0x00, 0x00, 0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, // 02a0 ++0xf0, 0xcf, 0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, // 02a8 ++0x11, 0x13, 0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, // 02b0 ++0x20, 0xf7, 0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, // 02b8 ++0xf0, 0xce, 0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, // 02c0 ++0x15, 0x53, 0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, // 02c8 ++0x20, 0xf7, 0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, // 02d0 ++0xf0, 0xcd, 0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, // 02d8 ++0x19, 0x93, 0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, // 02e0 ++0x20, 0xf7, 0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, // 02e8 ++0xf0, 0xcc, 0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, // 02f0 ++0x1d, 0xd3, 0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, // 02f8 ++0x20, 0xf7, 0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, // 0300 ++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, // 0308 ++0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0310 ++0x14, 0x00, 0x00, 0xed, 0x20, 0x00, 0x00, 0x00, // 0318 ++0x8c, 0xf8, 0x2f, 0x00, 0x00, 0x00, 0xe0, 0x63, // 0320 ++0x00, 0x00, 0x6f, 0x03, 0x00, 0x00, 0x00, 0x00, // 0328 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0330 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0338 +}; diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c new file mode 100644 -index 
0000000000..bddf0c3417 +index 0000000000..7c98f707d3 --- /dev/null +++ b/libavcodec/rpi_hevcdec.c -@@ -0,0 +1,5782 @@ +@@ -0,0 +1,5850 @@ +/* + * HEVC video Decoder + * @@ -27742,7 +28841,7 @@ index 0000000000..bddf0c3417 + + if (s->sh.slice_sample_adaptive_offset_flag[0] || + s->sh.slice_sample_adaptive_offset_flag[1]) { -+ if (lc->ctb_left_flag) ++ if ((lc->ctb_avail & AVAIL_L) != 0) + { + const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); + if (sao_merge_left_flag) { @@ -27750,7 +28849,7 @@ index 0000000000..bddf0c3417 + return; + } + } -+ if (lc->ctb_up_flag) ++ if ((lc->ctb_avail & AVAIL_U) != 0) + { + const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); + if (sao_merge_up_flag) { @@ -27832,19 +28931,97 @@ index 0000000000..bddf0c3417 + return jb->intra.cmds + jb->intra.n++; +} + -+static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx) ++#define A0(x, y, U, L, UL, UR, DL) \ ++ [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? AVAIL_DL : 0)) ++ ++#define A1(x, y, U, L, UL, UR, DL) \ ++ A0((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A0((x) + 1, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A0((x) + 0, (y) + 1, 1, (L), (L), 1, (DL)), A0((x) + 1, (y) + 1, 1, 1, 1, 0, 0 ) ++ ++#define A2(x, y, U, L, UL, UR, DL) \ ++ A1((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A1((x) + 2, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A1((x) + 0, (y) + 2, 1, (L), (L), 1, (DL)), A1((x) + 2, (y) + 2, 1, 1, 1, 0, 0 ) ++ ++#define A3(x, y, U, L, UL, UR, DL) \ ++ A2((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A2((x) + 4, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A2((x) + 0, (y) + 4, 1, (L), (L), 1, (DL)), A2((x) + 4, (y) + 4, 1, 1, 1, 0, 0 ) ++ ++#define A4(x, y, U, L, UL, UR, DL) \ ++ A3((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A3((x) + 8, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A3((x) + 0, (y) + 8, 1, (L), (L), 1, (DL)), A3((x) + 8, (y) + 8, 1, 1, 1, 0, 0 ) ++ ++static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)}; ++ ++unsigned int ff_hevc_rpi_tb_avail_flags( ++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h) ++{ ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ const unsigned int ctb_mask = ctb_size - 1; ++ const unsigned int tb_x = x & ctb_mask; ++ const unsigned int tb_y = y & ctb_mask; ++ ++ const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16; ++ ++ unsigned int f = (lc->ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL); ++ ++ if ((tb_x != 0 || tb_y != 0) && (~f & (AVAIL_L | AVAIL_U)) == 0) ++ f |= AVAIL_UL; ++ ++ ++ if (x + w >= lc->end_of_ctb_x) ++ { ++ if (tb_y == 0) ++ f |= (lc->ctb_avail & AVAIL_UR); ++ } ++ else ++ { ++ f |= (tb_y != 0) ? 
(tb_f[(w - 1) >> 2] & AVAIL_UR) : (lc->ctb_avail >> (AVAIL_S_U - AVAIL_S_UR)) & AVAIL_UR; ++ } ++#if AVAIL_S_U - AVAIL_S_UR < 0 ++#error Shift problem ++#endif ++ ++ // Never any D if Y beyond eoctb ++ if (y + h < lc->end_of_ctb_y) ++ { ++ if (tb_x == 0) ++ f |= (lc->ctb_avail << (AVAIL_S_DL - AVAIL_S_L)) & AVAIL_DL; ++ else ++ f |= tb_f[((h - 1) >> 2) * 16] & AVAIL_DL; ++ } ++#if AVAIL_S_DL - AVAIL_S_L < 0 ++#error Shift problem ++#endif ++ ++// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h, ++// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16], ++// lc->end_of_ctb_x, lc->end_of_ctb_y); ++ ++ return f; ++} ++ ++#undef A0 ++#undef A1 ++#undef A2 ++#undef A3 ++#undef A4 ++ ++static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx, ++ unsigned int avail) +{ + // If rpi_enabled then sand - U & V done on U call + if (c_idx <= 1) + { + HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); -+ cmd->type = RPI_PRED_INTRA; ++ cmd->type = RPI_PRED_INTRA + c_idx; + cmd->size = log2_trafo_size; -+ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; -+ cmd->c_idx = c_idx; ++ cmd->avail = avail; + cmd->i_pred.x = x0; + cmd->i_pred.y = y0; + cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ ++// printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail); + } +} + @@ -27872,8 +29049,8 @@ index 0000000000..bddf0c3417 + + if (lc->cu.pred_mode == MODE_INTRA) { + const unsigned int trafo_size = 1 << log2_trafo_size; -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, trafo_size, trafo_size); -+ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0); ++ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size)); + } + + if (cbf_luma || cbf_chroma != 0) @@ -27940,6 +29117,8 @@ index 0000000000..bddf0c3417 + + if (cbf_luma) + ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0); ++ ++ + if (ctx_cfmt(s) != 0 && (log2_trafo_size > 2 || ctx_cfmt(s) == 3)) { + const int trafo_size_h = 1 << (log2_trafo_size_c + ctx_hshift(s, 1)); + const int trafo_size_v = 1 << (log2_trafo_size_c + ctx_vshift(s, 1)); @@ -27952,8 +29131,8 @@ index 0000000000..bddf0c3417 + } + for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) { + if (lc->cu.pred_mode == MODE_INTRA) { -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1); ++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v)); + } + if (((cbf_chroma >> i) & CBF_CB0) != 0) + ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c), @@ -27979,10 +29158,10 @@ index 0000000000..bddf0c3417 + hls_cross_component_pred(lc, 1); + } + for (i = 0; i < (ctx_cfmt(s) == 2 ? 
2 : 1); i++) { -+ if (lc->cu.pred_mode == MODE_INTRA) { -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2); -+ } ++// if (lc->cu.pred_mode == MODE_INTRA) { ++// do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2, ++// ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v)); ++// } + if (((cbf_chroma >> i) & CBF_CR0) != 0) + ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c), + log2_trafo_size_c, scan_idx_c, 2); @@ -27993,11 +29172,12 @@ index 0000000000..bddf0c3417 + int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer; + int16_t *coeffs = (int16_t*)lc->edge_emu_buffer2; + const int size = 1 << log2_trafo_size_c; ++ int j; + + uint8_t *dst = &s->frame->data[2][(y0 >> vshift) * stride + + ((x0 >> hshift) << s->ps.sps->pixel_shift)]; -+ for (i = 0; i < (size * size); i++) { -+ coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); ++ for (j = 0; j < (size * size); j++) { ++ coeffs[j] = ((lc->tu.res_scale_val * coeffs_y[j]) >> 3); + } + s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride); + } @@ -28007,20 +29187,18 @@ index 0000000000..bddf0c3417 + int trafo_size_v = 1 << (log2_trafo_size + ctx_vshift(s, 1)); + for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) { + if (lc->cu.pred_mode == MODE_INTRA) { -+ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size), -+ trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1); ++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1, ++ ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v)); + } + if (((cbf_chroma >> i) & CBF_CB0) != 0) + ff_hevc_rpi_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size), + log2_trafo_size, scan_idx_c, 1); + } + for (i = 0; i < (ctx_cfmt(s) == 2 ? 
2 : 1); i++) { -+ if (lc->cu.pred_mode == MODE_INTRA) { -+ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size), -+ trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2); -+ } ++// if (lc->cu.pred_mode == MODE_INTRA) { ++// do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2, ++// ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v)); ++// } + if (((cbf_chroma >> i) & CBF_CR0) != 0) + ff_hevc_rpi_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size), + log2_trafo_size, scan_idx_c, 2); @@ -28030,28 +29208,29 @@ index 0000000000..bddf0c3417 + if (log2_trafo_size > 2 || ctx_cfmt(s) == 3) { + int trafo_size_h = 1 << (log2_trafo_size_c + ctx_hshift(s, 1)); + int trafo_size_v = 1 << (log2_trafo_size_c + ctx_vshift(s, 1)); -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 1); -+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 2); -+ if (ctx_cfmt(s) == 2) { -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (1 << log2_trafo_size_c), -+ trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1); -+ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2); -+ } ++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 1, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size_h, trafo_size_v)); ++// do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 2, ++// ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size_h, trafo_size_v)); ++// if (ctx_cfmt(s) == 2) { ++// do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1, ++// ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0 + (1 << log2_trafo_size_c), trafo_size_h, trafo_size_v)); ++// do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2, ++// ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0 + (1 << log2_trafo_size_c), trafo_size_h, trafo_size_v)); ++// } + } else if (blk_idx == 3) { + int trafo_size_h = 1 << (log2_trafo_size + 1); + int trafo_size_v = 1 << (log2_trafo_size + ctx_vshift(s, 1)); -+ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase, -+ trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 1); -+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 2); -+ if (ctx_cfmt(s) == 2) { -+ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (1 << (log2_trafo_size)), -+ trafo_size_h, trafo_size_v); -+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1); -+ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2); -+ } ++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 1, ++ ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase, trafo_size_h, trafo_size_v)); ++// do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 2, ++// ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase, trafo_size_h, trafo_size_v)); ++// if (ctx_cfmt(s) == 2) { ++// do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1, ++// ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase + (1 << (log2_trafo_size)), trafo_size_h, trafo_size_v)); ++// do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2, ++// ff_hevc_rpi_tb_avail_flags(s, lc, xBase, yBase + (1 << (log2_trafo_size)), trafo_size_h, trafo_size_v)); ++// } + } + } + @@ -28269,8 +29448,8 @@ index 0000000000..bddf0c3417 +{ + enum InterPredIdc 
inter_pred_idc = PRED_L0; + int mvp_flag; ++ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH); + -+ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, nPbW, nPbH); + mv->pred_flag = 0; + if (s->sh.slice_type == HEVC_SLICE_B) + inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH); @@ -28282,7 +29461,7 @@ index 0000000000..bddf0c3417 + mv->pred_flag = PF_L0; + ff_hevc_rpi_hls_mvd_coding(lc); + mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); -+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, avail, + part_idx, merge_idx, mv, mvp_flag, 0); + mv->mv[0].x += lc->pu.mvd.x; + mv->mv[0].y += lc->pu.mvd.y; @@ -28300,7 +29479,7 @@ index 0000000000..bddf0c3417 + + mv->pred_flag += PF_L1; + mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); -+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, avail, + part_idx, merge_idx, mv, mvp_flag, 1); + mv->mv[1].x += lc->pu.mvd.x; + mv->mv[1].y += lc->pu.mvd.y; @@ -28996,12 +30175,10 @@ index 0000000000..bddf0c3417 + int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size); + int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size); + -+ int y_ctb = (y0 >> (s->ps.sps->log2_ctb_size)) << (s->ps.sps->log2_ctb_size); -+ + // intra_pred_mode prediction does not cross vertical CTB boundaries -+ const unsigned int cand_up = (lc->ctb_up_flag || y0b) && (y0 > y_ctb) ? ++ const unsigned int cand_up = y0b != 0 ? + s->tab_ipm[(y_pu - 1) * min_pu_width + x_pu] : INTRA_DC; -+ const unsigned int cand_left = (lc->ctb_left_flag || x0b) ? ++ const unsigned int cand_left = ((lc->ctb_avail & AVAIL_L) != 0 || x0b) ? + s->tab_ipm[y_pu * min_pu_width + x_pu - 1] : INTRA_DC; + + int intra_pred_mode; @@ -29408,16 +30585,17 @@ index 0000000000..bddf0c3417 + if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w]) + lc->boundary_flags |= BOUNDARY_UPPER_SLICE; + -+ lc->ctb_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0; -+ lc->ctb_up_flag = (lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0; -+ + // Use line width rather than tile width for addr_in_slice test as + // addr_in_slice is in raster units -+ lc->ctb_up_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && -+ (ctb_addr_rs_in_slice >= line_w + 1); + -+ lc->ctb_up_right_flag = (ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 && -+ (ctb_addr_rs_in_slice + 1 >= line_w); ++ lc->ctb_avail = ++ ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) | ++ ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) | ++ ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && ++ (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) | ++ ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 && ++ (ctb_addr_rs_in_slice + 1 >= line_w) ? 
AVAIL_UR : 0); ++ // Down-left never avail at CTB level +} + + @@ -29455,22 +30633,11 @@ index 0000000000..bddf0c3417 + switch (cmd->type) + { + case RPI_PRED_INTRA: -+ { -+ HEVCRpiLocalContextIntra lci; // Abbreviated local context -+ HEVCRpiLocalContext * const lc = (HEVCRpiLocalContext *)&lci; -+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode; -+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; -+ lc->na.cand_left = (cmd->na >> 3) & 1; -+ lc->na.cand_up_left = (cmd->na >> 2) & 1; -+ lc->na.cand_up = (cmd->na >> 1) & 1; -+ lc->na.cand_up_right = (cmd->na >> 0) & 1; -+ if (cmd->c_idx == 0) -+ s->hpc.intra_pred[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); -+ else -+ s->hpc.intra_pred_c[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); ++ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail); ++ break; ++ case RPI_PRED_INTRA_C: ++ s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail); + break; -+ } -+ + case RPI_PRED_ADD_RESIDUAL: + s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); + break; @@ -31718,10 +32885,10 @@ index 0000000000..bddf0c3417 + diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h new file mode 100644 -index 0000000000..d242727b2a +index 0000000000..d2ac038c9b --- /dev/null +++ b/libavcodec/rpi_hevcdec.h -@@ -0,0 +1,1000 @@ +@@ -0,0 +1,958 @@ +/* + * HEVC video decoder + * @@ -31956,46 +33123,6 @@ index 0000000000..d242727b2a + PF_BI, +}; + -+enum IntraPredMode { -+ INTRA_PLANAR = 0, -+ INTRA_DC, -+ INTRA_ANGULAR_2, -+ INTRA_ANGULAR_3, -+ INTRA_ANGULAR_4, -+ INTRA_ANGULAR_5, -+ INTRA_ANGULAR_6, -+ INTRA_ANGULAR_7, -+ INTRA_ANGULAR_8, -+ INTRA_ANGULAR_9, -+ INTRA_ANGULAR_10, -+ INTRA_ANGULAR_11, -+ INTRA_ANGULAR_12, -+ INTRA_ANGULAR_13, -+ INTRA_ANGULAR_14, -+ INTRA_ANGULAR_15, -+ INTRA_ANGULAR_16, -+ INTRA_ANGULAR_17, -+ INTRA_ANGULAR_18, -+ INTRA_ANGULAR_19, -+ INTRA_ANGULAR_20, -+ INTRA_ANGULAR_21, -+ INTRA_ANGULAR_22, -+ INTRA_ANGULAR_23, -+ INTRA_ANGULAR_24, -+ INTRA_ANGULAR_25, -+ INTRA_ANGULAR_26, -+ INTRA_ANGULAR_27, -+ INTRA_ANGULAR_28, -+ INTRA_ANGULAR_29, -+ INTRA_ANGULAR_30, -+ INTRA_ANGULAR_31, -+ INTRA_ANGULAR_32, -+ INTRA_ANGULAR_33, -+ INTRA_ANGULAR_34, -+}; -+#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 -+#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26 -+ +enum SAOType { + SAO_NOT_APPLIED = 0, + SAO_BAND, @@ -32042,14 +33169,6 @@ index 0000000000..d242727b2a + uint8_t cu_transquant_bypass_flag; +} RpiCodingUnit; + -+typedef struct RpiNeighbourAvailable { -+ char cand_bottom_left; -+ char cand_left; -+ char cand_up; -+ char cand_up_left; -+ char cand_up_right; -+} RpiNeighbourAvailable; -+ +typedef struct RpiPredictionUnit { + uint8_t intra_pred_mode[4]; + uint8_t intra_pred_mode_c[4]; @@ -32115,14 +33234,8 @@ index 0000000000..d242727b2a + uint8_t dpb_no; +} HEVCFrame; + -+typedef struct HEVCRpiLocalContextIntra { -+ TransformUnit tu; -+ RpiNeighbourAvailable na; -+} HEVCRpiLocalContextIntra; -+ +typedef struct HEVCRpiLocalContext { -+ TransformUnit tu; // Moved to start to match HEVCRpiLocalContextIntra (yuk!) -+ RpiNeighbourAvailable na; ++ TransformUnit tu; + + CABACContext cc; + @@ -32163,10 +33276,20 @@ index 0000000000..d242727b2a + int8_t curr_qp_y; + int8_t qPy_pred; + -+ uint8_t ctb_left_flag; -+ uint8_t ctb_up_flag; -+ uint8_t ctb_up_right_flag; -+ uint8_t ctb_up_left_flag; ++// N.B. 
Used by asm (neon) - do not change ++#define AVAIL_S_UR 0 ++#define AVAIL_S_U 1 ++#define AVAIL_S_UL 2 ++#define AVAIL_S_L 3 ++#define AVAIL_S_DL 4 ++ ++#define AVAIL_U (1 << AVAIL_S_U) ++#define AVAIL_L (1 << AVAIL_S_L) ++#define AVAIL_UL (1 << AVAIL_S_UL) ++#define AVAIL_UR (1 << AVAIL_S_UR) ++#define AVAIL_DL (1 << AVAIL_S_DL) ++ ++ uint8_t ctb_avail; + int end_of_ctb_x; + int end_of_ctb_y; + @@ -32206,6 +33329,7 @@ index 0000000000..d242727b2a + RPI_PRED_ADD_DC_U, // Both U & V are effectively C + RPI_PRED_ADD_DC_V, + RPI_PRED_INTRA, ++ RPI_PRED_INTRA_C, + RPI_PRED_I_PCM, + RPI_PRED_CMD_MAX +}; @@ -32213,8 +33337,8 @@ index 0000000000..d242727b2a +typedef struct HEVCPredCmd { + uint8_t type; + uint8_t size; // log2 "size" used by all variants -+ uint8_t na; // i_pred - but left here as they pack well -+ uint8_t c_idx; // i_pred ++ uint8_t avail; // i_pred - but left here as they pack well ++ uint8_t dummy; + union { + struct { // TRANSFORM_ADD + uint8_t * dst; @@ -32544,7 +33668,6 @@ index 0000000000..d242727b2a + + // Put structures that allocate non-trivial storage at the end + // These are mostly used indirectly so position in the structure doesn't matter -+ HEVCRpiLocalContextIntra HEVClcIntra; + HEVCRpiPassQueue passq[RPI_PASSES]; +#if RPI_EXTRA_BIT_THREADS > 0 + int bt_started; @@ -32597,13 +33720,15 @@ index 0000000000..d242727b2a + +void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCFrame *frame, int flags); + -+void ff_hevc_rpi_set_neighbour_available(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, -+ const int nPbW, const int nPbH); ++unsigned int ff_hevc_rpi_tb_avail_flags( ++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h); ++ +void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, + int nPbH, int log2_cb_size, int part_idx, + int merge_idx, MvField * const mv); +void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, int x0, int y0, int nPbW, -+ int nPbH, int log2_cb_size, int part_idx, ++ int nPbH, int log2_cb_size, const unsigned int avail, int part_idx, + int merge_idx, MvField * const mv, + int mvp_lx_flag, int LX); +void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase); @@ -32724,10 +33849,10 @@ index 0000000000..d242727b2a +#endif /* AVCODEC_RPI_HEVCDEC_H */ diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c new file mode 100644 -index 0000000000..c5d130c377 +index 0000000000..b041e0fd3f --- /dev/null +++ b/libavcodec/rpi_hevcdsp.c -@@ -0,0 +1,419 @@ +@@ -0,0 +1,444 @@ +/* + * HEVC video decoder + * @@ -32970,6 +34095,30 @@ index 0000000000..c5d130c377 + return bs >> shift; +} + ++ ++static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height) ++{ ++ unsigned int i, j; ++ ++ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) { ++ for (i = 0; i < height; i++) { ++ for (j = 0; j < width; j+=8) ++ AV_COPY64U(dst+j, src+j); ++ dst += stride_dst; ++ src += stride_src; ++ } ++ } else { ++ for (i = 0; i < height; i++) { ++ for (j = 0; j < width; j+=16) ++ AV_COPY128(dst+j, src+j); ++ dst += stride_dst; ++ src += stride_src; ++ } ++ } ++} ++ ++ ++ +void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +{ +#undef FUNC @@ -33137,6 +34286,7 
@@ index 0000000000..c5d130c377 + } + + hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths; ++ hevcdsp->cpy_blk = cpy_blk; + + if (ARCH_PPC) + ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth); @@ -33149,10 +34299,10 @@ index 0000000000..c5d130c377 +} diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h new file mode 100644 -index 0000000000..8c9bf725bf +index 0000000000..0b532f874b --- /dev/null +++ b/libavcodec/rpi_hevcdsp.h -@@ -0,0 +1,183 @@ +@@ -0,0 +1,185 @@ +/* + * HEVC video decoder + * @@ -33324,6 +34474,8 @@ index 0000000000..8c9bf725bf + uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const MvField *curr, const MvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, + int in_inc); ++ ++ void (* cpy_blk)(uint8_t * dst, unsigned int dst_stride, const uint8_t * src, unsigned int src_stride, unsigned int width, unsigned int height); +} HEVCDSPContext; + +void ff_hevc_rpi_dsp_init(HEVCDSPContext *hpc, int bit_depth); @@ -35622,10 +36774,10 @@ index 0000000000..d1196a4440 + diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c new file mode 100644 -index 0000000000..113ed33d64 +index 0000000000..62135b83c2 --- /dev/null +++ b/libavcodec/rpi_hevcpred.c -@@ -0,0 +1,150 @@ +@@ -0,0 +1,166 @@ +/* + * HEVC video Decoder + * @@ -35704,6 +36856,10 @@ index 0000000000..113ed33d64 + hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \ + hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \ + hpc->intra_pred[3] = FUNC(intra_pred_5, depth); \ ++ hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \ ++ hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \ ++ hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \ ++ hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \ + hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \ + hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \ + hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \ @@ -35723,13 +36879,21 @@ index 0000000000..113ed33d64 + hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \ + hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \ + hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ -+ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); ++ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \ ++ hpc->pred_dc0[0] = FUNC(pred_dc0_0, depth); \ ++ hpc->pred_dc0[1] = FUNC(pred_dc0_1, depth); \ ++ hpc->pred_dc0[2] = FUNC(pred_dc0_2, depth); \ ++ hpc->pred_dc0[3] = FUNC(pred_dc0_3, depth); + +#define HEVC_PRED_C(depth) \ + hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \ + hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \ + hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \ + hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \ ++ hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \ ++ hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \ ++ hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \ ++ hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \ + hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ + hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ + hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ @@ -35749,7 +36913,11 @@ index 0000000000..113ed33d64 + hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ + hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ + hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ -+ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); ++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \ ++ hpc->pred_dc0_c[0] = FUNCC(pred_dc0_0, depth); 
\ ++ hpc->pred_dc0_c[1] = FUNCC(pred_dc0_1, depth); \ ++ hpc->pred_dc0_c[2] = FUNCC(pred_dc0_2, depth); \ ++ hpc->pred_dc0_c[3] = FUNCC(pred_dc0_3, depth); + +#define HEVC_PRED(depth) \ + HEVC_PRED_Y(depth); \ @@ -35778,10 +36946,10 @@ index 0000000000..113ed33d64 +} diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h new file mode 100644 -index 0000000000..31d7d57d95 +index 0000000000..6e594277c0 --- /dev/null +++ b/libavcodec/rpi_hevcpred.h -@@ -0,0 +1,68 @@ +@@ -0,0 +1,121 @@ +/* + * HEVC video Decoder + * @@ -35814,9 +36982,58 @@ index 0000000000..31d7d57d95 +struct HEVCRpiContext; +struct HEVCRpiLocalContext; + -+typedef struct HEVCRpiPredContext { -+ void (*intra_pred[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx); ++enum IntraPredMode { ++ INTRA_PLANAR = 0, ++ INTRA_DC, ++ INTRA_ANGULAR_2, ++ INTRA_ANGULAR_3, ++ INTRA_ANGULAR_4, ++ INTRA_ANGULAR_5, ++ INTRA_ANGULAR_6, ++ INTRA_ANGULAR_7, ++ INTRA_ANGULAR_8, ++ INTRA_ANGULAR_9, ++ INTRA_ANGULAR_10, ++ INTRA_ANGULAR_11, ++ INTRA_ANGULAR_12, ++ INTRA_ANGULAR_13, ++ INTRA_ANGULAR_14, ++ INTRA_ANGULAR_15, ++ INTRA_ANGULAR_16, ++ INTRA_ANGULAR_17, ++ INTRA_ANGULAR_18, ++ INTRA_ANGULAR_19, ++ INTRA_ANGULAR_20, ++ INTRA_ANGULAR_21, ++ INTRA_ANGULAR_22, ++ INTRA_ANGULAR_23, ++ INTRA_ANGULAR_24, ++ INTRA_ANGULAR_25, ++ INTRA_ANGULAR_26, ++ INTRA_ANGULAR_27, ++ INTRA_ANGULAR_28, ++ INTRA_ANGULAR_29, ++ INTRA_ANGULAR_30, ++ INTRA_ANGULAR_31, ++ INTRA_ANGULAR_32, ++ INTRA_ANGULAR_33, ++ INTRA_ANGULAR_34, ++}; ++#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 ++#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26 + ++typedef void intra_filter_fn_t( ++ uint8_t * const left, uint8_t * const top, ++ const unsigned int req, const unsigned int avail, ++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, ++ const unsigned int stride, ++ const unsigned int top_right_size, const unsigned int down_left_size); ++ ++typedef struct HEVCRpiPredContext { ++ void (*intra_pred[4])(const struct HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail); ++ ++ intra_filter_fn_t *intra_filter[4]; + void (*pred_planar[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); + void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, @@ -35830,8 +37047,11 @@ index 0000000000..31d7d57d95 + void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int mode); -+ void (*intra_pred_c[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx); ++ void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride); + ++ void (*intra_pred_c[4])(const struct HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail); ++ intra_filter_fn_t *intra_filter_c[4]; + void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); + void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, @@ -35845,6 +37065,7 @@ index 0000000000..31d7d57d95 + void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int mode); ++ void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride); +} HEVCRpiPredContext; + +void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth); @@ -35852,10 +37073,10 @@ index 0000000000..31d7d57d95 
+#endif /* AVCODEC_RPI_HEVCPRED_H */ diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c new file mode 100644 -index 0000000000..a76ba4c442 +index 0000000000..23835a320e --- /dev/null +++ b/libavcodec/rpi_hevcpred_template.c -@@ -0,0 +1,983 @@ +@@ -0,0 +1,1487 @@ +/* + * HEVC video decoder + * @@ -35967,7 +37188,7 @@ index 0000000000..a76ba4c442 +#endif + + -+#if DUMP_PRED && !defined(INCLUDE_ONCE) ++#if DUMP_PRED && !defined(INCLUDED_ONCE) +static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) +{ + for (unsigned int y = 0; y != size; y++, data += stride * 2) { @@ -35980,105 +37201,705 @@ index 0000000000..a76ba4c442 +} +#endif + -+static av_always_inline void FUNC(intra_pred)(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, -+ int log2_size, int c_idx_arg) ++#ifndef INCLUDED_ONCE ++static inline void extend_8(void * ptr, const unsigned int v, unsigned int n) +{ ++ if ((n >>= 2) != 0) { ++ uint32_t v4 = v | (v << 8); ++ uint32_t * p = (uint32_t *)ptr; ++ v4 = v4 | (v4 << 16); ++ do { ++ *p++ = v4; ++ } while (--n != 0); ++ } ++} ++ ++static inline void extend_16(void * ptr, const unsigned int v, unsigned int n) ++{ ++ if ((n >>= 2) != 0) { ++ uint32_t v2 = v | (v << 16); ++ uint32_t * p = (uint32_t *)ptr; ++ do { ++ *p++ = v2; ++ *p++ = v2; ++ } while (--n != 0); ++ } ++} ++ ++static inline void extend_32(void * ptr, const unsigned int v, unsigned int n) ++{ ++ if ((n >>= 2) != 0) { ++ uint32_t * p = (uint32_t *)ptr; ++ do { ++ *p++ = v; ++ *p++ = v; ++ *p++ = v; ++ *p++ = v; ++ } while (--n != 0); ++ } ++} ++ ++// Beware that this inverts the avail ordering ++// For CIP it seems easier this way round ++static unsigned int cip_avail(const MvField * mvf, const int mvf_stride, const unsigned int log2_pu_size, const unsigned int avail, unsigned int size, ++ unsigned int s0, unsigned int s1) ++{ ++ const unsigned int n = 1 << (log2_pu_size - 2); ++ unsigned int fa = 0; ++ unsigned int i = 0; ++ ++ size >>= 2; // Now in 4-pel units ++ s0 >>= 2; ++ s1 >>= 2; ++ ++ if ((avail & 4) != 0) ++ fa |= ((1 << s0) - 1) << (size - s0); ++ if ((avail & 2) != 0) ++ fa |= ((1 << s1) - 1) << size; ++ if ((avail & 1) != 0) ++ fa |= 1 << (size << 1); ++ ++ for (i = 0; (fa >> i) != 0; i += n, mvf += mvf_stride) { ++ if ((fa & (((1 << n) - 1) << i)) != 0 && mvf->pred_flag != PF_INTRA) ++ fa &= ~(((1 << n) - 1) << i); ++ } ++ ++ return fa; ++} ++ ++static inline unsigned int rmbd(unsigned int x) ++{ ++#if 1 ++ return __builtin_ctz(x); ++#else ++ unsigned int n = 0; ++ if ((x & 0xffff) == 0) { ++ x >>= 16; ++ n += 16; ++ } ++ if ((x & 0xff) == 0) { ++ x >>= 8; ++ n += 8; ++ } ++ if ((x & 0xf) == 0) { ++ x >>= 4; ++ n += 4; ++ } ++ if ((x & 0x3) == 0) { ++ x >>= 2; ++ n += 2; ++ } ++ ++ return (x & 1) == 0 ? n + 1 : n; ++#endif ++} ++#endif ++ ++ ++static void FUNC(cip_fill)(pixel * const left, pixel * const top, ++ const unsigned int avail_l, const unsigned int avail_u, ++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, ++ const unsigned int stride, ++ const unsigned int size) ++{ ++ pixel a; ++ unsigned int i; ++ ++ // 1st find DL value ++ if ((avail_l & 1) == 0) { ++ if (avail_l != 0) ++ a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride]; ++ else ++ { ++ // (avail_l | avail_u) != 0 so this must be good ++ const unsigned int n = rmbd(avail_u)*4; ++ a = (n >= size) ? 
src_ur[n - size] : src_u[n]; ++ } ++ } ++ ++ // L ++ { ++ pixel * d = left + size * 2 - 1; ++ const pixel * s = src_l + (size * 2 - 1) * stride; ++ unsigned int x = avail_l; ++ for (i = 0; i < size * 2; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d-- = *s; ++ s -= stride; ++ *d-- = *s; ++ s -= stride; ++ *d-- = *s; ++ s -= stride; ++ *d-- = a = *s; ++ s -= stride; ++ } ++ else ++ { ++ *d-- = a; ++ *d-- = a; ++ *d-- = a; ++ *d-- = a; ++ s -= stride * 4; ++ } ++ } ++ // UL ++ *d = a = (x & 1) != 0 ? *s : a; ++ } ++ ++ // U ++ { ++ pixel * d = top; ++ const pixel * s = src_u; ++ unsigned int x = avail_u; ++ ++ for (i = 0; i < size; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = a = *s++; ++ } ++ else ++ { ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ s += 4; ++ } ++ } ++ ++ // UR ++ s = src_ur; ++ for (i = 0; i < size; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = a = *s++; ++ } ++ else ++ { ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ s += 4; ++ } ++ } ++ } ++} ++ ++ ++#if !PRED_C && PW == 1 ++#define EXTEND(ptr, val, len) extend_8(ptr, val, len) ++#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1) ++#define EXTEND(ptr, val, len) extend_16(ptr, val, len) ++#else ++#define EXTEND(ptr, val, len) extend_32(ptr, val, len) ++#endif ++ ++ +#define PU(x) \ + ((x) >> s->ps.sps->log2_min_pu_size) +#define MVF(x, y) \ -+ (s->ref->tab_mvf[(x) + (y) * min_pu_width]) ++ (s->ref->tab_mvf[(x) + (y) * s->ps.sps->min_pu_width]) +#define MVF_PU(x, y) \ + MVF(PU(x0 + ((x) * (1 << hshift))), PU(y0 + ((y) * (1 << vshift)))) -+#define IS_INTRA(x, y) \ -+ (MVF_PU(x, y).pred_flag == PF_INTRA) -+#define MIN_TB_ADDR_ZS(x, y) \ -+ s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)] -+#define EXTEND(ptr, val, len) \ -+do { \ -+ pixel4 pix = PIXEL_SPLAT_X4(val); \ -+ for (i = 0; i < (len); i += 4) \ -+ AV_WN4P(ptr + i, pix); \ -+} while (0) + -+#define EXTEND_RIGHT_CIP(ptr, start, length) \ -+ for (i = start; i < (start) + (length); i += 4) \ -+ if (!IS_INTRA(i, -1)) \ -+ AV_WN4P(&ptr[i], a); \ -+ else \ -+ a = PIXEL_SPLAT_X4(ptr[i+3]) -+#define EXTEND_LEFT_CIP(ptr, start, length) \ -+ for (i = start; i > (start) - (length); i--) \ -+ if (!IS_INTRA(i - 1, -1)) \ -+ ptr[i - 1] = ptr[i] -+#define EXTEND_UP_CIP(ptr, start, length) \ -+ for (i = (start); i > (start) - (length); i -= 4) \ -+ if (!IS_INTRA(-1, i - 3)) \ -+ AV_WN4P(&ptr[i - 3], a); \ -+ else \ -+ a = PIXEL_SPLAT_X4(ptr[i - 3]) -+#define EXTEND_DOWN_CIP(ptr, start, length) \ -+ for (i = start; i < (start) + (length); i += 4) \ -+ if (!IS_INTRA(-1, i)) \ -+ AV_WN4P(&ptr[i], a); \ -+ else \ -+ a = PIXEL_SPLAT_X4(ptr[i + 3]) ++// Reqs: ++// ++// Planar: DL[0], L, ul, U, UR[0] ++// DC: dl, L, ul, U, ur ++// A2-9: DL, L, ul, u, ur ++// A10: dl, L, ul, u, ur ++// A11-17 dl, L, UL, U, ur ++// A18-25 dl, L, Ul, U, ur ++// A26 dl, l, ul, U, ur ++// A27-34 dl, l, ul, U, UR ++ ++#ifndef INCLUDED_ONCE ++ ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; ++ ++#define FILTER_LIGHT 0x40 ++#define FILTER_STRONG 0x80 ++#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG) ++ ++static const uint8_t req_avail_c[35] = ++{ ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L, // 2 ++ AVAIL_DL | AVAIL_L, 
// 3 ++ AVAIL_DL | AVAIL_L, // 4 ++ AVAIL_DL | AVAIL_L, // 5 ++ AVAIL_DL | AVAIL_L, // 6 ++ AVAIL_DL | AVAIL_L, // 7 ++ AVAIL_DL | AVAIL_L, // 8 ++ AVAIL_DL | AVAIL_L, // 9 ++ AVAIL_L, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 ++ AVAIL_U, // 26 (V) ++ AVAIL_U | AVAIL_UR, // 27 ++ AVAIL_U | AVAIL_UR, // 28 ++ AVAIL_U | AVAIL_UR, // 29 ++ AVAIL_U | AVAIL_UR, // 30 ++ AVAIL_U | AVAIL_UR, // 31 ++ AVAIL_U | AVAIL_UR, // 32 ++ AVAIL_U | AVAIL_UR, // 33 ++ AVAIL_U | AVAIL_UR // 34 ++}; ++ ++static const uint8_t req_avail[4][35] = { ++{ ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L, // 2 ++ AVAIL_DL | AVAIL_L, // 3 ++ AVAIL_DL | AVAIL_L, // 4 ++ AVAIL_DL | AVAIL_L, // 5 ++ AVAIL_DL | AVAIL_L, // 6 ++ AVAIL_DL | AVAIL_L, // 7 ++ AVAIL_DL | AVAIL_L, // 8 ++ AVAIL_DL | AVAIL_L, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V) ++ AVAIL_U | AVAIL_UR, // 27 ++ AVAIL_U | AVAIL_UR, // 28 ++ AVAIL_U | AVAIL_UR, // 29 ++ AVAIL_U | AVAIL_UR, // 30 ++ AVAIL_U | AVAIL_UR, // 31 ++ AVAIL_U | AVAIL_UR, // 32 ++ AVAIL_U | AVAIL_UR, // 33 ++ AVAIL_U | AVAIL_UR // 34 ++}, ++{ // 3 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 ++ AVAIL_DL | AVAIL_L | 0, // 3 ++ AVAIL_DL | AVAIL_L | 0, // 4 ++ AVAIL_DL | AVAIL_L | 0, // 5 ++ AVAIL_DL | AVAIL_L | 0, // 6 ++ AVAIL_DL | AVAIL_L | 0, // 7 ++ AVAIL_DL | AVAIL_L | 0, // 8 ++ AVAIL_DL | AVAIL_L | 0, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | 0, // 27 ++ AVAIL_U | AVAIL_UR | 
0, // 28 ++ AVAIL_U | AVAIL_UR | 0, // 29 ++ AVAIL_U | AVAIL_UR | 0, // 30 ++ AVAIL_U | AVAIL_UR | 0, // 31 ++ AVAIL_U | AVAIL_UR | 0, // 32 ++ AVAIL_U | AVAIL_UR | 0, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 ++}, ++{ // 4 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8 ++ AVAIL_DL | AVAIL_L | 0, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | 0, // 27 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 ++}, ++{ // 5 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9 ++ AVAIL_L | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25 ++ AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31 ++ AVAIL_U | AVAIL_UR | 
FILTER_EITHER, // 32 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER // 34 ++} ++}; ++ ++ ++#endif ++ ++#define filter_light1 FUNC(filter_light1) ++static inline pixel filter_light1(pixel a, pixel b, pixel c) ++{ ++ return (a + b*2 + c + 2) >> 2; ++} ++ ++#define filter_light FUNC(filter_light) ++static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n) ++{ ++ pixel p0; ++ pixel p2 = *src; ++ // Allow for final pel - it is just clearer to have the call take the actual number of output pels ++ unsigned int n_minus_1 = n - 1; ++ ++ do ++ { ++ src += sstride; ++ p0 = p1; ++ p1 = p2; ++ p2 = *src; ++ *dst++ = filter_light1(p0, p1, p2); ++ } while (--n_minus_1 != 0); ++ *dst = filter_light1(p1, p2, pn); ++} ++ ++#define filter_strong FUNC(filter_strong) ++static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n) ++{ ++ unsigned int a = 64 * p0 + 32; ++ const int v = p1 - p0; ++ ++ do ++ { ++ *dst++ = (a += v) >> 6; ++ } while (--n != 0); ++} ++ ++#define intra_filter FUNC(intra_filter) ++static av_always_inline void intra_filter( ++ pixel * const left, pixel * const top, ++ const unsigned int req, const unsigned int avail, ++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, ++ const unsigned int stride, ++ const unsigned int top_right_size, const unsigned int down_left_size, ++ const unsigned int log2_size) ++{ ++ const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5); ++ const unsigned int size = 1 << log2_size; ++ ++ // a_ is the first pel in a section working round dl -> ur ++ // b_ is the last ++ // Beware that top & left work out from UL so usage of a_ & b_ may ++ // swap between them.
It is a bad naming scheme but I have found no ++ // better ++ const pixel * a_dl = src_l + (down_left_size + size - 1) * stride; ++ const pixel * b_dl = src_l + size * stride; ++ const pixel * a_l = src_l + (size - 1) * stride; ++ const pixel * b_l = src_l; ++ const pixel * ab_ul = src_l - stride; ++ const pixel * a_u = src_u; ++ const pixel * b_u = src_u + size - 1; ++ const pixel * a_ur = src_ur; ++ const pixel * b_ur = src_ur + top_right_size - 1; ++ ++ const unsigned int want = req & ~avail; ++ const unsigned int have = req & avail; ++ unsigned int i; ++ ++ if ((avail & AVAIL_DL) == 0) ++ { ++ a_dl = a_ur; ++ if ((avail & AVAIL_U) != 0) ++ a_dl = a_u; ++ if ((avail & AVAIL_UL) != 0) ++ a_dl = ab_ul; ++ if ((avail & AVAIL_L) != 0) ++ a_dl = a_l; ++ b_dl = a_dl; ++ } ++ ++ if ((avail & AVAIL_L) == 0) ++ { ++ a_l = b_dl; ++ b_l = b_dl; ++ } ++ if ((avail & AVAIL_UL) == 0) ++ { ++ ab_ul = b_l; ++ } ++ if ((avail & AVAIL_U) == 0) ++ { ++ a_u = ab_ul; ++ b_u = ab_ul; ++ } ++ if ((avail & AVAIL_UR) == 0) ++ { ++ a_ur = b_u; ++ b_ur = b_u; ++ } ++ ++ if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints ++ { ++ if ((req & AVAIL_UL) != 0) ++ left[-1] = *ab_ul; ++ ++ if ((want & AVAIL_L) != 0) ++ EXTEND(left, *a_l, size); ++ if ((want & AVAIL_DL) != 0) ++ EXTEND(left + size, *a_dl, size); ++ if ((want & AVAIL_U) != 0) ++ EXTEND(top, *a_u, size); ++ if ((want & AVAIL_UR) != 0) ++ EXTEND(top + size, *a_ur, size); ++ ++ if ((have & AVAIL_U) != 0) ++ // Always good - even with sand ++ memcpy(top, a_u, size * sizeof(pixel)); ++ if ((have & AVAIL_UR) != 0) ++ { ++ memcpy(top + size, a_ur, top_right_size * sizeof(pixel)); ++ EXTEND(top + size + top_right_size, *b_ur, ++ size - top_right_size); ++ } ++ if ((have & AVAIL_L) != 0) ++ { ++ for (i = 0; i < size; i++) ++ left[i] = b_l[stride * i]; ++ } ++ if ((have & AVAIL_DL) != 0) ++ { ++ for (i = 0; i < down_left_size; i++) ++ left[i + size] = b_dl[stride * i]; ++ EXTEND(left + size + down_left_size, *a_dl, ++ size - down_left_size); ++ } ++ } ++ else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint ++ FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold && ++ FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold) ++ { ++ if ((req & (AVAIL_U | AVAIL_UR)) != 0) ++ filter_strong(top, *ab_ul, *b_ur, size * 2); ++ left[-1] = *ab_ul; ++ if ((req & (AVAIL_L | AVAIL_DL)) != 0) ++ filter_strong(left, *ab_ul, *a_dl, size*2); ++ } ++ else ++ { ++ // Same code for both have & want for UL ++ if ((req & AVAIL_UL) != 0) ++ { ++ left[-1] = filter_light1(*b_l, *ab_ul, *a_u); ++ } ++ ++ if ((want & AVAIL_L) != 0) ++ { ++ EXTEND(left, *a_l, size); ++ left[0] = (*a_l * 3 + *ab_ul + 2) >> 2; ++ } ++ if ((want & AVAIL_DL) != 0) ++ { ++ // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding ++ EXTEND(left + size, *a_l, size); ++ } ++ if ((want & AVAIL_U) != 0) ++ { ++ EXTEND(top, *a_u, size); ++ top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2; ++ } ++ if ((want & AVAIL_UR) != 0) ++ { ++ // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding ++ EXTEND(top + size, *a_ur, size); ++ } ++ ++ if ((have & AVAIL_U) != 0) ++ { ++ filter_light(top, *ab_ul, a_u, *a_ur, 1, size); ++ } ++ if ((have & AVAIL_UR) != 0) { ++ filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size); ++ top[size*2 - 1] = *b_ur; ++ EXTEND(top + size + top_right_size, *b_ur, size - top_right_size); ++ } ++ if ((have & AVAIL_L) != 0) ++ { ++ filter_light(left, *ab_ul, b_l, *b_dl,
stride, size); ++ } ++ if ((have & AVAIL_DL) != 0) ++ { ++ filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size); ++ left[size*2 - 1] = *a_dl; ++ EXTEND(left + size + down_left_size, *a_dl, size - down_left_size); ++ } ++ } ++} ++ ++#define INTRA_FILTER(log2_size) \ ++static void FUNC(intra_filter_ ## log2_size)( \ ++ uint8_t * const left, uint8_t * const top, \ ++ const unsigned int req, const unsigned int avail, \ ++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \ ++ const unsigned int stride, \ ++ const unsigned int top_right_size, const unsigned int down_left_size) \ ++{ \ ++ intra_filter((pixel *)left, (pixel *)top, req, avail, \ ++ (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \ ++} ++ ++INTRA_FILTER(2) ++INTRA_FILTER(3) ++INTRA_FILTER(4) ++INTRA_FILTER(5) ++ ++#undef intra_filter ++#undef INTRA_FILTER ++ ++static av_always_inline void FUNC(intra_pred)(const HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail, ++ const unsigned int log2_size) ++{ + // c_idx will always be 1 for _c versions and 0 for y + const unsigned int c_idx = PRED_C; -+ int i; + const unsigned int hshift = ctx_hshift(s, c_idx); + const unsigned int vshift = ctx_vshift(s, c_idx); -+ int size = (1 << log2_size); -+ int size_in_luma_h = size << hshift; -+ int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; -+ int size_in_luma_v = size << vshift; -+ int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; -+ const int x = x0 >> hshift; -+ const int y = y0 >> vshift; -+ int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; -+ int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; -+ -+ int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb); ++ const unsigned int size = (1 << log2_size); ++ const unsigned int x = x0 >> hshift; ++ const unsigned int y = y0 >> vshift; + + const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel); + pixel *const src = c_idx == 0 ? + (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : + (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); + -+ int min_pu_width = s->ps.sps->min_pu_width; -+ -+ const enum IntraPredMode mode = c_idx ?
lc->tu.intra_pred_mode_c : -+ lc->tu.intra_pred_mode; -+ pixel4 a; -+ + // Align so we can do multiple loads in the asm + // Padded to 16 byte boundary so as not to confuse anything + DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); + DECLARE_ALIGNED(16, pixel, top_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); ++ ++ pixel * const left = left_array + 16 / sizeof(pixel); ++ pixel * const top = top_array + 16 / sizeof(pixel); ++ const pixel * top_pred = top; ++ ++ const pixel * src_l = src - 1; ++ const pixel * src_u = src - stride; ++ const pixel * src_ur = src_u + size; +#if !PRED_C -+ DECLARE_ALIGNED(16, pixel, filtered_left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); -+ DECLARE_ALIGNED(16, pixel, filtered_top_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); ++ unsigned int req = req_avail[log2_size - 2][mode]; ++#else ++ unsigned int req = req_avail_c[mode]; +#endif + -+ pixel *left = left_array + 16 / sizeof(pixel); -+ pixel *top = top_array + 16 / sizeof(pixel); ++ // If we have nothing to pred from then fill with grey ++ // This isn't a common case but dealing with it here means we don't have to ++ // test for it later ++ if (avail == 0) ++ { ++dc_only: +#if !PRED_C -+ pixel *filtered_left = filtered_left_array + 16 / sizeof(pixel); -+ pixel *filtered_top = filtered_top_array + 16 / sizeof(pixel); ++ s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride); ++#else ++ s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride); +#endif -+ int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); -+ int cand_left = lc->na.cand_left; -+ int cand_up_left = lc->na.cand_up_left; -+ int cand_up = lc->na.cand_up; -+ int cand_up_right = lc->na.cand_up_right && cur_tb_addr > MIN_TB_ADDR_ZS((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask, y_tb - 1); ++ return; ++ } + -+ int bottom_left_size = (FFMIN(y0 + 2 * size_in_luma_v, s->ps.sps->height) - -+ (y0 + size_in_luma_v)) >> vshift; -+ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - -+ (x0 + size_in_luma_h)) >> hshift; -+ -+ pixel * src_l = src - 1; -+ pixel * src_u = src - stride; -+ pixel * src_ur = src_u + size; ++ // There will be no filtering on C so no point worrying about disabling it ++#if !PRED_C ++ if (s->ps.sps->intra_smoothing_disabled_flag) ++ req &= ~FILTER_EITHER; ++ if (!s->ps.sps->sps_strong_intra_smoothing_enable_flag) ++ req &= ~FILTER_STRONG; ++#endif + + { + // N.B. 
stride is in pixels (not bytes) or in the case of chroma pixel-pairs @@ -36091,248 +37912,96 @@ index 0000000000..a76ba4c442 + src_ur += stripe_adj; + } + -+ if (s->ps.pps->constrained_intra_pred_flag == 1) { -+ int size_in_luma_pu_v = PU(size_in_luma_v); -+ int size_in_luma_pu_h = PU(size_in_luma_h); -+ int on_pu_edge_x = !av_mod_uintp2(x0, s->ps.sps->log2_min_pu_size); -+ int on_pu_edge_y = !av_mod_uintp2(y0, s->ps.sps->log2_min_pu_size); -+ if (!size_in_luma_pu_h) -+ size_in_luma_pu_h++; -+ if (cand_bottom_left == 1 && on_pu_edge_x) { -+ int x_left_pu = PU(x0 - 1); -+ int y_bottom_pu = PU(y0 + size_in_luma_v); -+ int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_bottom_pu); -+ cand_bottom_left = 0; -+ for (i = 0; i < max; i += 2) -+ cand_bottom_left |= (MVF(x_left_pu, y_bottom_pu + i).pred_flag == PF_INTRA); -+ } -+ if (cand_left == 1 && on_pu_edge_x) { -+ int x_left_pu = PU(x0 - 1); -+ int y_left_pu = PU(y0); -+ int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_left_pu); -+ cand_left = 0; -+ for (i = 0; i < max; i += 2) -+ cand_left |= (MVF(x_left_pu, y_left_pu + i).pred_flag == PF_INTRA); -+ } -+ if (cand_up_left == 1) { -+ int x_left_pu = PU(x0 - 1); -+ int y_top_pu = PU(y0 - 1); -+ cand_up_left = MVF(x_left_pu, y_top_pu).pred_flag == PF_INTRA; -+ } -+ if (cand_up == 1 && on_pu_edge_y) { -+ int x_top_pu = PU(x0); -+ int y_top_pu = PU(y0 - 1); -+ int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_top_pu); -+ cand_up = 0; -+ for (i = 0; i < max; i += 2) -+ cand_up |= (MVF(x_top_pu + i, y_top_pu).pred_flag == PF_INTRA); -+ } -+ if (cand_up_right == 1 && on_pu_edge_y) { -+ int y_top_pu = PU(y0 - 1); -+ int x_right_pu = PU(x0 + size_in_luma_h); -+ int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_right_pu); -+ cand_up_right = 0; -+ for (i = 0; i < max; i += 2) -+ cand_up_right |= (MVF(x_right_pu + i, y_top_pu).pred_flag == PF_INTRA); -+ } -+ memset(left, 128, 2 * MAX_TB_SIZE*sizeof(pixel)); -+ memset(top , 128, 2 * MAX_TB_SIZE*sizeof(pixel)); -+ top[-1] = 128; -+ } -+ if (cand_up_left) { -+ left[-1] = src_l[-stride]; -+ top[-1] = left[-1]; -+ } -+ if (cand_up) -+ // Always good - even with sand -+ memcpy(top, src_u, size * sizeof(pixel)); -+ if (cand_up_right) { -+ memcpy(top + size, src_ur, top_right_size * sizeof(pixel)); -+ EXTEND(top + size + top_right_size, top[size + top_right_size - 1], -+ size - top_right_size); -+ } -+ if (cand_left) -+ for (i = 0; i < size; i++) -+ left[i] = src_l[stride * i]; -+ if (cand_bottom_left) { -+ for (i = size; i < size + bottom_left_size; i++) -+ left[i] = src_l[stride * i]; -+ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1], -+ size - bottom_left_size); -+ } ++ if (s->ps.pps->constrained_intra_pred_flag == 1 && ++ s->sh.slice_type != HEVC_SLICE_I) // Can deal with I-slices in 'normal' code ++ { ++ const unsigned int l2_pu_s = FFMAX(s->ps.sps->log2_min_pu_size - hshift, 2); ++ const unsigned int l2_pu_stride_s = l2_pu_s - (s->ps.sps->log2_min_pu_size - hshift); + -+ if (s->ps.pps->constrained_intra_pred_flag == 1) { -+ if (cand_bottom_left || cand_left || cand_up_left || cand_up || cand_up_right) { -+ int size_max_x = x0 + ((2 * size) << hshift) < s->ps.sps->width ? -+ 2 * size : (s->ps.sps->width - x0) >> hshift; -+ int size_max_y = y0 + ((2 * size) << vshift) < s->ps.sps->height ? -+ 2 * size : (s->ps.sps->height - y0) >> vshift; -+ int j = size + (cand_bottom_left? 
bottom_left_size: 0) -1; -+ if (!cand_up_right) { -+ size_max_x = x0 + ((size) << hshift) < s->ps.sps->width ? -+ size : (s->ps.sps->width - x0) >> hshift; -+ } -+ if (!cand_bottom_left) { -+ size_max_y = y0 + (( size) << vshift) < s->ps.sps->height ? -+ size : (s->ps.sps->height - y0) >> vshift; -+ } -+ if (cand_bottom_left || cand_left || cand_up_left) { -+ while (j > -1 && !IS_INTRA(-1, j)) -+ j--; -+ if (!IS_INTRA(-1, j)) { -+ j = 0; -+ while (j < size_max_x && !IS_INTRA(j, -1)) -+ j++; -+ EXTEND_LEFT_CIP(top, j, j + 1); -+ left[-1] = top[-1]; -+ } -+ } else { -+ j = 0; -+ while (j < size_max_x && !IS_INTRA(j, -1)) -+ j++; -+ if (j > 0) -+ if (x0 > 0) { -+ EXTEND_LEFT_CIP(top, j, j + 1); -+ } else { -+ EXTEND_LEFT_CIP(top, j, j); -+ top[-1] = top[0]; -+ } -+ left[-1] = top[-1]; -+ } -+ left[-1] = top[-1]; -+ if (cand_bottom_left || cand_left) { -+ a = PIXEL_SPLAT_X4(left[-1]); -+ EXTEND_DOWN_CIP(left, 0, size_max_y); -+ } -+ if (!cand_left) -+ EXTEND(left, left[-1], size); -+ if (!cand_bottom_left) -+ EXTEND(left + size, left[size - 1], size); -+ if (x0 != 0 && y0 != 0) { -+ a = PIXEL_SPLAT_X4(left[size_max_y - 1]); -+ EXTEND_UP_CIP(left, size_max_y - 1, size_max_y); -+ if (!IS_INTRA(-1, - 1)) -+ left[-1] = left[0]; -+ } else if (x0 == 0) { -+ EXTEND(left, 0, size_max_y); -+ } else { -+ a = PIXEL_SPLAT_X4(left[size_max_y - 1]); -+ EXTEND_UP_CIP(left, size_max_y - 1, size_max_y); -+ } -+ top[-1] = left[-1]; -+ if (y0 != 0) { -+ a = PIXEL_SPLAT_X4(left[-1]); -+ EXTEND_RIGHT_CIP(top, 0, size_max_x); -+ } -+ } -+ } -+ // Infer the unavailable samples -+ if (!cand_bottom_left) { -+ if (cand_left) { -+ EXTEND(left + size, left[size - 1], size); -+ } else if (cand_up_left) { -+ EXTEND(left, left[-1], 2 * size); -+ cand_left = 1; -+ } else if (cand_up) { -+ left[-1] = top[0]; -+ EXTEND(left, left[-1], 2 * size); -+ cand_up_left = 1; -+ cand_left = 1; -+ } else if (cand_up_right) { -+ EXTEND(top, top[size], size); -+ left[-1] = top[size]; -+ EXTEND(left, left[-1], 2 * size); -+ cand_up = 1; -+ cand_up_left = 1; -+ cand_left = 1; -+ } else { // No samples available -+#if PRED_C -+ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8)); -+#else -+ left[-1] = (1 << (BIT_DEPTH - 1)); -+#endif -+ EXTEND(top, left[-1], 2 * size); -+ EXTEND(left, left[-1], 2 * size); -+ } -+ } ++ unsigned int avail_l = cip_avail(&MVF_PU(-1, size * 2 - 1), ++ -(int)(s->ps.sps->min_pu_width << l2_pu_stride_s), ++ l2_pu_s, ++ avail >> AVAIL_S_UL, ++ size, ++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size), size); ++ unsigned int avail_u = cip_avail(&MVF_PU(0, -1), ++ 1 << l2_pu_stride_s, ++ l2_pu_s, ++ avail << 1, ++ size, ++ size, FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size)); + -+ if (!cand_left) -+ EXTEND(left, left[size], size); -+ if (!cand_up_left) { -+ left[-1] = left[0]; -+ } -+ if (!cand_up) -+ EXTEND(top, left[-1], size); -+ if (!cand_up_right) -+ EXTEND(top + size, top[size - 1], size); ++ // Anything left? 
++ if ((avail_l | avail_u) == 0) ++ goto dc_only; + -+ top[-1] = left[-1]; ++ FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size); + -+ // Filtering process -+ // Sand can only apply to chroma_format_idc == 1 so we don't need to -+ // worry about chroma smoothing for that case +#if !PRED_C -+ if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || ctx_cfmt(s) == 3)) { -+ if (mode != INTRA_DC && size != 4){ -+ int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; -+ int min_dist_vert_hor = FFMIN(FFABS((int)(mode - 26U)), -+ FFABS((int)(mode - 10U))); -+ if (min_dist_vert_hor > intra_hor_ver_dist_thresh[log2_size - 3]) { -+ int threshold = 1 << (BIT_DEPTH - 5); -+ if (s->ps.sps->sps_strong_intra_smoothing_enable_flag && c_idx == 0 && -+ log2_size == 5 && -+ FFABS(top[-1] + top[63] - 2 * top[31]) < threshold && -+ FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) { -+ // We can't just overwrite values in top because it could be -+ // a pointer into src -+ filtered_top[-1] = top[-1]; -+ filtered_top[63] = top[63]; -+ for (i = 0; i < 63; i++) -+ filtered_top[i] = ((64 - (i + 1)) * top[-1] + -+ (i + 1) * top[63] + 32) >> 6; -+ for (i = 0; i < 63; i++) -+ left[i] = ((64 - (i + 1)) * left[-1] + -+ (i + 1) * left[63] + 32) >> 6; -+ top = filtered_top; -+ } else { -+ filtered_left[2 * size - 1] = left[2 * size - 1]; -+ filtered_top[2 * size - 1] = top[2 * size - 1]; -+ for (i = 2 * size - 2; i >= 0; i--) -+ filtered_left[i] = (left[i + 1] + 2 * left[i] + -+ left[i - 1] + 2) >> 2; -+ filtered_top[-1] = -+ filtered_left[-1] = (left[0] + 2 * left[-1] + top[0] + 2) >> 2; -+ for (i = 2 * size - 2; i >= 0; i--) -+ filtered_top[i] = (top[i + 1] + 2 * top[i] + -+ top[i - 1] + 2) >> 2; -+ left = filtered_left; -+ top = filtered_top; -+ } ++ if ((req & FILTER_LIGHT) != 0) ++ { ++ const unsigned threshold = 1 << (BIT_DEPTH - 5); ++ if ((req & FILTER_STRONG) != 0 && ++ (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold && ++ (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold) ++ { ++ filter_strong(top, left[-1], top[63], 64); ++ filter_strong(left, left[-1], left[63], 64); ++ } else ++ { ++ // LHS writes UL too so copy for top ++ const pixel p_ul = left[-1]; ++ filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size); ++ filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1); + } + } ++#endif ++ } ++ else ++ { ++ const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size); ++ if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 && ++ ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size)) ++ { ++ top_pred = src_u; ++ } ++ else ++ { ++#if !PRED_C ++ s->hpc.intra_filter[log2_size - 2] ++#else ++ s->hpc.intra_filter_c[log2_size - 2] ++#endif ++ ((uint8_t *)left, (uint8_t *)top, req, avail, ++ (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel), ++ ur_size, ++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size)); ++ } + } + ++ ++#if !PRED_C + switch (mode) { + case INTRA_PLANAR: -+ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride); + break; + case INTRA_DC: -+ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride); + break; + case INTRA_ANGULAR_HORIZONTAL: -+ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ 
++ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + case INTRA_ANGULAR_VERTICAL: -+ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + default: -+ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + } +#else + switch (mode) { + case INTRA_PLANAR: -+ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride); + break; + case INTRA_DC: -+ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride); + break; + case INTRA_ANGULAR_HORIZONTAL: -+ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + case INTRA_ANGULAR_VERTICAL: -+ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + default: -+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; @@ -36373,10 +38042,11 @@ index 0000000000..a76ba4c442 +#endif +} + -+#define INTRA_PRED(size) \ -+static void FUNC(intra_pred_ ## size)(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx) \ -+{ \ -+ FUNC(intra_pred)(s, lc, x0, y0, size, c_idx); \ ++#define INTRA_PRED(log2_size) \ ++static void FUNC(intra_pred_ ## log2_size)(const struct HEVCRpiContext * const s, \ ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail) \ ++{ \ ++ FUNC(intra_pred)(s, mode, x0, y0, avail, log2_size); \ +} + +INTRA_PRED(2) @@ -36521,6 +38191,56 @@ index 0000000000..a76ba4c442 + +#undef PRED_DC + ++ ++ ++ ++#if !PRED_C ++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) ++{ ++ int i, j; ++ int size = (1 << log2_size); ++ pixel *src = (pixel *)_src; ++ pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1)); ++ ++ for (i = 0; i < size; i++) ++ for (j = 0; j < size; j+=4) ++ AV_WN4P(&POS(j, i), a); ++} ++#else ++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) ++{ ++ unsigned int i, j; ++ const unsigned int size = (1 << log2_size); ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const pixel a = (1 << (BIT_DEPTH - 1)); ++ ++ for (i = 0; i < size; i++, src += stride) ++ { ++ for (j = 0; j < size; ++j) ++ { ++ src[j][0] = a; ++ src[j][1] = a; ++ } ++ } ++} ++#endif ++ ++#define PRED_DC0(size)\ ++static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_dc0)(src, stride, size + 2); \ ++} ++ ++PRED_DC0(0) ++PRED_DC0(1) ++PRED_DC0(2) ++PRED_DC0(3) ++ ++#undef PRED_DC0 ++ ++ ++ ++ +#ifndef ANGLE_CONSTS +#define ANGLE_CONSTS +static const int intra_pred_angle[] = { @@ -36835,6 +38555,11 @@ index 0000000000..a76ba4c442 +#undef POS +#undef PW + ++#undef filter_light1 ++#undef filter_light ++#undef filter_strong
++#undef ref_gen ++ +#ifndef INCLUDED_ONCE +#define INCLUDED_ONCE +#endif @@ -40917,7 +42642,7 @@ index 0000000000..59c0d3959e +# -Wa,-ahls diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh new file mode 100755 -index 0000000000..28b7a4f483 +index 0000000000..c8da66514b --- /dev/null +++ b/pi-util/conf_pi2.sh @@ -0,0 +1,32 @@ @@ -40928,7 +42653,7 @@ index 0000000000..28b7a4f483 + +RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" +RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" -+RPI_DEFINES="-D__VCCOREVER__=0x4000000" ++RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon" +#RPI_KEEPS="-save-temps=obj" +RPI_KEEPS="" +
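A note on the availability convention introduced by the prediction rework above: the old per-direction cand_* / ctb_*_flag booleans are collapsed into a single AVAIL_* bitmask whose bit order (UR=0, U=1, UL=2, L=3, DL=4) is fixed by the NEON asm. The standalone C sketch below shows how a CTB-level mask can be derived from slice/tile boundary flags; the boundary-flag values, the helper name and the test inputs are invented for illustration, and the UR case (which in the patch also depends on the CTB_TS_FLAGS_* tile-shape flags) is deliberately omitted.

#include <stdio.h>

#define AVAIL_S_UR 0
#define AVAIL_S_U  1
#define AVAIL_S_UL 2
#define AVAIL_S_L  3
#define AVAIL_S_DL 4

#define AVAIL_U  (1 << AVAIL_S_U)
#define AVAIL_L  (1 << AVAIL_S_L)
#define AVAIL_UL (1 << AVAIL_S_UL)
#define AVAIL_UR (1 << AVAIL_S_UR)
#define AVAIL_DL (1 << AVAIL_S_DL)

/* Illustrative stand-ins for the decoder's boundary flags */
#define BOUNDARY_LEFT_SLICE  (1 << 0)
#define BOUNDARY_LEFT_TILE   (1 << 1)
#define BOUNDARY_UPPER_SLICE (1 << 2)
#define BOUNDARY_UPPER_TILE  (1 << 3)

static unsigned int ctb_avail_from_boundaries(unsigned int boundary_flags,
                                              unsigned int ctb_addr_rs_in_slice,
                                              unsigned int line_w)
{
    /* L and U come straight from slice/tile boundaries; UL additionally
     * needs at least one whole CTB row above us in this slice; DL is
     * never available at CTB level */
    return
        ((boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) |
        ((boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) |
        ((boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
             ctb_addr_rs_in_slice > line_w ? AVAIL_UL : 0);
}

int main(void)
{
    /* CTB in the middle of a slice: left, up and up-left all available */
    printf("mid slice: %#x\n", ctb_avail_from_boundaries(0, 100, 30));
    /* First CTB to the right of a vertical tile boundary */
    printf("tile edge: %#x\n", ctb_avail_from_boundaries(BOUNDARY_LEFT_TILE, 100, 30));
    return 0;
}

Packing availability into one register is what lets the later code test several neighbours at once (e.g. want = req & ~avail in intra_filter) instead of chaining boolean tests.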
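The reference-sample conditioning in intra_filter reduces to the two smoothers sketched below, restated standalone under the assumption of 8-bit pels (the demo values are made up). filter_light1 is the HEVC [1 2 1]/4 tap; filter_strong replaces a whole edge with the bilinear ramp ((64 - (i + 1)) * p0 + (i + 1) * p1 + 32) >> 6, built incrementally exactly as in the patch, and in the decoder it is only selected for 32x32 luma blocks whose corner curvature passes both 1 << (BIT_DEPTH - 5) threshold tests.

#include <stdio.h>
#include <stdint.h>

typedef uint8_t pixel;  /* 8-bit depth assumed for this sketch */

/* Light smoothing: the [1 2 1]/4 tap applied along the edge */
static inline pixel filter_light1(pixel a, pixel b, pixel c)
{
    return (a + b * 2 + c + 2) >> 2;
}

/* Strong smoothing: dst[i] = ((64 - (i + 1)) * p0 + (i + 1) * p1 + 32) >> 6,
 * computed by repeatedly adding the fixed-point step v = p1 - p0 */
static void filter_strong(pixel *dst, unsigned int p0, unsigned int p1, unsigned int n)
{
    unsigned int a = 64 * p0 + 32;
    const int v = (int)p1 - (int)p0;

    do {
        *dst++ = (a += v) >> 6;
    } while (--n != 0);
}

int main(void)
{
    pixel ramp[64];

    filter_strong(ramp, 10, 200, 64);
    printf("strong ramp ends: %d %d\n", ramp[0], ramp[63]);      /* 13 200 */
    printf("light(10,20,30) = %d\n", filter_light1(10, 20, 30)); /* 20 */
    return 0;
}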
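Finally, the extend_8/16/32 helpers that replace the old pixel4-based EXTEND macro are plain word-splat fills: replicate the pel value across a 32-bit word, then store whole words, so an n-pel edge extension costs n/4 (or n/2) stores rather than n. A minimal sketch of the 8-bit case follows; the buffer and fill value are illustrative, and the real callers guarantee 4-pel multiples and alignment via the DECLARE_ALIGNED left/top arrays.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* 8-bit pel fill: splat v across a 32-bit word, then store n/4 words */
static inline void extend_8(void *ptr, unsigned int v, unsigned int n)
{
    if ((n >>= 2) != 0) {
        uint32_t v4 = v | (v << 8);
        uint32_t *p = (uint32_t *)ptr;
        v4 = v4 | (v4 << 16);
        do {
            *p++ = v4;
        } while (--n != 0);
    }
}

int main(void)
{
    uint32_t storage[8];                 /* 32 pels, word-aligned */
    uint8_t ref[32];
    uint8_t *buf = (uint8_t *)storage;

    extend_8(buf, 0x80, 32);             /* e.g. extending a mid-grey edge value */
    memset(ref, 0x80, sizeof(ref));
    printf("splat %s memset\n", memcmp(buf, ref, sizeof(ref)) == 0 ? "matches" : "differs");
    return 0;
}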