ffmpeg: update HEVC patch

This commit is contained in:
MilhouseVH 2018-07-15 04:43:04 +01:00
parent 6367035631
commit d115f9058c

View File

@ -19,7 +19,7 @@ index 0e57cb0b4c..b2e3374fea 100644
/ffplay
/ffprobe
diff --git a/configure b/configure
index dee507cb6a..0ee9efe1e7 100755
index dee507cb6a..9a93189107 100755
--- a/configure
+++ b/configure
@@ -318,6 +318,7 @@ External library support:
@ -30,15 +30,6 @@ index dee507cb6a..0ee9efe1e7 100755
--disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
--disable-nvenc disable Nvidia video encoding code [autodetect]
--enable-omx enable OpenMAX IL code [no]
@@ -1036,7 +1037,7 @@ EOF
check_insn(){
log check_insn "$@"
- check_inline_asm ${1}_inline "$2"
+ check_inline_asm ${1}_inline "\"$2\""
check_as ${1}_external "$2"
}
@@ -1776,6 +1777,7 @@ FEATURE_LIST="
gray
hardcoded_tables
@ -12100,10 +12091,10 @@ index 0000000000..6ce3d3ca8d
+
diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
new file mode 100644
index 0000000000..afafb6bc44
index 0000000000..67192e7213
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
@@ -0,0 +1,922 @@
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2018 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
+ *
@ -12340,28 +12331,25 @@ index 0000000000..afafb6bc44
+@ ? Might be faster as simple arm
+
+function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1
+ vld1.32 {d0[0] }, [r1 :32] @ Up
+ ldrb r12, [r2, #-1] @ Up-left
+ vld1.32 {d16[0]}, [r2 :32] @ left
+
+ vdup.8 d4, r12
+ vmov.u8 d6, #128
+ vhsub.u8 d0, d4
+
+ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd
+ add r2, r0, r3
+ vdup.8 d2, d2[0]
+ lsl r3, #1
+ vqadd.s8 d0, d2
+ veor.8 d0, d6
+
+ vdup.8 d1, d16[1]
+ vdup.8 d2, d16[2]
+ vdup.8 d3, d16[3]
+ vst1.32 {d0[0] }, [r0 :32], r3
+ vst1.32 {d1[0] }, [r2 :32], r3
+ vst1.32 {d2[0] }, [r0 :32]
+ vst1.32 {d3[0] }, [r2 :32]
+ ldrb ip, [r2, #-1] @ Top-left
+ vld1.32 {d0[0]}, [r1 :32] @ Top
+ add r1, r2, #3
+ vld1.8 {d1[]}, [r2]!
+ vdup.8 d2, ip
+ vmov.i8 d3, #128
+ vhsub.u8 d0, d2
+ veor d1, d3
+ vld1.8 {d2[]}, [r2]!
+ add ip, r0, r3
+ vqadd.s8 d0, d0, d1
+ lsl r3, #1
+ vld1.8 {d1[]}, [r2]
+ vld1.8 {d4[]}, [r1]
+ veor d0, d3
+ vst1.32 {d0[0]}, [r0 :32], r3
+ vst1.32 {d2[0]}, [ip :32], r3
+ vst1.32 {d1[0]}, [r0 :32]
+ vst1.32 {d4[0]}, [ip :32]
+
+ bx lr
+endfunc
@ -12374,35 +12362,27 @@ index 0000000000..afafb6bc44
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1
+ vld1.8 {d0 }, [r1 :64] @ Up
+ ldrb r12, [r2, #-1] @ Up-left
+ vld1.8 {d16}, [r2 :64] @ left
+
+ vdup.8 d4, r12
+ vmov.u8 d6, #128
+ vhsub.u8 d0, d4
+
+ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd
+ add r2, r0, r3
+ vdup.8 d2, d2[0]
+ lsl r3, #1
+ vqadd.s8 d0, d2
+ mov r1, #3
+ veor.8 d0, d6
+
+ vdup.8 d4, d16[1]
+ vst1.8 {d0 }, [r0 :64], r3
+ vst1.8 {d4 }, [r2 :64], r3
+
+ ldrb ip, [r2, #-1] @ Top-left
+ vld1.8 {d0}, [r1 :64] @ Top
+ vmov.i8 d1, #128
+ vld1.8 {d2[]}, [r2]!
+ mov r1, #8-2
+ vdup.8 d3, ip
+ vhsub.u8 d0, d3
+ veor d2, d1
+ vqadd.s8 d0, d2
+ vld1.8 {d2[]}, [r2]!
+ veor d0, d1
+ vst1.8 {d0}, [r0], r3
+1:
+ vext.8 d16, d16, #2
+ subs r1, #1
+ vdup.8 d0, d16[0]
+ vdup.8 d4, d16[1]
+ vst1.8 {d0 }, [r0 :64], r3
+ vst1.8 {d4 }, [r2 :64], r3
+ vld1.8 {d0[]}, [r2]!
+ subs r1, #2
+ vst1.8 {d2}, [r0 :64], r3
+ vld1.8 {d2[]}, [r2]!
+ vst1.8 {d0}, [r0 :64], r3
+ bne 1b
+
+ vst1.8 {d2}, [r0 :64]
+ bx lr
+endfunc
+
@ -12414,35 +12394,27 @@ index 0000000000..afafb6bc44
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1
+ vld1.8 {q0 }, [r1 :128] @ Up
+ ldrb r12, [r2, #-1] @ Up-left
+ vld1.8 {q8 }, [r2 :128] @ left
+
+ vdup.8 q2, r12
+ vmov.u8 q3, #128
+ vhsub.u8 q0, q2
+
+ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd
+ add r2, r0, r3
+ vdup.8 q1, d2[0]
+ lsl r3, #1
+ vqadd.s8 q0, q1
+ mov r1, #7
+ veor.8 q0, q3
+
+ vdup.8 q2, d16[1]
+ vst1.8 {q0 }, [r0 :128], r3
+ vst1.8 {q2 }, [r2 :128], r3
+
+ ldrb ip, [r2, #-1] @ Top-left
+ vld1.8 {q0}, [r1 :64] @ Top
+ mov r1, #16-2
+ vld1.8 {d4[],d5[]}, [r2]!
+ vdup.8 q3, ip
+ vhsub.u8 q0, q3
+ vmov.i8 q1, #128
+ veor q2, q1
+ vqadd.s8 q0, q2
+ vld1.8 {d4[],d5[]}, [r2]!
+ veor q0, q1
+ vst1.8 {q0}, [r0], r3
+1:
+ vext.8 q8, q8, #2
+ subs r1, #1
+ vdup.8 q0, d16[0]
+ vdup.8 q2, d16[1]
+ vst1.8 {q0 }, [r0 :128], r3
+ vst1.8 {q2 }, [r2 :128], r3
+ vld1.8 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.8 {q2}, [r0 :64], r3
+ vld1.8 {d4[],d5[]}, [r2]!
+ vst1.8 {q0}, [r0 :64], r3
+ bne 1b
+
+ vst1.8 {q2}, [r0 :64]
+ bx lr
+endfunc
+
@ -12454,22 +12426,24 @@ index 0000000000..afafb6bc44
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1
+ vld1.8 {q8, q9 }, [r2 :128] @ Left
+ add r2, r0, r3
+ lsl r3, #1
+ mov r1, #16
+ vld1.8 {d0[],d1[]}, [r2]!
+ add ip, r0, #16
+ mov r1, #32-2
+ vld1.8 {d2[],d3[]}, [r2]!
+ vst1.8 {q0}, [r0 :128], r3
+ vst1.8 {q0}, [ip :128], r3
+1:
+ vdup.8 q0, d16[0]
+ vdup.8 q1, d16[0]
+ vdup.8 q2, d16[1]
+ vdup.8 q3, d16[1]
+ vext.8 q8, q9, #2
+ vext.8 q9, q9, #2
+ vst1.8 {q0, q1 }, [r0 :128], r3
+ subs r1, #1
+ vst1.8 {q2, q3 }, [r2 :128], r3
+ vld1.8 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.8 {q1}, [r0 :128], r3
+ vst1.8 {q1}, [ip :128], r3
+ vld1.8 {d2[],d3[]}, [r2]!
+ vst1.8 {q0}, [r0 :128], r3
+ vst1.8 {q0}, [ip :128], r3
+ bne 1b
+
+ vst1.8 {q1}, [r0 :128]
+ vst1.8 {q1}, [ip :128]
+ bx lr
+endfunc
+
@ -12481,19 +12455,22 @@ index 0000000000..afafb6bc44
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1
+ vld1.16 {d16}, [r2 :64] @ Left
+ add r2, r0, r3, lsl #1
+ lsl r3, #2
+
+ vdup.16 d0, d16[0]
+ vdup.16 d1, d16[1]
+ vdup.16 d2, d16[2]
+ vdup.16 d3, d16[3]
+
+ vst1.16 {d0 }, [r0 :64], r3
+ vst1.16 {d1 }, [r2 :64], r3
+ vst1.16 {d2 }, [r0 :64]
+ vst1.16 {d3 }, [r2 :64]
+ add r1, r2, #2
+ vld1.16 {d0[]}, [r2]
+ add r2, #4
+ vld1.16 {d1[]}, [r1]
+ add r1, #4
+ vld1.16 {d2[]}, [r2]
+A add r2, r0, r3, lsl #1
+T lsl r3, #1
+T add r2, r0, r3
+ vld1.16 {d3[]}, [r1]
+A lsl r3, #2
+T lsl r3, #1
+ vst1.16 {d0}, [r0 :64], r3
+ vst1.16 {d1}, [r2 :64], r3
+ vst1.16 {d2}, [r0 :64]
+ vst1.16 {d3}, [r2 :64]
+
+ bx lr
+endfunc
@ -12506,19 +12483,20 @@ index 0000000000..afafb6bc44
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1
+ vld1.16 {q8 }, [r2 :128] @ Left
+ add r2, r0, r3, lsl #1
+ lsl r3, #2
+ mov r1, #4
+ vld1.16 {d0[],d1[]}, [r2]!
+ lsl r3, #1
+ vld1.16 {d2[],d3[]}, [r2]!
+ mov r1, #8-2
+ vst1.16 {q0}, [r0 :64], r3
+1:
+ vdup.16 q0, d16[0]
+ vdup.16 q2, d16[1]
+ vext.16 q8, q8, #2
+ vst1.16 {q0 }, [r0 :128], r3
+ subs r1, #1
+ vst1.16 {q2 }, [r2 :128], r3
+ vld1.16 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.16 {q1}, [r0 :64], r3
+ vld1.16 {d2[],d3[]}, [r2]!
+ vst1.16 {q0}, [r0 :64], r3
+ bne 1b
+
+ vst1.16 {q1}, [r0 :64]
+ bx lr
+endfunc
+
@ -12530,22 +12508,25 @@ index 0000000000..afafb6bc44
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1
+ vld1.16 {q8, q9 }, [r2 :128] @ Left
+ add r2, r0, r3, lsl #1
+ lsl r3, #2
+ mov r1, #8
+ vld1.16 {d0[],d1[]}, [r2]!
+ lsl r3, #1
+ add ip, r0, #16
+ mov r1, #16-2
+ vld1.16 {d2[],d3[]}, [r2]!
+ vst1.16 {q0}, [r0 :128], r3
+ vst1.16 {q0}, [ip :128], r3
+1:
+ vdup.16 q0, d16[0]
+ vdup.16 q1, d16[0]
+ vdup.16 q2, d16[1]
+ vdup.16 q3, d16[1]
+ vext.16 q8, q9, #2
+ vext.16 q9, q9, #2
+ vst1.16 {q0, q1 }, [r0 :128], r3
+ subs r1, #1
+ vst1.16 {q2, q3 }, [r2 :128], r3
+ vld1.16 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.16 {q1}, [r0 :128], r3
+ vst1.16 {q1}, [ip :128], r3
+ vld1.16 {d2[],d3[]}, [r2]!
+ vst1.16 {q0}, [r0 :128], r3
+ vst1.16 {q0}, [ip :128], r3
+ bne 1b
+
+ vst1.16 {q1}, [r0 :128]
+ vst1.16 {q1}, [ip :128]
+ bx lr
+endfunc
+
@ -12780,31 +12761,28 @@ index 0000000000..afafb6bc44
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1
+ vld1.16 {d0 }, [r1 :64] @ Up
+ ldrh r12, [r2, #-2] @ Up-left
+ vld1.16 {d16}, [r2 :64] @ left
+
+ vdup.16 d4, r12
+ add r2, r0, r3, lsl #1
+ vhsub.u16 d0, d4
+
+ vdup.16 d6, d16[0]
+ vmov.s16 d4, #0
+ vadd.i16 d0, d6
+
+ vmov.s16 d6, #0x3ff
+ vmax.s16 d0, d4
+ lsl r3, #2
+ vmin.s16 d0, d6
+
+ vdup.16 d1, d16[1]
+ vdup.16 d2, d16[2]
+ vdup.16 d3, d16[3]
+
+ vst1.16 {d0 }, [r0 :64], r3
+ vst1.16 {d1 }, [r2 :64], r3
+ vst1.16 {d2 }, [r0 :64]
+ vst1.16 {d3 }, [r2 :64]
+ ldrh ip, [r2, #-2] @ Top-left
+ vld1.16 {d0}, [r1 :64] @ Top
+ vmov.i16 d1, #0
+ vld1.16 {d2[]}, [r2]!
+T lsl r3, #1
+ vdup.16 d3, ip
+ vmov.i16 d4, #0x3ff
+ vhsub.u16 d0, d3
+A add ip, r0, r3, lsl #1
+T add ip, r0, r3
+ vld1.16 {d3[]}, [r2]!
+A lsl r3, #2
+T lsl r3, #1
+ vadd.i16 d0, d2
+ vld1.16 {d2[]}, [r2]!
+ vmax.s16 d0, d1
+ vld1.16 {d1[]}, [r2]
+ vmin.s16 d0, d4
+ vst1.16 {d0}, [r0 :64], r3
+ vst1.16 {d3}, [ip :64], r3
+ vst1.16 {d2}, [r0 :64]
+ vst1.16 {d1}, [ip :64]
+
+ bx lr
+endfunc
@ -12817,37 +12795,29 @@ index 0000000000..afafb6bc44
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1
+ vld1.16 {q0 }, [r1 :128] @ Up
+ ldrh r12, [r2, #-2] @ Up-left
+ vld1.16 {q8 }, [r2 :128] @ left
+
+ vdup.16 q2, r12
+ add r2, r0, r3, lsl #1
+ vhsub.u16 q0, q2
+
+ vdup.16 q3, d16[0]
+ lsl r3, #2
+ vmov.s16 q2, #0
+ vadd.i16 q0, q3
+
+ mov r1, #3
+ vmov.s16 q3, #0x3ff
+ vmax.s16 q0, q2
+ vmin.s16 q0, q3
+
+ vdup.16 q2, d16[1]
+
+ vst1.16 {q0 }, [r0 :128], r3
+ vst1.16 {q2 }, [r2 :128], r3
+ ldrh ip, [r2, #-2] @ Top-left
+ vld1.16 {q0}, [r1 :128] @ Top
+ lsl r3, #1
+ vdup.16 q1, ip
+ mov r1, #8-2
+ vhsub.u16 q0, q1
+ vld1.16 {d2[],d3[]}, [r2]!
+ vmov.i16 q2, #0
+ vadd.i16 q0, q1
+ vmov.i16 q1, #0x3ff
+ vmax.s16 q0, q2
+ vld1.16 {d4[],d5[]}, [r2]!
+ vmin.s16 q0, q1
+ vst1.16 {q0}, [r0 :128], r3
+1:
+ vext.16 q8, q8, #2
+ vdup.16 q0, d16[0]
+ vdup.16 q2, d16[1]
+ subs r1, #1
+ vst1.16 {q0 }, [r0 :128], r3
+ vst1.16 {q2 }, [r2 :128], r3
+ vld1.16 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.16 {q2}, [r0 :128], r3
+ vld1.16 {d4[],d5[]}, [r2]!
+ vst1.16 {q0}, [r0 :128], r3
+ bne 1b
+
+ vst1.16 {q2}, [r0 :128]
+ bx lr
+endfunc
+
@ -12859,46 +12829,38 @@ index 0000000000..afafb6bc44
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1
+ vld1.16 {q0, q1 }, [r1 :128] @ Up
+ ldrh r12, [r2, #-2] @ Up-left
+ vld1.16 {q8, q9 }, [r2 :128] @ left
+
+
+ vdup.16 q2, r12
+ add r2, r0, r3, lsl #1
+ vhsub.u16 q0, q2
+ vhsub.u16 q1, q2
+
+ vdup.16 q3, d16[0]
+ lsl r3, #2
+ vmov.s16 q2, #0
+ vadd.i16 q0, q3
+ vadd.i16 q1, q3
+
+ mov r1, #7
+ vmov.s16 q3, #0x3ff
+ vmax.s16 q0, q2
+ vmax.s16 q1, q2
+ vmin.s16 q0, q3
+ vmin.s16 q1, q3
+
+ vdup.16 q2, d16[1]
+ vdup.16 q3, d16[1]
+
+ vst1.16 {q0, q1 }, [r0 :128], r3
+ vst1.16 {q2, q3 }, [r2 :128], r3
+ ldrh ip, [r2, #-2] @ Top-left
+ vld1.16 {q0-q1}, [r1 :128] @ Top
+ lsl r3, #1
+ vdup.16 q2, ip
+ add ip, r0, r3
+ vhsub.u16 q0, q2
+ add ip, #16
+ vhsub.u16 q1, q2
+ mov r1, #16-2
+ vld1.16 {d4[],d5[]}, [r2]!
+ vmov.i16 q3, #0
+ vadd.u16 q0, q2
+ vadd.i16 q1, q2
+ vmov.i16 q2, #0x3ff
+ vmax.s16 q0, q3
+ vmax.s16 q1, q3
+ vld1.16 {d6[],d7[]}, [r2]!
+ vmin.s16 q0, q2
+ vmin.s16 q1, q2
+ vst1.16 {q0-q1}, [r0 :128], r3
+1:
+ vext.16 q8, q9, #2
+ vext.16 q9, q9, #2
+ vdup.16 q0, d16[0]
+ vdup.16 q1, d16[0]
+ vdup.16 q2, d16[1]
+ vdup.16 q3, d16[1]
+ subs r1, #1
+ vst1.16 {q0, q1 }, [r0 :128], r3
+ vst1.16 {q2, q3 }, [r2 :128], r3
+ vld1.16 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.16 {q3}, [r0 :128], r3
+ vst1.16 {q3}, [ip :128], r3
+ vld1.16 {d6[],d7[]}, [r2]!
+ vst1.16 {q0}, [r0 :128], r3
+ vst1.16 {q0}, [ip :128], r3
+ bne 1b
+
+ vst1.16 {q3}, [r0 :128]
+ vst1.16 {q3}, [ip :128]
+ bx lr
+endfunc
+
@ -12910,31 +12872,37 @@ index 0000000000..afafb6bc44
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1
+ vldm r2, { q8-q11}
+ mov r1, #16
+ vld1.16 {d0[],d1[]}, [r2]!
+ add ip, r0, #16
+ push {lr}
+ mov lr, #32
+ vld1.16 {d2[],d3[]}, [r2]!
+ lsl r3, #1
+ vst1.16 {q0}, [r0 :128], lr
+ sub r3, #32
+ vst1.16 {q0}, [ip :128], lr
+ mov r1, #32-2
+ vst1.16 {q0}, [r0 :128], r3
+ vst1.16 {q0}, [ip :128], r3
+1:
+ vdup.16 q0, d16[0]
+ vdup.16 q1, d16[0]
+ vdup.16 q2, d16[0]
+ vdup.16 q3, d16[0]
+ add r2, r0, r3, lsl #1
+ vdup.16 q12, d16[1]
+ vdup.16 q13, d16[1]
+ vdup.16 q14, d16[1]
+ vdup.16 q15, d16[1]
+ vstm r0, { q0-q3 }
+ vstm r2, {q12-q15}
+
+ vext.16 q8, q9, #2
+ vext.16 q9, q10, #2
+ add r0, r0, r3, lsl #2
+ vext.16 q10, q11, #2
+ subs r1, #1
+ vext.16 q11, q11, #2
+
+ vld1.16 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.16 {q1}, [r0 :128], lr
+ vst1.16 {q1}, [ip :128], lr
+ vst1.16 {q1}, [r0 :128], r3
+ vst1.16 {q1}, [ip :128], r3
+ vld1.16 {d2[],d3[]}, [r2]!
+ vst1.16 {q0}, [r0 :128], lr
+ vst1.16 {q0}, [ip :128], lr
+ vst1.16 {q0}, [r0 :128], r3
+ vst1.16 {q0}, [ip :128], r3
+ bne 1b
+
+ bx lr
+ vst1.16 {q1}, [r0 :128], lr
+ vst1.16 {q1}, [ip :128], lr
+ vst1.16 {q1}, [r0 :128]
+ vst1.16 {q1}, [ip :128]
+ pop {pc}
+endfunc
+
+
@ -12945,19 +12913,22 @@ index 0000000000..afafb6bc44
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1
+ vld1.16 {q8 }, [r2 :128] @ Left
+ add r2, r0, r3, lsl #2
+ lsl r3, #3
+
+ vdup.32 q0, d16[0]
+ vdup.32 q1, d16[1]
+ vdup.32 q2, d17[0]
+ vdup.32 q3, d17[1]
+
+ vst1.32 {q0 }, [r0 :128], r3
+ vst1.16 {q1 }, [r2 :128], r3
+ vst1.32 {q2 }, [r0 :128]
+ vst1.16 {q3 }, [r2 :128]
+ add r1, r2, #4
+ vld1.32 {d0[],d1[]}, [r2]
+ add r2, #8
+ vld1.32 {d2[],d3[]}, [r1]
+ add r1, #8
+ vld1.32 {d4[],d5[]}, [r2]
+A add r2, r0, r3, lsl #2
+T lsl r3, #2
+T add r2, r0, r3
+ vld1.32 {d6[],d7[]}, [r1]
+A lsl r3, #3
+T lsl r3, #1
+ vst1.32 {q0}, [r0 :128], r3
+ vst1.32 {q1}, [r2 :128], r3
+ vst1.32 {q2}, [r0 :128]
+ vst1.32 {q3}, [r2 :128]
+
+ bx lr
+endfunc
@ -12970,22 +12941,25 @@ index 0000000000..afafb6bc44
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1
+ vld1.16 {q8, q9 }, [r2 :128] @ Left
+ add r2, r0, r3, lsl #2
+ lsl r3, #3
+ mov r1, #4
+ vld1.32 {d0[],d1[]}, [r2]!
+ lsl r3, #2
+ add ip, r0, #16
+ mov r1, #8-2
+ vld1.32 {d2[],d3[]}, [r2]!
+ vst1.32 {q0}, [r0 :128], r3
+ vst1.32 {q0}, [ip :128], r3
+1:
+ vdup.32 q0, d16[0]
+ vdup.32 q1, d16[0]
+ vdup.32 q2, d16[1]
+ vdup.32 q3, d16[1]
+ vext.32 q8, q9, #2
+ vext.32 q9, q9, #2
+ vst1.32 {q0, q1 }, [r0 :128], r3
+ subs r1, #1
+ vst1.32 {q2, q3 }, [r2 :128], r3
+ vld1.32 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.32 {q1}, [r0 :128], r3
+ vst1.32 {q1}, [ip :128], r3
+ vld1.32 {d2[],d3[]}, [r2]!
+ vst1.32 {q0}, [r0 :128], r3
+ vst1.32 {q0}, [ip :128], r3
+ bne 1b
+
+ vst1.32 {q1}, [r0 :128]
+ vst1.32 {q1}, [ip :128]
+ bx lr
+endfunc
+
@ -12997,31 +12971,37 @@ index 0000000000..afafb6bc44
+@ ptrdiff_t stride) [r3]
+
+function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1
+ vldm r2, { q8-q11}
+ mov r1, #8
+ vld1.32 {d0[],d1[]}, [r2]!
+ add ip, r0, #16
+ push {lr}
+ mov lr, #32
+ vld1.32 {d2[],d3[]}, [r2]!
+ lsl r3, #2
+ vst1.32 {q0}, [r0 :128], lr
+ sub r3, #32
+ vst1.32 {q0}, [ip :128], lr
+ mov r1, #16-2
+ vst1.32 {q0}, [r0 :128], r3
+ vst1.32 {q0}, [ip :128], r3
+1:
+ vdup.32 q0, d16[0]
+ vdup.32 q1, d16[0]
+ vdup.32 q2, d16[0]
+ vdup.32 q3, d16[0]
+ add r2, r0, r3, lsl #2
+ vdup.32 q12, d16[1]
+ vdup.32 q13, d16[1]
+ vdup.32 q14, d16[1]
+ vdup.32 q15, d16[1]
+ vstm r0, { q0-q3 }
+ vstm r2, {q12-q15}
+
+ vext.32 q8, q9, #2
+ vext.32 q9, q10, #2
+ add r0, r0, r3, lsl #3
+ vext.32 q10, q11, #2
+ subs r1, #1
+ vext.32 q11, q11, #2
+
+ vld1.32 {d0[],d1[]}, [r2]!
+ subs r1, #2
+ vst1.32 {q1}, [r0 :128], lr
+ vst1.32 {q1}, [ip :128], lr
+ vst1.32 {q1}, [r0 :128], r3
+ vst1.32 {q1}, [ip :128], r3
+ vld1.32 {d2[],d3[]}, [r2]!
+ vst1.32 {q0}, [r0 :128], lr
+ vst1.32 {q0}, [ip :128], lr
+ vst1.32 {q0}, [r0 :128], r3
+ vst1.32 {q0}, [ip :128], r3
+ bne 1b
+
+ bx lr
+ vst1.32 {q1}, [r0 :128], lr
+ vst1.32 {q1}, [ip :128], lr
+ vst1.32 {q1}, [r0 :128]
+ vst1.32 {q1}, [ip :128]
+ pop {pc}
+endfunc
+
+
@ -18040,10 +18020,10 @@ index 0000000000..8e7695bcf9
+
diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c
new file mode 100644
index 0000000000..f283f01489
index 0000000000..163e2558dc
--- /dev/null
+++ b/libavcodec/rpi_hevc_mvs.c
@@ -0,0 +1,704 @@
@@ -0,0 +1,681 @@
+/*
+ * HEVC video decoder
+ *
@ -18163,7 +18143,7 @@ index 0000000000..f283f01489
+ refPicList, X, refIdxLx, \
+ refPicList_col, L ## l, temp_col.ref_idx[l])
+
+// derive the motion vectors section 8.5.3.1.8
+// derive the motion vectors section 8.5.3.2.8
+static int derive_temporal_colocated_mvs(const HEVCRpiContext * const s, const MvField temp_col,
+ const int refIdxLx, Mv * const mvLXCol, const int X,
+ const int colPic, const RefPicList * const refPicList_col)
@ -18173,35 +18153,12 @@ index 0000000000..f283f01489
+ if (temp_col.pred_flag == PF_INTRA)
+ return 0;
+
+ if (!(temp_col.pred_flag & PF_L0))
+ return CHECK_MVSET(1);
+ else if (temp_col.pred_flag == PF_L0)
+ if (temp_col.pred_flag == PF_L0 ||
+ (temp_col.pred_flag == PF_BI && (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
+ {
+ return CHECK_MVSET(0);
+ else if (temp_col.pred_flag == PF_BI) {
+ int check_diffpicount = 0;
+ int i, j;
+ for (j = 0; j < 2; j++) {
+ for (i = 0; i < refPicList[j].nb_refs; i++) {
+ if (refPicList[j].list[i] > s->poc) {
+ check_diffpicount++;
+ break;
+ }
+ }
+ }
+ if (!check_diffpicount) {
+ if (X==0)
+ return CHECK_MVSET(0);
+ else
+ return CHECK_MVSET(1);
+ } else {
+ if (s->sh.collocated_list == L1)
+ return CHECK_MVSET(0);
+ else
+ return CHECK_MVSET(1);
+ }
+ }
+
+ return 0;
+ return CHECK_MVSET(1);
+}
+
+#define TAB_MVF(x, y) \
@ -27149,10 +27106,10 @@ index 0000000000..3557348e30
+};
diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c
new file mode 100644
index 0000000000..255dd6835a
index 0000000000..eef98e5643
--- /dev/null
+++ b/libavcodec/rpi_hevcdec.c
@@ -0,0 +1,5799 @@
@@ -0,0 +1,5820 @@
+/*
+ * HEVC video Decoder
+ *
@ -31981,12 +31938,33 @@ index 0000000000..255dd6835a
+}
+
+
+static void set_no_backward_pred(HEVCRpiContext * const s)  // Recompute s->no_backward_pred_flag from the current frame's two reference picture lists.
+{
+ int i, j;
+ const RefPicList *const refPicList = s->ref->refPicList;
+
+ s->no_backward_pred_flag = 0;  // default; this is also the result on the early return below
+ if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag)
+ return;  // lists are only scanned for B slices with temporal MVP enabled
+
+ for (j = 0; j < 2; j++) {  // walk both lists (presumably L0 then L1)
+ for (i = 0; i < refPicList[j].nb_refs; i++) {
+ if (refPicList[j].list[i] > s->poc) {  // list entry (presumably a POC) greater than the current POC
+ s->no_backward_pred_flag = 1;  // NOTE(review): set when such an entry IS found, which reads inverted vs. the name - confirm against the flag's users
+ return;  // first hit decides; remaining entries are not examined
+ }
+ }
+ }
+}
+
+static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal)  // Generate entry points for this NAL, refresh the backward-pred flag, then run the decode entry.
+{
+ int err;
+ if ((err = gen_entry_points(s, nal)) < 0)
+ return err;  // negative result from gen_entry_points is passed back unchanged
+
+ set_no_backward_pred(s);  // updates s->no_backward_pred_flag before decoding starts
+
+ return rpi_decode_entry(s->avctx, NULL);  // the decode entry's result is this function's return value
+}
+
@ -32954,10 +32932,10 @@ index 0000000000..255dd6835a
+
diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h
new file mode 100644
index 0000000000..a5ce342ab3
index 0000000000..ea08308be2
--- /dev/null
+++ b/libavcodec/rpi_hevcdec.h
@@ -0,0 +1,956 @@
@@ -0,0 +1,959 @@
+/*
+ * HEVC video decoder
+ *
@ -33616,13 +33594,18 @@ index 0000000000..a5ce342ab3
+ const AVClass *c; // needed by private avoptions
+ AVCodecContext *avctx;
+
+ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS];
+ HEVCRpiLocalContext *HEVClc;
+
+ uint8_t threads_type;
+
+ /** 1 if the independent slice segment header was successfully parsed */
+ uint8_t slice_initialized;
+ char used_for_ref; // rpi
+ char offload_recon;
+ uint8_t eos; ///< current packet contains an EOS/EOB NAL
+ uint8_t last_eos; ///< last packet contains an EOS/EOB NAL
+ uint8_t no_backward_pred_flag;
+ uint8_t is_decoded;
+ uint8_t no_rasl_output_flag;
+
+
+ /**
+ * Sequence counters for decoded and output frames, so that old
@ -33634,9 +33617,6 @@ index 0000000000..a5ce342ab3
+ int width;
+ int height;
+
+ char used_for_ref; // rpi
+ char offload_recon;
+
+ HEVCRpiJobCtl * jbc;
+ // cabac stash
+ // b0 skip flag
@ -33662,33 +33642,19 @@ index 0000000000..a5ce342ab3
+ uint8_t *sao_pixel_buffer_h[3];
+ uint8_t *sao_pixel_buffer_v[3];
+
+ HEVCRpiParamSets ps;
+
+ AVBufferPool *tab_mvf_pool;
+ AVBufferPool *rpl_tab_pool;
+
+ ///< candidate references for the current frame
+ RefPicList rps[5];
+
+ RpiSliceHeader sh;
+ RpiSAOParams *sao;
+ DBParams *deblock;
+ enum HEVCNALUnitType nal_unit_type;
+ int temporal_id; ///< temporal_id_plus1 - 1
+ HEVCFrame *ref;
+ HEVCFrame DPB[HEVC_DPB_ELS];
+ int poc;
+ int pocTid0;
+ int slice_idx; ///< number of the slice being currently decoded
+ int eos; ///< current packet contains an EOS/EOB NAL
+ int last_eos; ///< last packet contains an EOS/EOB NAL
+ int max_ra;
+
+ int is_decoded;
+ int no_rasl_output_flag;
+
+ HEVCRpiPredContext hpc;
+ HEVCDSPContext hevcdsp;
+ int8_t *qp_y_tab;
+
+ // Deblocking block strength bitmaps
@ -33731,6 +33697,21 @@ index 0000000000..a5ce342ab3
+
+ struct AVMD5 *md5_ctx;
+
+ RpiSliceHeader sh;
+
+ HEVCRpiParamSets ps;
+
+ HEVCRpiLocalContext *HEVClc;
+ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS];
+
+ HEVCFrame DPB[HEVC_DPB_ELS];
+
+ ///< candidate references for the current frame
+ RefPicList rps[5];
+
+ HEVCRpiPredContext hpc;
+ HEVCDSPContext hevcdsp;
+
+ HEVCSEIContext sei;
+
+ // Put structures that allocate non-trivial storage at the end