diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk
index 057d1692fe..4d9a4a6157 100644
--- a/packages/multimedia/ffmpeg/package.mk
+++ b/packages/multimedia/ffmpeg/package.mk
@@ -1,19 +1,20 @@
################################################################################
-# This file is part of OpenELEC - http://www.openelec.tv
+# This file is part of LibreELEC - https://libreelec.tv
+# Copyright (C) 2017-present Team LibreELEC
# Copyright (C) 2009-2016 Stephan Raue (stephan@openelec.tv)
#
-# OpenELEC is free software: you can redistribute it and/or modify
+# LibreELEC is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
-# OpenELEC is distributed in the hope that it will be useful,
+# LibreELEC is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with OpenELEC. If not, see <http://www.gnu.org/licenses/>.
+# along with LibreELEC. If not, see <http://www.gnu.org/licenses/>.
################################################################################
PKG_NAME="ffmpeg"
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
index ef2f4d7d62..91ea9da3dd 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
@@ -582,18 +582,19 @@ index 4d4ef530e4..fba8776c9f 100644
{
const AVCodec *p, *experimental = NULL;
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
-index e656011c3c..69cd820f06 100644
+index e656011c3c..70c3f026b8 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
-@@ -40,6 +40,7 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
+@@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
arm/sbrdsp_init_arm.o
OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o
OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
-+OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o
++OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \
++ arm/rpi_hevcpred_init_arm.o
OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o
-@@ -136,10 +137,18 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
+@@ -136,10 +138,23 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
@@ -608,7 +609,12 @@ index e656011c3c..69cd820f06 100644
+ arm/rpi_hevcdsp_idct_neon.o \
+ arm/rpi_hevcdsp_res8_neon.o \
+ arm/rpi_hevcdsp_res16_neon.o \
-+ arm/rpi_hevcdsp_sao_neon.o
++ arm/rpi_hevcdsp_sao_neon.o \
++ arm/rpi_hevcpred_init_neon.o \
++ arm/rpi_hevcpred_intra_angular_neon.o \
++ arm/rpi_hevcpred_intra_dc_neon.o \
++ arm/rpi_hevcpred_intra_hv_neon.o \
++ arm/rpi_hevcpred_intra_planar_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
arm/rv40dsp_neon.o
@@ -1738,10 +1744,10 @@ index 0000000000..62b9326532
+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
new file mode 100644
-index 0000000000..e665bd848a
+index 0000000000..f75c82671e
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
-@@ -0,0 +1,1249 @@
+@@ -0,0 +1,1593 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi
+ *
@@ -1766,65 +1772,72 @@ index 0000000000..e665bd848a
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
-+.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a
++.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8
+ vsubl.u8 q0, \Q0a, \P0a
-+ vsubl.u8 q2, \P1a, \Q1a
-+ vshl.i16 q0, #2
-+ vadd.i16 q0, q2
++ vsubl.u8 q1, \P1a, \Q1a
+ vdup.16 d4, r2
-+
-+ vrshr.s16 q0, #3
++ \I1
++ vshl.i16 q0, #2
++ \I2
++ vadd.i16 q0, q1
++ \I3
+ vmovl.u8 q2, d4
-+
++ \I4
++ vneg.s16 q1, q2
++ \I5
++ vrshr.s16 q0, #3
++ \I6
++ \I7
++ \I8
+ vmin.s16 q0, q2
-+ vneg.s16 q2, q2
-+ vmax.s16 q0, q2
-+ vaddw.u8 q2, q0, \P0a
-+
-+ vqmovun.s16 \P0a, q2
+ vmovl.u8 q2, \Q0a
-+ vsub.i16 q2, q0
-+
-+ vqmovun.s16 \Q0a, q2
++ vmax.s16 q0, q1
++ vaddw.u8 q1, q0, \P0a
++ vsub.i16 q0, q2, q0
++ vqmovun.s16 \P0a, q1
++ vqmovun.s16 \Q0a, q0
+.endm
+
+
-+.macro hevc_loop_filter_uv_body2 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v
-+ vsubl.u8 q0, \Q0u, \P0u
-+ vsubl.u8 q1, \Q0v, \P0v
-+ vsubl.u8 q2, \P1u, \Q1u
-+ vsubl.u8 q3, \P1v, \Q1v
-+ vshl.i16 q0, #2
-+ vshl.i16 q1, #2
-+ vadd.i16 q0, q2
-+ vdup.16 d4, r2
-+ lsr r2, #16
-+ vadd.i16 q1, q3
-+
-+ vrshr.s16 q0, #3
-+ vdup.16 d6, r2
-+ vmovl.u8 q2, d4
-+ vmovl.u8 q3, d6
-+ vrshr.s16 q1, #3
-+
++.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7
++ vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a
++ lsr r12, r2, #16
++ vsubl.u8 q1, \Q0b, \P0b @ q0b - p0b
++ vsubl.u8 q2, \P1a, \Q1a @ p1a - q1a
++ vsubl.u8 q3, \P1b, \Q1b @ p1b - q1b
++ vshl.i16 q0, #2 @ (q0a - p0a) * 4
++ vshl.i16 q1, #2 @ (q0b - p0b) * 4
++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a
++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b
++ vdup.16 d4, r2 @ tc0a, tc0b
++ vdup.16 d6, r12 @ tc1a, tc1b
++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
++ \I1
++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
++ \I2
++ vmovl.u8 q2, d4 @ tc0a, tc0b
++ \I3
++ vmovl.u8 q3, d6 @ tc1a, tc1b
++ \I4
+ vmin.s16 q0, q2
-+ vneg.s16 q2, q2
++ \I5
++ vneg.s16 q2, q2 @ -tc0a, -tc0b
++ \I6
+ vmin.s16 q1, q3
-+ vneg.s16 q3, q3
-+ vmax.s16 q0, q2
-+ vaddw.u8 q2, q0, \P0u
-+ vmax.s16 q1, q3
-+ vaddw.u8 q3, q1, \P0v
-+
-+ vqmovun.s16 \P0u, q2
-+ vmovl.u8 q2, \Q0u
-+ vqmovun.s16 \P0v, q3
-+ vmovl.u8 q3, \Q0v
-+ vsub.i16 q2, q0
-+ vsub.i16 q3, q1
-+
-+ vqmovun.s16 \Q0u, q2
-+ vqmovun.s16 \Q0v, q3
++ \I7
++ vneg.s16 q3, q3 @ -tc1a, -tc1b
++ vmax.s16 q0, q2 @ delta0a
++ vmovl.u8 q2, \Q0a
++ vmax.s16 q1, q3 @ delta0b
++ vaddw.u8 q3, q0, \P0a @ p0a + delta0a
++ vsub.i16 q0, q2, q0 @ q0a - delta0a
++ vmovl.u8 q2, \Q0b
++ vsub.i16 q2, q1 @ q0b - delta0b
++ vaddw.u8 q1, \P0b @ p0b + delta0b
++ vqmovun.s16 \Q0a, q0
++ vqmovun.s16 \P0a, q3
++ vqmovun.s16 \Q0b, q2
++ vqmovun.s16 \P0b, q1
+.endm
+
+
@@ -1835,33 +1848,36 @@ index 0000000000..e665bd848a
+@ [0..7] tc U a
+@ [8..15] tc V a
+
-+.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth
++.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8
+ vsub.i16 q0, \Q0a, \P0a
-+ vsub.i16 q2, \P1a, \Q1a
-+ vshl.i16 q0, #2
-+ vadd.i16 q0, q2
-+ vrshr.s16 q0, #3
-+
++ vsub.i16 q1, \P1a, \Q1a
+ vdup.16 d4, r2
++ \I1
++ vshl.i16 q0, #2
++ \I2
++ vadd.i16 q0, q1
++ \I3
+ vshll.u8 q2, d4, #\bit_depth - 8
-+
-+ movw r2, #(1 << \bit_depth) - 1
++ \I4
++ vneg.s16 q1, q2
++ \I5
++ vrshr.s16 q0, #3
++ \I6
++ \I7
++ \I8
+ vmin.s16 q0, q2
-+ vneg.s16 q2, q2
-+ vmax.s16 q0, q2
-+ vmov.i64 q2, #0
-+ vdup.i16 q3, r2
++ vmov.i16 q2, #0
++ vmax.s16 q0, q1
+ vadd.i16 \P0a, q0
+ vsub.i16 \Q0a, q0
-+
++ vmov.i16 q1, #(1 << \bit_depth) - 1
+ vmax.s16 \P0a, q2
+ vmax.s16 \Q0a, q2
-+ vmin.s16 \P0a, q3
-+ vmin.s16 \Q0a, q3
++ vmin.s16 \P0a, q1
++ vmin.s16 \Q0a, q1
+.endm
+
-+@ Preserves r12
-+@ Clobbers r2
++@ Clobbers r2, r12
+@ P0a et al all contain UVUVUVUV
+@ r2 (tc4) contains
+@ [0..7] tc U a
@@ -1869,38 +1885,41 @@ index 0000000000..e665bd848a
+@ [16..23] tc U b
+@ [24..31] tc V b
+
-+.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth
-+ vsub.i16 q0, \Q0a, \P0a
-+ vsub.i16 q1, \Q0b, \P0b
-+ vsub.i16 q2, \P1a, \Q1a
-+ vsub.i16 q3, \P1b, \Q1b
-+ vshl.i16 q0, #2
-+ vshl.i16 q1, #2
-+ vadd.i16 q0, q2
-+ vrshr.s16 q0, #3
-+ vadd.i16 q1, q3
-+ vrshr.s16 q1, #3
-+
-+ vdup.16 d4, r2
-+ lsr r2, #16
-+ vdup.16 d6, r2
-+ vshll.u8 q2, d4, #\bit_depth - 8
-+ vshll.u8 q3, d6, #\bit_depth - 8
-+
-+ movw r2, #(1 << \bit_depth) - 1
++.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7
++ vsub.i16 q0, \Q0a, \P0a @ q0a - p0a
++ lsr r12, r2, #16
++ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b
++ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a
++ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b
++ vshl.i16 q0, #2 @ (q0a - p0a) * 4
++ vshl.i16 q1, #2 @ (q0b - p0b) * 4
++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a
++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b
++ vdup.16 d4, r2 @ tc0a, tc0b
++ vdup.16 d6, r12 @ tc1a, tc1b
++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
++ \I1
++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
++ \I2
++ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b
++ \I3
++ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b
++ \I4
+ vmin.s16 q0, q2
-+ vneg.s16 q2, q2
++ \I5
++ vneg.s16 q2, q2 @ -tc0a, -tc0b
++ \I6
+ vmin.s16 q1, q3
-+ vneg.s16 q3, q3
-+ vmax.s16 q0, q2
-+ vmov.i64 q2, #0
-+ vmax.s16 q1, q3
-+ vdup.i16 q3, r2
-+ vadd.i16 \P0a, q0
-+ vsub.i16 \Q0a, q0
-+ vadd.i16 \P0b, q1
-+ vsub.i16 \Q0b, q1
-+
++ \I7
++ vneg.s16 q3, q3 @ -tc1a, -tc1b
++ vmax.s16 q0, q2 @ delta0a
++ vadd.i16 \P0a, q0 @ p0a + delta0a
++ vsub.i16 \Q0a, q0 @ q0a - delta0a
++ vmax.s16 q1, q3 @ delta0b
++ vadd.i16 \P0b, q1 @ p0b + delta0b
++ vsub.i16 \Q0b, q1 @ q0b - delta0b
++ vmov.i16 q2, #0
++ vmov.i16 q3, #(1 << \bit_depth) - 1
+ vmax.s16 \P0a, q2
+ vmax.s16 \Q0a, q2
+ vmax.s16 \P0b, q2
@@ -1923,11 +1942,10 @@ index 0000000000..e665bd848a
+ it eq
+ bxeq lr
+ push {r4-r10,lr} @ 32 bytes
-+ ldr r5, [sp, #32] @ &_no_p
-+ ldrb r10, [r5]
-+ ldr r5, [sp, #36] @ &_no_q
++ ldrd r4, r5, [sp, #32] @ &_no_p
++ ldrb r4, [r4]
+ ldrb r5, [r5]
-+ cmp r10, #0
++ movs r10, r4
+ it ne
+ movne r10, #1
+ cmp r5, #0
@@ -1950,244 +1968,207 @@ index 0000000000..e665bd848a
+@ Junks:
+@ r5, r6, r7, r8, r9
+
-+.macro m_filter_luma bit_depth
++.macro m_filter_luma bit_depth, Q11, Q15
+.if \bit_depth == 8
-+ vmovl.u8 q15, d23
-+ vmovl.u8 q14, d22
-+ vmovl.u8 q13, d21
-+ vmovl.u8 q12, d20
-+ vmovl.u8 q11, d19
-+ vmovl.u8 q10, d18
-+ vmovl.u8 q9, d17
-+ vmovl.u8 q8, d16
++ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2
++ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1
++ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0
++ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0
++ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1
++ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2
+.endif
-+ vadd.i16 q7, q9, q11
++ vadd.i16 q0, q9, \Q11 @ P2 + P0
+.if \bit_depth > 8
-+ lsl r2, r2, #(\bit_depth - 8)
++ lsl r3, r3, #(\bit_depth - 8)
+.endif
-+ vadd.i16 q6, q14, q12
++ vadd.i16 q1, q14, q12 @ Q2 + Q0
+.if \bit_depth > 8
-+ lsl r3, r3, #(\bit_depth - 8)
++ lsl r2, r2, #(\bit_depth - 8)
+.endif
-+ vsub.i16 q7, q10
-+ vsub.i16 q6, q13
-+ vabd.s16 q7, q7, q10
-+ vabd.s16 q6, q6, q13
++ vsub.i16 q0, q10 @ P2 - P1 + P0
++ lsr r5, r3, #16
++ vsub.i16 q1, q13 @ Q2 - Q1 + Q0
++.if \bit_depth == 8
++ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3
++ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3
++.endif
++ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0)
++ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0)
++ vmov.i64 q2, #0xffffffff0000
++ vbic q0, q2 @ only dp0(') and dp3(')
++ vbic q1, q2 @ only dq0(') and dq3(')
++ vsra.u64 q0, #16
++ vsra.u64 q1, #16
++ vdup.16 q3, r2 @ beta
++ vdup.16 d14, r3 @ tC[0]
++ vdup.16 d15, r5 @ tC[1]
++ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0)
++ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0
++ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0
++ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0
++ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0)
++ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0)
++ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3
++ vshl.s16 q6, q7, #2 @ tC[] * 4
++ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1
++ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta)
++ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block)
++ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3
++ cmp r7, #0
++ beq .Lbypasswrite
+
-+ vdup.16 q0, r2
-+ vmov q4, q7
-+ vmov q5, q6
-+ vdup.16 d4, r3
-+ lsr r3, r3, #16
-+ vtrn.16 q7, q4
-+ vtrn.16 q6, q5
++ vcgt.s16 q5, q6, q5 @ if < tc25
++ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3)
++ vand q4, q5
++ vbic d8, d4
++ vbic d9, d4
++ vshr.s16 q3, #2 @ beta_2 = beta >> 2
++ vsra.u64 q4, #16
++ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1
++ vshl.i16 q7, #1 @ tc2 = tC[] << 1
++ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc
++ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half
++ vand d6, d8 @ && beta_2 tests, prime in ms half
++ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3
++ vneg.s16 q6, q7 @ -tc2
++ vmovn.i32 d8, q3
++ vshrn.i32 d6, q3, #16
++ vand d6, d8
++ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3
++ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block)
++ vadd.i16 q0, \Q11, q12 @ p0 + q0
++ ands r9, r7, r8
++ beq 1f
+
-+ vshl.u64 q7, #32
-+ vshr.u64 q4, #32
-+ vshl.u64 q6, #32
-+ vshr.u64 q5, #32
-+ vshr.u64 q7, #32
-+ vshr.u64 q6, #32
-+ vshl.u64 q5, #32
-+ vshl.u64 q4, #32
-+ vorr q6, q5
-+ vorr q7, q4
-+ vdup.16 d5, r3
-+ vadd.i16 q5, q7, q6
-+
-+ vmov q4, q5
-+ vmov q3, q5
-+ vtrn.32 q3, q4
-+
-+ vadd.i16 q4, q3
-+
-+ vshl.s16 q5, q5, #1
-+ vcgt.s16 q3, q0, q4
-+
-+ vmovn.i16 d6, q3
-+ vshr.s16 q1, q0, #2
-+ vmovn.i16 d6, q3
-+ vcgt.s16 q5, q1, q5
-+ vmov r7, s12
-+ cmp r7, #0
-+ beq .Lbypasswrite
-+
-+ vpadd.i32 d0, d14, d12
-+ vpadd.i32 d1, d15, d13
-+ vmov q4, q2
-+ vshl.s16 q2, #2
-+ vshr.s16 q1, q1, #1
-+ vrhadd.s16 q2, q4
-+
-+ vabd.s16 q7, q8, q11
-+ vaba.s16 q7, q15, q12
-+
-+ vmovn.i32 d0, q0
-+ vmov r5, r6, s0, s1
-+ vcgt.s16 q6, q1, q7
-+ vand q5, q5, q6
-+ vabd.s16 q7, q11, q12
-+ vcgt.s16 q6, q2, q7
-+ vand q5, q5, q6
-+
-+ vmov q2, q5
-+ vtrn.s16 q5, q2
-+ vshr.u64 q2, #32
-+ vshl.u64 q5, #32
-+ vshl.u64 q2, #32
-+ vshr.u64 q5, #32
-+ vorr q5, q2
-+
-+ vmov q2, q5
-+ vshl.i16 q7, q4, #1
-+ vtrn.32 q2, q5
-+ vand q5, q2
-+ vneg.s16 q6, q7
-+ vmovn.i16 d4, q5
-+ vmovn.i16 d4, q2
-+ vmov r8, s8
-+
-+ and r9, r8, r7
-+ cmp r9, #0
-+ beq 1f
-+
-+ vadd.i16 q2, q11, q12
-+ vadd.i16 q4, q9, q8
-+ vadd.i16 q1, q2, q10
-+ vdup.16 d10, r9
-+ vadd.i16 q0, q1, q9
-+ vshl.i16 q4, #1
-+ lsr r9, #16
-+ vadd.i16 q1, q0
-+ vrshr.s16 q3, q0, #2
-+ vadd.i16 q1, q13
-+ vadd.i16 q4, q0
-+ vsub.i16 q3, q10
-+ vrshr.s16 q1, #3
-+ vrshr.s16 q4, #3
-+ vmax.s16 q3, q6
-+ vsub.i16 q1, q11
-+ vsub.i16 q4, q9
-+ vmin.s16 q3, q7
-+ vmax.s16 q4, q6
-+ vmax.s16 q1, q6
-+ vadd.i16 q3, q10
-+ vmin.s16 q4, q7
-+ vmin.s16 q1, q7
-+ vdup.16 d11, r9
-+ vadd.i16 q4, q9
-+ vadd.i16 q1, q11
-+ vbit q9, q4, q5
-+ vadd.i16 q4, q2, q13
-+ vbit q11, q1, q5
-+ vadd.i16 q0, q4, q14
-+ vadd.i16 q2, q15, q14
-+ vadd.i16 q4, q0
-+
-+ vshl.i16 q2, #1
-+ vadd.i16 q4, q10
-+ vbit q10, q3, q5
-+ vrshr.s16 q4, #3
-+ vadd.i16 q2, q0
-+ vrshr.s16 q3, q0, #2
-+ vsub.i16 q4, q12
-+ vrshr.s16 q2, #3
-+ vsub.i16 q3, q13
-+ vmax.s16 q4, q6
-+ vsub.i16 q2, q14
-+ vmax.s16 q3, q6
-+ vmin.s16 q4, q7
-+ vmax.s16 q2, q6
-+ vmin.s16 q3, q7
-+ vadd.i16 q4, q12
-+ vmin.s16 q2, q7
-+ vadd.i16 q3, q13
-+ vbit q12, q4, q5
-+ vadd.i16 q2, q14
-+ vbit q13, q3, q5
-+ vbit q14, q2, q5
++ vadd.i16 q2, q0, q10 @ p1 + p0 + q0
++ vadd.i16 q3, q0, q13 @ p0 + q0 + q1
++ lsr r3, r9, #16
++ vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping)
++ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping)
++ vadd.i16 q0, q8, q9 @ p3 + p2
++ vadd.i16 q5, \Q15, q14 @ q2 + q3
++ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0
++ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2
++ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2
++ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3
++ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping)
++ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping)
++ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping)
++ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping)
++ vrshr.s16 q0, #3 @ scale, with rounding
++ vrshr.s16 q5, #3
++ vrshr.s16 q1, #2
++ vrshr.s16 q4, #2
++ vrshr.s16 q2, #3
++ vrshr.s16 q3, #3
++ vsub.i16 q0, q9 @ find difference
++ vsub.i16 q5, q14
++ vsub.i16 q1, q10
++ vsub.i16 q4, q13
++ vsub.i16 q2, \Q11
++ vsub.i16 q3, q12
++ vmax.s16 q0, q6 @ clip difference to -tc2 .. tc2
++ vmax.s16 q5, q6
++ vmax.s16 q1, q6
++ vmax.s16 q4, q6
++ vmax.s16 q2, q6
++ vmax.s16 q3, q6
++ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure
++ vdup.16 d13, r3
++ vmin.s16 q0, q7
++ vmin.s16 q5, q7
++ vmin.s16 q1, q7
++ vmin.s16 q4, q7
++ vmin.s16 q2, q7
++ vmin.s16 q3, q7
++ vadd.i16 q0, q9 @ apply difference
++ vadd.i16 q5, q14
++ vadd.i16 q1, q10
++ vadd.i16 q4, q13
++ vadd.i16 q2, \Q11
++ vadd.i16 q3, q12
++ vbit q9, q0, q6 @ apply filtered values according to mask
++ vbit q14, q5, q6
++ vbit q10, q1, q6
++ vbit q13, q4, q6
++ vbit \Q11, q2, q6
++ vbit q12, q3, q6
++ vneg.s16 q6, q7 @ restore -tc2
+
+1:
-+ mvn r8, r8
-+ and r9, r8, r7
-+ cmp r9, #0
-+ beq 2f
++ bics r9, r7, r8
++ beq 2f
+
-+ vdup.16 q4, r2
-+
-+ vdup.16 d10, r9
-+ lsr r9, #16
-+ vmov q1, q4
-+ vdup.16 d11, r9
-+ vshr.s16 q1, #1
-+ vsub.i16 q2, q12, q11
-+ vadd.i16 q4, q1
-+ vshl.s16 q0, q2, #3
-+ vshr.s16 q4, #3
-+ vadd.i16 q2, q0
-+ vsub.i16 q0, q13, q10
-+ vsub.i16 q2, q0
-+ vshl.i16 q0, q0, #1
-+ vsub.i16 q2, q0
-+ vshl.s16 q1, q7, 2
-+ vrshr.s16 q2, q2, #4
-+ vadd.i16 q1, q7
-+ vabs.s16 q3, q2
-+ vshr.s16 q6, q6, #1
-+ vcgt.s16 q1, q1, q3
-+ vand q5, q1
-+ vshr.s16 q7, q7, #1
-+ vmax.s16 q2, q2, q6
-+ vmin.s16 q2, q2, q7
-+
-+ vshr.s16 q7, q7, #1
-+ vrhadd.s16 q3, q9, q11
-+ vneg.s16 q6, q7
-+ vsub.s16 q3, q10
-+ vdup.16 d2, r5
-+ vhadd.s16 q3, q2
-+ vdup.16 d3, r6
-+ vmax.s16 q3, q3, q6
-+ vcgt.s16 q1, q4, q1
-+ vmin.s16 q3, q3, q7
-+ vand q1, q5
-+ vadd.i16 q3, q10
-+ lsr r5, #16
-+ lsr r6, #16
-+ vbit q10, q3, q1
-+
-+ vrhadd.s16 q3, q14, q12
-+ vdup.16 d2, r5
-+ vsub.s16 q3, q13
-+ vdup.16 d3, r6
-+ vhsub.s16 q3, q2
-+ vcgt.s16 q1, q4, q1
-+ vmax.s16 q3, q3, q6
-+ vand q1, q5
-+ vmin.s16 q3, q3, q7
-+ vadd.i16 q3, q13
-+ vbit q13, q3, q1
-+ vadd.i16 q0, q11, q2
-+ vsub.i16 q4, q12, q2
-+ vbit q11, q0, q5
-+ vbit q12, q4, q5
++ vsub.i16 q0, q12, \Q11 @ q0 - p0
++ vsub.i16 q1, q13, q10 @ q1 - p1
++ lsr r3, r9, #16
++ vshl.i16 q2, q0, #3
++ lsr r7, r5, #16
++ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0)
++ lsr r8, r6, #16
++ vshl.i16 q2, q1, #1
++ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1)
++ vshr.s16 q6, #1 @ -tc = -tc2 >> 1
++ vsub.i16 q5, q3, q4
++ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1
++ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1
++ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
++ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1
++ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1
++ vmax.s16 q6, q5 @
++ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1
++ vdup.16 q0, r2 @ beta
++ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc]
++ vshr.s16 q4, #1 @ tc_2 = tc >> 1
++ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
++ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
++ vshr.s16 q2, q0, #1 @ beta >> 1
++ vadd.i16 q2, q0 @ beta + (beta >> 1)
++ vneg.s16 q0, q4 @ -tc_2
++ vabs.s16 q5, q5 @ abs(original delta0)
++ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3
++ vmax.s16 q1, q0
++ vmax.s16 q3, q0
++ vshl.s16 q0, q7, #2 @ 8 * tc
++ vadd.i16 q7, q0 @ 10 * tc
++ vdup.16 d0, r9
++ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering
++ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2)
++ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2)
++ vdup.16 d8, r5 @ dp0 + dp3
++ vdup.16 d9, r7 @ dp0' + dp3'
++ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0))
++ vdup.16 d10, r6 @ dq0 + dq3
++ vdup.16 d11, r8 @ dq0' + dq3'
++ vand q7, q0 @ AND block and line masks
++ vcgt.s16 q4, q2, q4 @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1)
++ vadd.i16 q0, q1, q10 @ p1 + deltap1
++ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1)
++ vadd.i16 q3, q3, q13 @ q1 + deltaq1
++ vadd.i16 q1, \Q11, q6 @ p0 + delta0
++ vsub.i16 q2, q12, q6 @ q0 - delta0
++ vand q4, q7 @ AND nd_p test with block/line masks
++ vand q5, q7 @ AND nd_q test with block/line masks
++ vbit q10, q0, q4
++ vbit \Q11, q1, q7
++ vbit q12, q2, q7
++ vbit q13, q3, q5
+
+2:
+.if \bit_depth == 8
++ vmovn.i16 d16, q8
++ vmovn.i16 d23, \Q15
+ neg r1, r1
-+ vqmovun.s16 d16, q8
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
-+ vqmovun.s16 d19, q11
++ vqmovun.s16 d19, \Q11
+ lsls r10, #31
+ vqmovun.s16 d20, q12
+ vqmovun.s16 d21, q13
+ vqmovun.s16 d22, q14
-+ vqmovun.s16 d23, q15
+.else
-+ movw r5, #(1 << \bit_depth - 1)
-+ vmov.i64 q0, #0
-+ vdup.i16 q1, r5
++ vmov.i16 q0, #0
++ vmov.i16 q1, #(1 << \bit_depth - 1)
+ @ q8 & q15 should be unaltered and so don't require clipping
+ neg r1, r1
+ vmax.s16 q9, q0
@@ -2204,14 +2185,14 @@ index 0000000000..e665bd848a
+ vmin.s16 q13, q1
+ vmin.s16 q14, q1
+.endif
-+ mov pc, lr
++ bx lr
+.endm
+
+function hevc_loop_filter_luma_body
-+ m_filter_luma 8
++ m_filter_luma 8, q15, q11
+endfunc
+
-+@ void ff_hevc_rpi_v_loop_filter_luma_neon(
++@ void ff_hevc_rpi_v_loop_filter_luma_neon_8(
+@ uint8_t *_pix, [r0]
+@ ptrdiff_t _stride, [r1]
+@ int _beta, [r2]
@@ -2219,7 +2200,7 @@ index 0000000000..e665bd848a
+@ uint8_t *_no_p, [sp+0]
+@ uint8_t *_no_q) [sp+4]
+
-+function ff_hevc_rpi_v_loop_filter_luma_neon, export=1
++function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1
+ hevc_loop_filter_luma_start
+
+ sub r4, r0, #4
@@ -2245,66 +2226,72 @@ index 0000000000..e665bd848a
+.Lv_loop_luma_common:
+ vpush {d8-d15}
+
-+ @ Uses slightly fewer instructions to do laned loads than unlaned
-+ @ and transpose. This also means that we can use the same code for
-+ @ both split & unsplit deblock
-+ vld4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32], r1
-+ vld4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32], r1
-+
-+ vld4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
-+ vld4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
-+
-+ vld4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1
-+ vld4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1
-+
-+ vld4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
-+ vld4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
-+
-+ vld4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1
-+ vld4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1
-+
-+ vld4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
-+ vld4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
-+
-+ vld4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
-+ vld4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
-+
-+ vld4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32]
-+ vld4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32]
++ @ It's slightly faster to do unlaned loads and transpose in the
++ @ 8-bit case, even though it needs more instructions, because
++ @ VLD4.8 is a really slow way to read from memory.
++ vld1.32 {d16[0]}, [r4:32], r1
++ vld1.32 {d20[0]}, [r0:32], r1
++ vld1.32 {d16[1]}, [r4:32], r1
++ vld1.32 {d20[1]}, [r0:32], r1
++ vld1.32 {d17[0]}, [r4:32], r1
++ vld1.32 {d21[0]}, [r0:32], r1
++ vld1.32 {d17[1]}, [r4:32], r1
++ vld1.32 {d21[1]}, [r0:32], r1
++ vld1.32 {d18[0]}, [r4:32], r1
++ vld1.32 {d22[0]}, [r0:32], r1
++ vld1.32 {d18[1]}, [r4:32], r1
++ vld1.32 {d22[1]}, [r0:32], r1
++ vld1.32 {d19[0]}, [r4:32], r1
++ vld1.32 {d23[0]}, [r0:32], r1
++ vld1.32 {d19[1]}, [r4:32]
++ vld1.32 {d23[1]}, [r0:32]
++ vuzp.16 q8, q9
++ vuzp.16 q10, q11
++ vuzp.8 q8, q9
++ vuzp.8 q10, q11
++ vswp d17, d18
++ vswp d21, d22
+
+ bl hevc_loop_filter_luma_body
+
++ add r6, r4, r1
++ add r2, r0, r1
++ lsl r1, #1
++
++ vpop {d8-d15}
++
+ @ no_p[1]
+ bmi 1f
+ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
-+ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
++ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1
+ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
-+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1
++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1
+
+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
-+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1
++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1
+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
-+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32]
++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32]
+1:
+ @ no_q[1]
-+@ tst r10, #2
+ bcs 1f
+ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
-+ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
++ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1
+ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
-+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1
++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1
+
+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
-+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1
++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1
+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
-+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32]
++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32]
+1:
++ pop {r4-r10,pc}
++
+.Lbypasswrite:
+ vpop {d8-d15}
+ pop {r4-r10,pc}
+endfunc
+
-+.macro m_filter_v_luma_common_16 bit_depth
++.macro m_filter_v_luma_16 bit_depth
+ vpush {d8-d15}
+
+ @ Uses slightly fewer instructions to do laned loads than unlaned
@@ -2336,29 +2323,34 @@ index 0000000000..e665bd848a
+
+ bl hevc_loop_filter_luma_body_\bit_depth
+
++ add r6, r4, r1
++ add r2, r0, r1
++ lsl r1, #1
++
++ vpop {d8-d15}
++
+ @ p[1]
+ bmi 1f
+ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1
-+ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1
+ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
-+ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1
++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1
+ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
-+ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1
++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1
+ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
-+ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4]
++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6]
+1:
+ @ q[1]
+ bcs 1f
+ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1
-+ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1
+ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
-+ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1
+ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
-+ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1
+ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
-+ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0]
++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2]
+1:
-+ vpop {d8-d15}
+ pop {r4-r10,pc}
+.endm
+
@@ -2374,7 +2366,7 @@ index 0000000000..e665bd848a
+@
+@ Src should always be on 8 byte boundry & all in the same slice
+
-+function ff_hevc_rpi_h_loop_filter_luma_neon, export=1
++function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1
+ hevc_loop_filter_luma_start
+ b .Lh_loop_filter_luma_common_8
+endfunc
@@ -2387,71 +2379,75 @@ index 0000000000..e665bd848a
+ ldr r10, [sp, #32]
+
+.Lh_loop_filter_luma_common_8:
++ sub r4, r0, r1, lsl #2
++ add r0, r4, r1
++ lsl r1, #1
+ vpush {d8-d15}
-+ sub r0, r0, r1, lsl #2
+
-+ vld1.8 {d16}, [r0], r1
++ vld1.8 {d16}, [r4], r1
+ vld1.8 {d17}, [r0], r1
-+ vld1.8 {d18}, [r0], r1
++ vld1.8 {d18}, [r4], r1
+ vld1.8 {d19}, [r0], r1
-+ vld1.8 {d20}, [r0], r1
++ vld1.8 {d20}, [r4], r1
+ vld1.8 {d21}, [r0], r1
-+ vld1.8 {d22}, [r0], r1
++ vld1.8 {d22}, [r4]
+ vld1.8 {d23}, [r0]
+
+ bl hevc_loop_filter_luma_body
+
-+ add r2, r0, r1, lsl #2
-+ add r0, r0, r1
-+
++ add r0, r0, r1, lsl #1
++ add r2, r4, r1, lsl #1
++ add r6, r4, r1, asr #1
+ vpop {d8-d15}
+
+ @ P2-P0
+ bcs 1f
-+ vst1.8 {d22}, [r0], r1
-+ vst1.8 {d21}, [r0], r1
-+ vst1.8 {d20}, [r0]
++ vst1.8 {d22}, [r4], r1
++ vst1.8 {d21}, [r6]
++ vst1.8 {d20}, [r4]
+1:
+ @ Q0-Q2
+ bmi 1f
-+ vst1.8 {d19}, [r2], r1
-+ vst1.8 {d18}, [r2], r1
-+ vst1.8 {d17}, [r2]
++ vst1.8 {d19}, [r0], r1
++ vst1.8 {d18}, [r2]
++ vst1.8 {d17}, [r0]
+1:
+ pop {r4-r10,pc}
+endfunc
+
+
+.macro m_filter_h_luma_16 bit_depth
++ sub r4, r0, r1, lsl #2
++ add r0, r4, r1
++ lsl r1, #1
+ vpush {d8-d15}
-+ sub r0, r0, r1, lsl #2
+
-+ vld1.16 { q8}, [r0], r1
++ vld1.16 { q8}, [r4], r1
+ vld1.16 { q9}, [r0], r1
-+ vld1.16 {q10}, [r0], r1
++ vld1.16 {q10}, [r4], r1
+ vld1.16 {q11}, [r0], r1
-+ vld1.16 {q12}, [r0], r1
++ vld1.16 {q12}, [r4], r1
+ vld1.16 {q13}, [r0], r1
-+ vld1.16 {q14}, [r0], r1
++ vld1.16 {q14}, [r4]
+ vld1.16 {q15}, [r0]
+
+ bl hevc_loop_filter_luma_body_\bit_depth
+
-+ add r2, r0, r1, lsl #2
-+ add r0, r1
-+
++ add r0, r0, r1, lsl #1
++ add r2, r4, r1, lsl #1
++ add r6, r4, r1, asr #1
+ vpop {d8-d15}
+
+ @ P2-P0
+ bcs 1f
-+ vst1.16 {q14}, [r0], r1
-+ vst1.16 {q13}, [r0], r1
-+ vst1.16 {q12}, [r0]
++ vst1.16 {q14}, [r4], r1
++ vst1.16 {q13}, [r6]
++ vst1.16 {q12}, [r4]
+1:
+ bmi 1f
-+ vst1.16 {q11}, [r2], r1
-+ vst1.16 {q10}, [r2], r1
-+ vst1.16 { q9}, [r2]
++ vst1.16 {q11}, [r0], r1
++ vst1.16 {q10}, [r2]
++ vst1.16 { q9}, [r0]
+1:
+ pop {r4-r10,pc}
+.endm
@@ -2474,23 +2470,25 @@ index 0000000000..e665bd848a
+@ common in the H direction than V due to how we arrange deblock.
+
+function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1
++ sub r12, r0, r1
+ cmp r2, #0
+ bxeq lr
-+ sub r0, r0, r1, lsl #1
++ vld1.8 {d26,d27}, [r0]
++ lsl r1, #1
++ sub r0, r1
++ vld1.8 {d18,d19}, [r12], r1
+ vld1.8 {d16,d17}, [r0], r1
-+ vld1.8 {d18,d19}, [r0], r1
-+ vld1.8 {d26,d27}, [r0], r1
-+ vld1.8 {d28,d29}, [r0]
-+ sub r0, r0, r1, lsl #1
-+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29
++ vld1.8 {d28,d29}, [r12]
++
++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \
++ "sub r12, r0, r1, asr #1"
+
-+ lsls r2, r3, #31 @ b0 -> N, b1 -> C
-+ vstrpl d18, [r0, #0]
-+ vstrcc d19, [r0, #8]
-+ add r0, r1
+ lsls r3, #29 @ b2 -> N, b3 -> C
+ vstrpl d26, [r0, #0]
+ vstrcc d27, [r0, #8]
++ lsls r3, #2 @ b0 -> N, b1 -> C
++ vstrpl d18, [r12, #0]
++ vstrcc d19, [r12, #8]
+ bx lr
+
+endfunc
@@ -2506,37 +2504,38 @@ index 0000000000..e665bd848a
+@ Macro here actual function near bottom
+
+.macro m_filter_h_uv_16 bit_depth
++ sub r12, r0, r1
+ cmp r2, #0
+ bxeq lr
-+ sub r0, r0, r1, lsl #1
++ vld1.16 {q12, q13}, [r0]
++ lsl r1, #1
++ sub r0, r1
++ vld1.16 {q10, q11}, [r12], r1
+ vld1.16 {q8, q9 }, [r0], r1
-+ vld1.16 {q10, q11}, [r0], r1
-+ vld1.16 {q12, q13}, [r0], r1
-+ vld1.16 {q14, q15}, [r0]
-+ sub r0, r0, r1, lsl #1
++ vld1.16 {q14, q15}, [r12]
+
-+ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth
++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \
++ "sub r12, r0, r1, asr #1", \
++ "cmp r3, #0"
+
-+ cmp r3, #0
+ bne 1f
-+ vst1.16 {q10, q11}, [r0], r1
++ vst1.16 {q10, q11}, [r12]
+ vst1.16 {q12, q13}, [r0]
+ bx lr
+
+ @ At least one no_f bit is set
+ @ Which means we need to break this apart in an ugly fashion
+1:
-+ lsls r2, r3, #31 @ b0 -> N, b1 -> C
-+ vstrpl d20, [r0, #0]
-+ vstrpl d21, [r0, #8]
-+ vstrcc d22, [r0, #16]
-+ vstrcc d23, [r0, #24]
-+ add r0, r1
+ lsls r3, #29 @ b2 -> N, b3 -> C
+ vstrpl d24, [r0, #0]
+ vstrpl d25, [r0, #8]
+ vstrcc d26, [r0, #16]
+ vstrcc d27, [r0, #24]
++ lsls r3, #2 @ b0 -> N, b1 -> C
++ vstrpl d20, [r12, #0]
++ vstrpl d21, [r12, #8]
++ vstrcc d22, [r12, #16]
++ vstrcc d23, [r12, #24]
+ bx lr
+.endm
+
@@ -2556,6 +2555,7 @@ index 0000000000..e665bd848a
+function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1
+ cmp r2, #0
+ bxeq lr
++ push {lr}
+ vld2.16 {d16[0], d18[0]}, [r3], r1
+ vld2.16 {d20[0], d22[0]}, [r0], r1
+
@@ -2570,106 +2570,112 @@ index 0000000000..e665bd848a
+ vld2.16 {d20[3], d22[3]}, [r0], r1
+ blo 10f
+
-+ sub r12, r0, r3
+ vld2.16 {d17[0], d19[0]}, [r3], r1
+ vld2.16 {d21[0], d23[0]}, [r0], r1
+
-+ cmp r12, #4
++ sub ip, r0, r3
+ vld2.16 {d17[1], d19[1]}, [r3], r1
+ vld2.16 {d21[1], d23[1]}, [r0], r1
+
++ cmp ip, #4
+ vld2.16 {d17[2], d19[2]}, [r3], r1
+ vld2.16 {d21[2], d23[2]}, [r0], r1
+
+ vld2.16 {d17[3], d19[3]}, [r3]
+ vld2.16 {d21[3], d23[3]}, [r0]
-+ it eq
-+ ldreq r12, [sp, #0]
+
-+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23
-+ cmp r12, #0
-+ add r3, #2
-+ neg r1, r1
++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \
++ "ldr lr, [sp, #4]", \
++ "neg r1, r1", \
++ "it eq; cmpeq lr, #0", \
++ "add r3, #2", \
++ "add ip, r3, r1", \
++ "add r2, r0, r1", \
++ "lsl r1, #1"
++
+ bne 1f
+
+@ Much/most of the time r0 == r3 + 4 and no_f == 0
+@ so it is worth having this special case
+ vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b
-+ vst2.16 {d19[2], d21[2]}, [r3], r1
++ vst2.16 {d19[2], d21[2]}, [ip], r1
+ vst2.16 {d19[1], d21[1]}, [r3], r1
-+ vst2.16 {d19[0], d21[0]}, [r3], r1
++ vst2.16 {d19[0], d21[0]}, [ip], r1
+ vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a
-+ vst2.16 {d18[2], d20[2]}, [r3], r1
-+ vst2.16 {d18[1], d20[1]}, [r3], r1
-+ vst2.16 {d18[0], d20[0]}, [r3]
-+ bx lr
++ vst2.16 {d18[2], d20[2]}, [ip], r1
++ vst2.16 {d18[1], d20[1]}, [r3]
++ vst2.16 {d18[0], d20[0]}, [ip]
++ pop {pc}
+
+@ Either split or partial
+1:
-+ ldr r12, [sp, #0]
-+ @ I have no idea if this is faster than any of the other ways of
-+ @ testing these bits but it does free up r12
-+ lsl r12, #28
-+ add r2, r0, r1, lsl #2
-+ msr APSR_nzcvq, r12 @ b0 (P0a) -> V, b1 (Q0a) -> C, b2 (P0b) -> Z, b3 (Q0b) -> N
-+ add r12, r3, r1, lsl #2
-+ bmi 1f
++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
++ addcs r0, r0, r1, lsl #1
++ addcs r2, r2, r1, lsl #1
++ bcs 1f
+ @ Q0b
+ vst1.16 {d21[3]}, [r0], r1
-+ vst1.16 {d21[2]}, [r0], r1
++ vst1.16 {d21[2]}, [r2], r1
+ vst1.16 {d21[1]}, [r0], r1
-+ vst1.16 {d21[0]}, [r0]
++ vst1.16 {d21[0]}, [r2], r1
+1:
-+ beq 2f
++ addmi r3, r3, r1, lsl #1
++ addmi ip, ip, r1, lsl #1
++ bmi 1f
+ @ P0b
+ vst1.16 {d19[3]}, [r3], r1
-+ vst1.16 {d19[2]}, [r3], r1
++ vst1.16 {d19[2]}, [ip], r1
+ vst1.16 {d19[1]}, [r3], r1
-+ vst1.16 {d19[0]}, [r3]
-+
-+2:
-+ bcs 3f
++ vst1.16 {d19[0]}, [ip], r1
++1:
++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
++ bcs 1f
+ @ Q0a
-+ vst1.16 {d20[3]}, [r2], r1
++ vst1.16 {d20[3]}, [r0], r1
+ vst1.16 {d20[2]}, [r2], r1
-+ vst1.16 {d20[1]}, [r2], r1
++ vst1.16 {d20[1]}, [r0]
+ vst1.16 {d20[0]}, [r2]
-+
-+3:
-+ it vs
-+ bxvs lr
-+ vst1.16 {d18[3]}, [r12], r1
-+ vst1.16 {d18[2]}, [r12], r1
-+ vst1.16 {d18[1]}, [r12], r1
-+ vst1.16 {d18[0]}, [r12]
-+ bx lr
++1:
++ it mi
++ popmi {pc}
++ @ P0a
++ vst1.16 {d18[3]}, [r3], r1
++ vst1.16 {d18[2]}, [ip], r1
++ vst1.16 {d18[1]}, [r3]
++ vst1.16 {d18[0]}, [ip]
++ pop {pc}
+
+@ Single lump (rather than double)
+10:
-+ hevc_loop_filter_uv_body1 d16, d18, d20, d22
-+
+ @ As we have post inced r0/r3 in the load the easiest thing to do is
+ @ to subtract and write forwards, rather than backwards (as above)
-+ ldr r12, [sp, #0]
-+ add r3, #2
-+ sub r0, r0, r1, lsl #2
-+ sub r3, r3, r1, lsl #2
-+ lsls r12, #31 @ b0 (P0a) -> N, b1 (Q0a) -> C
++ @ b0 (P0a) -> N, b1 (Q0a) -> C
++
++ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \
++ "ldr lr, [sp, #4]", \
++ "add r3, #2", \
++ "sub r0, r0, r1, lsl #2", \
++ "sub r3, r3, r1, lsl #2", \
++ "lsls lr, #31", \
++ "add r2, r0, r1", \
++ "add ip, r3, r1", \
++ "lsl r1, #1"
+
+ bcs 3f
++ @ Q0a
+ vst1.16 {d20[0]}, [r0], r1
-+ vst1.16 {d20[1]}, [r0], r1
-+ vst1.16 {d20[2]}, [r0], r1
-+ vst1.16 {d20[3]}, [r0]
-+
++ vst1.16 {d20[1]}, [r2], r1
++ vst1.16 {d20[2]}, [r0]
++ vst1.16 {d20[3]}, [r2]
+3:
-+ it mi
-+ bxmi lr
++ it mi
++ popmi {pc}
++ @ P0a
+ vst1.16 {d18[0]}, [r3], r1
-+ vst1.16 {d18[1]}, [r3], r1
-+ vst1.16 {d18[2]}, [r3], r1
-+ vst1.16 {d18[3]}, [r3]
-+ bx lr
++ vst1.16 {d18[1]}, [ip], r1
++ vst1.16 {d18[2]}, [r3]
++ vst1.16 {d18[3]}, [ip]
++ pop {pc}
+
+endfunc
+
@@ -2695,14 +2701,14 @@ index 0000000000..e665bd848a
+.macro m_filter_v_uv2_16 bit_depth
+ cmp r2, #0
+ bxeq lr
-+
++ push {lr}
+ vld2.32 {d16[0], d18[0]}, [r3], r1
+ vld2.32 {d20[0], d22[0]}, [r0], r1
+
++ cmp r2, #0x10000
+ vld2.32 {d16[1], d18[1]}, [r3], r1
+ vld2.32 {d20[1], d22[1]}, [r0], r1
+
-+ cmp r2, #0x10000
+ vld2.32 {d17[0], d19[0]}, [r3], r1
+ vld2.32 {d21[0], d23[0]}, [r0], r1
+
@@ -2713,170 +2719,509 @@ index 0000000000..e665bd848a
+ vld2.32 {d24[0], d26[0]}, [r3], r1
+ vld2.32 {d28[0], d30[0]}, [r0], r1
+
++ sub ip, r0, r3
+ vld2.32 {d24[1], d26[1]}, [r3], r1
+ vld2.32 {d28[1], d30[1]}, [r0], r1
-+ sub r12, r0, r3
+
++ cmp ip, #8
+ vld2.32 {d25[0], d27[0]}, [r3], r1
+ vld2.32 {d29[0], d31[0]}, [r0], r1
-+ cmp r12, #8
+
+ vld2.32 {d25[1], d27[1]}, [r3]
+ vld2.32 {d29[1], d31[1]}, [r0]
-+ it eq
-+ ldreq r12, [sp, #0]
+
-+ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth
-+ cmp r12, #0
-+ add r3, #4
-+ neg r1, r1
++ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \
++ "ldr lr, [sp, #4]", \
++ "neg r1, r1", \
++ "it eq; cmpeq lr, #0", \
++ "add r3, #4", \
++ "add ip, r3, r1", \
++ "add r2, r0, r1", \
++ "lsl r1, #1"
++
+ bne 1f
+
-+@ Much/most of the time r0 == r3 + 4 and no_f == 0
++@ Much/most of the time r0 == r3 + 8 and no_f == 0
+@ so it is worth having this special case
-+ vst2.32 {d27[1], d29[1]}, [r3], r1
-+ vst2.32 {d27[0], d29[0]}, [r3], r1
-+ vst2.32 {d26[1], d28[1]}, [r3], r1
-+ vst2.32 {d26[0], d28[0]}, [r3], r1
-+ vst2.32 {d19[1], d21[1]}, [r3], r1
-+ vst2.32 {d19[0], d21[0]}, [r3], r1
-+ vst2.32 {d18[1], d20[1]}, [r3], r1
-+ vst2.32 {d18[0], d20[0]}, [r3]
-+ bx lr
++ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b
++ vst2.32 {d27[0], d29[0]}, [ip], r1
++ vst2.32 {d26[1], d28[1]}, [r3], r1
++ vst2.32 {d26[0], d28[0]}, [ip], r1
++ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a
++ vst2.32 {d19[0], d21[0]}, [ip], r1
++ vst2.32 {d18[1], d20[1]}, [r3]
++ vst2.32 {d18[0], d20[0]}, [ip]
++ pop {pc}
+
+@ Either split or partial
+1:
-+ ldr r12, [sp, #0]
-+ lsls r12, #29 @ b2 (P0b) -> N, b3 (Q0b) -> C
++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
++ addcs r0, r0, r1, lsl #1
++ addcs r2, r2, r1, lsl #1
+ bcs 1f
+ @ Q0b
-+ mov r2, r0
-+ vst1.32 {d29[1]}, [r2], r1
++ vst1.32 {d29[1]}, [r0], r1
+ vst1.32 {d29[0]}, [r2], r1
-+ vst1.32 {d28[1]}, [r2], r1
-+ vst1.32 {d28[0]}, [r2]
++ vst1.32 {d28[1]}, [r0], r1
++ vst1.32 {d28[0]}, [r2], r1
+1:
-+ bmi 2f
++ addmi r3, r3, r1, lsl #1
++ addmi ip, ip, r1, lsl #1
++ bmi 1f
+ @ P0b
-+ mov r2, r3
-+ vst1.32 {d27[1]}, [r2], r1
-+ vst1.32 {d27[0]}, [r2], r1
-+ vst1.32 {d26[1]}, [r2], r1
-+ vst1.32 {d26[0]}, [r2]
-+
-+2:
-+ lsls r12, #2 @ b0 (P0a) -> N, b1 (Q0a) -> C
-+ bcs 3f
++ vst1.32 {d27[1]}, [r3], r1
++ vst1.32 {d27[0]}, [ip], r1
++ vst1.32 {d26[1]}, [r3], r1
++ vst1.32 {d26[0]}, [ip], r1
++1:
++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
++ bcs 1f
+ @ Q0a
-+ add r0, r0, r1, lsl #2
+ vst1.32 {d21[1]}, [r0], r1
-+ vst1.32 {d21[0]}, [r0], r1
-+ vst1.32 {d20[1]}, [r0], r1
-+ vst1.32 {d20[0]}, [r0]
-+
-+3:
-+ it mi
-+ bxmi lr
++ vst1.32 {d21[0]}, [r2], r1
++ vst1.32 {d20[1]}, [r0]
++ vst1.32 {d20[0]}, [r2]
++1:
++ it mi
++ popmi {pc}
+ @ P0a
-+ add r3, r3, r1, lsl #2
+ vst1.32 {d19[1]}, [r3], r1
-+ vst1.32 {d19[0]}, [r3], r1
-+ vst1.32 {d18[1]}, [r3], r1
-+ vst1.32 {d18[0]}, [r3]
-+ bx lr
-+
++ vst1.32 {d19[0]}, [ip], r1
++ vst1.32 {d18[1]}, [r3]
++ vst1.32 {d18[0]}, [ip]
++ pop {pc}
+
++@ Single lump (rather than double)
+10:
-+ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth
-+
+ @ As we have post inced r0/r3 in the load the easiest thing to do is
+ @ to subtract and write forwards, rather than backwards (as above)
-+ ldr r12, [sp, #0]
-+ add r3, #4
-+ sub r0, r0, r1, lsl #2
-+ sub r3, r3, r1, lsl #2
-+ lsls r12, #31 @ b0 (P0a) -> N, b1 (Q0a) -> C
++ @ b0 (P0a) -> N, b1 (Q0a) -> C
++
++ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \
++ "ldr lr, [sp, #4]", \
++ "add r3, #4", \
++ "sub r0, r0, r1, lsl #2", \
++ "sub r3, r3, r1, lsl #2", \
++ "lsls lr, #31", \
++ "add r2, r0, r1", \
++ "add ip, r3, r1", \
++ "lsl r1, #1"
+
+ bcs 3f
+ @ Q0a
+ vst1.32 {d20[0]}, [r0], r1
-+ vst1.32 {d20[1]}, [r0], r1
-+ vst1.32 {d21[0]}, [r0], r1
-+ vst1.32 {d21[1]}, [r0]
-+
++ vst1.32 {d20[1]}, [r2], r1
++ vst1.32 {d21[0]}, [r0]
++ vst1.32 {d21[1]}, [r2]
+3:
-+ it mi
-+ bxmi lr
++ it mi
++ popmi {pc}
+ @ P0a
+ vst1.32 {d18[0]}, [r3], r1
-+ vst1.32 {d18[1]}, [r3], r1
-+ vst1.32 {d19[0]}, [r3], r1
-+ vst1.32 {d19[1]}, [r3]
-+ bx lr
++ vst1.32 {d18[1]}, [ip], r1
++ vst1.32 {d19[0]}, [r3]
++ vst1.32 {d19[1]}, [ip]
++ pop {pc}
+.endm
+
+
++#if 1 // NEON version
+
+
-+/* ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, int in_i
-+ * int *curr_rpl0, int *curr_
-+ * MvField *curr, MvField *ne
++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh,
++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ * int in_inc)
++ */
++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
++ mov ip, sp
++ push {a2,v1-v8,lr}
++ ldm ip, {v1-v5}
++ cmp a1, #2
++ bls 2f
++ vpush {d8-d13}
++ sub v5, v5, #10
++ mov v6, #32
++1:
++ vld2.32 {d0[0], d2[0]}, [a3]!
++ vld2.32 {d4[0], d6[0]}, [a4]!
++ vmov.u8 q12, #0
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb v8, [a3], #1
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[0]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[0]}, [a4], v5
++ add v8, v2, v8, lsl #2
++ vld1.32 {d16[0]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d20[0]}, [ip]
++ vld1.32 {d18[0]}, [v8]
++ vld1.32 {d22[0]}, [lr]
++
++ vld2.32 {d0[1], d2[1]}, [a3]!
++ vld2.32 {d4[1], d6[1]}, [a4]!
++ ldrb a2, [a3], #1
++ vmov.u16 d12, #1
++ ldrb ip, [a4], #1
++ vmov.u16 d13, #2
++ ldrb v8, [a3], #1
++ vmov.u16 d27, #4
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[2]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[2]}, [a4], v5
++ add v8, v2, v8, lsl #2
++ vld1.32 {d16[1]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d20[1]}, [ip]
++ vld1.32 {d18[1]}, [v8]
++ vld1.32 {d22[1]}, [lr]
++
++ vld2.32 {d1[0], d3[0]}, [a3]!
++ vld2.32 {d5[0], d7[0]}, [a4]!
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb lr, [a4], #1
++ ldrb v8, [a3], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[4]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[4]}, [a4], v5
++ add v8, v2, v8, lsl #2
++ vld1.32 {d17[0]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d21[0]}, [ip]
++ vld1.32 {d19[0]}, [v8]
++ vld1.32 {d23[0]}, [lr]
++
++ vld2.32 {d1[1], d3[1]}, [a3]!
++ vld2.32 {d5[1], d7[1]}, [a4]!
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb v8, [a3], #1
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[6]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[6]}, [a4], v5
++ add v8, v2, v8, lsl #2
++ vld1.32 {d17[1]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d21[1]}, [ip]
++ vld1.32 {d19[1]}, [v8]
++ vld1.32 {d23[1]}, [lr]
++
++ @ So now we have:
++ @ q0.32[i] = curr[i].mv[0]
++ @ q1.32[i] = curr[i].mv[1]
++ @ q2.32[i] = neigh[i].mv[0]
++ @ q3.32[i] = neigh[i].mv[1]
++ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]]
++ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]]
++ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
++ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
++ @ d24.16[i] = curr[i].pred_flag
++ @ d25.16[i] = neigh[i].pred_flag
++
++ vtst.16 d28, d24, d12
++ vtst.16 d29, d24, d13
++ vadd.i16 d8, d24, d12
++ vadd.i16 d9, d25, d12
++ vtst.16 d30, d25, d12
++ vtst.16 d31, d25, d13
++ veor d26, d8, d9
++ ldr lr, [sp, 6*8]
++ vmovl.s16 q4, d28
++ vmovl.s16 q5, d29
++ teq lr, #1
++ vmovl.s16 q14, d30
++ lslne v1, lr, #1
++ vmovl.s16 q15, d31
++ rsbne v2, v1, #32
++ vbif q0, q1, q4
++ vbif q2, q3, q14
++ vbif q1, q0, q5
++ vbif q3, q2, q15
++ vabd.s16 q12, q0, q2
++ vabd.s16 q2, q1
++ vabd.s16 q0, q3
++ vabd.s16 q1, q3
++ vbif q8, q9, q4
++ vbif q10, q11, q14
++ vbif q9, q8, q5
++ vbif q11, q10, q15
++ vclt.u16 d6, d24, d27
++ vclt.u16 d8, d2, d27
++ vclt.u16 d7, d25, d27
++ vclt.u16 d9, d3, d27
++ vclt.u16 d2, d0, d27
++ vclt.u16 d0, d4, d27
++ vclt.u16 d3, d1, d27
++ vclt.u16 d1, d5, d27
++ vceq.i32 q12, q10, q8
++ vceq.i32 q10, q9
++ vceq.i32 q8, q11
++ vceq.i32 q9, q11
++ vshrn.i32 d6, q3, #8
++ vshrn.i32 d7, q4, #8
++ vshrn.i32 d8, q1, #8
++ vshrn.i32 d9, q0, #8
++ vmovn.i32 d4, q12
++ vmovn.i32 d2, q10
++ vmovn.i32 d3, q8
++ vmovn.i32 d5, q9
++ vand q2, q3
++ vrev16.8 q3, q3
++ vand q2, q3
++ vand q1, q4
++ vrev16.8 q4, q4
++ vand q1, q4
++ vand d4, d5
++ vand d2, d3
++ vbic d0, d12, d4
++ vshr.u16 d26, #2
++ vbic d0, d2
++ vmov.i16 d1, #0x5555
++ vorr d0, d26
++ bne 10f
++
++ @ Merge results into result word, no duplicates
++ vmov a2, s0
++ vmov v8, s1
++ vmov.u16 ip, d0[1]
++ vmov.u16 lr, d0[3]
++ sub v6, #8
++ lsl a2, #30
++ lsl v8, #30
++ lsl ip, #30
++ lsl lr, #30
++ orr a2, ip, a2, lsr #2
++ orr v8, lr, v8, lsr #2
++ orr a2, v8, a2, lsr #4
++ subs a1, #4
++ orr v7, a2, v7, lsr #8
++ bhi 1b
++
++ vpop {d8-d13}
++ mov a1, v7, lsr v6
++ pop {a2,v1-v8,pc}
++10:
++ @ Merge results into result word, with duplicates
++ vmul.i16 d0, d1
++ vmov a2, s0
++ vmov v8, s1
++ vmov.u16 ip, d0[1]
++ vmov.u16 lr, d0[3]
++ sub v6, v6, v1, lsl #2
++ lsl a2, v2
++ subs a1, #4
++ lsl v8, v2
++ lsl ip, v2
++ lsl lr, v2
++ ldr v2, [sp, #6*8 + 10*4 + 1*4]
++ orr a2, ip, a2, lsr v1
++ lsl ip, v1, #1
++ orr v8, lr, v8, lsr v1
++ lsl lr, v1, #2
++ orr a2, v8, a2, lsr ip
++ ldr v1, [sp, #6*8 + 10*4]
++ orr v7, a2, v7, lsr lr
++ bhi 1b
++
++ vpop {d8-d13}
++ mov a1, v7, lsr v6
++ pop {a2,v1-v8,pc}
++
++
++2:
++ sub v5, v5, #10
++ vmov.u8 d16, #0
++ blo 3f
++ vld2.32 {d0[0], d1[0]}, [a3]!
++ vld2.32 {d2[0], d3[0]}, [a4]!
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb lr, [a4], #1
++ ldrb v8, [a3], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d16[0]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d16[4]}, [a4], v5
++ add v8, v2, v8, lsl #2
++ vld1.32 {d4[0]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d5[0]}, [ip]
++ vld1.32 {d6[0]}, [v8]
++ vld1.32 {d7[0]}, [lr]
++
++3:
++ vld2.32 {d0[1], d1[1]}, [a3]!
++ vld2.32 {d2[1], d3[1]}, [a4]!
++ ldrb a2, [a3], #1
++ vmov.u16 d17, #1
++ ldrb ip, [a4], #1
++ vmov.u16 d18, #2
++ ldrb v8, [a3], #1
++ vmov.u16 d19, #4
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d16[2]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d16[6]}, [a4], v5
++ add v8, v2, v8, lsl #2
++ vld1.32 {d4[1]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d5[1]}, [ip]
++ vld1.32 {d6[1]}, [v8]
++ vld1.32 {d7[1]}, [lr]
++
++ @ So now we have:
++ @ d0.32[i] = curr[i].mv[0]
++ @ d1.32[i] = curr[i].mv[1]
++ @ d2.32[i] = neigh[i].mv[0]
++ @ d3.32[i] = neigh[i].mv[1]
++ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]]
++ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
++ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]]
++ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
++ @ d16.16[i] = curr[i].pred_flag
++ @ d16.16[2+i] = neigh[i].pred_flag
++
++ vtst.16 d20, d16, d17
++ vtst.16 d22, d16, d18
++ vadd.i16 d30, d16, d17
++ vswp d2, d3
++ ldr lr, [sp]
++ vmovl.s16 q10, d20
++ teq lr, #1
++ vmovl.s16 q11, d22
++ lslne v1, lr, #1
++ vbif d0, d1, d20
++ vbif d4, d6, d20
++ vbif d3, d2, d21
++ vbif d5, d7, d21
++ vbif d1, d0, d22
++ vbif d6, d4, d22
++ vbif d2, d3, d23
++ vbif d7, d5, d23
++ vshr.u16 d30, #2
++ vabd.s16 d24, d0, d3
++ vabd.s16 d25, d1, d2
++ vabd.s16 q0, q0, q1
++ vceq.i32 d2, d4, d5
++ vceq.i32 d20, d5, d6
++ vceq.i32 d21, d4, d7
++ vceq.i32 d3, d6, d7
++ vclt.u16 d6, d24, d19
++ vclt.u16 d7, d25, d19
++ vclt.u16 d22, d1, d19
++ vclt.u16 d23, d0, d19
++ vshrn.i32 d6, q3, #8
++ vmovn.i32 d2, q1
++ vshrn.i32 d7, q11, #8
++ vmovn.i32 d3, q10
++ vand q0, q3, q1
++ rsbne v2, v1, #32
++ vrev16.8 q3, q3
++ vand q0, q3
++ vsra.u64 d30, #32
++ vshr.u64 q1, q0, #32
++ vand q0, q1
++ vbic d0, d17, d0
++ vand d30, d30, d17
++ vbic d0, d1
++ vmov.i16 d1, #0x5555
++ vorr d0, d30
++ bne 10f
++
++ @ Construct result word, no duplicates
++ cmp a1, #2
++ vmov.u16 a1, d0[1]
++ vmov.u16 a2, d0[0]
++ orreq a1, a2, a1, lsl #2
++ pop {a2,v1-v8,pc}
++10:
++ @ Construct result word, with duplicates
++ cmp a1, #2
++ vmul.i16 d0, d1
++ vmov a2, s0
++ vmov.u16 a1, d0[1]
++ lsl a2, #16
++ pkhbt a1, a1, a1, lsl #16
++ lsr a2, v2
++ lsr a1, v2
++ orreq a1, a2, a1, lsl v1
++ pop {a2,v1-v8,pc}
++endfunc
++
++
++
++#else // non-NEON version
++
++
++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh,
++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ * int in_inc)
+ */
+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
+ add ip, sp, #4*4
+ push {a2-a4,v1-v8,lr}
-+ ldmia ip, {v5-v7}
++ mov v6, #32
+1: ldmdb ip, {v1-v4}
-+ ldrsb a3, [v5, #8] @ curr->ref_idx
-+ ldrsb v8, [v5, #9]
-+ ldrsb ip, [v6, #8] @ neigh->ref_idx
-+ ldrsb lr, [v6, #9]
-+ ldr v1, [v1, a3, lsl #2]
-+ ldrb a3, [v5, #10] @ curr->pred_flag
++ ldrsb v5, [a3, #8] @ curr->ref_idx
++ ldrsb v8, [a3, #9]
++ ldrsb ip, [a4, #8] @ neigh->ref_idx
++ ldrsb lr, [a4, #9]
++ ldr v1, [v1, v5, lsl #2]
++ ldrb v5, [a3, #10] @ curr->pred_flag
+ ldr v2, [v2, v8, lsl #2]
-+ ldrb v8, [v6, #10] @ neigh->pred_flag
++ ldrb v8, [a4, #10] @ neigh->pred_flag
+ ldr v3, [v3, ip, lsl #2]
+ ldr v4, [v4, lr, lsl #2]
-+ teq a3, #3
++ teq v5, #3
+ beq 20f
+ teq v8, #3
+ beq 90f
+
-+ tst a3, #1
++ tst v5, #1
+ itee ne
-+ ldrne a3, [v5, #0] @ curr->mv[0]
-+ ldreq a3, [v5, #4] @ curr->mv[1]
++ ldrne v5, [a3, #0] @ curr->mv[0]
+ moveq v1, v2
++ ldreq v5, [a3, #4] @ curr->mv[1]
+ tst v8, #1
+ itee ne
-+ ldrne v8, [v6, #0] @ neigh->mv[0]
-+ ldreq v8, [v6, #4] @ neigh->mv[1]
++ ldrne v8, [a4, #0] @ neigh->mv[0]
+ moveq v3, v4
++ ldreq v8, [a4, #4] @ neigh->mv[1]
+ teq v1, v3
+ bne 10f
+ ldr lr, =0xFFFCFFFC
-+ ssub16 ip, v8, a3
-+ ssub16 a3, a3, v8
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 ip, v8, v5
++ ssub16 v5, v5, v8
++ sel v5, v5, ip
++ ands v5, v5, lr
+ @ drop through
+10: it ne
-+ movne a3, #1
-+11: subs a2, a2, #1
-+12:
-+A strbhs a3, [v7], a4
-+T itt hs
-+T strbhs a3, [v7]
-+T addhs v7, v7, a4
++ movne v5, #1<<30
++11:
++ sub v6, v6, #2
++T mov v7, v7, lsr #2
+ subs a2, a2, #1
-+ bhs 12b
++A orr v7, v5, v7, lsr #2
++T orr v7, v5, v7
++ bhi 11b
+
-+ ldm sp, {a2, a3}
++ ldr v5, [sp, #16*4]
+ add ip, sp, #16*4
++ ldr a2, [sp]
+ subs a1, a1, #1
-+ add v5, v5, a3
-+ add v6, v6, a3
++ add a3, a3, v5
++ add a4, a4, v5
+ bhi 1b
++ mov a1, v7, lsr v6
+ pop {a2-a4,v1-v8,pc}
+
+20: teq v8, #3
@@ -2889,43 +3234,43 @@ index 0000000000..e665bd848a
+ teq v1, v2
+ bne 30f
+
-+ ldrd v1, v2, [v5] @ curr->mv
-+ ldrd v3, v4, [v6] @ neigh->mv
++ ldrd v1, v2, [a3] @ curr->mv
++ ldrd v3, v4, [a4] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
-+ ssub16 a3, v1, v3
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 v5, v1, v3
++ sel v5, v5, ip
++ ands v5, v5, lr
+ bne 25f
+ ssub16 ip, v4, v2
-+ ssub16 a3, v2, v4
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 v5, v2, v4
++ sel v5, v5, ip
++ ands v5, v5, lr
+ beq 11b
+ @ drop through
+25: ssub16 ip, v4, v1
-+ ssub16 a3, v1, v4
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 v5, v1, v4
++ sel v5, v5, ip
++ ands v5, v5, lr
+ bne 10b
+ ssub16 ip, v3, v2
-+ ssub16 a3, v2, v3
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 v5, v2, v3
++ sel v5, v5, ip
++ ands v5, v5, lr
+ b 10b
+
-+30: ldrd v1, v2, [v5] @ curr->mv
-+ ldrd v3, v4, [v6] @ neigh->mv
++30: ldrd v1, v2, [a3] @ curr->mv
++ ldrd v3, v4, [a4] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ ssub16 ip, v3, v1
-+ ssub16 a3, v1, v3
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 v5, v1, v3
++ sel v5, v5, ip
++ ands v5, v5, lr
+ bne 10b
+ ssub16 ip, v4, v2
-+ ssub16 a3, v2, v4
-+ sel a3, a3, ip
-+ ands a3, a3, lr
++ ssub16 v5, v2, v4
++ sel v5, v5, ip
++ ands v5, v5, lr
+ b 10b
+
+40: teq v1, v4
@@ -2933,21 +3278,26 @@ index 0000000000..e665bd848a
+ teqeq v2, v3
+ bne 10b
+
-+ ldrd v1, v2, [v5] @ curr->mv
-+ ldrd v3, v4, [v6] @ neigh->mv
++ ldrd v1, v2, [a3] @ curr->mv
++ ldrd v3, v4, [a4] @ neigh->mv
+ ldr lr, =0xFFFCFFFC
+ b 25b
+
-+90: mov a3, #1
++90:
++ mov v5, #1<<30
+ b 11b
+endfunc
+
++
++#endif
++
++
+@ =============================================================================
+@
+@ 10 bit
+
+function hevc_loop_filter_luma_body_10
-+ m_filter_luma 10
++ m_filter_luma 10, q11, q15
+endfunc
+
+function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1
@@ -2980,7 +3330,7 @@ index 0000000000..e665bd848a
+ ldr r10, [sp, #32]
+
+.Lv_loop_luma_common_10:
-+ m_filter_v_luma_common_16 10
++ m_filter_v_luma_16 10
+endfunc
+
+function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1
@@ -3220,7 +3570,7 @@ index 0000000000..109fa98c29
+}
diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c
new file mode 100644
-index 0000000000..a721e392ab
+index 0000000000..ce7e6091f1
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c
@@ -0,0 +1,465 @@
@@ -3255,8 +3605,8 @@ index 0000000000..a721e392ab
+// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but
+// have been removed from head as we never use them.
+
-+void ff_hevc_rpi_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+void ff_hevc_rpi_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+
+void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
@@ -3455,9 +3805,9 @@ index 0000000000..a721e392ab
+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+
-+void ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
++uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh,
+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ const MvField *curr, const MvField *neigh, uint8_t *bs);
++ int in_inc);
+
+
+static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
@@ -3557,10 +3907,10 @@ index 0000000000..a721e392ab
+av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth)
+{
+ if (bit_depth == 8) {
-+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon;
-+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon;
-+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon;
-+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon;
++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8;
++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8;
++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8;
++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8;
+ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8;
+ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8;
+ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8;
@@ -7255,6 +7605,5151 @@ index 0000000000..b56e0f9644
+ edge_64b_bodies edge_64b_body_16, 4
+endfunc
+
+diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h
+new file mode 100644
+index 0000000000..36a23a5bf9
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_arm.h
+@@ -0,0 +1,28 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_HEVCPRED_ARM_H
++#define AVCODEC_ARM_HEVCPRED_ARM_H
++
++#include "libavcodec/rpi_hevcpred.h"
++
++void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth);
++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth);
++
++#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */
++
+diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c
+new file mode 100644
+index 0000000000..80724d4cf3
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_init_arm.c
+@@ -0,0 +1,35 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/cpu.h"
++#include "libavutil/arm/cpu.h"
++
++#include "libavcodec/rpi_hevcpred.h"
++#include "rpi_hevcpred_arm.h"
++
++av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth)
++{
++ int cpu_flags = av_get_cpu_flags();
++
++ if (have_neon(cpu_flags))
++ ff_hevc_rpi_pred_init_neon(c, bit_depth);
++}
++
+diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c
+new file mode 100644
+index 0000000000..8c267a0368
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_init_neon.c
+@@ -0,0 +1,188 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "rpi_hevcpred_arm.h"
++
++void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++
++void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++
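++/* Each table is indexed by block size, smallest first: slot i of a luma
++ * table handles a width of 4 << i (4/8/16/32); the chroma (_c) tables only
++ * go up to slot 2 (4/8/16). */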
++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth)
++{
++ switch (bit_depth)
++ {
++ case 8:
++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8;
++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8;
++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8;
++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8;
++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8;
++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8;
++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8;
++
++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8;
++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8;
++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8;
++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8;
++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8;
++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8;
++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8;
++
++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8;
++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8;
++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8;
++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8;
++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8;
++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8;
++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8;
++
++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8;
++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8;
++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8;
++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8;
++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8;
++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8;
++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8;
++
++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8;
++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8;
++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8;
++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8;
++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8;
++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8;
++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8;
++ break;
++ case 10:
++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10;
++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10;
++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10;
++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10;
++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10;
++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10;
++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10;
++
++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10;
++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10;
++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10;
++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10;
++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10;
++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10;
++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10;
++
++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10;
++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10;
++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10;
++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10;
++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10;
++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10;
++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10;
++
++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10;
++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10;
++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10;
++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10;
++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10;
++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10;
++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10;
++
++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10;
++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10;
++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10;
++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10;
++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10;
++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10;
++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10;
++ break;
++ default:
++ break;
++ }
++}
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
+new file mode 100644
+index 0000000000..1a2d413ea2
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
+@@ -0,0 +1,2352 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/*
++ * General angular pred
++ *
++ * Horizontal (10) & Vertical (26) cases have their own file
++ * and are not dealt with properly here (luma filtering is missing)
++ *
++ * The inv_angle calculations are annoying - if it wasn't for the +128
++ * rounding step then the result would simply be the loop counter :-(
++ */
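++
++/*
++ * All of the pred fns below share one interpolation step: each output
++ * sample is a blend of two adjacent reference samples,
++ *   out = (a * (32 - frac) + b * frac + 16) >> 5
++ * where b lies one step further along the prediction direction than a.
++ * frac is kept in r6: it starts at angle + 32 (forcing an initial reference
++ * load), gains the angle step (r4) for every row/column produced and, each
++ * time it goes above 32, is reduced by 32 while the reference window
++ * advances by one sample.
++ */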
++
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++.text
++
++@ Horizontal Patch functions
++@ These need a transpose before store, so they exist as smaller patches
++@ Patches can be called repeatedly without any intermediate setup
++@ to generate a horizontal block
++@
++@ It is almost certainly the case that larger patch fns can be built
++@ and they would be a little faster, but we would still need the small
++@ fns, and code size (or at least instruction cache size) is an issue
++@ given how much code we already have here
++
++@ Generate 8x8 luma 8 patch
++@
++@ r3 Out stride
++@ r4 Angle add
++@ r7 Inv angle (_up only)
++@
++@ In/Out (updated)
++@ r0 Out pointer - on exit points to the start of the next patch horizontally (i.e. r0 + patch width)
++@ r2 Left ptr - updated
++@ r6 Angle frac (init to r4 + 32)
++@ r8 Inv angle accumulator
++@ d24 Cur Line - load before 1st call for down - set by _up
++@ d16 Cur Line - load before 1st call for up - set by _down
++@
++@ Temps
++@ r5 Loop counter
++@ r12
++@ q0-q3, q14, q15
++
++patch_h_down_8x8_8:
++ mov r5, #8
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.8 d24, d24, #1
++ sub r6, #32
++ vld1.8 {d24[7]}, [r2]!
++
++1:
++ vext.8 q0, q1, #8
++ rsb r12, r6, #32
++ vext.8 q1, q2, #8
++ vdup.8 d30, r6
++ vext.8 q2, q3, #8
++ vdup.8 d31, r12
++ vext.8 q3, q3, #8
++
++ vmull.u8 q14, d24, d30
++ add r6, r4
++ vmlal.u8 q14, d16, d31
++ subs r5, #1
++ vrshrn.u16 d7, q14, #5
++ bne 2b
++
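++@ d0-d7 now hold 8 generated columns of 8 samples; the interleaved vst4.8
++@ lane stores below write them out as rows, i.e. the store performs the
++@ transpose needed by the horizontal modes.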
++store_tran_8x8_8:
++ add r12, r0, #4
++ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0 ]
++ add r5, r0, r3
++ vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r12], r3
++ add r0, #8
++ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r5 ], r3
++ vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r12], r3
++ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r5 ], r3
++ vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r12], r3
++ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r5 ], r3
++ vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r12], r3
++ vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r5 ], r3
++ vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r12], r3
++ vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r5 ], r3
++ vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r12], r3
++ vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r5 ], r3
++ vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r12], r3
++ vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r5 ]
++ vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r12]
++ bx lr
++
++
++patch_h_up_8x8_8:
++ mov r5, #8
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ @ r2=left (variable), r1=up (const)
++ adds r8, r7
++ vmov d24, d16
++ ldrbmi r12, [r2, #-1]!
++ ldrbpl r12, [r1, r8, asr #8]
++ vext.8 d16, d16, d16, #7
++ sub r6, #32
++ vmov.8 d16[0], r12
++
++1:
++ vdup.8 d31, r6
++ vext.8 q0, q1, #8
++ rsb r12, r6, #32
++ vext.8 q1, q2, #8
++
++ vmull.u8 q14, d16, d31
++ vext.8 q2, q3, #8
++ vdup.8 d30, r12
++ vext.8 q3, q3, #8
++ add r6, r4
++ vmlal.u8 q14, d24, d30
++ subs r5, #1
++ vrshrn.u16 d7, q14, #5
++ bne 2b
++ b store_tran_8x8_8 @ This will return
++
++
++
++@ ff_hevc_rpi_pred_angular_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
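++@
++@ Modes are dispatched into four cases (the same split is used by every
++@ entry point below): 2-9 down of horizontal, 10-17 up of horizontal,
++@ 18-25 left of vertical, 26-34 right of vertical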
++
++function ff_hevc_rpi_pred_angular_4_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ mov r5, #4 @ Loop counter for all cases
++ add r6, r4, #32 @ Force initial load in main loop
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.8 {d24}, [r2]
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.8 d24, d24, #1
++ sub r6, #32
++1:
++ vext.8 q0, q1, #8
++ rsb r12, r6, #32
++ vext.8 q1, q1, #8
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q14, d24, d30
++ add r6, r4
++ vmlal.u8 q14, d16, d31
++ subs r5, #1
++ vrshrn.u16 d3, q14, #5
++ bne 2b
++
++98:
++ add r12, r0, r3
++ lsl r3, #1
++ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0 ], r3
++ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r12], r3
++ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r0 ]
++ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r12]
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ vld1.32 {d16[0]}, [r2]
++ sub r8, r7
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ @ r2=left (variable), r1=up (const)
++ adds r8, r7
++ vmov d24, d16
++ ldrbmi r12, [r2, #-1]!
++ ldrbpl r12, [r1, r8, asr #8]
++ vext.8 d16, d16, d16, #7
++ sub r6, #32
++ vmov.8 d16[0], r12
++1:
++ vdup.8 d31, r6
++ vext.8 q0, q1, #8
++ rsb r12, r6, #32
++ vext.8 q1, q2, #8
++
++ vmull.u8 q14, d16, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmlal.u8 q14, d24, d30
++ subs r5, #1
++ vrshrn.u16 d3, q14, #5
++ bne 2b
++ b 98b
++
++18:
++ cmp r12, #26
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.32 {d16[0]}, [r1 :32] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ ldrb r12, [r2, r8, asr #8]
++
++ vmov d24, d16
++ add r8, r7
++ sub r6, #32
++ vext.8 d16, d16, #7
++ vmov.8 d16[0], r12
++
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmlal.u8 q0, d24, d30
++ vrshrn.u16 d0, q0, #5
++
++ subs r5, #1
++ vst1.32 {d0[0]}, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {d24}, [r1] @ Up + up-right; may only be 32-bit aligned rather than 64-bit
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.8 d24, d24, #1
++ sub r6, #32
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmlal.u8 q0, d16, d31
++ vrshrn.u16 d0, q0, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.32 {d0[0]}, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++
++@ ff_hevc_rpi_pred_angular_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_8_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ add r6, r4, #32 @ Force initial load in main loop
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.8 {d24}, [r2]!
++ bl patch_h_down_8x8_8
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ vld1.8 {d16}, [r2]
++ add r6, r4, #32
++ sub r8, r7
++ bl patch_h_up_8x8_8
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #8 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.8 {d16}, [r1 :64] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ ldrb r12, [r2, r8, asr #8]
++
++ vmov d24, d16
++ add r8, r7
++ sub r6, #32
++ vext.8 d16, d16, #7
++ vmov.8 d16[0], r12
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmlal.u8 q0, d24, d30
++ vrshrn.u16 d0, q0, #5
++
++ subs r5, #1
++ vst1.8 {d0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {d24, d25}, [r1 :64]! @ Up + UR
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.8 q12, q12, #1
++ sub r6, #32
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmlal.u8 q0, d16, d31
++ vrshrn.u16 d0, q0, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.8 {d0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_16_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ add r6, r4, #32 @ Force initial load in main loop
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.8 {d24}, [r2]!
++ mov r1, r2 @ save r2 - r1 unused by patch_down
++
++ bl patch_h_down_8x8_8
++ bl patch_h_down_8x8_8
++
++ mov r2, r1 @ restore r2
++ sub r0, #16
++ add r6, r4, #32 @ Force initial load in main loop
++ vld1.8 {d24}, [r2]!
++ add r0, r0, r3, lsl #3
++
++ bl patch_h_down_8x8_8
++ bl patch_h_down_8x8_8
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ vld1.8 {d16}, [r2]
++ sub r8, r7
++
++ push {r2, r8}
++ bl patch_h_up_8x8_8
++ bl patch_h_up_8x8_8
++ pop {r2, r8}
++
++ sub r0, #16
++ add r6, r4, #32
++ add r2, r2, #8
++ sub r8, r8, r7, lsl #3
++ add r0, r0, r3, lsl #3
++ vld1.8 {d16}, [r2]
++
++ bl patch_h_up_8x8_8
++ bl patch_h_up_8x8_8
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #16 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.8 {q8 }, [r1 :128] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ ldrb r12, [r2, r8, asr #8]
++
++ vmov q12, q8
++ add r8, r7
++ sub r6, #32
++ vext.8 q8, q8, q8, #15
++ vmov.8 d16[0], r12
++
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vmull.u8 q1, d17, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmlal.u8 q0, d24, d30
++ vmlal.u8 q1, d25, d30
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++
++ subs r5, #1
++ vst1.8 {q0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q12}, [r1 :128]! @ Up
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vext.8 q12, q12, #1
++ sub r6, #32
++ vld1.8 {d25[7]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmull.u8 q1, d25, d30
++ vmlal.u8 q0, d16, d31
++ vmlal.u8 q1, d17, d31
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.8 {q0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_32_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r10, lr}
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ bge 18f
++
++ cmp r12, #10
++ mov r10, #4 @ Outer loop counter for "hard" cases
++ bge 10f
++
++@ Down of Horizontal - works down left
++ mov r1, r2
++2:
++ vld1.8 {d24}, [r1]!
++ add r6, r4, #32 @ Force initial load in main loop
++ mov r2, r1
++
++ bl patch_h_down_8x8_8
++ bl patch_h_down_8x8_8
++ bl patch_h_down_8x8_8
++ bl patch_h_down_8x8_8
++
++ sub r0, #32
++ subs r10, #1
++ add r0, r0, r3, lsl #3
++ bne 2b
++ pop {r4-r10, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++2:
++ vld1.8 {d16}, [r2]
++ add r6, r4, #32
++
++ push {r2, r8}
++ bl patch_h_up_8x8_8
++ bl patch_h_up_8x8_8
++ bl patch_h_up_8x8_8
++ bl patch_h_up_8x8_8
++ pop {r2, r8}
++
++ sub r0, #32
++ subs r10, #1
++ add r2, r2, #8
++ sub r8, r8, r7, lsl #3
++ add r0, r0, r3, lsl #3
++ bne 2b
++ pop {r4-r10, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #32 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.8 {q8, q9 }, [r1 :128] @ Up
++ ldrh r7, [r7]
++ add r6, r4, #32
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ ldrb r12, [r2, r8, asr #8]
++
++ vmov q12, q8
++ add r8, r7
++ vmov q13, q9
++ sub r6, #32
++ vext.8 q9, q8, q9, #15
++ vext.8 q8, q8, q8, #15
++ vmov.8 d16[0], r12
++
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vmull.u8 q1, d17, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmull.u8 q2, d18, d31
++ vmull.u8 q3, d19, d31
++ vmlal.u8 q0, d24, d30
++ vmlal.u8 q1, d25, d30
++ vmlal.u8 q2, d26, d30
++ vmlal.u8 q3, d27, d30
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vrshrn.u16 d2, q2, #5
++ vrshrn.u16 d3, q3, #5
++
++ subs r5, #1
++ vst1.8 {q0, q1 }, [r0], r3
++ bne 2b
++ pop {r4-r10, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q12, q13}, [r1 :128]! @ Up
++ add r6, r4, #32 @ Force initial load in main loop
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vmov q9, q13
++ vext.8 q12, q13, #1
++ vext.8 q13, q13, #1
++ sub r6, #32
++ vld1.8 {d27[7]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmull.u8 q1, d25, d30
++ vmull.u8 q2, d26, d30
++ vmull.u8 q3, d27, d30
++ vmlal.u8 q0, d16, d31
++ vmlal.u8 q1, d17, d31
++ vmlal.u8 q2, d18, d31
++ vmlal.u8 q3, d19, d31
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vrshrn.u16 d2, q2, #5
++ vrshrn.u16 d3, q3, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.8 {q0, q1 }, [r0], r3
++ bne 2b
++ pop {r4-r10, pc}
++
++endfunc
++
++@ Chroma 8 bit 4x4 patch fns
++ .text
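++
++@ Chroma is stored as interleaved U/V byte pairs, so each chroma sample is
++@ handled as a 16-bit unit (.16 element loads/shifts) and the entry points
++@ scale the output stride to bytes (lsl r3, #1)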
++
++patch_h_down_c_4x4_8:
++ mov r5, #4
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.16 d24, d24, #1
++ sub r6, #32
++ vld1.16 {d24[3]}, [r2]!
++
++1:
++ vext.8 q0, q1, #8
++ rsb r12, r6, #32
++ vext.8 q1, q1, #8
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q14, d24, d30
++ add r6, r4
++ vmlal.u8 q14, d16, d31
++ subs r5, #1
++ vrshrn.u16 d3, q14, #5
++ bne 2b
++
++store_tran_c_4x4_8:
++ add r12, r0, r3
++ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0 ]!
++ add r5, r12, r3
++ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12]
++ add r12, r12, r3, lsl #1
++ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r5 ]
++ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12]
++ bx lr
++
++patch_h_up_c_4x4_8:
++ mov r5, #4
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ If r8 is -ve then we are still tracking left
++ adds r8, r7
++ vmov d24, d16
++ @ Initially r2=left (variable), r1=up (const)
++ @ Use r2 for both up and left; we only ever go from left->up, so
++ @ we assume that we are left and then overwrite with up if wanted
++ sub r2, #2
++ addpl r2, r1, r8, asr #7
++ vext.16 d16, d16, d16, #3
++ @ We get *2 by >> 7 rather than 8, but that means we need to lose bit 0
++ and r2, #~1
++ sub r6, #32
++ vld1.16 d16[0], [r2]
++1:
++ vdup.8 d31, r6
++ vext.8 q0, q1, #8
++ rsb r12, r6, #32
++ vext.8 q1, q1, #8
++
++ vmull.u8 q14, d16, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmlal.u8 q14, d24, d30
++ subs r5, #1
++ vrshrn.u16 d3, q14, #5
++ bne 2b
++ b store_tran_c_4x4_8 @ This will return
++
++
++@ ff_hevc_rpi_pred_angular_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++ lsl r3, #1
++
++ cmp r12, #18
++ add r6, r4, #32 @ Force initial load in main loop
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.8 {d24}, [r2]!
++ bl patch_h_down_c_4x4_8
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++ vld1.8 {d16}, [r2]
++ bl patch_h_up_c_4x4_8
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #4 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.8 {d16}, [r1 :64] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ asr r12, r8, #8
++ vmov d24, d16
++ add r8, r7
++ vext.16 d16, d16, #3
++ add r12, r2, r12, lsl #1
++ sub r6, #32
++ vld1.16 {d16[0]}, [r12]
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmlal.u8 q0, d24, d30
++ vrshrn.u16 d0, q0, #5
++
++ subs r5, #1
++ vst1.8 {d0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q12}, [r1] @ Up + UR (only 64-bit aligned)
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vext.16 q12, q12, #1
++ sub r6, #32
++
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmlal.u8 q0, d16, d31
++
++ vrshrn.u16 d0, q0, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.8 {d0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++ lsl r3, #1
++
++ cmp r12, #18
++ add r6, r4, #32
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.8 {d24}, [r2]!
++ mov r1, r2
++
++ bl patch_h_down_c_4x4_8
++ bl patch_h_down_c_4x4_8
++
++ sub r0, #16
++ add r0, r0, r3, lsl #2
++ vld1.8 {d24}, [r1]!
++ add r6, r4, #32 @ Force initial load in main loop
++ mov r2, r1
++
++ bl patch_h_down_c_4x4_8
++ bl patch_h_down_c_4x4_8
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++ vld1.8 {d16}, [r2]
++
++ push {r2, r8}
++ bl patch_h_up_c_4x4_8
++ bl patch_h_up_c_4x4_8
++ pop {r2, r8}
++
++ add r2, r2, #8
++ sub r0, #16
++ sub r8, r8, r7, lsl #2
++ vld1.8 {d16}, [r2]
++ add r0, r0, r3, lsl #2
++ add r6, r4, #32
++ bl patch_h_up_c_4x4_8
++ bl patch_h_up_c_4x4_8
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #8 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.8 {q8 }, [r1 :128] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ asr r12, r8, #8
++ vmov q12, q8
++ add r8, r7
++ vext.16 q8, q8, #7
++ add r12, r2, r12, lsl #1
++ sub r6, #32
++ vld1.16 {d16[0]}, [r12]
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vdup.8 d30, r12
++ vmull.u8 q1, d17, d31
++ add r6, r4
++ vmlal.u8 q0, d24, d30
++ vmlal.u8 q1, d25, d30
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++
++ subs r5, #1
++ vst1.8 {q0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q12}, [r1 :128]! @ Up
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vext.16 q12, q12, #1
++ sub r6, #32
++ vld1.16 {d25[3]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmull.u8 q1, d25, d30
++ vmlal.u8 q0, d16, d31
++ vmlal.u8 q1, d17, d31
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.8 {q0 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1
++ ldr r12, [sp, #0]
++ push {r4-r10, lr}
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++ lsl r3, #1
++
++ cmp r12, #18
++ bge 18f
++
++ cmp r12, #10
++ mov r10, #4 @ Outer loop counter for "hard" cases
++ bge 10f
++
++@ Down of Horizontal - works down left
++ mov r1, r2
++2:
++ vld1.8 {d24}, [r1]!
++ add r6, r4, #32 @ Force initial load in main loop
++ mov r2, r1
++
++ bl patch_h_down_c_4x4_8
++ bl patch_h_down_c_4x4_8
++ bl patch_h_down_c_4x4_8
++ bl patch_h_down_c_4x4_8
++
++ sub r0, #32
++ subs r10, #1
++ add r0, r0, r3, lsl #2
++ bne 2b
++ pop {r4-r10, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++2:
++ vld1.8 {d16}, [r2]
++ add r6, r4, #32
++
++ push {r2, r8}
++ bl patch_h_up_c_4x4_8
++ bl patch_h_up_c_4x4_8
++ bl patch_h_up_c_4x4_8
++ bl patch_h_up_c_4x4_8
++ pop {r2, r8}
++
++ sub r0, #32
++ subs r10, #1
++ add r2, r2, #8
++ sub r8, r8, r7, lsl #2
++ add r0, r0, r3, lsl #2
++ bne 2b
++ pop {r4-r10, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #16 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.8 {q8, q9 }, [r1 :128] @ Up
++ ldrh r7, [r7]
++ add r6, r4, #32
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ For other widths we may want different logic
++ asr r9, r8, #8
++ vmov q12, q8
++ add r8, r7
++ vmov q13, q9
++ add r9, r2, r9, lsl #1
++ vext.16 q9, q8, q9, #7
++ sub r6, #32
++ vext.16 q8, q8, q8, #7
++ vld1.16 {d16[0]}, [r9]
++
++1:
++ vdup.8 d31, r6
++ rsb r12, r6, #32
++
++ vmull.u8 q0, d16, d31
++ vmull.u8 q1, d17, d31
++ vdup.8 d30, r12
++ add r6, r4
++ vmull.u8 q2, d18, d31
++ vmull.u8 q3, d19, d31
++ vmlal.u8 q0, d24, d30
++ vmlal.u8 q1, d25, d30
++ vmlal.u8 q2, d26, d30
++ vmlal.u8 q3, d27, d30
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vrshrn.u16 d2, q2, #5
++ vrshrn.u16 d3, q3, #5
++
++ subs r5, #1
++ vst1.8 {q0, q1 }, [r0], r3
++ bne 2b
++ pop {r4-r10, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q12, q13}, [r1 :128]! @ Up
++ add r6, r4, #32 @ Force initial load in main loop
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vmov q9, q13
++ vext.16 q12, q13, #1
++ vext.16 q13, q13, #1
++ sub r6, #32
++ vld1.16 {d27[3]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vdup.8 d30, r6
++ vdup.8 d31, r12
++
++ vmull.u8 q0, d24, d30
++ vmull.u8 q1, d25, d30
++ vmull.u8 q2, d26, d30
++ vmull.u8 q3, d27, d30
++ vmlal.u8 q0, d16, d31
++ vmlal.u8 q1, d17, d31
++ vmlal.u8 q2, d18, d31
++ vmlal.u8 q3, d19, d31
++
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vrshrn.u16 d2, q2, #5
++ vrshrn.u16 d3, q3, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.8 {q0, q1 }, [r0], r3
++ bne 2b
++ pop {r4-r10, pc}
++
++endfunc
++
++@------------------------------------------------------------------------------
++@ Data
++
++ .text
++ .balign 64
++angle_2:
++ .byte 32
++ .byte 26, 21, 17, 13, 9, 5, 2, 0
++ @ Sign inverted from standards table
++ .byte 2, 5, 9, 13, 17, 21, 26, 32
++ .byte 26, 21, 17, 13, 9, 5, 2, 0
++ @ Standard sign
++ .byte 2, 5, 9, 13, 17, 21, 26, 32
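++ @ One angle byte per mode 2..34; entry points index this as
++ @ angle_2 - 2 + mode (ldrsb)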
++
++ @ Sign inverted from standards table
++inv_angle:
++ .short 4096, 1638, 910, 630, 482, 390, 315
++ .short 256
++ .short 315, 390, 482, 630, 910, 1638, 4096
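++ @ inv_angle[mode - 11] = 8192 / angle (rounded) for modes 11..25; entry
++ @ points index this as inv_angle - 11*2 + mode*2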
++
++@------------------------------------------------------------------------------
++@
++@ 10 bit fns
++@ Should work for 9 & 11 bit as there is no actual bit-depth specific code
++@ but runs out of register width for 12+ bit
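++@ (the u16 multiply-accumulates hold up to 32 * max_sample, which no longer
++@ fits in 16 bits once samples are wider than 11 bits)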
++
++ .text
++ .balign 64
++
++patch_h_down_4x4_10:
++ mov r5, #4
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.16 d24, d24, #1
++ sub r6, #32
++ vld1.16 {d24[3]}, [r2]!
++
++1:
++ rsb r12, r6, #32
++ vext.16 q1, q2, #4
++ vmov s0, r6
++ vmov s1, r12
++ vext.16 q2, q2, #4
++
++ vmul.u16 d1, d24, d0[0]
++ add r6, r4
++ vmla.u16 d1, d16, d0[2]
++ subs r5, #1
++ vrshr.u16 d5, d1, #5
++ bne 2b
++
++store_tran_4x4_10:
++ add r12, r0, r3
++ vst4.16 {d2[0], d3[0], d4[0], d5[0]}, [r0 ]!
++ add r5, r12, r3
++ vst4.16 {d2[1], d3[1], d4[1], d5[1]}, [r12]
++ add r12, r12, r3, lsl #1
++ vst4.16 {d2[2], d3[2], d4[2], d5[2]}, [r5 ]
++ vst4.16 {d2[3], d3[3], d4[3], d5[3]}, [r12]
++ bx lr
++
++patch_h_up_4x4_10:
++ mov r5, #4
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ If r8 is -ve then we are still tracking left
++ adds r8, r7
++ vmov d24, d16
++ @ Initially r2=left (variable), r1=up (const)
++ @ Use r2 for both up and left; we only ever go from left->up, so
++ @ we assume that we are left and then overwrite with up if wanted
++ sub r2, #2
++ addpl r2, r1, r8, asr #7
++ vext.16 d16, d16, d16, #3
++ @ We get *2 by >> 7 rather than 8, but that means we need to lose bit 0
++ and r2, #~1
++ sub r6, #32
++ vld1.16 d16[0], [r2]
++
++1:
++ rsb r12, r6, #32
++ vext.16 q1, q2, #4
++ vmov s0, r6
++ vmov s1, r12
++ vext.16 q2, q2, #4
++
++ vmul.u16 d1, d24, d0[2]
++ add r6, r4
++ vmla.u16 d1, d16, d0[0]
++ subs r5, #1
++ vrshr.u16 d5, d1, #5
++ bne 2b
++ b store_tran_4x4_10 @ This will return
++
++
++@ ff_hevc_rpi_pred_angular_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_4_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ lsl r3, #1
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ add r6, r4, #32 @ Force initial load in main loop
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.16 {d24}, [r2]!
++ bl patch_h_down_4x4_10
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++ vld1.16 {d16}, [r2]
++ bl patch_h_up_4x4_10
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #4 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.16 {d16}, [r1] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ asr r12, r8, #8
++ vmov d24, d16
++ add r8, r7
++ add r12, r2, r12, lsl #1
++ sub r6, #32
++ vext.16 d16, d16, #3
++ vld1.16 {d16[0]}, [r12]
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 d2, d16, d0[2]
++ vmla.u16 d2, d24, d0[0]
++ vrshr.u16 d2, #5
++
++ subs r5, #1
++ vst1.16 {d2 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {d24, d25}, [r1 :64] @ Up + UR (64bit aligned)
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov d16, d24
++ vext.16 q12, q13, #1
++ sub r6, #32
++
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 d2, d24, d0[0]
++ vmla.u16 d2, d16, d0[2]
++ vrshr.u16 d2, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.16 {d2 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_8_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ lsl r3, #1
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ add r6, r4, #32
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.16 {d24}, [r2]!
++ mov r1, r2
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10
++
++ vld1.16 {d24}, [r1]!
++ sub r0, #16
++ add r6, r4, #32 @ Force initial load in main loop
++ add r0, r0, r3, lsl #2
++ mov r2, r1
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++ vld1.16 {d16}, [r2]
++
++ push {r2, r8}
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10
++ pop {r2, r8}
++
++ sub r0, #16
++ add r2, #8
++ sub r8, r8, r7, lsl #2
++ add r0, r0, r3, lsl #2
++ vld1.16 {d16}, [r2]
++ add r6, r4, #32
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #8 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.16 {q8 }, [r1] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ asr r12, r8, #8
++ vmov q12, q8
++ add r8, r7
++ add r12, r2, r12, lsl #1
++ sub r6, #32
++ vext.16 q8, q8, q8, #7
++ vld1.16 {d16[0]}, [r12]
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 q1, q8, d0[2]
++ vmla.u16 q1, q12, d0[0]
++ vrshr.u16 q1, #5
++
++ subs r5, #1
++ vst1.16 {q1 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {q12, q13}, [r1 :128] @ Up + UR
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vext.16 q12, q13, #1
++ sub r6, #32
++ vext.16 q13, q13, #1
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 q1, q12, d0[0]
++ vmla.u16 q1, q8, d0[2]
++ vrshr.u16 q1, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.16 {q1 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_16_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r10, lr}
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ lsl r3, #1
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ bge 18f
++
++ cmp r12, #10
++ mov r10, #4 @ Outer loop counter for "hard" cases
++ bge 10f
++
++@ Down of Horizontal - works down left
++ mov r1, r2
++2:
++ vld1.16 {d24}, [r1]!
++ add r6, r4, #32 @ Force initial load in main loop
++ mov r2, r1
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10
++
++ sub r0, #32
++ subs r10, #1
++ add r0, r0, r3, lsl #2
++ bne 2b
++ pop {r4-r10, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++2:
++ vld1.16 {d16}, [r2]
++ add r6, r4, #32
++
++ push {r2, r8}
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10
++ pop {r2, r8}
++
++ sub r0, #32
++ subs r10, #1
++ add r2, #8
++ sub r8, r8, r7, lsl #2
++ add r0, r0, r3, lsl #2
++ bne 2b
++ pop {r4-r10, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #16 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.16 {q8, q9}, [r1] @ Up
++ ldrh r7, [r7]
++ add r6, r4, #32
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ asr r9, r8, #8
++ vmov q12, q8
++ add r8, r7
++ vmov q13, q9
++ add r9, r2, r9, lsl #1
++ sub r6, #32
++ vext.16 q9, q8, q9, #7
++ vext.16 q8, q8, q8, #7
++ vld1.16 {d16[0]}, [r9]
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 q1, q8, d0[2]
++ vmul.u16 q2, q9, d0[2]
++ vmla.u16 q1, q12, d0[0]
++ vmla.u16 q2, q13, d0[0]
++
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++
++ subs r5, #1
++ vst1.16 {q1, q2 }, [r0], r3
++ bne 2b
++ pop {r4-r10, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {q12, q13}, [r1 :128]! @ Up
++ add r6, r4, #32 @ Force initial load in main loop
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vmov q9, q13
++ vext.16 q12, q13, #1
++ vext.16 q13, q13, #1
++ sub r6, #32
++ vld1.16 {d27[3]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 q1, q12, d0[0]
++ vmul.u16 q2, q13, d0[0]
++ vmla.u16 q1, q8, d0[2]
++ vmla.u16 q2, q9, d0[2]
++
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.16 {q1, q2 }, [r0], r3
++ bne 2b
++ pop {r4-r10, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_32_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r10, lr}
++ vpush {q4 }
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ lsl r3, #1
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ bge 18f
++
++ cmp r12, #10
++ mov r10, #8 @ Outer loop counter for "hard" cases
++ bge 10f
++
++@ Down of Horizontal - works down left
++ mov r1, r2
++2:
++ vld1.16 {d24}, [r1]!
++ add r6, r4, #32 @ Force initial load in main loop
++ mov r2, r1
++ mov r9, #4
++1:
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10
++ subs r9, #1
++ bne 1b
++
++ sub r0, #64
++ subs r10, #1
++ add r0, r0, r3, lsl #2
++ bne 2b
++ b 99f
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++2:
++ vld1.16 {d16}, [r2]
++ add r6, r4, #32
++
++ push {r2, r8}
++ mov r9, #4
++1:
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10
++ subs r9, #1
++ bne 1b
++ pop {r2, r8}
++
++ sub r0, #64
++ subs r10, #1
++ add r2, #8
++ sub r8, r8, r7, lsl #2
++ add r0, r0, r3, lsl #2
++ bne 2b
++ b 99f
++
++18:
++ cmp r12, #26
++ mov r5, #32 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vldm r1, {q8-q11} @ Up
++ ldrh r7, [r7]
++ add r6, r4, #32
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ asr r9, r8, #8
++ vmov q12, q8
++ add r8, r7
++ vmov q13, q9
++ add r9, r2, r9, lsl #1
++ vmov q14, q10
++ vmov q15, q11
++ sub r6, #32
++ vext.16 q11, q10, q11, #7
++ vext.16 q10, q9, q10, #7
++ vext.16 q9, q8, q9, #7
++ vext.16 q8, q8, q8, #7
++ vld1.16 {d16[0]}, [r9]
++
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 q1, q8, d0[2]
++ vmul.u16 q2, q9, d0[2]
++ vmul.u16 q3, q10, d0[2]
++ vmul.u16 q4, q11, d0[2]
++ vmla.u16 q1, q12, d0[0]
++ vmla.u16 q2, q13, d0[0]
++ vmla.u16 q3, q14, d0[0]
++ vmla.u16 q4, q15, d0[0]
++
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++ vrshr.u16 q3, #5
++ vrshr.u16 q4, #5
++
++ subs r5, #1
++ vstm r0, {q1-q4}
++ add r0, r3
++ bne 2b
++ b 99f
++
++@ Right of vertical - works along top - left unused
++26:
++ vldm r1, {q12-q15} @ Up
++ add r6, r4, #32 @ Force initial load in main loop
++ add r1, #64
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vmov q9, q13
++ vmov q10, q14
++ vmov q11, q15
++ vext.16 q12, q13, #1
++ vext.16 q13, q14, #1
++ vext.16 q14, q15, #1
++ vext.16 q15, q15, #1
++ sub r6, #32
++ vld1.16 {d31[3]}, [r1]!
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 q1, q12, d0[0]
++ vmul.u16 q2, q13, d0[0]
++ vmul.u16 q3, q14, d0[0]
++ vmul.u16 q4, q15, d0[0]
++ vmla.u16 q1, q8, d0[2]
++ vmla.u16 q2, q9, d0[2]
++ vmla.u16 q3, q10, d0[2]
++ vmla.u16 q4, q11, d0[2]
++
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++ vrshr.u16 q3, #5
++ vrshr.u16 q4, #5
++
++ add r6, r4
++ subs r5, #1
++ vstm r0, {q1-q4}
++ add r0, r3
++ bne 2b
++99:
++ vpop {q4 }
++ pop {r4-r10, pc}
++
++endfunc
++
++
++
++@ Generate 4x4 chroma patch
++@
++@ In (const)
++@ r1 Up ptr (_up only)
++@ r3 Out stride
++@ r4 Angle add
++@ r7 Inv angle (_up only)
++@
++@ In/Out (updated)
++@ r0 Out pointer - on exit points to the start of the next patch horizontally (i.e. r0 + patch width)
++@ r2 Left ptr - updated
++@ r6 Angle frac (init to r4 + 32)
++@ r8 Inv angle accumulator
++@ q2 Cur Line - load before 1st call for down - set by _up
++@ q8 Cur Line - load before 1st call for up - set by _down
++@
++@ Temps
++@ r5 Loop counter
++@ r12
++@ d0, q1, q12-q15
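++@
++@ Generated columns rotate through q12-q15; store_tran_c_4x4_10 then writes
++@ them out transposed via interleaved vst4.32 lane stores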
++
++patch_h_down_c_4x4_10:
++ mov r5, #4
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q2
++ vext.32 q2, q2, #1
++ sub r6, #32
++ vld1.32 {d5[1]}, [r2]!
++1:
++ rsb r12, r6, #32
++ vmov q12, q13
++ vmov s0, r6
++ vmov s1, r12
++ vmov q13, q14
++
++ vmul.u16 q3, q2, d0[0]
++ add r6, r4
++ vmla.u16 q3, q8, d0[2]
++ vmov q14, q15
++ subs r5, #1
++ vrshr.u16 q15, q3, #5
++ bne 2b
++
++store_tran_c_4x4_10:
++ add r12, r0, r3
++ vst4.32 {d24[0], d26[0], d28[0], d30[0]}, [r0 ]!
++ add r5, r12, r3
++ vst4.32 {d24[1], d26[1], d28[1], d30[1]}, [r12]
++ add r12, r12, r3, lsl #1
++ vst4.32 {d25[0], d27[0], d29[0], d31[0]}, [r5 ]
++ vst4.32 {d25[1], d27[1], d29[1], d31[1]}, [r12]
++ bx lr
++
++patch_h_up_c_4x4_10:
++ mov r5, #4
++2:
++ cmp r6, #32
++ ble 1f
++
++ @ If r8 is -ve then we are still tracking left
++ adds r8, r7
++ vmov q2, q8
++ @ Initially r2=left (variable), r1=up (const)
++ @ Use r2 for both up and left; we only ever go from left->up, so
++ @ we assume that we are left and then overwrite with up if wanted
++ sub r2, #4
++ addpl r2, r1, r8, asr #6
++ vext.32 q8, q8, #3
++ @ We get *4 by >> 6 rather than 8, but that means we need to lose bits 0 & 1
++ and r2, #~3
++ sub r6, #32
++ vld1.32 d16[0], [r2]
++1:
++ rsb r12, r6, #32
++ vmov q12, q13
++ vmov s0, r6
++ vmov s1, r12
++ vmov q13, q14
++
++ vmul.u16 q1, q2, d0[2]
++ add r6, r4
++ vmla.u16 q1, q8, d0[0]
++ vmov q14, q15
++ subs r5, #1
++ vrshr.u16 q15, q1, #5
++ bne 2b
++ b store_tran_c_4x4_10 @ This will return
++
++
++
++@ ff_hevc_rpi_pred_angular_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ lsl r3, #2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ add r6, r4, #32
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.32 {q2 }, [r2]!
++ bl patch_h_down_c_4x4_10
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++ vld1.32 {q8 }, [r2]
++ bl patch_h_up_c_4x4_10
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #4 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.16 {q8 }, [r1] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ asr r12, r8, #8
++ vmov q12, q8
++ add r8, r7
++ vext.32 q8, q8, q8, #3
++ add r12, r2, r12, lsl #2
++ sub r6, #32
++ vld1.32 {d16[0]}, [r12]
++
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 q1, q8, d0[2]
++ vmla.u16 q1, q12, d0[0]
++ vrshr.u16 q1, #5
++
++ subs r5, #1
++ vst1.16 {q1 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {q12, q13}, [r1] @ Up + UR
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vext.32 q12, q13, #1
++ vext.32 q13, q13, #1
++ sub r6, #32
++
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 q1, q12, d0[0]
++ vmla.u16 q1, q8, d0[2]
++ vrshr.u16 q1, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.16 {q1 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r8, lr}
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ lsl r3, #2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ add r6, r4, #32 @ Force initial load in main loop
++ bge 18f
++
++ cmp r12, #10
++ bge 10f
++
++@ Down of Horizontal - works down left
++ vld1.32 {q2 }, [r2]!
++ mov r1, r2
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10
++
++ vld1.32 {q2 }, [r1]!
++ sub r0, #32
++ add r6, r4, #32 @ Force initial load in main loop
++ add r0, r0, r3, lsl #2
++ mov r2, r1
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++ vld1.32 {q8 }, [r2]
++
++ push {r2, r8}
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10
++ pop {r2, r8}
++
++ sub r0, #32
++ add r2, #16
++ sub r8, r8, r7, lsl #2
++ add r0, r0, r3, lsl #2
++ vld1.32 {q8 }, [r2]
++ add r6, r4, #32
++
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10
++ pop {r4-r8, pc}
++
++18:
++ cmp r12, #26
++ mov r5, #8 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vld1.16 {q8, q9 }, [r1] @ Up
++ ldrh r7, [r7]
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q12, q8
++ asr r12, r8, #8
++ vmov q13, q9
++ add r8, r7
++ vext.32 q9, q8, q9, #3
++ add r12, r2, r12, lsl #2
++ vext.32 q8, q8, q8, #3
++ sub r6, #32
++ vld1.32 {d16[0]}, [r12]
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 q1, q8, d0[2]
++ vmul.u16 q2, q9, d0[2]
++ vmla.u16 q1, q12, d0[0]
++ vmla.u16 q2, q13, d0[0]
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++
++ subs r5, #1
++ vst1.16 {q1, q2 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {q12, q13}, [r1]! @ Up
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vmov q9, q13
++ vext.32 q12, q13, #1
++ vext.32 q13, q14, #1
++ sub r6, #32
++ vld1.32 {d27[1]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 q1, q12, d0[0]
++ vmul.u16 q2, q13, d0[0]
++ vmla.u16 q1, q8, d0[2]
++ vmla.u16 q2, q9, d0[2]
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++
++ add r6, r4
++ subs r5, #1
++ vst1.16 {q1, q2 }, [r0], r3
++ bne 2b
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1
++ ldr r12, [sp, #0]
++ push {r4-r10, lr}
++ vpush {q4 }
++ adrl r4, angle_2 - 2
++ adrl r7, inv_angle - 11*2
++ lsl r3, #2
++ ldrsb r4, [r4, r12]
++ add r7, r7, r12, lsl #1
++
++ cmp r12, #18
++ bge 18f
++
++ cmp r12, #10
++ mov r10, #4 @ Outer loop counter for "hard" cases
++ bge 10f
++
++@ Down of Horizontal - works down left
++ mov r1, r2
++2:
++ vld1.32 {q2 }, [r1]!
++ add r6, r4, #32 @ Force initial load in main loop
++ mov r2, r1
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10
++
++ sub r0, #64
++ subs r10, #1
++ add r0, r0, r3, lsl #2
++ bne 2b
++ b 99f
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7]
++ @ -128 (rather than +128) means we get UL
++ @ from L & don't have to offset U
++ mov r8, #-128
++ sub r8, r7
++2:
++ vld1.32 {q8 }, [r2]
++ add r6, r4, #32
++
++ push {r2, r8}
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10
++ pop {r2, r8}
++
++ sub r0, #64
++ subs r10, #1
++ add r2, #16
++ sub r8, r8, r7, lsl #2
++ add r0, r0, r3, lsl #2
++ bne 2b
++ b 99f
++
++18:
++ cmp r12, #26
++ mov r5, #16 @ Loop counter for the "easy" cases
++ bge 26f
++
++@ Left of vertical - works down left
++ vldm r1, {q8-q11} @ Up
++ ldrh r7, [r7]
++ add r6, r4, #32
++ mov r8, #-128
++
++2:
++ cmp r6, #32
++ ble 1f
++
++ asr r9, r8, #8
++ vmov q12, q8
++ add r8, r7
++ vmov q13, q9
++ add r9, r2, r9, lsl #2
++ vmov q14, q10
++ vmov q15, q11
++ vext.32 q11, q10, q11, #3
++ vext.32 q10, q9, q10, #3
++ vext.32 q9, q8, q9, #3
++ vext.32 q8, q8, q8, #3
++ sub r6, #32
++ vld1.32 {d16[0]}, [r9]
++
++1:
++ vmov s1, r6
++ rsb r12, r6, #32
++ add r6, r4
++ vmov s0, r12
++
++ vmul.u16 q1, q8, d0[2]
++ vmul.u16 q2, q9, d0[2]
++ vmul.u16 q3, q10, d0[2]
++ vmul.u16 q4, q11, d0[2]
++ vmla.u16 q1, q12, d0[0]
++ vmla.u16 q2, q13, d0[0]
++ vmla.u16 q3, q14, d0[0]
++ vmla.u16 q4, q15, d0[0]
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++ vrshr.u16 q3, #5
++ vrshr.u16 q4, #5
++
++ subs r5, #1
++ vstm r0, {q1-q4}
++ add r0, r3
++ bne 2b
++ b 99f
++
++@ Right of vertical - works along top - left unused
++26:
++ vldm r1, {q12-q15} @ Up
++ add r6, r4, #32 @ Force initial load in main loop
++ add r1, #64
++2:
++ cmp r6, #32
++ ble 1f
++
++ vmov q8, q12
++ vmov q9, q13
++ vmov q10, q14
++ vmov q11, q15
++ vext.32 q12, q13, #1
++ vext.32 q13, q14, #1
++ vext.32 q14, q15, #1
++ vext.32 q15, q15, #1
++ sub r6, #32
++ vld1.32 {d31[1]}, [r1]!
++
++1:
++ rsb r12, r6, #32
++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply
++ vmov s1, r12
++
++ vmul.u16 q1, q12, d0[0]
++ vmul.u16 q2, q13, d0[0]
++ vmul.u16 q3, q14, d0[0]
++ vmul.u16 q4, q15, d0[0]
++ vmla.u16 q1, q8, d0[2]
++ vmla.u16 q2, q9, d0[2]
++ vmla.u16 q3, q10, d0[2]
++ vmla.u16 q4, q11, d0[2]
++
++ vrshr.u16 q1, #5
++ vrshr.u16 q2, #5
++ vrshr.u16 q3, #5
++ vrshr.u16 q4, #5
++
++ add r6, r4
++ subs r5, #1
++ vstm r0, {q1-q4}
++ add r0, r3
++ bne 2b
++99:
++ vpop {q4 }
++ pop {r4-r10, pc}
++
++endfunc
++
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
+new file mode 100644
+index 0000000000..af7ba1f45e
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
+@@ -0,0 +1,682 @@
++/*
++ * Copyright (c) 2017 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++
++@ ff_hevc_rpi_pred_dc_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_4_neon_8, export=1
++
++ @ Average the els of top & left
++ ldr r2, [r2]
++ vld1.32 {d0[0]}, [r1]
++ mov r1, #2
++ vmov s1, r2
++ vmov s2, r2
++ vmov.i16 q2, #3
++ add r2, r0, r3
++ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0]
++ lsl r3, #1
++ vmovl.u8 q0, d0
++ vmov.i64 d7, #0xffff
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same)
++ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
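++ @
++ @ Worked example (8 bit, 4x4): with top[] all 100 and left[] all 50,
++ @ dc = (4*100 + 4*50 + 4) >> 3 = 75, the filtered top[i] become
++ @ (100 + 3*75 + 2) >> 2 = 81 and the corner (100 + 50 + 2*75 + 2) >> 2 = 75.
++ @ This smoothing is only done for luma blocks smaller than 32x32 - the
++ @ dc_32 and chroma (_c_) variants below just replicate dc.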
++
++ vmov.i64 d7, #0xff
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #3
++ vmla.i16 q0, q2, d6[0]
++ vdup.8 d6, d6[0]
++ vrshrn.i16 d0, q0, #2
++
++ @ Store top line
++ vst1.32 {d0[0]}, [r0], r3
++
++ @ Store the rest
++ vshr.u64 d1, d0, #5*8
++ vshr.u64 d2, d0, #6*8
++ vshr.u64 d3, d0, #7*8
++ vbif d1, d6, d7
++ vbif d2, d6, d7
++ vst1.32 {d1[0]}, [r2], r3
++ vbif d3, d6, d7
++ vst1.32 {d2[0]}, [r0]
++ vst1.32 {d3[0]}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
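++@
++@ Chroma DC gets no edge smoothing - the average is simply replicated
++@ over the block - hence the much shorter code for the _c_ variants.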
++
++function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {d0}, [r1]
++ vld1.8 {d1}, [r2]
++A add r2, r0, r3, lsl #1
++A lsl r3, #2
++T lsl r3, #1
++T add r2, r0, r3
++T lsl r3, #1
++ vaddl.u8 q0, d0, d1
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d2, d0, d0 @ This adds U & V separately
++ vpadd.i32 d3, d0, d0
++ vrshrn.u16 d0, q1, #3
++
++ @ Store
++ vst1.8 {d0}, [r0], r3
++ vst1.8 {d0}, [r2], r3
++ vst1.8 {d0}, [r0]
++ vst1.8 {d0}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_8_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {d0}, [r1]
++ mov r1, #2
++ vld1.8 {d16}, [r2]
++ vmov.i16 q2, #3
++ vmov.i64 d7, #0xffff
++ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0]
++ vmovl.u8 q0, d0
++ vadd.i16 d6, d2, d3 @ d6 has 4 vals
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vmov.i64 d7, #0xff
++ vmovl.u8 q1, d16
++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #4
++ vmla.i16 q1, q2, d6[0]
++ vmla.i16 q0, q2, d6[0]
++ vdup.8 d6, d6[0]
++ vrshrn.i16 d2, q1, #2
++ vrshrn.i16 d0, q0, #2
++
++ @ Store top line
++ vst1.8 {d0}, [r0], r3
++
++ @ Store the rest
++ vshr.u64 d2, #8
++ vbit d6, d2, d7
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ mov r1, #6
++1:
++ vbit d6, d2, d7
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ subs r1, #2
++ vbit d6, d2, d7
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q8 }, [r1]
++ vld1.8 {q12}, [r2]
++ vaddl.u8 q0, d16, d17
++ vaddl.u8 q2, d24, d25
++ vadd.i16 q0, q2
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ mov r1, #4
++ vpadd.i32 d0, d0 @ This adds U & V separately
++ lsl r3, #1 @ pels
++ vrshrn.u16 d0, q0, #4
++ vdup.u16 q0, d0[0] @ Dup results
++
++ @ Store
++1:
++ vst1.8 {q0 }, [r0], r3
++ subs r1, #1
++ vst1.8 {q0 }, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_16_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 { q8}, [r1]
++ vld1.8 {q12}, [r2]
++ vaddl.u8 q0, d16, d24
++ vaddl.u8 q2, d17, d25
++ vmov.u16 r1, d0[0] @ r1 = top[0] + left[0]
++ vadd.i16 q0, q2
++ vadd.i16 d0, d1 @ d0 has 4 vals
++ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
++ vpadd.i16 d0, d0 @ 1 (all the same)
++ vrshr.u16 d0, #5
++
++ vmov.i64 d31, #0xff
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + dc*2 + 2) >> 2
++
++ vmov.u16 r12, d0[0] @ dc
++ add r2, r12, r12, lsl #1 @ dc*3
++ add r1, r1, r12, lsl #1 @ top[0] + left[0] + dc*2
++
++ vdup.u16 q3, r2
++ vaddw.u8 q1, q3, d16
++ vaddw.u8 q2, q3, d17
++ vmov.u16 d2[0], r1
++ vrshrn.u16 d2, q1, #2
++ vrshrn.u16 d3, q2, #2
++
++ @ Construct lhs pels
++ vaddw.u8 q2, q3, d24
++ vaddw.u8 q3, q3, d25
++ vrshrn.u16 d4, q2, #2
++ vrshrn.u16 d5, q3, #2
++
++ @ Store top line
++ vst1.8 { q1}, [r0], r3
++
++ mov r1, #15
++ vdup.u8 q0, d0[0]
++
++1:
++ vext.8 q2, q2, #1
++ vbit d0, d4, d31
++ subs r1, #1
++ vst1.8 { q0}, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 { q8, q9}, [r1]
++ vld1.8 {q12,q13}, [r2]
++ vaddl.u8 q0, d16, d17
++ vaddl.u8 q1, d18, d19
++ vaddl.u8 q2, d24, d25
++ vaddl.u8 q3, d26, d27
++ vadd.i16 q0, q1
++ vadd.i16 q2, q3
++ vadd.i16 q0, q2
++ lsl r3, #1
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ mov r1, #4
++ vpadd.i32 d0, d0 @ This adds U & V separately
++ add r2, r0, r3
++ vmov d1, d0
++ lsl r3, #1
++ vrshrn.u16 d0, q0, #5
++ vmov d1, d0 @ Dup results
++ vmov q1, q0
++
++ @ Store
++1:
++ vst1.8 { q0, q1}, [r0], r3
++ vst1.8 { q0, q1}, [r2], r3
++ subs r1, #1
++ vst1.8 { q0, q1}, [r0], r3
++ vst1.8 { q0, q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_32_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q8, q9 }, [r1]
++ vld1.8 {q12, q13}, [r2]
++ vaddl.u8 q0, d16, d17
++ vaddl.u8 q1, d18, d19
++ vaddl.u8 q2, d24, d25
++ vaddl.u8 q3, d26, d27
++ vadd.i16 q0, q1
++ vadd.i16 q2, q3
++ vadd.i16 q0, q2
++ vadd.i16 d0, d1 @ d0 has 4 vals
++ mov r1, #8
++ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
++ add r2, r0, r3
++ vpadd.i16 d0, d0 @ 1 (all the same)
++ lsl r3, #1
++ vrshrn.u16 d0, q0, #6
++ vdup.u8 q1, d0[0] @ Dup results
++ vdup.u8 q0, d0[0]
++
++ @ Store
++1:
++ vst1.8 {q0, q1 }, [r0], r3
++ vst1.8 {q0, q1 }, [r2], r3
++ subs r1, #1
++ vst1.8 {q0, q1 }, [r0], r3
++ vst1.8 {q0, q1 }, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ -----------------------------------------------------------------------------
++@
++@ 10 Bit versions
++@
++@ There is no actual bit depth dependency in this code except that at
++@ higher bit depths our intermediate results would overflow the 16 bits
++@ they are stored in. All these functions are good to 10 bits - with the
++@ worst case being in dc_32 where we use all 16 bits.
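++@ (dc_32 sums 64 samples of at most 1023, i.e. at most 65472, which
++@ still fits in a u16.)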
++
++
++@ ff_hevc_rpi_pred_dc_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_4_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {d0}, [r1]
++ mov r1, #2
++ vld1.16 {d1}, [r2]
++T lsl r3, #1
++ vmov.i16 q2, #3
++A add r2, r0, r3, lsl #1
++T add r2, r0, r3
++ vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0]
++A lsl r3, #2
++T lsl r3, #1
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vmov.i64 d7, #0xffff
++ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #3
++ vmla.i16 q0, q2, d6[0]
++ vrshr.u16 q0, #2
++
++ @ Store top line
++ vst1.16 {d0}, [r0], r3
++
++ @ Store the rest
++ vshr.u64 d3, d1, #1*16
++ vshr.u64 d4, d1, #2*16
++ vshr.u64 d5, d1, #3*16
++ vbif d3, d6, d7
++ vbif d4, d6, d7
++ vst1.16 {d3}, [r2], r3
++ vbif d5, d6, d7
++ vst1.16 {d4}, [r0]
++ vst1.16 {d5}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0}, [r1]
++ vld1.8 {q1}, [r2]
++A add r2, r0, r3, lsl #2
++A lsl r3, #3
++T lsl r3, #2
++T add r2, r0, r3
++T lsl r3, #1
++ vadd.i16 q0, q1
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d2, d0, d0 @ This adds U & V separately
++ vpadd.i32 d3, d0, d0
++ vrshr.u16 q0, q1, #3
++
++ vst1.16 {q0}, [r0], r3
++ vst1.16 {q0}, [r2], r3
++ vst1.16 {q0}, [r0]
++ vst1.16 {q0}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_8_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {q0}, [r1]
++ mov r1, #2
++ vld1.16 {q8}, [r2]
++T lsl r3, #1
++ vmov.i16 q2, #3
++A add r2, r0, r3, lsl #1
++T add r2, r0, r3
++ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0]
++A lsl r3, #2
++T lsl r3, #1
++ vmov.i64 d7, #0xffff
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vadd.i16 d6, d2, d3 @ d6 has 4 vals
++ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #4
++ vmla.i16 q8, q2, d6[0]
++ vmla.i16 q0, q2, d6[0]
++ vdup.16 q2, d6[0]
++ vdup.16 q9, d6[0]
++ vrshr.u16 q8, q8, #2
++ vrshr.u16 q0, q0, #2
++ vext.16 q1, q8, q8, #1
++
++ @ Store top line
++ vst1.16 {q0}, [r0], r3
++
++ @ Store the rest
++ vbit d18, d2, d7
++ vst1.16 {q9}, [r2], r3
++ mov r1, #6
++1:
++ vext.16 q8, q8, q8, #2
++ subs r1, #2
++ vext.16 q1, q1, q1, #2
++ vbit d4, d16, d7
++ vst1.16 {q2}, [r0], r3
++ vbit d18, d2, d7
++ vst1.16 {q9}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1
++ vld1.8 { q8, q9 }, [r1]
++ vld1.8 {q12, q13}, [r2]
++ vadd.i16 q8, q9
++ vadd.i16 q12, q13
++ vadd.i16 q8, q12
++ vadd.i16 d16, d17 @ d16 has 2 pairs
++ mov r1, #4
++ vpadd.i32 d16, d16
++ lsl r3, #2 @ stride in pels
++ vrshr.u16 d16, #4
++ vdup.u32 q9, d16[0];
++ vdup.u32 q8, d16[0];
++
++ @ Store
++1:
++ vst1.16 {q8, q9 }, [r0], r3
++ subs r1, #1
++ vst1.16 {q8, q9 }, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_16_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {q8, q9 }, [r1]
++ vld1.16 {q12, q13}, [r2]
++ lsl r3, #1 @ stride given in pels
++ vadd.u16 q0, q8, q12
++ vadd.u16 q2, q9, q13
++ vmov.u16 r1, d0[0] @ r1 = top[0] + left[0]
++ vadd.i16 q0, q2
++ vadd.i16 d0, d1 @ d0 has 4 vals
++ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
++ vpadd.i16 d0, d0 @ 1 (all the same)
++ vrshr.u16 d0, #5
++
++ vmov.i64 d31, #0xffff
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + dc*2 + 2) >> 2
++
++ vmov.u16 r12, d0[0] @ dc
++ add r2, r12, r12, lsl #1 @ dc*3
++ add r1, r1, r12, lsl #1 @ top[0] + left[0] + dc*2
++
++ vdup.u16 q3, r2
++ vadd.u16 q8, q3
++ vadd.u16 q9, q3
++ vmov.u16 d16[0], r1
++ vrshr.u16 q8, #2
++ vrshr.u16 q9, #2
++
++ @ Construct lhs pels
++ vadd.u16 q12, q3
++ vadd.u16 q13, q3
++ vrshr.u16 q12, #2
++ vrshr.u16 q13, #2
++
++ @ Store top line
++ vst1.16 {q8, q9 }, [r0], r3
++
++ mov r1, #15
++ vdup.u16 q1, d0[0]
++ vdup.u16 q0, d0[0]
++
++1:
++ vext.16 q12, q13, #1
++ vext.16 q13, q13, #1
++ vbit d0, d24, d31
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1
++
++ @ Average the els of top & left
++ vldm r1, { q8-q11}
++ vldm r2, {q12-q15}
++ vadd.i16 q8, q9
++ vadd.i16 q10, q11
++ vadd.i16 q12, q13
++ vadd.i16 q14, q15
++ vadd.i16 q8, q10
++ vadd.i16 q12, q14
++ vadd.i16 q8, q12
++ vadd.i16 d16, d17 @ d16 has 2 pairs
++ mov r1, #8
++ vpadd.i32 d16, d16
++ lsl r3, #2 @ stride in pels
++ vrshr.u16 d16, #5
++ vmov d17, d16 @ Dup results
++ vmov q9, q8
++ vmov q10, q8
++ vmov q11, q8
++
++ @ Store
++1:
++ vstm r0, {q8-q11}
++ add r0, r3
++ subs r1, #1
++ vstm r0, {q8-q11}
++ add r0, r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels)
++
++function ff_hevc_rpi_pred_dc_32_neon_10, export=1
++
++ @ Average the els of top & left
++ @ With 10 bits we are (just) safe from overflow in i16
++ vldm r1, { q8-q11}
++ vldm r2, {q12-q15}
++ vadd.i16 q8, q9
++ vadd.i16 q10, q11
++ vadd.i16 q12, q13
++ vadd.i16 q14, q15
++ vadd.i16 q8, q10
++ vadd.i16 q12, q14
++ vadd.i16 q8, q12
++ vadd.i16 d16, d17 @ d16 has 4 vals
++ mov r1, #16
++ vpadd.i16 d16, d16 @ 2 (top & bottom the same)
++ lsl r3, #1 @ stride in pels
++ vpadd.i16 d16, d16 @ 1 (all the same)
++ vrshr.u16 d16, #6
++ vmov d17, d16 @ Dup results
++ vmov q9, q8
++ vmov q10, q8
++ vmov q11, q8
++
++ @ Store
++1:
++ vstm r0, { q8-q11}
++ add r0, r3
++ subs r1, #1
++ vstm r0, { q8-q11}
++ add r0, r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
+new file mode 100644
+index 0000000000..ccf13a081f
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
+@@ -0,0 +1,888 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/*
++ * Horizontal & Vertical special cases of angular intra pred
++ *
++ * Split out because:
++ * Vertical, at least, is relatively common
++ * Much simpler code than the general angular case
++ * Luma with size < 32 has extra filtering that doesn't happen anywhere else
++ *
++ * *** Currently luma filtering is mandatory where it occurs, but there are
++ * cases where it should be turned off (rdpcm & an extension sps flag).
++ * These don't occur in the standard conformance suite for Main Profile
++ */
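++
++@ The luma filtering referred to above is the HEVC boundary smoothing:
++@ for vertical prediction the left column of the block becomes
++@ top[0] + ((left[y] - topleft) >> 1), clipped to the sample range, and
++@ horizontal is the transpose of that (implemented below with halving
++@ subtracts plus saturating or clamped adds).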
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ ff_hevc_rpi_pred_vertical_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_4_neon_8, export=1
++ vld1.32 {d0[0] }, [r1 :32] @ Up
++ ldrb r12, [r2, #-1] @ Up-left
++ vld1.32 {d24[0]}, [r2 :32] @ left
++
++ vdup.8 d4, r12
++ vmov.u8 d6, #128
++ vhsub.u8 d24, d4
++
++ veor.8 d2, d0, d6 @ Make -128,127 so we can qadd
++ mov r1, #4
++ vdup.8 d2, d2[0]
++ vqadd.s8 d24, d2
++ vmov.i64 d4, #0xff
++ veor.8 d24, d6
++
++1:
++ vbit.8 d0, d24, d4
++ vext.8 d24, d24, #1
++ subs r1, #1
++ vst1.32 {d0[0] }, [r0 :32], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_8_neon_8, export=1
++ vld1.8 {d0 }, [r1 :64] @ Up
++ ldrb r12, [r2, #-1] @ Up-left
++ vld1.8 {d24}, [r2 :64] @ left
++
++ vdup.8 d4, r12
++ vmov.u8 d6, #128
++ vhsub.u8 d24, d4
++
++ veor.8 d2, d0, d6 @ Make -128,127 so we can qadd
++ mov r1, #8
++ vdup.8 d2, d2[0]
++ vqadd.s8 d24, d2
++ vmov.i64 d4, #0xff
++ veor.8 d24, d6
++
++1:
++ vbit.8 d0, d24, d4
++ vext.8 d24, d24, #1
++ subs r1, #1
++ vst1.8 {d0 }, [r0 :64], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_16_neon_8, export=1
++ vld1.8 {q0 }, [r1 :128] @ Up
++ ldrb r12, [r2, #-1] @ Up-left
++ vld1.8 {q12}, [r2 :128] @ left
++
++ vdup.8 q2, r12
++ vmov.u8 q3, #128
++ vhsub.u8 q12, q2
++
++ veor.8 d2, d0, d6 @ Make -128,127 so we can qadd
++ vdup.8 q1, d2[0]
++ vqadd.s8 q12, q1
++ veor.8 q12, q3
++
++ vmov.i64 d4, #0xff
++ mov r1, #16
++1:
++ vbit.8 d0, d24, d4
++ vext.8 q12, q12, #1
++ subs r1, #1
++ vst1.8 {q0 }, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
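++@
++@ No boundary filtering at 32x32 (the luma filter only applies to smaller
++@ blocks), so this is just the top row replicated.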
++
++function ff_hevc_rpi_pred_vertical_32_neon_8, export=1
++ vld1.8 {q0, q1 }, [r1 :128] @ Up
++ add r2, r0, r3
++ lsl r3, #1
++ mov r1, #16
++1:
++ vst1.8 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.8 {q0, q1 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1
++ vld1.16 {d0 }, [r1 :64] @ Up
++ add r2, r0, r3, lsl #1
++ lsl r3, #2
++
++ vst1.16 {d0 }, [r0 :64], r3
++ vst1.16 {d0 }, [r2 :64], r3
++ vst1.16 {d0 }, [r0 :64]
++ vst1.16 {d0 }, [r2 :64]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1
++ vld1.16 {q0 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #1
++ lsl r3, #2
++ mov r1, #4
++1:
++ vst1.16 {q0 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q0 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1
++ vld1.16 {q0, q1 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #1
++ lsl r3, #2
++ mov r1, #8
++1:
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++@ ? Might be faster as simple arm
++
++function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1
++ vld1.32 {d0[0] }, [r1 :32] @ Up
++ ldrb r12, [r2, #-1] @ Up-left
++ vld1.32 {d16[0]}, [r2 :32] @ left
++
++ vdup.8 d4, r12
++ vmov.u8 d6, #128
++ vhsub.u8 d0, d4
++
++ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd
++ add r2, r0, r3
++ vdup.8 d2, d2[0]
++ lsl r3, #1
++ vqadd.s8 d0, d2
++ veor.8 d0, d6
++
++ vdup.8 d1, d16[1]
++ vdup.8 d2, d16[2]
++ vdup.8 d3, d16[3]
++ vst1.32 {d0[0] }, [r0 :32], r3
++ vst1.32 {d1[0] }, [r2 :32], r3
++ vst1.32 {d2[0] }, [r0 :32]
++ vst1.32 {d3[0] }, [r2 :32]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1
++ vld1.8 {d0 }, [r1 :64] @ Up
++ ldrb r12, [r2, #-1] @ Up-left
++ vld1.8 {d16}, [r2 :64] @ left
++
++ vdup.8 d4, r12
++ vmov.u8 d6, #128
++ vhsub.u8 d0, d4
++
++ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd
++ add r2, r0, r3
++ vdup.8 d2, d2[0]
++ lsl r3, #1
++ vqadd.s8 d0, d2
++ mov r1, #3
++ veor.8 d0, d6
++
++ vdup.8 d4, d16[1]
++ vst1.8 {d0 }, [r0 :64], r3
++ vst1.8 {d4 }, [r2 :64], r3
++
++1:
++ vext.8 d16, d16, #2
++ subs r1, #1
++ vdup.8 d0, d16[0]
++ vdup.8 d4, d16[1]
++ vst1.8 {d0 }, [r0 :64], r3
++ vst1.8 {d4 }, [r2 :64], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1
++ vld1.8 {q0 }, [r1 :128] @ Up
++ ldrb r12, [r2, #-1] @ Up-left
++ vld1.8 {q8 }, [r2 :128] @ left
++
++ vdup.8 q2, r12
++ vmov.u8 q3, #128
++ vhsub.u8 q0, q2
++
++ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd
++ add r2, r0, r3
++ vdup.8 q1, d2[0]
++ lsl r3, #1
++ vqadd.s8 q0, q1
++ mov r1, #7
++ veor.8 q0, q3
++
++ vdup.8 q2, d16[1]
++ vst1.8 {q0 }, [r0 :128], r3
++ vst1.8 {q2 }, [r2 :128], r3
++
++1:
++ vext.8 q8, q8, #2
++ subs r1, #1
++ vdup.8 q0, d16[0]
++ vdup.8 q2, d16[1]
++ vst1.8 {q0 }, [r0 :128], r3
++ vst1.8 {q2 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1
++ vld1.8 {q8, q9 }, [r2 :128] @ Left
++ add r2, r0, r3
++ lsl r3, #1
++ mov r1, #16
++1:
++ vdup.8 q0, d16[0]
++ vdup.8 q1, d16[0]
++ vdup.8 q2, d16[1]
++ vdup.8 q3, d16[1]
++ vext.8 q8, q9, #2
++ vext.8 q9, q9, #2
++ vst1.8 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.8 {q2, q3 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1
++ vld1.16 {d16}, [r2 :64] @ Left
++ add r2, r0, r3, lsl #1
++ lsl r3, #2
++
++ vdup.16 d0, d16[0]
++ vdup.16 d1, d16[1]
++ vdup.16 d2, d16[2]
++ vdup.16 d3, d16[3]
++
++ vst1.16 {d0 }, [r0 :64], r3
++ vst1.16 {d1 }, [r2 :64], r3
++ vst1.16 {d2 }, [r0 :64]
++ vst1.16 {d3 }, [r2 :64]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1
++ vld1.16 {q8 }, [r2 :128] @ Left
++ add r2, r0, r3, lsl #1
++ lsl r3, #2
++ mov r1, #4
++1:
++ vdup.16 q0, d16[0]
++ vdup.16 q2, d16[1]
++ vext.16 q8, q8, #2
++ vst1.16 {q0 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q2 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1
++ vld1.16 {q8, q9 }, [r2 :128] @ Left
++ add r2, r0, r3, lsl #1
++ lsl r3, #2
++ mov r1, #8
++1:
++ vdup.16 q0, d16[0]
++ vdup.16 q1, d16[0]
++ vdup.16 q2, d16[1]
++ vdup.16 q3, d16[1]
++ vext.16 q8, q9, #2
++ vext.16 q9, q9, #2
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q2, q3 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@------------------------------------------------------------------------------
++@
++@ 10 Bit
++@ Has clipping constants so 10-bit only but could easily be macroed up to
++@ 14-bit before we run out of bits
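++@ (Presumably the 0 / 0x3ff clamp constants below would become 0 / 0x3fff
++@ at 14 bits; the top[0] + ((left - topleft) >> 1) intermediate still
++@ fits in an s16.)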
++
++
++@ ff_hevc_rpi_pred_vertical_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_4_neon_10, export=1
++ vld1.16 {d0 }, [r1 :64] @ Up
++ ldrh r12, [r2, #-2] @ Up-left
++ vld1.16 {d24}, [r2 :64] @ left
++
++ vdup.16 d4, r12
++ lsl r3, #1
++ vhsub.u16 d24, d4
++
++ vdup.16 d6, d0[0]
++ vmov.s16 d4, #0
++ vadd.i16 d24, d6
++
++ vmov.s16 d6, #0x3ff
++ vmax.s16 d24, d4
++ vmov.i64 d4, #0xffff
++ vmin.s16 d24, d6
++
++ mov r1, #4
++1:
++ vbit.8 d0, d24, d4
++ vext.16 d24, d24, #1
++ subs r1, #1
++ vst1.16 {d0 }, [r0 :64], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_8_neon_10, export=1
++ vld1.16 {q0 }, [r1 :128] @ Up
++ ldrh r12, [r2, #-2] @ Up-left
++ vld1.16 {q12}, [r2 :128] @ left
++
++ vdup.16 q2, r12
++ lsl r3, #1
++ vhsub.u16 q12, q2
++
++ vdup.16 q3, d0[0]
++ vmov.s16 q2, #0
++ vadd.i16 q12, q3
++
++ vmov.s16 q3, #0x3ff
++ vmax.s16 q12, q2
++ vmin.s16 q12, q3
++
++ vmov.i64 d4, #0xffff
++ mov r1, #8
++1:
++ vbit.8 d0, d24, d4
++ vext.16 q12, q12, #1
++ subs r1, #1
++ vst1.16 {q0 }, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_16_neon_10, export=1
++ vld1.16 {q0, q1 }, [r1 :128] @ Up
++ ldrh r12, [r2, #-2] @ Up-left
++ vld1.16 {q12, q13}, [r2 :128] @ left
++
++ vdup.16 q2, r12
++ lsl r3, #1
++ vhsub.u16 q12, q2
++ vhsub.u16 q13, q2
++
++ vdup.16 q3, d0[0]
++ vmov.s16 q2, #0
++ vadd.i16 q12, q3
++ vadd.i16 q13, q3
++
++ vmov.s16 q3, #0x3ff
++ vmax.s16 q12, q2
++ vmax.s16 q13, q2
++ vmin.s16 q12, q3
++ vmin.s16 q13, q3
++
++ vmov.i64 d4, #0xffff
++ mov r1, #16
++1:
++ vbit.8 d0, d24, d4
++ vext.16 q12, q13, #1
++ vext.16 q13, q13, #1
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_32_neon_10, export=1
++ vldm r1, { q0-q3 } @ Up
++ mov r1, #32
++1:
++ subs r1, #1
++ vstm r0, { q0-q3 }
++ add r0, r0, r3, lsl #1
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1
++ vld1.16 {q0 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #2
++ lsl r3, #3
++
++ vst1.16 {q0 }, [r0 :128], r3
++ vst1.16 {q0 }, [r2 :128], r3
++ vst1.16 {q0 }, [r0 :128]
++ vst1.16 {q0 }, [r2 :128]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1
++ vld1.16 {q0, q1 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #2
++ lsl r3, #3
++ mov r1, #4
++1:
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1
++ vldm r1, { q0-q3 } @ Up
++ mov r1, #16
++1:
++ subs r1, #1
++ vstm r0, { q0-q3 }
++ add r0, r0, r3, lsl #2
++ bne 1b
++
++ bx lr
++endfunc
++
++@ ff_hevc_rpi_pred_horizontal_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1
++ vld1.16 {d0 }, [r1 :64] @ Up
++ ldrh r12, [r2, #-2] @ Up-left
++ vld1.16 {d16}, [r2 :64] @ left
++
++ vdup.16 d4, r12
++ add r2, r0, r3, lsl #1
++ vhsub.u16 d0, d4
++
++ vdup.16 d6, d16[0]
++ vmov.s16 d4, #0
++ vadd.i16 d0, d6
++
++ vmov.s16 d6, #0x3ff
++ vmax.s16 d0, d4
++ lsl r3, #2
++ vmin.s16 d0, d6
++
++ vdup.16 d1, d16[1]
++ vdup.16 d2, d16[2]
++ vdup.16 d3, d16[3]
++
++ vst1.16 {d0 }, [r0 :64], r3
++ vst1.16 {d1 }, [r2 :64], r3
++ vst1.16 {d2 }, [r0 :64]
++ vst1.16 {d3 }, [r2 :64]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1
++ vld1.16 {q0 }, [r1 :128] @ Up
++ ldrh r12, [r2, #-2] @ Up-left
++ vld1.16 {q8 }, [r2 :128] @ left
++
++ vdup.16 q2, r12
++ add r2, r0, r3, lsl #1
++ vhsub.u16 q0, q2
++
++ vdup.16 q3, d16[0]
++ lsl r3, #2
++ vmov.s16 q2, #0
++ vadd.i16 q0, q3
++
++ mov r1, #3
++ vmov.s16 q3, #0x3ff
++ vmax.s16 q0, q2
++ vmin.s16 q0, q3
++
++ vdup.16 q2, d16[1]
++
++ vst1.16 {q0 }, [r0 :128], r3
++ vst1.16 {q2 }, [r2 :128], r3
++1:
++ vext.16 q8, q8, #2
++ vdup.16 q0, d16[0]
++ vdup.16 q2, d16[1]
++ subs r1, #1
++ vst1.16 {q0 }, [r0 :128], r3
++ vst1.16 {q2 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1
++ vld1.16 {q0, q1 }, [r1 :128] @ Up
++ ldrh r12, [r2, #-2] @ Up-left
++ vld1.16 {q8, q9 }, [r2 :128] @ left
++
++
++ vdup.16 q2, r12
++ add r2, r0, r3, lsl #1
++ vhsub.u16 q0, q2
++ vhsub.u16 q1, q2
++
++ vdup.16 q3, d16[0]
++ lsl r3, #2
++ vmov.s16 q2, #0
++ vadd.i16 q0, q3
++ vadd.i16 q1, q3
++
++ mov r1, #7
++ vmov.s16 q3, #0x3ff
++ vmax.s16 q0, q2
++ vmax.s16 q1, q2
++ vmin.s16 q0, q3
++ vmin.s16 q1, q3
++
++ vdup.16 q2, d16[1]
++ vdup.16 q3, d16[1]
++
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ vst1.16 {q2, q3 }, [r2 :128], r3
++1:
++ vext.16 q8, q9, #2
++ vext.16 q9, q9, #2
++ vdup.16 q0, d16[0]
++ vdup.16 q1, d16[0]
++ vdup.16 q2, d16[1]
++ vdup.16 q3, d16[1]
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ vst1.16 {q2, q3 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1
++ vldm r2, { q8-q11}
++ mov r1, #16
++1:
++ vdup.16 q0, d16[0]
++ vdup.16 q1, d16[0]
++ vdup.16 q2, d16[0]
++ vdup.16 q3, d16[0]
++ add r2, r0, r3, lsl #1
++ vdup.16 q12, d16[1]
++ vdup.16 q13, d16[1]
++ vdup.16 q14, d16[1]
++ vdup.16 q15, d16[1]
++ vstm r0, { q0-q3 }
++ vstm r2, {q12-q15}
++
++ vext.16 q8, q9, #2
++ vext.16 q9, q10, #2
++ add r0, r0, r3, lsl #2
++ vext.16 q10, q11, #2
++ subs r1, #1
++ vext.16 q11, q11, #2
++
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1
++ vld1.16 {q8 }, [r2 :128] @ Left
++ add r2, r0, r3, lsl #2
++ lsl r3, #3
++
++ vdup.32 q0, d16[0]
++ vdup.32 q1, d16[1]
++ vdup.32 q2, d17[0]
++ vdup.32 q3, d17[1]
++
++ vst1.32 {q0 }, [r0 :128], r3
++ vst1.16 {q1 }, [r2 :128], r3
++ vst1.32 {q2 }, [r0 :128]
++ vst1.16 {q3 }, [r2 :128]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1
++ vld1.16 {q8, q9 }, [r2 :128] @ Left
++ add r2, r0, r3, lsl #2
++ lsl r3, #3
++ mov r1, #4
++1:
++ vdup.32 q0, d16[0]
++ vdup.32 q1, d16[0]
++ vdup.32 q2, d16[1]
++ vdup.32 q3, d16[1]
++ vext.32 q8, q9, #2
++ vext.32 q9, q9, #2
++ vst1.32 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.32 {q2, q3 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1
++ vldm r2, { q8-q11}
++ mov r1, #8
++1:
++ vdup.32 q0, d16[0]
++ vdup.32 q1, d16[0]
++ vdup.32 q2, d16[0]
++ vdup.32 q3, d16[0]
++ add r2, r0, r3, lsl #2
++ vdup.32 q12, d16[1]
++ vdup.32 q13, d16[1]
++ vdup.32 q14, d16[1]
++ vdup.32 q15, d16[1]
++ vstm r0, { q0-q3 }
++ vstm r2, {q12-q15}
++
++ vext.32 q8, q9, #2
++ vext.32 q9, q10, #2
++ add r0, r0, r3, lsl #3
++ vext.32 q10, q11, #2
++ subs r1, #1
++ vext.32 q11, q11, #2
++
++ bne 1b
++
++ bx lr
++endfunc
++
++
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
+new file mode 100644
+index 0000000000..9fb3633862
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
+@@ -0,0 +1,930 @@
++/*
++ * Copyright (c) 2017 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ Planar intra pred (8.4.4.2.4)
++@
++@ predSamples[ x ][ y ] =
++@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] +
++@ ( x + 1 ) * p[ nTbS ][ -1 ] +
++@ ( nTbS - 1 - y ) * p[ x ][ -1 ] +
++@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 )
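++@
++@ The loops below evaluate this incrementally rather than per sample: an
++@ accumulator starts as (x+1)*p[nTbS][-1] + nTbS*p[x][-1], each row adds
++@ (p[-1][nTbS] - p[x][-1]) to account for y increasing, the
++@ (nTbS-1-x)*p[-1][y] term is a fresh multiply per row and the final
++@ "+ nTbS" rounding comes from the vrshrn/vrshr.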
++
++@ ff_hevc_rpi_pred_planar_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_4_neon_8, export=1
++ adr r12, nb_3_0_1_4
++ vld1.8 {d24}, [r2] @ Left
++ vld1.8 {d0 }, [r1] @ Up
++ vld1.8 {q8 }, [r12 :128] @ 3..
++
++ vdup.8 d30, d24[4]
++ vdup.8 d31, d0[4]
++
++ vdup.32 d0, d0[0] @ copy lo -> hi
++ vsubl.u8 q2, d30, d0 @ Add set up
++
++ vshll.u8 q0, d0, #2
++ add r1, r0, r3
++ vmlal.u8 q0, d17, d31 @ Acc set up - q8-q9 free
++
++ vshl.i16 q3, q2, #1
++ vadd.i16 d0, d4
++ vadd.i16 d1, d6
++ lsl r3, #1
++ vadd.i16 q1, q0, q3
++
++ vdup.u8 d20, d24[0]
++ vdup.u8 d21, d24[1]
++ vdup.u8 d22, d24[2]
++ vdup.u8 d23, d24[3]
++
++ vtrn.32 d20, d21
++ vtrn.32 d22, d23
++
++ vmull.u8 q10, d16, d20
++ vmull.u8 q11, d16, d22
++ vadd.i16 q10, q0
++ vadd.i16 q11, q1
++
++ vrshrn.u16 d28, q10, #3
++ vrshrn.u16 d29, q11, #3
++
++ vst1.32 {d28[0]}, [r0 :32], r3
++ vst1.32 {d28[1]}, [r1 :32], r3
++ vst1.32 {d29[0]}, [r0 :32]
++ vst1.32 {d29[1]}, [r1 :32]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_8_neon_8, export=1
++ adr r12, nb_7_0_1_8
++ vld1.8 {q12}, [r2] @ Left
++ vld1.8 {q0 }, [r1] @ Up
++ vld1.8 {q8 }, [r12 :128] @ 7..
++
++ vdup.8 d30, d25[0]
++ vdup.8 d31, d1[0]
++
++ mov r1, #8
++ vsubl.u8 q2, d30, d0 @ Add set up
++
++ vshll.u8 q0, d0, #3
++ vmlal.u8 q0, d17, d31 @ Acc set up - q8-q9 free
++
++@ u8 7..0 [1] d16
++@ u8 left[y] [1] d24
++@ u16 acc [2] q0 .. q1 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [2] q2 .. q3 = p[-1][nTbs] - p[x][-1]
++1:
++ vadd.i16 q0, q2
++
++ vdup.u8 d20, d24[0]
++ vext.8 d24, d24, #1
++
++ vmull.u8 q10, d16, d20
++ vadd.i16 q10, q0
++
++ vrshrn.u16 d28, q10, #4
++
++ subs r1, #1
++ vst1.8 {d28}, [r0 :64], r3
++
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_16_neon_8, export=1
++ vld1.8 {q12}, [r2 :128] @ Left
++ ldrb r2, [r2, #16] @ Down left - could have this in q13, but avoid that much overread
++ adr r12, nb_15_0_1_16
++ vld1.8 {q0 }, [r1 :128] @ Up
++ ldrb r1, [r1, #16] @ Up-right
++ vld1.8 {q8, q9 }, [r12 :128] @ 15...
++
++ vdup.8 d30, r2
++ vdup.8 d31, r1
++
++ mov r1, #16
++ vsubl.u8 q3, d30, d1
++ vsubl.u8 q2, d30, d0 @ Add set up
++
++ vshll.u8 q1, d1, #4
++ vshll.u8 q0, d0, #4
++ vmlal.u8 q1, d19, d31
++ vmlal.u8 q0, d18, d31 @ Acc set up - q8-q9 free
++
++@ u8 15..0 [1] q8
++@ u8 left[y] [1] q12
++@ u16 acc [2] q0 .. q1 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [2] q2 .. q3 = p[-1][nTbs] - p[x][-1]
++1:
++ vadd.i16 q1, q3
++ vadd.i16 q0, q2
++
++ vdup.u8 d20, d24[0]
++ vext.8 q12, q12, #1
++
++ vmull.u8 q11, d17, d20
++ vmull.u8 q10, d16, d20
++
++ vadd.i16 q11, q1
++ vadd.i16 q10, q0
++
++ vrshrn.u16 d29, q11, #5
++ vrshrn.u16 d28, q10, #5
++
++ subs r1, #1
++ vst1.8 {q14}, [r0 :128], r3
++
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_32_neon_8, export=1
++ vpush {q4-q7}
++ vld1.8 {q12, q13}, [r2 :128]! @ Left
++ adr r12, nb_31_0_1_32
++ vld1.8 {q0, q1 }, [r1 :128]! @ Up
++ vld1.8 {d30[0]}, [r2] @ Down left
++ vld1.8 {d31[0]}, [r1] @ Up-right
++ vldm r12, { q8-q11} @ 1..32, 31..0
++
++ vdup.8 d30, d30[0]
++ vdup.8 d31, d31[0]
++
++ vsubl.u8 q7, d30, d3
++ vsubl.u8 q6, d30, d2
++ vsubl.u8 q5, d30, d1
++ vsubl.u8 q4, d30, d0 @ Add set up
++
++ vshll.u8 q3, d3, #5
++ vshll.u8 q2, d2, #5
++ vshll.u8 q1, d1, #5
++ vshll.u8 q0, d0, #5
++ vmlal.u8 q3, d23, d31
++ vmlal.u8 q2, d22, d31
++ vmlal.u8 q1, d21, d31
++ vmlal.u8 q0, d20, d31 @ Acc set up - q8-q9 free
++
++ mov r1, #32
++
++@ u8 31..0 [2] q10, q11
++@ u8 left[y] [2] q12, q13
++@ u16 acc [4] q0 .. q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [4] q4 .. q7 = p[-1][nTbs] - p[x][-1]
++1:
++ vadd.i16 q3, q7
++ vadd.i16 q2, q6
++ vadd.i16 q1, q5
++ vadd.i16 q0, q4
++
++ vdup.u8 d20, d24[0]
++ vext.8 q12, q13, #1
++ vext.8 q13, q13, #1
++
++ vmull.u8 q15, d19, d20
++ vmull.u8 q14, d18, d20
++ vmull.u8 q11, d17, d20
++ vmull.u8 q10, d16, d20
++
++ vadd.i16 q15, q3
++ vadd.i16 q14, q2
++ vadd.i16 q11, q1
++ vadd.i16 q10, q0
++
++ vrshrn.u16 d31, q15, #6
++ vrshrn.u16 d30, q14, #6
++ vrshrn.u16 d29, q11, #6
++ vrshrn.u16 d28, q10, #6
++
++ subs r1, #1
++ vst1.8 {q14, q15}, [r0 :128], r3
++
++ bne 1b
++
++ vpop {q4-q7}
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1
++ vld1.8 {q12}, [r2 :64] @ Left + down-left - <1d of overread is OK
++ adr r12, nbx2_3_0_1_4
++ vld1.8 {q0 }, [r1 :64] @ Up + up right
++ vld1.8 {q8 }, [r12 :128] @ 3,3..
++
++ vdup.16 d30, d25[0]
++ vdup.16 d31, d1[0]
++
++ mov r1, #4
++ vsubl.u8 q2, d30, d0 @ Add set up
++
++ lsl r3, #1
++ vshll.u8 q0, d0, #2
++ vmlal.u8 q0, d17, d31 @ Acc set up - q8-q9 free
++
++@ u8 3,3..0,0 [1] d16
++@ u8 left[y] [1] d24
++@ u16 acc [1] q0 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [1] q2 = p[-1][nTbs] - p[x][-1]
++1:
++ vadd.i16 q0, q2
++
++ vdup.u16 d20, d24[0]
++ vext.16 d24, d24, #1
++
++ vmull.u8 q10, d16, d20
++
++ vadd.i16 q10, q0
++
++ vrshrn.u16 d28, q10, #3
++
++ subs r1, #1
++ vst1.8 {d28}, [r0 :64], r3
++
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1
++ vld1.8 {q12}, [r2 :128] @ Left
++ ldrh r2, [r2, #16] @ Down left - could have this in q13, but avoid that much overread
++ adr r12, nbx2_7_0_1_8
++ vld1.8 {q0 }, [r1 :128] @ Up
++ ldrh r1, [r1, #16] @ Up-right
++ vld1.8 {q8, q9 }, [r12 :128] @ 7,7...
++
++ vdup.16 d30, r2
++ vdup.16 d31, r1
++
++ mov r1, #8
++ vsubl.u8 q3, d30, d1
++ vsubl.u8 q2, d30, d0 @ Add set up
++
++ lsl r3, #1
++ vshll.u8 q1, d1, #3
++ vshll.u8 q0, d0, #3
++ vmlal.u8 q1, d19, d31
++ vmlal.u8 q0, d18, d31 @ Acc set up - q8-q9 free
++
++@ u8 7,7..0,0 [1] q8
++@ u8 left[y] [1] q12
++@ u16 acc [2] q0 .. q1 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [2] q2 .. q3 = p[-1][nTbs] - p[x][-1]
++1:
++ vadd.i16 q1, q3
++ vadd.i16 q0, q2
++
++ vdup.u16 d20, d24[0]
++ vext.16 q12, q12, #1
++
++ vmull.u8 q11, d17, d20
++ vmull.u8 q10, d16, d20
++
++ vadd.i16 q11, q1
++ vadd.i16 q10, q0
++
++ vrshrn.u16 d29, q11, #4
++ vrshrn.u16 d28, q10, #4
++
++ subs r1, #1
++ vst1.8 {q14}, [r0 :128], r3
++
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++
++@ ff_hevc_rpi_pred_planar_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1
++ vpush {q4-q7}
++ vld1.8 {q12, q13}, [r2 :128]! @ Left
++ adr r12, nbx2_15_0_1_16
++ vld1.8 {q0, q1 }, [r1 :128]! @ Up
++ vld1.16 {d30[0]}, [r2] @ Down left
++ vld1.16 {d31[0]}, [r1] @ Up-right
++ vldm r12, { q8-q11} @ 1..32, 31..0
++
++ vdup.16 d30, d30[0]
++ vdup.16 d31, d31[0]
++
++ mov r1, #16
++ vsubl.u8 q7, d30, d3
++ vsubl.u8 q6, d30, d2
++ vsubl.u8 q5, d30, d1
++ vsubl.u8 q4, d30, d0 @ Add set up
++
++ lsl r3, #1
++ vshll.u8 q3, d3, #4
++ vshll.u8 q2, d2, #4
++ vshll.u8 q1, d1, #4
++ vshll.u8 q0, d0, #4
++ vmlal.u8 q3, d23, d31
++ vmlal.u8 q2, d22, d31
++ vmlal.u8 q1, d21, d31
++ vmlal.u8 q0, d20, d31 @ Acc set up - q8-q9 free
++
++@ u8 31..0 [2] q10, q11
++@ u8 left[y] [2] q12, q13
++@ u16 acc [4] q0 .. q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [4] q4 .. q7 = p[-1][nTbs] - p[x][-1]
++1:
++ vadd.i16 q3, q7
++ vadd.i16 q2, q6
++ vadd.i16 q1, q5
++ vadd.i16 q0, q4
++
++ vdup.u16 d20, d24[0]
++ vext.16 q12, q13, #1
++ vext.16 q13, q13, #1
++
++ vmull.u8 q15, d19, d20
++ vmull.u8 q14, d18, d20
++ vmull.u8 q11, d17, d20
++ vmull.u8 q10, d16, d20
++
++ vadd.i16 q15, q3
++ vadd.i16 q14, q2
++ vadd.i16 q11, q1
++ vadd.i16 q10, q0
++
++ vrshrn.u16 d31, q15, #5
++ vrshrn.u16 d30, q14, #5
++ vrshrn.u16 d29, q11, #5
++ vrshrn.u16 d28, q10, #5
++
++ subs r1, #1
++ vst1.8 {q14, q15}, [r0 :256], r3
++
++ bne 1b
++
++ vpop {q4-q7}
++ bx lr
++
++endfunc
++
++@------------------------------------------------------------------------------
++@
++@ Data - put between the 2 code lumps so we can reach it with an adr from both
++@ Beware - it gets quite close which is why nb_3_0_1_4 is 1st...
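++@
++@ Table naming: nb_A_0_1_B holds the byte coefficients A..0 followed by
++@ 1..B (the 4x4 one repeats the pattern to fill a d register), nbh_ is
++@ the 16-bit equivalent and nbx2_ doubles every entry up for the
++@ interleaved chroma cases.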
++
++ .text
++ .balign 64
++
++ @ These could be extracted from the above array, but separated
++ @ out for better (16 byte) alignment
++nb_3_0_1_4:
++ .byte 3, 2, 1, 0, 3, 2, 1, 0
++ .byte 1, 2, 3, 4, 1, 2, 3, 4
++nb_7_0_1_8:
++ .byte 7, 6, 5, 4, 3, 2, 1, 0
++ .byte 1, 2, 3, 4, 5, 6, 7, 8
++nbh_3_0_1_4:
++ .short 3, 2, 1, 0, 1, 2, 3, 4
++nbx2_3_0_1_4:
++ .byte 3, 3, 2, 2, 1, 1, 0, 0
++ .byte 1, 1, 2, 2, 3, 3, 4, 4
++
++ @ should be back on a 64-byte boundary here
++nb_31_0_1_32:
++ .byte 31, 30, 29, 28, 27, 26, 25, 24
++ .byte 23, 22, 21, 20, 19, 18, 17, 16
++nb_15_0_1_16:
++ .byte 15, 14, 13, 12, 11, 10, 9, 8
++ .byte 7, 6, 5, 4, 3, 2, 1, 0
++ .byte 1, 2, 3, 4, 5, 6, 7, 8
++ .byte 9, 10, 11, 12, 13, 14, 15, 16
++ .byte 17, 18, 19, 20, 21, 22, 23, 24
++ .byte 25, 26, 27, 28, 29, 30, 31, 32
++
++ @ should be back on a 64-byte boundary here
++nbx2_15_0_1_16:
++ .byte 15, 15, 14, 14, 13, 13, 12, 12
++ .byte 11, 11, 10, 10, 9, 9, 8, 8
++nbx2_7_0_1_8:
++ .byte 7, 7, 6, 6, 5, 5, 4, 4
++ .byte 3, 3, 2, 2, 1, 1, 0, 0
++ .byte 1, 1, 2, 2, 3, 3, 4, 4
++ .byte 5, 5, 6, 6, 7, 7, 8, 8
++ .byte 9, 9, 10, 10, 11, 11, 12, 12
++ .byte 13, 13, 14, 14, 15, 15, 16, 16
++
++@------------------------------------------------------------------------------
++@
++@ 10 bits
++@ (all would work with 9)
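++@ (The coefficients of the four terms sum to 2*nTbS, so the worst case
++@ pre-shift value for the 32x32 block is 64*1023 + 32 = 65504 - just
++@ inside a u16.)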
++
++@ ff_hevc_rpi_pred_planar_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_4_neon_10, export=1
++ @ The one size where we do use a short table (nbh_3_0_1_4) - at 4x4 it
++ @ is no bigger than loading bytes and expanding
++ adr r12, nbh_3_0_1_4
++ vld1.16 {q14}, [r2 :64]
++ vld1.16 {q8 }, [r12 :128] @ 3..0,1,..4
++ vld1.16 {q12}, [r1 :64] @ Up
++ vdup.16 d2, d29[0]
++
++ lsl r3, #1
++ vsub.i16 d4, d2, d24 @ Add set up
++
++ vdup.16 d0, d25[0]
++ vshl.i16 d24, #2
++ vmla.i16 d24, d17, d0 @ Acc set up
++ add r1, r0, r3
++ vmov d17, d16
++
++ vadd.i16 d24, d4
++ vadd.i16 d25, d24, d4
++ vshl.i16 d4, d4, #1 @ x2
++ lsl r3, #1
++ vadd.i16 d26, d24, d4
++ vadd.i16 d27, d25, d4
++
++ vdup.16 d0, d28[0]
++ vdup.16 d1, d28[1]
++ vdup.16 d2, d28[2]
++ vdup.16 d3, d28[3]
++
++ vmul.i16 q0, q8, q0
++ vmul.i16 q1, q8, q1
++ vadd.i16 q0, q12
++ vadd.i16 q1, q13
++
++ vrshr.u16 q0, #3
++ vrshr.u16 q1, #3
++
++ vst1.16 {d0}, [r0], r3
++ vst1.16 {d1}, [r1], r3
++ vst1.16 {d2}, [r0]
++ vst1.16 {d3}, [r1]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_8_neon_10, export=1
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr r12, nb_7_0_1_8
++ vld1.16 {q14}, [r2 :128]
++ ldrh r2, [r2, #16] @ Down left
++ vld1.8 {q0 }, [r12 :128] @ 7..0,1,..8
++ vld1.16 {q12}, [r1 :128] @ Up
++ ldrh r1, [r1, #16] @ Up-right
++ vmovl.u8 q8, d1
++ vdup.16 q1, r2
++ vmovl.u8 q10, d0
++
++ lsl r3, #1
++ vsub.i16 q2, q1, q12 @ Add set up
++
++ vdup.16 q0, r1
++ mov r1, #8
++ vshl.i16 q12, #3
++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free
++
++@ u16 15..0 [1] q10
++@ u32 left[y] [1] q14
++@ u16 acc [1] q12 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [1] q2 = p[-1][nTbs] - p[x][-1]
++1:
++ vdup.16 q0, d28[0]
++ vext.16 q14, q14, #1
++
++ vadd.i16 q12, q2
++
++ vmul.i16 q0, q10, q0
++ vadd.i16 q0, q12
++ vrshr.u16 q0, #4
++
++ subs r1, #1
++ vst1.16 {q0 }, [r0 :128], r3
++
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_16_neon_10, export=1
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr r12, nb_15_0_1_16
++ vld1.16 {q14, q15}, [r2 :128]
++ ldrh r2, [r2, #32] @ Down left
++ vld1.8 {q0, q1 }, [r12 :128] @ 15..0,1,..16
++ vld1.16 {q12, q13}, [r1 :128] @ Up
++ ldrh r1, [r1, #32] @ Up-right
++ vmovl.u8 q9, d3
++ vmovl.u8 q8, d2
++ vdup.16 q1, r2
++ vmovl.u8 q11, d1
++ vmovl.u8 q10, d0
++
++ lsl r3, #1
++ vsub.i16 q3, q1, q13
++ vsub.i16 q2, q1, q12 @ Add set up
++
++ vdup.16 q0, r1
++ mov r1, #16
++ vshl.i16 q13, #4
++ vshl.i16 q12, #4
++ vmla.i16 q13, q9, q0
++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free
++
++@ u16 15..0 [2] q10..q11
++@ u32 left[y] [2] q14..q15
++@ u16 acc [2] q12..q13 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [2] q2..q3 = p[-1][nTbs] - p[x][-1]
++1:
++ vdup.16 q0, d28[0]
++ vext.16 q14, q15, #1
++ vext.16 q15, q15, #1
++
++ vadd.i16 q13, q3
++ vadd.i16 q12, q2
++
++ vmul.i16 q1, q11, q0
++ vmul.i16 q0, q10, q0
++
++ vadd.i16 q1, q13
++ vadd.i16 q0, q12
++
++ vrshr.u16 q1, #5
++ vrshr.u16 q0, #5
++
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r0 :128], r3
++
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_32_neon_10, export=1
++ push {r4, lr}
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr r12, nb_31_0_1_32
++ vpush { q4-q7 }
++ vldm r12, { q0-q3 } @ 1..32, r12 points at 31..0
++ vldm r1!, {q12-q15} @ Up
++ ldrh r12, [r2, #64] @ Down left
++ vmovl.u8 q8, d4
++ vmovl.u8 q9, d5
++ vmovl.u8 q10, d6
++ vmovl.u8 q11, d7
++ vdup.16 q3, r12
++ vld1.16 {d4[0]}, [r1] @ Up-right
++
++ vsub.i16 q7, q3, q15
++ vsub.i16 q6, q3, q14
++ vsub.i16 q5, q3, q13
++ vsub.i16 q4, q3, q12 @ Add set up
++
++ vshl.i16 q15, #5
++ vshl.i16 q14, #5
++ vshl.i16 q13, #5
++ vshl.i16 q12, #5
++ vmla.i16 q15, q11, d4[0]
++ vmla.i16 q14, q10, d4[0]
++ vmla.i16 q13, q9, d4[0]
++ vmla.i16 q12, q8, d4[0] @ Acc set up - q8-q11 free
++
++ mov r1, #32
++ vmovl.u8 q8, d0
++ vmovl.u8 q9, d1
++ vmovl.u8 q10, d2
++ vmovl.u8 q11, d3
++
++@ u8 31..0 [4] q8..q11
++@ u8 left[y] [4] [r2]
++@ u16 acc [4] q12..q15 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [4] q4..q7 = p[-1][nTbs] - p[x][-1]
++1:
++ vld1.16 {d0[0]}, [r2]!
++
++ vadd.i16 q15, q7
++ vadd.i16 q14, q6
++ vadd.i16 q13, q5
++ vadd.i16 q12, q4
++
++ vmul.i16 q3, q11, d0[0]
++ vmul.i16 q2, q10, d0[0]
++ vmul.i16 q1, q9, d0[0]
++ vmul.i16 q0, q8, d0[0]
++
++ vadd.i16 q3, q15
++ vadd.i16 q2, q14
++ vadd.i16 q1, q13
++ vadd.i16 q0, q12
++
++ vrshr.u16 q3, #6
++ vrshr.u16 q2, #6
++ vrshr.u16 q1, #6
++ vrshr.u16 q0, #6
++
++ subs r1, #1
++ vstm r0, { q0-q3 }
++ add r0, r0, r3, lsl #1
++
++ bne 1b
++
++ vpop {q4-q7}
++ pop {r4, pc}
++
++endfunc
++
++@ ff_hevc_rpi_pred_planar_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr r12, nbx2_3_0_1_4
++ vld1.8 {q0 }, [r12 :128] @ 3,3..0,0,1,1..4,4
++ vld1.16 {q14}, [r2 :128] @ left
++ ldr r12, [r2, #16] @ Down left
++ vld1.16 {q12}, [r1 :128] @ Up
++ vmovl.u8 q8, d1
++ vdup.32 q1, r12
++ ldr r12, [r1, #16] @ Up-right
++ vmovl.u8 q10, d0
++
++ lsl r3, #2
++ vsub.i16 q2, q1, q12 @ Add set up
++
++ mov r1, #4
++ vdup.32 q0, r12
++ vshl.i16 q12, #2
++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free
++
++@ u16 3,3..0,0 [1] q10
++@ u32 left[y] [1] q14
++@ u16 acc [1] q12 = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
++@ u16 add [1] q2 = p[-1][nTbs] - p[x][-1]
++1:
++ vdup.32 q0, d28[0]
++ vext.32 q14, q14, #1
++
++ vadd.i16 q12, q2
++
++ vmul.i16 q0, q10, q0
++
++ vadd.i16 q0, q12
++
++ vrshr.u16 q0, #3
++
++ subs r1, #1
++ vst1.16 {q0 }, [r0 :128], r3
++
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr r12, nbx2_7_0_1_8
++ vld1.8 {q0, q1 }, [r12 :128] @ 7,7..0,0,1,1..8,8
++ vld1.16 {q14, q15}, [r2 :128]
++ ldr r12, [r2, #32] @ Down left
++ vld1.16 {q12, q13}, [r1 :128] @ Up
++ vmovl.u8 q9, d3
++ vmovl.u8 q8, d2
++ vdup.32 q1, r12
++ ldr r12, [r1, #32] @ Up-right
++ vmovl.u8 q11, d1
++ vmovl.u8 q10, d0
++
++ lsl r3, #2
++ vsub.i16 q3, q1, q13
++ vsub.i16 q2, q1, q12 @ Add set up
++
++ mov r1, #8
++ vdup.32 q0, r12
++ vshl.i16 q13, #3
++ vshl.i16 q12, #3
++ vmla.i16 q13, q9, q0
++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free
++
++@ u16 7,7..0,0 [2] q10..q11
++@ u32 left[y] [2] q14..q15
++@ u16 acc [2] q12..q13 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add [2] q2..q3 = p[-1][nTbs] - p[x][-1]
++1:
++ vdup.32 q0, d28[0]
++ vext.32 q14, q15, #1
++ vext.32 q15, q15, #1
++
++ vadd.i16 q13, q3
++ vadd.i16 q12, q2
++
++ vmul.i16 q1, q11, q0
++ vmul.i16 q0, q10, q0
++
++ vadd.i16 q1, q13
++ vadd.i16 q0, q12
++
++ vrshr.u16 q1, #4
++ vrshr.u16 q0, #4
++
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r0 :256], r3
++
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr r12, nbx2_15_0_1_16
++ vpush { q4-q7 }
++ vldm r12, { q0-q3 } @ 15,15..0,0,1,1..16,16
++ vldm r1!, {q12-q15} @ Up
++ ldr r12, [r2, #64] @ Down left
++ vmovl.u8 q11, d7
++ vmovl.u8 q10, d6
++ vmovl.u8 q9, d5
++ vmovl.u8 q8, d4
++ vdup.32 q3, r12
++ ldr r12, [r1] @ Up-right
++
++ vsub.i16 q7, q3, q15
++ vsub.i16 q6, q3, q14
++ vsub.i16 q5, q3, q13
++ vsub.i16 q4, q3, q12 @ Add set up
++
++ vdup.32 q2, r12
++ vshl.i16 q15, #4
++ vshl.i16 q14, #4
++ vshl.i16 q13, #4
++ vshl.i16 q12, #4
++ vmla.i16 q15, q11, q2
++ vmla.i16 q14, q10, q2
++ vmla.i16 q13, q9, q2
++ vmla.i16 q12, q8, q2 @ Acc set up - q8-q11 free
++
++ mov r1, #16
++ vmovl.u8 q11, d3
++ vmovl.u8 q10, d2
++ vmovl.u8 q9, d1
++ vmovl.u8 q8, d0
++
++@ u16 15,15..0,0 [4] q8..q11
++@ u32 left[y] [4] [r2]
++@ u16 acc [4] q12..q15 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add [4] q4..q7 = p[-1][nTbs] - p[x][-1]
++1:
++ ldr r12, [r2], #4
++
++ vadd.i16 q15, q7
++ vadd.i16 q14, q6
++ vdup.32 q0, r12
++ vadd.i16 q13, q5
++ vadd.i16 q12, q4
++
++ vmul.i16 q3, q11, q0
++ vmul.i16 q2, q10, q0
++ vmul.i16 q1, q9, q0
++ vmul.i16 q0, q8, q0
++
++ vadd.i16 q3, q15
++ vadd.i16 q2, q14
++ vadd.i16 q1, q13
++ vadd.i16 q0, q12
++
++ vrshr.u16 q3, #5
++ vrshr.u16 q2, #5
++ vrshr.u16 q1, #5
++ vrshr.u16 q0, #5
++
++ subs r1, #1
++ vstm r0, { q0-q3 }
++ add r0, r0, r3, lsl #2
++
++ bne 1b
++
++ vpop {q4-q7}
++ bx lr
++endfunc
++
++
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index fb0c6fae70..9f2ebb16f3 100644
--- a/libavcodec/avcodec.h
@@ -10034,10 +15529,10 @@ index 0000000000..0aee673d8b
+#endif /* AVCODEC_RPI_HEVC_DATA_H */
diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c
new file mode 100644
-index 0000000000..a8601da4e7
+index 0000000000..4bfa000da4
--- /dev/null
+++ b/libavcodec/rpi_hevc_filter.c
-@@ -0,0 +1,1165 @@
+@@ -0,0 +1,1236 @@
+/*
+ * HEVC video decoder
+ *
@@ -10599,6 +16094,15 @@ index 0000000000..a8601da4e7
+#endif
+}
+
++// When bits are delivered to deblock we want them as:
++//#define TL 1
++//#define TR 2
++//#define BL 4
++//#define BR 8
++
++// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br
++// so we need to rearrange before passing on
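++//
++// A sketch of that rearrangement (it is the expression the chroma deblock
++// code below applies to the pcm4() result):
++//
++//   pcmf = (pcmfa & 3) | ((pcmfa >> 14) & 0xc);
++//
++// i.e. bits 0/1 stay as TL/TR and bits 16/17 drop down to BL/BR.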
++
+static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
+{
+ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;
@@ -10614,23 +16118,60 @@ index 0000000000..a8601da4e7
+ return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7);
+}
+
-+// We sometimes need 17 2-bit entries (annoying!)
-+// * This could be avoided if we separate out the H filter left-stub deblock
-+// but 64 bit constant shr shouldn't be too bad - though the variable mask here is probably quite nasty
-+static inline uint64_t hbs_get(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++// We cast away const here as we want this to work for both get and set
++static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
+{
-+ unsigned int n = (xr - xl + 7) & ~7;
-+
-+ return n == 0 ? (uint64_t)0 :
-+ (*(uint64_t *)(s->horizontal_bs + (xl >> 4) + (y >> 3) * s->hbs_stride) >> ((xl >> 1) & 7)) & (((uint64_t)1 << (n >> 1)) - 1);
++ return (uint32_t *)(bs +
++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
++#warning Unexpected masks
++ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes
++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
++ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) +
++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
++#error Stride1 < return size
++#endif
++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
+}
+
-+static inline uint64_t vbs_get(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
+{
-+ unsigned int n = (xr - xl + 7) & ~7;
++ return (uint8_t *)(bs +
++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
++ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) +
++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
++}
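++
++// Address sketch for the two helpers above, using the constants defined in
++// rpi_hevcdec.h (stride1 = 4 bytes, 16 pels per byte, one bitmap row per
++// 8 rows of pels): for x = 80, y = 24 the byte offset is
++//   ((80 >> 4) & 3) + ((24 >> 3) << 2) + (80 >> 6) * stride2
++//   = 1 + 12 + stride2
++// i.e. the bitmap is laid out as 64-pel-wide columns placed end to end,
++// stride2 bytes apart.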
+
-+ return n == 0 ? (uint64_t)0 :
-+ (*(uint64_t *)(s->vertical_bs2 + (xl >> 4) + (y >> 3) * s->hbs_stride) >> ((xl >> 1) & 7)) & (((uint64_t)1 << (n >> 1)) - 1);
++
++// Get block strength
++// Given how we call this we will always stay within 32-bit boundaries
++static inline uint32_t bs_get32(const uint8_t * bs, const unsigned int stride2,
++ const unsigned int xl, const unsigned int xr, const unsigned int y)
++{
++ if (xr <= xl) {
++ return 0;
++ }
++ else
++ {
++ const uint32_t a = *bs_ptr32(bs, stride2, xl, y);
++ const unsigned int n = ((xr - xl + 7) & ~7) >> 1;
++
++ return n == 32 ? a :
++ (a >> ((xl >> 1) & 31)) & ~(~0U << n);
++ }
++}
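++
++// Worked example: xl = 8, xr = 24 spans 16 pels, i.e. four 4-pel edges at
++// 2 bits each, so n = ((24 - 8 + 7) & ~7) >> 1 = 8 and the result is
++// (a >> 4) & 0xff - the four strengths for that span, leftmost edge in the
++// lowest bits.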
++
++static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++{
++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0);
++ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y);
++}
++
++static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++{
++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0);
++ return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y);
+}
+
+
@@ -10658,68 +16199,78 @@ index 0000000000..a8601da4e7
+ // Main body
+ for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8)
+ {
++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y);
++
+ const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp;
+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
+
++ if (vbs != 0)
+ {
+ const uint8_t * const tcv = tctable + dbp->tc_offset;
+ const uint8_t * const betav = betatable + dbp->beta_offset;
+ unsigned int pcmfa = pcm2(s, bv_l - 1, y);
-+// const uint8_t * vbs = s->vertical_bs + (bv_l >> 3) * s->bs_height + (y >> 2);
-+ uint64_t vbs2 = vbs_get(s, bv_l, bv_r, y);
+ unsigned int x;
+
-+ for (x = bv_l; x < bv_r; x += 8)
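++ // Each 8-pel step consumes two 2-bit strengths from vbs (bits 1:0
++ // and 3:2) and the PCM flags for the blocks either side of the edge
++ // (pcmfa & 3)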
++ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1)
+ {
-+ const unsigned int pcmf_v = pcmfa & 3;
-+ const unsigned int bs0 = vbs2 & 3;
-+ const unsigned int bs1 = (vbs2 & 0xc) >> 2;
-+
-+ if ((bs0 | bs1) != 0 && pcmf_v != 3)
++ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3)
+ {
+ const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
+ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
+ frame_stride1(s->frame, LUMA),
+ betav[qp],
-+ (bs0 == 0 ? 0 : tcv[qp + (int)(bs0 & 2)]) |
-+ ((bs1 == 0 ? 0 : tcv[qp + (int)(bs1 & 2)]) << 16),
-+ pcmf_v,
++ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) |
++ (((vbs & 0xc) == 0 ? 0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16),
++ pcmfa & 3,
+ av_rpi_sand_frame_pos_y(s->frame, x - 4, y));
+ }
-+
-+ pcmfa >>= 1;
-+// vbs += s->bs_height;
-+ vbs2 >>= 4;
+ }
+ }
+
+ if (y != 0)
+ {
-+ unsigned int x;
-+ unsigned int pcmfa = pcm4(s, bh_l, y - 1);
-+ uint64_t hbs = hbs_get(s, bh_l, bh_r + 1, y); // Will give (x <= bh_r) in for loop
++ uint32_t hbs;
+
-+ for (x = bh_l; hbs != 0; x += 8, hbs >>= 4)
++ // H left - mostly separated out so we only need a uint32_t hbs
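++ // (the stub is the 8-pel column just left of this CTB - see the
++ // av_assert2 below; doing it separately keeps every hbs_get32() call
++ // inside a single CTB and hence a single 32-bit word)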
++ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0)
+ {
-+ const unsigned int pcmf_h = (pcmfa & 1) | ((pcmfa & 0x10000) >> 15);
-+ const unsigned int bs0 = hbs & 3;
-+ const unsigned int bs1 = (hbs >> 2) & 3;
++ const unsigned int x = bh_l;
++ const unsigned int pcmfa = pcm4(s, bh_l, y - 1);
++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ const DBParams * const dbph = dbp - 1;
++ const uint8_t * const tc = tctable + dbph->tc_offset + qp;
+
-+ if ((bs0 | bs1) != 0 && pcmf_h != 3)
++ av_assert2(cb_x - bh_l == 8);
++
++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
++ frame_stride1(s->frame, LUMA),
++ betatable[qp + dbph->beta_offset],
++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
++ }
++
++ // H
++ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop
++ {
++ unsigned int x;
++ unsigned int pcmfa = pcm4(s, cb_x, y - 1);
++
++ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1)
+ {
-+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+ const DBParams * const dbph = (x < cb_x ? dbp - 1 : dbp);
-+ const uint8_t * const tc = tctable + dbph->tc_offset + qp;
-+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
-+ frame_stride1(s->frame, LUMA),
-+ betatable[qp + dbph->beta_offset],
-+ (bs0 == 0 ? 0 : tc[bs0 & 2]) |
-+ ((bs1 == 0 ? 0 : tc[bs1 & 2]) << 16),
-+ pcmf_h);
++ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0)
++ {
++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ const uint8_t * const tc = tctable + dbp->tc_offset + qp;
++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
++ frame_stride1(s->frame, LUMA),
++ betatable[qp + dbp->beta_offset],
++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
++ }
+ }
-+
-+ pcmfa >>= 1;
+ }
+ }
+
@@ -10727,11 +16278,6 @@ index 0000000000..a8601da4e7
+ }
+}
+
-+#define TL 1
-+#define TR 2
-+#define BL 4
-+#define BR 8
-+
+static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
+{
+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
@@ -10768,98 +16314,119 @@ index 0000000000..a8601da4e7
+ // Deblock V up 8
+ // CTB above current
+ // Top-half only (tc4 & ~0xffff == 0) is special cased in asm
-+ unsigned int x;
+ const unsigned int y = bounds.y - 8;
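++ // Only BS == 2 matters for chroma and the loop below steps 16 pels at
++ // a time, hence the 0x02020202 mask keeping bit 1 of the first strength
++ // in each 16-pel group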
++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U;
+
-+ unsigned int pcmfa = pcm2(s, bv_l - 1, y);
-+ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset;
-+ uint64_t vbs2 = (vbs_get(s, bv_l, bv_r, y) & 0x0202020202020202U);
-+
-+ for (x = bv_l; x < bv_r; x += 16, vbs2 >>= 8)
++ if (vbs != 0)
+ {
-+ const unsigned int pcmf_v = (pcmfa & 3);
-+ if ((vbs2 & 2) != 0 && pcmf_v != 3)
++ unsigned int pcmfa = pcm2(s, bv_l - 1, y);
++ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset;
++ unsigned int x;
++
++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
+ {
-+ const int qp0 = q2h(s, x, y);
-+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
-+ frame_stride1(s->frame, 1),
-+ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8),
-+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
-+ pcmf_v);
++ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0)
++ {
++ const int qp0 = q2h(s, x, y);
++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++ frame_stride1(s->frame, 1),
++ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8),
++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
++ pcmfa & 3);
++ }
+ }
-+ pcmfa >>= 2;
+ }
+ }
+
+ for (y = bounds.y; y < b_b; y += 16)
+ {
++ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) |
++ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4);
++
+ // V
++ if (vbs != 0)
+ {
+ unsigned int x;
-+ unsigned int pcmfa = pcm4(s, bv_l - 1, y);
-+ const unsigned int pcmf_or = (y + 16 <= b_b) ? 0 : BL | BR;
++ unsigned int pcmfa =
++ (y + 16 > b_b ?
++ pcm2(s, bv_l - 1, y) | 0xffff0000 :
++ pcm4(s, bv_l - 1, y));
+ const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
-+ uint64_t vbs2 = (vbs_get(s, bv_l, bv_r, y) & 0x0202020202020202U) |
-+ ((vbs_get(s, bv_l, bv_r, y + 8) & 0x0202020202020202U) << 4);
+
-+ for (x = bv_l; x < bv_r; x += 16, vbs2 >>= 8)
++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
+ {
-+ const unsigned int pcmf_v = pcmf_or | (pcmfa & 3) | ((pcmfa >> 14) & 0xc);
-+ const unsigned int bs0 = (~pcmf_v & (TL | TR)) == 0 ? 0 : vbs2 & 2;
-+ const unsigned int bs1 = (~pcmf_v & (BL | BR)) == 0 ? 0 : (vbs2 & 0x20) >> 4;
-+
-+ if ((bs0 | bs1) != 0)
++ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
+ {
+ const int qp0 = q2h(s, x, y);
+ const int qp1 = q2h(s, x, y + 8);
+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+ frame_stride1(s->frame, 1),
-+ ((bs0 == 0) ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
-+ ((bs1 == 0) ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
-+ pcmf_v);
++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
+ }
-+
-+ pcmfa >>= 2;
+ }
+ }
+
+ // H
+ if (y != 0)
+ {
-+ unsigned int x;
-+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16;
++ uint32_t hbs;
+ const unsigned int bh_l = bv_l - 16;
-+ unsigned int pcmfa = pcm4(s, bh_l, y - 1);
-+ uint64_t hbs = hbs_get(s, bh_l, bh_r, y) & 0x2222222222222222U;
++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16;
+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
+
-+ // Chop off bits we don't want...
-+ if (bh_l < bounds.x) {
-+ pcmfa |= 0x10001; // TL|BL pre rearrangement
-+ hbs &= ~(uint64_t)3; // Make BS 0
-+ }
-+
-+ for (x = bh_l; hbs != 0; x += 16, hbs >>= 8)
++ // H left - mostly separated out so we only need a uint32_t hbs
++ // Stub is width 8 to the left of bounds, but width 16 internally
++ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0)
+ {
-+ const unsigned int pcmf_h = (x + 16 > bh_r ? TR | BR : 0) |
-+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc);
-+ const int bs0 = hbs & 2;
-+ const int bs1 = (~pcmf_h & (TR | BR)) == 0 ? 0 : (hbs >> 4) & 2;
-+ if ((bs0 | bs1) != 0)
++ unsigned int pcmfa = pcm4(s, bh_l, y - 1);
++
++ // Chop off bits we don't want...
++ if (bh_l < bounds.x) {
++ pcmfa |= 0x10001; // TL|BL pre rearrangement
++ hbs &= ~3; // Make BS 0
++ }
++
++ // Double check we still want this
++ if (hbs != 0 && (~pcmfa & 0x30003) != 0)
+ {
++ const unsigned int x = bh_l;
+ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
+ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
-+ const uint8_t * const tc = tctable + 2 + (x < cb_x ? dbp - 1 : dbp)->tc_offset;
++ const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset;
+
+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+ frame_stride1(s->frame, 1),
-+ ((bs0 == 0) ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
-+ ((bs1 == 0) ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
-+ pcmf_h);
++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++ }
++ }
++
++ // H main
++ if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0)
++ {
++ unsigned int x;
++ unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it
++
++ for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2)
++ {
++ if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
++ {
++ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
++ const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
++
++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++ frame_stride1(s->frame, 1),
++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++ }
+ }
-+ pcmfa >>= 2;
+ }
+ }
+ }
@@ -10871,18 +16438,18 @@ index 0000000000..a8601da4e7
+ return x & ~(~0U << log2_n);
+}
+
-+static inline void set_bs_h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
++static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
+{
+ av_assert2((y & 7) == 0);
+
+ // This doesn't have the same simultainious update issues that bsf_stash
+ // does (other threads will have a different y) so we can do it the easy way
+ if ((bsf &= mask) != 0)
-+ *(uint32_t *)(s->horizontal_bs + ((x >> 4) & ~3) + (y >> 3) * s->hbs_stride) |= bsf << ((x >> 1) & 31);
++ *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31);
+}
+
+
-+static void set_bs_v(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
++static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
+{
+ // We arrange this in a slightly odd fashion but it lines up with
+ // how we are going to use it in the actual deblock code & it is easier
@@ -10894,8 +16461,7 @@ index 0000000000..a8601da4e7
+
+ if ((bsf &= mask) != 0)
+ {
-+ const unsigned int stride1 = s->hbs_stride;
-+ uint8_t *p = s->vertical_bs2 + (x >> 4) + (y >> 3) * stride1;
++ uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y);
+ const unsigned int sh = ((x & 8) | (y & 4)) >> 1;
+
+ if (mask <= 0xf)
@@ -10906,7 +16472,7 @@ index 0000000000..a8601da4e7
+ {
+ do {
+ *p |= (bsf & 0xf) << sh;
-+ p += stride1;
++ p += HEVC_RPI_BS_STRIDE1_BYTES;
+ } while ((bsf >>= 4) != 0);
+ }
+ }
@@ -10918,19 +16484,10 @@ index 0000000000..a8601da4e7
+ const RefPicList * const rpl_p, const RefPicList * const rpl_q,
+ const MvField * const mvf_p, const MvField * const mvf_q)
+{
-+ uint8_t res[16];
-+ unsigned int i;
-+ unsigned int a = 0;
-+
-+ s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup,
-+ sizeof(MvField) * mvf_stride, 1,
++ return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup,
++ mvf_p, mvf_q,
+ rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list,
-+ mvf_p, mvf_q, res);
-+
-+ for (i = 0; i != rep * dup; ++i) {
-+ a |= res[i] << (i * 2);
-+ }
-+ return a;
++ sizeof(MvField) * mvf_stride);
+}
+
+
@@ -11050,7 +16607,7 @@ index 0000000000..a8601da4e7
+ }
+
+ // Finally put the results into bs
-+ set_bs_h(s, x0, y0, bsf_mask, bsf_h);
++ hbs_set(s, x0, y0, bsf_mask, bsf_h);
+ }
+
+ // Max of 1 pu internal split - ignore if not on 8pel boundary
@@ -11061,7 +16618,7 @@ index 0000000000..a8601da4e7
+ // If we have the x split as well then it must be in the middle
+ const unsigned int log2_rep = has_x_split ? 1 : 0;
+
-+ set_bs_h(s, x0, lc->cu.y_split, bsf_mask,
++ hbs_set(s, x0, lc->cu.y_split, bsf_mask,
+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
+ trafo_size >> (log2_min_pu_size + log2_rep),
+ rpl, rpl,
@@ -11074,7 +16631,7 @@ index 0000000000..a8601da4e7
+ {
+ // Boundary left
+ if (x0 != 0 &&
-+ ((x0 & ((1 << s->ps.sps->log2_ctb_size) - 1)) != 0 ||
++ (off_boundary(x0, s->ps.sps->log2_ctb_size) ||
+ (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0))
+ {
+ if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split))
@@ -11090,7 +16647,7 @@ index 0000000000..a8601da4e7
+ mvf_curr, mvf_curr - 1);
+ }
+
-+ set_bs_v(s, x0, y0, bsf_mask, bsf_v);
++ vbs_set(s, x0, y0, bsf_mask, bsf_v);
+ }
+
+ if (has_x_split && !off_boundary(lc->cu.x_split, 3))
@@ -11099,7 +16656,7 @@ index 0000000000..a8601da4e7
+ (y0 >> log2_min_pu_size) * mvf_stride + (lc->cu.x_split >> log2_min_pu_size);
+ const unsigned int log2_rep = has_y_split ? 1 : 0;
+
-+ set_bs_v(s, lc->cu.x_split, y0, bsf_mask,
++ vbs_set(s, lc->cu.x_split, y0, bsf_mask,
+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
+ (mvf_stride << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
+ rpl, rpl,
@@ -11135,6 +16692,12 @@ index 0000000000..a8601da4e7
+
+ // Deblock may not touch the edges of the bound as they are still needed
+ // for Intra pred
++ //
++ // Deblock is disabled with a per-slice flag
++ // Given that bounds may cover multiple slices & we deblock outside bounds
++ // anyway we can't avoid deblock using that flag - about the only thing we
++ // could do is have a "no deblock seen yet" flag but it doesn't really
++ // seem worth the effort
+
+ deblock_y_blk(s, bounds, x_end, y_end);
+ deblock_uv_blk(s, bounds, x_end, y_end);
@@ -11150,9 +16713,12 @@ index 0000000000..a8601da4e7
+ const unsigned int xl = ussub(bounds.x, xo);
+ const unsigned int xr = x_end ? br : ussub(br, xo);
+
-+ for (y = yt; y < yb; y += ctb_size) {
-+ for (x = xl; x < xr; x += ctb_size) {
-+ sao_filter_CTB(s, x, y);
++ if (s->ps.sps->sao_enabled)
++ {
++ for (y = yt; y < yb; y += ctb_size) {
++ for (x = xl; x < xr; x += ctb_size) {
++ sao_filter_CTB(s, x, y);
++ }
+ }
+ }
+
@@ -12162,7 +17728,7 @@ index 0000000000..4b4d032a16
+#endif /* AVCODEC_RPI_HEVC_PARSE_H */
diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c
new file mode 100644
-index 0000000000..e8df452021
+index 0000000000..744e7cf248
--- /dev/null
+++ b/libavcodec/rpi_hevc_ps.c
@@ -0,0 +1,1957 @@
@@ -13347,7 +18913,7 @@ index 0000000000..e8df452021
+ sps->long_term_ref_pics_present_flag = get_bits1(gb);
+ if (sps->long_term_ref_pics_present_flag) {
+ sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb);
-+ if (sps->num_long_term_ref_pics_sps > 31U) {
++ if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) {
+ av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n",
+ sps->num_long_term_ref_pics_sps);
+ return AVERROR_INVALIDDATA;
@@ -14125,7 +19691,7 @@ index 0000000000..e8df452021
+}
diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h
new file mode 100644
-index 0000000000..c9ecf9a268
+index 0000000000..1e7120a43d
--- /dev/null
+++ b/libavcodec/rpi_hevc_ps.h
@@ -0,0 +1,441 @@
@@ -14388,8 +19954,8 @@ index 0000000000..c9ecf9a268
+ uint8_t sao_enabled;
+
+ uint8_t long_term_ref_pics_present_flag;
-+ uint16_t lt_ref_pic_poc_lsb_sps[32];
-+ uint8_t used_by_curr_pic_lt_sps_flag[32];
++ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS];
++ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS];
+ uint8_t num_long_term_ref_pics_sps;
+
+ struct {
@@ -15093,7 +20659,7 @@ index 0000000000..d7745711ab
+}
diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c
new file mode 100644
-index 0000000000..c5133a8a88
+index 0000000000..cd8149d58e
--- /dev/null
+++ b/libavcodec/rpi_hevc_sei.c
@@ -0,0 +1,368 @@
@@ -15194,10 +20760,11 @@ index 0000000000..c5133a8a88
+ s->quincunx_subsampling = get_bits1(gb);
+ s->content_interpretation_type = get_bits(gb, 6);
+
-+ // the following skips spatial_flipping_flag frame0_flipped_flag
-+ // field_views_flag current_frame_is_frame0_flag
-+ // frame0_self_contained_flag frame1_self_contained_flag
-+ skip_bits(gb, 6);
++ // spatial_flipping_flag, frame0_flipped_flag, field_views_flag
++ skip_bits(gb, 3);
++ s->current_frame_is_frame0_flag = get_bits1(gb);
++ // frame0_self_contained_flag, frame1_self_contained_flag
++ skip_bits(gb, 2);
+
+ if (!s->quincunx_subsampling && s->arrangement_type != 5)
+ skip_bits(gb, 16); // frame[01]_grid_position_[xy]
@@ -15371,8 +20938,8 @@ index 0000000000..c5133a8a88
+ return 0;
+}
+
-+static int decode_nal_sei_prefix(GetBitContext *gb, HEVCSEIContext *s, const HEVCRpiParamSets *ps,
-+ int type, int size, void *logctx)
++static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps,
++ int type, int size)
+{
+ switch (type) {
+ case 256: // Mismatched value from HM 8.1
@@ -15400,8 +20967,8 @@ index 0000000000..c5133a8a88
+ }
+}
+
-+static int decode_nal_sei_suffix(GetBitContext *gb, HEVCSEIContext *s,
-+ int type, int size, void *logctx)
++static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
++ int type, int size)
+{
+ switch (type) {
+ case HEVC_SEI_TYPE_DECODED_PICTURE_HASH:
@@ -15413,9 +20980,8 @@ index 0000000000..c5133a8a88
+ }
+}
+
-+static int decode_nal_sei_message(GetBitContext *gb, HEVCSEIContext *s,
-+ const HEVCRpiParamSets *ps, int nal_unit_type,
-+ void *logctx)
++static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s,
++ const HEVCRpiParamSets * const ps, const int nal_unit_type)
+{
+ int payload_type = 0;
+ int payload_size = 0;
@@ -15436,9 +21002,9 @@ index 0000000000..c5133a8a88
+ payload_size += byte;
+ }
+ if (nal_unit_type == HEVC_NAL_SEI_PREFIX) {
-+ return decode_nal_sei_prefix(gb, s, ps, payload_type, payload_size, logctx);
++ return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size);
+ } else { /* nal_unit_type == NAL_SEI_SUFFIX */
-+ return decode_nal_sei_suffix(gb, s, payload_type, payload_size, logctx);
++ return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size);
+ }
+}
+
@@ -15453,7 +21019,7 @@ index 0000000000..c5133a8a88
+ int ret;
+
+ do {
-+ ret = decode_nal_sei_message(gb, s, ps, type, logctx);
++ ret = decode_nal_sei_message(gb, logctx, s, ps, type);
+ if (ret < 0)
+ return ret;
+ } while (more_rbsp_data(gb));
@@ -15467,7 +21033,7 @@ index 0000000000..c5133a8a88
+}
diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h
new file mode 100644
-index 0000000000..41e4a20127
+index 0000000000..d4ac348df9
--- /dev/null
+++ b/libavcodec/rpi_hevc_sei.h
@@ -0,0 +1,135 @@
@@ -15533,7 +21099,6 @@ index 0000000000..41e4a20127
+} HEVC_SEI_Type;
+
+typedef struct HEVCSEIPictureHash {
-+ struct AVMD5 *md5_ctx;
+ uint8_t md5[3][16];
+ uint8_t is_md5;
+} HEVCSEIPictureHash;
@@ -15543,6 +21108,7 @@ index 0000000000..41e4a20127
+ int arrangement_type;
+ int content_interpretation_type;
+ int quincunx_subsampling;
++ int current_frame_is_frame0_flag;
+} HEVCSEIFramePacking;
+
+typedef struct HEVCSEIDisplayOrientation {
@@ -20363,10 +25929,10 @@ index 0000000000..1128a2c054
+};
diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c
new file mode 100644
-index 0000000000..4034c77979
+index 0000000000..08686ff260
--- /dev/null
+++ b/libavcodec/rpi_hevcdec.c
-@@ -0,0 +1,5753 @@
+@@ -0,0 +1,5787 @@
+/*
+ * HEVC video Decoder
+ *
@@ -21302,9 +26868,9 @@ index 0000000000..4034c77979
+ av_freep(&s->tab_slice_address);
+ av_freep(&s->filter_slice_edges);
+
-+ av_freep(&s->horizontal_bs);
++ av_freep(&s->bs_horizontal);
+// av_freep(&s->vertical_bs);
-+ av_freep(&s->vertical_bs2);
++ av_freep(&s->bs_vertical);
+ av_freep(&s->bsf_stash_left);
+ av_freep(&s->bsf_stash_up);
+
@@ -21325,8 +26891,13 @@ index 0000000000..4034c77979
+ int ctb_count = sps->ctb_width * sps->ctb_height;
+ int min_pu_size = sps->min_pu_width * sps->min_pu_height;
+
-+ s->hbs_stride = ((width + 63) & ~63) >> 4;
-+ s->bs_size = (((height + 15) & ~15) >> 3) * s->hbs_stride;
++ {
++ unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK);
++ unsigned int h = ((height + 15) & ~15);
++
++ s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size
++ s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols
++ }
+
+ s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly
+ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock));
@@ -21352,9 +26923,9 @@ index 0000000000..4034c77979
+ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
+ goto fail;
+
-+ s->horizontal_bs = av_mallocz(s->bs_size);
-+ s->vertical_bs2 = av_mallocz(s->bs_size);
-+ if (s->horizontal_bs == NULL || s->vertical_bs2 == NULL)
++ s->bs_horizontal = av_mallocz(s->bs_size);
++ s->bs_vertical = av_mallocz(s->bs_size);
++ if (s->bs_horizontal == NULL || s->bs_vertical == NULL)
+ goto fail;
+
+ if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL ||
@@ -21406,15 +26977,22 @@ index 0000000000..4034c77979
+ uint8_t chroma_weight_l0_flag[16];
+ uint8_t luma_weight_l1_flag[16];
+ uint8_t chroma_weight_l1_flag[16];
-+ int luma_log2_weight_denom;
++ unsigned int luma_log2_weight_denom;
+
+ luma_log2_weight_denom = get_ue_golomb_long(gb);
-+ if (luma_log2_weight_denom < 0 || luma_log2_weight_denom > 7)
++ if (luma_log2_weight_denom > 7) {
+ av_log(s->avctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is invalid\n", luma_log2_weight_denom);
-+ s->sh.luma_log2_weight_denom = av_clip_uintp2(luma_log2_weight_denom, 3);
++ return AVERROR_INVALIDDATA;
++ }
++ s->sh.luma_log2_weight_denom = luma_log2_weight_denom;
+ if (ctx_cfmt(s) != 0) {
-+ int delta = get_se_golomb(gb);
-+ s->sh.chroma_log2_weight_denom = av_clip_uintp2(s->sh.luma_log2_weight_denom + delta, 3);
++ const unsigned int chroma_log2_weight_denom = luma_log2_weight_denom + get_se_golomb(gb);
++ if (chroma_log2_weight_denom > 7)
++ {
++ av_log(s->avctx, AV_LOG_ERROR, "chroma_log2_weight_denom %d is invalid\n", chroma_log2_weight_denom);
++ return AVERROR_INVALIDDATA;
++ }
++ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom;
+ }
+
+ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
@@ -21741,6 +27319,7 @@ index 0000000000..4034c77979
+ if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
+ const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
+ const HEVCRpiSPS *last_sps = s->ps.sps;
++ enum AVPixelFormat pix_fmt;
+
+ if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) {
+ if (sps->width != last_sps->width || sps->height != last_sps->height ||
@@ -21750,10 +27329,20 @@ index 0000000000..4034c77979
+ }
+ ff_hevc_rpi_clear_refs(s);
+
-+ ret = set_sps(s, sps, get_format(s, sps));
++ ret = set_sps(s, sps, sps->pix_fmt);
+ if (ret < 0)
+ return ret;
+
++ pix_fmt = get_format(s, sps);
++ if (pix_fmt < 0)
++ return pix_fmt;
++
++// ret = set_sps(s, sps, pix_fmt);
++// if (ret < 0)
++// return ret;
++
++ s->avctx->pix_fmt = pix_fmt;
++
+ s->seq_decode = (s->seq_decode + 1) & 0xff;
+ s->max_ra = INT_MAX;
+ }
@@ -25184,6 +30773,13 @@ index 0000000000..4034c77979
+
+ if (s->sei.frame_packing.content_interpretation_type == 2)
+ stereo->flags = AV_STEREO3D_FLAG_INVERT;
++
++ if (s->sei.frame_packing.arrangement_type == 5) {
++ if (s->sei.frame_packing.current_frame_is_frame0_flag)
++ stereo->view = AV_STEREO3D_VIEW_LEFT;
++ else
++ stereo->view = AV_STEREO3D_VIEW_RIGHT;
++ }
+ }
+
+ if (s->sei.display_orientation.present &&
@@ -25297,8 +30893,8 @@ index 0000000000..4034c77979
+ ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
+ int ret;
+
-+ memset(s->horizontal_bs, 0, s->bs_size);
-+ memset(s->vertical_bs2, 0, s->bs_size);
++ memset(s->bs_horizontal, 0, s->bs_size);
++ memset(s->bs_vertical, 0, s->bs_size);
+ memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
+ memset(s->skip_flag, 0, s->ps.sps->min_cb_height * s->skip_flag_stride);
+ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
@@ -25421,7 +31017,12 @@ index 0000000000..4034c77979
+ }
+ }
+#endif
-+ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
++ if (
++ (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) ||
++ (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) ||
++ (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) ||
++ (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IDR(s)))
++ {
+ s->is_decoded = 0;
+ break;
+ }
@@ -25596,7 +31197,7 @@ index 0000000000..4034c77979
+ int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height;
+ uint8_t md5[16];
+
-+ av_md5_init(s->sei.picture_hash.md5_ctx);
++ av_md5_init(s->md5_ctx);
+ for (j = 0; j < h; j++) {
+ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1);
+#if HAVE_BIGENDIAN
@@ -25606,9 +31207,9 @@ index 0000000000..4034c77979
+ src = s->checksum_buf;
+ }
+#endif
-+ av_md5_update(s->sei.picture_hash.md5_ctx, src, w << pixel_shift);
++ av_md5_update(s->md5_ctx, src, w << pixel_shift);
+ }
-+ av_md5_final(s->sei.picture_hash.md5_ctx, md5);
++ av_md5_final(s->md5_ctx, md5);
+
+ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) {
+ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i);
@@ -25759,7 +31360,7 @@ index 0000000000..4034c77979
+
+ pic_arrays_free(s);
+
-+ av_freep(&s->sei.picture_hash.md5_ctx);
++ av_freep(&s->md5_ctx);
+
+ av_freep(&s->cabac_save);
+
@@ -25871,8 +31472,7 @@ index 0000000000..4034c77979
+
+ s->max_ra = INT_MAX;
+
-+ s->sei.picture_hash.md5_ctx = av_md5_alloc();
-+ if (!s->sei.picture_hash.md5_ctx)
++ if ((s->md5_ctx = av_md5_alloc()) == NULL)
+ goto fail;
+
+ s->context_initialized = 1;
@@ -26122,10 +31722,10 @@ index 0000000000..4034c77979
+
diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h
new file mode 100644
-index 0000000000..117432de0a
+index 0000000000..df2bac1df4
--- /dev/null
+++ b/libavcodec/rpi_hevcdec.h
-@@ -0,0 +1,985 @@
+@@ -0,0 +1,1002 @@
+/*
+ * HEVC video decoder
+ *
@@ -26397,6 +31997,8 @@ index 0000000000..117432de0a
+ INTRA_ANGULAR_33,
+ INTRA_ANGULAR_34,
+};
++#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10
++#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26
+
+enum SAOType {
+ SAO_NOT_APPLIED = 0,
@@ -26813,6 +32415,17 @@ index 0000000000..117432de0a
+ uint8_t state[HEVC_CONTEXTS];
+} HEVCRpiCabacState;
+
++#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels
++#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT)
++#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1)
++#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte
++#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el
++#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT)
++#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)
++#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
++#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row
++#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
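++
++// Rough shape of the bitmap these describe (all numbers follow from the
++// shifts above): 2 bits of strength per 4 pels, 4 strengths per byte, so a
++// 64-pel-wide column takes HEVC_RPI_BS_STRIDE1_BYTES = 4 bytes per row and
++// one row per 8 rows of pels. Columns are stored end to end, bs_stride2
++// bytes apart; e.g. a 1920x1088 picture gives (1088 >> 1) = 544 bytes per
++// column and 1920 >> 6 = 30 columns.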
++
+typedef struct HEVCRpiContext {
+ const AVClass *c; // needed by private avoptions
+ AVCodecContext *avctx;
@@ -26882,17 +32495,19 @@ index 0000000000..117432de0a
+ int eos; ///< current packet contains an EOS/EOB NAL
+ int last_eos; ///< last packet contains an EOS/EOB NAL
+ int max_ra;
-+ unsigned int hbs_stride;
-+ unsigned int bs_size;
+
+ int is_decoded;
+ int no_rasl_output_flag;
+
-+ HEVCPredContext hpc;
++ HEVCRpiPredContext hpc;
+ HEVCDSPContext hevcdsp;
+ int8_t *qp_y_tab;
-+ uint8_t *horizontal_bs;
-+ uint8_t *vertical_bs2;
++
++ // Deblocking block strength bitmaps
++ unsigned int bs_stride2;
++ unsigned int bs_size;
++ uint8_t *bs_horizontal;
++ uint8_t *bs_vertical;
+ uint8_t *bsf_stash_up;
+ uint8_t *bsf_stash_left;
+
@@ -26930,6 +32545,8 @@ index 0000000000..117432de0a
+ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
+ int nuh_layer_id;
+
++ struct AVMD5 *md5_ctx;
++
+ HEVCSEIContext sei;
+
+ // Put structures that allocate non-trivial storage at the end
@@ -27113,10 +32730,10 @@ index 0000000000..117432de0a
+#endif /* AVCODEC_RPI_HEVCDEC_H */
diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c
new file mode 100644
-index 0000000000..a6af5ecd85
+index 0000000000..c5d130c377
--- /dev/null
+++ b/libavcodec/rpi_hevcdsp.c
-@@ -0,0 +1,416 @@
+@@ -0,0 +1,419 @@
+/*
+ * HEVC video decoder
+ *
@@ -27242,10 +32859,12 @@ index 0000000000..a6af5ecd85
+#include "rpi_hevcdsp_template.c"
+#undef BIT_DEPTH
+
-+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
++static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const MvField *curr, const MvField *neigh,
+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ const MvField *curr, const MvField *neigh, uint8_t *bs)
++ int in_inc)
+{
++ int shift = 32;
++ uint32_t bs = 0;
+ for (; pus > 0; pus--) {
+ int strength, out;
+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
@@ -27350,10 +32969,11 @@ index 0000000000..a6af5ecd85
+
+ for (out = dup; out > 0; out--)
+ {
-+ *bs = strength;
-+ bs += out_inc;
++ bs = (bs >> 2) | (strength << 30);
++ shift -= 2;
+ }
+ }
++ return bs >> shift;
+}
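++
++// The packed value returned above has the strength for the first
++// (leftmost/topmost) 4-pel edge in bits 1:0, the next in bits 3:2, and so
++// on; e.g. pus = 2, dup = 2 with strengths s0 then s1 gives
++// s0 | (s0 << 2) | (s1 << 4) | (s1 << 6).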
+
+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
@@ -27535,7 +33155,7 @@ index 0000000000..a6af5ecd85
+}
diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h
new file mode 100644
-index 0000000000..59d06bbe28
+index 0000000000..8c9bf725bf
--- /dev/null
+++ b/libavcodec/rpi_hevcdsp.h
@@ -0,0 +1,183 @@
@@ -27707,9 +33327,9 @@ index 0000000000..59d06bbe28
+ uint8_t * src_l,
+ unsigned int no_f);
+
-+ void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
++ uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const MvField *curr, const MvField *neigh,
+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ const MvField *curr, const MvField *neigh, uint8_t *bs);
++ int in_inc);
+} HEVCDSPContext;
+
+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hpc, int bit_depth);
@@ -30008,10 +35628,10 @@ index 0000000000..cfe9264fc3
+
diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c
new file mode 100644
-index 0000000000..f6db76482d
+index 0000000000..113ed33d64
--- /dev/null
+++ b/libavcodec/rpi_hevcpred.c
-@@ -0,0 +1,122 @@
+@@ -0,0 +1,150 @@
+/*
+ * HEVC video Decoder
+ *
@@ -30037,6 +35657,9 @@ index 0000000000..f6db76482d
+#include "rpi_hevcdec.h"
+
+#include "rpi_hevcpred.h"
++#if (ARCH_ARM)
++#include "arm/rpi_hevcpred_arm.h"
++#endif
+
+#define PRED_C 0
+#define BIT_DEPTH 8
@@ -30074,7 +35697,7 @@ index 0000000000..f6db76482d
+#undef BIT_DEPTH
+#undef PRED_C
+
-+void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth)
++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth)
+{
+#undef FUNC
+#define FUNC(a, depth) a ## _ ## depth
@@ -30091,7 +35714,18 @@ index 0000000000..f6db76482d
+ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \
+ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \
+ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \
-+ hpc->pred_dc = FUNC(pred_dc, depth); \
++ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \
++ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \
++ hpc->pred_dc[2] = FUNC(pred_dc_2, depth); \
++ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \
++ hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \
++ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \
++ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \
++ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \
++ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \
++ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \
++ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \
++ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \
+ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
+ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
+ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
@@ -30106,7 +35740,18 @@ index 0000000000..f6db76482d
+ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \
+ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \
+ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \
-+ hpc->pred_dc_c = FUNCC(pred_dc, depth); \
++ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \
++ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \
++ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \
++ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \
++ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \
++ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \
++ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \
++ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \
++ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \
++ hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \
++ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \
++ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \
+ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
+ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
+ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
@@ -30131,15 +35776,18 @@ index 0000000000..f6db76482d
+ break;
+ }
+
-+ if (ARCH_MIPS)
-+ ff_hevc_rpi_pred_init_mips(hpc, bit_depth);
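++ // The C pointers set above act as the fallback; the arch init below may
++ // replace individual entries with asm versions.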
++#if (ARCH_ARM)
++ ff_hevc_rpi_pred_init_arm(hpc, bit_depth);
++#elif (ARCH_MIPS)
++ ff_hevc_rpi_pred_init_mips(hpc, bit_depth);
++#endif
+}
diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h
new file mode 100644
-index 0000000000..03c6eb3295
+index 0000000000..31d7d57d95
--- /dev/null
+++ b/libavcodec/rpi_hevcpred.h
-@@ -0,0 +1,57 @@
+@@ -0,0 +1,68 @@
+/*
+ * HEVC video Decoder
+ *
@@ -30172,37 +35820,48 @@ index 0000000000..03c6eb3295
+struct HEVCRpiContext;
+struct HEVCRpiLocalContext;
+
-+typedef struct HEVCPredContext {
++typedef struct HEVCRpiPredContext {
+ void (*intra_pred[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx);
+
+ void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride);
-+ void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left,
-+ ptrdiff_t stride, int log2_size, int c_idx);
++ void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
++ ptrdiff_t stride);
+ void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride,
-+ int c_idx, int mode);
++ int mode);
++ void (*pred_vertical[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
+ void (*intra_pred_c[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx);
+
+ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride);
-+ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left,
-+ ptrdiff_t stride, int log2_size, int c_idx);
++ void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
++ ptrdiff_t stride);
+ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
+ const uint8_t *left, ptrdiff_t stride,
-+ int c_idx, int mode);
-+} HEVCPredContext;
++ int mode);
++ void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++} HEVCRpiPredContext;
+
-+void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth);
-+void ff_hevc_rpi_pred_init_mips(HEVCPredContext *hpc, int bit_depth);
++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth);
+
+#endif /* AVCODEC_RPI_HEVCPRED_H */
diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c
new file mode 100644
-index 0000000000..4ee776f955
+index 0000000000..a76ba4c442
--- /dev/null
+++ b/libavcodec/rpi_hevcpred_template.c
-@@ -0,0 +1,850 @@
+@@ -0,0 +1,983 @@
+/*
+ * HEVC video decoder
+ *
@@ -30396,20 +36055,21 @@ index 0000000000..4ee776f955
+ const enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
+ lc->tu.intra_pred_mode;
+ pixel4 a;
-+ pixel left_array[2 * MAX_TB_SIZE + 1];
++
++ // Align so we can do multiple loads in the asm
++ // Padded to 16 byte boundary so as not to confuse anything
++ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
++ DECLARE_ALIGNED(16, pixel, top_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
+#if !PRED_C
-+ pixel filtered_left_array[2 * MAX_TB_SIZE + 1];
-+#endif
-+ pixel top_array[2 * MAX_TB_SIZE + 1];
-+#if !PRED_C
-+ pixel filtered_top_array[2 * MAX_TB_SIZE + 1];
++ DECLARE_ALIGNED(16, pixel, filtered_left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
++ DECLARE_ALIGNED(16, pixel, filtered_top_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
+#endif
+
-+ pixel *left = left_array + 1;
-+ pixel *top = top_array + 1;
++ pixel *left = left_array + 16 / sizeof(pixel);
++ pixel *top = top_array + 16 / sizeof(pixel);
+#if !PRED_C
-+ pixel *filtered_left = filtered_left_array + 1;
-+ pixel *filtered_top = filtered_top_array + 1;
++ pixel *filtered_left = filtered_left_array + 16 / sizeof(pixel);
++ pixel *filtered_top = filtered_top_array + 16 / sizeof(pixel);
+#endif
+ int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask);
+ int cand_left = lc->na.cand_left;
@@ -30664,12 +36324,22 @@ index 0000000000..4ee776f955
+ (uint8_t *)left, stride);
+ break;
+ case INTRA_DC:
-+ s->hpc.pred_dc((uint8_t *)src, (uint8_t *)top,
-+ (uint8_t *)left, stride, log2_size, c_idx);
++ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_ANGULAR_HORIZONTAL:
++ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ case INTRA_ANGULAR_VERTICAL:
++ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride,
++ mode);
+ break;
+ default:
+ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
-+ (uint8_t *)left, stride, c_idx,
++ (uint8_t *)left, stride,
+ mode);
+ break;
+ }
@@ -30680,12 +36350,22 @@ index 0000000000..4ee776f955
+ (uint8_t *)left, stride);
+ break;
+ case INTRA_DC:
-+ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top,
-+ (uint8_t *)left, stride, log2_size, c_idx);
++ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_ANGULAR_HORIZONTAL:
++ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ case INTRA_ANGULAR_VERTICAL:
++ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride,
++ mode);
+ break;
+ default:
+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
-+ (uint8_t *)left, stride, c_idx,
++ (uint8_t *)left, stride,
+ mode);
+ break;
+ }
@@ -30768,7 +36448,7 @@ index 0000000000..4ee776f955
+#if !PRED_C
+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
+ const uint8_t *_left,
-+ ptrdiff_t stride, int log2_size, int c_idx)
++ ptrdiff_t stride, int log2_size)
+{
+ int i, j, x, y;
+ int size = (1 << log2_size);
@@ -30788,7 +36468,10 @@ index 0000000000..4ee776f955
+ for (j = 0; j < size; j+=4)
+ AV_WN4P(&POS(j, i), a);
+
-+ if (c_idx == 0 && size < 32) {
++// if (c_idx == 0 && size < 32)
++// As we now have separate fns for y & c - no need to test that
++ if (size < 32)
++ {
+ POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2;
+ for (x = 1; x < size; x++)
+ POS(x, 0) = (top[x] + 3 * dc + 2) >> 2;
@@ -30799,7 +36482,7 @@ index 0000000000..4ee776f955
+#else
+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
+ const uint8_t *_left,
-+ ptrdiff_t stride, int log2_size, int c_idx)
++ ptrdiff_t stride, int log2_size)
+{
+ unsigned int i, j;
+ const unsigned int size = (1 << log2_size);
@@ -30830,6 +36513,20 @@ index 0000000000..4ee776f955
+}
+#endif
+
++#define PRED_DC(size)\
++static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \
++ const uint8_t *left, ptrdiff_t stride) \
++{ \
++ FUNC(pred_dc)(src, top, left, stride, size + 2); \
++}
++
++PRED_DC(0)
++PRED_DC(1)
++PRED_DC(2)
++PRED_DC(3)
++
++#undef PRED_DC
++
+#ifndef ANGLE_CONSTS
+#define ANGLE_CONSTS
+static const int intra_pred_angle[] = {
@@ -30846,7 +36543,7 @@ index 0000000000..4ee776f955
+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
+ const uint8_t *_top,
+ const uint8_t *_left,
-+ ptrdiff_t stride, int c_idx,
++ ptrdiff_t stride,
+ int mode, int size)
+{
+ int x, y;
@@ -30889,10 +36586,12 @@ index 0000000000..4ee776f955
+ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
+ }
+ }
-+ if (mode == 26 && c_idx == 0 && size < 32) {
++// if (mode == 26 && c_idx == 0 && size < 32) {
++ if (mode == 26 && size < 32) {
+ for (y = 0; y < size; y++)
+ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1));
+ }
++
+ } else {
+ ref = left - 1;
+ if (angle < 0 && last < -1) {
@@ -30916,7 +36615,8 @@ index 0000000000..4ee776f955
+ POS(x, y) = ref[y + idx + 1];
+ }
+ }
-+ if (mode == 10 && c_idx == 0 && size < 32) {
++// if (mode == 10 && c_idx == 0 && size < 32) {
++ if (mode == 10 && size < 32) {
+ for (x = 0; x < size; x += 4) {
+ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - top[-1]) >> 1));
+ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - top[-1]) >> 1));
@@ -30925,12 +36625,61 @@ index 0000000000..4ee776f955
+ }
+ }
+ }
++
++
++
++#if BIT_DEPTH == 8 && 0
++ if ((size == 16 || size == 32) && mode != 10 && mode != 26) {
++ DECLARE_ALIGNED(16, uint8_t, a[64*32]);
++ void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++// void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++#if 1
++ src = (pixel *)_src;
++ printf("C: Mode=%d\n", mode);
++ for (y = 0; y < size; y++, src += stride)
++ {
++ printf("%2d: ", y);
++ for (x = 0; x < size; x++)
++ {
++ printf("%3x ", src[x]);
++ }
++ printf("\n");
++ }
++#endif
++// ff_hevc_rpi_pred_vertical_16_neon_8(a, _top, _left, size);
++ memset(a, 0, sizeof(a));
++// ff_hevc_rpi_pred_angular_32_neon_10(a, _top, _left, size, mode);
++ ff_hevc_rpi_pred_angular_16_neon_8(a, _top, _left, size, mode);
++#if 1
++ src = (pixel *)a;
++ printf("A:\n");
++ for (y = 0; y < size; y++, src += size)
++ {
++ printf("%2d: ", y);
++ for (x = 0; x < size; x++)
++ {
++ printf("%3x ", src[x]);
++ }
++ printf("\n");
++ }
++#endif
++ src = (pixel *)_src;
++ for (y = 0; y < size; y++, src += stride)
++ {
++ if (memcmp(src, a + size * sizeof(pixel) * y, size * sizeof(pixel)) != 0) {
++ printf("Fail at line %d\n", y);
++ av_assert0(0);
++ }
++ }
++ }
++#endif
++
+}
+#else
+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
+ const uint8_t *_top,
+ const uint8_t *_left,
-+ ptrdiff_t stride, int c_idx,
++ ptrdiff_t stride,
+ int mode, int size)
+{
+ int x, y;
@@ -31001,35 +36750,78 @@ index 0000000000..4ee776f955
+ }
+ }
+ }
++
++#if BIT_DEPTH == 10 && 0
++ if (size == 16 && mode != 10 && mode != 26) {
++ DECLARE_ALIGNED(16, uint8_t, a[64*32]);
++// void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++ void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++ src = (c_dst_ptr_t)_src;
++ printf("C: mode=%d\n", mode);
++ for (y = 0; y < size; y++, src += stride)
++ {
++ for (x = 0; x < size; x++)
++ {
++ printf("%3x:%3x ", src[x][0], src[x][1]);
++ }
++ printf("\n");
++ }
++
++ memset(a, 0, sizeof(a));
++ ff_hevc_rpi_pred_angular_c_16_neon_10(a, _top, _left, size, mode);
++
++ src = (c_dst_ptr_t)a;
++ printf("A:\n");
++ for (y = 0; y < size; y++, src += size)
++ {
++ for (x = 0; x < size; x++)
++ {
++ printf("%3x:%3x ", src[x][0], src[x][1]);
++ }
++ printf("\n");
++ }
++
++ src = (c_dst_ptr_t)_src;
++ for (y = 0; y < size; y++, src += stride)
++ {
++ if (memcmp(src, a + size * sizeof(pixel) * y, size * sizeof(pixel)) != 0) {
++ printf("Fail at line %d\n", y);
++ av_assert0(0);
++ }
++ }
++
++ }
++#endif
+}
+#endif
+
+static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
+ const uint8_t *left,
-+ ptrdiff_t stride, int c_idx, int mode)
++ ptrdiff_t stride, int mode)
+{
-+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 2);
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2);
+}
+
+static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top,
+ const uint8_t *left,
-+ ptrdiff_t stride, int c_idx, int mode)
++ ptrdiff_t stride, int mode)
+{
-+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 3);
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3);
+}
+
+static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top,
+ const uint8_t *left,
-+ ptrdiff_t stride, int c_idx, int mode)
++ ptrdiff_t stride, int mode)
+{
-+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 4);
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4);
+}
+
+static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
+ const uint8_t *left,
-+ ptrdiff_t stride, int c_idx, int mode)
++ ptrdiff_t stride, int mode)
+{
-+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5);
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5);
+}
+
+#undef cpel