diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index 057d1692fe..4d9a4a6157 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -1,19 +1,20 @@ ################################################################################ -# This file is part of OpenELEC - http://www.openelec.tv +# This file is part of LibreELEC - https://libreelec.tv +# Copyright (C) 2017-present Team LibreELEC # Copyright (C) 2009-2016 Stephan Raue (stephan@openelec.tv) # -# OpenELEC is free software: you can redistribute it and/or modify +# LibreELEC is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 2 of the License, or # (at your option) any later version. # -# OpenELEC is distributed in the hope that it will be useful, +# LibreELEC is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with OpenELEC. If not, see . +# along with LibreELEC. If not, see . ################################################################################ PKG_NAME="ffmpeg" diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index ef2f4d7d62..91ea9da3dd 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -582,18 +582,19 @@ index 4d4ef530e4..fba8776c9f 100644 { const AVCodec *p, *experimental = NULL; diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index e656011c3c..69cd820f06 100644 +index e656011c3c..70c3f026b8 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile -@@ -40,6 +40,7 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ +@@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ arm/sbrdsp_init_arm.o OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o -+OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o ++OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \ ++ arm/rpi_hevcpred_init_arm.o OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o -@@ -136,10 +137,18 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ +@@ -136,10 +138,23 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ @@ -608,7 +609,12 @@ index e656011c3c..69cd820f06 100644 + arm/rpi_hevcdsp_idct_neon.o \ + arm/rpi_hevcdsp_res8_neon.o \ + arm/rpi_hevcdsp_res16_neon.o \ -+ arm/rpi_hevcdsp_sao_neon.o ++ arm/rpi_hevcdsp_sao_neon.o \ ++ arm/rpi_hevcpred_init_neon.o \ ++ arm/rpi_hevcpred_intra_angular_neon.o \ ++ arm/rpi_hevcpred_intra_dc_neon.o \ ++ arm/rpi_hevcpred_intra_hv_neon.o \ ++ arm/rpi_hevcpred_intra_planar_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ 
arm/rv40dsp_neon.o @@ -1738,10 +1744,10 @@ index 0000000000..62b9326532 +#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */ diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S new file mode 100644 -index 0000000000..e665bd848a +index 0000000000..f75c82671e --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S -@@ -0,0 +1,1249 @@ +@@ -0,0 +1,1593 @@ +/* + * Copyright (c) 2014 Seppo Tomperi + * @@ -1766,65 +1772,72 @@ index 0000000000..e665bd848a +#include "libavutil/arm/asm.S" +#include "neon.S" + -+.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a ++.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8 + vsubl.u8 q0, \Q0a, \P0a -+ vsubl.u8 q2, \P1a, \Q1a -+ vshl.i16 q0, #2 -+ vadd.i16 q0, q2 ++ vsubl.u8 q1, \P1a, \Q1a + vdup.16 d4, r2 -+ -+ vrshr.s16 q0, #3 ++ \I1 ++ vshl.i16 q0, #2 ++ \I2 ++ vadd.i16 q0, q1 ++ \I3 + vmovl.u8 q2, d4 -+ ++ \I4 ++ vneg.s16 q1, q2 ++ \I5 ++ vrshr.s16 q0, #3 ++ \I6 ++ \I7 ++ \I8 + vmin.s16 q0, q2 -+ vneg.s16 q2, q2 -+ vmax.s16 q0, q2 -+ vaddw.u8 q2, q0, \P0a -+ -+ vqmovun.s16 \P0a, q2 + vmovl.u8 q2, \Q0a -+ vsub.i16 q2, q0 -+ -+ vqmovun.s16 \Q0a, q2 ++ vmax.s16 q0, q1 ++ vaddw.u8 q1, q0, \P0a ++ vsub.i16 q0, q2, q0 ++ vqmovun.s16 \P0a, q1 ++ vqmovun.s16 \Q0a, q0 +.endm + + -+.macro hevc_loop_filter_uv_body2 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v -+ vsubl.u8 q0, \Q0u, \P0u -+ vsubl.u8 q1, \Q0v, \P0v -+ vsubl.u8 q2, \P1u, \Q1u -+ vsubl.u8 q3, \P1v, \Q1v -+ vshl.i16 q0, #2 -+ vshl.i16 q1, #2 -+ vadd.i16 q0, q2 -+ vdup.16 d4, r2 -+ lsr r2, #16 -+ vadd.i16 q1, q3 -+ -+ vrshr.s16 q0, #3 -+ vdup.16 d6, r2 -+ vmovl.u8 q2, d4 -+ vmovl.u8 q3, d6 -+ vrshr.s16 q1, #3 -+ ++.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7 ++ vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a ++ lsr r12, r2, #16 ++ vsubl.u8 q1, \Q0b, \P0b @ q0b - p0b ++ vsubl.u8 q2, \P1a, \Q1a @ p1a - q1a ++ vsubl.u8 q3, \P1b, \Q1b @ p1b - q1b ++ vshl.i16 q0, #2 @ (q0a - p0a) * 4 ++ vshl.i16 q1, #2 @ (q0b - p0b) * 4 ++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a ++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b ++ vdup.16 d4, r2 @ tc0a, tc0b ++ vdup.16 d6, r12 @ tc1a, tc1b ++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 ++ \I1 ++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 ++ \I2 ++ vmovl.u8 q2, d4 @ tc0a, tc0b ++ \I3 ++ vmovl.u8 q3, d6 @ tc1a, tc1b ++ \I4 + vmin.s16 q0, q2 -+ vneg.s16 q2, q2 ++ \I5 ++ vneg.s16 q2, q2 @ -tc0a, -tc0b ++ \I6 + vmin.s16 q1, q3 -+ vneg.s16 q3, q3 -+ vmax.s16 q0, q2 -+ vaddw.u8 q2, q0, \P0u -+ vmax.s16 q1, q3 -+ vaddw.u8 q3, q1, \P0v -+ -+ vqmovun.s16 \P0u, q2 -+ vmovl.u8 q2, \Q0u -+ vqmovun.s16 \P0v, q3 -+ vmovl.u8 q3, \Q0v -+ vsub.i16 q2, q0 -+ vsub.i16 q3, q1 -+ -+ vqmovun.s16 \Q0u, q2 -+ vqmovun.s16 \Q0v, q3 ++ \I7 ++ vneg.s16 q3, q3 @ -tc1a, -tc1b ++ vmax.s16 q0, q2 @ delta0a ++ vmovl.u8 q2, \Q0a ++ vmax.s16 q1, q3 @ delta0b ++ vaddw.u8 q3, q0, \P0a @ p0a + delta0a ++ vsub.i16 q0, q2, q0 @ q0a - delta0a ++ vmovl.u8 q2, \Q0b ++ vsub.i16 q2, q1 @ q0b - delta0b ++ vaddw.u8 q1, \P0b @ p0b + delta0b ++ vqmovun.s16 \Q0a, q0 ++ vqmovun.s16 \P0a, q3 ++ vqmovun.s16 \Q0b, q2 ++ vqmovun.s16 \P0b, q1 +.endm + + @@ -1835,33 +1848,36 @@ index 0000000000..e665bd848a +@ [0..7] tc U a +@ [8..15] tc V a + -+.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth ++.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8 + vsub.i16 q0, \Q0a, \P0a -+ vsub.i16 q2, \P1a, \Q1a -+ vshl.i16 q0, #2 -+ 
vadd.i16 q0, q2 -+ vrshr.s16 q0, #3 -+ ++ vsub.i16 q1, \P1a, \Q1a + vdup.16 d4, r2 ++ \I1 ++ vshl.i16 q0, #2 ++ \I2 ++ vadd.i16 q0, q1 ++ \I3 + vshll.u8 q2, d4, #\bit_depth - 8 -+ -+ movw r2, #(1 << \bit_depth) - 1 ++ \I4 ++ vneg.s16 q1, q2 ++ \I5 ++ vrshr.s16 q0, #3 ++ \I6 ++ \I7 ++ \I8 + vmin.s16 q0, q2 -+ vneg.s16 q2, q2 -+ vmax.s16 q0, q2 -+ vmov.i64 q2, #0 -+ vdup.i16 q3, r2 ++ vmov.i16 q2, #0 ++ vmax.s16 q0, q1 + vadd.i16 \P0a, q0 + vsub.i16 \Q0a, q0 -+ ++ vmov.i16 q1, #(1 << \bit_depth) - 1 + vmax.s16 \P0a, q2 + vmax.s16 \Q0a, q2 -+ vmin.s16 \P0a, q3 -+ vmin.s16 \Q0a, q3 ++ vmin.s16 \P0a, q1 ++ vmin.s16 \Q0a, q1 +.endm + -+@ Preserves r12 -+@ Clobbers r2 ++@ Clobbers r2, r12 +@ P0a et al all contain UVUVUVUV +@ r2 (tc4) contains +@ [0..7] tc U a @@ -1869,38 +1885,41 @@ index 0000000000..e665bd848a +@ [16..23] tc U b +@ [24..31] tc V b + -+.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth -+ vsub.i16 q0, \Q0a, \P0a -+ vsub.i16 q1, \Q0b, \P0b -+ vsub.i16 q2, \P1a, \Q1a -+ vsub.i16 q3, \P1b, \Q1b -+ vshl.i16 q0, #2 -+ vshl.i16 q1, #2 -+ vadd.i16 q0, q2 -+ vrshr.s16 q0, #3 -+ vadd.i16 q1, q3 -+ vrshr.s16 q1, #3 -+ -+ vdup.16 d4, r2 -+ lsr r2, #16 -+ vdup.16 d6, r2 -+ vshll.u8 q2, d4, #\bit_depth - 8 -+ vshll.u8 q3, d6, #\bit_depth - 8 -+ -+ movw r2, #(1 << \bit_depth) - 1 ++.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7 ++ vsub.i16 q0, \Q0a, \P0a @ q0a - p0a ++ lsr r12, r2, #16 ++ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b ++ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a ++ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b ++ vshl.i16 q0, #2 @ (q0a - p0a) * 4 ++ vshl.i16 q1, #2 @ (q0b - p0b) * 4 ++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a ++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b ++ vdup.16 d4, r2 @ tc0a, tc0b ++ vdup.16 d6, r12 @ tc1a, tc1b ++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 ++ \I1 ++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 ++ \I2 ++ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b ++ \I3 ++ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b ++ \I4 + vmin.s16 q0, q2 -+ vneg.s16 q2, q2 ++ \I5 ++ vneg.s16 q2, q2 @ -tc0a, -tc0b ++ \I6 + vmin.s16 q1, q3 -+ vneg.s16 q3, q3 -+ vmax.s16 q0, q2 -+ vmov.i64 q2, #0 -+ vmax.s16 q1, q3 -+ vdup.i16 q3, r2 -+ vadd.i16 \P0a, q0 -+ vsub.i16 \Q0a, q0 -+ vadd.i16 \P0b, q1 -+ vsub.i16 \Q0b, q1 -+ ++ \I7 ++ vneg.s16 q3, q3 @ -tc1a, -tc1b ++ vmax.s16 q0, q2 @ delta0a ++ vadd.i16 \P0a, q0 @ p0a + delta0a ++ vsub.i16 \Q0a, q0 @ q0a - delta0a ++ vmax.s16 q1, q3 @ delta0b ++ vadd.i16 \P0b, q1 @ p0b + delta0b ++ vsub.i16 \Q0b, q1 @ q0b - delta0b ++ vmov.i16 q2, #0 ++ vmov.i16 q3, #(1 << \bit_depth) - 1 + vmax.s16 \P0a, q2 + vmax.s16 \Q0a, q2 + vmax.s16 \P0b, q2 @@ -1923,11 +1942,10 @@ index 0000000000..e665bd848a + it eq + bxeq lr + push {r4-r10,lr} @ 32 bytes -+ ldr r5, [sp, #32] @ &_no_p -+ ldrb r10, [r5] -+ ldr r5, [sp, #36] @ &_no_q ++ ldrd r4, r5, [sp, #32] @ &_no_p ++ ldrb r4, [r4] + ldrb r5, [r5] -+ cmp r10, #0 ++ movs r10, r4 + it ne + movne r10, #1 + cmp r5, #0 @@ -1950,244 +1968,207 @@ index 0000000000..e665bd848a +@ Junks: +@ r5, r6, r7, r8, r9 + -+.macro m_filter_luma bit_depth ++.macro m_filter_luma bit_depth, Q11, Q15 +.if \bit_depth == 8 -+ vmovl.u8 q15, d23 -+ vmovl.u8 q14, d22 -+ vmovl.u8 q13, d21 -+ vmovl.u8 q12, d20 -+ vmovl.u8 q11, d19 -+ vmovl.u8 q10, d18 -+ vmovl.u8 q9, d17 -+ vmovl.u8 q8, d16 ++ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2 ++ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... 
Q1' TQ1 ... Q1 ++ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0 ++ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0 ++ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1 ++ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2 +.endif -+ vadd.i16 q7, q9, q11 ++ vadd.i16 q0, q9, \Q11 @ P2 + P0 +.if \bit_depth > 8 -+ lsl r2, r2, #(\bit_depth - 8) ++ lsl r3, r3, #(\bit_depth - 8) +.endif -+ vadd.i16 q6, q14, q12 ++ vadd.i16 q1, q14, q12 @ Q2 + Q0 +.if \bit_depth > 8 -+ lsl r3, r3, #(\bit_depth - 8) ++ lsl r2, r2, #(\bit_depth - 8) +.endif -+ vsub.i16 q7, q10 -+ vsub.i16 q6, q13 -+ vabd.s16 q7, q7, q10 -+ vabd.s16 q6, q6, q13 ++ vsub.i16 q0, q10 @ P2 - P1 + P0 ++ lsr r5, r3, #16 ++ vsub.i16 q1, q13 @ Q2 - Q1 + Q0 ++.if \bit_depth == 8 ++ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3 ++ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3 ++.endif ++ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0) ++ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0) ++ vmov.i64 q2, #0xffffffff0000 ++ vbic q0, q2 @ only dp0(') and dp3(') ++ vbic q1, q2 @ only dq0(') and dq3(') ++ vsra.u64 q0, #16 ++ vsra.u64 q1, #16 ++ vdup.16 q3, r2 @ beta ++ vdup.16 d14, r3 @ tC[0] ++ vdup.16 d15, r5 @ tC[1] ++ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0) ++ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0 ++ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0 ++ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0 ++ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0) ++ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0) ++ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3 ++ vshl.s16 q6, q7, #2 @ tC[] * 4 ++ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1 ++ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta) ++ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block) ++ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3 ++ cmp r7, #0 ++ beq .Lbypasswrite + -+ vdup.16 q0, r2 -+ vmov q4, q7 -+ vmov q5, q6 -+ vdup.16 d4, r3 -+ lsr r3, r3, #16 -+ vtrn.16 q7, q4 -+ vtrn.16 q6, q5 ++ vcgt.s16 q5, q6, q5 @ if < tc25 ++ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3) ++ vand q4, q5 ++ vbic d8, d4 ++ vbic d9, d4 ++ vshr.s16 q3, #2 @ beta_2 = beta >> 2 ++ vsra.u64 q4, #16 ++ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1 ++ vshl.i16 q7, #1 @ tc2 = tC[] << 1 ++ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc ++ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half ++ vand d6, d8 @ && beta_2 tests, prime in ms half ++ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3 ++ vneg.s16 q6, q7 @ -tc2 ++ vmovn.i32 d8, q3 ++ vshrn.i32 d6, q3, #16 ++ vand d6, d8 ++ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3 ++ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block) ++ vadd.i16 q0, \Q11, q12 @ p0 + q0 ++ ands r9, r7, r8 ++ beq 1f + -+ vshl.u64 q7, #32 -+ vshr.u64 q4, #32 -+ vshl.u64 q6, #32 -+ vshr.u64 q5, #32 -+ vshr.u64 q7, #32 -+ vshr.u64 q6, #32 -+ vshl.u64 q5, #32 -+ vshl.u64 q4, #32 -+ vorr q6, q5 -+ vorr q7, q4 -+ vdup.16 d5, r3 -+ vadd.i16 q5, q7, q6 -+ -+ vmov q4, q5 -+ vmov q3, q5 -+ vtrn.32 q3, q4 -+ -+ vadd.i16 q4, q3 -+ -+ vshl.s16 q5, q5, #1 -+ vcgt.s16 q3, q0, q4 -+ -+ vmovn.i16 d6, q3 -+ vshr.s16 q1, q0, #2 -+ vmovn.i16 d6, q3 -+ vcgt.s16 q5, q1, q5 -+ vmov r7, s12 -+ cmp r7, #0 -+ beq .Lbypasswrite -+ -+ vpadd.i32 d0, d14, d12 -+ vpadd.i32 d1, d15, d13 -+ vmov q4, q2 -+ vshl.s16 q2, #2 -+ vshr.s16 q1, q1, 
#1 -+ vrhadd.s16 q2, q4 -+ -+ vabd.s16 q7, q8, q11 -+ vaba.s16 q7, q15, q12 -+ -+ vmovn.i32 d0, q0 -+ vmov r5, r6, s0, s1 -+ vcgt.s16 q6, q1, q7 -+ vand q5, q5, q6 -+ vabd.s16 q7, q11, q12 -+ vcgt.s16 q6, q2, q7 -+ vand q5, q5, q6 -+ -+ vmov q2, q5 -+ vtrn.s16 q5, q2 -+ vshr.u64 q2, #32 -+ vshl.u64 q5, #32 -+ vshl.u64 q2, #32 -+ vshr.u64 q5, #32 -+ vorr q5, q2 -+ -+ vmov q2, q5 -+ vshl.i16 q7, q4, #1 -+ vtrn.32 q2, q5 -+ vand q5, q2 -+ vneg.s16 q6, q7 -+ vmovn.i16 d4, q5 -+ vmovn.i16 d4, q2 -+ vmov r8, s8 -+ -+ and r9, r8, r7 -+ cmp r9, #0 -+ beq 1f -+ -+ vadd.i16 q2, q11, q12 -+ vadd.i16 q4, q9, q8 -+ vadd.i16 q1, q2, q10 -+ vdup.16 d10, r9 -+ vadd.i16 q0, q1, q9 -+ vshl.i16 q4, #1 -+ lsr r9, #16 -+ vadd.i16 q1, q0 -+ vrshr.s16 q3, q0, #2 -+ vadd.i16 q1, q13 -+ vadd.i16 q4, q0 -+ vsub.i16 q3, q10 -+ vrshr.s16 q1, #3 -+ vrshr.s16 q4, #3 -+ vmax.s16 q3, q6 -+ vsub.i16 q1, q11 -+ vsub.i16 q4, q9 -+ vmin.s16 q3, q7 -+ vmax.s16 q4, q6 -+ vmax.s16 q1, q6 -+ vadd.i16 q3, q10 -+ vmin.s16 q4, q7 -+ vmin.s16 q1, q7 -+ vdup.16 d11, r9 -+ vadd.i16 q4, q9 -+ vadd.i16 q1, q11 -+ vbit q9, q4, q5 -+ vadd.i16 q4, q2, q13 -+ vbit q11, q1, q5 -+ vadd.i16 q0, q4, q14 -+ vadd.i16 q2, q15, q14 -+ vadd.i16 q4, q0 -+ -+ vshl.i16 q2, #1 -+ vadd.i16 q4, q10 -+ vbit q10, q3, q5 -+ vrshr.s16 q4, #3 -+ vadd.i16 q2, q0 -+ vrshr.s16 q3, q0, #2 -+ vsub.i16 q4, q12 -+ vrshr.s16 q2, #3 -+ vsub.i16 q3, q13 -+ vmax.s16 q4, q6 -+ vsub.i16 q2, q14 -+ vmax.s16 q3, q6 -+ vmin.s16 q4, q7 -+ vmax.s16 q2, q6 -+ vmin.s16 q3, q7 -+ vadd.i16 q4, q12 -+ vmin.s16 q2, q7 -+ vadd.i16 q3, q13 -+ vbit q12, q4, q5 -+ vadd.i16 q2, q14 -+ vbit q13, q3, q5 -+ vbit q14, q2, q5 ++ vadd.i16 q2, q0, q10 @ p1 + p0 + q0 ++ vadd.i16 q3, q0, q13 @ p0 + q0 + q1 ++ lsr r3, r9, #16 ++ vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping) ++ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping) ++ vadd.i16 q0, q8, q9 @ p3 + p2 ++ vadd.i16 q5, \Q15, q14 @ q2 + q3 ++ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 ++ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2 ++ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2 ++ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3 ++ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping) ++ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping) ++ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping) ++ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping) ++ vrshr.s16 q0, #3 @ scale, with rounding ++ vrshr.s16 q5, #3 ++ vrshr.s16 q1, #2 ++ vrshr.s16 q4, #2 ++ vrshr.s16 q2, #3 ++ vrshr.s16 q3, #3 ++ vsub.i16 q0, q9 @ find difference ++ vsub.i16 q5, q14 ++ vsub.i16 q1, q10 ++ vsub.i16 q4, q13 ++ vsub.i16 q2, \Q11 ++ vsub.i16 q3, q12 ++ vmax.s16 q0, q6 @ clip difference to -tc2 .. 
tc2 ++ vmax.s16 q5, q6 ++ vmax.s16 q1, q6 ++ vmax.s16 q4, q6 ++ vmax.s16 q2, q6 ++ vmax.s16 q3, q6 ++ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure ++ vdup.16 d13, r3 ++ vmin.s16 q0, q7 ++ vmin.s16 q5, q7 ++ vmin.s16 q1, q7 ++ vmin.s16 q4, q7 ++ vmin.s16 q2, q7 ++ vmin.s16 q3, q7 ++ vadd.i16 q0, q9 @ apply difference ++ vadd.i16 q5, q14 ++ vadd.i16 q1, q10 ++ vadd.i16 q4, q13 ++ vadd.i16 q2, \Q11 ++ vadd.i16 q3, q12 ++ vbit q9, q0, q6 @ apply filtered values according to mask ++ vbit q14, q5, q6 ++ vbit q10, q1, q6 ++ vbit q13, q4, q6 ++ vbit \Q11, q2, q6 ++ vbit q12, q3, q6 ++ vneg.s16 q6, q7 @ restore -tc2 + +1: -+ mvn r8, r8 -+ and r9, r8, r7 -+ cmp r9, #0 -+ beq 2f ++ bics r9, r7, r8 ++ beq 2f + -+ vdup.16 q4, r2 -+ -+ vdup.16 d10, r9 -+ lsr r9, #16 -+ vmov q1, q4 -+ vdup.16 d11, r9 -+ vshr.s16 q1, #1 -+ vsub.i16 q2, q12, q11 -+ vadd.i16 q4, q1 -+ vshl.s16 q0, q2, #3 -+ vshr.s16 q4, #3 -+ vadd.i16 q2, q0 -+ vsub.i16 q0, q13, q10 -+ vsub.i16 q2, q0 -+ vshl.i16 q0, q0, #1 -+ vsub.i16 q2, q0 -+ vshl.s16 q1, q7, 2 -+ vrshr.s16 q2, q2, #4 -+ vadd.i16 q1, q7 -+ vabs.s16 q3, q2 -+ vshr.s16 q6, q6, #1 -+ vcgt.s16 q1, q1, q3 -+ vand q5, q1 -+ vshr.s16 q7, q7, #1 -+ vmax.s16 q2, q2, q6 -+ vmin.s16 q2, q2, q7 -+ -+ vshr.s16 q7, q7, #1 -+ vrhadd.s16 q3, q9, q11 -+ vneg.s16 q6, q7 -+ vsub.s16 q3, q10 -+ vdup.16 d2, r5 -+ vhadd.s16 q3, q2 -+ vdup.16 d3, r6 -+ vmax.s16 q3, q3, q6 -+ vcgt.s16 q1, q4, q1 -+ vmin.s16 q3, q3, q7 -+ vand q1, q5 -+ vadd.i16 q3, q10 -+ lsr r5, #16 -+ lsr r6, #16 -+ vbit q10, q3, q1 -+ -+ vrhadd.s16 q3, q14, q12 -+ vdup.16 d2, r5 -+ vsub.s16 q3, q13 -+ vdup.16 d3, r6 -+ vhsub.s16 q3, q2 -+ vcgt.s16 q1, q4, q1 -+ vmax.s16 q3, q3, q6 -+ vand q1, q5 -+ vmin.s16 q3, q3, q7 -+ vadd.i16 q3, q13 -+ vbit q13, q3, q1 -+ vadd.i16 q0, q11, q2 -+ vsub.i16 q4, q12, q2 -+ vbit q11, q0, q5 -+ vbit q12, q4, q5 ++ vsub.i16 q0, q12, \Q11 @ q0 - p0 ++ vsub.i16 q1, q13, q10 @ q1 - p1 ++ lsr r3, r9, #16 ++ vshl.i16 q2, q0, #3 ++ lsr r7, r5, #16 ++ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0) ++ lsr r8, r6, #16 ++ vshl.i16 q2, q1, #1 ++ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1) ++ vshr.s16 q6, #1 @ -tc = -tc2 >> 1 ++ vsub.i16 q5, q3, q4 ++ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1 ++ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1 ++ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4 ++ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1 ++ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1 ++ vmax.s16 q6, q5 @ ++ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1 ++ vdup.16 q0, r2 @ beta ++ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc] ++ vshr.s16 q4, #1 @ tc_2 = tc >> 1 ++ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1 ++ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1 ++ vshr.s16 q2, q0, #1 @ beta >> 1 ++ vadd.i16 q2, q0 @ beta + (beta >> 1) ++ vneg.s16 q0, q4 @ -tc_2 ++ vabs.s16 q5, q5 @ abs(original delta0) ++ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3 ++ vmax.s16 q1, q0 ++ vmax.s16 q3, q0 ++ vshl.s16 q0, q7, #2 @ 8 * tc ++ vadd.i16 q7, q0 @ 10 * tc ++ vdup.16 d0, r9 ++ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering ++ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2) ++ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 + delta0) >> 1, -tc_2, tc_2) ++ vdup.16 d8, r5 @ dp0 + dp3 ++ vdup.16 d9, r7 @ dp0' + dp3' ++ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0)) ++ vdup.16 d10, r6 @ dq0 + dq3 ++ vdup.16 d11, r8 @ dq0' + dq3' ++ vand q7, q0 @ AND block and line masks ++ vcgt.s16 q4, q2, q4 @ if (((beta + (beta 
>> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1) ++ vadd.i16 q0, q1, q10 @ p1 + deltap1 ++ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1) ++ vadd.i16 q3, q3, q13 @ q1 + deltaq1 ++ vadd.i16 q1, \Q11, q6 @ p0 + delta0 ++ vsub.i16 q2, q12, q6 @ q0 - delta0 ++ vand q4, q7 @ AND nd_p test with block/line masks ++ vand q5, q7 @ AND nd_q test with block/line masks ++ vbit q10, q0, q4 ++ vbit \Q11, q1, q7 ++ vbit q12, q2, q7 ++ vbit q13, q3, q5 + +2: +.if \bit_depth == 8 ++ vmovn.i16 d16, q8 ++ vmovn.i16 d23, \Q15 + neg r1, r1 -+ vqmovun.s16 d16, q8 + vqmovun.s16 d17, q9 + vqmovun.s16 d18, q10 -+ vqmovun.s16 d19, q11 ++ vqmovun.s16 d19, \Q11 + lsls r10, #31 + vqmovun.s16 d20, q12 + vqmovun.s16 d21, q13 + vqmovun.s16 d22, q14 -+ vqmovun.s16 d23, q15 +.else -+ movw r5, #(1 << \bit_depth - 1) -+ vmov.i64 q0, #0 -+ vdup.i16 q1, r5 ++ vmov.i16 q0, #0 ++ vmov.i16 q1, #(1 << \bit_depth - 1) + @ q8 & q15 should be unaltered and so don't require clipping + neg r1, r1 + vmax.s16 q9, q0 @@ -2204,14 +2185,14 @@ index 0000000000..e665bd848a + vmin.s16 q13, q1 + vmin.s16 q14, q1 +.endif -+ mov pc, lr ++ bx lr +.endm + +function hevc_loop_filter_luma_body -+ m_filter_luma 8 ++ m_filter_luma 8, q15, q11 +endfunc + -+@ void ff_hevc_rpi_v_loop_filter_luma_neon( ++@ void ff_hevc_rpi_v_loop_filter_luma_neon_8( +@ uint8_t *_pix, [r0] +@ ptrdiff_t _stride, [r1] +@ int _beta, [r2] @@ -2219,7 +2200,7 @@ index 0000000000..e665bd848a +@ uint8_t *_no_p, [sp+0] +@ uint8_t *_no_q) [sp+4] + -+function ff_hevc_rpi_v_loop_filter_luma_neon, export=1 ++function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1 + hevc_loop_filter_luma_start + + sub r4, r0, #4 @@ -2245,66 +2226,72 @@ index 0000000000..e665bd848a +.Lv_loop_luma_common: + vpush {d8-d15} + -+ @ Uses slightly fewer instructions to do laned loads than unlaned -+ @ and transpose. This also means that we can use the same code for -+ @ both split & unsplit deblock -+ vld4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32], r1 -+ vld4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32], r1 -+ -+ vld4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 -+ vld4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 -+ -+ vld4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 -+ vld4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 -+ -+ vld4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 -+ vld4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 -+ -+ vld4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 -+ vld4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 -+ -+ vld4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 -+ vld4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 -+ -+ vld4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 -+ vld4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 -+ -+ vld4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32] -+ vld4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32] ++ @ It's slightly faster to do unlaned loads and transpose in the ++ @ 8-bit case, even though it needs more instructions, because ++ @ VLD4.8 is a really slow way to read from memory. 
++ vld1.32 {d16[0]}, [r4:32], r1 ++ vld1.32 {d20[0]}, [r0:32], r1 ++ vld1.32 {d16[1]}, [r4:32], r1 ++ vld1.32 {d20[1]}, [r0:32], r1 ++ vld1.32 {d17[0]}, [r4:32], r1 ++ vld1.32 {d21[0]}, [r0:32], r1 ++ vld1.32 {d17[1]}, [r4:32], r1 ++ vld1.32 {d21[1]}, [r0:32], r1 ++ vld1.32 {d18[0]}, [r4:32], r1 ++ vld1.32 {d22[0]}, [r0:32], r1 ++ vld1.32 {d18[1]}, [r4:32], r1 ++ vld1.32 {d22[1]}, [r0:32], r1 ++ vld1.32 {d19[0]}, [r4:32], r1 ++ vld1.32 {d23[0]}, [r0:32], r1 ++ vld1.32 {d19[1]}, [r4:32] ++ vld1.32 {d23[1]}, [r0:32] ++ vuzp.16 q8, q9 ++ vuzp.16 q10, q11 ++ vuzp.8 q8, q9 ++ vuzp.8 q10, q11 ++ vswp d17, d18 ++ vswp d21, d22 + + bl hevc_loop_filter_luma_body + ++ add r6, r4, r1 ++ add r2, r0, r1 ++ lsl r1, #1 ++ ++ vpop {d8-d15} ++ + @ no_p[1] + bmi 1f + vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 -+ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 ++ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1 + vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 -+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1 + + vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 -+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1 + vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 -+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32] ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32] +1: + @ no_q[1] -+@ tst r10, #2 + bcs 1f + vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 -+ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 ++ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1 + vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 -+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1 + + vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 -+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 + vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 -+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32] ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] +1: ++ pop {r4-r10,pc} ++ +.Lbypasswrite: + vpop {d8-d15} + pop {r4-r10,pc} +endfunc + -+.macro m_filter_v_luma_common_16 bit_depth ++.macro m_filter_v_luma_16 bit_depth + vpush {d8-d15} + + @ Uses slightly fewer instructions to do laned loads than unlaned @@ -2336,29 +2323,34 @@ index 0000000000..e665bd848a + + bl hevc_loop_filter_luma_body_\bit_depth + ++ add r6, r4, r1 ++ add r2, r0, r1 ++ lsl r1, #1 ++ ++ vpop {d8-d15} ++ + @ p[1] + bmi 1f + vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 -+ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1 + vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 -+ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 ++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1 + vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 -+ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 ++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1 + vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 -+ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4] ++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6] +1: + @ q[1] + bcs 1f + vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 -+ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1 + vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 -+ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1 + vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 -+ vst4.16 {d24[2], 
d26[2], d28[2], d30[2]}, [r0], r1 ++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 + vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 -+ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0] ++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] +1: -+ vpop {d8-d15} + pop {r4-r10,pc} +.endm + @@ -2374,7 +2366,7 @@ index 0000000000..e665bd848a +@ +@ Src should always be on 8 byte boundry & all in the same slice + -+function ff_hevc_rpi_h_loop_filter_luma_neon, export=1 ++function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1 + hevc_loop_filter_luma_start + b .Lh_loop_filter_luma_common_8 +endfunc @@ -2387,71 +2379,75 @@ index 0000000000..e665bd848a + ldr r10, [sp, #32] + +.Lh_loop_filter_luma_common_8: ++ sub r4, r0, r1, lsl #2 ++ add r0, r4, r1 ++ lsl r1, #1 + vpush {d8-d15} -+ sub r0, r0, r1, lsl #2 + -+ vld1.8 {d16}, [r0], r1 ++ vld1.8 {d16}, [r4], r1 + vld1.8 {d17}, [r0], r1 -+ vld1.8 {d18}, [r0], r1 ++ vld1.8 {d18}, [r4], r1 + vld1.8 {d19}, [r0], r1 -+ vld1.8 {d20}, [r0], r1 ++ vld1.8 {d20}, [r4], r1 + vld1.8 {d21}, [r0], r1 -+ vld1.8 {d22}, [r0], r1 ++ vld1.8 {d22}, [r4] + vld1.8 {d23}, [r0] + + bl hevc_loop_filter_luma_body + -+ add r2, r0, r1, lsl #2 -+ add r0, r0, r1 -+ ++ add r0, r0, r1, lsl #1 ++ add r2, r4, r1, lsl #1 ++ add r6, r4, r1, asr #1 + vpop {d8-d15} + + @ P2-P0 + bcs 1f -+ vst1.8 {d22}, [r0], r1 -+ vst1.8 {d21}, [r0], r1 -+ vst1.8 {d20}, [r0] ++ vst1.8 {d22}, [r4], r1 ++ vst1.8 {d21}, [r6] ++ vst1.8 {d20}, [r4] +1: + @ Q0-Q2 + bmi 1f -+ vst1.8 {d19}, [r2], r1 -+ vst1.8 {d18}, [r2], r1 -+ vst1.8 {d17}, [r2] ++ vst1.8 {d19}, [r0], r1 ++ vst1.8 {d18}, [r2] ++ vst1.8 {d17}, [r0] +1: + pop {r4-r10,pc} +endfunc + + +.macro m_filter_h_luma_16 bit_depth ++ sub r4, r0, r1, lsl #2 ++ add r0, r4, r1 ++ lsl r1, #1 + vpush {d8-d15} -+ sub r0, r0, r1, lsl #2 + -+ vld1.16 { q8}, [r0], r1 ++ vld1.16 { q8}, [r4], r1 + vld1.16 { q9}, [r0], r1 -+ vld1.16 {q10}, [r0], r1 ++ vld1.16 {q10}, [r4], r1 + vld1.16 {q11}, [r0], r1 -+ vld1.16 {q12}, [r0], r1 ++ vld1.16 {q12}, [r4], r1 + vld1.16 {q13}, [r0], r1 -+ vld1.16 {q14}, [r0], r1 ++ vld1.16 {q14}, [r4] + vld1.16 {q15}, [r0] + + bl hevc_loop_filter_luma_body_\bit_depth + -+ add r2, r0, r1, lsl #2 -+ add r0, r1 -+ ++ add r0, r0, r1, lsl #1 ++ add r2, r4, r1, lsl #1 ++ add r6, r4, r1, asr #1 + vpop {d8-d15} + + @ P2-P0 + bcs 1f -+ vst1.16 {q14}, [r0], r1 -+ vst1.16 {q13}, [r0], r1 -+ vst1.16 {q12}, [r0] ++ vst1.16 {q14}, [r4], r1 ++ vst1.16 {q13}, [r6] ++ vst1.16 {q12}, [r4] +1: + bmi 1f -+ vst1.16 {q11}, [r2], r1 -+ vst1.16 {q10}, [r2], r1 -+ vst1.16 { q9}, [r2] ++ vst1.16 {q11}, [r0], r1 ++ vst1.16 {q10}, [r2] ++ vst1.16 { q9}, [r0] +1: + pop {r4-r10,pc} +.endm @@ -2474,23 +2470,25 @@ index 0000000000..e665bd848a +@ common in the H direction than V due to how we arrange deblock. 
+ +function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1 ++ sub r12, r0, r1 + cmp r2, #0 + bxeq lr -+ sub r0, r0, r1, lsl #1 ++ vld1.8 {d26,d27}, [r0] ++ lsl r1, #1 ++ sub r0, r1 ++ vld1.8 {d18,d19}, [r12], r1 + vld1.8 {d16,d17}, [r0], r1 -+ vld1.8 {d18,d19}, [r0], r1 -+ vld1.8 {d26,d27}, [r0], r1 -+ vld1.8 {d28,d29}, [r0] -+ sub r0, r0, r1, lsl #1 -+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29 ++ vld1.8 {d28,d29}, [r12] ++ ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \ ++ "sub r12, r0, r1, asr #1" + -+ lsls r2, r3, #31 @ b0 -> N, b1 -> C -+ vstrpl d18, [r0, #0] -+ vstrcc d19, [r0, #8] -+ add r0, r1 + lsls r3, #29 @ b2 -> N, b3 -> C + vstrpl d26, [r0, #0] + vstrcc d27, [r0, #8] ++ lsls r3, #2 @ b0 -> N, b1 -> C ++ vstrpl d18, [r12, #0] ++ vstrcc d19, [r12, #8] + bx lr + +endfunc @@ -2506,37 +2504,38 @@ index 0000000000..e665bd848a +@ Macro here actual function near bottom + +.macro m_filter_h_uv_16 bit_depth ++ sub r12, r0, r1 + cmp r2, #0 + bxeq lr -+ sub r0, r0, r1, lsl #1 ++ vld1.16 {q12, q13}, [r0] ++ lsl r1, #1 ++ sub r0, r1 ++ vld1.16 {q10, q11}, [r12], r1 + vld1.16 {q8, q9 }, [r0], r1 -+ vld1.16 {q10, q11}, [r0], r1 -+ vld1.16 {q12, q13}, [r0], r1 -+ vld1.16 {q14, q15}, [r0] -+ sub r0, r0, r1, lsl #1 ++ vld1.16 {q14, q15}, [r12] + -+ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth ++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \ ++ "sub r12, r0, r1, asr #1", \ ++ "cmp r3, #0" + -+ cmp r3, #0 + bne 1f -+ vst1.16 {q10, q11}, [r0], r1 ++ vst1.16 {q10, q11}, [r12] + vst1.16 {q12, q13}, [r0] + bx lr + + @ At least one no_f bit is set + @ Which means we need to break this apart in an ugly fashion +1: -+ lsls r2, r3, #31 @ b0 -> N, b1 -> C -+ vstrpl d20, [r0, #0] -+ vstrpl d21, [r0, #8] -+ vstrcc d22, [r0, #16] -+ vstrcc d23, [r0, #24] -+ add r0, r1 + lsls r3, #29 @ b2 -> N, b3 -> C + vstrpl d24, [r0, #0] + vstrpl d25, [r0, #8] + vstrcc d26, [r0, #16] + vstrcc d27, [r0, #24] ++ lsls r3, #2 @ b0 -> N, b1 -> C ++ vstrpl d20, [r12, #0] ++ vstrpl d21, [r12, #8] ++ vstrcc d22, [r12, #16] ++ vstrcc d23, [r12, #24] + bx lr +.endm + @@ -2556,6 +2555,7 @@ index 0000000000..e665bd848a +function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1 + cmp r2, #0 + bxeq lr ++ push {lr} + vld2.16 {d16[0], d18[0]}, [r3], r1 + vld2.16 {d20[0], d22[0]}, [r0], r1 + @@ -2570,106 +2570,112 @@ index 0000000000..e665bd848a + vld2.16 {d20[3], d22[3]}, [r0], r1 + blo 10f + -+ sub r12, r0, r3 + vld2.16 {d17[0], d19[0]}, [r3], r1 + vld2.16 {d21[0], d23[0]}, [r0], r1 + -+ cmp r12, #4 ++ sub ip, r0, r3 + vld2.16 {d17[1], d19[1]}, [r3], r1 + vld2.16 {d21[1], d23[1]}, [r0], r1 + ++ cmp ip, #4 + vld2.16 {d17[2], d19[2]}, [r3], r1 + vld2.16 {d21[2], d23[2]}, [r0], r1 + + vld2.16 {d17[3], d19[3]}, [r3] + vld2.16 {d21[3], d23[3]}, [r0] -+ it eq -+ ldreq r12, [sp, #0] + -+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 -+ cmp r12, #0 -+ add r3, #2 -+ neg r1, r1 ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \ ++ "ldr lr, [sp, #4]", \ ++ "neg r1, r1", \ ++ "it eq; cmpeq lr, #0", \ ++ "add r3, #2", \ ++ "add ip, r3, r1", \ ++ "add r2, r0, r1", \ ++ "lsl r1, #1" ++ + bne 1f + +@ Much/most of the time r0 == r3 + 4 and no_f == 0 +@ so it is worth having this special case + vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b -+ vst2.16 {d19[2], d21[2]}, [r3], r1 ++ vst2.16 {d19[2], d21[2]}, [ip], r1 + vst2.16 {d19[1], d21[1]}, [r3], r1 -+ vst2.16 {d19[0], d21[0]}, [r3], r1 ++ vst2.16 {d19[0], 
d21[0]}, [ip], r1 + vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a -+ vst2.16 {d18[2], d20[2]}, [r3], r1 -+ vst2.16 {d18[1], d20[1]}, [r3], r1 -+ vst2.16 {d18[0], d20[0]}, [r3] -+ bx lr ++ vst2.16 {d18[2], d20[2]}, [ip], r1 ++ vst2.16 {d18[1], d20[1]}, [r3] ++ vst2.16 {d18[0], d20[0]}, [ip] ++ pop {pc} + +@ Either split or partial +1: -+ ldr r12, [sp, #0] -+ @ I have no idea if this is faster than any of the other ways of -+ @ testing these bits but it does free up r12 -+ lsl r12, #28 -+ add r2, r0, r1, lsl #2 -+ msr APSR_nzcvq, r12 @ b0 (P0a) -> V, b1 (Q0a) -> C, b2 (P0b) -> Z, b3 (Q0b) -> N -+ add r12, r3, r1, lsl #2 -+ bmi 1f ++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ addcs r0, r0, r1, lsl #1 ++ addcs r2, r2, r1, lsl #1 ++ bcs 1f + @ Q0b + vst1.16 {d21[3]}, [r0], r1 -+ vst1.16 {d21[2]}, [r0], r1 ++ vst1.16 {d21[2]}, [r2], r1 + vst1.16 {d21[1]}, [r0], r1 -+ vst1.16 {d21[0]}, [r0] ++ vst1.16 {d21[0]}, [r2], r1 +1: -+ beq 2f ++ addmi r3, r3, r1, lsl #1 ++ addmi ip, ip, r1, lsl #1 ++ bmi 1f + @ P0b + vst1.16 {d19[3]}, [r3], r1 -+ vst1.16 {d19[2]}, [r3], r1 ++ vst1.16 {d19[2]}, [ip], r1 + vst1.16 {d19[1]}, [r3], r1 -+ vst1.16 {d19[0]}, [r3] -+ -+2: -+ bcs 3f ++ vst1.16 {d19[0]}, [ip], r1 ++1: ++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 ++ bcs 1f + @ Q0a -+ vst1.16 {d20[3]}, [r2], r1 ++ vst1.16 {d20[3]}, [r0], r1 + vst1.16 {d20[2]}, [r2], r1 -+ vst1.16 {d20[1]}, [r2], r1 ++ vst1.16 {d20[1]}, [r0] + vst1.16 {d20[0]}, [r2] -+ -+3: -+ it vs -+ bxvs lr -+ vst1.16 {d18[3]}, [r12], r1 -+ vst1.16 {d18[2]}, [r12], r1 -+ vst1.16 {d18[1]}, [r12], r1 -+ vst1.16 {d18[0]}, [r12] -+ bx lr ++1: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.16 {d18[3]}, [r3], r1 ++ vst1.16 {d18[2]}, [ip], r1 ++ vst1.16 {d18[1]}, [r3] ++ vst1.16 {d18[0]}, [ip] ++ pop {pc} + +@ Single lump (rather than double) +10: -+ hevc_loop_filter_uv_body1 d16, d18, d20, d22 -+ + @ As we have post inced r0/r3 in the load the easiest thing to do is + @ to subtract and write forwards, rather than backwards (as above) -+ ldr r12, [sp, #0] -+ add r3, #2 -+ sub r0, r0, r1, lsl #2 -+ sub r3, r3, r1, lsl #2 -+ lsls r12, #31 @ b0 (P0a) -> N, b1 (Q0a) -> C ++ @ b0 (P0a) -> N, b1 (Q0a) -> C ++ ++ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \ ++ "ldr lr, [sp, #4]", \ ++ "add r3, #2", \ ++ "sub r0, r0, r1, lsl #2", \ ++ "sub r3, r3, r1, lsl #2", \ ++ "lsls lr, #31", \ ++ "add r2, r0, r1", \ ++ "add ip, r3, r1", \ ++ "lsl r1, #1" + + bcs 3f ++ @ Q0a + vst1.16 {d20[0]}, [r0], r1 -+ vst1.16 {d20[1]}, [r0], r1 -+ vst1.16 {d20[2]}, [r0], r1 -+ vst1.16 {d20[3]}, [r0] -+ ++ vst1.16 {d20[1]}, [r2], r1 ++ vst1.16 {d20[2]}, [r0] ++ vst1.16 {d20[3]}, [r2] +3: -+ it mi -+ bxmi lr ++ it mi ++ popmi {pc} ++ @ P0a + vst1.16 {d18[0]}, [r3], r1 -+ vst1.16 {d18[1]}, [r3], r1 -+ vst1.16 {d18[2]}, [r3], r1 -+ vst1.16 {d18[3]}, [r3] -+ bx lr ++ vst1.16 {d18[1]}, [ip], r1 ++ vst1.16 {d18[2]}, [r3] ++ vst1.16 {d18[3]}, [ip] ++ pop {pc} + +endfunc + @@ -2695,14 +2701,14 @@ index 0000000000..e665bd848a +.macro m_filter_v_uv2_16 bit_depth + cmp r2, #0 + bxeq lr -+ ++ push {lr} + vld2.32 {d16[0], d18[0]}, [r3], r1 + vld2.32 {d20[0], d22[0]}, [r0], r1 + ++ cmp r2, #0x10000 + vld2.32 {d16[1], d18[1]}, [r3], r1 + vld2.32 {d20[1], d22[1]}, [r0], r1 + -+ cmp r2, #0x10000 + vld2.32 {d17[0], d19[0]}, [r3], r1 + vld2.32 {d21[0], d23[0]}, [r0], r1 + @@ -2713,170 +2719,509 @@ index 0000000000..e665bd848a + vld2.32 {d24[0], d26[0]}, [r3], r1 + vld2.32 {d28[0], d30[0]}, [r0], r1 + ++ sub ip, r0, r3 + vld2.32 {d24[1], d26[1]}, [r3], r1 + 
vld2.32 {d28[1], d30[1]}, [r0], r1 -+ sub r12, r0, r3 + ++ cmp ip, #8 + vld2.32 {d25[0], d27[0]}, [r3], r1 + vld2.32 {d29[0], d31[0]}, [r0], r1 -+ cmp r12, #8 + + vld2.32 {d25[1], d27[1]}, [r3] + vld2.32 {d29[1], d31[1]}, [r0] -+ it eq -+ ldreq r12, [sp, #0] + -+ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth -+ cmp r12, #0 -+ add r3, #4 -+ neg r1, r1 ++ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \ ++ "ldr lr, [sp, #4]", \ ++ "neg r1, r1", \ ++ "it eq; cmpeq lr, #0", \ ++ "add r3, #4", \ ++ "add ip, r3, r1", \ ++ "add r2, r0, r1", \ ++ "lsl r1, #1" ++ + bne 1f + -+@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ Much/most of the time r0 == r3 + 8 and no_f == 0 +@ so it is worth having this special case -+ vst2.32 {d27[1], d29[1]}, [r3], r1 -+ vst2.32 {d27[0], d29[0]}, [r3], r1 -+ vst2.32 {d26[1], d28[1]}, [r3], r1 -+ vst2.32 {d26[0], d28[0]}, [r3], r1 -+ vst2.32 {d19[1], d21[1]}, [r3], r1 -+ vst2.32 {d19[0], d21[0]}, [r3], r1 -+ vst2.32 {d18[1], d20[1]}, [r3], r1 -+ vst2.32 {d18[0], d20[0]}, [r3] -+ bx lr ++ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b ++ vst2.32 {d27[0], d29[0]}, [ip], r1 ++ vst2.32 {d26[1], d28[1]}, [r3], r1 ++ vst2.32 {d26[0], d28[0]}, [ip], r1 ++ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a ++ vst2.32 {d19[0], d21[0]}, [ip], r1 ++ vst2.32 {d18[1], d20[1]}, [r3] ++ vst2.32 {d18[0], d20[0]}, [ip] ++ pop {pc} + +@ Either split or partial +1: -+ ldr r12, [sp, #0] -+ lsls r12, #29 @ b2 (P0b) -> N, b3 (Q0b) -> C ++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ addcs r0, r0, r1, lsl #1 ++ addcs r2, r2, r1, lsl #1 + bcs 1f + @ Q0b -+ mov r2, r0 -+ vst1.32 {d29[1]}, [r2], r1 ++ vst1.32 {d29[1]}, [r0], r1 + vst1.32 {d29[0]}, [r2], r1 -+ vst1.32 {d28[1]}, [r2], r1 -+ vst1.32 {d28[0]}, [r2] ++ vst1.32 {d28[1]}, [r0], r1 ++ vst1.32 {d28[0]}, [r2], r1 +1: -+ bmi 2f ++ addmi r3, r3, r1, lsl #1 ++ addmi ip, ip, r1, lsl #1 ++ bmi 1f + @ P0b -+ mov r2, r3 -+ vst1.32 {d27[1]}, [r2], r1 -+ vst1.32 {d27[0]}, [r2], r1 -+ vst1.32 {d26[1]}, [r2], r1 -+ vst1.32 {d26[0]}, [r2] -+ -+2: -+ lsls r12, #2 @ b0 (P0a) -> N, b1 (Q0a) -> C -+ bcs 3f ++ vst1.32 {d27[1]}, [r3], r1 ++ vst1.32 {d27[0]}, [ip], r1 ++ vst1.32 {d26[1]}, [r3], r1 ++ vst1.32 {d26[0]}, [ip], r1 ++1: ++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 ++ bcs 1f + @ Q0a -+ add r0, r0, r1, lsl #2 + vst1.32 {d21[1]}, [r0], r1 -+ vst1.32 {d21[0]}, [r0], r1 -+ vst1.32 {d20[1]}, [r0], r1 -+ vst1.32 {d20[0]}, [r0] -+ -+3: -+ it mi -+ bxmi lr ++ vst1.32 {d21[0]}, [r2], r1 ++ vst1.32 {d20[1]}, [r0] ++ vst1.32 {d20[0]}, [r2] ++1: ++ it mi ++ popmi {pc} + @ P0a -+ add r3, r3, r1, lsl #2 + vst1.32 {d19[1]}, [r3], r1 -+ vst1.32 {d19[0]}, [r3], r1 -+ vst1.32 {d18[1]}, [r3], r1 -+ vst1.32 {d18[0]}, [r3] -+ bx lr -+ ++ vst1.32 {d19[0]}, [ip], r1 ++ vst1.32 {d18[1]}, [r3] ++ vst1.32 {d18[0]}, [ip] ++ pop {pc} + ++@ Single lump (rather than double) +10: -+ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth -+ + @ As we have post inced r0/r3 in the load the easiest thing to do is + @ to subtract and write forwards, rather than backwards (as above) -+ ldr r12, [sp, #0] -+ add r3, #4 -+ sub r0, r0, r1, lsl #2 -+ sub r3, r3, r1, lsl #2 -+ lsls r12, #31 @ b0 (P0a) -> N, b1 (Q0a) -> C ++ @ b0 (P0a) -> N, b1 (Q0a) -> C ++ ++ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \ ++ "ldr lr, [sp, #4]", \ ++ "add r3, #4", \ ++ "sub r0, r0, r1, lsl #2", \ ++ "sub r3, r3, r1, lsl #2", \ ++ "lsls lr, #31", \ ++ "add r2, r0, r1", \ 
++ "add ip, r3, r1", \ ++ "lsl r1, #1" + + bcs 3f + @ Q0a + vst1.32 {d20[0]}, [r0], r1 -+ vst1.32 {d20[1]}, [r0], r1 -+ vst1.32 {d21[0]}, [r0], r1 -+ vst1.32 {d21[1]}, [r0] -+ ++ vst1.32 {d20[1]}, [r2], r1 ++ vst1.32 {d21[0]}, [r0] ++ vst1.32 {d21[1]}, [r2] +3: -+ it mi -+ bxmi lr ++ it mi ++ popmi {pc} + @ P0a + vst1.32 {d18[0]}, [r3], r1 -+ vst1.32 {d18[1]}, [r3], r1 -+ vst1.32 {d19[0]}, [r3], r1 -+ vst1.32 {d19[1]}, [r3] -+ bx lr ++ vst1.32 {d18[1]}, [ip], r1 ++ vst1.32 {d19[0]}, [r3] ++ vst1.32 {d19[1]}, [ip] ++ pop {pc} +.endm + + ++#if 1 // NEON version + + -+/* ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, int in_i -+ * int *curr_rpl0, int *curr_ -+ * MvField *curr, MvField *ne ++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh, ++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ * int in_inc) ++ */ ++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 ++ mov ip, sp ++ push {a2,v1-v8,lr} ++ ldm ip, {v1-v5} ++ cmp a1, #2 ++ bls 2f ++ vpush {d8-d13} ++ sub v5, v5, #10 ++ mov v6, #32 ++1: ++ vld2.32 {d0[0], d2[0]}, [a3]! ++ vld2.32 {d4[0], d6[0]}, [a4]! ++ vmov.u8 q12, #0 ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb v8, [a3], #1 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[0]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[0]}, [a4], v5 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d16[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d20[0]}, [ip] ++ vld1.32 {d18[0]}, [v8] ++ vld1.32 {d22[0]}, [lr] ++ ++ vld2.32 {d0[1], d2[1]}, [a3]! ++ vld2.32 {d4[1], d6[1]}, [a4]! ++ ldrb a2, [a3], #1 ++ vmov.u16 d12, #1 ++ ldrb ip, [a4], #1 ++ vmov.u16 d13, #2 ++ ldrb v8, [a3], #1 ++ vmov.u16 d27, #4 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[2]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[2]}, [a4], v5 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d16[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d20[1]}, [ip] ++ vld1.32 {d18[1]}, [v8] ++ vld1.32 {d22[1]}, [lr] ++ ++ vld2.32 {d1[0], d3[0]}, [a3]! ++ vld2.32 {d5[0], d7[0]}, [a4]! ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb lr, [a4], #1 ++ ldrb v8, [a3], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[4]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[4]}, [a4], v5 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d17[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d21[0]}, [ip] ++ vld1.32 {d19[0]}, [v8] ++ vld1.32 {d23[0]}, [lr] ++ ++ vld2.32 {d1[1], d3[1]}, [a3]! ++ vld2.32 {d5[1], d7[1]}, [a4]! 
++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb v8, [a3], #1 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[6]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[6]}, [a4], v5 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d17[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d21[1]}, [ip] ++ vld1.32 {d19[1]}, [v8] ++ vld1.32 {d23[1]}, [lr] ++ ++ @ So now we have: ++ @ q0.32[i] = curr[i].mv[0] ++ @ q1.32[i] = curr[i].mv[1] ++ @ q2.32[i] = neigh[i].mv[0] ++ @ q3.32[i] = neigh[i].mv[1] ++ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]] ++ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]] ++ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] ++ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] ++ @ d24.16[i] = curr[i].pred_flag ++ @ d25.16[i] = neigh[i].pred_flag ++ ++ vtst.16 d28, d24, d12 ++ vtst.16 d29, d24, d13 ++ vadd.i16 d8, d24, d12 ++ vadd.i16 d9, d25, d12 ++ vtst.16 d30, d25, d12 ++ vtst.16 d31, d25, d13 ++ veor d26, d8, d9 ++ ldr lr, [sp, 6*8] ++ vmovl.s16 q4, d28 ++ vmovl.s16 q5, d29 ++ teq lr, #1 ++ vmovl.s16 q14, d30 ++ lslne v1, lr, #1 ++ vmovl.s16 q15, d31 ++ rsbne v2, v1, #32 ++ vbif q0, q1, q4 ++ vbif q2, q3, q14 ++ vbif q1, q0, q5 ++ vbif q3, q2, q15 ++ vabd.s16 q12, q0, q2 ++ vabd.s16 q2, q1 ++ vabd.s16 q0, q3 ++ vabd.s16 q1, q3 ++ vbif q8, q9, q4 ++ vbif q10, q11, q14 ++ vbif q9, q8, q5 ++ vbif q11, q10, q15 ++ vclt.u16 d6, d24, d27 ++ vclt.u16 d8, d2, d27 ++ vclt.u16 d7, d25, d27 ++ vclt.u16 d9, d3, d27 ++ vclt.u16 d2, d0, d27 ++ vclt.u16 d0, d4, d27 ++ vclt.u16 d3, d1, d27 ++ vclt.u16 d1, d5, d27 ++ vceq.i32 q12, q10, q8 ++ vceq.i32 q10, q9 ++ vceq.i32 q8, q11 ++ vceq.i32 q9, q11 ++ vshrn.i32 d6, q3, #8 ++ vshrn.i32 d7, q4, #8 ++ vshrn.i32 d8, q1, #8 ++ vshrn.i32 d9, q0, #8 ++ vmovn.i32 d4, q12 ++ vmovn.i32 d2, q10 ++ vmovn.i32 d3, q8 ++ vmovn.i32 d5, q9 ++ vand q2, q3 ++ vrev16.8 q3, q3 ++ vand q2, q3 ++ vand q1, q4 ++ vrev16.8 q4, q4 ++ vand q1, q4 ++ vand d4, d5 ++ vand d2, d3 ++ vbic d0, d12, d4 ++ vshr.u16 d26, #2 ++ vbic d0, d2 ++ vmov.i16 d1, #0x5555 ++ vorr d0, d26 ++ bne 10f ++ ++ @ Merge results into result word, no duplicates ++ vmov a2, s0 ++ vmov v8, s1 ++ vmov.u16 ip, d0[1] ++ vmov.u16 lr, d0[3] ++ sub v6, #8 ++ lsl a2, #30 ++ lsl v8, #30 ++ lsl ip, #30 ++ lsl lr, #30 ++ orr a2, ip, a2, lsr #2 ++ orr v8, lr, v8, lsr #2 ++ orr a2, v8, a2, lsr #4 ++ subs a1, #4 ++ orr v7, a2, v7, lsr #8 ++ bhi 1b ++ ++ vpop {d8-d13} ++ mov a1, v7, lsr v6 ++ pop {a2,v1-v8,pc} ++10: ++ @ Merge results into result word, with duplicates ++ vmul.i16 d0, d1 ++ vmov a2, s0 ++ vmov v8, s1 ++ vmov.u16 ip, d0[1] ++ vmov.u16 lr, d0[3] ++ sub v6, v6, v1, lsl #2 ++ lsl a2, v2 ++ subs a1, #4 ++ lsl v8, v2 ++ lsl ip, v2 ++ lsl lr, v2 ++ ldr v2, [sp, #6*8 + 10*4 + 1*4] ++ orr a2, ip, a2, lsr v1 ++ lsl ip, v1, #1 ++ orr v8, lr, v8, lsr v1 ++ lsl lr, v1, #2 ++ orr a2, v8, a2, lsr ip ++ ldr v1, [sp, #6*8 + 10*4] ++ orr v7, a2, v7, lsr lr ++ bhi 1b ++ ++ vpop {d8-d13} ++ mov a1, v7, lsr v6 ++ pop {a2,v1-v8,pc} ++ ++ ++2: ++ sub v5, v5, #10 ++ vmov.u8 d16, #0 ++ blo 3f ++ vld2.32 {d0[0], d1[0]}, [a3]! ++ vld2.32 {d2[0], d3[0]}, [a4]! ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb lr, [a4], #1 ++ ldrb v8, [a3], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d16[0]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d16[4]}, [a4], v5 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d4[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d5[0]}, [ip] ++ vld1.32 {d6[0]}, [v8] ++ vld1.32 {d7[0]}, [lr] ++ ++3: ++ vld2.32 {d0[1], d1[1]}, [a3]! ++ vld2.32 {d2[1], d3[1]}, [a4]! 
++ ldrb a2, [a3], #1 ++ vmov.u16 d17, #1 ++ ldrb ip, [a4], #1 ++ vmov.u16 d18, #2 ++ ldrb v8, [a3], #1 ++ vmov.u16 d19, #4 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d16[2]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d16[6]}, [a4], v5 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d4[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d5[1]}, [ip] ++ vld1.32 {d6[1]}, [v8] ++ vld1.32 {d7[1]}, [lr] ++ ++ @ So now we have: ++ @ d0.32[i] = curr[i].mv[0] ++ @ d1.32[i] = curr[i].mv[1] ++ @ d2.32[i] = neigh[i].mv[0] ++ @ d3.32[i] = neigh[i].mv[1] ++ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]] ++ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] ++ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]] ++ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] ++ @ d16.16[i] = curr[i].pred_flag ++ @ d16.16[2+i] = neigh[i].pred_flag ++ ++ vtst.16 d20, d16, d17 ++ vtst.16 d22, d16, d18 ++ vadd.i16 d30, d16, d17 ++ vswp d2, d3 ++ ldr lr, [sp] ++ vmovl.s16 q10, d20 ++ teq lr, #1 ++ vmovl.s16 q11, d22 ++ lslne v1, lr, #1 ++ vbif d0, d1, d20 ++ vbif d4, d6, d20 ++ vbif d3, d2, d21 ++ vbif d5, d7, d21 ++ vbif d1, d0, d22 ++ vbif d6, d4, d22 ++ vbif d2, d3, d23 ++ vbif d7, d5, d23 ++ vshr.u16 d30, #2 ++ vabd.s16 d24, d0, d3 ++ vabd.s16 d25, d1, d2 ++ vabd.s16 q0, q0, q1 ++ vceq.i32 d2, d4, d5 ++ vceq.i32 d20, d5, d6 ++ vceq.i32 d21, d4, d7 ++ vceq.i32 d3, d6, d7 ++ vclt.u16 d6, d24, d19 ++ vclt.u16 d7, d25, d19 ++ vclt.u16 d22, d1, d19 ++ vclt.u16 d23, d0, d19 ++ vshrn.i32 d6, q3, #8 ++ vmovn.i32 d2, q1 ++ vshrn.i32 d7, q11, #8 ++ vmovn.i32 d3, q10 ++ vand q0, q3, q1 ++ rsbne v2, v1, #32 ++ vrev16.8 q3, q3 ++ vand q0, q3 ++ vsra.u64 d30, #32 ++ vshr.u64 q1, q0, #32 ++ vand q0, q1 ++ vbic d0, d17, d0 ++ vand d30, d30, d17 ++ vbic d0, d1 ++ vmov.i16 d1, #0x5555 ++ vorr d0, d30 ++ bne 10f ++ ++ @ Construct result word, no duplicates ++ cmp a1, #2 ++ vmov.u16 a1, d0[1] ++ vmov.u16 a2, d0[0] ++ orreq a1, a2, a1, lsl #2 ++ pop {a2,v1-v8,pc} ++10: ++ @ Construct result word, with duplicates ++ cmp a1, #2 ++ vmul.i16 d0, d1 ++ vmov a2, s0 ++ vmov.u16 a1, d0[1] ++ lsl a2, #16 ++ pkhbt a1, a1, a1, lsl #16 ++ lsr a2, v2 ++ lsr a1, v2 ++ orreq a1, a2, a1, lsl v1 ++ pop {a2,v1-v8,pc} ++endfunc ++ ++ ++ ++#else // non-NEON version ++ ++ ++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh, ++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ * int in_inc) + */ +function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 + add ip, sp, #4*4 + push {a2-a4,v1-v8,lr} -+ ldmia ip, {v5-v7} ++ mov v6, #32 +1: ldmdb ip, {v1-v4} -+ ldrsb a3, [v5, #8] @ curr->ref_idx -+ ldrsb v8, [v5, #9] -+ ldrsb ip, [v6, #8] @ neigh->ref_idx -+ ldrsb lr, [v6, #9] -+ ldr v1, [v1, a3, lsl #2] -+ ldrb a3, [v5, #10] @ curr->pred_flag ++ ldrsb v5, [a3, #8] @ curr->ref_idx ++ ldrsb v8, [a3, #9] ++ ldrsb ip, [a4, #8] @ neigh->ref_idx ++ ldrsb lr, [a4, #9] ++ ldr v1, [v1, v5, lsl #2] ++ ldrb v5, [a3, #10] @ curr->pred_flag + ldr v2, [v2, v8, lsl #2] -+ ldrb v8, [v6, #10] @ neigh->pred_flag ++ ldrb v8, [a4, #10] @ neigh->pred_flag + ldr v3, [v3, ip, lsl #2] + ldr v4, [v4, lr, lsl #2] -+ teq a3, #3 ++ teq v5, #3 + beq 20f + teq v8, #3 + beq 90f + -+ tst a3, #1 ++ tst v5, #1 + itee ne -+ ldrne a3, [v5, #0] @ curr->mv[0] -+ ldreq a3, [v5, #4] @ curr->mv[1] ++ ldrne v5, [a3, #0] @ curr->mv[0] + moveq v1, v2 ++ ldreq v5, [a3, #4] @ curr->mv[1] + tst v8, #1 + itee ne -+ ldrne v8, [v6, #0] @ neigh->mv[0] -+ ldreq v8, [v6, #4] @ neigh->mv[1] ++ ldrne v8, [a4, #0] @ 
neigh->mv[0] + moveq v3, v4 ++ ldreq v8, [a4, #4] @ neigh->mv[1] + teq v1, v3 + bne 10f + ldr lr, =0xFFFCFFFC -+ ssub16 ip, v8, a3 -+ ssub16 a3, a3, v8 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 ip, v8, v5 ++ ssub16 v5, v5, v8 ++ sel v5, v5, ip ++ ands v5, v5, lr + @ drop through +10: it ne -+ movne a3, #1 -+11: subs a2, a2, #1 -+12: -+A strbhs a3, [v7], a4 -+T itt hs -+T strbhs a3, [v7] -+T addhs v7, v7, a4 ++ movne v5, #1<<30 ++11: ++ sub v6, v6, #2 ++T mov v7, v7, lsr #2 + subs a2, a2, #1 -+ bhs 12b ++A orr v7, v5, v7, lsr #2 ++T orr v7, v5, v7 ++ bhi 11b + -+ ldm sp, {a2, a3} ++ ldr v5, [sp, #16*4] + add ip, sp, #16*4 ++ ldr a2, [sp] + subs a1, a1, #1 -+ add v5, v5, a3 -+ add v6, v6, a3 ++ add a3, a3, v5 ++ add a4, a4, v5 + bhi 1b ++ mov a1, v7, lsr v6 + pop {a2-a4,v1-v8,pc} + +20: teq v8, #3 @@ -2889,43 +3234,43 @@ index 0000000000..e665bd848a + teq v1, v2 + bne 30f + -+ ldrd v1, v2, [v5] @ curr->mv -+ ldrd v3, v4, [v6] @ neigh->mv ++ ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv + ldr lr, =0xFFFCFFFC + ssub16 ip, v3, v1 -+ ssub16 a3, v1, v3 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 v5, v1, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr + bne 25f + ssub16 ip, v4, v2 -+ ssub16 a3, v2, v4 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 v5, v2, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr + beq 11b + @ drop through +25: ssub16 ip, v4, v1 -+ ssub16 a3, v1, v4 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 v5, v1, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr + bne 10b + ssub16 ip, v3, v2 -+ ssub16 a3, v2, v3 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 v5, v2, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr + b 10b + -+30: ldrd v1, v2, [v5] @ curr->mv -+ ldrd v3, v4, [v6] @ neigh->mv ++30: ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv + ldr lr, =0xFFFCFFFC + ssub16 ip, v3, v1 -+ ssub16 a3, v1, v3 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 v5, v1, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr + bne 10b + ssub16 ip, v4, v2 -+ ssub16 a3, v2, v4 -+ sel a3, a3, ip -+ ands a3, a3, lr ++ ssub16 v5, v2, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr + b 10b + +40: teq v1, v4 @@ -2933,21 +3278,26 @@ index 0000000000..e665bd848a + teqeq v2, v3 + bne 10b + -+ ldrd v1, v2, [v5] @ curr->mv -+ ldrd v3, v4, [v6] @ neigh->mv ++ ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv + ldr lr, =0xFFFCFFFC + b 25b + -+90: mov a3, #1 ++90: ++ mov v5, #1<<30 + b 11b +endfunc + ++ ++#endif ++ ++ +@ ============================================================================= +@ +@ 10 bit + +function hevc_loop_filter_luma_body_10 -+ m_filter_luma 10 ++ m_filter_luma 10, q11, q15 +endfunc + +function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1 @@ -2980,7 +3330,7 @@ index 0000000000..e665bd848a + ldr r10, [sp, #32] + +.Lv_loop_luma_common_10: -+ m_filter_v_luma_common_16 10 ++ m_filter_v_luma_16 10 +endfunc + +function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1 @@ -3220,7 +3570,7 @@ index 0000000000..109fa98c29 +} diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c new file mode 100644 -index 0000000000..a721e392ab +index 0000000000..ce7e6091f1 --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c @@ -0,0 +1,465 @@ @@ -3255,8 +3605,8 @@ index 0000000000..a721e392ab +// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but +// have been removed from head as we never use them. 
+ -+void ff_hevc_rpi_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+void ff_hevc_rpi_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + +void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); @@ -3455,9 +3805,9 @@ index 0000000000..a721e392ab + int16_t *sao_offset_val, int sao_left_class, int width, int height); + + -+void ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, ++uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ const MvField *curr, const MvField *neigh, uint8_t *bs); ++ int in_inc); + + +static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) @@ -3557,10 +3907,10 @@ index 0000000000..a721e392ab +av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth) +{ + if (bit_depth == 8) { -+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon; -+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon; -+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon; -+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon; ++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8; ++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8; + c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8; + c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8; + c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8; @@ -7255,6 +7605,5151 @@ index 0000000000..b56e0f9644 + edge_64b_bodies edge_64b_body_16, 4 +endfunc + +diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h +new file mode 100644 +index 0000000000..36a23a5bf9 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_arm.h +@@ -0,0 +1,28 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_HEVCPRED_ARM_H ++#define AVCODEC_ARM_HEVCPRED_ARM_H ++ ++#include "libavcodec/rpi_hevcpred.h" ++ ++void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth); ++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth); ++ ++#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c +new file mode 100644 +index 0000000000..80724d4cf3 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_init_arm.c +@@ -0,0 +1,35 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/attributes.h" ++#include "libavutil/cpu.h" ++#include "libavutil/arm/cpu.h" ++ ++#include "libavcodec/rpi_hevcpred.h" ++#include "rpi_hevcpred_arm.h" ++ ++av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth) ++{ ++ int cpu_flags = av_get_cpu_flags(); ++ ++ if (have_neon(cpu_flags)) ++ ff_hevc_rpi_pred_init_neon(c, bit_depth); ++} ++ +diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c +new file mode 100644 +index 0000000000..8c267a0368 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_init_neon.c +@@ -0,0 +1,188 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "rpi_hevcpred_arm.h" ++ ++void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, 
ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, 
const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++ ++void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++ ++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth) ++{ ++ switch (bit_depth) ++ { ++ case 8: ++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8; ++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8; ++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8; ++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8; ++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8; ++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8; ++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8; ++ ++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8; ++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8; ++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8; ++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8; ++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8; ++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8; ++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8; ++ ++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8; ++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8; ++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8; ++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8; ++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8; ++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8; ++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8; ++ ++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8; ++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8; ++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8; ++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8; ++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8; ++ c->pred_planar_c[1] = 
ff_hevc_rpi_pred_planar_c_8_neon_8; ++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8; ++ ++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8; ++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8; ++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8; ++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8; ++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8; ++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8; ++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8; ++ break; ++ case 10: ++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10; ++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10; ++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10; ++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10; ++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10; ++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10; ++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10; ++ ++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10; ++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10; ++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10; ++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10; ++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10; ++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10; ++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10; ++ ++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10; ++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10; ++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10; ++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10; ++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10; ++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10; ++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10; ++ ++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10; ++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10; ++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10; ++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10; ++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10; ++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10; ++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10; ++ ++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10; ++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10; ++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10; ++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10; ++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10; ++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10; ++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10; ++ break; ++ default: ++ break; ++ } ++} ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S +new file mode 100644 +index 0000000000..1a2d413ea2 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S +@@ -0,0 +1,2352 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * General angular pred ++ * ++ * Horizontal (10) & Vertical (26) cases have their own file ++ * and are not dealt with properly here (luma filtering is missing) ++ * ++ * The inv_angle calculations are annoying - if it wasn't for the +128 ++ * rounding step then the result would simply be the loop counter :-( ++ */ ++ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++.text ++ ++@ Horizontal Patch functions ++@ These need a transpose before store so exist as smaller patches ++@ Patches can be called repeatedly without any intermediate setup ++@ to generate a horizontal block ++@ ++@ It is almost certainly the case that larger patch fns can be built ++@ and they would be a little faster, but we would still need the small ++@ fns and code size (or at least instruction cache size) is an issue ++@ given how much code we already have here ++ ++@ Generate 8x8 luma 8 patch ++@ ++@ r3 Out stride ++@ r4 Angle add ++@ r7 Inv angle (_up only) ++@ ++@ In/Out (updated) ++@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) ++@ r2 Left ptr - updated ++@ r6 Angle frac (init to r4 + 32) ++@ r8 Inv angle accumulator ++@ d24 Cur Line - load before 1st call for down - set by _up ++@ d16 Cur Line - load before 1st call for up - set by _down ++@ ++@ Temps ++@ r5 Loop counter ++@ r12 ++@ q0-q3, q14, q15 ++ ++patch_h_down_8x8_8: ++ mov r5, #8 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.8 d24, d24, #1 ++ sub r6, #32 ++ vld1.8 {d24[7]}, [r2]! ++ ++1: ++ vext.8 q0, q1, #8 ++ rsb r12, r6, #32 ++ vext.8 q1, q2, #8 ++ vdup.8 d30, r6 ++ vext.8 q2, q3, #8 ++ vdup.8 d31, r12 ++ vext.8 q3, q3, #8 ++ ++ vmull.u8 q14, d24, d30 ++ add r6, r4 ++ vmlal.u8 q14, d16, d31 ++ subs r5, #1 ++ vrshrn.u16 d7, q14, #5 ++ bne 2b ++ ++store_tran_8x8_8: ++ add r12, r0, #4 ++ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0 ] ++ add r5, r0, r3 ++ vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r12], r3 ++ add r0, #8 ++ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r5 ], r3 ++ vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r12], r3 ++ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r5 ], r3 ++ vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r12], r3 ++ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r5 ], r3 ++ vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r12], r3 ++ vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r5 ], r3 ++ vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r12], r3 ++ vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r5 ], r3 ++ vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r12], r3 ++ vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r5 ], r3 ++ vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r12], r3 ++ vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r5 ] ++ vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r12] ++ bx lr ++ ++ ++patch_h_up_8x8_8: ++ mov r5, #8 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ @ r2=left (variable), r1=up (const) ++ adds r8, r7 ++ vmov d24, d16 ++ ldrbmi r12, [r2, #-1]! 
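++        @ While r8 (inv angle accumulator) is -ve we are still tracking the left
++        @ column (pre-decrement above); once it turns +ve we have run off the top
++        @ of left, so the reference comes from the up row at index r8 >> 8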
++ ldrbpl r12, [r1, r8, asr #8] ++ vext.8 d16, d16, d16, #7 ++ sub r6, #32 ++ vmov.8 d16[0], r12 ++ ++1: ++ vdup.8 d31, r6 ++ vext.8 q0, q1, #8 ++ rsb r12, r6, #32 ++ vext.8 q1, q2, #8 ++ ++ vmull.u8 q14, d16, d31 ++ vext.8 q2, q3, #8 ++ vdup.8 d30, r12 ++ vext.8 q3, q3, #8 ++ add r6, r4 ++ vmlal.u8 q14, d24, d30 ++ subs r5, #1 ++ vrshrn.u16 d7, q14, #5 ++ bne 2b ++ b store_tran_8x8_8 @ This will return ++ ++ ++ ++@ ff_hevc_rpi_pred_angular_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_4_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ mov r5, #4 @ Loop counter for all cases ++ add r6, r4, #32 @ Force initial load in main loop ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.8 {d24}, [r2] ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.8 d24, d24, #1 ++ sub r6, #32 ++1: ++ vext.8 q0, q1, #8 ++ rsb r12, r6, #32 ++ vext.8 q1, q1, #8 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q14, d24, d30 ++ add r6, r4 ++ vmlal.u8 q14, d16, d31 ++ subs r5, #1 ++ vrshrn.u16 d3, q14, #5 ++ bne 2b ++ ++98: ++ add r12, r0, r3 ++ lsl r3, #1 ++ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0 ], r3 ++ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r12], r3 ++ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r0 ] ++ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r12] ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ vld1.32 {d16[0]}, [r2] ++ sub r8, r7 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ @ r2=left (variable), r1=up (const) ++ adds r8, r7 ++ vmov d24, d16 ++ ldrbmi r12, [r2, #-1]! 
++ ldrbpl r12, [r1, r8, asr #8] ++ vext.8 d16, d16, d16, #7 ++ sub r6, #32 ++ vmov.8 d16[0], r12 ++1: ++ vdup.8 d31, r6 ++ vext.8 q0, q1, #8 ++ rsb r12, r6, #32 ++ vext.8 q1, q2, #8 ++ ++ vmull.u8 q14, d16, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmlal.u8 q14, d24, d30 ++ subs r5, #1 ++ vrshrn.u16 d3, q14, #5 ++ bne 2b ++ b 98b ++ ++18: ++ cmp r12, #26 ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.32 {d16[0]}, [r1 :32] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ ldrb r12, [r2, r8, asr #8] ++ ++ vmov d24, d16 ++ add r8, r7 ++ sub r6, #32 ++ vext.8 d16, d16, #7 ++ vmov.8 d16[0], r12 ++ ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmlal.u8 q0, d24, d30 ++ vrshrn.u16 d0, q0, #5 ++ ++ subs r5, #1 ++ vst1.32 {d0[0]}, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {d24}, [r1] @ Up + up-right, may be on 32-bit align rather than 64 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.8 d24, d24, #1 ++ sub r6, #32 ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmlal.u8 q0, d16, d31 ++ vrshrn.u16 d0, q0, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.32 {d0[0]}, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++ ++@ ff_hevc_rpi_pred_angular_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_8_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 @ Force initial load in main loop ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.8 {d24}, [r2]! ++ bl patch_h_down_8x8_8 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ vld1.8 {d16}, [r2] ++ add r6, r4, #32 ++ sub r8, r7 ++ bl patch_h_up_8x8_8 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #8 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.8 {d16}, [r1 :64] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ ldrb r12, [r2, r8, asr #8] ++ ++ vmov d24, d16 ++ add r8, r7 ++ sub r6, #32 ++ vext.8 d16, d16, #7 ++ vmov.8 d16[0], r12 ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmlal.u8 q0, d24, d30 ++ vrshrn.u16 d0, q0, #5 ++ ++ subs r5, #1 ++ vst1.8 {d0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {d24, d25}, [r1 :64]! 
@ Up + UR ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.8 q12, q12, #1 ++ sub r6, #32 ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmlal.u8 q0, d16, d31 ++ vrshrn.u16 d0, q0, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.8 {d0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_16_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 @ Force initial load in main loop ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.8 {d24}, [r2]! ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8 ++ ++ mov r2, r1 @ restore r2 ++ sub r0, #16 ++ add r6, r4, #32 @ Force initial load in main loop ++ vld1.8 {d24}, [r2]! ++ add r0, r0, r3, lsl #3 ++ ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ vld1.8 {d16}, [r2] ++ sub r8, r7 ++ ++ push {r2, r8} ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8 ++ pop {r2, r8} ++ ++ sub r0, #16 ++ add r6, r4, #32 ++ add r2, r2, #8 ++ sub r8, r8, r7, lsl #3 ++ add r0, r0, r3, lsl #3 ++ vld1.8 {d16}, [r2] ++ ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #16 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.8 {q8 }, [r1 :128] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ ldrb r12, [r2, r8, asr #8] ++ ++ vmov q12, q8 ++ add r8, r7 ++ sub r6, #32 ++ vext.8 q8, q8, q8, #15 ++ vmov.8 d16[0], r12 ++ ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vmull.u8 q1, d17, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmlal.u8 q0, d24, d30 ++ vmlal.u8 q1, d25, d30 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ ++ subs r5, #1 ++ vst1.8 {q0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q12}, [r1 :128]! @ Up ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vext.8 q12, q12, #1 ++ sub r6, #32 ++ vld1.8 {d25[7]}, [r1]! ++ ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmull.u8 q1, d25, d30 ++ vmlal.u8 q0, d16, d31 ++ vmlal.u8 q1, d17, d31 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.8 {q0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_32_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r10, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ bge 18f ++ ++ cmp r12, #10 ++ mov r10, #4 @ Outer loop counter for "hard" cases ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 ++2: ++ vld1.8 {d24}, [r1]! 
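++        @ Each outer pass renders an 8-row band as four transposed 8x8 patches side
++        @ by side, then r0 is rewound and stepped down 8 rows; r10 counts the 4 bands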
++ add r6, r4, #32 @ Force initial load in main loop ++ mov r2, r1 ++ ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8 ++ ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++2: ++ vld1.8 {d16}, [r2] ++ add r6, r4, #32 ++ ++ push {r2, r8} ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8 ++ pop {r2, r8} ++ ++ sub r0, #32 ++ subs r10, #1 ++ add r2, r2, #8 ++ sub r8, r8, r7, lsl #3 ++ add r0, r0, r3, lsl #3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #32 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.8 {q8, q9 }, [r1 :128] @ Up ++ ldrh r7, [r7] ++ add r6, r4, #32 ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ ldrb r12, [r2, r8, asr #8] ++ ++ vmov q12, q8 ++ add r8, r7 ++ vmov q13, q9 ++ sub r6, #32 ++ vext.8 q9, q8, q9, #15 ++ vext.8 q8, q8, q8, #15 ++ vmov.8 d16[0], r12 ++ ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vmull.u8 q1, d17, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmull.u8 q2, d18, d31 ++ vmull.u8 q3, d19, d31 ++ vmlal.u8 q0, d24, d30 ++ vmlal.u8 q1, d25, d30 ++ vmlal.u8 q2, d26, d30 ++ vmlal.u8 q3, d27, d30 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vrshrn.u16 d2, q2, #5 ++ vrshrn.u16 d3, q3, #5 ++ ++ subs r5, #1 ++ vst1.8 {q0, q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q12, q13}, [r1 :128]! @ Up ++ add r6, r4, #32 @ Force initial load in main loop ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vmov q9, q13 ++ vext.8 q12, q13, #1 ++ vext.8 q13, q13, #1 ++ sub r6, #32 ++ vld1.8 {d27[7]}, [r1]! ++ ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmull.u8 q1, d25, d30 ++ vmull.u8 q2, d26, d30 ++ vmull.u8 q3, d27, d30 ++ vmlal.u8 q0, d16, d31 ++ vmlal.u8 q1, d17, d31 ++ vmlal.u8 q2, d18, d31 ++ vmlal.u8 q3, d19, d31 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vrshrn.u16 d2, q2, #5 ++ vrshrn.u16 d3, q3, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.8 {q0, q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++endfunc ++ ++@ Chroma 8 bit 4x4 patch fns ++ .text ++ ++patch_h_down_c_4x4_8: ++ mov r5, #4 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.16 d24, d24, #1 ++ sub r6, #32 ++ vld1.16 {d24[3]}, [r2]! ++ ++1: ++ vext.8 q0, q1, #8 ++ rsb r12, r6, #32 ++ vext.8 q1, q1, #8 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q14, d24, d30 ++ add r6, r4 ++ vmlal.u8 q14, d16, d31 ++ subs r5, #1 ++ vrshrn.u16 d3, q14, #5 ++ bne 2b ++ ++store_tran_c_4x4_8: ++ add r12, r0, r3 ++ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0 ]! 
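++        @ d0-d3 hold the four predicted columns (four U,V pairs each); storing lane n
++        @ of all four registers to row n writes the patch out transposed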
++ add r5, r12, r3 ++ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12] ++ add r12, r12, r3, lsl #1 ++ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r5 ] ++ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12] ++ bx lr ++ ++patch_h_up_c_4x4_8: ++ mov r5, #4 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ If r8 is -ve then we are still tracking left ++ adds r8, r7 ++ vmov d24, d16 ++ @ Initially r2=left (variable), r1=up (const) ++ @ Use r2 for both up and left, we only ever go from left->up so ++ @ we assume that we are left and thenm overwrite with up if wanted ++ sub r2, #2 ++ addpl r2, r1, r8, asr #7 ++ vext.16 d16, d16, d16, #3 ++ @ We get *2 by >> 7 rather than 8, but that means we need to lose bit 0 ++ and r2, #~1 ++ sub r6, #32 ++ vld1.16 d16[0], [r2] ++1: ++ vdup.8 d31, r6 ++ vext.8 q0, q1, #8 ++ rsb r12, r6, #32 ++ vext.8 q1, q1, #8 ++ ++ vmull.u8 q14, d16, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmlal.u8 q14, d24, d30 ++ subs r5, #1 ++ vrshrn.u16 d3, q14, #5 ++ bne 2b ++ b store_tran_c_4x4_8 @ This will return ++ ++ ++@ ff_hevc_rpi_pred_angular_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 @ Force initial load in main loop ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.8 {d24}, [r2]! ++ bl patch_h_down_c_4x4_8 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++ vld1.8 {d16}, [r2] ++ bl patch_h_up_c_4x4_8 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #4 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.8 {d16}, [r1 :64] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ asr r12, r8, #8 ++ vmov d24, d16 ++ add r8, r7 ++ vext.16 d16, d16, #3 ++ add r12, r2, r12, lsl #1 ++ sub r6, #32 ++ vld1.16 {d16[0]}, [r12] ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmlal.u8 q0, d24, d30 ++ vrshrn.u16 d0, q0, #5 ++ ++ subs r5, #1 ++ vst1.8 {d0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q12}, [r1] @ Up + UR (only 64-bit aligned) ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vext.16 q12, q12, #1 ++ sub r6, #32 ++ ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmlal.u8 q0, d16, d31 ++ ++ vrshrn.u16 d0, q0, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.8 {d0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.8 {d24}, [r2]! 
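++        @ The 8x8 chroma block is a 2x2 grid of 4x4 patches: two patches across the
++        @ top, then r0 is rewound and stepped down 4 rows for the bottom pair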
++ mov r1, r2 ++ ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8 ++ ++ sub r0, #16 ++ add r0, r0, r3, lsl #2 ++ vld1.8 {d24}, [r1]! ++ add r6, r4, #32 @ Force initial load in main loop ++ mov r2, r1 ++ ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++ vld1.8 {d16}, [r2] ++ ++ push {r2, r8} ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8 ++ pop {r2, r8} ++ ++ add r2, r2, #8 ++ sub r0, #16 ++ sub r8, r8, r7, lsl #2 ++ vld1.8 {d16}, [r2] ++ add r0, r0, r3, lsl #2 ++ add r6, r4, #32 ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #8 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.8 {q8 }, [r1 :128] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ asr r12, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ vext.16 q8, q8, #7 ++ add r12, r2, r12, lsl #1 ++ sub r6, #32 ++ vld1.16 {d16[0]}, [r12] ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vdup.8 d30, r12 ++ vmull.u8 q1, d17, d31 ++ add r6, r4 ++ vmlal.u8 q0, d24, d30 ++ vmlal.u8 q1, d25, d30 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ ++ subs r5, #1 ++ vst1.8 {q0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q12}, [r1 :128]! @ Up ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vext.16 q12, q12, #1 ++ sub r6, #32 ++ vld1.16 {d25[3]}, [r1]! ++ ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmull.u8 q1, d25, d30 ++ vmlal.u8 q0, d16, d31 ++ vmlal.u8 q1, d17, d31 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.8 {q0 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r10, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ++ cmp r12, #18 ++ bge 18f ++ ++ cmp r12, #10 ++ mov r10, #4 @ Outer loop counter for "hard" cases ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 ++2: ++ vld1.8 {d24}, [r1]! 
++ add r6, r4, #32 @ Force initial load in main loop ++ mov r2, r1 ++ ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8 ++ ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ pop {r4-r10, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++2: ++ vld1.8 {d16}, [r2] ++ add r6, r4, #32 ++ ++ push {r2, r8} ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8 ++ pop {r2, r8} ++ ++ sub r0, #32 ++ subs r10, #1 ++ add r2, r2, #8 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ pop {r4-r10, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #16 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.8 {q8, q9 }, [r1 :128] @ Up ++ ldrh r7, [r7] ++ add r6, r4, #32 ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ For other widths we may want different logic ++ asr r9, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ vmov q13, q9 ++ add r9, r2, r9, lsl #1 ++ vext.16 q9, q8, q9, #7 ++ sub r6, #32 ++ vext.16 q8, q8, q8, #7 ++ vld1.16 {d16[0]}, [r9] ++ ++1: ++ vdup.8 d31, r6 ++ rsb r12, r6, #32 ++ ++ vmull.u8 q0, d16, d31 ++ vmull.u8 q1, d17, d31 ++ vdup.8 d30, r12 ++ add r6, r4 ++ vmull.u8 q2, d18, d31 ++ vmull.u8 q3, d19, d31 ++ vmlal.u8 q0, d24, d30 ++ vmlal.u8 q1, d25, d30 ++ vmlal.u8 q2, d26, d30 ++ vmlal.u8 q3, d27, d30 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vrshrn.u16 d2, q2, #5 ++ vrshrn.u16 d3, q3, #5 ++ ++ subs r5, #1 ++ vst1.8 {q0, q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q12, q13}, [r1 :128]! @ Up ++ add r6, r4, #32 @ Force initial load in main loop ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vmov q9, q13 ++ vext.16 q12, q13, #1 ++ vext.16 q13, q13, #1 ++ sub r6, #32 ++ vld1.16 {d27[3]}, [r1]! ++ ++1: ++ rsb r12, r6, #32 ++ vdup.8 d30, r6 ++ vdup.8 d31, r12 ++ ++ vmull.u8 q0, d24, d30 ++ vmull.u8 q1, d25, d30 ++ vmull.u8 q2, d26, d30 ++ vmull.u8 q3, d27, d30 ++ vmlal.u8 q0, d16, d31 ++ vmlal.u8 q1, d17, d31 ++ vmlal.u8 q2, d18, d31 ++ vmlal.u8 q3, d19, d31 ++ ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vrshrn.u16 d2, q2, #5 ++ vrshrn.u16 d3, q3, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.8 {q0, q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++endfunc ++ ++@------------------------------------------------------------------------------ ++@ Data ++ ++ .text ++ .balign 64 ++angle_2: ++ .byte 32 ++ .byte 26, 21, 17, 13, 9, 5, 2, 0 ++ @ Sign inverted from standards table ++ .byte 2, 5, 9, 13, 17, 21, 26, 32 ++ .byte 26, 21, 17, 13, 9, 5, 2, 0 ++ @ Standard sign ++ .byte 2, 5, 9, 13, 17, 21, 26, 32 ++ ++ @ Sign inverted from standards table ++inv_angle: ++ .short 4096, 1638, 910, 630, 482, 390, 315 ++ .short 256 ++ .short 315, 390, 482, 630, 910, 1638, 4096 ++ ++@------------------------------------------------------------------------------ ++@ ++@ 10 bit fns ++@ Should work for 9 & 11 bit as there is no actual bit-depth specific code ++@ but runs out of register width for 12+ bit ++ ++ .text ++ .balign 64 ++ ++patch_h_down_4x4_10: ++ mov r5, #4 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.16 d24, d24, #1 ++ sub r6, #32 ++ vld1.16 {d24[3]}, [r2]! 
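++        @ d16/d24 are adjacent reference samples; each column below is blended as
++        @ (d16*(32-frac) + d24*frac + 16) >> 5 with the angle fraction frac in r6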
++ ++1: ++ rsb r12, r6, #32 ++ vext.16 q1, q2, #4 ++ vmov s0, r6 ++ vmov s1, r12 ++ vext.16 q2, q2, #4 ++ ++ vmul.u16 d1, d24, d0[0] ++ add r6, r4 ++ vmla.u16 d1, d16, d0[2] ++ subs r5, #1 ++ vrshr.u16 d5, d1, #5 ++ bne 2b ++ ++store_tran_4x4_10: ++ add r12, r0, r3 ++ vst4.16 {d2[0], d3[0], d4[0], d5[0]}, [r0 ]! ++ add r5, r12, r3 ++ vst4.16 {d2[1], d3[1], d4[1], d5[1]}, [r12] ++ add r12, r12, r3, lsl #1 ++ vst4.16 {d2[2], d3[2], d4[2], d5[2]}, [r5 ] ++ vst4.16 {d2[3], d3[3], d4[3], d5[3]}, [r12] ++ bx lr ++ ++patch_h_up_4x4_10: ++ mov r5, #4 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ If r8 is -ve then we are still tracking left ++ adds r8, r7 ++ vmov d24, d16 ++ @ Initially r2=left (variable), r1=up (const) ++ @ Use r2 for both up and left, we only ever go from left->up so ++ @ we assume that we are left and thenm overwrite with up if wanted ++ sub r2, #2 ++ addpl r2, r1, r8, asr #7 ++ vext.16 d16, d16, d16, #3 ++ @ We get *2 by >> 7 rather than 8, but that means we need to lose bit 0 ++ and r2, #~1 ++ sub r6, #32 ++ vld1.16 d16[0], [r2] ++ ++1: ++ rsb r12, r6, #32 ++ vext.16 q1, q2, #4 ++ vmov s0, r6 ++ vmov s1, r12 ++ vext.16 q2, q2, #4 ++ ++ vmul.u16 d1, d24, d0[2] ++ add r6, r4 ++ vmla.u16 d1, d16, d0[0] ++ subs r5, #1 ++ vrshr.u16 d5, d1, #5 ++ bne 2b ++ b store_tran_4x4_10 @ This will return ++ ++ ++@ ff_hevc_rpi_pred_angular_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_4_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #1 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 @ Force initial load in main loop ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.16 {d24}, [r2]! 
++ bl patch_h_down_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++ vld1.16 {d16}, [r2] ++ bl patch_h_up_4x4_10 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #4 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.16 {d16}, [r1] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ asr r12, r8, #8 ++ vmov d24, d16 ++ add r8, r7 ++ add r12, r2, r12, lsl #1 ++ sub r6, #32 ++ vext.16 d16, d16, #3 ++ vld1.16 {d16[0]}, [r12] ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 d2, d16, d0[2] ++ vmla.u16 d2, d24, d0[0] ++ vrshr.u16 d2, #5 ++ ++ subs r5, #1 ++ vst1.16 {d2 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {d24, d25}, [r1 :64] @ Up + UR (64bit aligned) ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov d16, d24 ++ vext.16 q12, q13, #1 ++ sub r6, #32 ++ ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 d2, d24, d0[0] ++ vmla.u16 d2, d16, d0[2] ++ vrshr.u16 d2, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.16 {d2 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_8_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #1 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.16 {d24}, [r2]! ++ mov r1, r2 ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10 ++ ++ vld1.16 {d24}, [r1]! 
++ sub r0, #16 ++ add r6, r4, #32 @ Force initial load in main loop ++ add r0, r0, r3, lsl #2 ++ mov r2, r1 ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++ vld1.16 {d16}, [r2] ++ ++ push {r2, r8} ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10 ++ pop {r2, r8} ++ ++ sub r0, #16 ++ add r2, #8 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ vld1.16 {d16}, [r2] ++ add r6, r4, #32 ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #8 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.16 {q8 }, [r1] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ asr r12, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ add r12, r2, r12, lsl #1 ++ sub r6, #32 ++ vext.16 q8, q8, q8, #7 ++ vld1.16 {d16[0]}, [r12] ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 q1, q8, d0[2] ++ vmla.u16 q1, q12, d0[0] ++ vrshr.u16 q1, #5 ++ ++ subs r5, #1 ++ vst1.16 {q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q12, q13}, [r1 :128] @ Up + UR ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vext.16 q12, q13, #1 ++ sub r6, #32 ++ vext.16 q13, q13, #1 ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 q1, q12, d0[0] ++ vmla.u16 q1, q8, d0[2] ++ vrshr.u16 q1, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.16 {q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_16_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r10, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #1 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ bge 18f ++ ++ cmp r12, #10 ++ mov r10, #4 @ Outer loop counter for "hard" cases ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 ++2: ++ vld1.16 {d24}, [r1]! 
++ add r6, r4, #32 @ Force initial load in main loop ++ mov r2, r1 ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10 ++ ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ pop {r4-r10, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++2: ++ vld1.16 {d16}, [r2] ++ add r6, r4, #32 ++ ++ push {r2, r8} ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10 ++ pop {r2, r8} ++ ++ sub r0, #32 ++ subs r10, #1 ++ add r2, #8 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ pop {r4-r10, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #16 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.16 {q8, q9}, [r1] @ Up ++ ldrh r7, [r7] ++ add r6, r4, #32 ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ asr r9, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ vmov q13, q9 ++ add r9, r2, r9, lsl #1 ++ sub r6, #32 ++ vext.16 q9, q8, q9, #7 ++ vext.16 q8, q8, q8, #7 ++ vld1.16 {d16[0]}, [r9] ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 q1, q8, d0[2] ++ vmul.u16 q2, q9, d0[2] ++ vmla.u16 q1, q12, d0[0] ++ vmla.u16 q2, q13, d0[0] ++ ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ ++ subs r5, #1 ++ vst1.16 {q1, q2 }, [r0], r3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q12, q13}, [r1 :128]! @ Up ++ add r6, r4, #32 @ Force initial load in main loop ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vmov q9, q13 ++ vext.16 q12, q13, #1 ++ vext.16 q13, q13, #1 ++ sub r6, #32 ++ vld1.16 {d27[3]}, [r1]! ++ ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 q1, q12, d0[0] ++ vmul.u16 q2, q13, d0[0] ++ vmla.u16 q1, q8, d0[2] ++ vmla.u16 q2, q9, d0[2] ++ ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.16 {q1, q2 }, [r0], r3 ++ bne 2b ++ pop {r4-r10, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_32_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r10, lr} ++ vpush {q4 } ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #1 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ bge 18f ++ ++ cmp r12, #10 ++ mov r10, #8 @ Outer loop counter for "hard" cases ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 ++2: ++ vld1.16 {d24}, [r1]! 
++ add r6, r4, #32 @ Force initial load in main loop ++ mov r2, r1 ++ mov r9, #4 ++1: ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10 ++ subs r9, #1 ++ bne 1b ++ ++ sub r0, #64 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ b 99f ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++2: ++ vld1.16 {d16}, [r2] ++ add r6, r4, #32 ++ ++ push {r2, r8} ++ mov r9, #4 ++1: ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10 ++ subs r9, #1 ++ bne 1b ++ pop {r2, r8} ++ ++ sub r0, #64 ++ subs r10, #1 ++ add r2, #8 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ b 99f ++ ++18: ++ cmp r12, #26 ++ mov r5, #32 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vldm r1, {q8-q11} @ Up ++ ldrh r7, [r7] ++ add r6, r4, #32 ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ asr r9, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ vmov q13, q9 ++ add r9, r2, r9, lsl #1 ++ vmov q14, q10 ++ vmov q15, q11 ++ sub r6, #32 ++ vext.16 q11, q10, q11, #7 ++ vext.16 q10, q9, q10, #7 ++ vext.16 q9, q8, q9, #7 ++ vext.16 q8, q8, q8, #7 ++ vld1.16 {d16[0]}, [r9] ++ ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 q1, q8, d0[2] ++ vmul.u16 q2, q9, d0[2] ++ vmul.u16 q3, q10, d0[2] ++ vmul.u16 q4, q11, d0[2] ++ vmla.u16 q1, q12, d0[0] ++ vmla.u16 q2, q13, d0[0] ++ vmla.u16 q3, q14, d0[0] ++ vmla.u16 q4, q15, d0[0] ++ ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ vrshr.u16 q3, #5 ++ vrshr.u16 q4, #5 ++ ++ subs r5, #1 ++ vstm r0, {q1-q4} ++ add r0, r3 ++ bne 2b ++ b 99f ++ ++@ Right of vertical - works along top - left unused ++26: ++ vldm r1, {q12-q15} @ Up ++ add r6, r4, #32 @ Force initial load in main loop ++ add r1, #64 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vmov q9, q13 ++ vmov q10, q14 ++ vmov q11, q15 ++ vext.16 q12, q13, #1 ++ vext.16 q13, q14, #1 ++ vext.16 q14, q15, #1 ++ vext.16 q15, q15, #1 ++ sub r6, #32 ++ vld1.16 {d31[3]}, [r1]! ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 q1, q12, d0[0] ++ vmul.u16 q2, q13, d0[0] ++ vmul.u16 q3, q14, d0[0] ++ vmul.u16 q4, q15, d0[0] ++ vmla.u16 q1, q8, d0[2] ++ vmla.u16 q2, q9, d0[2] ++ vmla.u16 q3, q10, d0[2] ++ vmla.u16 q4, q11, d0[2] ++ ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ vrshr.u16 q3, #5 ++ vrshr.u16 q4, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vstm r0, {q1-q4} ++ add r0, r3 ++ bne 2b ++99: ++ vpop {q4 } ++ pop {r4-r10, pc} ++ ++endfunc ++ ++ ++ ++@ Generate 4x4 chroma patch ++@ ++@ In (const) ++@ r1 Up ptr (_up only) ++@ r3 Out stride ++@ r4 Angle add ++@ r7 Inv angle (_up only) ++@ ++@ In/Out (updated) ++@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) ++@ r2 Left ptr - updated ++@ r6 Angle frac (init to r4 + 32) ++@ r8 Inv angle accumulator ++@ q2 Cur Line - load before 1st call for down - set by _up ++@ q8 Cur Line - load before 1st call for up - set by _down ++@ ++@ Temps ++@ r5 Loop counter ++@ r12 ++@ d0, q1, q12-q15 ++ ++patch_h_down_c_4x4_10: ++ mov r5, #4 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q2 ++ vext.32 q2, q2, #1 ++ sub r6, #32 ++ vld1.32 {d5[1]}, [r2]! 
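++        @ At 10 bits a chroma pair is 32 bits wide (U,V x 16), hence the .32 shift and
++        @ lane load above; results are piped through q12-q15 for the transposing store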
++1: ++ rsb r12, r6, #32 ++ vmov q12, q13 ++ vmov s0, r6 ++ vmov s1, r12 ++ vmov q13, q14 ++ ++ vmul.u16 q3, q2, d0[0] ++ add r6, r4 ++ vmla.u16 q3, q8, d0[2] ++ vmov q14, q15 ++ subs r5, #1 ++ vrshr.u16 q15, q3, #5 ++ bne 2b ++ ++store_tran_c_4x4_10: ++ add r12, r0, r3 ++ vst4.32 {d24[0], d26[0], d28[0], d30[0]}, [r0 ]! ++ add r5, r12, r3 ++ vst4.32 {d24[1], d26[1], d28[1], d30[1]}, [r12] ++ add r12, r12, r3, lsl #1 ++ vst4.32 {d25[0], d27[0], d29[0], d31[0]}, [r5 ] ++ vst4.32 {d25[1], d27[1], d29[1], d31[1]}, [r12] ++ bx lr ++ ++patch_h_up_c_4x4_10: ++ mov r5, #4 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ @ If r8 is -ve then we are still tracking left ++ adds r8, r7 ++ vmov q2, q8 ++ @ Initially r2=left (variable), r1=up (const) ++ @ Use r2 for both up and left, we only ever go from left->up so ++ @ we assume that we are left and thenm overwrite with up if wanted ++ sub r2, #4 ++ addpl r2, r1, r8, asr #6 ++ vext.32 q8, q8, #3 ++ @ We get *4 by >> 6 rather than 8, but that means we need to lose bits 0 & 1 ++ and r2, #~3 ++ sub r6, #32 ++ vld1.32 d16[0], [r2] ++1: ++ rsb r12, r6, #32 ++ vmov q12, q13 ++ vmov s0, r6 ++ vmov s1, r12 ++ vmov q13, q14 ++ ++ vmul.u16 q1, q2, d0[2] ++ add r6, r4 ++ vmla.u16 q1, q8, d0[0] ++ vmov q14, q15 ++ subs r5, #1 ++ vrshr.u16 q15, q1, #5 ++ bne 2b ++ b store_tran_c_4x4_10 @ This will return ++ ++ ++ ++@ ff_hevc_rpi_pred_angular_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.32 {q2 }, [r2]! 
++ bl patch_h_down_c_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++ vld1.32 {q8 }, [r2] ++ bl patch_h_up_c_4x4_10 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #4 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.16 {q8 }, [r1] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ asr r12, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ vext.32 q8, q8, q8, #3 ++ add r12, r2, r12, lsl #2 ++ sub r6, #32 ++ vld1.32 {d16[0]}, [r12] ++ ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 q1, q8, d0[2] ++ vmla.u16 q1, q12, d0[0] ++ vrshr.u16 q1, #5 ++ ++ subs r5, #1 ++ vst1.16 {q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q12, q13}, [r1] @ Up + UR ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vext.32 q12, q13, #1 ++ vext.32 q13, q13, #1 ++ sub r6, #32 ++ ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 q1, q12, d0[0] ++ vmla.u16 q1, q8, d0[2] ++ vrshr.u16 q1, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.16 {q1 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r8, lr} ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ add r6, r4, #32 @ Force initial load in main loop ++ bge 18f ++ ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ vld1.32 {q2 }, [r2]! ++ mov r1, r2 ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10 ++ ++ vld1.32 {q2 }, [r1]! ++ sub r0, #32 ++ add r6, r4, #32 @ Force initial load in main loop ++ add r0, r0, r3, lsl #2 ++ mov r2, r1 ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++ vld1.32 {q8 }, [r2] ++ ++ push {r2, r8} ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10 ++ pop {r2, r8} ++ ++ sub r0, #32 ++ add r2, #16 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ vld1.32 {q8 }, [r2] ++ add r6, r4, #32 ++ ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10 ++ pop {r4-r8, pc} ++ ++18: ++ cmp r12, #26 ++ mov r5, #8 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vld1.16 {q8, q9 }, [r1] @ Up ++ ldrh r7, [r7] ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q12, q8 ++ asr r12, r8, #8 ++ vmov q13, q9 ++ add r8, r7 ++ vext.32 q9, q8, q9, #3 ++ add r12, r2, r12, lsl #2 ++ vext.32 q8, q8, q8, #3 ++ sub r6, #32 ++ vld1.32 {d16[0]}, [r12] ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 q1, q8, d0[2] ++ vmul.u16 q2, q9, d0[2] ++ vmla.u16 q1, q12, d0[0] ++ vmla.u16 q2, q13, d0[0] ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ ++ subs r5, #1 ++ vst1.16 {q1, q2 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q12, q13}, [r1]! 
@ Up ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vmov q9, q13 ++ vext.32 q12, q13, #1 ++ vext.32 q13, q14, #1 ++ sub r6, #32 ++ vld1.32 {d27[1]}, [r1]! ++ ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 q1, q12, d0[0] ++ vmul.u16 q2, q13, d0[0] ++ vmla.u16 q1, q8, d0[2] ++ vmla.u16 q2, q9, d0[2] ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vst1.16 {q1, q2 }, [r0], r3 ++ bne 2b ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1 ++ ldr r12, [sp, #0] ++ push {r4-r10, lr} ++ vpush {q4 } ++ adrl r4, angle_2 - 2 ++ adrl r7, inv_angle - 11*2 ++ lsl r3, #2 ++ ldrsb r4, [r4, r12] ++ add r7, r7, r12, lsl #1 ++ ++ cmp r12, #18 ++ bge 18f ++ ++ cmp r12, #10 ++ mov r10, #4 @ Outer loop counter for "hard" cases ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 ++2: ++ vld1.32 {q2 }, [r1]! ++ add r6, r4, #32 @ Force initial load in main loop ++ mov r2, r1 ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10 ++ ++ sub r0, #64 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ b 99f ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ @ -128 (rather than +128) means we get UL ++ @ from L & don't have to offset U ++ mov r8, #-128 ++ sub r8, r7 ++2: ++ vld1.32 {q8 }, [r2] ++ add r6, r4, #32 ++ ++ push {r2, r8} ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10 ++ pop {r2, r8} ++ ++ sub r0, #64 ++ subs r10, #1 ++ add r2, #16 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ bne 2b ++ b 99f ++ ++18: ++ cmp r12, #26 ++ mov r5, #16 @ Loop counter for the "easy" cases ++ bge 26f ++ ++@ Left of vertical - works down left ++ vldm r1, {q8-q11} @ Up ++ ldrh r7, [r7] ++ add r6, r4, #32 ++ mov r8, #-128 ++ ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ asr r9, r8, #8 ++ vmov q12, q8 ++ add r8, r7 ++ vmov q13, q9 ++ add r9, r2, r9, lsl #2 ++ vmov q14, q10 ++ vmov q15, q11 ++ vext.32 q11, q10, q11, #3 ++ vext.32 q10, q9, q10, #3 ++ vext.32 q9, q8, q9, #3 ++ vext.32 q8, q8, q8, #3 ++ sub r6, #32 ++ vld1.32 {d16[0]}, [r9] ++ ++1: ++ vmov s1, r6 ++ rsb r12, r6, #32 ++ add r6, r4 ++ vmov s0, r12 ++ ++ vmul.u16 q1, q8, d0[2] ++ vmul.u16 q2, q9, d0[2] ++ vmul.u16 q3, q10, d0[2] ++ vmul.u16 q4, q11, d0[2] ++ vmla.u16 q1, q12, d0[0] ++ vmla.u16 q2, q13, d0[0] ++ vmla.u16 q3, q14, d0[0] ++ vmla.u16 q4, q15, d0[0] ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ vrshr.u16 q3, #5 ++ vrshr.u16 q4, #5 ++ ++ subs r5, #1 ++ vstm r0, {q1-q4} ++ add r0, r3 ++ bne 2b ++ b 99f ++ ++@ Right of vertical - works along top - left unused ++26: ++ vldm r1, {q12-q15} @ Up ++ add r6, r4, #32 @ Force initial load in main loop ++ add r1, #64 ++2: ++ cmp r6, #32 ++ ble 1f ++ ++ vmov q8, q12 ++ vmov q9, q13 ++ vmov q10, q14 ++ vmov q11, q15 ++ vext.32 q12, q13, #1 ++ vext.32 q13, q14, #1 ++ vext.32 q14, q15, #1 ++ vext.32 q15, q15, #1 ++ sub r6, #32 ++ vld1.32 {d31[1]}, [r1]! 
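++        @ q8-q11 hold the current reference row, q12-q15 the same row advanced by one
++        @ chroma pair (next sample loaded above); the two are blended per output row below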
++ ++1: ++ rsb r12, r6, #32 ++ vmov s0, r6 @ Have to use d0-d7 for scalar multiply ++ vmov s1, r12 ++ ++ vmul.u16 q1, q12, d0[0] ++ vmul.u16 q2, q13, d0[0] ++ vmul.u16 q3, q14, d0[0] ++ vmul.u16 q4, q15, d0[0] ++ vmla.u16 q1, q8, d0[2] ++ vmla.u16 q2, q9, d0[2] ++ vmla.u16 q3, q10, d0[2] ++ vmla.u16 q4, q11, d0[2] ++ ++ vrshr.u16 q1, #5 ++ vrshr.u16 q2, #5 ++ vrshr.u16 q3, #5 ++ vrshr.u16 q4, #5 ++ ++ add r6, r4 ++ subs r5, #1 ++ vstm r0, {q1-q4} ++ add r0, r3 ++ bne 2b ++99: ++ vpop {q4 } ++ pop {r4-r10, pc} ++ ++endfunc ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S +new file mode 100644 +index 0000000000..af7ba1f45e +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S +@@ -0,0 +1,682 @@ ++/* ++ * Copyright (c) 2017 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++ ++@ ff_hevc_rpi_pred_dc_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_4_neon_8, export=1 ++ ++ @ Average the els of top & left ++ ldr r2, [r2] ++ vld1.32 {d0[0]}, [r1] ++ mov r1, #2 ++ vmov s1, r2 ++ vmov s2, r2 ++ vmov.i16 q2, #3 ++ add r2, r0, r3 ++ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0] ++ lsl r3, #1 ++ vmovl.u8 q0, d0 ++ vmov.i64 d7, #0xffff ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
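++@ The pairwise adds below reduce to the plain DC value, as a scalar sketch:
++@   dc = (top[0] + .. + top[3] + left[0] + .. + left[3] + 4) >> 3
++@ the vmla with q2 (2, 3, 3, ..) then adds 3*dc (2*dc for pel 0) as part of
++@ the edge smoothing described below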
++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) ++ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vmov.i64 d7, #0xff ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #3 ++ vmla.i16 q0, q2, d6[0] ++ vdup.8 d6, d6[0] ++ vrshrn.i16 d0, q0, #2 ++ ++ @ Store top line ++ vst1.32 {d0[0]}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d1, d0, #5*8 ++ vshr.u64 d2, d0, #6*8 ++ vshr.u64 d3, d0, #7*8 ++ vbif d1, d6, d7 ++ vbif d2, d6, d7 ++ vst1.32 {d1[0]}, [r2], r3 ++ vbif d3, d6, d7 ++ vst1.32 {d2[0]}, [r0] ++ vst1.32 {d3[0]}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {d0}, [r1] ++ vld1.8 {d1}, [r2] ++A add r2, r0, r3, lsl #1 ++A lsl r3, #2 ++T lsl r3, #1 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vaddl.u8 q0, d0, d1 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d2, d0, d0 @ This adds U & V separately ++ vpadd.i32 d3, d0, d0 ++ vrshrn.u16 d0, q1, #3 ++ ++ @ Store ++ vst1.8 {d0}, [r0], r3 ++ vst1.8 {d0}, [r2], r3 ++ vst1.8 {d0}, [r0] ++ vst1.8 {d0}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_8_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {d0}, [r1] ++ mov r1, #2 ++ vld1.8 {d16}, [r2] ++ vmov.i16 q2, #3 ++ vmov.i64 d7, #0xffff ++ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0] ++ vmovl.u8 q0, d0 ++ vadd.i16 d6, d2, d3 @ d6 has 4 vals ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vmov.i64 d7, #0xff ++ vmovl.u8 q1, d16 ++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #4 ++ vmla.i16 q1, q2, d6[0] ++ vmla.i16 q0, q2, d6[0] ++ vdup.8 d6, d6[0] ++ vrshrn.i16 d2, q1, #2 ++ vrshrn.i16 d0, q0, #2 ++ ++ @ Store top line ++ vst1.8 {d0}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d2, #8 ++ vbit d6, d2, d7 ++ vshr.u64 d2, #8 ++ vst1.8 {d6}, [r0], r3 ++ mov r1, #6 ++1: ++ vbit d6, d2, d7 ++ vshr.u64 d2, #8 ++ vst1.8 {d6}, [r0], r3 ++ subs r1, #2 ++ vbit d6, d2, d7 ++ vshr.u64 d2, #8 ++ vst1.8 {d6}, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q8 }, [r1] ++ vld1.8 {q12}, [r2] ++ vaddl.u8 q0, d16, d17 ++ vaddl.u8 q2, d24, d25 ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ mov r1, #4 ++ vpadd.i32 d0, d0 @ This add U & V separately ++ lsl r3, #1 @ pels ++ vrshrn.u16 d0, q0, #4 ++ vdup.u16 q0, d0[0] @ Dup results ++ ++ @ Store ++1: ++ vst1.8 {q0 }, [r0], r3 ++ subs r1, #1 ++ vst1.8 {q0 }, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_16_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 { q8}, [r1] ++ vld1.8 {q12}, [r2] ++ vaddl.u8 q0, d16, d24 ++ vaddl.u8 q2, d17, d25 ++ vmov.u16 r1, d0[0] @ r1 = top[0] + left[0] ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ vpadd.i16 d0, d0 @ 1 (all the same) ++ vrshr.u16 d0, #5 ++ ++ vmov.i64 d31, #0xff ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + dc * 2) ++ ++ vmov.u16 r12, d0[0] @ dc ++ add r2, r12, r12, lsl #1 @ dc*3 ++ add r1, r1, r12, lsl #1 @ top[0] + left[0] + dc*2 ++ ++ vdup.u16 q3, r2 ++ vaddw.u8 q1, q3, d16 ++ vaddw.u8 q2, q3, d17 ++ vmov.u16 d2[0], r1 ++ vrshrn.u16 d2, q1, #2 ++ vrshrn.u16 d3, q2, #2 ++ ++ @ Construct lhs pels ++ vaddw.u8 q2, q3, d24 ++ vaddw.u8 q3, q3, d25 ++ vrshrn.u16 d4, q2, #2 ++ vrshrn.u16 d5, q3, #2 ++ ++ @ Store top line ++ vst1.8 { q1}, [r0], r3 ++ ++ mov r1, #15 ++ vdup.u8 q0, d0[0] ++ ++1: ++ vext.8 q2, q2, #1 ++ vbit d0, d4, d31 ++ subs r1, #1 ++ vst1.8 { q0}, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 { q8, q9}, [r1] ++ vld1.8 {q12,q13}, [r2] ++ vaddl.u8 q0, d16, d17 ++ vaddl.u8 q1, d18, d19 ++ vaddl.u8 q2, d24, d25 ++ vaddl.u8 q3, d26, d27 ++ vadd.i16 q0, q1 ++ vadd.i16 q2, q3 ++ vadd.i16 q0, q2 ++ lsl r3, #1 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ mov r1, #4 ++ vpadd.i32 d0, d0 @ This add U & V separately ++ add r2, r0, r3 ++ vmov d1, d0 ++ lsl r3, #1 ++ vrshrn.u16 d0, q0, #5 ++ vmov d1, d0 @ Dup results ++ vmov q1, q0 ++ ++ @ Store ++1: ++ vst1.8 { q0, q1}, [r0], r3 ++ 
vst1.8 { q0, q1}, [r2], r3 ++ subs r1, #1 ++ vst1.8 { q0, q1}, [r0], r3 ++ vst1.8 { q0, q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_32_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q8, q9 }, [r1] ++ vld1.8 {q12, q13}, [r2] ++ vaddl.u8 q0, d16, d17 ++ vaddl.u8 q1, d18, d19 ++ vaddl.u8 q2, d24, d25 ++ vaddl.u8 q3, d26, d27 ++ vadd.i16 q0, q1 ++ vadd.i16 q2, q3 ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ mov r1, #8 ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ add r2, r0, r3 ++ vpadd.i16 d0, d0 @ 1 (all the same) ++ lsl r3, #1 ++ vrshrn.u16 d0, q0, #6 ++ vdup.u8 q1, d0[0] @ Dup results ++ vdup.u8 q0, d0[0] ++ ++ @ Store ++1: ++ vst1.8 {q0, q1 }, [r0], r3 ++ vst1.8 {q0, q1 }, [r2], r3 ++ subs r1, #1 ++ vst1.8 {q0, q1 }, [r0], r3 ++ vst1.8 {q0, q1 }, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ----------------------------------------------------------------------------- ++@ ++@ 10 Bit versions ++@ ++@ There is no actual bit depth dependency in this code except that our ++@ intermediate results will overflow the 16 bits they are stored in ++@ All there functions are good to 10 bits - with the worst case being ++@ in dc_32 where we use all 16 bits. ++ ++ ++@ ff_hevc_rpi_pred_dc_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_4_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {d0}, [r1] ++ mov r1, #2 ++ vld1.16 {d1}, [r2] ++T lsl r3, #1 ++ vmov.i16 q2, #3 ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0] ++A lsl r3, #2 ++T lsl r3, #1 ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vmov.i64 d7, #0xffff ++ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #3 ++ vmla.i16 q0, q2, d6[0] ++ vrshr.u16 q0, #2 ++ ++ @ Store top line ++ vst1.16 {d0}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d3, d1, #1*16 ++ vshr.u64 d4, d1, #2*16 ++ vshr.u64 d5, d1, #3*16 ++ vbif d3, d6, d7 ++ vbif d4, d6, d7 ++ vst1.16 {d3}, [r2], r3 ++ vbif d5, d6, d7 ++ vst1.16 {d4}, [r0] ++ vst1.16 {d5}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q0}, [r1] ++ vld1.8 {q1}, [r2] ++A add r2, r0, r3, lsl #2 ++A lsl r3, #3 ++T lsl r3, #2 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vadd.i16 q0, q1 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d2, d0, d0 @ This adds U & V separately ++ vpadd.i32 d3, d0, d0 ++ vrshr.u16 q0, q1, #3 ++ ++ vst1.16 {q0}, [r0], r3 ++ vst1.16 {q0}, [r2], r3 ++ vst1.16 {q0}, [r0] ++ vst1.16 {q0}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_8_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {q0}, [r1] ++ mov r1, #2 ++ vld1.16 {q8}, [r2] ++T lsl r3, #1 ++ vmov.i16 q2, #3 ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0] ++A lsl r3, #2 ++T lsl r3, #1 ++ vmov.i64 d7, #0xffff ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vadd.i16 d6, d2, d3 @ d6 has 4 vals ++ vbit d0, d2, d7 @ top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #4 ++ vmla.i16 q8, q2, d6[0] ++ vmla.i16 q0, q2, d6[0] ++ vdup.16 q2, d6[0] ++ vdup.16 q9, d6[0] ++ vrshr.u16 q8, q8, #2 ++ vrshr.u16 q0, q0, #2 ++ vext.16 q1, q8, q8, #1 ++ ++ @ Store top line ++ vst1.16 {q0}, [r0], r3 ++ ++ @ Store the rest ++ vbit d18, d2, d7 ++ vst1.16 {q9}, [r2], r3 ++ mov r1, #6 ++1: ++ vext.16 q8, q8, q8, #2 ++ subs r1, #2 ++ vext.16 q1, q1, q1, #2 ++ vbit d4, d16, d7 ++ vst1.16 {q2}, [r0], r3 ++ vbit d18, d2, d7 ++ vst1.16 {q9}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1 ++ vld1.8 { q8, q9 }, [r1] ++ vld1.8 {q12, q13}, [r2] ++ vadd.i16 q8, q9 ++ vadd.i16 q12, q13 ++ vadd.i16 q8, q12 ++ vadd.i16 d16, d17 @ d16 has 2 pairs ++ mov r1, #4 ++ vpadd.i32 d16, d16 ++ lsl r3, #2 @ stride in pels ++ vrshr.u16 d16, #4 ++ vdup.u32 q9, d16[0]; ++ vdup.u32 q8, d16[0]; ++ ++ @ Store ++1: ++ vst1.16 {q8, q9 }, [r0], r3 ++ subs r1, #1 ++ vst1.16 {q8, q9 }, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_16_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {q8, q9 }, [r1] ++ vld1.16 {q12, q13}, [r2] ++ lsl r3, #1 @ stride given in pels ++ vadd.u16 q0, q8, q12 ++ vadd.u16 q2, q9, q13 ++ vmov.u16 r1, d0[0] @ r1 = top[0] + left[0] ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ vpadd.i16 d0, d0 @ 1 (all the same) ++ vrshr.u16 d0, #5 ++ ++ vmov.i64 d31, #0xffff ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + dc * 2) ++ ++ vmov.u16 r12, d0[0] @ dc ++ add r2, r12, r12, lsl #1 @ dc*3 ++ add r1, r1, r12, lsl #1 @ top[0] + left[0] + dc*2 ++ ++ vdup.u16 q3, r2 ++ vadd.u16 q8, q3 ++ vadd.u16 q9, q3 ++ vmov.u16 d16[0], r1 ++ vrshr.u16 q8, #2 ++ vrshr.u16 q9, #2 ++ ++ @ Construct lhs pels ++ vadd.u16 q12, q3 ++ vadd.u16 q13, q3 ++ vrshr.u16 q12, #2 ++ vrshr.u16 q13, #2 ++ ++ @ Store top line ++ vst1.16 {q8, q9 }, [r0], r3 ++ ++ mov r1, #15 ++ vdup.u16 q1, d0[0] ++ vdup.u16 q0, d0[0] ++ ++1: ++ vext.16 q12, q13, #1 ++ vext.16 q13, q13, #1 ++ vbit d0, d24, d31 ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vldm r1, { q8-q11} ++ vldm r2, {q12-q15} ++ vadd.i16 q8, q9 ++ vadd.i16 q10, q11 ++ vadd.i16 q12, q13 ++ vadd.i16 q14, q15 ++ vadd.i16 q8, q10 ++ vadd.i16 q12, q14 ++ vadd.i16 q8, q12 ++ vadd.i16 d16, d17 @ d16 has 2 pairs ++ mov r1, #8 ++ vpadd.i32 d16, d16 ++ lsl r3, #2 @ stride in pels ++ vrshr.u16 d16, #5 ++ vmov d17, d16 @ Dup results ++ vmov q9, q8 ++ vmov q10, q8 ++ vmov 
q11, q8 ++ ++ @ Store ++1: ++ vstm r0, {q8-q11} ++ add r0, r3 ++ subs r1, #1 ++ vstm r0, {q8-q11} ++ add r0, r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels) ++ ++function ff_hevc_rpi_pred_dc_32_neon_10, export=1 ++ ++ @ Average the els of top & left ++ @ With 10 bits we are (just) safe from overflow in i16 ++ vldm r1, { q8-q11} ++ vldm r2, {q12-q15} ++ vadd.i16 q8, q9 ++ vadd.i16 q10, q11 ++ vadd.i16 q12, q13 ++ vadd.i16 q14, q15 ++ vadd.i16 q8, q10 ++ vadd.i16 q12, q14 ++ vadd.i16 q8, q12 ++ vadd.i16 d16, d17 @ d16 has 4 vals ++ mov r1, #16 ++ vpadd.i16 d16, d16 @ 2 (top & bottom the same) ++ lsl r3, #1 @ stride in pels ++ vpadd.i16 d16, d16 @ 1 (all the same) ++ vrshr.u16 d16, #6 ++ vmov d17, d16 @ Dup results ++ vmov q9, q8 ++ vmov q10, q8 ++ vmov q11, q8 ++ ++ @ Store ++1: ++ vstm r0, { q8-q11} ++ add r0, r3 ++ subs r1, #1 ++ vstm r0, { q8-q11} ++ add r0, r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S +new file mode 100644 +index 0000000000..ccf13a081f +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S +@@ -0,0 +1,888 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * Horizontal & Vertical special cases of angular intra pred ++ * ++ * Split out because: ++ * Vertical, at least, is relatively common ++ * Much simpler code than the general angular case ++ * Luma with size < 32 has extra filtering that doesn't happen anywhere else ++ * ++ * *** Currently luma filtering is mandatory where it occurs, but there are ++ * cases where it should be turned off (rdpcm & an extension sps flag). 
++ * These don't occur in the standard conformance suite for Main Profile ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ ff_hevc_rpi_pred_vertical_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_4_neon_8, export=1 ++ vld1.32 {d0[0] }, [r1 :32] @ Up ++ ldrb r12, [r2, #-1] @ Up-left ++ vld1.32 {d24[0]}, [r2 :32] @ left ++ ++ vdup.8 d4, r12 ++ vmov.u8 d6, #128 ++ vhsub.u8 d24, d4 ++ ++ veor.8 d2, d0, d6 @ Make -128,127 so we can qadd ++ mov r1, #4 ++ vdup.8 d2, d2[0] ++ vqadd.s8 d24, d2 ++ vmov.i64 d4, #0xff ++ veor.8 d24, d6 ++ ++1: ++ vbit.8 d0, d24, d4 ++ vext.8 d24, d24, #1 ++ subs r1, #1 ++ vst1.32 {d0[0] }, [r0 :32], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_8_neon_8, export=1 ++ vld1.8 {d0 }, [r1 :64] @ Up ++ ldrb r12, [r2, #-1] @ Up-left ++ vld1.8 {d24}, [r2 :64] @ left ++ ++ vdup.8 d4, r12 ++ vmov.u8 d6, #128 ++ vhsub.u8 d24, d4 ++ ++ veor.8 d2, d0, d6 @ Make -128,127 so we can qadd ++ mov r1, #8 ++ vdup.8 d2, d2[0] ++ vqadd.s8 d24, d2 ++ vmov.i64 d4, #0xff ++ veor.8 d24, d6 ++ ++1: ++ vbit.8 d0, d24, d4 ++ vext.8 d24, d24, #1 ++ subs r1, #1 ++ vst1.8 {d0 }, [r0 :64], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_16_neon_8, export=1 ++ vld1.8 {q0 }, [r1 :128] @ Up ++ ldrb r12, [r2, #-1] @ Up-left ++ vld1.8 {q12}, [r2 :128] @ left ++ ++ vdup.8 q2, r12 ++ vmov.u8 q3, #128 ++ vhsub.u8 q12, q2 ++ ++ veor.8 d2, d0, d6 @ Make -128,127 so we can qadd ++ vdup.8 q1, d2[0] ++ vqadd.s8 q12, q1 ++ veor.8 q12, q3 ++ ++ vmov.i64 d4, #0xff ++ mov r1, #16 ++1: ++ vbit.8 d0, d24, d4 ++ vext.8 q12, q12, #1 ++ subs r1, #1 ++ vst1.8 {q0 }, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vert_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_32_neon_8, export=1 ++ vld1.8 {q0, q1 }, [r1 :128] @ Up ++ add r2, r0, r3 ++ lsl r3, #1 ++ mov r1, #16 ++1: ++ vst1.8 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.8 {q0, q1 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1 ++ vld1.16 {d0 }, [r1 :64] @ Up ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ ++ vst1.16 {d0 }, [r0 :64], r3 ++ vst1.16 {d0 }, [r2 :64], r3 ++ vst1.16 {d0 }, [r0 :64] ++ vst1.16 {d0 }, [r2 :64] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1 ++ vld1.16 {q0 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ mov r1, #4 ++1: ++ vst1.16 {q0 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q0 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ 
++function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1 ++ vld1.16 {q0, q1 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ mov r1, #8 ++1: ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontalal_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++@ ? Might be faster as simple arm ++ ++function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1 ++ vld1.32 {d0[0] }, [r1 :32] @ Up ++ ldrb r12, [r2, #-1] @ Up-left ++ vld1.32 {d16[0]}, [r2 :32] @ left ++ ++ vdup.8 d4, r12 ++ vmov.u8 d6, #128 ++ vhsub.u8 d0, d4 ++ ++ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd ++ add r2, r0, r3 ++ vdup.8 d2, d2[0] ++ lsl r3, #1 ++ vqadd.s8 d0, d2 ++ veor.8 d0, d6 ++ ++ vdup.8 d1, d16[1] ++ vdup.8 d2, d16[2] ++ vdup.8 d3, d16[3] ++ vst1.32 {d0[0] }, [r0 :32], r3 ++ vst1.32 {d1[0] }, [r2 :32], r3 ++ vst1.32 {d2[0] }, [r0 :32] ++ vst1.32 {d3[0] }, [r2 :32] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1 ++ vld1.8 {d0 }, [r1 :64] @ Up ++ ldrb r12, [r2, #-1] @ Up-left ++ vld1.8 {d16}, [r2 :64] @ left ++ ++ vdup.8 d4, r12 ++ vmov.u8 d6, #128 ++ vhsub.u8 d0, d4 ++ ++ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd ++ add r2, r0, r3 ++ vdup.8 d2, d2[0] ++ lsl r3, #1 ++ vqadd.s8 d0, d2 ++ mov r1, #3 ++ veor.8 d0, d6 ++ ++ vdup.8 d4, d16[1] ++ vst1.8 {d0 }, [r0 :64], r3 ++ vst1.8 {d4 }, [r2 :64], r3 ++ ++1: ++ vext.8 d16, d16, #2 ++ subs r1, #1 ++ vdup.8 d0, d16[0] ++ vdup.8 d4, d16[1] ++ vst1.8 {d0 }, [r0 :64], r3 ++ vst1.8 {d4 }, [r2 :64], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1 ++ vld1.8 {q0 }, [r1 :128] @ Up ++ ldrb r12, [r2, #-1] @ Up-left ++ vld1.8 {q8 }, [r2 :128] @ left ++ ++ vdup.8 q2, r12 ++ vmov.u8 q3, #128 ++ vhsub.u8 q0, q2 ++ ++ veor.8 d2, d16, d6 @ Make -128,127 so we can qadd ++ add r2, r0, r3 ++ vdup.8 q1, d2[0] ++ lsl r3, #1 ++ vqadd.s8 q0, q1 ++ mov r1, #7 ++ veor.8 q0, q3 ++ ++ vdup.8 q2, d16[1] ++ vst1.8 {q0 }, [r0 :128], r3 ++ vst1.8 {q2 }, [r2 :128], r3 ++ ++1: ++ vext.8 q8, q8, #2 ++ subs r1, #1 ++ vdup.8 q0, d16[0] ++ vdup.8 q2, d16[1] ++ vst1.8 {q0 }, [r0 :128], r3 ++ vst1.8 {q2 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1 ++ vld1.8 {q8, q9 }, [r2 :128] @ Left ++ add r2, r0, r3 ++ lsl r3, #1 ++ mov r1, #16 ++1: ++ vdup.8 q0, d16[0] ++ vdup.8 q1, d16[0] ++ vdup.8 q2, d16[1] ++ vdup.8 q3, d16[1] ++ vext.8 q8, q9, #2 ++ vext.8 q9, q9, #2 ++ vst1.8 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.8 {q2, q3 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1 ++ vld1.16 {d16}, [r2 :64] @ Left ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ ++ vdup.16 d0, d16[0] ++ vdup.16 
d1, d16[1] ++ vdup.16 d2, d16[2] ++ vdup.16 d3, d16[3] ++ ++ vst1.16 {d0 }, [r0 :64], r3 ++ vst1.16 {d1 }, [r2 :64], r3 ++ vst1.16 {d2 }, [r0 :64] ++ vst1.16 {d3 }, [r2 :64] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1 ++ vld1.16 {q8 }, [r2 :128] @ Left ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ mov r1, #4 ++1: ++ vdup.16 q0, d16[0] ++ vdup.16 q2, d16[1] ++ vext.16 q8, q8, #2 ++ vst1.16 {q0 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q2 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1 ++ vld1.16 {q8, q9 }, [r2 :128] @ Left ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ mov r1, #8 ++1: ++ vdup.16 q0, d16[0] ++ vdup.16 q1, d16[0] ++ vdup.16 q2, d16[1] ++ vdup.16 q3, d16[1] ++ vext.16 q8, q9, #2 ++ vext.16 q9, q9, #2 ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q2, q3 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@------------------------------------------------------------------------------ ++@ ++@ 10 Bit ++@ Has clipping constants so 10-bit only but could easily be macroed up to ++@ 14-bit before we run out of bits ++ ++ ++@ ff_hevc_rpi_pred_vertical_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_4_neon_10, export=1 ++ vld1.16 {d0 }, [r1 :64] @ Up ++ ldrh r12, [r2, #-2] @ Up-left ++ vld1.16 {d24}, [r2 :64] @ left ++ ++ vdup.16 d4, r12 ++ lsl r3, #1 ++ vhsub.u16 d24, d4 ++ ++ vdup.16 d6, d0[0] ++ vmov.s16 d4, #0 ++ vadd.i16 d24, d6 ++ ++ vmov.s16 d6, #0x3ff ++ vmax.s16 d24, d4 ++ vmov.i64 d4, #0xffff ++ vmin.s16 d24, d6 ++ ++ mov r1, #4 ++1: ++ vbit.8 d0, d24, d4 ++ vext.16 d24, d24, #1 ++ subs r1, #1 ++ vst1.16 {d0 }, [r0 :64], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_8_neon_10, export=1 ++ vld1.16 {q0 }, [r1 :128] @ Up ++ ldrh r12, [r2, #-2] @ Up-left ++ vld1.16 {q12}, [r2 :128] @ left ++ ++ vdup.16 q2, r12 ++ lsl r3, #1 ++ vhsub.u16 q12, q2 ++ ++ vdup.16 q3, d0[0] ++ vmov.s16 q2, #0 ++ vadd.i16 q12, q3 ++ ++ vmov.s16 q3, #0x3ff ++ vmax.s16 q12, q2 ++ vmin.s16 q12, q3 ++ ++ vmov.i64 d4, #0xffff ++ mov r1, #8 ++1: ++ vbit.8 d0, d24, d4 ++ vext.16 q12, q12, #1 ++ subs r1, #1 ++ vst1.16 {q0 }, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_16_neon_10, export=1 ++ vld1.16 {q0, q1 }, [r1 :128] @ Up ++ ldrh r12, [r2, #-2] @ Up-left ++ vld1.16 {q12, q13}, [r2 :128] @ left ++ ++ vdup.16 q2, r12 ++ lsl r3, #1 ++ vhsub.u16 q12, q2 ++ vhsub.u16 q13, q2 ++ ++ vdup.16 q3, d0[0] ++ vmov.s16 q2, #0 ++ vadd.i16 q12, q3 ++ vadd.i16 q13, q3 ++ ++ vmov.s16 q3, #0x3ff ++ vmax.s16 q12, q2 ++ vmax.s16 q13, q2 ++ vmin.s16 q12, q3 ++ vmin.s16 q13, q3 ++ ++ vmov.i64 d4, #0xffff ++ mov r1, #16 ++1: ++ vbit.8 d0, d24, d4 ++ vext.16 q12, q13, #1 ++ vext.16 q13, q13, #1 ++ subs 
r1, #1 ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_32_neon_10, export=1 ++ vldm r1, { q0-q3 } @ Up ++ mov r1, #32 ++1: ++ subs r1, #1 ++ vstm r0, { q0-q3 } ++ add r0, r0, r3, lsl #1 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1 ++ vld1.16 {q0 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #2 ++ lsl r3, #3 ++ ++ vst1.16 {q0 }, [r0 :128], r3 ++ vst1.16 {q0 }, [r2 :128], r3 ++ vst1.16 {q0 }, [r0 :128] ++ vst1.16 {q0 }, [r2 :128] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1 ++ vld1.16 {q0, q1 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #2 ++ lsl r3, #3 ++ mov r1, #4 ++1: ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1 ++ vldm r1, { q0-q3 } @ Up ++ mov r1, #16 ++1: ++ subs r1, #1 ++ vstm r0, { q0-q3 } ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++@ ff_hevc_rpi_pred_horizontal_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1 ++ vld1.16 {d0 }, [r1 :64] @ Up ++ ldrh r12, [r2, #-2] @ Up-left ++ vld1.16 {d16}, [r2 :64] @ left ++ ++ vdup.16 d4, r12 ++ add r2, r0, r3, lsl #1 ++ vhsub.u16 d0, d4 ++ ++ vdup.16 d6, d16[0] ++ vmov.s16 d4, #0 ++ vadd.i16 d0, d6 ++ ++ vmov.s16 d6, #0x3ff ++ vmax.s16 d0, d4 ++ lsl r3, #2 ++ vmin.s16 d0, d6 ++ ++ vdup.16 d1, d16[1] ++ vdup.16 d2, d16[2] ++ vdup.16 d3, d16[3] ++ ++ vst1.16 {d0 }, [r0 :64], r3 ++ vst1.16 {d1 }, [r2 :64], r3 ++ vst1.16 {d2 }, [r0 :64] ++ vst1.16 {d3 }, [r2 :64] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1 ++ vld1.16 {q0 }, [r1 :128] @ Up ++ ldrh r12, [r2, #-2] @ Up-left ++ vld1.16 {q8 }, [r2 :128] @ left ++ ++ vdup.16 q2, r12 ++ add r2, r0, r3, lsl #1 ++ vhsub.u16 q0, q2 ++ ++ vdup.16 q3, d16[0] ++ lsl r3, #2 ++ vmov.s16 q2, #0 ++ vadd.i16 q0, q3 ++ ++ mov r1, #3 ++ vmov.s16 q3, #0x3ff ++ vmax.s16 q0, q2 ++ vmin.s16 q0, q3 ++ ++ vdup.16 q2, d16[1] ++ ++ vst1.16 {q0 }, [r0 :128], r3 ++ vst1.16 {q2 }, [r2 :128], r3 ++1: ++ vext.16 q8, q8, #2 ++ vdup.16 q0, d16[0] ++ vdup.16 q2, d16[1] ++ subs r1, #1 ++ vst1.16 {q0 }, [r0 :128], r3 ++ vst1.16 {q2 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontalal_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1 ++ vld1.16 {q0, q1 }, [r1 :128] @ Up ++ ldrh r12, [r2, #-2] @ Up-left ++ vld1.16 {q8, q9 }, 
[r2 :128] @ left ++ ++ ++ vdup.16 q2, r12 ++ add r2, r0, r3, lsl #1 ++ vhsub.u16 q0, q2 ++ vhsub.u16 q1, q2 ++ ++ vdup.16 q3, d16[0] ++ lsl r3, #2 ++ vmov.s16 q2, #0 ++ vadd.i16 q0, q3 ++ vadd.i16 q1, q3 ++ ++ mov r1, #7 ++ vmov.s16 q3, #0x3ff ++ vmax.s16 q0, q2 ++ vmax.s16 q1, q2 ++ vmin.s16 q0, q3 ++ vmin.s16 q1, q3 ++ ++ vdup.16 q2, d16[1] ++ vdup.16 q3, d16[1] ++ ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ vst1.16 {q2, q3 }, [r2 :128], r3 ++1: ++ vext.16 q8, q9, #2 ++ vext.16 q9, q9, #2 ++ vdup.16 q0, d16[0] ++ vdup.16 q1, d16[0] ++ vdup.16 q2, d16[1] ++ vdup.16 q3, d16[1] ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ vst1.16 {q2, q3 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1 ++ vldm r2, { q8-q11} ++ mov r1, #16 ++1: ++ vdup.16 q0, d16[0] ++ vdup.16 q1, d16[0] ++ vdup.16 q2, d16[0] ++ vdup.16 q3, d16[0] ++ add r2, r0, r3, lsl #1 ++ vdup.16 q12, d16[1] ++ vdup.16 q13, d16[1] ++ vdup.16 q14, d16[1] ++ vdup.16 q15, d16[1] ++ vstm r0, { q0-q3 } ++ vstm r2, {q12-q15} ++ ++ vext.16 q8, q9, #2 ++ vext.16 q9, q10, #2 ++ add r0, r0, r3, lsl #2 ++ vext.16 q10, q11, #2 ++ subs r1, #1 ++ vext.16 q11, q11, #2 ++ ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1 ++ vld1.16 {q8 }, [r2 :128] @ Left ++ add r2, r0, r3, lsl #2 ++ lsl r3, #3 ++ ++ vdup.32 q0, d16[0] ++ vdup.32 q1, d16[1] ++ vdup.32 q2, d17[0] ++ vdup.32 q3, d17[1] ++ ++ vst1.32 {q0 }, [r0 :128], r3 ++ vst1.16 {q1 }, [r2 :128], r3 ++ vst1.32 {q2 }, [r0 :128] ++ vst1.16 {q3 }, [r2 :128] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1 ++ vld1.16 {q8, q9 }, [r2 :128] @ Left ++ add r2, r0, r3, lsl #2 ++ lsl r3, #3 ++ mov r1, #4 ++1: ++ vdup.32 q0, d16[0] ++ vdup.32 q1, d16[0] ++ vdup.32 q2, d16[1] ++ vdup.32 q3, d16[1] ++ vext.32 q8, q9, #2 ++ vext.32 q9, q9, #2 ++ vst1.32 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.32 {q2, q3 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1 ++ vldm r2, { q8-q11} ++ mov r1, #8 ++1: ++ vdup.32 q0, d16[0] ++ vdup.32 q1, d16[0] ++ vdup.32 q2, d16[0] ++ vdup.32 q3, d16[0] ++ add r2, r0, r3, lsl #2 ++ vdup.32 q12, d16[1] ++ vdup.32 q13, d16[1] ++ vdup.32 q14, d16[1] ++ vdup.32 q15, d16[1] ++ vstm r0, { q0-q3 } ++ vstm r2, {q12-q15} ++ ++ vext.32 q8, q9, #2 ++ vext.32 q9, q10, #2 ++ add r0, r0, r3, lsl #3 ++ vext.32 q10, q11, #2 ++ subs r1, #1 ++ vext.32 q11, q11, #2 ++ ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S +new file mode 100644 +index 0000000000..9fb3633862 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S +@@ -0,0 +1,930 @@ ++/* ++ * Copyright (c) 2017 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ Planar intra pred (8.4.4.2.4) ++@ ++@ predSamples[ x ][ y ] = ++@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] + ++@ ( x + 1 ) * p[ nTbS ][ -1 ] + ++@ ( nTbS - 1 - y ) * p[ x ][ -1 ] + ++@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 ) ++ ++@ ff_hevc_rpi_pred_planar_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_4_neon_8, export=1 ++ adr r12, nb_3_0_1_4 ++ vld1.8 {d24}, [r2] @ Left ++ vld1.8 {d0 }, [r1] @ Up ++ vld1.8 {q8 }, [r12 :128] @ 3.. ++ ++ vdup.8 d30, d24[4] ++ vdup.8 d31, d0[4] ++ ++ vdup.32 d0, d0[0] @ copy lo -> hi ++ vsubl.u8 q2, d30, d0 @ Add set up ++ ++ vshll.u8 q0, d0, #2 ++ add r1, r0, r3 ++ vmlal.u8 q0, d17, d31 @ Acc set up - q8-q9 free ++ ++ vshl.i16 q3, q2, #1 ++ vadd.i16 d0, d4 ++ vadd.i16 d1, d6 ++ lsl r3, #1 ++ vadd.i16 q1, q0, q3 ++ ++ vdup.u8 d20, d24[0] ++ vdup.u8 d21, d24[1] ++ vdup.u8 d22, d24[2] ++ vdup.u8 d23, d24[3] ++ ++ vtrn.32 d20, d21 ++ vtrn.32 d22, d23 ++ ++ vmull.u8 q10, d16, d20 ++ vmull.u8 q11, d16, d22 ++ vadd.i16 q10, q0 ++ vadd.i16 q11, q1 ++ ++ vrshrn.u16 d28, q10, #3 ++ vrshrn.u16 d29, q11, #3 ++ ++ vst1.32 {d28[0]}, [r0 :32], r3 ++ vst1.32 {d28[1]}, [r1 :32], r3 ++ vst1.32 {d29[0]}, [r0 :32] ++ vst1.32 {d29[1]}, [r1 :32] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_8_neon_8, export=1 ++ adr r12, nb_7_0_1_8 ++ vld1.8 {q12}, [r2] @ Left ++ vld1.8 {q0 }, [r1] @ Up ++ vld1.8 {q8 }, [r12 :128] @ 7.. ++ ++ vdup.8 d30, d25[0] ++ vdup.8 d31, d1[0] ++ ++ mov r1, #8 ++ vsubl.u8 q2, d30, d0 @ Add set up ++ ++ vshll.u8 q0, d0, #3 ++ vmlal.u8 q0, d17, d31 @ Acc set up - q8-q9 free ++ ++@ u8 7..0 [1] d16 ++@ u8 left[y] [1] d24 ++@ u16 acc [2] q0 .. q1 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q2 .. q3 = p[-1][nTbs] - p[x][-1] ++1: ++ vadd.i16 q0, q2 ++ ++ vdup.u8 d20, d24[0] ++ vext.8 d24, d24, #1 ++ ++ vmull.u8 q10, d16, d20 ++ vadd.i16 q10, q0 ++ ++ vrshrn.u16 d28, q10, #4 ++ ++ subs r1, #1 ++ vst1.8 {d28}, [r0 :64], r3 ++ ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_16_neon_8, export=1 ++ vld1.8 {q12}, [r2 :128] @ Left ++ ldrb r2, [r2, #16] @ Down left - could have this in q13, but avoid that much overrread ++ adr r12, nb_15_0_1_16 ++ vld1.8 {q0 }, [r1 :128] @ Up ++ ldrb r1, [r1, #16] @ Up-right ++ vld1.8 {q8, q9 }, [r12 :128] @ 15... 
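++@ A sketch of the incremental scheme used here (nTbS = 16): at row y the
++@ accumulator holds, per column x,
++@   acc[x] = (x+1)*top_right + (nTbS-1-y)*top[x] + (y+1)*bottom_left
++@ so each row only needs acc[x] += bottom_left - top[x], then
++@   pred[x][y] = (acc[x] + (nTbS-1-x)*left[y] + nTbS) >> (log2(nTbS)+1)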
++ ++ vdup.8 d30, r2 ++ vdup.8 d31, r1 ++ ++ mov r1, #16 ++ vsubl.u8 q3, d30, d1 ++ vsubl.u8 q2, d30, d0 @ Add set up ++ ++ vshll.u8 q1, d1, #4 ++ vshll.u8 q0, d0, #4 ++ vmlal.u8 q1, d19, d31 ++ vmlal.u8 q0, d18, d31 @ Acc set up - q8-q9 free ++ ++@ u8 15..0 [1] q8 ++@ u8 left[y] [1] q12 ++@ u16 acc [2] q0 .. q1 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q2 .. q3 = p[-1][nTbs] - p[x][-1] ++1: ++ vadd.i16 q1, q3 ++ vadd.i16 q0, q2 ++ ++ vdup.u8 d20, d24[0] ++ vext.8 q12, q12, #1 ++ ++ vmull.u8 q11, d17, d20 ++ vmull.u8 q10, d16, d20 ++ ++ vadd.i16 q11, q1 ++ vadd.i16 q10, q0 ++ ++ vrshrn.u16 d29, q11, #5 ++ vrshrn.u16 d28, q10, #5 ++ ++ subs r1, #1 ++ vst1.8 {q14}, [r0 :128], r3 ++ ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_32_neon_8, export=1 ++ vpush {q4-q7} ++ vld1.8 {q12, q13}, [r2 :128]! @ Left ++ adr r12, nb_31_0_1_32 ++ vld1.8 {q0, q1 }, [r1 :128]! @ Up ++ vld1.8 {d30[0]}, [r2] @ Down left ++ vld1.8 {d31[0]}, [r1] @ Up-right ++ vldm r12, { q8-q11} @ 1..32, 31..0 ++ ++ vdup.8 d30, d30[0] ++ vdup.8 d31, d31[0] ++ ++ vsubl.u8 q7, d30, d3 ++ vsubl.u8 q6, d30, d2 ++ vsubl.u8 q5, d30, d1 ++ vsubl.u8 q4, d30, d0 @ Add set up ++ ++ vshll.u8 q3, d3, #5 ++ vshll.u8 q2, d2, #5 ++ vshll.u8 q1, d1, #5 ++ vshll.u8 q0, d0, #5 ++ vmlal.u8 q3, d23, d31 ++ vmlal.u8 q2, d22, d31 ++ vmlal.u8 q1, d21, d31 ++ vmlal.u8 q0, d20, d31 @ Acc set up - q8-q9 free ++ ++ mov r1, #32 ++ ++@ u8 31..0 [2] q10, q11 ++@ u8 left[y] [2] q12, q13 ++@ u16 acc [4] q0 .. q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q4 .. q7 = p[-1][nTbs] - p[x][-1] ++1: ++ vadd.i16 q3, q7 ++ vadd.i16 q2, q6 ++ vadd.i16 q1, q5 ++ vadd.i16 q0, q4 ++ ++ vdup.u8 d20, d24[0] ++ vext.8 q12, q13, #1 ++ vext.8 q13, q13, #1 ++ ++ vmull.u8 q15, d19, d20 ++ vmull.u8 q14, d18, d20 ++ vmull.u8 q11, d17, d20 ++ vmull.u8 q10, d16, d20 ++ ++ vadd.i16 q15, q3 ++ vadd.i16 q14, q2 ++ vadd.i16 q11, q1 ++ vadd.i16 q10, q0 ++ ++ vrshrn.u16 d31, q15, #6 ++ vrshrn.u16 d30, q14, #6 ++ vrshrn.u16 d29, q11, #6 ++ vrshrn.u16 d28, q10, #6 ++ ++ subs r1, #1 ++ vst1.8 {q14, q15}, [r0 :128], r3 ++ ++ bne 1b ++ ++ vpop {q4-q7} ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1 ++ vld1.8 {q12}, [r2 :64] @ Left + down-left - <1d of overread is OK ++ adr r12, nbx2_3_0_1_4 ++ vld1.8 {q0 }, [r1 :64] @ Up + up right ++ vld1.8 {q8 }, [r12 :128] @ 3,3.. 
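++@ Chroma is stored as interleaved U,V pairs, so the x weights in the
++@ nbx2_* tables are simply duplicated per pel pair; apart from that the
++@ arithmetic is the same as the luma planar cases above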
++ ++ vdup.16 d30, d25[0] ++ vdup.16 d31, d1[0] ++ ++ mov r1, #4 ++ vsubl.u8 q2, d30, d0 @ Add set up ++ ++ lsl r3, #1 ++ vshll.u8 q0, d0, #2 ++ vmlal.u8 q0, d17, d31 @ Acc set up - q8-q9 free ++ ++@ u8 3,3..0,0 [1] d16 ++@ u8 left[y] [1] d24 ++@ u16 acc [1] q0 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [1] q2 = p[-1][nTbs] - p[x][-1] ++1: ++ vadd.i16 q0, q2 ++ ++ vdup.u16 d20, d24[0] ++ vext.16 d24, d24, #1 ++ ++ vmull.u8 q10, d16, d20 ++ ++ vadd.i16 q10, q0 ++ ++ vrshrn.u16 d28, q10, #3 ++ ++ subs r1, #1 ++ vst1.8 {d28}, [r0 :64], r3 ++ ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1 ++ vld1.8 {q12}, [r2 :128] @ Left ++ ldrh r2, [r2, #16] @ Down left - could have this in q13, but avoid that much overrread ++ adr r12, nbx2_7_0_1_8 ++ vld1.8 {q0 }, [r1 :128] @ Up ++ ldrh r1, [r1, #16] @ Up-right ++ vld1.8 {q8, q9 }, [r12 :128] @ 7,7... ++ ++ vdup.16 d30, r2 ++ vdup.16 d31, r1 ++ ++ mov r1, #8 ++ vsubl.u8 q3, d30, d1 ++ vsubl.u8 q2, d30, d0 @ Add set up ++ ++ lsl r3, #1 ++ vshll.u8 q1, d1, #3 ++ vshll.u8 q0, d0, #3 ++ vmlal.u8 q1, d19, d31 ++ vmlal.u8 q0, d18, d31 @ Acc set up - q8-q9 free ++ ++@ u8 7,7..0,0 [1] q8 ++@ u8 left[y] [1] q12 ++@ u16 acc [2] q0 .. q1 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q2 .. q3 = p[-1][nTbs] - p[x][-1] ++1: ++ vadd.i16 q1, q3 ++ vadd.i16 q0, q2 ++ ++ vdup.u16 d20, d24[0] ++ vext.16 q12, q12, #1 ++ ++ vmull.u8 q11, d17, d20 ++ vmull.u8 q10, d16, d20 ++ ++ vadd.i16 q11, q1 ++ vadd.i16 q10, q0 ++ ++ vrshrn.u16 d29, q11, #4 ++ vrshrn.u16 d28, q10, #4 ++ ++ subs r1, #1 ++ vst1.8 {q14}, [r0 :128], r3 ++ ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++ ++@ ff_hevc_rpi_pred_planar_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1 ++ vpush {q4-q7} ++ vld1.8 {q12, q13}, [r2 :128]! @ Left ++ adr r12, nbx2_15_0_1_16 ++ vld1.8 {q0, q1 }, [r1 :128]! @ Up ++ vld1.16 {d30[0]}, [r2] @ Down left ++ vld1.16 {d31[0]}, [r1] @ Up-right ++ vldm r12, { q8-q11} @ 1..32, 31..0 ++ ++ vdup.16 d30, d30[0] ++ vdup.16 d31, d31[0] ++ ++ mov r1, #16 ++ vsubl.u8 q7, d30, d3 ++ vsubl.u8 q6, d30, d2 ++ vsubl.u8 q5, d30, d1 ++ vsubl.u8 q4, d30, d0 @ Add set up ++ ++ lsl r3, #1 ++ vshll.u8 q3, d3, #4 ++ vshll.u8 q2, d2, #4 ++ vshll.u8 q1, d1, #4 ++ vshll.u8 q0, d0, #4 ++ vmlal.u8 q3, d23, d31 ++ vmlal.u8 q2, d22, d31 ++ vmlal.u8 q1, d21, d31 ++ vmlal.u8 q0, d20, d31 @ Acc set up - q8-q9 free ++ ++@ u8 31..0 [2] q10, q11 ++@ u8 left[y] [2] q12, q13 ++@ u16 acc [4] q0 .. q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q4 .. 
q7 = p[-1][nTbs] - p[x][-1] ++1: ++ vadd.i16 q3, q7 ++ vadd.i16 q2, q6 ++ vadd.i16 q1, q5 ++ vadd.i16 q0, q4 ++ ++ vdup.u16 d20, d24[0] ++ vext.16 q12, q13, #1 ++ vext.16 q13, q13, #1 ++ ++ vmull.u8 q15, d19, d20 ++ vmull.u8 q14, d18, d20 ++ vmull.u8 q11, d17, d20 ++ vmull.u8 q10, d16, d20 ++ ++ vadd.i16 q15, q3 ++ vadd.i16 q14, q2 ++ vadd.i16 q11, q1 ++ vadd.i16 q10, q0 ++ ++ vrshrn.u16 d31, q15, #5 ++ vrshrn.u16 d30, q14, #5 ++ vrshrn.u16 d29, q11, #5 ++ vrshrn.u16 d28, q10, #5 ++ ++ subs r1, #1 ++ vst1.8 {q14, q15}, [r0 :256], r3 ++ ++ bne 1b ++ ++ vpop {q4-q7} ++ bx lr ++ ++endfunc ++ ++@------------------------------------------------------------------------------ ++@ ++@ Data - put btween the 2 code lumps so we can reach it with an adr from both ++@ Beware - it gets quite close which is why nb_3_0_1_4 is 1st... ++ ++ .text ++ .balign 64 ++ ++ @ These could be extracted from the above array, but separate out ++ @ out for better (16 byte) alignment ++nb_3_0_1_4: ++ .byte 3, 2, 1, 0, 3, 2, 1, 0 ++ .byte 1, 2, 3, 4, 1, 2, 3, 4 ++nb_7_0_1_8: ++ .byte 7, 6, 5, 4, 3, 2, 1, 0 ++ .byte 1, 2, 3, 4, 5, 6, 7, 8 ++nbh_3_0_1_4: ++ .short 3, 2, 1, 0, 1, 2, 3, 4 ++nbx2_3_0_1_4: ++ .byte 3, 3, 2, 2, 1, 1, 0, 0 ++ .byte 1, 1, 2, 2, 3, 3, 4, 4 ++ ++ @ should be back on a 64-byte boundary here ++nb_31_0_1_32: ++ .byte 31, 30, 29, 28, 27, 26, 25, 24 ++ .byte 23, 22, 21, 20, 19, 18, 17, 16 ++nb_15_0_1_16: ++ .byte 15, 14, 13, 12, 11, 10, 9, 8 ++ .byte 7, 6, 5, 4, 3, 2, 1, 0 ++ .byte 1, 2, 3, 4, 5, 6, 7, 8 ++ .byte 9, 10, 11, 12, 13, 14, 15, 16 ++ .byte 17, 18, 19, 20, 21, 22, 23, 24 ++ .byte 25, 26, 27, 28, 29, 30, 31, 32 ++ ++ @ should be back on a 64-byte boundary here ++nbx2_15_0_1_16: ++ .byte 15, 15, 14, 14, 13, 13, 12, 12 ++ .byte 11, 11, 10, 10, 9, 9, 8, 8 ++nbx2_7_0_1_8: ++ .byte 7, 7, 6, 6, 5, 5, 4, 4 ++ .byte 3, 3, 2, 2, 1, 1, 0, 0 ++ .byte 1, 1, 2, 2, 3, 3, 4, 4 ++ .byte 5, 5, 6, 6, 7, 7, 8, 8 ++ .byte 9, 9, 10, 10, 11, 11, 12, 12 ++ .byte 13, 13, 14, 14, 15, 15, 16, 16 ++ ++@------------------------------------------------------------------------------ ++@ ++@ 10 bits ++@ (all would work with 9) ++ ++@ ff_hevc_rpi_pred_planar_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_4_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr r12, nbh_3_0_1_4 ++ vld1.16 {q14}, [r2 :64] ++ vld1.16 {q8 }, [r12 :128] @ 3..0,1,..4 ++ vld1.16 {q12}, [r1 :64] @ Up ++ vdup.16 d2, d29[0] ++ ++ lsl r3, #1 ++ vsub.i16 d4, d2, d24 @ Add set up ++ ++ vdup.16 d0, d25[0] ++ vshl.i16 d24, #2 ++ vmla.i16 d24, d17, d0 @ Acc set up ++ add r1, r0, r3 ++ vmov d17, d16 ++ ++ vadd.i16 d24, d4 ++ vadd.i16 d25, d24, d4 ++ vshl.i16 d4, d4, #1 @ x2 ++ lsl r3, #1 ++ vadd.i16 d26, d24, d4 ++ vadd.i16 d27, d25, d4 ++ ++ vdup.16 d0, d28[0] ++ vdup.16 d1, d28[1] ++ vdup.16 d2, d28[2] ++ vdup.16 d3, d28[3] ++ ++ vmul.i16 q0, q8, q0 ++ vmul.i16 q1, q8, q1 ++ vadd.i16 q0, q12 ++ vadd.i16 q1, q13 ++ ++ vrshr.u16 q0, #3 ++ vrshr.u16 q1, #3 ++ ++ vst1.16 {d0}, [r0], r3 ++ vst1.16 {d1}, [r1], r3 ++ vst1.16 {d2}, [r0] ++ vst1.16 {d3}, [r1] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_8_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than 
having a short table ++ adr r12, nb_7_0_1_8 ++ vld1.16 {q14}, [r2 :128] ++ ldrh r2, [r2, #16] @ Down left ++ vld1.8 {q0 }, [r12 :128] @ 7..0,1,..8 ++ vld1.16 {q12}, [r1 :128] @ Up ++ ldrh r1, [r1, #16] @ Up-right ++ vmovl.u8 q8, d1 ++ vdup.16 q1, r2 ++ vmovl.u8 q10, d0 ++ ++ lsl r3, #1 ++ vsub.i16 q2, q1, q12 @ Add set up ++ ++ vdup.16 q0, r1 ++ mov r1, #8 ++ vshl.i16 q12, #3 ++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free ++ ++@ u16 15..0 [1] q10 ++@ u32 left[y] [1] q14 ++@ u16 acc [1] q12 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [1] q2 = p[-1][nTbs] - p[x][-1] ++1: ++ vdup.16 q0, d28[0] ++ vext.16 q14, q14, #1 ++ ++ vadd.i16 q12, q2 ++ ++ vmul.i16 q0, q10, q0 ++ vadd.i16 q0, q12 ++ vrshr.u16 q0, #4 ++ ++ subs r1, #1 ++ vst1.16 {q0 }, [r0 :128], r3 ++ ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_16_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr r12, nb_15_0_1_16 ++ vld1.16 {q14, q15}, [r2 :128] ++ ldrh r2, [r2, #32] @ Down left ++ vld1.8 {q0, q1 }, [r12 :128] @ 15..0,1,..16 ++ vld1.16 {q12, q13}, [r1 :128] @ Up ++ ldrh r1, [r1, #32] @ Up-right ++ vmovl.u8 q9, d3 ++ vmovl.u8 q8, d2 ++ vdup.16 q1, r2 ++ vmovl.u8 q11, d1 ++ vmovl.u8 q10, d0 ++ ++ lsl r3, #1 ++ vsub.i16 q3, q1, q13 ++ vsub.i16 q2, q1, q12 @ Add set up ++ ++ vdup.16 q0, r1 ++ mov r1, #16 ++ vshl.i16 q13, #4 ++ vshl.i16 q12, #4 ++ vmla.i16 q13, q9, q0 ++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free ++ ++@ u16 15..0 [2] q10..q11 ++@ u32 left[y] [2] q14..q15 ++@ u16 acc [2] q12..q13 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q2..q3 = p[-1][nTbs] - p[x][-1] ++1: ++ vdup.16 q0, d28[0] ++ vext.16 q14, q15, #1 ++ vext.16 q15, q15, #1 ++ ++ vadd.i16 q13, q3 ++ vadd.i16 q12, q2 ++ ++ vmul.i16 q1, q11, q0 ++ vmul.i16 q0, q10, q0 ++ ++ vadd.i16 q1, q13 ++ vadd.i16 q0, q12 ++ ++ vrshr.u16 q1, #5 ++ vrshr.u16 q0, #5 ++ ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_32_neon_10, export=1 ++ push {r4, lr} ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr r12, nb_31_0_1_32 ++ vpush { q4-q7 } ++ vldm r12, { q0-q3 } @ 1..32, r12 points at 31..0 ++ vldm r1!, {q12-q15} @ Up ++ ldrh r12, [r2, #64] @ Down left ++ vmovl.u8 q8, d4 ++ vmovl.u8 q9, d5 ++ vmovl.u8 q10, d6 ++ vmovl.u8 q11, d7 ++ vdup.16 q3, r12 ++ vld1.16 {d4[0]}, [r1] @ Up-right ++ ++ vsub.i16 q7, q3, q15 ++ vsub.i16 q6, q3, q14 ++ vsub.i16 q5, q3, q13 ++ vsub.i16 q4, q3, q12 @ Add set up ++ ++ vshl.i16 q15, #5 ++ vshl.i16 q14, #5 ++ vshl.i16 q13, #5 ++ vshl.i16 q12, #5 ++ vmla.i16 q15, q11, d4[0] ++ vmla.i16 q14, q10, d4[0] ++ vmla.i16 q13, q9, d4[0] ++ vmla.i16 q12, q8, d4[0] @ Acc set up - q8-q11 free ++ ++ mov r1, #32 ++ vmovl.u8 q8, d0 ++ vmovl.u8 q9, d1 ++ vmovl.u8 q10, d2 ++ vmovl.u8 q11, d3 ++ ++@ u8 31..0 [4] q8..q11 ++@ u8 left[y] [4] [r2] ++@ u16 acc [4] q12..q15 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q4..q7 = p[-1][nTbs] - p[x][-1] ++1: ++ vld1.16 {d0[0]}, [r2]! 
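++@ Unlike the smaller sizes, left[y] is reloaded from memory each row
++@ here - all 16 Q registers are already taken by the weights, adds and
++@ accumulators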
++ ++ vadd.i16 q15, q7 ++ vadd.i16 q14, q6 ++ vadd.i16 q13, q5 ++ vadd.i16 q12, q4 ++ ++ vmul.i16 q3, q11, d0[0] ++ vmul.i16 q2, q10, d0[0] ++ vmul.i16 q1, q9, d0[0] ++ vmul.i16 q0, q8, d0[0] ++ ++ vadd.i16 q3, q15 ++ vadd.i16 q2, q14 ++ vadd.i16 q1, q13 ++ vadd.i16 q0, q12 ++ ++ vrshr.u16 q3, #6 ++ vrshr.u16 q2, #6 ++ vrshr.u16 q1, #6 ++ vrshr.u16 q0, #6 ++ ++ subs r1, #1 ++ vstm r0, { q0-q3 } ++ add r0, r0, r3, lsl #1 ++ ++ bne 1b ++ ++ vpop {q4-q7} ++ pop {r4, pc} ++ ++endfunc ++ ++@ ff_hevc_rpi_pred_planar_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr r12, nbx2_3_0_1_4 ++ vld1.8 {q0 }, [r12 :128] @ 3,3..0,0,1,1..4,4 ++ vld1.16 {q14}, [r2 :128] @ left ++ ldr r12, [r2, #16] @ Down left ++ vld1.16 {q12}, [r1 :128] @ Up ++ vmovl.u8 q8, d1 ++ vdup.32 q1, r12 ++ ldr r12, [r1, #16] @ Up-right ++ vmovl.u8 q10, d0 ++ ++ lsl r3, #2 ++ vsub.i16 q2, q1, q12 @ Add set up ++ ++ mov r1, #4 ++ vdup.32 q0, r12 ++ vshl.i16 q12, #2 ++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free ++ ++@ u16 3,3..0,0 [1] q10 ++@ u32 left[y] [1] q14 ++@ u16 acc [1] q12 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [1] q2 = p[-1][nTbs] - p[x][-1] ++1: ++ vdup.32 q0, d28[0] ++ vext.32 q14, q14, #1 ++ ++ vadd.i16 q12, q2 ++ ++ vmul.i16 q0, q10, q0 ++ ++ vadd.i16 q0, q12 ++ ++ vrshr.u16 q0, #3 ++ ++ subs r1, #1 ++ vst1.16 {q0 }, [r0 :128], r3 ++ ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr r12, nbx2_7_0_1_8 ++ vld1.8 {q0, q1 }, [r12 :128] @ 7,7..0,0,1,1..8,8 ++ vld1.16 {q14, q15}, [r2 :128] ++ ldr r12, [r2, #32] @ Down left ++ vld1.16 {q12, q13}, [r1 :128] @ Up ++ vmovl.u8 q9, d3 ++ vmovl.u8 q8, d2 ++ vdup.32 q1, r12 ++ ldr r12, [r1, #32] @ Up-right ++ vmovl.u8 q11, d1 ++ vmovl.u8 q10, d0 ++ ++ lsl r3, #2 ++ vsub.i16 q3, q1, q13 ++ vsub.i16 q2, q1, q12 @ Add set up ++ ++ mov r1, #8 ++ vdup.32 q0, r12 ++ vshl.i16 q13, #3 ++ vshl.i16 q12, #3 ++ vmla.i16 q13, q9, q0 ++ vmla.i16 q12, q8, q0 @ Acc set up - q8-q11 free ++ ++@ u16 7,7..0,0 [2] q10..q11 ++@ u32 left[y] [2] q14..q15 ++@ u16 acc [2] q12..q13 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q2..q3 = p[-1][nTbs] - p[x][-1] ++1: ++ vdup.32 q0, d28[0] ++ vext.32 q14, q15, #1 ++ vext.32 q15, q15, #1 ++ ++ vadd.i16 q13, q3 ++ vadd.i16 q12, q2 ++ ++ vmul.i16 q1, q11, q0 ++ vmul.i16 q0, q10, q0 ++ ++ vadd.i16 q1, q13 ++ vadd.i16 q0, q12 ++ ++ vrshr.u16 q1, #4 ++ vrshr.u16 q0, #4 ++ ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r0 :256], r3 ++ ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr r12, nbx2_15_0_1_16 ++ vpush { q4-q7 } ++ vldm r12, { q0-q3 } @ 1..32, r12 points at 31..0 ++ vldm r1!, {q12-q15} @ Up ++ ldr r12, [r2, #64] @ Down left ++ vmovl.u8 q11, d7 ++ vmovl.u8 q10, d6 ++ 
vmovl.u8 q9, d5 ++ vmovl.u8 q8, d4 ++ vdup.32 q3, r12 ++ ldr r12, [r1] @ Up-right ++ ++ vsub.i16 q7, q3, q15 ++ vsub.i16 q6, q3, q14 ++ vsub.i16 q5, q3, q13 ++ vsub.i16 q4, q3, q12 @ Add set up ++ ++ vdup.32 q2, r12 ++ vshl.i16 q15, #4 ++ vshl.i16 q14, #4 ++ vshl.i16 q13, #4 ++ vshl.i16 q12, #4 ++ vmla.i16 q15, q11, q2 ++ vmla.i16 q14, q10, q2 ++ vmla.i16 q13, q9, q2 ++ vmla.i16 q12, q8, q2 @ Acc set up - q8-q11 free ++ ++ mov r1, #16 ++ vmovl.u8 q11, d3 ++ vmovl.u8 q10, d2 ++ vmovl.u8 q9, d1 ++ vmovl.u8 q8, d0 ++ ++@ u16 15,15..0,0 [4] q8..q11 ++@ u32 left[y] [4] [r2] ++@ u16 acc [4] q12..q15 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q4..q7 = p[-1][nTbs] - p[x][-1] ++1: ++ ldr r12, [r2], #4 ++ ++ vadd.i16 q15, q7 ++ vadd.i16 q14, q6 ++ vdup.32 q0, r12 ++ vadd.i16 q13, q5 ++ vadd.i16 q12, q4 ++ ++ vmul.i16 q3, q11, q0 ++ vmul.i16 q2, q10, q0 ++ vmul.i16 q1, q9, q0 ++ vmul.i16 q0, q8, q0 ++ ++ vadd.i16 q3, q15 ++ vadd.i16 q2, q14 ++ vadd.i16 q1, q13 ++ vadd.i16 q0, q12 ++ ++ vrshr.u16 q3, #5 ++ vrshr.u16 q2, #5 ++ vrshr.u16 q1, #5 ++ vrshr.u16 q0, #5 ++ ++ subs r1, #1 ++ vstm r0, { q0-q3 } ++ add r0, r0, r3, lsl #2 ++ ++ bne 1b ++ ++ vpop {q4-q7} ++ bx lr ++endfunc ++ ++ diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index fb0c6fae70..9f2ebb16f3 100644 --- a/libavcodec/avcodec.h @@ -10034,10 +15529,10 @@ index 0000000000..0aee673d8b +#endif /* AVCODEC_RPI_HEVC_DATA_H */ diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c new file mode 100644 -index 0000000000..a8601da4e7 +index 0000000000..4bfa000da4 --- /dev/null +++ b/libavcodec/rpi_hevc_filter.c -@@ -0,0 +1,1165 @@ +@@ -0,0 +1,1236 @@ +/* + * HEVC video decoder + * @@ -10599,6 +16094,15 @@ index 0000000000..a8601da4e7 +#endif +} + ++// When bits are delivered to deblock we want them ++//#define TL 1 ++//#define TR 2 ++//#define BL 4 ++//#define BR 8 ++ ++// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br ++// so we need to rearrange before passing on ++ +static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) +{ + const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; @@ -10614,23 +16118,60 @@ index 0000000000..a8601da4e7 + return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7); +} + -+// We sometimes need 17 2-bit entries (annoying!) -+// * This could be avoided if we separate out the H filter left-stub deblock -+// but 64 bit constant shr shouldn't be too bad - though the variable mask here is probably quite nasty -+static inline uint64_t hbs_get(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++// We cast away const here as we want this to work for both get and set ++static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) +{ -+ unsigned int n = (xr - xl + 7) & ~7; -+ -+ return n == 0 ? 
(uint64_t)0 : -+ (*(uint64_t *)(s->horizontal_bs + (xl >> 4) + (y >> 3) * s->hbs_stride) >> ((xl >> 1) & 7)) & (((uint64_t)1 << (n >> 1)) - 1); ++ return (uint32_t *)(bs + ++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 ++#warning Unexpected masks ++ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes ++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & ++ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) + ++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 ++#error Stride1 < return size ++#endif ++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + ++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); +} + -+static inline uint64_t vbs_get(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) +{ -+ unsigned int n = (xr - xl + 7) & ~7; ++ return (uint8_t *)(bs + ++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & ++ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) + ++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + ++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); ++} + -+ return n == 0 ? (uint64_t)0 : -+ (*(uint64_t *)(s->vertical_bs2 + (xl >> 4) + (y >> 3) * s->hbs_stride) >> ((xl >> 1) & 7)) & (((uint64_t)1 << (n >> 1)) - 1); ++ ++// Get block strength ++// Given how we call we will always get within the 32bit boundries ++static inline uint32_t bs_get32(const uint8_t * bs, const unsigned int stride2, ++ const unsigned int xl, const unsigned int xr, const unsigned int y) ++{ ++ if (xr <= xl) { ++ return 0; ++ } ++ else ++ { ++ const uint32_t a = *bs_ptr32(bs, stride2, xl, y); ++ const unsigned int n = ((xr - xl + 7) & ~7) >> 1; ++ ++ return n == 32 ? a : ++ (a >> ((xl >> 1) & 31)) & ~(~0U << n); ++ } ++} ++ ++static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++{ ++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); ++ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y); ++} ++ ++static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++{ ++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); ++ return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y); +} + + @@ -10658,68 +16199,78 @@ index 0000000000..a8601da4e7 + // Main body + for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8) + { ++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y); ++ + const DBParams * const dbp = y < bounds.y ? 
cb_dbp - s->ps.sps->ctb_width : cb_dbp; + const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; + const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; + ++ if (vbs != 0) + { + const uint8_t * const tcv = tctable + dbp->tc_offset; + const uint8_t * const betav = betatable + dbp->beta_offset; + unsigned int pcmfa = pcm2(s, bv_l - 1, y); -+// const uint8_t * vbs = s->vertical_bs + (bv_l >> 3) * s->bs_height + (y >> 2); -+ uint64_t vbs2 = vbs_get(s, bv_l, bv_r, y); + unsigned int x; + -+ for (x = bv_l; x < bv_r; x += 8) ++ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1) + { -+ const unsigned int pcmf_v = pcmfa & 3; -+ const unsigned int bs0 = vbs2 & 3; -+ const unsigned int bs1 = (vbs2 & 0xc) >> 2; -+ -+ if ((bs0 | bs1) != 0 && pcmf_v != 3) ++ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3) + { + const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; + s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), + frame_stride1(s->frame, LUMA), + betav[qp], -+ (bs0 == 0 ? 0 : tcv[qp + (int)(bs0 & 2)]) | -+ ((bs1 == 0 ? 0 : tcv[qp + (int)(bs1 & 2)]) << 16), -+ pcmf_v, ++ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) | ++ (((vbs & 0xc) == 0 ? 0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16), ++ pcmfa & 3, + av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); + } -+ -+ pcmfa >>= 1; -+// vbs += s->bs_height; -+ vbs2 >>= 4; + } + } + + if (y != 0) + { -+ unsigned int x; -+ unsigned int pcmfa = pcm4(s, bh_l, y - 1); -+ uint64_t hbs = hbs_get(s, bh_l, bh_r + 1, y); // Will give (x <= bh_r) in for loop ++ uint32_t hbs; + -+ for (x = bh_l; hbs != 0; x += 8, hbs >>= 4) ++ // H left - mostly separated out so we only need a uint32_t hbs ++ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0) + { -+ const unsigned int pcmf_h = (pcmfa & 1) | ((pcmfa & 0x10000) >> 15); -+ const unsigned int bs0 = hbs & 3; -+ const unsigned int bs1 = (hbs >> 2) & 3; ++ const unsigned int x = bh_l; ++ const unsigned int pcmfa = pcm4(s, bh_l, y - 1); ++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const DBParams * const dbph = dbp - 1; ++ const uint8_t * const tc = tctable + dbph->tc_offset + qp; + -+ if ((bs0 | bs1) != 0 && pcmf_h != 3) ++ av_assert2(cb_x - bh_l == 8); ++ ++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ betatable[qp + dbph->beta_offset], ++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | ++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), ++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); ++ } ++ ++ // H ++ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop ++ { ++ unsigned int x; ++ unsigned int pcmfa = pcm4(s, cb_x, y - 1); ++ ++ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1) + { -+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ const DBParams * const dbph = (x < cb_x ? dbp - 1 : dbp); -+ const uint8_t * const tc = tctable + dbph->tc_offset + qp; -+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), -+ frame_stride1(s->frame, LUMA), -+ betatable[qp + dbph->beta_offset], -+ (bs0 == 0 ? 0 : tc[bs0 & 2]) | -+ ((bs1 == 0 ? 
0 : tc[bs1 & 2]) << 16), -+ pcmf_h); ++ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0) ++ { ++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const uint8_t * const tc = tctable + dbp->tc_offset + qp; ++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ betatable[qp + dbp->beta_offset], ++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | ++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), ++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); ++ } + } -+ -+ pcmfa >>= 1; + } + } + @@ -10727,11 +16278,6 @@ index 0000000000..a8601da4e7 + } +} + -+#define TL 1 -+#define TR 2 -+#define BL 4 -+#define BR 8 -+ +static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) +{ + const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; @@ -10768,98 +16314,119 @@ index 0000000000..a8601da4e7 + // Deblock V up 8 + // CTB above current + // Top-half only (tc4 & ~0xffff == 0) is special cased in asm -+ unsigned int x; + const unsigned int y = bounds.y - 8; ++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U; + -+ unsigned int pcmfa = pcm2(s, bv_l - 1, y); -+ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset; -+ uint64_t vbs2 = (vbs_get(s, bv_l, bv_r, y) & 0x0202020202020202U); -+ -+ for (x = bv_l; x < bv_r; x += 16, vbs2 >>= 8) ++ if (vbs != 0) + { -+ const unsigned int pcmf_v = (pcmfa & 3); -+ if ((vbs2 & 2) != 0 && pcmf_v != 3) ++ unsigned int pcmfa = pcm2(s, bv_l - 1, y); ++ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset; ++ unsigned int x; ++ ++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) + { -+ const int qp0 = q2h(s, x, y); -+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ frame_stride1(s->frame, 1), -+ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8), -+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), -+ pcmf_v); ++ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0) ++ { ++ const int qp0 = q2h(s, x, y); ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8), ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ pcmfa & 3); ++ } + } -+ pcmfa >>= 2; + } + } + + for (y = bounds.y; y < b_b; y += 16) + { ++ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) | ++ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4); ++ + // V ++ if (vbs != 0) + { + unsigned int x; -+ unsigned int pcmfa = pcm4(s, bv_l - 1, y); -+ const unsigned int pcmf_or = (y + 16 <= b_b) ? 0 : BL | BR; ++ unsigned int pcmfa = ++ (y + 16 > b_b ? ++ pcm2(s, bv_l - 1, y) | 0xffff0000 : ++ pcm4(s, bv_l - 1, y)); + const uint8_t * const tc = tctable + 2 + dbp->tc_offset; -+ uint64_t vbs2 = (vbs_get(s, bv_l, bv_r, y) & 0x0202020202020202U) | -+ ((vbs_get(s, bv_l, bv_r, y + 8) & 0x0202020202020202U) << 4); + -+ for (x = bv_l; x < bv_r; x += 16, vbs2 >>= 8) ++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) + { -+ const unsigned int pcmf_v = pcmf_or | (pcmfa & 3) | ((pcmfa >> 14) & 0xc); -+ const unsigned int bs0 = (~pcmf_v & (TL | TR)) == 0 ? 0 : vbs2 & 2; -+ const unsigned int bs1 = (~pcmf_v & (BL | BR)) == 0 ? 
0 : (vbs2 & 0x20) >> 4; -+ -+ if ((bs0 | bs1) != 0) ++ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) + { + const int qp0 = q2h(s, x, y); + const int qp1 = q2h(s, x, y + 8); + s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + frame_stride1(s->frame, 1), -+ ((bs0 == 0) ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | -+ ((bs1 == 0) ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), ++ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | ++ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), + av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), -+ pcmf_v); ++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); + } -+ -+ pcmfa >>= 2; + } + } + + // H + if (y != 0) + { -+ unsigned int x; -+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16; ++ uint32_t hbs; + const unsigned int bh_l = bv_l - 16; -+ unsigned int pcmfa = pcm4(s, bh_l, y - 1); -+ uint64_t hbs = hbs_get(s, bh_l, bh_r, y) & 0x2222222222222222U; ++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16; + const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; + const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; + -+ // Chop off bits we don't want... -+ if (bh_l < bounds.x) { -+ pcmfa |= 0x10001; // TL|BL pre rearrangement -+ hbs &= ~(uint64_t)3; // Make BS 0 -+ } -+ -+ for (x = bh_l; hbs != 0; x += 16, hbs >>= 8) ++ // H left - mostly separated out so we only need a uint32_t hbs ++ // Stub is width 8 to the left of bounds, but width 16 internally ++ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0) + { -+ const unsigned int pcmf_h = (x + 16 > bh_r ? TR | BR : 0) | -+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc); -+ const int bs0 = hbs & 2; -+ const int bs1 = (~pcmf_h & (TR | BR)) == 0 ? 0 : (hbs >> 4) & 2; -+ if ((bs0 | bs1) != 0) ++ unsigned int pcmfa = pcm4(s, bh_l, y - 1); ++ ++ // Chop off bits we don't want... ++ if (bh_l < bounds.x) { ++ pcmfa |= 0x10001; // TL|BL pre rearrangement ++ hbs &= ~3; // Make BS 0 ++ } ++ ++ // Double check we still want this ++ if (hbs != 0 && (~pcmfa & 0x30003) != 0) + { ++ const unsigned int x = bh_l; + const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; + const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; -+ const uint8_t * const tc = tctable + 2 + (x < cb_x ? dbp - 1 : dbp)->tc_offset; ++ const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset; + + s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + frame_stride1(s->frame, 1), -+ ((bs0 == 0) ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | -+ ((bs1 == 0) ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), -+ pcmf_h); ++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | ++ ((hbs & 0x20) == 0 ? 
0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), ++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); ++ } ++ } ++ ++ // H main ++ if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0) ++ { ++ unsigned int x; ++ unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it ++ ++ for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2) ++ { ++ if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) ++ { ++ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; ++ const uint8_t * const tc = tctable + 2 + dbp->tc_offset; ++ ++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | ++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), ++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); ++ } + } -+ pcmfa >>= 2; + } + } + } @@ -10871,18 +16438,18 @@ index 0000000000..a8601da4e7 + return x & ~(~0U << log2_n); +} + -+static inline void set_bs_h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) ++static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) +{ + av_assert2((y & 7) == 0); + + // This doesn't have the same simultainious update issues that bsf_stash + // does (other threads will have a different y) so we can do it the easy way + if ((bsf &= mask) != 0) -+ *(uint32_t *)(s->horizontal_bs + ((x >> 4) & ~3) + (y >> 3) * s->hbs_stride) |= bsf << ((x >> 1) & 31); ++ *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31); +} + + -+static void set_bs_v(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) ++static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) +{ + // We arrange this in a slightly odd fashion but it lines up with + // how we are going to use it in the actual deblock code & it is easier @@ -10894,8 +16461,7 @@ index 0000000000..a8601da4e7 + + if ((bsf &= mask) != 0) + { -+ const unsigned int stride1 = s->hbs_stride; -+ uint8_t *p = s->vertical_bs2 + (x >> 4) + (y >> 3) * stride1; ++ uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y); + const unsigned int sh = ((x & 8) | (y & 4)) >> 1; + + if (mask <= 0xf) @@ -10906,7 +16472,7 @@ index 0000000000..a8601da4e7 + { + do { + *p |= (bsf & 0xf) << sh; -+ p += stride1; ++ p += HEVC_RPI_BS_STRIDE1_BYTES; + } while ((bsf >>= 4) != 0); + } + } @@ -10918,19 +16484,10 @@ index 0000000000..a8601da4e7 + const RefPicList * const rpl_p, const RefPicList * const rpl_q, + const MvField * const mvf_p, const MvField * const mvf_q) +{ -+ uint8_t res[16]; -+ unsigned int i; -+ unsigned int a = 0; -+ -+ s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup, -+ sizeof(MvField) * mvf_stride, 1, ++ return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup, ++ mvf_p, mvf_q, + rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list, -+ mvf_p, mvf_q, res); -+ -+ for (i = 0; i != rep * dup; ++i) { -+ a |= res[i] << (i * 2); -+ } -+ return a; ++ sizeof(MvField) * mvf_stride); +} + + @@ -11050,7 +16607,7 @@ index 0000000000..a8601da4e7 + } + + // Finally put the results into bs -+ set_bs_h(s, x0, y0, bsf_mask, bsf_h); ++ hbs_set(s, x0, y0, 
bsf_mask, bsf_h); + } + + // Max of 1 pu internal split - ignore if not on 8pel boundary @@ -11061,7 +16618,7 @@ index 0000000000..a8601da4e7 + // If we have the x split as well then it must be in the middle + const unsigned int log2_rep = has_x_split ? 1 : 0; + -+ set_bs_h(s, x0, lc->cu.y_split, bsf_mask, ++ hbs_set(s, x0, lc->cu.y_split, bsf_mask, + bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), + trafo_size >> (log2_min_pu_size + log2_rep), + rpl, rpl, @@ -11074,7 +16631,7 @@ index 0000000000..a8601da4e7 + { + // Boundary left + if (x0 != 0 && -+ ((x0 & ((1 << s->ps.sps->log2_ctb_size) - 1)) != 0 || ++ (off_boundary(x0, s->ps.sps->log2_ctb_size) || + (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0)) + { + if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split)) @@ -11090,7 +16647,7 @@ index 0000000000..a8601da4e7 + mvf_curr, mvf_curr - 1); + } + -+ set_bs_v(s, x0, y0, bsf_mask, bsf_v); ++ vbs_set(s, x0, y0, bsf_mask, bsf_v); + } + + if (has_x_split && !off_boundary(lc->cu.x_split, 3)) @@ -11099,7 +16656,7 @@ index 0000000000..a8601da4e7 + (y0 >> log2_min_pu_size) * mvf_stride + (lc->cu.x_split >> log2_min_pu_size); + const unsigned int log2_rep = has_y_split ? 1 : 0; + -+ set_bs_v(s, lc->cu.x_split, y0, bsf_mask, ++ vbs_set(s, lc->cu.x_split, y0, bsf_mask, + bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), + (mvf_stride << log2_trafo_size) >> (log2_min_pu_size + log2_rep), + rpl, rpl, @@ -11135,6 +16692,12 @@ index 0000000000..a8601da4e7 + + // Deblock may not touch the edges of the bound as they are still needed + // for Intra pred ++ // ++ // Deblock is disabled with a per-slice flag ++ // Given that bounds may cover multiple slices & we dblock outside bounds ++ // anyway we can't avoid deblock using that flag - about the only thing we ++ // could do is have a "no deblock seen yet" flag but it doesn't really ++ // seem worth the effort + + deblock_y_blk(s, bounds, x_end, y_end); + deblock_uv_blk(s, bounds, x_end, y_end); @@ -11150,9 +16713,12 @@ index 0000000000..a8601da4e7 + const unsigned int xl = ussub(bounds.x, xo); + const unsigned int xr = x_end ? 
br : ussub(br, xo); + -+ for (y = yt; y < yb; y += ctb_size) { -+ for (x = xl; x < xr; x += ctb_size) { -+ sao_filter_CTB(s, x, y); ++ if (s->ps.sps->sao_enabled) ++ { ++ for (y = yt; y < yb; y += ctb_size) { ++ for (x = xl; x < xr; x += ctb_size) { ++ sao_filter_CTB(s, x, y); ++ } + } + } + @@ -12162,7 +17728,7 @@ index 0000000000..4b4d032a16 +#endif /* AVCODEC_RPI_HEVC_PARSE_H */ diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c new file mode 100644 -index 0000000000..e8df452021 +index 0000000000..744e7cf248 --- /dev/null +++ b/libavcodec/rpi_hevc_ps.c @@ -0,0 +1,1957 @@ @@ -13347,7 +18913,7 @@ index 0000000000..e8df452021 + sps->long_term_ref_pics_present_flag = get_bits1(gb); + if (sps->long_term_ref_pics_present_flag) { + sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb); -+ if (sps->num_long_term_ref_pics_sps > 31U) { ++ if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) { + av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n", + sps->num_long_term_ref_pics_sps); + return AVERROR_INVALIDDATA; @@ -14125,7 +19691,7 @@ index 0000000000..e8df452021 +} diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h new file mode 100644 -index 0000000000..c9ecf9a268 +index 0000000000..1e7120a43d --- /dev/null +++ b/libavcodec/rpi_hevc_ps.h @@ -0,0 +1,441 @@ @@ -14388,8 +19954,8 @@ index 0000000000..c9ecf9a268 + uint8_t sao_enabled; + + uint8_t long_term_ref_pics_present_flag; -+ uint16_t lt_ref_pic_poc_lsb_sps[32]; -+ uint8_t used_by_curr_pic_lt_sps_flag[32]; ++ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS]; ++ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS]; + uint8_t num_long_term_ref_pics_sps; + + struct { @@ -15093,7 +20659,7 @@ index 0000000000..d7745711ab +} diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c new file mode 100644 -index 0000000000..c5133a8a88 +index 0000000000..cd8149d58e --- /dev/null +++ b/libavcodec/rpi_hevc_sei.c @@ -0,0 +1,368 @@ @@ -15194,10 +20760,11 @@ index 0000000000..c5133a8a88 + s->quincunx_subsampling = get_bits1(gb); + s->content_interpretation_type = get_bits(gb, 6); + -+ // the following skips spatial_flipping_flag frame0_flipped_flag -+ // field_views_flag current_frame_is_frame0_flag -+ // frame0_self_contained_flag frame1_self_contained_flag -+ skip_bits(gb, 6); ++ // spatial_flipping_flag, frame0_flipped_flag, field_views_flag ++ skip_bits(gb, 3); ++ s->current_frame_is_frame0_flag = get_bits1(gb); ++ // frame0_self_contained_flag, frame1_self_contained_flag ++ skip_bits(gb, 2); + + if (!s->quincunx_subsampling && s->arrangement_type != 5) + skip_bits(gb, 16); // frame[01]_grid_position_[xy] @@ -15371,8 +20938,8 @@ index 0000000000..c5133a8a88 + return 0; +} + -+static int decode_nal_sei_prefix(GetBitContext *gb, HEVCSEIContext *s, const HEVCRpiParamSets *ps, -+ int type, int size, void *logctx) ++static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps, ++ int type, int size) +{ + switch (type) { + case 256: // Mismatched value from HM 8.1 @@ -15400,8 +20967,8 @@ index 0000000000..c5133a8a88 + } +} + -+static int decode_nal_sei_suffix(GetBitContext *gb, HEVCSEIContext *s, -+ int type, int size, void *logctx) ++static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, ++ int type, int size) +{ + switch (type) { + case HEVC_SEI_TYPE_DECODED_PICTURE_HASH: @@ -15413,9 +20980,8 @@ index 0000000000..c5133a8a88 + } +} + -+static int 
decode_nal_sei_message(GetBitContext *gb, HEVCSEIContext *s, -+ const HEVCRpiParamSets *ps, int nal_unit_type, -+ void *logctx) ++static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s, ++ const HEVCRpiParamSets * const ps, const int nal_unit_type) +{ + int payload_type = 0; + int payload_size = 0; @@ -15436,9 +21002,9 @@ index 0000000000..c5133a8a88 + payload_size += byte; + } + if (nal_unit_type == HEVC_NAL_SEI_PREFIX) { -+ return decode_nal_sei_prefix(gb, s, ps, payload_type, payload_size, logctx); ++ return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size); + } else { /* nal_unit_type == NAL_SEI_SUFFIX */ -+ return decode_nal_sei_suffix(gb, s, payload_type, payload_size, logctx); ++ return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size); + } +} + @@ -15453,7 +21019,7 @@ index 0000000000..c5133a8a88 + int ret; + + do { -+ ret = decode_nal_sei_message(gb, s, ps, type, logctx); ++ ret = decode_nal_sei_message(gb, logctx, s, ps, type); + if (ret < 0) + return ret; + } while (more_rbsp_data(gb)); @@ -15467,7 +21033,7 @@ index 0000000000..c5133a8a88 +} diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h new file mode 100644 -index 0000000000..41e4a20127 +index 0000000000..d4ac348df9 --- /dev/null +++ b/libavcodec/rpi_hevc_sei.h @@ -0,0 +1,135 @@ @@ -15533,7 +21099,6 @@ index 0000000000..41e4a20127 +} HEVC_SEI_Type; + +typedef struct HEVCSEIPictureHash { -+ struct AVMD5 *md5_ctx; + uint8_t md5[3][16]; + uint8_t is_md5; +} HEVCSEIPictureHash; @@ -15543,6 +21108,7 @@ index 0000000000..41e4a20127 + int arrangement_type; + int content_interpretation_type; + int quincunx_subsampling; ++ int current_frame_is_frame0_flag; +} HEVCSEIFramePacking; + +typedef struct HEVCSEIDisplayOrientation { @@ -20363,10 +25929,10 @@ index 0000000000..1128a2c054 +}; diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c new file mode 100644 -index 0000000000..4034c77979 +index 0000000000..08686ff260 --- /dev/null +++ b/libavcodec/rpi_hevcdec.c -@@ -0,0 +1,5753 @@ +@@ -0,0 +1,5787 @@ +/* + * HEVC video Decoder + * @@ -21302,9 +26868,9 @@ index 0000000000..4034c77979 + av_freep(&s->tab_slice_address); + av_freep(&s->filter_slice_edges); + -+ av_freep(&s->horizontal_bs); ++ av_freep(&s->bs_horizontal); +// av_freep(&s->vertical_bs); -+ av_freep(&s->vertical_bs2); ++ av_freep(&s->bs_vertical); + av_freep(&s->bsf_stash_left); + av_freep(&s->bsf_stash_up); + @@ -21325,8 +26891,13 @@ index 0000000000..4034c77979 + int ctb_count = sps->ctb_width * sps->ctb_height; + int min_pu_size = sps->min_pu_width * sps->min_pu_height; + -+ s->hbs_stride = ((width + 63) & ~63) >> 4; -+ s->bs_size = (((height + 15) & ~15) >> 3) * s->hbs_stride; ++ { ++ unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK); ++ unsigned int h = ((height + 15) & ~15); ++ ++ s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size ++ s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols ++ } + + s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly + s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock)); @@ -21352,9 +26923,9 @@ index 0000000000..4034c77979 + if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address) + goto fail; + -+ s->horizontal_bs = av_mallocz(s->bs_size); -+ s->vertical_bs2 = av_mallocz(s->bs_size); -+ if (s->horizontal_bs == NULL || s->vertical_bs2 == NULL) ++ s->bs_horizontal = 
av_mallocz(s->bs_size); ++ s->bs_vertical = av_mallocz(s->bs_size); ++ if (s->bs_horizontal == NULL || s->bs_vertical == NULL) + goto fail; + + if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL || @@ -21406,15 +26977,22 @@ index 0000000000..4034c77979 + uint8_t chroma_weight_l0_flag[16]; + uint8_t luma_weight_l1_flag[16]; + uint8_t chroma_weight_l1_flag[16]; -+ int luma_log2_weight_denom; ++ unsigned int luma_log2_weight_denom; + + luma_log2_weight_denom = get_ue_golomb_long(gb); -+ if (luma_log2_weight_denom < 0 || luma_log2_weight_denom > 7) ++ if (luma_log2_weight_denom > 7) { + av_log(s->avctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is invalid\n", luma_log2_weight_denom); -+ s->sh.luma_log2_weight_denom = av_clip_uintp2(luma_log2_weight_denom, 3); ++ return AVERROR_INVALIDDATA; ++ } ++ s->sh.luma_log2_weight_denom = luma_log2_weight_denom; + if (ctx_cfmt(s) != 0) { -+ int delta = get_se_golomb(gb); -+ s->sh.chroma_log2_weight_denom = av_clip_uintp2(s->sh.luma_log2_weight_denom + delta, 3); ++ const unsigned int chroma_log2_weight_denom = luma_log2_weight_denom + get_se_golomb(gb); ++ if (chroma_log2_weight_denom > 7) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "chroma_log2_weight_denom %d is invalid\n", chroma_log2_weight_denom); ++ return AVERROR_INVALIDDATA; ++ } ++ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom; + } + + for (i = 0; i < s->sh.nb_refs[L0]; i++) { @@ -21741,6 +27319,7 @@ index 0000000000..4034c77979 + if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) { + const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data; + const HEVCRpiSPS *last_sps = s->ps.sps; ++ enum AVPixelFormat pix_fmt; + + if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) { + if (sps->width != last_sps->width || sps->height != last_sps->height || @@ -21750,10 +27329,20 @@ index 0000000000..4034c77979 + } + ff_hevc_rpi_clear_refs(s); + -+ ret = set_sps(s, sps, get_format(s, sps)); ++ ret = set_sps(s, sps, sps->pix_fmt); + if (ret < 0) + return ret; + ++ pix_fmt = get_format(s, sps); ++ if (pix_fmt < 0) ++ return pix_fmt; ++ ++// ret = set_sps(s, sps, pix_fmt); ++// if (ret < 0) ++// return ret; ++ ++ s->avctx->pix_fmt = pix_fmt; ++ + s->seq_decode = (s->seq_decode + 1) & 0xff; + s->max_ra = INT_MAX; + } @@ -25184,6 +30773,13 @@ index 0000000000..4034c77979 + + if (s->sei.frame_packing.content_interpretation_type == 2) + stereo->flags = AV_STEREO3D_FLAG_INVERT; ++ ++ if (s->sei.frame_packing.arrangement_type == 5) { ++ if (s->sei.frame_packing.current_frame_is_frame0_flag) ++ stereo->view = AV_STEREO3D_VIEW_LEFT; ++ else ++ stereo->view = AV_STEREO3D_VIEW_RIGHT; ++ } + } + + if (s->sei.display_orientation.present && @@ -25297,8 +30893,8 @@ index 0000000000..4034c77979 + ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1); + int ret; + -+ memset(s->horizontal_bs, 0, s->bs_size); -+ memset(s->vertical_bs2, 0, s->bs_size); ++ memset(s->bs_horizontal, 0, s->bs_size); ++ memset(s->bs_vertical, 0, s->bs_size); + memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); + memset(s->skip_flag, 0, s->ps.sps->min_cb_height * s->skip_flag_stride); + memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); @@ -25421,7 +31017,12 @@ index 0000000000..4034c77979 + } + } +#endif -+ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { ++ if ( ++ (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) || ++ (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == 
HEVC_SLICE_B) || ++ (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) || ++ (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IDR(s))) ++ { + s->is_decoded = 0; + break; + } @@ -25596,7 +31197,7 @@ index 0000000000..4034c77979 + int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height; + uint8_t md5[16]; + -+ av_md5_init(s->sei.picture_hash.md5_ctx); ++ av_md5_init(s->md5_ctx); + for (j = 0; j < h; j++) { + const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1); +#if HAVE_BIGENDIAN @@ -25606,9 +31207,9 @@ index 0000000000..4034c77979 + src = s->checksum_buf; + } +#endif -+ av_md5_update(s->sei.picture_hash.md5_ctx, src, w << pixel_shift); ++ av_md5_update(s->md5_ctx, src, w << pixel_shift); + } -+ av_md5_final(s->sei.picture_hash.md5_ctx, md5); ++ av_md5_final(s->md5_ctx, md5); + + if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) { + av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i); @@ -25759,7 +31360,7 @@ index 0000000000..4034c77979 + + pic_arrays_free(s); + -+ av_freep(&s->sei.picture_hash.md5_ctx); ++ av_freep(&s->md5_ctx); + + av_freep(&s->cabac_save); + @@ -25871,8 +31472,7 @@ index 0000000000..4034c77979 + + s->max_ra = INT_MAX; + -+ s->sei.picture_hash.md5_ctx = av_md5_alloc(); -+ if (!s->sei.picture_hash.md5_ctx) ++ if ((s->md5_ctx = av_md5_alloc()) == NULL) + goto fail; + + s->context_initialized = 1; @@ -26122,10 +31722,10 @@ index 0000000000..4034c77979 + diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h new file mode 100644 -index 0000000000..117432de0a +index 0000000000..df2bac1df4 --- /dev/null +++ b/libavcodec/rpi_hevcdec.h -@@ -0,0 +1,985 @@ +@@ -0,0 +1,1002 @@ +/* + * HEVC video decoder + * @@ -26397,6 +31997,8 @@ index 0000000000..117432de0a + INTRA_ANGULAR_33, + INTRA_ANGULAR_34, +}; ++#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 ++#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26 + +enum SAOType { + SAO_NOT_APPLIED = 0, @@ -26813,6 +32415,17 @@ index 0000000000..117432de0a + uint8_t state[HEVC_CONTEXTS]; +} HEVCRpiCabacState; + ++#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels ++#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1) ++#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte ++#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el ++#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row ++#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++ +typedef struct HEVCRpiContext { + const AVClass *c; // needed by private avoptions + AVCodecContext *avctx; @@ -26882,17 +32495,19 @@ index 0000000000..117432de0a + int eos; ///< current packet contains an EOS/EOB NAL + int last_eos; ///< last packet contains an EOS/EOB NAL + int max_ra; -+ unsigned int hbs_stride; -+ unsigned int bs_size; + + int is_decoded; + int no_rasl_output_flag; + -+ HEVCPredContext hpc; ++ HEVCRpiPredContext hpc; + HEVCDSPContext hevcdsp; + int8_t *qp_y_tab; -+ uint8_t *horizontal_bs; -+ uint8_t *vertical_bs2; ++ ++ // Deblocking block strength bitmaps ++ unsigned int bs_stride2; ++ unsigned int bs_size; ++ uint8_t *bs_horizontal; ++ uint8_t *bs_vertical; + uint8_t *bsf_stash_up; + uint8_t 
*bsf_stash_left; + @@ -26930,6 +32545,8 @@ index 0000000000..117432de0a + int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4) + int nuh_layer_id; + ++ struct AVMD5 *md5_ctx; ++ + HEVCSEIContext sei; + + // Put structures that allocate non-trivial storage at the end @@ -27113,10 +32730,10 @@ index 0000000000..117432de0a +#endif /* AVCODEC_RPI_HEVCDEC_H */ diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c new file mode 100644 -index 0000000000..a6af5ecd85 +index 0000000000..c5d130c377 --- /dev/null +++ b/libavcodec/rpi_hevcdsp.c -@@ -0,0 +1,416 @@ +@@ -0,0 +1,419 @@ +/* + * HEVC video decoder + * @@ -27242,10 +32859,12 @@ index 0000000000..a6af5ecd85 +#include "rpi_hevcdsp_template.c" +#undef BIT_DEPTH + -+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc, ++static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const MvField *curr, const MvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ const MvField *curr, const MvField *neigh, uint8_t *bs) ++ int in_inc) +{ ++ int shift = 32; ++ uint32_t bs = 0; + for (; pus > 0; pus--) { + int strength, out; + int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; @@ -27350,10 +32969,11 @@ index 0000000000..a6af5ecd85 + + for (out = dup; out > 0; out--) + { -+ *bs = strength; -+ bs += out_inc; ++ bs = (bs >> 2) | (strength << 30); ++ shift -= 2; + } + } ++ return bs >> shift; +} + +void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) @@ -27535,7 +33155,7 @@ index 0000000000..a6af5ecd85 +} diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h new file mode 100644 -index 0000000000..59d06bbe28 +index 0000000000..8c9bf725bf --- /dev/null +++ b/libavcodec/rpi_hevcdsp.h @@ -0,0 +1,183 @@ @@ -27707,9 +33327,9 @@ index 0000000000..59d06bbe28 + uint8_t * src_l, + unsigned int no_f); + -+ void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc, ++ uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const MvField *curr, const MvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ const MvField *curr, const MvField *neigh, uint8_t *bs); ++ int in_inc); +} HEVCDSPContext; + +void ff_hevc_rpi_dsp_init(HEVCDSPContext *hpc, int bit_depth); @@ -30008,10 +35628,10 @@ index 0000000000..cfe9264fc3 + diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c new file mode 100644 -index 0000000000..f6db76482d +index 0000000000..113ed33d64 --- /dev/null +++ b/libavcodec/rpi_hevcpred.c -@@ -0,0 +1,122 @@ +@@ -0,0 +1,150 @@ +/* + * HEVC video Decoder + * @@ -30037,6 +35657,9 @@ index 0000000000..f6db76482d +#include "rpi_hevcdec.h" + +#include "rpi_hevcpred.h" ++#if (ARCH_ARM) ++#include "arm/rpi_hevcpred_arm.h" ++#endif + +#define PRED_C 0 +#define BIT_DEPTH 8 @@ -30074,7 +35697,7 @@ index 0000000000..f6db76482d +#undef BIT_DEPTH +#undef PRED_C + -+void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth) ++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth) +{ +#undef FUNC +#define FUNC(a, depth) a ## _ ## depth @@ -30091,7 +35714,18 @@ index 0000000000..f6db76482d + hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \ + hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \ + hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \ -+ hpc->pred_dc = FUNC(pred_dc, depth); \ ++ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \ ++ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \ ++ hpc->pred_dc[2] = 
FUNC(pred_dc_2, depth); \ ++ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \ ++ hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \ ++ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \ + hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \ + hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \ + hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ @@ -30106,7 +35740,18 @@ index 0000000000..f6db76482d + hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ + hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ + hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ -+ hpc->pred_dc_c = FUNCC(pred_dc, depth); \ ++ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \ ++ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \ ++ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \ ++ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \ ++ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \ ++ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \ + hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ + hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ + hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ @@ -30131,15 +35776,18 @@ index 0000000000..f6db76482d + break; + } + -+ if (ARCH_MIPS) -+ ff_hevc_rpi_pred_init_mips(hpc, bit_depth); ++#if (ARCH_ARM) ++ ff_hevc_rpi_pred_init_arm(hpc, bit_depth); ++#elif (ARCH_MIPS) ++ ff_hevc_rpi_pred_init_mips(hpc, bit_depth); ++#endif +} diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h new file mode 100644 -index 0000000000..03c6eb3295 +index 0000000000..31d7d57d95 --- /dev/null +++ b/libavcodec/rpi_hevcpred.h -@@ -0,0 +1,57 @@ +@@ -0,0 +1,68 @@ +/* + * HEVC video Decoder + * @@ -30172,37 +35820,48 @@ index 0000000000..03c6eb3295 +struct HEVCRpiContext; +struct HEVCRpiLocalContext; + -+typedef struct HEVCPredContext { ++typedef struct HEVCRpiPredContext { + void (*intra_pred[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx); + + void (*pred_planar[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); -+ void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left, -+ ptrdiff_t stride, int log2_size, int c_idx); ++ void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride); + void (*pred_angular[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, -+ int c_idx, int mode); ++ int mode); ++ void (*pred_vertical[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); + void (*intra_pred_c[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx); + + void 
(*pred_planar_c[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); -+ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left, -+ ptrdiff_t stride, int log2_size, int c_idx); ++ void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride); + void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, -+ int c_idx, int mode); -+} HEVCPredContext; ++ int mode); ++ void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++} HEVCRpiPredContext; + -+void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth); -+void ff_hevc_rpi_pred_init_mips(HEVCPredContext *hpc, int bit_depth); ++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth); + +#endif /* AVCODEC_RPI_HEVCPRED_H */ diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c new file mode 100644 -index 0000000000..4ee776f955 +index 0000000000..a76ba4c442 --- /dev/null +++ b/libavcodec/rpi_hevcpred_template.c -@@ -0,0 +1,850 @@ +@@ -0,0 +1,983 @@ +/* + * HEVC video decoder + * @@ -30396,20 +36055,21 @@ index 0000000000..4ee776f955 + const enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c : + lc->tu.intra_pred_mode; + pixel4 a; -+ pixel left_array[2 * MAX_TB_SIZE + 1]; ++ ++ // Align so we can do multiple loads in the asm ++ // Padded to 16 byte boundary so as not to confuse anything ++ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); ++ DECLARE_ALIGNED(16, pixel, top_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); +#if !PRED_C -+ pixel filtered_left_array[2 * MAX_TB_SIZE + 1]; -+#endif -+ pixel top_array[2 * MAX_TB_SIZE + 1]; -+#if !PRED_C -+ pixel filtered_top_array[2 * MAX_TB_SIZE + 1]; ++ DECLARE_ALIGNED(16, pixel, filtered_left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); ++ DECLARE_ALIGNED(16, pixel, filtered_top_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); +#endif + -+ pixel *left = left_array + 1; -+ pixel *top = top_array + 1; ++ pixel *left = left_array + 16 / sizeof(pixel); ++ pixel *top = top_array + 16 / sizeof(pixel); +#if !PRED_C -+ pixel *filtered_left = filtered_left_array + 1; -+ pixel *filtered_top = filtered_top_array + 1; ++ pixel *filtered_left = filtered_left_array + 16 / sizeof(pixel); ++ pixel *filtered_top = filtered_top_array + 16 / sizeof(pixel); +#endif + int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); + int cand_left = lc->na.cand_left; @@ -30664,12 +36324,22 @@ index 0000000000..4ee776f955 + (uint8_t *)left, stride); + break; + case INTRA_DC: -+ s->hpc.pred_dc((uint8_t *)src, (uint8_t *)top, -+ (uint8_t *)left, stride, log2_size, c_idx); ++ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_ANGULAR_HORIZONTAL: ++ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ case INTRA_ANGULAR_VERTICAL: ++ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, ++ mode); + break; + default: + s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top, -+ (uint8_t *)left, stride, c_idx, ++ (uint8_t *)left, stride, + mode); + break; + } @@ -30680,12 +36350,22 @@ index 
0000000000..4ee776f955 + (uint8_t *)left, stride); + break; + case INTRA_DC: -+ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top, -+ (uint8_t *)left, stride, log2_size, c_idx); ++ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_ANGULAR_HORIZONTAL: ++ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ case INTRA_ANGULAR_VERTICAL: ++ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, ++ mode); + break; + default: + s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, -+ (uint8_t *)left, stride, c_idx, ++ (uint8_t *)left, stride, + mode); + break; + } @@ -30768,7 +36448,7 @@ index 0000000000..4ee776f955 +#if !PRED_C +static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, -+ ptrdiff_t stride, int log2_size, int c_idx) ++ ptrdiff_t stride, int log2_size) +{ + int i, j, x, y; + int size = (1 << log2_size); @@ -30788,7 +36468,10 @@ index 0000000000..4ee776f955 + for (j = 0; j < size; j+=4) + AV_WN4P(&POS(j, i), a); + -+ if (c_idx == 0 && size < 32) { ++// if (c_idx == 0 && size < 32) ++// As we now have separate fns for y & c - no need to test that ++ if (size < 32) ++ { + POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2; + for (x = 1; x < size; x++) + POS(x, 0) = (top[x] + 3 * dc + 2) >> 2; @@ -30799,7 +36482,7 @@ index 0000000000..4ee776f955 +#else +static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, -+ ptrdiff_t stride, int log2_size, int c_idx) ++ ptrdiff_t stride, int log2_size) +{ + unsigned int i, j; + const unsigned int size = (1 << log2_size); @@ -30830,6 +36513,20 @@ index 0000000000..4ee776f955 +} +#endif + ++#define PRED_DC(size)\ ++static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \ ++ const uint8_t *left, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_dc)(src, top, left, stride, size + 2); \ ++} ++ ++PRED_DC(0) ++PRED_DC(1) ++PRED_DC(2) ++PRED_DC(3) ++ ++#undef PRED_DC ++ +#ifndef ANGLE_CONSTS +#define ANGLE_CONSTS +static const int intra_pred_angle[] = { @@ -30846,7 +36543,7 @@ index 0000000000..4ee776f955 +static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const uint8_t *_top, + const uint8_t *_left, -+ ptrdiff_t stride, int c_idx, ++ ptrdiff_t stride, + int mode, int size) +{ + int x, y; @@ -30889,10 +36586,12 @@ index 0000000000..4ee776f955 + AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1])); + } + } -+ if (mode == 26 && c_idx == 0 && size < 32) { ++// if (mode == 26 && c_idx == 0 && size < 32) { ++ if (mode == 26 && size < 32) { + for (y = 0; y < size; y++) + POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1)); + } ++ + } else { + ref = left - 1; + if (angle < 0 && last < -1) { @@ -30916,7 +36615,8 @@ index 0000000000..4ee776f955 + POS(x, y) = ref[y + idx + 1]; + } + } -+ if (mode == 10 && c_idx == 0 && size < 32) { ++// if (mode == 10 && c_idx == 0 && size < 32) { ++ if (mode == 10 && size < 32) { + for (x = 0; x < size; x += 4) { + POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - top[-1]) >> 1)); + POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - top[-1]) >> 1)); @@ -30925,12 +36625,61 @@ index 0000000000..4ee776f955 + } + } + } ++ ++ ++ ++#if BIT_DEPTH == 8 && 0 ++ if ((size == 16 || size == 32) && mode != 10 && mode != 26) { ++ DECLARE_ALIGNED(16, uint8_t, a[64*32]); ++ void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, 
ptrdiff_t stride, int mode); ++// void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++#if 1 ++ src = (pixel *)_src; ++ printf("C: Mode=%d\n", mode); ++ for (y = 0; y < size; y++, src += stride) ++ { ++ printf("%2d: ", y); ++ for (x = 0; x < size; x++) ++ { ++ printf("%3x ", src[x]); ++ } ++ printf("\n"); ++ } ++#endif ++// ff_hevc_rpi_pred_vertical_16_neon_8(a, _top, _left, size); ++ memset(a, 0, sizeof(a)); ++// ff_hevc_rpi_pred_angular_32_neon_10(a, _top, _left, size, mode); ++ ff_hevc_rpi_pred_angular_16_neon_8(a, _top, _left, size, mode); ++#if 1 ++ src = (pixel *)a; ++ printf("A:\n"); ++ for (y = 0; y < size; y++, src += size) ++ { ++ printf("%2d: ", y); ++ for (x = 0; x < size; x++) ++ { ++ printf("%3x ", src[x]); ++ } ++ printf("\n"); ++ } ++#endif ++ src = (pixel *)_src; ++ for (y = 0; y < size; y++, src += stride) ++ { ++ if (memcmp(src, a + size * sizeof(pixel) * y, size * sizeof(pixel)) != 0) { ++ printf("Fail at line %d\n", y); ++ av_assert0(0); ++ } ++ } ++ } ++#endif ++ +} +#else +static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const uint8_t *_top, + const uint8_t *_left, -+ ptrdiff_t stride, int c_idx, ++ ptrdiff_t stride, + int mode, int size) +{ + int x, y; @@ -31001,35 +36750,78 @@ index 0000000000..4ee776f955 + } + } + } ++ ++#if BIT_DEPTH == 10 && 0 ++ if (size == 16 && mode != 10 && mode != 26) { ++ DECLARE_ALIGNED(16, uint8_t, a[64*32]); ++// void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++ void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++ src = (c_dst_ptr_t)_src; ++ printf("C: mode=%d\n", mode); ++ for (y = 0; y < size; y++, src += stride) ++ { ++ for (x = 0; x < size; x++) ++ { ++ printf("%3x:%3x ", src[x][0], src[x][1]); ++ } ++ printf("\n"); ++ } ++ ++ memset(a, 0, sizeof(a)); ++ ff_hevc_rpi_pred_angular_c_16_neon_10(a, _top, _left, size, mode); ++ ++ src = (c_dst_ptr_t)a; ++ printf("A:\n"); ++ for (y = 0; y < size; y++, src += size) ++ { ++ for (x = 0; x < size; x++) ++ { ++ printf("%3x:%3x ", src[x][0], src[x][1]); ++ } ++ printf("\n"); ++ } ++ ++ src = (c_dst_ptr_t)_src; ++ for (y = 0; y < size; y++, src += stride) ++ { ++ if (memcmp(src, a + size * sizeof(pixel) * y, size * sizeof(pixel)) != 0) { ++ printf("Fail at line %d\n", y); ++ av_assert0(0); ++ } ++ } ++ ++ } ++#endif +} +#endif + +static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, + const uint8_t *left, -+ ptrdiff_t stride, int c_idx, int mode) ++ ptrdiff_t stride, int mode) +{ -+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 2); ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2); +} + +static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top, + const uint8_t *left, -+ ptrdiff_t stride, int c_idx, int mode) ++ ptrdiff_t stride, int mode) +{ -+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 3); ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3); +} + +static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top, + const uint8_t *left, -+ ptrdiff_t stride, int c_idx, int mode) ++ ptrdiff_t stride, int mode) +{ -+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 4); ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4); +} + +static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, + const uint8_t *left, -+ ptrdiff_t stride, int c_idx, int mode) ++ ptrdiff_t 
stride, int mode) +{ -+ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5); ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5); +} + +#undef cpel
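++
++/* Illustrative scalar sketch only (an editor-added comment, not referenced by
++ * the decoder): the angular step computed by FUNC(pred_angular) above
++ * reduces, for a given angle and row y with
++ * idx = ((y + 1) * angle) >> 5 and fact = ((y + 1) * angle) & 31, to
++ *
++ * pred(x, y) = fact == 0 ? ref[x + idx + 1]
++ * : ((32 - fact) * ref[x + idx + 1] + fact * ref[x + idx + 2] + 16) >> 5;
++ *
++ * The _0.._3 wrappers above simply instantiate this template for block sizes
++ * 1 << 2 .. 1 << 5 (4x4 .. 32x32).
++ */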