From a9feeb705f37f1aef091119934ff28078c34f1f0 Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Thu, 1 Feb 2018 23:50:49 +0000 Subject: [PATCH] ffmpeg: update to ffmpeg-f96fd5c (3.4.1-Leia-Alpha-1) --- packages/multimedia/ffmpeg/package.mk | 4 +- ...l-unsupported-GMC-with-more-than-one.patch | 6 +- ...mpeg-99.1003-pfcd_hevc_optimisations.patch | 4526 +++++++---------- ...g-99.1004-added_upstream_mvc_patches.patch | 24 +- 4 files changed, 1867 insertions(+), 2693 deletions(-) diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index c203364dc1..af3edf59da 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -18,8 +18,8 @@ PKG_NAME="ffmpeg" # Current branch is: release/3.4-kodi -PKG_VERSION="d413620" -PKG_SHA256="c02de2197f8b70544f018e83f48c1bed2a1b47e1a1aa34ef59d9167fb0d2090a" +PKG_VERSION="f96fd5c" +PKG_SHA256="35ccc07c72b203101030a35b4bb11779365adb7bbf143ef1d68a1f87c781e38b" PKG_ARCH="any" PKG_LICENSE="LGPLv2.1+" PKG_SITE="https://ffmpeg.org" diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch index c3c09d6325..6721c8d3be 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch @@ -1,4 +1,4 @@ -From 214a8ccc1489db28ce6cec2739365d7eebbdb0f9 Mon Sep 17 00:00:00 2001 +From d8bdcc8791c501921ee8961f3b0de0bd47668ebf Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 5 Jun 2015 22:48:33 +0100 Subject: [PATCH] mpeg4video: Signal unsupported GMC with more than one warp @@ -10,10 +10,10 @@ Subject: [PATCH] mpeg4video: Signal unsupported GMC with more than one warp 2 files changed, 5 insertions(+) diff --git a/libavcodec/avcodec.h 
b/libavcodec/avcodec.h -index c207d3a784..08aa8112b1 100644 +index c26b6d607c..6c4b011b5c 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h -@@ -2967,6 +2967,7 @@ typedef struct AVCodecContext { +@@ -2965,6 +2965,7 @@ typedef struct AVCodecContext { #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders. #define FF_BUG_TRUNCATED 16384 #define FF_BUG_IEDGE 32768 diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index 5104bfd261..b3fb4b36ac 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -19,7 +19,7 @@ index dabb51762d..0b1f739d22 100644 /ffplay /ffprobe diff --git a/configure b/configure -index 18d80ee87a..d519af9074 100755 +index 18d80ee87a..9e621d09c1 100755 --- a/configure +++ b/configure @@ -313,6 +313,7 @@ External library support: @@ -38,15 +38,32 @@ index 18d80ee87a..d519af9074 100755 runtime_cpudetect safe_bitstream_reader shared -@@ -2500,6 +2502,8 @@ hap_decoder_select="snappy texturedsp" +@@ -2198,6 +2200,7 @@ CONFIG_EXTRA=" + rtpdec + rtpenc_chain + rv34dsp ++ sand + sinewin + snappy + srtp +@@ -2500,6 +2503,8 @@ hap_decoder_select="snappy texturedsp" hap_encoder_deps="libsnappy" hap_encoder_select="texturedspenc" hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp" +hevc_rpi_decoder_deps="rpi" -+hevc_rpi_decoder_select="hevc_decoder" ++hevc_rpi_decoder_select="hevc_decoder sand" huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp" huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp" iac_decoder_select="imc_decoder" +@@ -3269,6 +3274,8 @@ tinterlace_filter_deps="gpl" + tinterlace_merge_test_deps="tinterlace_filter" + tinterlace_pad_test_deps="tinterlace_filter" + tonemap_filter_deps="const_nan" ++unsand_filter_deps="rpi" 
++unsand_filter_select="sand" + uspp_filter_deps="gpl avcodec" + vaguedenoiser_filter_deps="gpl" + vidstabdetect_filter_deps="libvidstab" diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c index 3ee31473dc..312864d737 100644 --- a/fftools/ffmpeg.c @@ -451,7 +468,7 @@ index 100fa76e46..93a1b8edaf 100644 /* Add all the streams from the given input file to the global diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index c4ec09b1c4..3b94d47e9a 100644 +index c4ec09b1c4..f2abbb06b3 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -4,6 +4,7 @@ DESC = FFmpeg codec library @@ -494,10 +511,10 @@ index c4ec09b1c4..3b94d47e9a 100644 + +ifneq ("$(wildcard $(QASM_PY))","") +$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm -+ $(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,rpi_hevc_shader $< > $@ ++ $(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@ + +$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm -+ $(QASM_PY) -mc_h:rpi_hevc_shader,rpi_shader,rpi_hevc_shader $< > $@ ++ $(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@ +endif + +ifneq ("$(wildcard $(VASMVIDCORE))","") @@ -528,7 +545,7 @@ index 4f34312e67..5361a22141 100644 REGISTER_DECODER(HEVC_QSV, hevc_qsv); REGISTER_DECODER(HEVC_RKMPP, hevc_rkmpp); diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index 1eeac5449e..64aca64e52 100644 +index 1eeac5449e..022ab7ab3d 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -40,6 +40,7 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ @@ -539,7 +556,7 @@ index 1eeac5449e..64aca64e52 100644 OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o -@@ -134,9 +135,18 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ +@@ -134,9 +135,16 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ NEON-OBJS-$(CONFIG_LLAUDDSP) += 
arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ @@ -550,20 +567,18 @@ index 1eeac5449e..64aca64e52 100644 +NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_neon.o \ + arm/rpi_hevc_misc_neon.o \ + arm/rpi_hevcdsp_deblock_neon.o \ -+ arm/rpi_hevcdsp_epel_neon.o \ + arm/rpi_hevcdsp_idct_neon.o \ + arm/rpi_hevcdsp_res16_neon.o \ -+ arm/rpi_hevcdsp_qpel_neon.o \ + arm/rpi_hevcdsp_sao_neon.o \ + arm/rpi_hevcdsp_cres_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h -index fdbf86b45e..0a3980a1ef 100644 +index fdbf86b45e..a60bc899bd 100644 --- a/libavcodec/arm/cabac.h +++ b/libavcodec/arm/cabac.h -@@ -26,13 +26,34 @@ +@@ -26,83 +26,173 @@ #include "libavutil/internal.h" #include "libavcodec/cabac.h" @@ -590,85 +605,143 @@ index fdbf86b45e..0a3980a1ef 100644 + #define get_cabac_inline get_cabac_inline_arm static av_always_inline int get_cabac_inline_arm(CABACContext *c, - uint8_t *const state) +- uint8_t *const state) ++ uint8_t *state) { - int bit; -+#if 0 - void *reg_b, *reg_c, *tmp; +- int bit; +- void *reg_b, *reg_c, *tmp; - - __asm__ volatile( - "ldrb %[bit] , [%[state]] \n\t" - "add %[r_b] , %[tables] , %[lps_off] \n\t" -@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c, - [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128) - : "memory", "cc" - ); -+#else -+ // *** Not thumb compatible yet -+ unsigned int reg_b, tmp; -+ __asm__ ( -+ "ldrb %[bit] , [%[state]] \n\t" -+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" -+ "and %[tmp] , %[range] , #0xC0 \n\t" -+ "add %[r_b] , %[r_b] , %[bit] \n\t" -+ "ldrb %[tmp] , [%[r_b] , %[tmp], lsl #1] \n\t" -+// %bit = *state -+// %range = range -+// %tmp = RangeLPS -+ "sub %[range] , %[range] , %[tmp] \n\t" -+ -+ "cmp %[low] , %[range] , lsl #17 
\n\t" -+ "ittt ge \n\t" -+ "subge %[low] , %[low] , %[range], lsl #17 \n\t" -+ "mvnge %[bit] , %[bit] \n\t" -+ "movge %[range] , %[tmp] \n\t" -+ -+ "clz %[tmp] , %[range] \n\t" -+ "sub %[tmp] , #23 \n\t" -+ -+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" -+ "lsl %[low] , %[low] , %[tmp] \n\t" -+ "lsl %[range] , %[range] , %[tmp] \n\t" -+ -+ "strb %[r_b] , [%[state]] \n\t" -+ "lsls %[tmp] , %[low] , #16 \n\t" -+ -+ "bne 2f \n\t" -+ LOAD_16BITS_BEHI -+ "lsr %[tmp] , %[tmp] , #15 \n\t" -+ "movw %[r_b] , #0xFFFF \n\t" -+ "sub %[tmp] , %[tmp] , %[r_b] \n\t" -+ -+ "rbit %[r_b] , %[low] \n\t" -+ "clz %[r_b] , %[r_b] \n\t" -+ "sub %[r_b] , %[r_b] , #16 \n\t" -+#if CONFIG_THUMB -+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t" -+ "add %[low] , %[low] , %[tmp] \n\t" -+#else -+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t" -+#endif -+ "2: \n\t" -+ : [bit]"=&r"(bit), -+ [low]"+&r"(c->low), -+ [range]"+&r"(c->range), -+ [r_b]"=&r"(reg_b), -+ [ptr]"+&r"(c->bytestream), -+ [tmp]"=&r"(tmp) -+ : [state]"r"(state), -+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), -+ [byte]"M"(offsetof(CABACContext, bytestream)), -+#if !UNCHECKED_BITSTREAM_READER -+ [c]"r"(c), -+ [end]"M"(offsetof(CABACContext, bytestream_end)), -+#endif -+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) -+ : "memory", "cc" -+ ); -+#endif - - return bit & 1; - } +- __asm__ volatile( +- "ldrb %[bit] , [%[state]] \n\t" +- "add %[r_b] , %[tables] , %[lps_off] \n\t" +- "mov %[tmp] , %[range] \n\t" +- "and %[range] , %[range] , #0xC0 \n\t" +- "add %[r_b] , %[r_b] , %[bit] \n\t" +- "ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t" +- "add %[r_b] , %[tables] , %[norm_off] \n\t" +- "sub %[r_c] , %[tmp] , %[range] \n\t" +- "lsl %[tmp] , %[r_c] , #17 \n\t" +- "cmp %[tmp] , %[low] \n\t" +- "it gt \n\t" +- "movgt %[range] , %[r_c] \n\t" +- "itt cc \n\t" +- "mvncc %[bit] , %[bit] \n\t" +- "subcc %[low] , %[low] , %[tmp] \n\t" +- "add %[r_c] , %[tables] , %[mlps_off] \n\t" +- "ldrb %[tmp] , 
[%[r_b], %[range]] \n\t" +- "ldrb %[r_b] , [%[r_c], %[bit]] \n\t" +- "lsl %[low] , %[low] , %[tmp] \n\t" +- "lsl %[range] , %[range] , %[tmp] \n\t" +- "uxth %[r_c] , %[low] \n\t" +- "strb %[r_b] , [%[state]] \n\t" +- "tst %[r_c] , %[r_c] \n\t" +- "bne 2f \n\t" +- "ldr %[r_c] , [%[c], %[byte]] \n\t" ++ const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128; ++ int bit, ptr, low, tmp1, tmp2; ++ __asm__ volatile ( ++ "ldr %[bit], [%[c], %[range_off]] \n\t" ++ "ldrb %[ptr], [%[state]] \n\t" ++ "sub %[tmp1], %[mlps_tables], %[lps_off] \n\t" ++ "and %[tmp2], %[bit], #0xc0 \n\t" ++ "add %[tmp1], %[tmp1], %[ptr] \n\t" ++ "ldr %[low], [%[c], %[low_off]] \n\t" ++ "ldrb %[tmp2], [%[tmp1], %[tmp2], lsl #1] \n\t" ++ "sub %[bit], %[bit], %[tmp2] \n\t" ++ "mov %[tmp1], %[bit] \n\t" ++ "cmp %[low], %[bit], lsl #17 \n\t" ++ "movge %[tmp1], %[tmp2] \n\t" ++ "mvnge %[ptr], %[ptr] \n\t" ++ "clz %[tmp2], %[tmp1] \n\t" ++ "subge %[low], %[low], %[bit], lsl #17 \n\t" ++ "sub %[tmp2], %[tmp2], #23 \n\t" ++ "and %[bit], %[ptr], #1 \n\t" ++ "ldrb %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t" ++ "lsl %[low], %[low], %[tmp2] \n\t" ++ "lsls %[ptr], %[low], #16 \n\t" ++ "bne 1f \n\t" ++ "ldr %[ptr], [%[c], %[ptr_off]] \n\t" ++ "lsl %[tmp2], %[tmp1], %[tmp2] \n\t" + #if UNCHECKED_BITSTREAM_READER +- "ldrh %[tmp] , [%[r_c]] \n\t" +- "add %[r_c] , %[r_c] , #2 \n\t" +- "str %[r_c] , [%[c], %[byte]] \n\t" ++ "strb %[mlps_tables], [%[state]] \n\t" ++ "rbit %[state], %[low] \n\t" ++ "ldrh %[tmp1], [%[ptr]], #2 \n\t" + #else +- "ldr %[r_b] , [%[c], %[end]] \n\t" +- "ldrh %[tmp] , [%[r_c]] \n\t" +- "cmp %[r_c] , %[r_b] \n\t" +- "itt lt \n\t" +- "addlt %[r_c] , %[r_c] , #2 \n\t" +- "strlt %[r_c] , [%[c], %[byte]] \n\t" ++ "ldr %[tmp1], [%[c], %[end_off]] \n\t" ++ "strb %[mlps_tables], [%[state]] \n\t" ++ "rbit %[state], %[low] \n\t" ++ "cmp %[tmp1], %[ptr] \n\t" ++ "ldrcsh %[tmp1], [%[ptr]], #2 \n\t" + #endif +- "sub %[r_c] , %[low] , #1 \n\t" +- "add %[r_b] , %[tables] , 
%[norm_off] \n\t" +- "eor %[r_c] , %[low] , %[r_c] \n\t" +- "rev %[tmp] , %[tmp] \n\t" +- "lsr %[r_c] , %[r_c] , #15 \n\t" +- "lsr %[tmp] , %[tmp] , #15 \n\t" +- "ldrb %[r_c] , [%[r_b], %[r_c]] \n\t" +- "movw %[r_b] , #0xFFFF \n\t" +- "sub %[tmp] , %[tmp] , %[r_b] \n\t" +- "rsb %[r_c] , %[r_c] , #7 \n\t" +- "lsl %[tmp] , %[tmp] , %[r_c] \n\t" +- "add %[low] , %[low] , %[tmp] \n\t" +- "2: \n\t" +- : [bit]"=&r"(bit), +- [low]"+&r"(c->low), +- [range]"+&r"(c->range), +- [r_b]"=&r"(reg_b), +- [r_c]"=&r"(reg_c), +- [tmp]"=&r"(tmp) +- : [c]"r"(c), +- [state]"r"(state), +- [tables]"r"(ff_h264_cabac_tables), +- [byte]"M"(offsetof(CABACContext, bytestream)), ++ "clz %[state], %[state] \n\t" ++ "movw %[mlps_tables], #0xffff \n\t" ++ "sub %[state], %[state], #16 \n\t" ++ "str %[tmp2], [%[c], %[range_off]] \n\t" ++ "rev %[tmp1], %[tmp1] \n\t" ++ "str %[ptr], [%[c], %[ptr_off]] \n\t" ++ "lsr %[tmp1], %[tmp1], #15 \n\t" ++ "sub %[tmp1], %[tmp1], %[mlps_tables] \n\t" ++ "add %[low], %[low], %[tmp1], lsl %[state] \n\t" ++ "str %[low], [%[c], %[low_off]] \n\t" ++ "b 2f \n\t" ++ "1: \n\t" ++ "strb %[mlps_tables], [%[state]] \n\t" ++ "lsl %[tmp1], %[tmp1], %[tmp2] \n\t" ++ "str %[low], [%[c], %[low_off]] \n\t" ++ "str %[tmp1], [%[c], %[range_off]] \n\t" ++ "2: \n\t" ++ : // Outputs ++ [state]"+r"(state), ++ [mlps_tables]"+r"(mlps_tables), ++ [bit]"=&r"(bit), ++ [ptr]"=&r"(ptr), ++ [low]"=&r"(low), ++ [tmp1]"=&r"(tmp1), ++ [tmp2]"=&r"(tmp2) ++ : // Inputs ++ [c]"r"(c), ++ [low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)), ++ [end_off]"J"(offsetof(CABACContext, bytestream_end)), ++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ return bit; ++} + +#define get_cabac_bypass get_cabac_bypass_arm +static inline int get_cabac_bypass_arm(CABACContext * const c) @@ -689,21 +762,27 @@ index fdbf86b45e..0a3980a1ef 100644 + "sub %[low] , 
%[low] , %[tmp] \n\t" + "1: \n\t" + : // Outputs -+ [rv]"+&r"(rv), -+ [low]"+&r"(c->low), -+ [tmp]"=&r"(tmp), -+ [ptr]"+&r"(c->bytestream) ++ [rv]"+r"(rv), ++ [low]"+r"(c->low), ++ [tmp]"=r"(tmp), ++ [ptr]"+r"(c->bytestream) + : // Inputs +#if !UNCHECKED_BITSTREAM_READER + [c]"r"(c), -+ [end]"M"(offsetof(CABACContext, bytestream_end)), + [end]"M"(offsetof(CABACContext, bytestream_end)), +- [norm_off]"I"(H264_NORM_SHIFT_OFFSET), +- [lps_off]"I"(H264_LPS_RANGE_OFFSET), +- [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128) +- : "memory", "cc" +- ); +#endif + [range]"r"(c->range) + : "cc" + ); + return rv; +} -+ + +- return bit & 1; + +#define get_cabac_bypass_sign get_cabac_bypass_sign_arm +static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv) @@ -723,10 +802,10 @@ index fdbf86b45e..0a3980a1ef 100644 + "sub %[low] , %[low] , %[tmp] \n\t" + "1: \n\t" + : // Outputs -+ [rv]"+&r"(rv), -+ [low]"+&r"(c->low), -+ [tmp]"=&r"(tmp), -+ [ptr]"+&r"(c->bytestream) ++ [rv]"+r"(rv), ++ [low]"+r"(c->low), ++ [tmp]"=r"(tmp), ++ [ptr]"+r"(c->bytestream) + : // Inputs +#if !UNCHECKED_BITSTREAM_READER + [c]"r"(c), @@ -736,17 +815,17 @@ index fdbf86b45e..0a3980a1ef 100644 + : "cc" + ); + return rv; -+} + } + #endif /* HAVE_ARMV6T2_INLINE */ #endif /* AVCODEC_ARM_CABAC_H */ diff --git a/libavcodec/arm/rpi_hevc_cabac.h b/libavcodec/arm/rpi_hevc_cabac.h new file mode 100644 -index 0000000000..31d3c59205 +index 0000000000..10b2c6f850 --- /dev/null +++ b/libavcodec/arm/rpi_hevc_cabac.h -@@ -0,0 +1,491 @@ +@@ -0,0 +1,477 @@ +/* + * This file is part of FFmpeg. 
+ * @@ -810,19 +889,18 @@ index 0000000000..31d3c59205 + const unsigned int last_coeff_abs_level_remaining, + const unsigned int c_rice_param) +{ -+ int t; ++ int t = last_coeff_abs_level_remaining << 1; + __asm__ ( -+ "lsl %[t], %[coeff], #1 \n\t" + "lsrs %[t], %[t], %[shift] \n\t" ++ + "it eq \n\t" + "subeq %[stat], %[stat], #1 \n\t" + "cmp %[t], #6 \n\t" + "adc %[stat], %[stat], #0 \n\t" + "usat %[stat], #8, %[stat] \n\t" -+ : [stat]"+&r"(*stat_coeff), -+ [t]"=&r"(t) -+ : [coeff]"r"(last_coeff_abs_level_remaining), -+ [shift]"r"(c_rice_param) ++ : [stat]"+r"(*stat_coeff), ++ [t]"+r"(t) ++ : [shift]"r"(c_rice_param) + : "cc" + ); +} @@ -850,10 +928,10 @@ index 0000000000..31d3c59205 + "ite eq \n\t" + "usateq %[st] , #2 , %[i] \n\t" + "movne %[st] , #0 \n\t" -+ -+ "ldrb %[bit] , [%[state0], %[st]] \n\t" + "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" + "and %[tmp] , %[range] , #0xC0 \n\t" ++ ++ "ldrb %[bit] , [%[state0], %[st]] \n\t" + "add %[r_b] , %[r_b] , %[bit] \n\t" + "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t" + "sub %[range] , %[range] , %[tmp] \n\t" @@ -861,20 +939,18 @@ index 0000000000..31d3c59205 + "cmp %[low] , %[range], lsl #17 \n\t" + "ittt ge \n\t" + "subge %[low] , %[low] , %[range], lsl #17 \n\t" -+ "mvnge %[bit] , %[bit] \n\t" + "movge %[range] , %[tmp] \n\t" -+ -+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" -+ "and %[bit] , %[bit] , #1 \n\t" -+ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t" ++ "mvnge %[bit] , %[bit] \n\t" + + "clz %[tmp] , %[range] \n\t" + "sub %[tmp] , #23 \n\t" -+ ++ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" ++ "and %[bit] , %[bit] , #1 \n\t" ++ "strb %[r_b] , [%[state0], %[st]] \n\t" + "lsl %[low] , %[low] , %[tmp] \n\t" ++ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t" + "lsl %[range] , %[range] , %[tmp] \n\t" + -+ "strb %[r_b] , [%[state0], %[st]] \n\t" +// There is a small speed gain from combining both conditions, using a single +// branch and then working out what that meant later + "lsls %[tmp] , %[low] , #16 \n\t" @@ 
-888,29 +964,28 @@ index 0000000000..31d3c59205 + +// Do reload + "ldrh %[tmp] , [%[bptr]] , #2 \n\t" ++ "rbit %[bit] , %[low] \n\t" + "movw %[r_b] , #0xFFFF \n\t" ++ "clz %[bit] , %[bit] \n\t" + "rev %[tmp] , %[tmp] \n\t" ++ "sub %[bit] , %[bit] , #16 \n\t" ++ "cmp %[n] , %[i] \n\t" + "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" + -+ "rbit %[r_b] , %[low] \n\t" -+ "clz %[r_b] , %[r_b] \n\t" -+ "sub %[r_b] , %[r_b] , #16 \n\t" -+ +#if CONFIG_THUMB -+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t" ++ "lsl %[tmp] , %[tmp] , %[bit] \n\t" + "add %[low] , %[low] , %[tmp] \n\t" +#else -+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t" ++ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" +#endif + -+ "cmp %[n] , %[i] \n\t" + "bne 1b \n\t" + "2: \n\t" + : [bit]"=&r"(bit), -+ [low]"+&r"(c->low), -+ [range]"+&r"(c->range), ++ [low]"+r"(c->low), ++ [range]"+r"(c->range), + [r_b]"=&r"(reg_b), -+ [bptr]"+&r"(c->bytestream), ++ [bptr]"+r"(c->bytestream), + [i]"=&r"(i), + [tmp]"=&r"(tmp), + [st]"=&r"(st), @@ -918,7 +993,6 @@ index 0000000000..31d3c59205 + : [state0]"r"(state0), + [n]"r"(n), + [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), -+ [byte]"M"(offsetof(CABACContext, bytestream)), + [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) + : "memory", "cc" + ); @@ -935,26 +1009,32 @@ index 0000000000..31d3c59205 +{ + unsigned int reg_b, tmp, st, bit; + __asm__ ( -+ "1: \n\t" +// Get bin from map -+ "ldrb %[st] , [%[ctx_map], %[n]] \n\t" ++ "ldrb %[st] , [%[ctx_map], %[n]]! 
\n\t" ++ "1: \n\t" + +// Load state & ranges -+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" + "ldrb %[bit] , [%[state0], %[st]] \n\t" + "and %[tmp] , %[range] , #0xC0 \n\t" ++ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" + "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t" + "ldrb %[tmp] , [%[r_b], %[bit]] \n\t" + "sub %[range] , %[range] , %[tmp] \n\t" + + "cmp %[low] , %[range], lsl #17 \n\t" + "ittt ge \n\t" -+ "subge %[low] , %[low] , %[range], lsl #17 \n\t" + "mvnge %[bit] , %[bit] \n\t" ++ "subge %[low] , %[low] , %[range], lsl #17 \n\t" + "movge %[range] , %[tmp] \n\t" + ++// Renorm ++ "clz %[tmp] , %[range] \n\t" + "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" ++ "sub %[tmp] , #23 \n\t" ++ "strb %[r_b] , [%[state0], %[st]] \n\t" + "tst %[bit] , #1 \n\t" ++ "ldrb %[st] , [%[ctx_map], #-1]! \n\t" ++ "lsl %[low] , %[low] , %[tmp] \n\t" +// GCC asm seems to need strbne written differently for thumb and arm +#if CONFIG_THUMB + "it ne \n\t" @@ -963,24 +1043,17 @@ index 0000000000..31d3c59205 + "strneb %[n] , [%[idx]] , #1 \n\t" +#endif + -+// Renorm -+ "clz %[tmp] , %[range] \n\t" -+ "sub %[tmp] , #23 \n\t" -+ "lsl %[low] , %[low] , %[tmp] \n\t" -+ "lsl %[range] , %[range] , %[tmp] \n\t" -+ -+ "strb %[r_b] , [%[state0], %[st]] \n\t" +// There is a small speed gain from combining both conditions, using a single +// branch and then working out what that meant later + "subs %[n] , %[n] , #1 \n\t" ++ "lsl %[range] , %[range] , %[tmp] \n\t" +#if CONFIG_THUMB + "itt ne \n\t" + "lslsne %[tmp] , %[low] , #16 \n\t" -+ "bne 1b \n\t" +#else + "lslnes %[tmp] , %[low] , #16 \n\t" -+ "bne 1b \n\t" +#endif ++ "bne 1b \n\t" + +// If we have bits left then n must be 0 so give up now + "lsls %[tmp] , %[low] , #16 \n\t" @@ -988,38 +1061,36 @@ index 0000000000..31d3c59205 + +// Do reload + "ldrh %[tmp] , [%[bptr]] , #2 \n\t" ++ "rbit %[bit] , %[low] \n\t" + "movw %[r_b] , #0xFFFF \n\t" ++ "clz %[bit] , %[bit] \n\t" ++ "cmp %[n] , #0 \n\t" + "rev %[tmp] , %[tmp] \n\t" ++ "sub %[bit] , 
%[bit] , #16 \n\t" + "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" + -+ "rbit %[r_b] , %[low] \n\t" -+ "clz %[r_b] , %[r_b] \n\t" -+ "sub %[r_b] , %[r_b] , #16 \n\t" -+ +#if CONFIG_THUMB -+ "lsl %[tmp] , %[tmp] , %[r_b] \n\t" ++ "lsl %[tmp] , %[tmp] , %[bit] \n\t" + "add %[low] , %[low] , %[tmp] \n\t" +#else -+ "add %[low] , %[low] , %[tmp], lsl %[r_b] \n\t" ++ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" +#endif + +// Check to see if we still have more to do -+ "cmp %[n] , #0 \n\t" + "bne 1b \n\t" + "2: \n\t" + : [bit]"=&r"(bit), -+ [low]"+&r"(c->low), -+ [range]"+&r"(c->range), ++ [low]"+r"(c->low), ++ [range]"+r"(c->range), + [r_b]"=&r"(reg_b), -+ [bptr]"+&r"(c->bytestream), -+ [idx]"+&r"(p), -+ [n]"+&r"(n), ++ [bptr]"+r"(c->bytestream), ++ [idx]"+r"(p), ++ [n]"+r"(n), + [tmp]"=&r"(tmp), -+ [st]"=&r"(st) ++ [st]"=&r"(st), ++ [ctx_map]"+r"(ctx_map) + : [state0]"r"(state0), -+ [ctx_map]"r"(ctx_map), + [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), -+ [byte]"M"(offsetof(CABACContext, bytestream)), + [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) + : "memory", "cc" + ); @@ -1042,17 +1113,15 @@ index 0000000000..31d3c59205 +#define get_cabac_by22_peek get_cabac_by22_peek_arm +static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c) +{ -+ uint32_t rv, tmp; ++ uint32_t rv = c->low &~ 1, tmp; + __asm__ ( -+ "bic %[rv] , %[low], #1 \n\t" + "cmp %[inv] , #0 \n\t" + "it ne \n\t" + "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t" + : // Outputs -+ [rv]"=&r"(rv), ++ [rv]"+r"(rv), + [tmp]"=r"(tmp) + : // Inputs -+ [low]"r"(c->low), + [inv]"r"(c->range) + : // Clobbers + "cc" @@ -1060,180 +1129,176 @@ index 0000000000..31d3c59205 + return rv << 1; +} + -+#if 0 -+ -+// ***** Slower than the C :-( +#define get_cabac_by22_flush get_cabac_by22_flush_arm -+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val) ++static inline void 
get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val) +{ -+ uint32_t m, tmp; -+ __asm__ ( -+ "add %[bits], %[bits], %[n] \n\t" -+ "ldr %[m], [%[ptr], %[bits], lsr #3] \n\t" -+ -+ "rsb %[tmp], %[n], #32 \n\t" -+ "lsr %[tmp], %[val], %[tmp] \n\t" -+ "mul %[tmp], %[range], %[tmp] \n\t" -+ -+ "rev %[m], %[m] \n\t" -+ -+ "lsl %[tmp], %[tmp], #23 \n\t" -+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t" -+ -+ "and %[tmp], %[bits], #7 \n\t" -+ "lsl %[m], %[m], %[tmp] \n\t" -+ -+ "orr %[low], %[low], %[m], lsr #9 \n\t" ++ uint32_t bits, ptr, tmp1, tmp2; ++ __asm__ volatile ( ++ "ldrh %[bits], [%[cc], %[bits_off]] \n\t" ++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" ++ "rsb %[tmp1], %[n], #32 \n\t" ++ "add %[bits], %[bits], %[n] \n\t" ++ "ldrh %[tmp2], [%[cc], %[range_off]] \n\t" ++ "lsr %[tmp1], %[val], %[tmp1] \n\t" ++ "ldr %[val], [%[cc], %[low_off]] \n\t" ++ "ldr %[ptr], [%[ptr], %[bits], lsr #3] \n\t" ++ "mul %[tmp1], %[tmp2], %[tmp1] \n\t" ++ "and %[tmp2], %[bits], #7 \n\t" ++ "strh %[bits], [%[cc], %[bits_off]] \n\t" ++ "rev %[ptr], %[ptr] \n\t" ++ "lsl %[tmp1], %[tmp1], #23 \n\t" ++ "rsb %[val], %[tmp1], %[val], lsl %[n] \n\t" ++ "lsl %[ptr], %[ptr], %[tmp2] \n\t" ++ "orr %[val], %[val], %[ptr], lsr #9 \n\t" ++ "str %[val], [%[cc], %[low_off]] \n\t" + : // Outputs -+ [m]"=&r"(m), -+ [tmp]"=&r"(tmp), -+ [bits]"+&r"(c->by22.bits), -+ [low]"+&r"(c->low) ++ [val]"+r"(val), ++ [bits]"=&r"(bits), ++ [ptr]"=&r"(ptr), ++ [tmp1]"=&r"(tmp1), ++ [tmp2]"=&r"(tmp2) + : // Inputs -+ [n]"r"(n), -+ [val]"r"(val), -+ [inv]"r"(c->range), -+ [range]"r"(c->by22.range), -+ [ptr]"r"(c->bytestream) ++ [cc]"r"(c), ++ [n]"r"(n), ++ [bits_off]"J"(offsetof(CABACContext, by22.bits)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)), ++ [range_off]"J"(offsetof(CABACContext, by22.range)), ++ [low_off]"J"(offsetof(CABACContext, low)) + : // Clobbers ++ "memory" + ); +} + -+ -+// Works but slower than C -+#define coeff_abs_level_remaining_decode_by22(c,r) 
coeff_abs_level_remaining_decode_by22_arm(c, r) -+static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param) ++#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm ++static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param) +{ -+ uint32_t n, val, tmp, level; -+ -+// PROFILE_START(); -+ -+ __asm__ ( -+ // Peek -+ "bic %[val], %[low], #1 \n\t" -+ "cmp %[inv], #0 \n\t" -+ "umullne %[tmp], %[val], %[inv], %[val] \n\t" -+ "lsl %[val], %[val], #1 \n\t" -+ -+ // Count bits (n = prefix) -+ "mvn %[n], %[val] \n\t" -+ "clz %[n], %[n] \n\t" -+ -+ "lsl %[level], %[val], %[n] \n\t" -+ "subs %[tmp], %[n], #3 \n\t" -+ "blo 2f \n\t" -+ -+ // prefix >= 3 -+ // < tmp = prefix - 3 -+ // > tmp = prefix + rice - 3 -+ "add %[tmp], %[tmp], %[rice] \n\t" -+ // > n = prefix * 2 + rice - 3 -+ "add %[n], %[tmp], %[n] \n\t" -+ "cmp %[n], #21 \n\t" -+ "bhi 3f \n\t" -+ -+ "orr %[level], %[level], #0x80000000 \n\t" -+ "rsb %[tmp], %[tmp], #31 \n\t" -+ "lsr %[level], %[level], %[tmp] \n\t" -+ -+ "mov %[tmp], #2 \n\t" -+ "add %[level], %[level], %[tmp], lsl %[rice] \n\t" -+ "b 1f \n\t" -+ -+ // > 22 bits used in total - need reload -+ "3: \n\t" -+ -+ // Stash prefix + rice - 3 in level (only spare reg) -+ "mov %[level], %[tmp] \n\t" -+ // Restore n to flush value (prefix) -+ "sub %[n], %[n], %[tmp] \n\t" -+ -+ // Flush + reload -+ -+// "rsb %[tmp], %[n], #32 \n\t" -+// "lsr %[tmp], %[val], %[tmp] \n\t" -+// "mul %[tmp], %[range], %[tmp] \n\t" -+ -+ // As it happens we know that all the bits we are flushing are 1 -+ // so we can cheat slightly -+ "rsb %[tmp], %[range], %[range], lsl %[n] \n\t" -+ "lsl %[tmp], %[tmp], #23 \n\t" -+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t" -+ -+ "add %[bits], %[bits], %[n] \n\t" -+ "ldr %[n], [%[ptr], %[bits], lsr #3] \n\t" -+ "rev %[n], %[n] \n\t" -+ "and %[tmp], %[bits], #7 \n\t" -+ "lsl %[n], %[n], %[tmp] \n\t" -+ -+ "orr 
%[low], %[low], %[n], lsr #9 \n\t" -+ -+ // (reload) -+ -+ "bic %[val], %[low], #1 \n\t" -+ "cmp %[inv], #0 \n\t" -+ "umullne %[tmp], %[val], %[inv], %[val] \n\t" -+ "lsl %[val], %[val], #1 \n\t" -+ -+ // Build value -+ -+ "mov %[n], %[level] \n\t" -+ -+ "orr %[tmp], %[val], #0x80000000 \n\t" -+ "rsb %[level], %[level], #31 \n\t" -+ "lsr %[level], %[tmp], %[level] \n\t" -+ -+ "mov %[tmp], #2 \n\t" -+ "add %[level], %[level], %[tmp], lsl %[rice] \n\t" -+ "b 1f \n\t" -+ -+ // prefix < 3 -+ "2: \n\t" -+ "rsb %[tmp], %[rice], #31 \n\t" -+ "lsr %[level], %[level], %[tmp] \n\t" -+ "orr %[level], %[level], %[n], lsl %[rice] \n\t" -+ "add %[n], %[n], %[rice] \n\t" -+ -+ "1: \n\t" -+ // Flush -+ "add %[n], %[n], #1 \n\t" -+ -+ "rsb %[tmp], %[n], #32 \n\t" -+ "lsr %[tmp], %[val], %[tmp] \n\t" -+ -+ "add %[bits], %[bits], %[n] \n\t" -+ "ldr %[val], [%[ptr], %[bits], lsr #3] \n\t" -+ -+ "mul %[tmp], %[range], %[tmp] \n\t" -+ "lsl %[tmp], %[tmp], #23 \n\t" -+ "rsb %[low], %[tmp], %[low], lsl %[n] \n\t" -+ -+ "rev %[val], %[val] \n\t" -+ "and %[tmp], %[bits], #7 \n\t" -+ "lsl %[val], %[val], %[tmp] \n\t" -+ -+ "orr %[low], %[low], %[val], lsr #9 \n\t" ++ uint32_t last_coeff_abs_level_remaining; ++ uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2; ++ __asm__ volatile ( ++ "ldr %[remain], [%[cc], %[low_off]] \n\t" ++ "ldr %[prefix], [%[cc], %[range_off]] \n\t" ++ "bic %[remain], %[remain], #1 \n\t" ++ "ldrh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" ++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" ++ "cmp %[prefix], #0 \n\t" ++ "umullne %[prefix], %[remain], %[prefix], %[remain] \n\t" ++ "ldrh %[range], [%[cc], %[by22_range_off]] \n\t" ++ "lsl %[remain], %[remain], #1 \n\t" ++ "mvn %[prefix], %[remain] \n\t" ++ "clz %[prefix], %[prefix] \n\t" ++ "rsbs %[n1], %[prefix], #2 \n\t" ++ "bcc 1f \n\t" ++ "adc %[n1], %[rice], %[prefix] \n\t" ++ "add %[tmp2], %[tmp2], %[n1] \n\t" ++ "rsb %[n2], %[n1], #32 \n\t" ++ "and %[tmp1], %[tmp2], #7 \n\t" ++ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" ++ 
"lsr %[tmp2], %[tmp2], #3 \n\t" ++ "lsr %[n2], %[remain], %[n2] \n\t" ++ "mul %[n2], %[range], %[n2] \n\t" ++ "ldr %[range], [%[cc], %[low_off]] \n\t" ++ "ldr %[ptr], [%[ptr], %[tmp2]] \n\t" ++ "rsb %[tmp2], %[rice], #31 \n\t" ++ "lsl %[remain], %[remain], %[prefix] \n\t" ++ "lsl %[n2], %[n2], #23 \n\t" ++ "rsb %[range], %[n2], %[range], lsl %[n1] \n\t" ++ "rev %[ptr], %[ptr] \n\t" ++ "lsl %[n2], %[prefix], %[rice] \n\t" ++ "add %[remain], %[n2], %[remain], lsr %[tmp2] \n\t" ++ "b 3f \n\t" ++ "1: \n\t" ++ "add %[n2], %[rice], %[prefix], lsl #1 \n\t" ++ "cmp %[n2], %[peek_bits_plus_2] \n\t" ++ "bhi 2f \n\t" ++ "sub %[n1], %[n2], #2 \n\t" ++ "add %[tmp2], %[tmp2], %[n1] \n\t" ++ "rsb %[n2], %[n1], #32 \n\t" ++ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" ++ "lsr %[tmp1], %[tmp2], #3 \n\t" ++ "lsr %[n2], %[remain], %[n2] \n\t" ++ "mul %[n2], %[range], %[n2] \n\t" ++ "rsb %[range], %[rice], #34 \n\t" ++ "ldr %[ptr], [%[ptr], %[tmp1]] \n\t" ++ "and %[tmp1], %[tmp2], #7 \n\t" ++ "lsl %[remain], %[remain], %[prefix] \n\t" ++ "ldr %[tmp2], [%[cc], %[low_off]] \n\t" ++ "rsb %[prefix], %[prefix], %[range] \n\t" ++ "orr %[remain], %[remain], #0x80000000 \n\t" ++ "rev %[ptr], %[ptr] \n\t" ++ "lsl %[n2], %[n2], #23 \n\t" ++ "mov %[range], #2 \n\t" ++ "rsb %[tmp2], %[n2], %[tmp2], lsl %[n1] \n\t" ++ "lsl %[ptr], %[ptr], %[tmp1] \n\t" ++ "lsl %[rice], %[range], %[rice] \n\t" ++ "orr %[range], %[tmp2], %[ptr], lsr #9 \n\t" ++ "add %[remain], %[rice], %[remain], lsr %[prefix] \n\t" ++ "b 4f \n\t" ++ "2: \n\t" ++ "add %[n1], %[tmp2], %[prefix] \n\t" ++ "ldr %[tmp2], [%[ptr], %[n1], lsr #3] \n\t" ++ "rsb %[tmp1], %[prefix], #32 \n\t" ++ "push {%[rice]} \n\t" ++ "and %[rice], %[n1], #7 \n\t" ++ "lsr %[tmp1], %[remain], %[tmp1] \n\t" ++ "ldr %[ptr], [%[cc], %[low_off]] \n\t" ++ "mul %[remain], %[range], %[tmp1] \n\t" ++ "rev %[tmp2], %[tmp2] \n\t" ++ "rsb %[n2], %[prefix], %[n2] \n\t" ++ "ldr %[tmp1], [%[cc], %[range_off]] \n\t" ++ "lsl %[rice], %[tmp2], %[rice] \n\t" ++ "sub 
%[tmp2], %[n2], #2 \n\t" ++ "lsl %[remain], %[remain], #23 \n\t" ++ "rsb %[remain], %[remain], %[ptr], lsl %[prefix] \n\t" ++ "orr %[remain], %[remain], %[rice], lsr #9 \n\t" ++ "add %[prefix], %[n1], %[tmp2] \n\t" ++ "bic %[n1], %[remain], #1 \n\t" ++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" ++ "cmp %[tmp1], #0 \n\t" ++ "rsb %[rice], %[tmp2], #32 \n\t" ++ "umullne %[tmp1], %[n1], %[tmp1], %[n1] \n\t" ++ "and %[tmp1], %[prefix], #7 \n\t" ++ "ldr %[ptr], [%[ptr], %[prefix], lsr #3] \n\t" ++ "lsl %[n1], %[n1], #1 \n\t" ++ "lsr %[rice], %[n1], %[rice] \n\t" ++ "rsb %[n2], %[n2], #34 \n\t" ++ "mul %[range], %[range], %[rice] \n\t" ++ "pop {%[rice]} \n\t" ++ "rev %[ptr], %[ptr] \n\t" ++ "orr %[n1], %[n1], #0x80000000 \n\t" ++ "strh %[prefix], [%[cc], %[by22_bits_off]] \n\t" ++ "mov %[prefix], #2 \n\t" ++ "lsl %[range], %[range], #23 \n\t" ++ "rsb %[range], %[range], %[remain], lsl %[tmp2] \n\t" ++ "lsl %[remain], %[prefix], %[rice] \n\t" ++ "add %[remain], %[remain], %[n1], lsr %[n2] \n\t" ++ "3: \n\t" ++ "lsl %[ptr], %[ptr], %[tmp1] \n\t" ++ "orr %[range], %[range], %[ptr], lsr #9 \n\t" ++ "4: \n\t" ++ "str %[range], [%[cc], %[low_off]] \n\t" + : // Outputs -+ [level]"=&r"(level), -+ [n]"=&r"(n), -+ [val]"=&r"(val), -+ [tmp]"=&r"(tmp), -+ [bits]"+&r"(c->by22.bits), -+ [low]"+&r"(c->low) ++ [remain]"=&r"(last_coeff_abs_level_remaining), ++ [rice]"+r"(rice_param), ++ [prefix]"=&r"(prefix), ++ [n1]"=&r"(n1), ++ [range]"=&r"(range), ++ [n2]"=&r"(n2), ++ [ptr]"=&r"(ptr), ++ [tmp1]"=&r"(tmp1), ++ [tmp2]"=&r"(tmp2) + : // Inputs -+ [rice]"r"(c_rice_param), -+ [inv]"r"(c->range), -+ [range]"r"(c->by22.range), -+ [ptr]"r"(c->bytestream) ++ [cc]"r"(c), ++ [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2), ++ [low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)), ++ [by22_range_off]"J"(offsetof(CABACContext, by22.range)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)) + : // Clobbers 
-+ "cc" ++ "cc", "memory" + ); -+ -+// PROFILE_ACC(residual_abs); -+ -+ return level; ++ return last_coeff_abs_level_remaining; +} -+#endif + +#endif /* HAVE_ARMV6T2_INLINE */ + @@ -3359,349 +3424,6 @@ index 0000000000..d691cda836 + m_filter_v_chroma_16 10 +endfunc + -diff --git a/libavcodec/arm/rpi_hevcdsp_epel_neon.S b/libavcodec/arm/rpi_hevcdsp_epel_neon.S -new file mode 100644 -index 0000000000..acc6911091 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_epel_neon.S -@@ -0,0 +1,337 @@ -+/* -+ * Copyright (c) 2014 - 2015 Seppo Tomperi -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+#define MAX_PB_SIZE #64 -+ -+.macro vextin_d4 -+ vld1.8 {q10}, [r1], r2 -+ vmov d16, d20 -+ vext.8 d17, d20, d21, #1 -+ vext.8 d18, d20, d21, #2 -+ vext.8 d19, d20, d21, #3 -+.endm -+ -+.macro vextin_d4_8 -+ vld1.8 d16, [r1], r2 -+ vext.8 d17, d16, d16, #1 -+ vext.8 d18, d16, d16, #2 -+ vext.8 d19, d16, d16, #3 -+.endm -+ -+.macro load_coeffs_16b coeffs -+ ldr \coeffs, [\coeffs] -+ vdup.i8 d0, \coeffs -+ lsr \coeffs, #8 -+ vdup.i8 d1, \coeffs -+ lsr \coeffs, #8 -+ vdup.i8 d2, \coeffs -+ lsr \coeffs, #8 -+ vdup.i8 d3, \coeffs -+.endm -+ -+.macro epel_filter_16b out=q12 -+ vmull.u8 q3, d16, d0 -+ vmull.u8 q11, d19, d3 -+ vmull.u8 \out, d17, d1 -+ vmull.u8 q10, d18, d2 -+ vadd.s16 q3, q11 -+ vadd.s16 \out, q10 -+ vsub.s16 \out, q3 -+.endm -+ -+.macro load_coeffs_32b coeffs -+ ldr \coeffs, [\coeffs] -+ vmov.i64 d4, #0 -+ vmov.8 d4[0], \coeffs -+ lsr \coeffs, #8 -+ vmov.8 d4[2], \coeffs -+ lsr \coeffs, #8 -+ vmov.8 d4[4], \coeffs -+ lsr \coeffs, #8 -+ vmov.8 d4[6], \coeffs -+.endm -+ -+.macro epel_filter_32b -+ vmull.s16 q3, d24, d4[0] //q12 -+ vmull.s16 q4, d25, d4[0] -+ vmull.s16 q5, d30, d4[3] //q15 -+ vmull.s16 q6, d31, d4[3] -+ -+ vmull.s16 q7, d26, d4[1] // q13 -+ vmull.s16 q8, d27, d4[1] -+ vmull.s16 q9, d28, d4[2] // q14 -+ vmull.s16 q10, d29, d4[2] -+ vadd.s32 q3, q5 -+ vadd.s32 q4, q6 -+ vadd.s32 q7, q9 -+ vadd.s32 q8, q10 -+ vsub.s32 q7, q3 -+ vsub.s32 q8, q4 -+ vqshrn.s32 d6, q7, #6 -+ vqshrn.s32 d7, q8, #6 -+.endm -+ -+.macro epel_filter_32b_4 -+ vmull.s16 q3, d24, d4[0] //q12 -+ vmull.s16 q5, d30, d4[3] //q15 -+ vmull.s16 q7, d26, d4[1] // q13 -+ vmull.s16 q9, d28, d4[2] // q14 -+ vadd.s32 q3, q5 -+ vadd.s32 q7, q9 -+ vsub.s32 q7, q3 -+ vqshrn.s32 d6, q7, #6 -+.endm -+ 
-+function ff_hevc_rpi_put_epel_h_neon_8, export=1 -+ push {r4-r7} -+ mov r4, MAX_PB_SIZE -+ ldr r7, [sp, #16] // mx -+ ldr r5, [sp, #24] // width -+ sub r7, #1 -+ lsl r7, #2 -+ vpush {d8-d15} -+@ adr reaches if we are in thumb mode but not in arm -+T adr r12, epel_coeffs -+A adrl r12, epel_coeffs -+ add r7, r12 -+ sub r1, #1 -+ lsl r4, #1 -+ load_coeffs_16b r7 -+ mov r12, r3 -+ mov r6, r0 -+ mov r7, r1 -+ cmp r5, #6 -+ bgt 8f -+ cmp r5, #4 -+ blt 2f -+ b 4f -+8: subs r3, #1 -+ pld [r1] -+ vextin_d4 -+ epel_filter_16b -+ vst1.16 {q12}, [r0], r4 -+ bne 8b -+ subs r5, #8 -+ beq 99f -+ mov r3, r12 -+ add r6, #16 -+ mov r0, r6 -+ add r7, #8 -+ mov r1, r7 -+ cmp r5, #4 -+ bgt 8b -+4: subs r3, #1 -+ pld [r1] -+ vextin_d4_8 -+ epel_filter_16b -+ vst1.16 d24, [r0], r4 -+ bne 4b -+ subs r5, #4 -+ beq 99f -+ mov r3, r12 -+ add r6, #8 -+ mov r0, r6 -+ add r7, #4 -+ mov r1, r7 -+2: subs r3, #1 -+ pld [r1] -+ vextin_d4_8 -+ epel_filter_16b -+ vst1.32 d24[0], [r0], r4 -+ bne 2b -+99: vpop {d8-d15} -+ pop {r4-r7} -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_put_epel_v_neon_8, export=1 -+ push {r4-r7} -+ mov r4, MAX_PB_SIZE -+ ldr r7, [sp, #20] // my -+ ldr r5, [sp, #24] // width -+ sub r7, #1 -+ lsl r7, #2 -+ vpush {d8-d15} -+T adr r12, epel_coeffs -+A adrl r12, epel_coeffs -+ add r7, r12 -+ load_coeffs_16b r7 -+ sub r1, r2 -+ lsl r4, #1 -+ mov r12, r3 -+ mov r6, r0 -+ mov r7, r1 -+0: pld [r1] -+ vld1.8 {d16}, [r1], r2 -+ pld [r1] -+ vld1.8 {d17}, [r1], r2 -+ pld [r1] -+ vld1.8 {d18}, [r1], r2 -+ cmp r5, #6 -+ bgt 8f -+ cmp r5, #4 -+ blt 2f -+ b 4f -+8: pld [r1] -+ vld1.8 {d19}, [r1], r2 -+ subs r3, #1 -+ epel_filter_16b -+ vst1.16 {q12}, [r0], r4 -+ vmov d16, d17 -+ vmov d17, d18 -+ vmov d18, d19 -+ bne 8b -+ subs r5, #8 -+ beq 99f -+ mov r3, r12 -+ add r6, #16 -+ mov r0, r6 -+ add r7, #8 -+ mov r1, r7 -+ b 0b -+4: pld [r1] -+ vld1.8 {d19}, [r1], r2 -+ subs r3, #1 -+ epel_filter_16b -+ vst1.16 d24, [r0], r4 -+ vmov d16, d17 -+ vmov d17, d18 -+ vmov d18, d19 -+ bne 4b -+ subs r5, 
#4 -+ beq 99f -+ mov r3, r12 -+ add r6, #8 -+ mov r0, r6 -+ add r7, #4 -+ mov r1, r7 -+ b 0b -+2: pld [r1] -+ vld1.8 {d19}, [r1], r2 -+ subs r3, #1 -+ epel_filter_16b -+ vst1.32 d24[0], [r0], r4 -+ vmov d16, d17 -+ vmov d17, d18 -+ vmov d18, d19 -+ bne 2b -+99: vpop {d8-d15} -+ pop {r4-r7} -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_put_epel_hv_neon_8, export=1 -+ push {r4-r7} -+ mov r4, MAX_PB_SIZE -+ ldr r6, [sp, #16] // mx -+ ldr r7, [sp, #20] // my -+ ldr r5, [sp, #24] // width -+ sub r7, #1 -+ lsl r7, #2 -+ vpush {d8-d15} -+ adr r12, epel_coeffs -+ sub r6, #1 -+ lsl r6, #2 -+ add r6, r12 // mx epel coeff offset -+ add r7, r12 -+ sub r1, #1 -+ sub r1, r2 -+ lsl r4, #1 -+ load_coeffs_16b r6 -+ load_coeffs_32b r7 -+ mov r12, r3 -+ mov r6, r0 -+ mov r7, r1 -+0: pld [r1] -+ vextin_d4 -+ epel_filter_16b q12 -+ pld [r1] -+ vextin_d4 -+ epel_filter_16b q13 -+ pld [r1] -+ vextin_d4 -+ epel_filter_16b q14 -+ cmp r5, #6 -+ bgt 8f -+ cmp r5, #4 -+ blt 2f -+ b 4f -+8: pld [r1] -+ vextin_d4 -+ epel_filter_16b q15 -+ subs r3, #1 -+ epel_filter_32b -+ vst1.16 {q3}, [r0], r4 -+ vmov q12, q13 -+ vmov q13, q14 -+ vmov q14, q15 -+ bne 8b -+ subs r5, #8 -+ beq 99f -+ mov r3, r12 -+ add r6, #16 -+ mov r0, r6 -+ add r7, #8 -+ mov r1, r7 -+ b 0b -+4: pld [r1] -+ vextin_d4_8 -+ epel_filter_16b q15 -+ subs r3, #1 -+ epel_filter_32b_4 -+ vst1.16 d6, [r0], r4 -+ vmov q12, q13 -+ vmov q13, q14 -+ vmov q14, q15 -+ bne 4b -+ subs r5, #4 -+ beq 99f -+ mov r3, r12 -+ add r6, #8 -+ mov r0, r6 -+ add r7, #4 -+ mov r1, r7 -+ b 0b -+2: pld [r1] -+ vextin_d4_8 -+ epel_filter_16b q15 -+ subs r3, #1 -+ epel_filter_32b_4 -+ vst1.32 d6[0], [r0], r4 -+ vmov q12, q13 -+ vmov q13, q14 -+ vmov q14, q15 -+ bne 2b -+99: vpop {d8-d15} -+ pop {r4-r7} -+ bx lr -+endfunc -+ -+epel_coeffs: -+ .byte 2, 58, 10, 2 -+ .byte 4, 54, 16, 2 -+ .byte 6, 46, 28, 4 -+ .byte 4, 36, 36, 4 -+ .byte 4, 28, 46, 6 -+ .byte 2, 16, 54, 4 -+ .byte 2, 10, 58, 2 diff --git a/libavcodec/arm/rpi_hevcdsp_idct_neon.S 
b/libavcodec/arm/rpi_hevcdsp_idct_neon.S new file mode 100644 index 0000000000..cd79460984 @@ -4127,10 +3849,10 @@ index 0000000000..109fa98c29 +} diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c new file mode 100644 -index 0000000000..472d9d75c9 +index 0000000000..764647fed9 --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c -@@ -0,0 +1,652 @@ +@@ -0,0 +1,473 @@ +/* + * Copyright (c) 2014 Seppo Tomperi + * @@ -4159,6 +3881,9 @@ index 0000000000..472d9d75c9 +#include "libavcodec/avcodec.h" +#include "libavcodec/bit_depth_template.c" + ++// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but ++// have been removed from head as we never use them. ++ +void ff_hevc_rpi_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +void ff_hevc_rpi_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +void ff_hevc_rpi_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); @@ -4361,114 +4086,6 @@ index 0000000000..472d9d75c9 + int16_t *sao_offset_val, int sao_left_class, int width, int height); + + -+#define PUT_PIXELS(name) \ -+ void name(int16_t *dst, uint8_t *src, \ -+ ptrdiff_t srcstride, int height, \ -+ intptr_t mx, intptr_t my, int width) -+PUT_PIXELS(ff_hevc_rpi_put_pixels_w2_neon_8); -+PUT_PIXELS(ff_hevc_rpi_put_pixels_w4_neon_8); -+PUT_PIXELS(ff_hevc_rpi_put_pixels_w6_neon_8); -+PUT_PIXELS(ff_hevc_rpi_put_pixels_w8_neon_8); -+PUT_PIXELS(ff_hevc_rpi_put_pixels_w12_neon_8); -+PUT_PIXELS(ff_hevc_rpi_put_pixels_w16_neon_8); -+PUT_PIXELS(ff_hevc_rpi_put_pixels_w24_neon_8); -+PUT_PIXELS(ff_hevc_rpi_put_pixels_w32_neon_8); -+PUT_PIXELS(ff_hevc_rpi_put_pixels_w48_neon_8); -+PUT_PIXELS(ff_hevc_rpi_put_pixels_w64_neon_8); -+#undef PUT_PIXELS -+void ff_hevc_rpi_put_epel_h_neon_8(int16_t *dst, uint8_t *src, -+ ptrdiff_t srcstride, int height, -+ 
intptr_t mx, intptr_t my, int width); -+void ff_hevc_rpi_put_epel_v_neon_8(int16_t *dst, uint8_t *src, -+ ptrdiff_t srcstride, int height, -+ intptr_t mx, intptr_t my, int width); -+void ff_hevc_rpi_put_epel_hv_neon_8(int16_t *dst, uint8_t *src, -+ ptrdiff_t srcstride, int height, -+ intptr_t mx, intptr_t my, int width); -+ -+static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, -+ int height, int width); -+static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int width, int height, int16_t* src2, ptrdiff_t src2stride); -+void ff_hevc_rpi_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, -+ int height, intptr_t mx, intptr_t my, int width); -+void ff_hevc_rpi_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, -+ int height, intptr_t mx, intptr_t my, int width); -+void ff_hevc_rpi_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width); -+#define QPEL_FUNC(name) \ -+ void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \ -+ int height, int width) -+ -+QPEL_FUNC(ff_hevc_rpi_put_qpel_v1_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_v2_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_v3_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_h1_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_h2_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_h3_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_h1v1_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_h1v2_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_h1v3_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_h2v1_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_h2v2_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_h2v3_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_h3v1_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_h3v2_neon_8); -+QPEL_FUNC(ff_hevc_rpi_put_qpel_h3v3_neon_8); -+#undef QPEL_FUNC -+ 
-+#define QPEL_FUNC_UW_PIX(name) \ -+ void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \ -+ int height, intptr_t mx, intptr_t my, int width); -+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w4_neon_8); -+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w8_neon_8); -+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w16_neon_8); -+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w24_neon_8); -+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w32_neon_8); -+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w48_neon_8); -+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w64_neon_8); -+#undef QPEL_FUNC_UW_PIX -+ -+#define QPEL_FUNC_UW(name) \ -+ void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \ -+ int width, int height, int16_t* src2, ptrdiff_t src2stride); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_pixels_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_v1_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_v2_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_v3_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h1_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h2_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h3_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h1v1_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h1v2_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h1v3_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h2v1_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h2v2_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h2v3_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h3v1_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h3v2_neon_8); -+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h3v3_neon_8); -+#undef QPEL_FUNC_UW -+ -+void ff_hevc_rpi_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, -+ int height, intptr_t mx, intptr_t my, int width) { -+ -+ put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width); -+} -+ -+void ff_hevc_rpi_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t 
dststride, uint8_t *src, ptrdiff_t srcstride, -+ int height, intptr_t mx, intptr_t my, int width) { -+ -+ put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0); -+} -+ -+void ff_hevc_rpi_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) { -+ put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); -+} -+ +void ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, + const MvField *curr, const MvField *neigh, uint8_t *bs); @@ -4571,7 +4188,6 @@ index 0000000000..472d9d75c9 +av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth) +{ + if (bit_depth == 8) { -+ int x; + c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon; + c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon; + c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon; @@ -4636,79 +4252,6 @@ index 0000000000..472d9d75c9 + c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_8; + c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_8; +#endif -+ put_hevc_qpel_neon[1][0] = ff_hevc_rpi_put_qpel_v1_neon_8; -+ put_hevc_qpel_neon[2][0] = ff_hevc_rpi_put_qpel_v2_neon_8; -+ put_hevc_qpel_neon[3][0] = ff_hevc_rpi_put_qpel_v3_neon_8; -+ put_hevc_qpel_neon[0][1] = ff_hevc_rpi_put_qpel_h1_neon_8; -+ put_hevc_qpel_neon[0][2] = ff_hevc_rpi_put_qpel_h2_neon_8; -+ put_hevc_qpel_neon[0][3] = ff_hevc_rpi_put_qpel_h3_neon_8; -+ put_hevc_qpel_neon[1][1] = ff_hevc_rpi_put_qpel_h1v1_neon_8; -+ put_hevc_qpel_neon[1][2] = ff_hevc_rpi_put_qpel_h2v1_neon_8; -+ put_hevc_qpel_neon[1][3] = ff_hevc_rpi_put_qpel_h3v1_neon_8; -+ put_hevc_qpel_neon[2][1] = ff_hevc_rpi_put_qpel_h1v2_neon_8; -+ put_hevc_qpel_neon[2][2] = ff_hevc_rpi_put_qpel_h2v2_neon_8; -+ 
put_hevc_qpel_neon[2][3] = ff_hevc_rpi_put_qpel_h3v2_neon_8; -+ put_hevc_qpel_neon[3][1] = ff_hevc_rpi_put_qpel_h1v3_neon_8; -+ put_hevc_qpel_neon[3][2] = ff_hevc_rpi_put_qpel_h2v3_neon_8; -+ put_hevc_qpel_neon[3][3] = ff_hevc_rpi_put_qpel_h3v3_neon_8; -+ put_hevc_qpel_uw_neon[1][0] = ff_hevc_rpi_put_qpel_uw_v1_neon_8; -+ put_hevc_qpel_uw_neon[2][0] = ff_hevc_rpi_put_qpel_uw_v2_neon_8; -+ put_hevc_qpel_uw_neon[3][0] = ff_hevc_rpi_put_qpel_uw_v3_neon_8; -+ put_hevc_qpel_uw_neon[0][1] = ff_hevc_rpi_put_qpel_uw_h1_neon_8; -+ put_hevc_qpel_uw_neon[0][2] = ff_hevc_rpi_put_qpel_uw_h2_neon_8; -+ put_hevc_qpel_uw_neon[0][3] = ff_hevc_rpi_put_qpel_uw_h3_neon_8; -+ put_hevc_qpel_uw_neon[1][1] = ff_hevc_rpi_put_qpel_uw_h1v1_neon_8; -+ put_hevc_qpel_uw_neon[1][2] = ff_hevc_rpi_put_qpel_uw_h2v1_neon_8; -+ put_hevc_qpel_uw_neon[1][3] = ff_hevc_rpi_put_qpel_uw_h3v1_neon_8; -+ put_hevc_qpel_uw_neon[2][1] = ff_hevc_rpi_put_qpel_uw_h1v2_neon_8; -+ put_hevc_qpel_uw_neon[2][2] = ff_hevc_rpi_put_qpel_uw_h2v2_neon_8; -+ put_hevc_qpel_uw_neon[2][3] = ff_hevc_rpi_put_qpel_uw_h3v2_neon_8; -+ put_hevc_qpel_uw_neon[3][1] = ff_hevc_rpi_put_qpel_uw_h1v3_neon_8; -+ put_hevc_qpel_uw_neon[3][2] = ff_hevc_rpi_put_qpel_uw_h2v3_neon_8; -+ put_hevc_qpel_uw_neon[3][3] = ff_hevc_rpi_put_qpel_uw_h3v3_neon_8; -+ for (x = 0; x < 10; x++) { -+ c->put_hevc_qpel[x][1][0] = ff_hevc_rpi_put_qpel_neon_wrapper; -+ c->put_hevc_qpel[x][0][1] = ff_hevc_rpi_put_qpel_neon_wrapper; -+ c->put_hevc_qpel[x][1][1] = ff_hevc_rpi_put_qpel_neon_wrapper; -+ c->put_hevc_qpel_uni[x][1][0] = ff_hevc_rpi_put_qpel_uni_neon_wrapper; -+ c->put_hevc_qpel_uni[x][0][1] = ff_hevc_rpi_put_qpel_uni_neon_wrapper; -+ c->put_hevc_qpel_uni[x][1][1] = ff_hevc_rpi_put_qpel_uni_neon_wrapper; -+ c->put_hevc_qpel_bi[x][1][0] = ff_hevc_rpi_put_qpel_bi_neon_wrapper; -+ c->put_hevc_qpel_bi[x][0][1] = ff_hevc_rpi_put_qpel_bi_neon_wrapper; -+ c->put_hevc_qpel_bi[x][1][1] = ff_hevc_rpi_put_qpel_bi_neon_wrapper; -+ c->put_hevc_epel[x][1][0] = 
ff_hevc_rpi_put_epel_v_neon_8; -+ c->put_hevc_epel[x][0][1] = ff_hevc_rpi_put_epel_h_neon_8; -+ c->put_hevc_epel[x][1][1] = ff_hevc_rpi_put_epel_hv_neon_8; -+ } -+ c->put_hevc_epel[0][0][0] = ff_hevc_rpi_put_pixels_w2_neon_8; -+ c->put_hevc_epel[1][0][0] = ff_hevc_rpi_put_pixels_w4_neon_8; -+ c->put_hevc_epel[2][0][0] = ff_hevc_rpi_put_pixels_w6_neon_8; -+ c->put_hevc_epel[3][0][0] = ff_hevc_rpi_put_pixels_w8_neon_8; -+ c->put_hevc_epel[4][0][0] = ff_hevc_rpi_put_pixels_w12_neon_8; -+ c->put_hevc_epel[5][0][0] = ff_hevc_rpi_put_pixels_w16_neon_8; -+ c->put_hevc_epel[6][0][0] = ff_hevc_rpi_put_pixels_w24_neon_8; -+ c->put_hevc_epel[7][0][0] = ff_hevc_rpi_put_pixels_w32_neon_8; -+ c->put_hevc_epel[8][0][0] = ff_hevc_rpi_put_pixels_w48_neon_8; -+ c->put_hevc_epel[9][0][0] = ff_hevc_rpi_put_pixels_w64_neon_8; -+ -+ c->put_hevc_qpel[0][0][0] = ff_hevc_rpi_put_pixels_w2_neon_8; -+ c->put_hevc_qpel[1][0][0] = ff_hevc_rpi_put_pixels_w4_neon_8; -+ c->put_hevc_qpel[2][0][0] = ff_hevc_rpi_put_pixels_w6_neon_8; -+ c->put_hevc_qpel[3][0][0] = ff_hevc_rpi_put_pixels_w8_neon_8; -+ c->put_hevc_qpel[4][0][0] = ff_hevc_rpi_put_pixels_w12_neon_8; -+ c->put_hevc_qpel[5][0][0] = ff_hevc_rpi_put_pixels_w16_neon_8; -+ c->put_hevc_qpel[6][0][0] = ff_hevc_rpi_put_pixels_w24_neon_8; -+ c->put_hevc_qpel[7][0][0] = ff_hevc_rpi_put_pixels_w32_neon_8; -+ c->put_hevc_qpel[8][0][0] = ff_hevc_rpi_put_pixels_w48_neon_8; -+ c->put_hevc_qpel[9][0][0] = ff_hevc_rpi_put_pixels_w64_neon_8; -+ -+ c->put_hevc_qpel_uni[1][0][0] = ff_hevc_rpi_put_qpel_uw_pixels_w4_neon_8; -+ c->put_hevc_qpel_uni[3][0][0] = ff_hevc_rpi_put_qpel_uw_pixels_w8_neon_8; -+ c->put_hevc_qpel_uni[5][0][0] = ff_hevc_rpi_put_qpel_uw_pixels_w16_neon_8; -+ c->put_hevc_qpel_uni[6][0][0] = ff_hevc_rpi_put_qpel_uw_pixels_w24_neon_8; -+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_rpi_put_qpel_uw_pixels_w32_neon_8; -+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_rpi_put_qpel_uw_pixels_w48_neon_8; -+ c->put_hevc_qpel_uni[9][0][0] = 
ff_hevc_rpi_put_qpel_uw_pixels_w64_neon_8; + } + else if (bit_depth == 10) { + c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_10; @@ -4783,1011 +4326,6 @@ index 0000000000..472d9d75c9 + assert(offsetof(MvField, pred_flag) == 10); + c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; +} -diff --git a/libavcodec/arm/rpi_hevcdsp_qpel_neon.S b/libavcodec/arm/rpi_hevcdsp_qpel_neon.S -new file mode 100644 -index 0000000000..86a9dcc377 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_qpel_neon.S -@@ -0,0 +1,999 @@ -+/* -+ * Copyright (c) 2014 - 2015 Seppo Tomperi -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+#define MAX_PB_SIZE #64 -+ -+.macro regshuffle_d8 -+ vmov d16, d17 -+ vmov d17, d18 -+ vmov d18, d19 -+ vmov d19, d20 -+ vmov d20, d21 -+ vmov d21, d22 -+ vmov d22, d23 -+.endm -+ -+.macro regshuffle_q8 -+ vmov q0, q1 -+ vmov q1, q2 -+ vmov q2, q3 -+ vmov q3, q4 -+ vmov q4, q5 -+ vmov q5, q6 -+ vmov q6, q7 -+.endm -+ -+.macro vextin8 -+ pld [r2] -+ vld1.8 {q11}, [r2], r3 -+ vext.8 d16, d22, d23, #1 -+ vext.8 d17, d22, d23, #2 -+ vext.8 d18, d22, d23, #3 -+ vext.8 d19, d22, d23, #4 -+ vext.8 d20, d22, d23, #5 -+ vext.8 d21, d22, d23, #6 -+ vext.8 d22, d22, d23, #7 -+.endm -+ -+.macro loadin8 -+ pld [r2] -+ vld1.8 {d16}, [r2], r3 -+ pld [r2] -+ vld1.8 {d17}, [r2], r3 -+ pld [r2] -+ vld1.8 {d18}, [r2], r3 -+ pld [r2] -+ vld1.8 {d19}, [r2], r3 -+ pld [r2] -+ vld1.8 {d20}, [r2], r3 -+ pld [r2] -+ vld1.8 {d21}, [r2], r3 -+ pld [r2] -+ vld1.8 {d22}, [r2], r3 -+ pld [r2] -+ vld1.8 {d23}, [r2], r3 -+.endm -+ -+.macro qpel_filter_1_32b -+ vmov.i16 d16, #58 -+ vmov.i16 d17, #10 -+ vmull.s16 q9, d6, d16 // 58 * d0 -+ vmull.s16 q10, d7, d16 // 58 * d1 -+ vmov.i16 d16, #17 -+ vmull.s16 q11, d4, d17 // 10 * c0 -+ vmull.s16 q12, d5, d17 // 10 * c1 -+ vmov.i16 d17, #5 -+ vmull.s16 q13, d8, d16 // 17 * e0 -+ vmull.s16 q14, d9, d16 // 17 * e1 -+ vmull.s16 q15, d10, d17 // 5 * f0 -+ vmull.s16 q8, d11, d17 // 5 * f1 -+ vsub.s32 q9, q11 // 58 * d0 - 10 * c0 -+ vsub.s32 q10, q12 // 58 * d1 - 10 * c1 -+ vshll.s16 q11, d2, #2 // 4 * b0 -+ vshll.s16 q12, d3, #2 // 4 * b1 -+ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 -+ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 -+ vsubl.s16 q13, d12, d0 // g0 - a0 -+ vsubl.s16 q14, d13, d1 // g1 - a1 -+ vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 
-+ vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 -+ vsub.s32 q13, q15 // g0 - a0 - 5 * f0 -+ vsub.s32 q14, q8 // g1 - a1 - 5 * f1 -+ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0 -+ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1 -+ vqshrn.s32 d16, q9, #6 -+ vqshrn.s32 d17, q10, #6 -+.endm -+ -+// input q0 - q7 -+// output q8 -+.macro qpel_filter_2_32b -+ vmov.i32 q8, #11 -+ vaddl.s16 q9, d6, d8 // d0 + e0 -+ vaddl.s16 q10, d7, d9 // d1 + e1 -+ vaddl.s16 q11, d4, d10 // c0 + f0 -+ vaddl.s16 q12, d5, d11 // c1 + f1 -+ vmul.s32 q11, q8 // 11 * (c0 + f0) -+ vmul.s32 q12, q8 // 11 * (c1 + f1) -+ vmov.i32 q8, #40 -+ vaddl.s16 q15, d2, d12 // b0 + g0 -+ vmul.s32 q9, q8 // 40 * (d0 + e0) -+ vmul.s32 q10, q8 // 40 * (d1 + e1) -+ vaddl.s16 q8, d3, d13 // b1 + g1 -+ vaddl.s16 q13, d0, d14 // a0 + h0 -+ vaddl.s16 q14, d1, d15 // a1 + h1 -+ vshl.s32 q15, #2 // 4*(b0+g0) -+ vshl.s32 q8, #2 // 4*(b1+g1) -+ vadd.s32 q11, q13 // 11 * (c0 + f0) + a0 + h0 -+ vadd.s32 q12, q14 // 11 * (c1 + f1) + a1 + h1 -+ vadd.s32 q9, q15 // 40 * (d0 + e0) + 4*(b0+g0) -+ vadd.s32 q10, q8 // 40 * (d1 + e1) + 4*(b1+g1) -+ vsub.s32 q9, q11 // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0) -+ vsub.s32 q10, q12 // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1) -+ vqshrn.s32 d16, q9, #6 -+ vqshrn.s32 d17, q10, #6 -+.endm -+ -+.macro qpel_filter_3_32b -+ vmov.i16 d16, #58 -+ vmov.i16 d17, #10 -+ vmull.s16 q9, d8, d16 // 58 * d0 -+ vmull.s16 q10, d9, d16 // 58 * d1 -+ vmov.i16 d16, #17 -+ vmull.s16 q11, d10, d17 // 10 * c0 -+ vmull.s16 q12, d11, d17 // 10 * c1 -+ vmov.i16 d17, #5 -+ vmull.s16 q13, d6, d16 // 17 * e0 -+ vmull.s16 q14, d7, d16 // 17 * e1 -+ vmull.s16 q15, d4, d17 // 5 * f0 -+ vmull.s16 q8, d5, d17 // 5 * f1 -+ vsub.s32 q9, q11 // 58 * d0 - 10 * c0 -+ vsub.s32 q10, q12 // 58 * d1 - 10 * c1 -+ vshll.s16 q11, d12, #2 // 4 * b0 -+ vshll.s16 q12, d13, #2 // 4 * b1 -+ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 
* e0 -+ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 -+ vsubl.s16 q13, d2, d14 // g0 - a0 -+ vsubl.s16 q14, d3, d15 // g1 - a1 -+ vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 -+ vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 -+ vsub.s32 q13, q15 // g0 - a0 - 5 * f0 -+ vsub.s32 q14, q8 // g1 - a1 - 5 * f1 -+ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0 -+ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1 -+ vqshrn.s32 d16, q9, #6 -+ vqshrn.s32 d17, q10, #6 -+.endm -+ -+.macro qpel_filter_1 out=q7 -+ vmov.u8 d24, #58 -+ vmov.u8 d25, #10 -+ vshll.u8 q13, d20, #4 // 16*e -+ vshll.u8 q14, d21, #2 // 4*f -+ vmull.u8 \out, d19, d24 // 58*d -+ vaddw.u8 q13, q13, d20 // 17*e -+ vmull.u8 q15, d18, d25 // 10*c -+ vaddw.u8 q14, q14, d21 // 5*f -+ vsubl.u8 q12, d22, d16 // g - a -+ vadd.u16 \out, q13 // 58d + 17e -+ vshll.u8 q13, d17, #2 // 4*b -+ vadd.u16 q15, q14 // 10*c + 5*f -+ vadd.s16 q13, q12 // - a + 4*b + g -+ vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f -+ vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f -+.endm -+ -+.macro qpel_filter_2 out=q7 -+ vmov.i16 q12, #10 -+ vmov.i16 q14, #11 -+ vaddl.u8 q13, d19, d20 // d + e -+ vaddl.u8 q15, d18, d21 // c + f -+ vmul.u16 q13, q12 // 10 * (d+e) -+ vmul.u16 q15, q14 // 11 * ( c + f) -+ vaddl.u8 \out, d17, d22 // b + g -+ vaddl.u8 q12, d16, d23 // a + h -+ vadd.u16 \out, q13 // b + 10 * (d + e) + g -+ vadd.s16 q12, q15 -+ vshl.u16 \out, #2 // 4 * (b + 10 * (d + e) + g) -+ vsub.s16 \out, q12 -+.endm -+ -+.macro qpel_filter_3 out=q7 -+ vmov.u8 d24, #58 -+ vmov.u8 d25, #10 -+ vshll.u8 q13, d19, #4 // 16*e -+ vshll.u8 q14, d18, #2 // 4*f -+ vmull.u8 \out, d20, d24 // 58*d -+ vaddw.u8 q13, q13, d19 // 17*e -+ vmull.u8 q15, d21, d25 // 10*c -+ vaddw.u8 q14, q14, d18 // 5*f -+ vsubl.u8 q12, d17, d23 // g - a -+ vadd.u16 \out, q13 // 58d + 17e -+ vshll.u8 q13, d22, #2 // 4*b -+ vadd.u16 q15, q14 // 10*c + 5*f -+ vadd.s16 q13, q12 // - a + 
4*b + g -+ vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f -+ vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f -+.endm -+ -+.macro hevc_put_qpel_vX_neon_8 filter -+ push {r4, r5, r6, r7} -+ ldr r4, [sp, #16] // height -+ ldr r5, [sp, #20] // width -+ vpush {d8-d15} -+ sub r2, r2, r3, lsl #1 -+ sub r2, r3 -+ mov r12, r4 -+ mov r6, r0 -+ mov r7, r2 -+ lsl r1, #1 -+0: loadin8 -+ cmp r5, #4 -+ beq 4f -+8: subs r4, #1 -+ \filter -+ vst1.16 {q7}, [r0], r1 -+ regshuffle_d8 -+ vld1.8 {d23}, [r2], r3 -+ bne 8b -+ subs r5, #8 -+ beq 99f -+ mov r4, r12 -+ add r6, #16 -+ mov r0, r6 -+ add r7, #8 -+ mov r2, r7 -+ b 0b -+4: subs r4, #1 -+ \filter -+ vst1.16 d14, [r0], r1 -+ regshuffle_d8 -+ vld1.32 {d23[0]}, [r2], r3 -+ bne 4b -+99: vpop {d8-d15} -+ pop {r4, r5, r6, r7} -+ bx lr -+.endm -+ -+.macro hevc_put_qpel_uw_vX_neon_8 filter -+ push {r4-r10} -+ ldr r5, [sp, #28] // width -+ ldr r4, [sp, #32] // height -+ ldr r8, [sp, #36] // src2 -+ ldr r9, [sp, #40] // src2stride -+ vpush {d8-d15} -+ sub r2, r2, r3, lsl #1 -+ sub r2, r3 -+ mov r12, r4 -+ mov r6, r0 -+ mov r7, r2 -+ cmp r8, #0 -+ bne .Lbi\@ -+0: loadin8 -+ cmp r5, #4 -+ beq 4f -+8: subs r4, #1 -+ \filter -+ vqrshrun.s16 d0, q7, #6 -+ vst1.8 d0, [r0], r1 -+ regshuffle_d8 -+ vld1.8 {d23}, [r2], r3 -+ bne 8b -+ subs r5, #8 -+ beq 99f -+ mov r4, r12 -+ add r6, #8 -+ mov r0, r6 -+ add r7, #8 -+ mov r2, r7 -+ b 0b -+4: subs r4, #1 -+ \filter -+ vqrshrun.s16 d0, q7, #6 -+ vst1.32 d0[0], [r0], r1 -+ regshuffle_d8 -+ vld1.32 {d23[0]}, [r2], r3 -+ bne 4b -+ b 99f -+.Lbi\@: lsl r9, #1 -+ mov r10, r8 -+0: loadin8 -+ cmp r5, #4 -+ beq 4f -+8: subs r4, #1 -+ \filter -+ vld1.16 {q0}, [r8], r9 -+ vqadd.s16 q0, q7 -+ vqrshrun.s16 d0, q0, #7 -+ vst1.8 d0, [r0], r1 -+ regshuffle_d8 -+ vld1.8 {d23}, [r2], r3 -+ bne 8b -+ subs r5, #8 -+ beq 99f -+ mov r4, r12 -+ add r6, #8 -+ mov r0, r6 -+ add r10, #16 -+ mov r8, r10 -+ add r7, #8 -+ mov r2, r7 -+ b 0b -+4: subs r4, #1 -+ \filter -+ vld1.16 d0, [r8], r9 -+ vqadd.s16 d0, d14 -+ 
vqrshrun.s16 d0, q0, #7 -+ vst1.32 d0[0], [r0], r1 -+ regshuffle_d8 -+ vld1.32 {d23[0]}, [r2], r3 -+ bne 4b -+99: vpop {d8-d15} -+ pop {r4-r10} -+ bx lr -+.endm -+ -+function ff_hevc_rpi_put_qpel_v1_neon_8, export=1 -+ hevc_put_qpel_vX_neon_8 qpel_filter_1 -+endfunc -+ -+function ff_hevc_rpi_put_qpel_v2_neon_8, export=1 -+ hevc_put_qpel_vX_neon_8 qpel_filter_2 -+endfunc -+ -+function ff_hevc_rpi_put_qpel_v3_neon_8, export=1 -+ hevc_put_qpel_vX_neon_8 qpel_filter_3 -+endfunc -+ -+ -+function ff_hevc_rpi_put_qpel_uw_v1_neon_8, export=1 -+ hevc_put_qpel_uw_vX_neon_8 qpel_filter_1 -+endfunc -+ -+function ff_hevc_rpi_put_qpel_uw_v2_neon_8, export=1 -+ hevc_put_qpel_uw_vX_neon_8 qpel_filter_2 -+endfunc -+ -+function ff_hevc_rpi_put_qpel_uw_v3_neon_8, export=1 -+ hevc_put_qpel_uw_vX_neon_8 qpel_filter_3 -+endfunc -+ -+.macro hevc_put_qpel_hX_neon_8 filter -+ push {r4, r5, r6, r7} -+ ldr r4, [sp, #16] // height -+ ldr r5, [sp, #20] // width -+ -+ vpush {d8-d15} -+ sub r2, #4 -+ lsl r1, #1 -+ mov r12, r4 -+ mov r6, r0 -+ mov r7, r2 -+ cmp r5, #4 -+ beq 4f -+8: subs r4, #1 -+ vextin8 -+ \filter -+ vst1.16 {q7}, [r0], r1 -+ bne 8b -+ subs r5, #8 -+ beq 99f -+ mov r4, r12 -+ add r6, #16 -+ mov r0, r6 -+ add r7, #8 -+ mov r2, r7 -+ cmp r5, #4 -+ bne 8b -+4: subs r4, #1 -+ vextin8 -+ \filter -+ vst1.16 d14, [r0], r1 -+ bne 4b -+99: vpop {d8-d15} -+ pop {r4, r5, r6, r7} -+ bx lr -+.endm -+ -+.macro hevc_put_qpel_uw_hX_neon_8 filter -+ push {r4-r10} -+ ldr r5, [sp, #28] // width -+ ldr r4, [sp, #32] // height -+ ldr r8, [sp, #36] // src2 -+ ldr r9, [sp, #40] // src2stride -+ vpush {d8-d15} -+ sub r2, #4 -+ mov r12, r4 -+ mov r6, r0 -+ mov r7, r2 -+ cmp r8, #0 -+ bne .Lbi\@ -+ cmp r5, #4 -+ beq 4f -+8: subs r4, #1 -+ vextin8 -+ \filter -+ vqrshrun.s16 d0, q7, #6 -+ vst1.8 d0, [r0], r1 -+ bne 8b -+ subs r5, #8 -+ beq 99f -+ mov r4, r12 -+ add r6, #8 -+ mov r0, r6 -+ add r7, #8 -+ mov r2, r7 -+ cmp r5, #4 -+ bne 8b -+4: subs r4, #1 -+ vextin8 -+ \filter -+ vqrshrun.s16 d0, q7, #6 -+ 
vst1.32 d0[0], [r0], r1 -+ bne 4b -+ b 99f -+.Lbi\@: -+ lsl r9, #1 -+ cmp r5, #4 -+ beq 4f -+ mov r10, r8 -+8: subs r4, #1 -+ vextin8 -+ \filter -+ vld1.16 {q0}, [r8], r9 -+ vqadd.s16 q0, q7 -+ vqrshrun.s16 d0, q0, #7 -+ vst1.8 d0, [r0], r1 -+ bne 8b -+ subs r5, #8 -+ beq 99f -+ mov r4, r12 -+ add r6, #8 -+ add r10, #16 -+ mov r8, r10 -+ mov r0, r6 -+ add r7, #8 -+ mov r2, r7 -+ cmp r5, #4 -+ bne 8b -+4: subs r4, #1 -+ vextin8 -+ \filter -+ vld1.16 d0, [r8], r9 -+ vqadd.s16 d0, d14 -+ vqrshrun.s16 d0, q0, #7 -+ vst1.32 d0[0], [r0], r1 -+ bne 4b -+99: vpop {d8-d15} -+ pop {r4-r10} -+ bx lr -+.endm -+ -+function ff_hevc_rpi_put_qpel_h1_neon_8, export=1 -+ hevc_put_qpel_hX_neon_8 qpel_filter_1 -+endfunc -+ -+function ff_hevc_rpi_put_qpel_h2_neon_8, export=1 -+ hevc_put_qpel_hX_neon_8 qpel_filter_2 -+endfunc -+ -+function ff_hevc_rpi_put_qpel_h3_neon_8, export=1 -+ hevc_put_qpel_hX_neon_8 qpel_filter_3 -+endfunc -+ -+ -+function ff_hevc_rpi_put_qpel_uw_h1_neon_8, export=1 -+ hevc_put_qpel_uw_hX_neon_8 qpel_filter_1 -+endfunc -+ -+function ff_hevc_rpi_put_qpel_uw_h2_neon_8, export=1 -+ hevc_put_qpel_uw_hX_neon_8 qpel_filter_2 -+endfunc -+ -+function ff_hevc_rpi_put_qpel_uw_h3_neon_8, export=1 -+ hevc_put_qpel_uw_hX_neon_8 qpel_filter_3 -+endfunc -+ -+.macro hevc_put_qpel_hXvY_neon_8 filterh filterv -+ push {r4, r5, r6, r7} -+ ldr r4, [sp, #16] // height -+ ldr r5, [sp, #20] // width -+ -+ vpush {d8-d15} -+ sub r2, #4 -+ sub r2, r2, r3, lsl #1 -+ sub r2, r3 // extra_before 3 -+ lsl r1, #1 -+ mov r12, r4 -+ mov r6, r0 -+ mov r7, r2 -+0: vextin8 -+ \filterh q0 -+ vextin8 -+ \filterh q1 -+ vextin8 -+ \filterh q2 -+ vextin8 -+ \filterh q3 -+ vextin8 -+ \filterh q4 -+ vextin8 -+ \filterh q5 -+ vextin8 -+ \filterh q6 -+ vextin8 -+ \filterh q7 -+ cmp r5, #4 -+ beq 4f -+8: subs r4, #1 -+ \filterv -+ vst1.16 {q8}, [r0], r1 -+ regshuffle_q8 -+ vextin8 -+ \filterh q7 -+ bne 8b -+ subs r5, #8 -+ beq 99f -+ mov r4, r12 -+ add r6, #16 -+ mov r0, r6 -+ add r7, #8 -+ mov r2, r7 -+ b 0b 
-+4: subs r4, #1 -+ \filterv -+ vst1.16 d16, [r0], r1 -+ regshuffle_q8 -+ vextin8 -+ \filterh q7 -+ bne 4b -+99: vpop {d8-d15} -+ pop {r4, r5, r6, r7} -+ bx lr -+.endm -+ -+.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv -+ push {r4-r10} -+ ldr r5, [sp, #28] // width -+ ldr r4, [sp, #32] // height -+ ldr r8, [sp, #36] // src2 -+ ldr r9, [sp, #40] // src2stride -+ vpush {d8-d15} -+ sub r2, #4 -+ sub r2, r2, r3, lsl #1 -+ sub r2, r3 // extra_before 3 -+ mov r12, r4 -+ mov r6, r0 -+ mov r7, r2 -+ cmp r8, #0 -+ bne .Lbi\@ -+0: vextin8 -+ \filterh q0 -+ vextin8 -+ \filterh q1 -+ vextin8 -+ \filterh q2 -+ vextin8 -+ \filterh q3 -+ vextin8 -+ \filterh q4 -+ vextin8 -+ \filterh q5 -+ vextin8 -+ \filterh q6 -+ vextin8 -+ \filterh q7 -+ cmp r5, #4 -+ beq 4f -+8: subs r4, #1 -+ \filterv -+ vqrshrun.s16 d0, q8, #6 -+ vst1.8 d0, [r0], r1 -+ regshuffle_q8 -+ vextin8 -+ \filterh q7 -+ bne 8b -+ subs r5, #8 -+ beq 99f -+ mov r4, r12 -+ add r6, #8 -+ mov r0, r6 -+ add r7, #8 -+ mov r2, r7 -+ b 0b -+4: subs r4, #1 -+ \filterv -+ vqrshrun.s16 d0, q8, #6 -+ vst1.32 d0[0], [r0], r1 -+ regshuffle_q8 -+ vextin8 -+ \filterh q7 -+ bne 4b -+ b 99f -+.Lbi\@: lsl r9, #1 -+ mov r10, r8 -+0: vextin8 -+ \filterh q0 -+ vextin8 -+ \filterh q1 -+ vextin8 -+ \filterh q2 -+ vextin8 -+ \filterh q3 -+ vextin8 -+ \filterh q4 -+ vextin8 -+ \filterh q5 -+ vextin8 -+ \filterh q6 -+ vextin8 -+ \filterh q7 -+ cmp r5, #4 -+ beq 4f -+8: subs r4, #1 -+ \filterv -+ vld1.16 {q0}, [r8], r9 -+ vqadd.s16 q0, q8 -+ vqrshrun.s16 d0, q0, #7 -+ vst1.8 d0, [r0], r1 -+ regshuffle_q8 -+ vextin8 -+ \filterh q7 -+ bne 8b -+ subs r5, #8 -+ beq 99f -+ mov r4, r12 -+ add r6, #8 -+ mov r0, r6 -+ add r10, #16 -+ mov r8, r10 -+ add r7, #8 -+ mov r2, r7 -+ b 0b -+4: subs r4, #1 -+ \filterv -+ vld1.16 d0, [r8], r9 -+ vqadd.s16 d0, d16 -+ vqrshrun.s16 d0, q0, #7 -+ vst1.32 d0[0], [r0], r1 -+ regshuffle_q8 -+ vextin8 -+ \filterh q7 -+ bne 4b -+99: vpop {d8-d15} -+ pop {r4-r10} -+ bx lr -+.endm -+ -+ -+function 
ff_hevc_rpi_put_qpel_h1v1_neon_8, export=1 -+ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_h2v1_neon_8, export=1 -+ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_h3v1_neon_8, export=1 -+ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_h1v2_neon_8, export=1 -+ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_h2v2_neon_8, export=1 -+ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_h3v2_neon_8, export=1 -+ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_h1v3_neon_8, export=1 -+ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_h2v3_neon_8, export=1 -+ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_h3v3_neon_8, export=1 -+ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b -+endfunc -+ -+ -+function ff_hevc_rpi_put_qpel_uw_h1v1_neon_8, export=1 -+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_uw_h2v1_neon_8, export=1 -+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_uw_h3v1_neon_8, export=1 -+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_uw_h1v2_neon_8, export=1 -+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_uw_h2v2_neon_8, export=1 -+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_uw_h3v2_neon_8, export=1 -+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_uw_h1v3_neon_8, 
export=1 -+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_uw_h2v3_neon_8, export=1 -+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b -+endfunc -+ -+function ff_hevc_rpi_put_qpel_uw_h3v3_neon_8, export=1 -+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b -+endfunc -+ -+.macro init_put_pixels -+ pld [r1] -+ pld [r1, r2] -+ mov r12, MAX_PB_SIZE -+ lsl r12, #1 -+.endm -+ -+function ff_hevc_rpi_put_pixels_w2_neon_8, export=1 -+ init_put_pixels -+ vmov.u8 d5, #255 -+ vshr.u64 d5, #32 -+0: subs r3, #1 -+ vld1.32 {d0[0]}, [r1], r2 -+ pld [r1] -+ vld1.32 d6, [r0] -+ vshll.u8 q0, d0, #6 -+ vbit d6, d0, d5 -+ vst1.32 d6, [r0], r12 -+ bne 0b -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_put_pixels_w4_neon_8, export=1 -+ init_put_pixels -+0: subs r3, #2 -+ vld1.32 {d0[0]}, [r1], r2 -+ vld1.32 {d0[1]}, [r1], r2 -+ pld [r1] -+ pld [r1, r2] -+ vshll.u8 q0, d0, #6 -+ vst1.64 {d0}, [r0], r12 -+ vst1.64 {d1}, [r0], r12 -+ bne 0b -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_put_pixels_w6_neon_8, export=1 -+ init_put_pixels -+ vmov.u8 q10, #255 -+ vshr.u64 d21, #32 -+0: subs r3, #1 -+ vld1.16 {d0}, [r1], r2 -+ pld [r1] -+ vshll.u8 q0, d0, #6 -+ vld1.8 {q12}, [r0] -+ vbit q12, q0, q10 -+ vst1.8 {q12}, [r0], r12 -+ bne 0b -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_put_pixels_w8_neon_8, export=1 -+ init_put_pixels -+0: subs r3, #2 -+ vld1.8 {d0}, [r1], r2 -+ vld1.8 {d2}, [r1], r2 -+ pld [r1] -+ pld [r1, r2] -+ vshll.u8 q0, d0, #6 -+ vshll.u8 q1, d2, #6 -+ vst1.16 {q0}, [r0], r12 -+ vst1.16 {q1}, [r0], r12 -+ bne 0b -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_put_pixels_w12_neon_8, export=1 -+ init_put_pixels -+0: subs r3, #2 -+ vld1.64 {d0}, [r1] -+ add r1, #8 -+ vld1.32 {d1[0]}, [r1], r2 -+ sub r1, #8 -+ vld1.64 {d2}, [r1] -+ add r1, #8 -+ vld1.32 {d1[1]}, [r1], r2 -+ sub r1, #8 -+ pld [r1] -+ pld [r1, r2] -+ vshll.u8 q8, d0, #6 -+ vshll.u8 q9, d1, #6 -+ vshll.u8 q10, d2, #6 -+ vmov d22, d19 -+ 
vst1.64 {d16, d17, d18}, [r0], r12 -+ vst1.64 {d20, d21, d22}, [r0], r12 -+ bne 0b -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_put_pixels_w16_neon_8, export=1 -+ init_put_pixels -+0: subs r3, #2 -+ vld1.8 {q0}, [r1], r2 -+ vld1.8 {q1}, [r1], r2 -+ pld [r1] -+ pld [r1, r2] -+ vshll.u8 q8, d0, #6 -+ vshll.u8 q9, d1, #6 -+ vshll.u8 q10, d2, #6 -+ vshll.u8 q11, d3, #6 -+ vst1.8 {q8, q9}, [r0], r12 -+ vst1.8 {q10, q11}, [r0], r12 -+ bne 0b -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_put_pixels_w24_neon_8, export=1 -+ init_put_pixels -+0: subs r3, #1 -+ vld1.8 {d0, d1, d2}, [r1], r2 -+ pld [r1] -+ vshll.u8 q10, d0, #6 -+ vshll.u8 q11, d1, #6 -+ vshll.u8 q12, d2, #6 -+ vstm r0, {q10, q11, q12} -+ add r0, r12 -+ bne 0b -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_put_pixels_w32_neon_8, export=1 -+ init_put_pixels -+0: subs r3, #1 -+ vld1.8 {q0, q1}, [r1], r2 -+ pld [r1] -+ vshll.u8 q8, d0, #6 -+ vshll.u8 q9, d1, #6 -+ vshll.u8 q10, d2, #6 -+ vshll.u8 q11, d3, #6 -+ vstm r0, {q8, q9, q10, q11} -+ add r0, r12 -+ bne 0b -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_put_pixels_w48_neon_8, export=1 -+ init_put_pixels -+0: subs r3, #1 -+ vld1.8 {q0, q1}, [r1] -+ add r1, #32 -+ vld1.8 {q2}, [r1], r2 -+ sub r1, #32 -+ pld [r1] -+ vshll.u8 q8, d0, #6 -+ vshll.u8 q9, d1, #6 -+ vshll.u8 q10, d2, #6 -+ vshll.u8 q11, d3, #6 -+ vshll.u8 q12, d4, #6 -+ vshll.u8 q13, d5, #6 -+ vstm r0, {q8, q9, q10, q11, q12, q13} -+ add r0, r12 -+ bne 0b -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_put_pixels_w64_neon_8, export=1 -+ init_put_pixels -+0: subs r3, #1 -+ vld1.8 {q0, q1}, [r1] -+ add r1, #32 -+ vld1.8 {q2, q3}, [r1], r2 -+ sub r1, #32 -+ pld [r1] -+ vshll.u8 q8, d0, #6 -+ vshll.u8 q9, d1, #6 -+ vshll.u8 q10, d2, #6 -+ vshll.u8 q11, d3, #6 -+ vshll.u8 q12, d4, #6 -+ vshll.u8 q13, d5, #6 -+ vshll.u8 q14, d6, #6 -+ vshll.u8 q15, d7, #6 -+ vstm r0, {q8, q9, q10, q11, q12, q13, q14, q15} -+ add r0, r12 -+ bne 0b -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_put_qpel_uw_pixels_neon_8, export=1 -+ 
push {r4-r9} -+ ldr r5, [sp, #24] // width -+ ldr r4, [sp, #28] // height -+ ldr r8, [sp, #32] // src2 -+ ldr r9, [sp, #36] // src2stride -+ vpush {d8-d15} -+ cmp r8, #0 -+ bne 2f -+1: subs r4, #1 -+ vld1.8 {d0}, [r2], r3 -+ vst1.8 d0, [r0], r1 -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r9} -+ bx lr -+2: subs r4, #1 -+ vld1.8 {d0}, [r2], r3 -+ vld1.16 {q1}, [r8], r9 -+ vshll.u8 q0, d0, #6 -+ vqadd.s16 q0, q1 -+ vqrshrun.s16 d0, q0, #7 -+ vst1.8 d0, [r0], r1 -+ bne 2b -+ vpop {d8-d15} -+ pop {r4-r9} -+ bx lr -+endfunc -+ -+.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4 -+function ff_hevc_rpi_put_qpel_uw_pixels_w\width\()_neon_8, export=1 -+ ldr r12, [sp] // height -+1: subs r12, #4 -+ vld1.32 {\regs} , [r2], r3 -+ vld1.32 {\regs2} , [r2], r3 -+ vld1.32 {\regs3} , [r2], r3 -+ vld1.32 {\regs4} , [r2], r3 -+ vst1.32 {\regs} , [r0], r1 -+ vst1.32 {\regs2} , [r0], r1 -+ vst1.32 {\regs3} , [r0], r1 -+ vst1.32 {\regs4} , [r0], r1 -+ bne 1b -+ bx lr -+endfunc -+.endm -+ -+.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4 -+function ff_hevc_rpi_put_qpel_uw_pixels_w\width\()_neon_8, export=1 -+ push {r4-r5} -+ ldr r12, [sp, #8] // height -+1: subs r12, #2 -+ mov r4, r2 -+ vld1.32 {\regs} , [r2]! -+ vld1.32 {\regs2} , [r2] -+ add r2, r4, r3 -+ mov r4, r2 -+ vld1.32 {\regs3} , [r2]! -+ vld1.32 {\regs4} , [r2] -+ add r2, r4, r3 -+ mov r5, r0 -+ vst1.32 {\regs} , [r0]! -+ vst1.32 {\regs2} , [r0] -+ add r0, r5, r1 -+ mov r5, r0 -+ vst1.32 {\regs3} , [r0]! 
-+ vst1.32 {\regs4} , [r0] -+ add r0, r5, r1 -+ bne 1b -+ pop {r4-r5} -+ bx lr -+endfunc -+.endm -+ -+put_qpel_uw_pixels 4, d0[0], d0[1], d1[0], d1[1] -+put_qpel_uw_pixels 8, d0, d1, d2, d3 -+put_qpel_uw_pixels_m 12, d0, d1[0], d2, d3[0] -+put_qpel_uw_pixels 16, q0, q1, q2, q3 -+put_qpel_uw_pixels 24, d0-d2, d3-d5, d16-d18, d19-d21 -+put_qpel_uw_pixels 32, q0-q1, q2-q3, q8-q9, q10-q11 -+put_qpel_uw_pixels_m 48, q0-q1, q2, q8-q9, q10 -+put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11 diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S new file mode 100644 index 0000000000..7dfcc2751a @@ -6406,12 +4944,13 @@ index 0000000000..7dfcc2751a + diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S new file mode 100644 -index 0000000000..8c32cb23e7 +index 0000000000..b56dc8ccc5 --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S -@@ -0,0 +1,1882 @@ +@@ -0,0 +1,2156 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi ++ * 2017 John Cox (for Raspberry Pi) + * + * This file is part of FFmpeg. 
+ * @@ -7245,9 +5784,8 @@ index 0000000000..8c32cb23e7 + vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add + + vtbl.8 d6, {d27}, d6 -+ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add -+ + vtbl.8 d7, {d27}, d7 ++ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add + vzip.8 q2, q3 + + vsub.s8 q0, q15 @@ -7309,33 +5847,36 @@ index 0000000000..8c32cb23e7 + + vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) + vadd.s16 q1, q1, q13 -+ vmov.u8 q12, #2 + vadd.s16 q2, q2, q14 + vadd.s16 q3, q3, q15 + ++ vmov.u8 q12, #2 ++ + vmovn.s16 d0, q0 + vmovn.s16 d1, q1 + vmovn.s16 d2, q2 + vmovn.s16 d3, q3 + ++ vldr d26, [r5] ++ + vuzp.8 q0, q1 + -+ vld1.8 {d26, d27}, [r5] ++ vldr d27, [r5, #8] + + vadd.s8 q0, q0, q12 + vadd.s8 q1, q1, q12 + ++ vmov.i64 q12, #0 ++ + vtbl.8 d0, {d26}, d0 + vtbl.8 d1, {d26}, d1 + vtbl.8 d2, {d27}, d2 + vtbl.8 d3, {d27}, d3 + -+ vmov.i64 q12, #0 ++ vdup.i16 q13, r4 + + vzip.8 q0, q1 + -+ vdup.i16 q13, r4 -+ + @ Avoid overwrite whilst widening + vaddw.s8 q2, q6, d2 + vaddw.s8 q3, q7, d3 @@ -7360,19 +5901,19 @@ index 0000000000..8c32cb23e7 +@ q15.u8 #128 + +function edge_16b_body_8 -+ vcgt.u8 q3, q1, q0 @ c > a -> -1 , otherwise 0 -+ vcgt.u8 q0, q1 @ a > c -> -1 , otherwise 0 -+ vcgt.u8 q9, q1, q2 @ c > b -> -1 , otherwise 0 -+ vcgt.u8 q10, q2, q1 @ c < b -> -1 , otherwise 0 ++ vcgt.u8 q9, q0, q1 @ a > c -> -1 , otherwise 0 ++ vadd.u8 q9, q14, q9 ++ vcgt.u8 q0, q1, q0 @ c > a -> -1 , otherwise 0 ++ vsub.u8 q9, q9, q0 ++ vcgt.u8 q0, q2, q1 @ c < b -> -1 , otherwise 0 ++ vadd.u8 q9, q9, q0 ++ vcgt.u8 q0, q1, q2 @ c > b -> -1 , otherwise 0 ++ vsub.u8 q0, q9, q0 + -+ vsub.s8 q0, q3 -+ vsub.s8 q10, q9 -+ vadd.s8 q0, q10 @ a = sign(c-a) -+ -+ vadd.s8 q0, q14 -+ vuzp.8 d0, d1 + vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add + ++ vuzp.8 d0, d1 ++ + vtbl.8 d0, {d16}, d0 + vtbl.8 d1, {d17}, d1 + @@ -7394,21 +5935,20 @@ index 0000000000..8c32cb23e7 +@ q14.u8 #2 +@ q15.u16 max +function 
edge_16b_body_16 -+ vcgt.u16 q3, q1, q0 @ c > a -> -1 , otherwise 0 -+ vcgt.u16 q0, q1 @ a > c -> -1 , otherwise 0 -+ vsub.s16 q0, q3 @ a = sign(c-a) -+ vcgt.u16 q3, q1, q2 @ c > b -> -1 , otherwise 0 -+ vsub.s16 q0, q3 -+ vcgt.u16 q3, q2, q1 @ c < b -> -1 , otherwise 0 -+ vadd.s16 q0, q3 @ a = sign(c-a) + sign(c-b) ++ vcgt.u16 q9, q0, q1 @ a > c -> -1 , otherwise 0 ++ vadd.u16 q9, q14, q9 ++ vcgt.u16 q0, q1, q0 @ c > a -> -1 , otherwise 0 ++ vsub.u16 q9, q9, q0 ++ vcgt.u16 q0, q2, q1 @ c < b -> -1 , otherwise 0 ++ vadd.u16 q9, q9, q0 ++ vcgt.u16 q0, q1, q2 @ c > b -> -1 , otherwise 0 ++ vsub.u16 q0, q9, q0 + + vmovn.s16 d0, q0 + @ d1 will have random contents that we transform but + @ that doesn't matter as we then discard them + vuzp.8 d0, d1 + -+ vadd.s8 q0, q0, q14 -+ + vtbl.8 d0, {d16}, d0 + vtbl.8 d1, {d17}, d1 + @@ -7434,52 +5974,53 @@ index 0000000000..8c32cb23e7 +@ int height) [sp, #sp_base + 8] + +.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0 -+ push {r4-r6, lr} @ 16 bytes -+.set sp_base, 16 + +@ Build translate registers +@ As translate values can only be 0-4 we don't care about junk in the rest +@ of the register -+ mov r12, #2 +.if \is_chroma -+ ldr r4, [sp, #16] -+.set sp_base, sp_base + 4 -+.endif -+ vld1.8 {d16[2]}, [r3], r12 -+ vld1.8 {d16[0]}, [r3], r12 -+ vld1.8 {d16[1]}, [r3], r12 -+ vld1.8 {d16[3]}, [r3], r12 ++ ldr ip, [sp, #0] ++ push {r4-r6, lr} @ 16 bytes ++ vld1.8 {d16[2]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[2]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[0]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[0]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[1]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[1]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[3]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[3]}, [ip] ++ add ip, ip, #2 + vld1.8 {d16[4]}, [r3] -+.if \is_chroma -+ vld1.8 {d17[2]}, [r4], r12 -+ vld1.8 {d17[0]}, [r4], r12 -+ vld1.8 {d17[1]}, [r4], r12 -+ vld1.8 {d17[3]}, [r4], r12 -+ vld1.8 {d17[4]}, [r4] 
-+.else -+ vmov d17, d16 -+.endif -+ -+@ Setup constant registers -+.if \bit_depth > 8 -+ movw r4, (1 << \bit_depth) - 1 -+.endif -+.if \setup_16b -+.if \bit_depth > 8 -+ vmov.i64 q12, #0 -+ vdup.16 q15, r4 -+.else -+ vmov.u8 q15, #128 -+.endif -+ vmov.u8 q14, #2 -+.endif ++ vld1.8 {d17[4]}, [ip] + movw r3, EDGE_SRC_STRIDE ++.set sp_base, 20 ++.else ++ add ip, r3, #4 ++ vld1.8 {d16[1]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[0]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[0]}, [r3] ++ add r3, r3, #6 ++ vld1.8 {d17[1]}, [ip] ++ vld1.8 {d16[2]}, [r3] ++ movw r3, EDGE_SRC_STRIDE ++ push {r4-r6, lr} @ 16 bytes ++ vzip.8 d16, d17 ++ vmov d17, d16 ++.set sp_base, 16 ++.endif + -+@ If setup_64b we need the xlat table on the stack and q4-q7 saved ++@ If setup_64b we need the xlat table on the stack +.if \setup_64b + sub r5, sp, #16 -+ vpush {q4-q8} @ 80 bytes, q8 pushed first -+.set sp_base, sp_base + 80 +.endif + +@ Get jump address @@ -7487,18 +6028,40 @@ index 0000000000..8c32cb23e7 +@ If we may have w4 then we add a 2nd jump table after the 1st +.if \check_w4 + ldr r12, [sp, #sp_base + 4] @ width -+ cmp r12, #8 -+.endif -+ ldr r12, [sp, #sp_base + 0] @ e0 + adr r6, \jump_tab -+.if \check_w4 ++ ldr lr, [sp, #sp_base + 0] @ e0 ++ cmp r12, #8 + it lt + addlt r6, #16 ++.else ++ ldr lr, [sp, #sp_base + 0] @ e0 ++ adr r6, \jump_tab +.endif -+ ldr r6, [r6, r12, lsl #2] + + ldr r12, [sp, #sp_base + 8] @ height + ++.if \bit_depth > 8 ++ movw r4, (1 << \bit_depth) - 1 ++.endif ++.if \setup_16b ++.if \bit_depth > 8 ++ vmov.i64 q12, #0 ++ vdup.16 q15, r4 ++ vmov.u16 q14, #2 ++.else ++ vmov.u8 q15, #128 ++ vmov.u8 q14, #2 ++.endif ++.endif ++ ++@ If setup_64b we need q4-q7 saved. 
++.if \setup_64b ++ vpush {q4-q8} @ 80 bytes, q8 pushed first ++.set sp_base, sp_base + 80 ++.endif ++ ++ ldr r6, [r6, lr, lsl #2] ++ +@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes +.if \do2 + push {r0, r1, r6, r12} @@ -7529,18 +6092,20 @@ index 0000000000..8c32cb23e7 + + +.macro edge_64b_e0, body_fn, pb -+ mov r6, lr + sub r1, #8 ++ mov r6, lr +1: vldm r1, {d7-d16} -+ subs r12, #1 -+ add r1, r3 + // load a + vext.8 q0, q3, q4, #(16 - \pb) ++ add r1, r3 + vext.8 q1, q4, q5, #(16 - \pb) ++ subs r12, #1 + vext.8 q2, q5, q6, #(16 - \pb) + vext.8 q3, q6, q7, #(16 - \pb) ++ pld [r1] + // load b + vext.8 q11, q7, q8, #\pb @ Avoid overwrite ++ pld [r1, #64] + vext.8 q8, q4, q5, #\pb + vext.8 q9, q5, q6, #\pb + vext.8 q10, q6, q7, #\pb @@ -7552,424 +6117,671 @@ index 0000000000..8c32cb23e7 +.endm + +.macro edge_32bx2_e0, body_fn, pb -+ mov r6, lr -+ -+1: subs r12, #2 -+ -+ vld1.8 {q4-q5}, [r1] -+ sub r1, #\pb -+ vld1.8 {q0-q1}, [r1] -+ add r1, #(\pb * 2) -+ vld1.8 {q8-q9}, [r1], r3 -+ sub r1, #\pb -+ vld1.8 {q6-q7}, [r1] -+ sub r1, #\pb -+ vld1.8 {q2-q3}, [r1] -+ add r1, #(\pb * 2) -+ vld1.8 {q10-q11}, [r1], r3 -+ sub r1, #\pb -+ ++ add r6, r1, r3 ++ push {r7,lr} ++ sub r1, #8 ++ add r7, r0, r2 ++ lsl r2, #1 ++1: vldmia r1, {d7-d12} ++ // load a ++ vext.8 q0, q3, q4, #16 - \pb ++ add r1, r1, r3, lsl #1 ++ vext.8 q1, q4, q5, #16 - \pb ++ subs r12, #2 ++ // load b ++ vext.8 q8, q4, q5, #\pb ++ vext.8 q9, q5, q6, #\pb ++ vldr d25, [r6, #-8] ++ vldmia r6, {d12-d15} ++ vldr d26, [r6, #32] ++ // load a ++ vext.8 q2, q12, q6, #16 - \pb ++ add r6, r6, r3, lsl #1 ++ vext.8 q3, q6, q7, #16 - \pb ++ // load b ++ vext.8 q10, q6, q7, #\pb ++ vext.8 q11, q7, q13, #\pb + bl \body_fn -+ -+ vst1.8 {q0,q1}, [r0], r2 -+ vst1.8 {q2,q3}, [r0], r2 -+ ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ vst1.8 {q2-q3}, [r7, :256], r2 + bgt 1b -+ bx r6 ++ pop {r7,pc} +.endm + +.macro edge_16b_e0, body_fn, pb ++ sub r1, #8 + mov r6, lr -+ sub r1, #\pb -+ sub r3, #\pb * 2 -+ -+1: subs r12, #1 
-+ -+ vld1.64 {q0}, [r1] @ load a -+ add r1, #\pb -+ vld1.64 {q1}, [r1, :128] @ load c -+ add r1, #\pb -+ vld1.64 {q2}, [r1], r3 @ load b ++1: vldmia r1, {d1-d4} ++ add r1, r3 ++ subs r12, #1 ++ vext.8 q0, q0, q1, #16 - \pb ++ vext.8 q2, q1, q2, #\pb + + bl \body_fn -+ vst1.8 {q0}, [r0], r2 ++ vst1.8 {q0}, [r0, :128], r2 + bgt 1b + bx r6 +.endm + +.macro edge_8bx2_e0, body_fn, pb -+ mov r6, lr -+ -+1: subs r12, #2 -+ -+ vld1.8 {d2}, [r1, :64] -+ sub r1, #\pb -+ vld1.8 {d0}, [r1] -+ add r1, #(\pb * 2) -+ vld1.8 {d4}, [r1], r3 -+ sub r1, #\pb -+ vld1.8 {d3}, [r1, :64] -+ sub r1, #\pb -+ vld1.8 {d1}, [r1] -+ add r1, #(\pb * 2) -+ vld1.8 {d5}, [r1], r3 -+ sub r1, #\pb ++ add r6, r1, r3 ++ push {r7,lr} ++ sub r1, #8 ++ add r7, r0, r2 ++ lsl r2, #1 ++1: vldmia r1, {d1-d2} ++ vldmia r6, {d3-d4} ++ vldr d6, [r1, #16] ++ subs r12, #2 ++ vldr d7, [r6, #-8] ++ add r1, r1, r3, lsl #1 ++ vext.8 d0, d1, d2, #8 - \pb ++ add r6, r6, r3, lsl #1 ++ vext.8 d5, d3, d4, #\pb ++ vext.8 d4, d2, d6, #\pb ++ vext.8 d1, d7, d3, #8 - \pb + + bl \body_fn -+ + vst1.8 {d0}, [r0, :64], r2 -+ vst1.8 {d1}, [r0, :64], r2 -+ ++ vst1.8 {d1}, [r7, :64], r2 + bgt 1b -+ bx r6 ++ pop {r7,pc} +.endm + +.macro edge_4bx4_e0, body_fn, pb -+ mov r6, lr ++ add r6, r1, r3 ++ push {r7,lr} ++ add r7, r0, r2 ++ lsl r2, #1 + -+1: subs r12, #4 -+ -+ vld1.32 {d2[0]}, [r1] -+ sub r1, #\pb -+ vld1.32 {d0[0]}, [r1] -+ add r1, #(\pb * 2) -+ vld1.32 {d4[0]}, [r1], r3 @ R -+ vld1.32 {d4[1]}, [r1] -+ sub r1, #\pb -+ vld1.32 {d2[1]}, [r1] -+ sub r1, #\pb -+ vld1.32 {d0[1]}, [r1], r3 @ L -+ vld1.32 {d1[0]}, [r1] -+ add r1, #\pb -+ vld1.32 {d3[0]}, [r1] -+ add r1, #\pb -+ vld1.32 {d5[0]}, [r1], r3 @ R -+ vld1.32 {d5[1]}, [r1] -+ sub r1, #(\pb * 2) -+ vld1.32 {d1[1]}, [r1] -+ add r1, #\pb -+ vld1.32 {d3[1]}, [r1], r3 @ M ++ tst r1, #4 ++ bne 2f ++1: // r1 (and assumed r6) are 64-bit aligned ++ vldr d2, [r1] ++ vldr d0, [r1, #-8] ++ add r1, r1, r3, lsl #1 ++ vldr d20, [r6] ++ subs r12, #4 ++ vldr d18, [r6, #-8] ++ add r6, r6, 
r3, lsl #1 ++ vldr d3, [r1] ++ vshr.u64 d4, d2, #\pb * 8 ++ vldr d1, [r1, #-8] ++ add r1, r1, r3, lsl #1 ++ vldr d21, [r6] ++ vext.8 d0, d0, d2, #8 - \pb ++ vldr d19, [r6,#-8] ++ add r6, r6, r3, lsl #1 ++ vshr.u64 d22, d20, #\pb * 8 ++ vext.8 d18, d18, d20, #8 - \pb ++ vshr.u64 d5, d3, #\pb * 8 ++ vext.8 d1, d1, d3, #8 - \pb ++ vshr.u64 d23, d21, #\pb * 8 ++ vext.8 d19, d19, d21, #8 - \pb ++ vsli.64 q1, q10, #32 ++ vsli.64 q2, q11, #32 ++ vsli.64 q0, q9, #32 + + bl \body_fn -+ -+ vst1.32 {d0[0]}, [r0], r2 -+ vst1.32 {d0[1]}, [r0], r2 -+ vst1.32 {d1[0]}, [r0], r2 -+ vst1.32 {d1[1]}, [r0], r2 -+ ++ vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vst1.32 {d1[1]}, [r7, :32], r2 + bgt 1b -+ bx r6 ++ pop {r7,pc} ++ ++2: // r1 (and assumed r6) are 32-bit but not 64-bit aligned ++ vldr d20, [r1, #-4] ++ vldr d22, [r1, #4] ++ add r1, r1, r3, lsl #1 ++ vldr d2, [r6, #-4] ++ subs r12, #4 ++ vldr d4, [r6, #4] ++ add r6, r6, r3, lsl #1 ++ vldr d21, [r1, #-4] ++ vshl.i64 d18, d20, #\pb * 8 ++ vldr d23, [r1, #4] ++ add r1, r1, r3, lsl #1 ++ vldr d3, [r6, #-4] ++ vext.8 d22, d20, d22, #\pb ++ vldr d5, [r6, #4] ++ add r6, r6, r3, lsl #1 ++ vshl.i64 d0, d2, #\pb * 8 ++ vext.8 d4, d2, d4, #\pb ++ vshl.i64 d19, d21, #\pb * 8 ++ vext.8 d23, d21, d23, #\pb ++ vshl.i64 d1, d3, #\pb * 8 ++ vext.8 d5, d3, d5, #\pb ++ vsri.64 q1, q10, #32 ++ vsri.64 q0, q9, #32 ++ vsri.64 q2, q11, #32 ++ ++ bl \body_fn ++ vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ bgt 2b ++ pop {r7,pc} +.endm + + +.macro edge_64b_e1, body_fn -+ mov r6, lr + sub r1, r3 ++ push {lr} ++ add r6, r1, #32 + // load a -+ vld1.8 {q0-q1}, [r1, :128]! -+ vld1.8 {q2-q3}, [r1, :128], r3 -+ sub r1, #32 ++ vld1.8 {q0-q1}, [r1, :256], r3 ++ vld1.8 {q2-q3}, [r6, :256], r3 + // load c -+ vld1.8 {q4-q5}, [r1, :128]! 
-+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+1: subs r12, #1 -+ // load b -+ vld1.8 {q8-q9}, [r1, :128]! -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ sub r1, #32 ++ vld1.8 {q4-q5}, [r1, :256], r3 ++ vld1.8 {q6-q7}, [r6, :256], r3 ++1: // load b ++ vld1.8 {q8-q9}, [r1, :256], r3 ++ subs r12, #1 ++ vld1.8 {q10-q11}, [r6, :256], r3 + bl \body_fn + vstm r0, {q0-q3} -+ add r0, r0, r2 + // copy c to a + vmov.64 q0, q4 ++ pld [r1, r3] + vmov.64 q1, q5 ++ pople {lr} + vmov.64 q2, q6 ++ bxle lr + vmov.64 q3, q7 ++ add r0, r0, r2 + // copy b to c + vmov.64 q4, q8 + vmov.64 q5, q9 + vmov.64 q6, q10 + vmov.64 q7, q11 -+ bgt 1b -+ bx r6 ++ b 1b +.endm + +.macro edge_32bx2_e1, body_fn ++ sub r6, r1, r3 ++ vld1.8 {q2-q3}, [r1, :256], r3 ++ vld1.8 {q0-q1}, [r6, :256] + mov r6, lr -+ sub r1, r3 -+ // load a -+ vld1.8 {q0-q1}, [r1, :128], r3 -+ vld1.8 {q4-q5}, [r1, :128], r3 + -+1: subs r12, #2 -+ @ Given the data duplication here we could obviously do better than ++1: @ Given the data duplication here we could obviously do better than + @ using the generic body_fn but it almost certainly isn't worth it -+ vmov q2, q4 -+ vmov q3, q5 -+ vld1.8 {q8-q9}, [r1, :128], r3 -+ vld1.8 {q10-q11}, [r1, :128], r3 ++ vld1.8 {q8-q9}, [r1, :256], r3 ++ subs r12, #2 ++ vmov q4, q2 ++ vmov q5, q3 ++ vld1.8 {q10-q11}, [r1, :256], r3 + vmov q6, q8 + vmov q7, q9 + + bl \body_fn + -+ vst1.8 {q0,q1}, [r0], r2 -+ vst1.8 {q2,q3}, [r0], r2 -+ -+ // copy c to a -+ vmov.64 q0, q8 -+ vmov.64 q1, q9 -+ -+ // copy b to c -+ vmov.64 q4, q10 -+ vmov.64 q5, q11 -+ bgt 1b -+ bx r6 ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ // copy b to a ++ vmov q0, q8 ++ vmov q1, q9 ++ vst1.8 {q2-q3}, [r0, :256], r2 ++ vmov q2, q10 ++ bxle r6 ++ vmov q3, q11 ++ b 1b +.endm + +.macro edge_16b_e1, body_fn -+ mov r6, lr -+ sub r1, r3 -+ // load a -+ vld1.8 {q0}, [r1, :128], r3 ++ sub r6, r1, r3 + // load c + vld1.8 {q1}, [r1, :128], r3 -+1: subs r12, #1 -+ // load b ++ // load a ++ vld1.8 {q0}, [r6, :128] ++ mov r6, lr ++1: // load b + 
vld1.8 {q2}, [r1, :128], r3 + bl \body_fn -+ vst1.8 {q0}, [r0], r2 ++ vst1.8 {q0}, [r0, :128], r2 ++ subs r12, #1 + // copy c to a + vmov.64 q0, q1 ++ bxle r6 + // copy b to c + vmov.64 q1, q2 -+ bgt 1b -+ bx r6 ++ b 1b +.endm + +.macro edge_8bx2_e1, body_fn -+ mov r6, lr -+ sub r1, r3 -+ // load a -+ vld1.8 {d0}, [r1, :64], r3 -+ vld1.8 {d2}, [r1, :64], r3 -+ -+1: subs r12, #2 -+ @ Given the data duplication here we could obviously do better than ++ sub r6, r1, r3 ++ lsl r3, #1 ++ push {r7, lr} ++ vld1.8 {d1}, [r1, :64], r3 ++ vld1.8 {d0}, [r6, :64], r3 ++ add r7, r0, r2 ++ lsl r2, #1 ++1: @ Given the data duplication here we could obviously do better than + @ using the generic body_fn but it almost certainly isn't worth it -+ vmov.64 d1, d2 -+ vld1.8 {d4}, [r1, :64], r3 ++ vld1.8 {d4}, [r6, :64], r3 ++ vmov d2, d1 + vld1.8 {d5}, [r1, :64], r3 -+ vmov.64 d3, d4 ++ subs r12, #2 ++ vmov d3, d4 + + bl \body_fn + -+ vst1.8 {d0}, [r0], r2 -+ vst1.8 {d1}, [r0], r2 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 + -+ // copy c to a -+ vmov.64 d0, d4 -+ // copy b to c -+ vmov.64 d2, d5 ++ // copy b to a ++ vmov q0, q2 + bgt 1b -+ bx r6 ++ pop {r7, pc} +.endm + +.macro edge_4bx4_e1, body_fn -+ mov r6, lr -+debug_me: -+ sub r1, r3 -+ // load a -+ vld1.32 {d0[0]}, [r1], r3 -+ vld1.32 {d0[1]}, [r1], r3 -+ -+1: subs r12, #4 -+ @ Given the data duplication here we could probably do better than -+ @ using the generic body_fn but it almost certainly isn't worth it -+ vld1.32 {d4[0]}, [r1], r3 -+ vld1.32 {d4[1]}, [r1], r3 -+ vld1.32 {d5[0]}, [r1], r3 -+ vld1.32 {d5[1]}, [r1], r3 -+ -+ vmov.32 d1, d4 ++ sub r6, r1, r3 ++ lsl r3, #1 ++ push {r7, lr} ++ vld1.32 {d0[1]}, [r1, :32], r3 ++ add r7, r0, r2 ++ vld1.32 {d0[0]}, [r6, :32], r3 ++ lsl r2, #1 ++ vld1.32 {d4[1]}, [r1, :32], r3 ++ vld1.32 {d4[0]}, [r6, :32], r3 ++ vld1.32 {d5[1]}, [r1, :32], r3 ++ vld1.32 {d5[0]}, [r6, :32], r3 ++ vmov d1, d4 + vext.32 d2, d0, d4, #1 ++ subs r12, #4 ++ vmov d22, d5 + vext.32 d3, d4, 
d5, #1 ++ b 2f + ++1: vst1.32 {d0[0]}, [r0, :32], r2 ++ vext.32 d2, d22, d4, #1 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vmov d0, d22 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vext.32 d3, d4, d5, #1 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ vmov d1, d4 ++ vmov d22, d5 ++2: @ Given the data duplication here we could probably do better than ++ @ using the generic body_fn but it almost certainly isn't worth it + bl \body_fn ++ ble 3f ++ vld1.32 {d4[0]}, [r6, :32], r3 ++ subs r12, #4 ++ vld1.32 {d4[1]}, [r1, :32], r3 ++ vld1.32 {d5[0]}, [r6, :32], r3 ++ vld1.32 {d5[1]}, [r1, :32], r3 ++ b 1b + -+ vst1.32 {d0[0]}, [r0], r2 -+ vst1.32 {d0[1]}, [r0], r2 -+ vst1.32 {d1[0]}, [r0], r2 -+ vst1.32 {d1[1]}, [r0], r2 -+ -+ vmov.32 d0, d5 -+ bgt 1b -+ bx r6 ++3: vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32] ++ vst1.32 {d1[1]}, [r7, :32] ++ pop {r7, pc} +.endm + +.macro edge_64b_e2, body_fn, pb -+ mov r6, lr -+ sub r1, #32 -+ sub r3, #(32 - \pb) ++ push {lr} ++ sub r6, r1, r3 ++ // load c and a ++ vld1.8 {q4-q5}, [r1, :128] ++ vldr d25, [r6, #-8] ++ vldmia r6, {d16-d23} ++ vext.8 q0, q12, q8, #16 - \pb ++ add r6, r1, #32 ++ vext.8 q1, q8, q9, #16 - \pb ++ add r1, r1, r3 ++ vext.8 q2, q9, q10, #16 - \pb ++ vld1.8 {q6-q7}, [r6, :128] ++ sub r6, r1, r3 ++ vext.8 q3, q10, q11, #16 - \pb + -+1: sub r1, r3 -+ // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 ++1: // load b ++ vldmia r1, {d16-d24} ++ vext.8 q8, q8, q9, #\pb ++ pld [r1, r3] ++ vext.8 q9, q9, q10, #\pb + subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ // load b -+ vld1.8 {q8-q9}, [r1]! 
-+ vld1.8 {q10-q11}, [r1] -+ sub r1, #(64 + \pb) ++ vext.8 q10, q10, q11, #\pb ++ vext.8 q11, q11, q12, #\pb + bl \body_fn -+ vstm r0, {q0-q3} ++ // next a is mostly available in c ++ vldr d25, [r6, #-8] ++ vstmia r0, {q0-q3} ++ vext.8 q3, q6, q7, #16 - \pb ++ pople {lr} ++ vext.8 q2, q5, q6, #16 - \pb ++ bxle lr ++ vext.8 q1, q4, q5, #16 - \pb ++ add r6, r6, r3 ++ vext.8 q0, q12, q4, #16 - \pb + add r0, r0, r2 -+ bgt 1b -+ -+ add r3, #(32 - \pb) -+ bx r6 ++ // next c is mostly available in b ++ vldr d8, [r1] ++ vext.8 d9, d16, d17, #8 - \pb ++ vext.8 q5, q8, q9, #16 - \pb ++ add r1, r1, r3 ++ vext.8 q6, q9, q10, #16 - \pb ++ pld [r6, #-8] ++ vext.8 q7, q10, q11, #16 - \pb ++ b 1b +.endm + +.macro edge_32bx2_e2, body_fn, pb -+ mov r6, lr -+ sub r1, #\pb -+ -+1: sub r1, r3 -+ vld1.8 {q0-q1}, [r1], r3 -+ vld1.8 {q2-q3}, [r1] ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ // load a and first 32b of c ++ vld1.8 {q4-q5}, [r1, :256] ++ vldr d25, [r6, #-8] ++ vld1.8 {q13-q14}, [r6, :256] ++ vldr d31, [r1, #-8] ++ add r6, r6, r3, lsl #1 ++ vext.8 q0, q12, q13, #16 - \pb ++ add r1, r1, r3, lsl #1 ++ vext.8 q1, q13, q14, #16 - \pb ++ vext.8 q2, q15, q4, #16 - \pb ++ vext.8 q3, q4, q5, #16 - \pb ++1: ++ // load second 32b of c and second 32b of b ++ vldmia r6, {d12-d16} ++ vldmia r1, {d20-d24} ++ // first 32b of b is mostly available in second 32b of c ++ vext.8 q9, q7, q8, #\pb + subs r12, #2 -+ // load c -+ add r1, #\pb -+ vld1.8 {q4-q5}, [r1, :128], r3 -+ vld1.8 {q6-q7}, [r1, :128] -+ // load b -+ add r1, #\pb -+ vld1.8 {q8-q9}, [r1], r3 -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #(\pb * 2) ++ vext.8 q8, q6, q7, #\pb ++ vext.8 q10, q10, q11, #\pb ++ vext.8 q11, q11, q12, #\pb + + bl \body_fn + -+ vst1.8 {q0-q1}, [r0], r2 -+ vst1.8 {q2-q3}, [r0], r2 -+ bgt 1b ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ vst1.8 {q2-q3}, [r7, :256], r2 ++ ble 2f + -+ bx r6 ++ vldr d25, [r6, #-8] ++ add r6, r6, r3, lsl #1 ++ vldr d8, [r1] ++ vext.8 d9, d20, d21, #8 - \pb ++ vldr d31, 
[r1, #-8] ++ add r1, r1, r3, lsl #1 ++ // first 32b of a is mostly available in second 32b of c ++ vext.8 q1, q6, q7, #16 - \pb ++ vext.8 q0, q12, q6, #16 - \pb ++ // first 32b of c is mostly available in second 32b of b ++ vext.8 q5, q10, q11, #16 - \pb ++ // second 32b of a is mostly available in first 32b of c ++ vext.8 q2, q15, q4, #16 - \pb ++ vext.8 q3, q4, q5, #16 - \pb ++ b 1b ++ ++2: pop {r7, pc} +.endm + +.macro edge_16b_e2, body_fn, pb -+ mov r6, lr -+ add r3, #\pb -+ -+1: sub r1, r3 -+ // load a -+ vld1.8 {q0}, [r1], r3 -+ subs r12, #1 -+ // load c ++ push {lr} ++ sub r6, r1, r3 + vld1.8 {q1}, [r1, :128], r3 -+ // load b -+ vld1.8 {q2}, [r1] -+ sub r1, #\pb ++ vldr d19, [r6, #-8] ++ vld1.8 {q10}, [r6, :128], r3 ++ ++1: vldmia r1, {d4-d6} ++ vext.8 q0, q9, q10, #16 - \pb ++ subs r12, #1 ++ vext.8 q2, q2, q3, #\pb + bl \body_fn -+ vst1.8 {q0}, [r0], r2 -+ bgt 1b -+ bx r6 ++ vst1.8 {q0}, [r0, :128], r2 ++ ble 2f ++ vmov q10, q1 ++ vldr d2, [r1] ++ add r1, r1, r3 ++ vldr d19, [r6, #-8] ++ add r6, r6, r3 ++ vext.8 d3, d4, d5, #8 - \pb ++ b 1b ++ ++2: pop {pc} +.endm + +.macro edge_8bx2_e2, body_fn, pb -+ mov r6, lr -+ sub r1, #\pb ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ vldr d18, [r6, #-8] ++ vldr d19, [r6] ++ add r6, r6, r3, lsl #1 ++ vldr d20, [r1, #-8] ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldmia r6, {d3-d4} ++ vld1.8 {d21-d22}, [r1, :128] + -+1: sub r1, r3 -+ vld1.8 {d0}, [r1], r3 -+ vld1.8 {d1}, [r1] ++1: vext.8 d0, d18, d19, #8 - \pb ++ vext.8 d4, d3, d4, #\pb ++ vext.8 d1, d20, d2, #8 - \pb + subs r12, #2 -+ // load c -+ add r1, #\pb -+ vld1.8 {d2}, [r1, :64], r3 -+ vld1.8 {d3}, [r1, :64] -+ // load b -+ add r1, #\pb -+ vld1.8 {d4}, [r1], r3 -+ vld1.8 {d5}, [r1] -+ sub r1, #(\pb * 2) ++ vext.8 d5, d21, d22, #\pb + + bl \body_fn + -+ vst1.8 {d0}, [r0], r2 -+ vst1.8 {d1}, [r0], r2 -+ bgt 1b ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ ble 2f + -+ bx r6 ++ vldr d18, [r6, #-8] ++ add r6, r6, r3, lsl 
#1 ++ vldr d20, [r1, #-8] ++ vmov d19, d3 ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldmia r6, {d3-d4} ++ vld1.8 {d21-d22}, [r1, :128] ++ b 1b ++ ++2: pop {r7, pc} +.endm + +.macro edge_4bx4_e2, body_fn, pb -+ mov r6, lr -+ sub r1, #\pb ++ sub r6, r1, r3 ++ push {r7-r9, lr} ++ add r8, r1, r3 ++ sub r6, r6, #\pb ++ add r8, r8, #\pb ++ add r7, r0, r2 ++ lsl r2, #1 + -+1: sub r1, r3 -+ @ line 0 {d0[0], -, - } r1 lo -+ vld1.32 {d0[0]}, [r1], r3 ++1: vld1.32 {d0[0]}, [r6], r3 + subs r12, #4 -+ @ Line 1 {d0[1], d2[0], - } r1 lo -+ vld1.32 {d0[1]}, [r1] -+ add r1, #\pb + vld1.32 {d2[0]}, [r1], r3 -+ @ Line 2 {d1[0], d2[1], d4[0]} r1 mid -+ vld1.32 {d2[1]}, [r1] -+ sub r1, #\pb -+ vld1.32 {d1[0]}, [r1] -+ add r1, #\pb * 2 -+ vld1.32 {d4[0]}, [r1], r3 -+ @ Line 2 {d1[1], d3[0], d4[1]} r1 hi -+ vld1.32 {d4[1]}, [r1] -+ sub r1, #\pb * 2 -+ vld1.32 {d1[1]}, [r1] -+ add r1, #\pb ++ vld1.32 {d4[0]}, [r8], r3 ++ vld1.32 {d0[1]}, [r6], r3 ++ vld1.32 {d2[1]}, [r1], r3 ++ vld1.32 {d4[1]}, [r8], r3 ++ vld1.32 {d1[0]}, [r6], r3 + vld1.32 {d3[0]}, [r1], r3 -+ @ Line 3 {-, d3[1], d5[0]} r1 mid -+ vld1.32 {d3[1]}, [r1] -+ add r1, #\pb -+ vld1.32 {d5[0]}, [r1], r3 -+ @ Line 4 {-, -, d5[1]} r1 hi -+ vld1.32 {d5[1]}, [r1] -+ sub r1, #(\pb * 2) ++ vld1.32 {d5[0]}, [r8], r3 ++ vld1.32 {d1[1]}, [r6], r3 ++ vld1.32 {d3[1]}, [r1], r3 ++ vld1.32 {d5[1]}, [r8], r3 + + bl \body_fn + -+ vst1.32 {d0[0]}, [r0], r2 -+ vst1.32 {d0[1]}, [r0], r2 -+ vst1.32 {d1[0]}, [r0], r2 -+ vst1.32 {d1[1]}, [r0], r2 ++ vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vst1.32 {d1[1]}, [r7, :32], r2 + bgt 1b + -+ bx r6 ++ pop {r7-r9,pc} +.endm + +.macro edge_64b_e3, body_fn, pb -+ @ e3 is the same as e2 but with the X offset reversed -+ edge_64b_e2 \body_fn, (-\pb) ++ push {lr} ++ sub r6, r1, r3 ++ // load c and a ++ vld1.8 {q4-q5}, [r1, :128] ++ vldmia r6, {d16-d24} ++ vext.8 q0, q8, q9, #\pb ++ add r6, r1, #32 ++ vext.8 q1, q9, q10, #\pb ++ add r1, r1, r3 ++ 
vext.8 q2, q10, q11, #\pb ++ vld1.8 {q6-q7}, [r6, :128] ++ sub r6, r1, r3 ++ vext.8 q3, q11, q12, #\pb ++ ++1: // load b ++ vldr d17, [r1, #-8] ++ vldmia r1, {d18-d25} ++ vext.8 q8, q8, q9, #16 - \pb ++ pld [r1, r3] ++ vext.8 q9, q9, q10, #16 - \pb ++ subs r12, #1 ++ vext.8 q10, q10, q11, #16 - \pb ++ vext.8 q11, q11, q12, #16 - \pb ++ bl \body_fn ++ // next a is mostly available in c ++ vldr d24, [r6, #64] ++ vstmia r0, {q0-q3} ++ vext.8 q0, q4, q5, #\pb ++ pople {lr} ++ vext.8 q1, q5, q6, #\pb ++ bxle lr ++ vext.8 q2, q6, q7, #\pb ++ add r6, r6, r3 ++ vext.8 q3, q7, q12, #\pb ++ add r0, r0, r2 ++ // next c is mostly available in b ++ vext.8 d14, d22, d23, #\pb ++ vldr d15, [r1, #56] ++ vext.8 q4, q8, q9, #\pb ++ add r1, r1, r3 ++ vext.8 q5, q9, q10, #\pb ++ vext.8 q6, q10, q11, #\pb ++ b 1b +.endm + +.macro edge_32bx2_e3, body_fn, pb -+ @ e3 is the same as e2 but with the X offset reversed -+ edge_32bx2_e2 \body_fn, (-\pb) ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ // load a and first 32b of c ++ vldmia r1, {d8-d12} ++ vldmia r6, {d24-d28} ++ vext.8 q2, q4, q5, #\pb ++ add r6, r6, r3, lsl #1 ++ vext.8 q3, q5, q6, #\pb ++ add r1, r1, r3, lsl #1 ++ vext.8 q0, q12, q13, #\pb ++ vext.8 q1, q13, q14, #\pb ++1: ++ // load second 32b of c and second 32b of b ++ vldr d25, [r6, #-8] ++ subs r12, #2 ++ vldmia r6, {d12-d15} ++ vldr d27, [r1, #-8] ++ vldmia r1, {d20-d23} ++ // first 32b of b is mostly available in second 32b of c ++ vext.8 q8, q12, q6, #16 - \pb ++ vext.8 q9, q6, q7, #16 - \pb ++ vext.8 q11, q10, q11, #16 - \pb ++ vext.8 q10, q13, q10, #16 - \pb ++ ++ bl \body_fn ++ ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ vst1.8 {q2-q3}, [r7, :256], r2 ++ ble 2f ++ ++ vldr d24, [r6, #32] ++ add r6, r6, r3, lsl #1 ++ vldr d11, [r1, #24] ++ vext.8 d10, d22, d23, #\pb ++ vldr d30, [r1, #32] ++ add r1, r1, r3, lsl #1 ++ // first 32b of a is mostly available in second 32b of c ++ vext.8 q0, q6, q7, #\pb ++ vext.8 q1, q7, q12, #\pb ++ // first 32b of c is 
mostly available in second 32b of b ++ vext.8 q4, q10, q11, #\pb ++ // second 32b of a is mostly available in first 32b of c ++ vext.8 q3, q5, q15, #\pb ++ vext.8 q2, q4, q5, #\pb ++ b 1b ++ ++2: pop {r7, pc} +.endm + +.macro edge_16b_e3, body_fn, pb -+ @ e3 is the same as e2 but with the X offset reversed -+ edge_16b_e2 \body_fn, (-\pb) ++ push {lr} ++ sub r6, r1, r3 ++ vld1.8 {q1}, [r1, :128], r3 ++ vldmia r6, {d18-d20} ++ add r6, r6, r3 ++ ++1: vldr d5, [r1, #-8] ++ vld1.8 {q3}, [r1, :128] ++ subs r12, #1 ++ vext.8 q0, q9, q10, #\pb ++ vext.8 q2, q2, q3, #16 - \pb ++ bl \body_fn ++ vst1.8 {q0}, [r0, :128], r2 ++ ble 2f ++ vmov q9, q1 ++ vldr d3, [r1, #8] ++ add r1, r1, r3 ++ vldr d20, [r6, #16] ++ add r6, r6, r3 ++ vext.8 d2, d4, d5, #\pb ++ b 1b ++ ++2: pop {pc} +.endm + +.macro edge_8bx2_e3, body_fn, pb -+ @ e3 is the same as e2 but with the X offset reversed -+ edge_8bx2_e2 \body_fn, (-\pb) ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ vld1.8 {d18-d19}, [r6] ++ add r6, r6, r3, lsl #1 ++ vldr d20, [r1, #8] ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldr d4, [r6, #-8] ++ vldr d3, [r6] ++ vldr d21, [r1, #-8] ++ vldr d22, [r1] ++ ++1: vext.8 d0, d18, d19, #\pb ++ vext.8 d4, d4, d3, #8 - \pb ++ vext.8 d1, d2, d20, #\pb ++ subs r12, #2 ++ vext.8 d5, d21, d22, #8 - \pb ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ ble 2f ++ ++ vldr d19, [r6, #8] ++ add r6, r6, r3, lsl #1 ++ vldr d20, [r1, #8] ++ vmov d18, d3 ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldr d4, [r6, #-8] ++ vldr d3, [r6] ++ vldr d21, [r1, #-8] ++ vldr d22, [r1] ++ b 1b ++ ++2: pop {r7, pc} +.endm + +.macro edge_4bx4_e3, body_fn, pb @@ -8349,80 +7161,6 @@ index 1bf1c620d6..ccfa991f60 100644 const uint8_t *bytestream_start; const uint8_t *bytestream; const uint8_t *bytestream_end; -diff --git a/libavcodec/cllc.c b/libavcodec/cllc.c -index af0f6da2e9..bd491c0c55 100644 ---- a/libavcodec/cllc.c -+++ b/libavcodec/cllc.c -@@ -34,6 +34,10 @@ 
- #define VLC_DEPTH 2 - - -+#define VLC_BITS 7 -+#define VLC_DEPTH 2 -+ -+ - typedef struct CLLCContext { - AVCodecContext *avctx; - BswapDSPContext bdsp; -diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c -index 6a13bbbf0e..478b7c0ffc 100644 ---- a/libavcodec/codec_desc.c -+++ b/libavcodec/codec_desc.c -@@ -1665,6 +1665,41 @@ static const AVCodecDescriptor codec_descriptors[] = { - .props = AV_CODEC_PROP_LOSSLESS, - .mime_types= MT("image/png"), - }, -+ { -+ .id = AV_CODEC_ID_CFHD, -+ .type = AVMEDIA_TYPE_VIDEO, -+ .name = "cfhd", -+ .long_name = NULL_IF_CONFIG_SMALL("Cineform HD"), -+ .props = AV_CODEC_PROP_LOSSY, -+ }, -+ { -+ .id = AV_CODEC_ID_TRUEMOTION2RT, -+ .type = AVMEDIA_TYPE_VIDEO, -+ .name = "truemotion2rt", -+ .long_name = NULL_IF_CONFIG_SMALL("Duck TrueMotion 2.0 Real Time"), -+ .props = AV_CODEC_PROP_LOSSY, -+ }, -+ { -+ .id = AV_CODEC_ID_MAGICYUV, -+ .type = AVMEDIA_TYPE_VIDEO, -+ .name = "magicyuv", -+ .long_name = NULL_IF_CONFIG_SMALL("MagicYUV Lossless Video"), -+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, -+ }, -+ { -+ .id = AV_CODEC_ID_SHEERVIDEO, -+ .type = AVMEDIA_TYPE_VIDEO, -+ .name = "sheervideo", -+ .long_name = NULL_IF_CONFIG_SMALL("BitJazz SheerVideo"), -+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, -+ }, -+ { -+ .id = AV_CODEC_ID_YLC, -+ .type = AVMEDIA_TYPE_VIDEO, -+ .name = "ylc", -+ .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"), -+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, -+ }, - - /* various PCM "codecs" */ - { -diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c -index dd0a965af0..053325c26b 100644 ---- a/libavcodec/h264_parser.c -+++ b/libavcodec/h264_parser.c -@@ -115,7 +115,7 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf, - goto found; - } - } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA || -- nalu_type == H264_NAL_IDR_SLICE) { -+ nalu_type == H264_NAL_IDR_SLICE)) { - state += 8; - continue; 
- } diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c index 0b1195dc3e..5ef81fa739 100644 --- a/libavcodec/mmaldec.c @@ -8461,7 +7199,7 @@ index 8da2a9735e..9089f9b4ea 100644 { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c -index d181b74570..76e844caa8 100644 +index d181b74570..c52c450956 100644 --- a/libavcodec/rawenc.c +++ b/libavcodec/rawenc.c @@ -24,6 +24,7 @@ @@ -8477,7 +7215,7 @@ index d181b74570..76e844caa8 100644 #include "libavutil/imgutils.h" #include "libavutil/internal.h" +#include "libavutil/avassert.h" -+#if CONFIG_RPI ++#if CONFIG_SAND +#include "libavutil/rpi_sand_fns.h" +#endif @@ -8487,7 +7225,7 @@ index d181b74570..76e844caa8 100644 return 0; } -+#if CONFIG_RPI ++#if CONFIG_SAND +static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) +{ @@ -8543,7 +7281,7 @@ index d181b74570..76e844caa8 100644 if (ret < 0) return ret; -+#if CONFIG_RPI ++#if CONFIG_SAND + if (av_rpi_is_sand_frame(frame)) { + ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame); + *got_packet = (ret == 0); @@ -8556,10 +7294,10 @@ index d181b74570..76e844caa8 100644 if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c new file mode 100644 -index 0000000000..e498c1a3eb +index 0000000000..e02c26fea6 --- /dev/null +++ b/libavcodec/rpi_hevc_cabac.c -@@ -0,0 +1,2381 @@ +@@ -0,0 +1,2332 @@ +/* + * HEVC CABAC decoding + * @@ -8611,6 +7349,17 @@ index 0000000000..e498c1a3eb +// code size. 
+#define USE_N_END_1 1 + ++#if !USE_BY22_DIV ++// * 1/x @ 32 bits gets us 22 bits of accuracy ++#define CABAC_BY22_PEEK_BITS 22 ++#else ++// A real 32-bit divide gets us another bit ++// If we have a 64 bit int & a unit time divider then we should get a lot ++// of bits (55) but that is untested and it is unclear if it would give ++// us a large advantage ++#define CABAC_BY22_PEEK_BITS 23 ++#endif ++ +#if ARCH_ARM +#include "arm/rpi_hevc_cabac.h" +#endif @@ -9154,6 +7903,16 @@ index 0000000000..e498c1a3eb +} +#endif + ++static inline int cabac_overflow(const CABACContext * const cc) ++{ ++ av_assert0(cc->bytestream >= cc->bytestream_start); ++ return cc->bytestream >= cc->bytestream_end + 4; ++} ++ ++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc) ++{ ++ return cabac_overflow(&lc->cc); ++} + +#if !USE_BY22 +// If no by22 then _by22 functions will revert to normal and so _peek/_flush @@ -9179,17 +7938,6 @@ index 0000000000..e498c1a3eb +// O(1) nature of the code more worthwhile. + + -+#if !USE_BY22_DIV -+// * 1/x @ 32 bits gets us 22 bits of accuracy -+#define CABAC_BY22_PEEK_BITS 22 -+#else -+// A real 32-bit divide gets us another bit -+// If we have a 64 bit int & a unit time divider then we should get a lot -+// of bits (55) but that is untested and it is unclear if it would give -+// us a large advantage -+#define CABAC_BY22_PEEK_BITS 23 -+#endif -+ +// Bypass block start +// Must be called before _by22_peek is used as it sets the CABAC environment +// into the correct state. _by22_finish must be called to return to 'normal' @@ -9274,30 +8022,19 @@ index 0000000000..e498c1a3eb +#endif // USE_BY22 + + -+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc, int ctb_addr_ts) ++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc) +{ -+ // ???? Does this work with tiles + WPP? 
(No) -+ // **** Need to save rice state too -+ // pred_qpy is handled by get_qPy_pred and lc->first_qp_group -+ if (s->ps.pps->entropy_coding_sync_enabled_flag && -+ (ctb_addr_ts % s->ps.sps->ctb_width == 2 || -+ (s->ps.sps->ctb_width == 2 && -+ ctb_addr_ts % s->ps.sps->ctb_width == 0))) { -+ memcpy(s->cabac_state, lc->cabac_state, HEVC_CONTEXTS); -+ } ++ memcpy(s->cabac_save->rice, lc->stat_coeff, 4); ++ memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS); +} + +static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) +{ -+ memcpy(lc->cabac_state, s->cabac_state, HEVC_CONTEXTS); ++ memcpy(lc->stat_coeff, s->cabac_save->rice, 4); ++ memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS); +} + -+static int cabac_reinit(HEVCRpiLocalContext *lc) -+{ -+ return skip_bytes(&lc->cc, 0) == NULL ? AVERROR_INVALIDDATA : 0; -+} -+ -+static int cabac_init_decoder(HEVCRpiLocalContext * const lc) ++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc) +{ + GetBitContext * const gb = &lc->gb; + skip_bits(gb, 1); @@ -9331,68 +8068,19 @@ index 0000000000..e498c1a3eb + lc->stat_coeff[i] = 0; +} + -+int ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, int ctb_addr_ts) ++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags) +{ -+ if (ctb_addr_ts == s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]) { -+ int ret = cabac_init_decoder(lc); -+ if (ret < 0) -+ return ret; -+ if (s->sh.dependent_slice_segment_flag == 0 || -+ (s->ps.pps->tiles_enabled_flag && -+ s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1])) -+ cabac_init_state(s, lc); -+ -+ if (!s->sh.first_slice_in_pic_flag && -+ s->ps.pps->entropy_coding_sync_enabled_flag) { -+ if (ctb_addr_ts % s->ps.sps->ctb_width == 0) { -+ if (s->ps.sps->ctb_width == 1) -+ cabac_init_state(s, lc); -+ else if (s->sh.dependent_slice_segment_flag == 1) -+ 
load_states(s, lc); -+ } -+ } -+ } else { -+ if (s->ps.pps->tiles_enabled_flag && -+ s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1]) { -+ if (!lc->wpp_init) { -+ int ret; -+ if (s->threads_number == 1) // **** Ummm... can only be 1 in our world but this is a wpp test -+ ret = cabac_reinit(lc); -+ else -+ ret = cabac_init_decoder(lc); -+ if (ret < 0) -+ return ret; -+ } -+ lc->wpp_init = 0; -+ -+ cabac_init_state(s, lc); -+ } -+ if (s->ps.pps->entropy_coding_sync_enabled_flag) { -+ if (ctb_addr_ts % s->ps.sps->ctb_width == 0) { // ** Tiles + WPP bust -+ // If wpp_init is set then we have been set up in the correct pos -+ if (!lc->wpp_init) { -+ int ret; -+ // * Strong argument for putting the read terminate & align -+ // at the end of the previous block (where it logically -+ // resides) rather than here -+ get_cabac_terminate(&lc->cc); -+ if (s->threads_number == 1) -+ ret = cabac_reinit(lc); -+ else -+ ret = cabac_init_decoder(lc); -+ if (ret < 0) -+ return ret; -+ } -+ lc->wpp_init = 0; -+ -+ if (s->ps.sps->ctb_width == 1) -+ cabac_init_state(s, lc); -+ else -+ load_states(s, lc); -+ } -+ } ++ if (lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0) ++ { ++ lc->qPy_pred = s->sh.slice_qp; ++ cabac_init_state(s, lc); + } -+ return 0; ++ else if ((ctb_flags & CTB_TS_FLAGS_CLOAD) != 0) ++ { ++ lc->qPy_pred = s->sh.slice_qp; ++ load_states(s, lc); ++ } ++ lc->cabac_init_req = 0; +} + +#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx)) @@ -10737,7 +9425,8 @@ index 0000000000..e498c1a3eb + } + } + } while ((i = next_subset(lc, i, c_idx_nz, -+ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0); ++ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 && ++ !cabac_overflow(&lc->cc)); + + if (lc->cu.cu_transquant_bypass_flag) { + if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && @@ -12134,10 +10823,10 @@ index 0000000000..a1d6d56b04 +} diff --git 
a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c new file mode 100644 -index 0000000000..9db79e658f +index 0000000000..93f3530ff5 --- /dev/null +++ b/libavcodec/rpi_hevc_mvs.c -@@ -0,0 +1,769 @@ +@@ -0,0 +1,761 @@ +/* + * HEVC video decoder + * @@ -12188,13 +10877,9 @@ index 0000000000..9db79e658f + lc->na.cand_up = (lc->ctb_up_flag || y0b); + lc->na.cand_left = (lc->ctb_left_flag || x0b); + lc->na.cand_up_left = (!x0b && !y0b) ? lc->ctb_up_left_flag : lc->na.cand_left && lc->na.cand_up; -+ lc->na.cand_up_right_sap = -+ ((x0b + nPbW) == (1 << s->ps.sps->log2_ctb_size)) ? -+ lc->ctb_up_right_flag && !y0b : lc->na.cand_up; -+ lc->na.cand_up_right = -+ lc->na.cand_up_right_sap -+ && (x0 + nPbW) < lc->end_of_tiles_x; -+ lc->na.cand_bottom_left = ((y0 + nPbH) >= lc->end_of_tiles_y) ? 0 : lc->na.cand_left; ++ lc->na.cand_up_right = (x0 + nPbW) >= lc->end_of_ctb_x ? ++ (lc->ctb_up_right_flag && !y0b) : lc->na.cand_up; ++ lc->na.cand_bottom_left = ((y0 + nPbH) >= lc->end_of_ctb_y) ? 0 : lc->na.cand_left; +} + +/* @@ -12436,7 +11121,7 @@ index 0000000000..9db79e658f + const int cand_left = lc->na.cand_left; + const int cand_up_left = lc->na.cand_up_left; + const int cand_up = lc->na.cand_up; -+ const int cand_up_right = lc->na.cand_up_right_sap; ++ const int cand_up_right = lc->na.cand_up_right; + + const int xA1 = x0 - 1; + const int yA1 = y0 + nPbH - 1; @@ -12503,7 +11188,6 @@ index 0000000000..9db79e658f + + // above right spatial merge candidate + is_available_b0 = AVAILABLE(cand_up_right, B0) && -+ xB0 < s->ps.sps->width && + PRED_BLOCK_AVAILABLE(B0) && + !is_diff_mer(s, xB0, yB0, x0, y0); + @@ -12517,7 +11201,6 @@ index 0000000000..9db79e658f + + // left bottom spatial merge candidate + is_available_a0 = AVAILABLE(cand_bottom_left, A0) && -+ yA0 < s->ps.sps->height && + PRED_BLOCK_AVAILABLE(A0) && + !is_diff_mer(s, xA0, yA0, x0, y0); + @@ -12750,7 +11433,7 @@ index 0000000000..9db79e658f + const int cand_left = lc->na.cand_left; + const int cand_up_left = 
lc->na.cand_up_left; + const int cand_up = lc->na.cand_up; -+ const int cand_up_right = lc->na.cand_up_right_sap; ++ const int cand_up_right = lc->na.cand_up_right; + ref_idx_curr = LX; + ref_idx = mv->ref_idx[LX]; + pred_flag_index_l0 = LX; @@ -12761,7 +11444,6 @@ index 0000000000..9db79e658f + yA0 = y0 + nPbH; + + is_available_a0 = AVAILABLE(cand_bottom_left, A0) && -+ yA0 < s->ps.sps->height && + PRED_BLOCK_AVAILABLE(A0); + + //left spatial merge candidate @@ -12816,7 +11498,6 @@ index 0000000000..9db79e658f + yB0 = y0 - 1; + + is_available_b0 = AVAILABLE(cand_up_right, B0) && -+ xB0 < s->ps.sps->width && + PRED_BLOCK_AVAILABLE(B0); + + // above spatial merge candidate @@ -13099,10 +11780,10 @@ index 0000000000..4b4d032a16 +#endif /* AVCODEC_RPI_HEVC_PARSE_H */ diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c new file mode 100644 -index 0000000000..f65efa1015 +index 0000000000..d28ae0ec92 --- /dev/null +++ b/libavcodec/rpi_hevc_ps.c -@@ -0,0 +1,1712 @@ +@@ -0,0 +1,1756 @@ +/* + * HEVC Parameter Set decoding + * @@ -14367,9 +13048,10 @@ index 0000000000..f65efa1015 + av_freep(&pps->col_idxX); + av_freep(&pps->ctb_addr_rs_to_ts); + av_freep(&pps->ctb_addr_ts_to_rs); -+ av_freep(&pps->tile_pos_rs); ++ av_freep(&pps->tile_pos_ts); + av_freep(&pps->tile_size); + av_freep(&pps->tile_id); ++ av_freep(&pps->ctb_ts_flags); + av_freep(&pps->min_tb_addr_zs_tab); + + av_freep(&pps); @@ -14466,13 +13148,17 @@ index 0000000000..f65efa1015 + pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts)); + pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs)); + pps->tile_id = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_id)); -+ pps->tile_size = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_size)); ++ pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size)); ++ pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * 
pps->num_tile_rows, sizeof(*pps->tile_pos_ts)); ++ pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags)); + pps->min_tb_addr_zs_tab = av_malloc_array((sps->tb_mask+2) * (sps->tb_mask+2), sizeof(*pps->min_tb_addr_zs_tab)); + if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs || -+ !pps->tile_id || !pps->min_tb_addr_zs_tab) { ++ !pps->tile_id || !pps->min_tb_addr_zs_tab || pps->tile_pos_ts == NULL || pps->tile_size == NULL) { + return AVERROR(ENOMEM); + } + ++ memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags)); ++ + for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) { + int tb_x = ctb_addr_rs % sps->ctb_width; + int tb_y = ctb_addr_rs / sps->ctb_width; @@ -14506,24 +13192,62 @@ index 0000000000..f65efa1015 + pps->ctb_addr_ts_to_rs[val] = ctb_addr_rs; + } + -+ for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++) -+ for (i = 0; i < pps->num_tile_columns; i++, tile_id++) -+ for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++) -+ for (x = pps->col_bd[i]; x < pps->col_bd[i + 1]; x++) -+ pps->tile_id[pps->ctb_addr_rs_to_ts[y * sps->ctb_width + x]] = tile_id; ++ { ++ uint8_t * pflags = pps->ctb_ts_flags; ++ uint16_t * ptid = pps->tile_id; + -+ pps->tile_pos_rs = av_malloc_array(tile_id, sizeof(*pps->tile_pos_rs)); -+ if (!pps->tile_pos_rs) -+ return AVERROR(ENOMEM); -+ -+ for (j = 0; j < pps->num_tile_rows; j++) -+ for (i = 0; i < pps->num_tile_columns; i++) ++ for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++) + { -+ pps->tile_size[j * pps->num_tile_columns + i] = -+ pps->column_width[i] * pps->row_height[j]; -+ pps->tile_pos_rs[j * pps->num_tile_columns + i] = -+ pps->row_bd[j] * sps->ctb_width + pps->col_bd[i]; ++ for (i = 0; i < pps->num_tile_columns; i++, tile_id++) ++ { ++ const unsigned int tile_w = pps->column_width[i]; ++ ++ pflags[0] |= CTB_TS_FLAGS_CIREQ; ++ ++ for (x = 0; x != tile_w; ++x) { ++ pflags[x] |= CTB_TS_FLAGS_TOT; ++ } ++ ++ for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; 
y++) ++ { ++ pflags[0] |= CTB_TS_FLAGS_SOTL; ++ ++ if (pps->entropy_coding_sync_enabled_flag) ++ { ++ if (pps->column_width[i] != 1) ++ pflags[1] |= CTB_TS_FLAGS_CSAVE; ++ else ++ pflags[0] |= CTB_TS_FLAGS_CIREQ; ++ ++ if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0) ++ pflags[0] |= CTB_TS_FLAGS_CLOAD; ++ } ++ ++ for (x = 0; x != tile_w; ++x) ++ *ptid++ = tile_id; ++ ++ pflags += tile_w; ++ pflags[-1] |= CTB_TS_FLAGS_EOTL; ++ if (i + 1 == pps->num_tile_columns) ++ pflags[-1] |= CTB_TS_FLAGS_EOL; ++ } ++ ++ pflags[-1] |= CTB_TS_FLAGS_EOT; ++ } + } ++ } ++ ++ { ++ unsigned int ts = 0; ++ for (j = 0; j < pps->num_tile_rows; j++) ++ for (i = 0; i < pps->num_tile_columns; i++) ++ { ++ const unsigned int size = pps->column_width[i] * pps->row_height[j]; ++ pps->tile_size[j * pps->num_tile_columns + i] = size; ++ pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts; ++ ts += size; ++ } ++ } + + log2_diff = sps->log2_ctb_size - sps->log2_min_tb_size; + pps->min_tb_addr_zs = &pps->min_tb_addr_zs_tab[1*(sps->tb_mask+2)+1]; @@ -14780,6 +13504,7 @@ index 0000000000..f65efa1015 + if (get_bits_left(gb) < 0) { + av_log(avctx, AV_LOG_ERROR, + "Overread PPS by %d bits\n", -get_bits_left(gb)); ++ ret = AVERROR_INVALIDDATA; + goto err; + } + @@ -14817,10 +13542,10 @@ index 0000000000..f65efa1015 +} diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h new file mode 100644 -index 0000000000..1600076a69 +index 0000000000..989f8953b4 --- /dev/null +++ b/libavcodec/rpi_hevc_ps.h -@@ -0,0 +1,437 @@ +@@ -0,0 +1,446 @@ +/* + * HEVC parameter set parsing + * @@ -14868,7 +13593,7 @@ index 0000000000..1600076a69 + uint8_t nb_refs; +} LongTermRPS; + -+typedef struct SliceHeader { ++typedef struct RpiSliceHeader { + unsigned int pps_id; + + ///< address (in raster order) of the first block in the current slice segment @@ -14941,9 +13666,7 @@ index 0000000000..1600076a69 + + int16_t luma_offset_l1[16]; + int16_t chroma_offset_l1[16][2]; -+ -+ int slice_ctb_addr_rs; -+} SliceHeader; ++} 
RpiSliceHeader; + +typedef struct HEVCWindow { + unsigned int left_offset; @@ -15138,6 +13861,15 @@ index 0000000000..1600076a69 + int data_size; +} HEVCRpiSPS; + ++#define CTB_TS_FLAGS_SOTL (1U << 0) // X start of tile line ++#define CTB_TS_FLAGS_EOTL (1U << 1) ++#define CTB_TS_FLAGS_EOL (1U << 2) ++#define CTB_TS_FLAGS_EOT (1U << 3) ++#define CTB_TS_FLAGS_CSAVE (1U << 4) ++#define CTB_TS_FLAGS_CIREQ (1U << 5) // Cabac init request ++#define CTB_TS_FLAGS_TOT (1U << 6) ++#define CTB_TS_FLAGS_CLOAD (1U << 7) ++ +typedef struct HEVCRpiPPS { + unsigned int sps_id; ///< seq_parameter_set_id + @@ -15198,19 +13930,21 @@ index 0000000000..1600076a69 + uint8_t log2_sao_offset_scale_chroma; + + // Inferred parameters -+ unsigned int *column_width; ///< ColumnWidth -+ unsigned int *row_height; ///< RowHeight -+ unsigned int *col_bd; ///< ColBd -+ unsigned int *row_bd; ///< RowBd -+ int *col_idxX; ++ uint16_t *column_width; ///< ColumnWidth ++ uint16_t *row_height; ///< RowHeight ++ uint16_t *col_bd; ///< ColBd ++ uint16_t *row_bd; ///< RowBd ++ uint16_t *col_idxX; + -+ int *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS -+ int *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS -+ int *tile_id; ///< TileId -+ int *tile_pos_rs; ///< TilePosRS -+ int *tile_size; ///< TileSize ++ // We can limit these to uint16_t given our other size limits ++ uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS ++ uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS ++ uint16_t *tile_id; ///< TileId ++ uint16_t *tile_pos_ts; ///< TilePosRS ++ uint16_t *tile_size; ///< TileSize + int *min_tb_addr_zs; ///< MinTbAddrZS + int *min_tb_addr_zs_tab;///< MinTbAddrZS ++ uint8_t * ctb_ts_flags; + + uint8_t data[4096]; + int data_size; @@ -15384,7 +14118,7 @@ index 0000000000..7fa6af1cdf +} diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c new file mode 100644 -index 0000000000..ef15784317 +index 0000000000..d7745711ab --- /dev/null +++ b/libavcodec/rpi_hevc_refs.c @@ -0,0 +1,515 @@ @@ -15668,7 +14402,7 @@ index 
0000000000..ef15784317 + +int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s) +{ -+ SliceHeader *sh = &s->sh; ++ RpiSliceHeader *sh = &s->sh; + + uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1; + uint8_t list_idx; @@ -15905,10 +14639,10 @@ index 0000000000..ef15784317 +} diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c new file mode 100644 -index 0000000000..c98b0804ed +index 0000000000..c5133a8a88 --- /dev/null +++ b/libavcodec/rpi_hevc_sei.c -@@ -0,0 +1,364 @@ +@@ -0,0 +1,368 @@ +/* + * HEVC Supplementary Enhancement Information messages + * @@ -16235,12 +14969,16 @@ index 0000000000..c98b0804ed + av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n"); + + while (byte == 0xFF) { ++ if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255) ++ return AVERROR_INVALIDDATA; + byte = get_bits(gb, 8); + payload_type += byte; + } + byte = 0xFF; + while (byte == 0xFF) { -+ byte = get_bits(gb, 8); ++ if (get_bits_left(gb) < 8 + 8LL*payload_size) ++ return AVERROR_INVALIDDATA; ++ byte = get_bits(gb, 8); + payload_size += byte; + } + if (nal_unit_type == HEVC_NAL_SEI_PREFIX) { @@ -16416,7 +15154,7 @@ index 0000000000..41e4a20127 +#endif /* AVCODEC_RPI_HEVC_SEI_H */ diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c new file mode 100644 -index 0000000000..4f1d6c71f2 +index 0000000000..fe506c8ad0 --- /dev/null +++ b/libavcodec/rpi_hevc_shader.c @@ -0,0 +1,1570 @@ @@ -17988,7 +16726,7 @@ index 0000000000..4f1d6c71f2 +// ::mc_end +}; +#ifdef __HIGHC__ -+#pragma Align_to(8, rpi_shader) ++#pragma Align_to(8, ff_hevc_rpi_shader) +#endif diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h new file mode 100644 @@ -27686,10 +26424,10 @@ index 0000000000..56d5206827 +}; diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c new file mode 100644 -index 0000000000..00bd911a86 +index 0000000000..0ad64f9f19 --- /dev/null +++ b/libavcodec/rpi_hevcdec.c -@@ -0,0 +1,5630 @@ +@@ -0,0 +1,5679 @@ +/* + * HEVC video Decoder + * 
@@ -27772,8 +26510,20 @@ index 0000000000..00bd911a86 +#define QPU_Y_CMD_PER_CTU_MAX (16 * 16) +#define QPU_C_CMD_PER_CTU_MAX (8 * 8) + -+#define QPU_C_COMMANDS (((HEVC_RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX) -+#define QPU_Y_COMMANDS (((HEVC_RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX) ++#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64) ++ ++#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP) ++#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS) ++ ++#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2) ++#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2) ++ ++// Total cmds to allocate - allow for slack & setup ++#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX) ++#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX) ++ ++#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2)) ++#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2)) + +// The QPU code for UV blocks only works up to a block width of 8 +#define RPI_CHROMA_BLOCK_WIDTH 8 @@ -28951,7 +27701,7 @@ index 0000000000..00bd911a86 +static int hls_slice_header(HEVCRpiContext *s) +{ + GetBitContext *gb = &s->HEVClc->gb; -+ SliceHeader *sh = &s->sh; ++ RpiSliceHeader *sh = &s->sh; + int i, ret; + + // Coded parameters @@ -29310,8 +28060,16 @@ index 0000000000..00bd911a86 + return AVERROR(ENOMEM); + } + for (i = 0; i < sh->num_entry_point_offsets; i++) { -+ unsigned val = get_bits_long(gb, offset_len); -+ sh->entry_point_offset[i] = val + 1; // +1; // +1 to get the size ++ uint32_t val_minus1 = get_bits_long(gb, offset_len); ++ if (val_minus1 > (1 << 28)) ++ { ++ // We can declare offsets of > 2^28 bad without loss of generality ++ // Will check actual bounds wrt NAL later, but this keeps ++ // the values within bounds we can deal with easily ++ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1); ++ return AVERROR_INVALIDDATA; 
++ } ++ sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size + } + if (s->threads_number > 1 && (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)) { + s->enable_parallel_tiles = 0; // TODO: you can enable tiles in parallel here @@ -29344,13 +28102,6 @@ index 0000000000..00bd911a86 + return AVERROR_INVALIDDATA; + } + -+ sh->slice_ctb_addr_rs = sh->slice_segment_addr; -+ -+ if (!s->sh.slice_ctb_addr_rs && s->sh.dependent_slice_segment_flag) { -+ av_log(s->avctx, AV_LOG_ERROR, "Impossible slice segment.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ + if (get_bits_left(gb) < 0) { + av_log(s->avctx, AV_LOG_ERROR, + "Overread slice header by %d bits\n", -get_bits_left(gb)); @@ -29504,14 +28255,17 @@ index 0000000000..00bd911a86 + lc->tu.cu_qp_delta = -lc->tu.cu_qp_delta; + lc->tu.is_cu_qp_delta_coded = 1; + -+ if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset / 2) || -+ lc->tu.cu_qp_delta > (25 + s->ps.sps->qp_bd_offset / 2)) { ++// Was: ++// if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset / 2) || ++// lc->tu.cu_qp_delta > (25 + s->ps.sps->qp_bd_offset / 2)) { ++// 2016 standard says: ++ if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset) || ++ lc->tu.cu_qp_delta > 25) { + av_log(s->avctx, AV_LOG_ERROR, + "The cu_qp_delta %d is outside the valid range " + "[%d, %d].\n", + lc->tu.cu_qp_delta, -+ -(26 + s->ps.sps->qp_bd_offset / 2), -+ (25 + s->ps.sps->qp_bd_offset / 2)); ++ -(26 + s->ps.sps->qp_bd_offset), 25); + return AVERROR_INVALIDDATA; + } + @@ -29953,11 +28707,25 @@ index 0000000000..00bd911a86 +static HEVCRpiInterPredQ * +rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) +{ -+ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr; -+ HEVCRpiInterPredQ * ypt = yp + 1; -+ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) { -+ if (ypt->load < yp->load) ++ HEVCRpiInterPredQ * yp = NULL; ++ HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr; ++ const unsigned int max_fill = ipe->max_fill;
++ unsigned int load = UINT_MAX; ++ ++ for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) { ++ // We will always have enough room between the Qs but if we are ++ // running critically low due to poor scheduling then use fill size ++ // rather than load to determine QPU. This has obvious dire ++ // performance implications but (a) it is better than crashing ++ // and (b) it should (almost) never happen ++ const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base; ++ const unsigned int tload = tfill > max_fill ? tfill + 0x1000000 : ypt->load; ++ ++ if (tload < load) ++ { + yp = ypt; ++ load = tload; ++ } + } + + yp->load += load_val; @@ -29980,7 +28748,9 @@ index 0000000000..00bd911a86 + } +} + -+// Returns 0 on success, -1 if Q is dangerously full ++// Returns 0 on success ++// We no longer check for Q fullness as we have emergency code in ctu alloc ++// * However it might be an idea to have some means of spotting that we've used it +static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) +{ + if (!ipe->used_grp) @@ -29994,12 +28764,6 @@ index 0000000000..00bd911a86 + ipe->used = 1; + ipe->used_grp = 0; + -+ for (unsigned int i = 0; i != ipe->n_grp; ++i) { -+ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr; -+ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) { -+ return -1; -+ } -+ } + return 0; +} + @@ -31029,44 +29793,38 @@ index 0000000000..00bd911a86 +static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const int x_ctb, const int y_ctb, const int ctb_addr_ts) +{ -+ const int ctb_size = 1 << s->ps.sps->log2_ctb_size; -+ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ const int ctb_addr_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice -+ const int idxX = s->ps.pps->col_idxX[x_ctb >> s->ps.sps->log2_ctb_size]; ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ const unsigned int
ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice ++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; ++ const unsigned int line_w = s->ps.sps->ctb_width; + + s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr; + -+ lc->end_of_tiles_x = idxX + 1 >= s->ps.pps->num_tile_columns ? s->ps.sps->width : -+ (s->ps.pps->col_bd[idxX + 1] << s->ps.sps->log2_ctb_size); -+ -+ if (ctb_addr_ts == 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1] || -+ (s->ps.pps->entropy_coding_sync_enabled_flag && (x_ctb >> s->ps.sps->log2_ctb_size) == s->ps.pps->col_bd[idxX])) -+ { -+// lc->first_qp_group = 1; -+ lc->qPy_pred = s->sh.slice_qp; -+ } -+ -+ lc->end_of_tiles_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height); ++ lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width); ++ lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height); + + lc->boundary_flags = 0; + -+ if (x_ctb <= 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - 1]]) ++ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0) + lc->boundary_flags |= BOUNDARY_LEFT_TILE; + if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1]) + lc->boundary_flags |= BOUNDARY_LEFT_SLICE; -+ if (y_ctb <= 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]]) ++ if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0) + lc->boundary_flags |= BOUNDARY_UPPER_TILE; -+ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - s->ps.sps->ctb_width]) ++ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w]) + lc->boundary_flags |= BOUNDARY_UPPER_SLICE; + + lc->ctb_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0; + lc->ctb_up_flag = 
(lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0; -+ lc->ctb_up_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && -+ (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width); + -+ lc->ctb_up_right_flag = ((y_ctb > 0) && (x_ctb + ctb_size) < lc->end_of_tiles_x && -+ (ctb_addr_in_slice+1 >= s->ps.sps->ctb_width) && -+ (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1 - s->ps.sps->ctb_width]])); ++ // Use line width rather than tile width for addr_in_slice test as ++ // addr_in_slice is in raster units ++ lc->ctb_up_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && ++ (ctb_addr_rs_in_slice >= line_w + 1); ++ ++ lc->ctb_up_right_flag = (ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 && ++ (ctb_addr_rs_in_slice + 1 >= line_w); +} + + @@ -31091,11 +29849,10 @@ index 0000000000..00bd911a86 + + // Flush (SAO) + if (y > y0) { -+ const int tile_end = y_end || -+ s->ps.pps->tile_id[jb->ctu_ts_last] != s->ps.pps->tile_id[jb->ctu_ts_last + 1]; + const unsigned int xl = x0 > ctb_size ? x0 - ctb_size : 0; + const unsigned int yt = y0 > ctb_size ? y0 - ctb_size : 0; -+ const unsigned int yb = tile_end ? bound_b : y - ctb_size; ++ const unsigned int yb = (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0 ? 
++ bound_b : y - ctb_size; + + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); + rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, @@ -31169,7 +29926,7 @@ index 0000000000..00bd911a86 + break; + + default: -+ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); ++ av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); + abort(); + } + } @@ -31531,16 +30288,21 @@ index 0000000000..00bd911a86 + + // * Sizeof the union structure might be overkill but at the moment it + // is correct (it certainly isn't going to be too small) -+ // *** really should add per ctu sync words to be accurate ++ // Set max fill to slack/2 from the end of the Q ++ // If we exceed this in any Q then we will schedule by size (which should ++ // mean that we never use that Q again apart from syncs) ++ // * Given how aggressive the overflow response is we could maybe put the ++ // threshold even nearer the end, but I don't expect us to ever hit ++ // it on any real stream anyway.
+ + rpi_inter_pred_alloc(&jb->chroma_ip, + QPU_N_MAX, QPU_N_GRP, -+ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t), -+ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t)); ++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t), ++ QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2); + rpi_inter_pred_alloc(&jb->luma_ip, + QPU_N_MAX, QPU_N_GRP, -+ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t), -+ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t)); ++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t), ++ QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2); + + return jb; +} @@ -31673,15 +30435,17 @@ index 0000000000..00bd911a86 + +static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc) +{ -+ const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; ++ const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; + const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns; ++ const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts]; + + // Check for obvious disasters -+ if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { ++ if (ctb_addr_ts == 0 && s->sh.dependent_slice_segment_flag) { + av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); + return AVERROR_INVALIDDATA; + } + ++ // If dependant then ctb_addr_ts != 0 from previous check + if (s->sh.dependent_slice_segment_flag) { + int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1]; + if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) { @@ -31691,7 +30455,7 @@ index 0000000000..00bd911a86 + } + + if (!s->ps.pps->entropy_coding_sync_enabled_flag && -+ s->ps.pps->tile_id[ctb_addr_ts] + s->sh.num_entry_point_offsets >= tiles) ++ tile_id + s->sh.num_entry_point_offsets >= tiles) + { + av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n"); + return AVERROR_INVALIDDATA; @@ -31700,20 +30464,21 @@ index 0000000000..00bd911a86 + // Tiled stuff must start at start of tile if it 
has multiple entry points + if (!s->ps.pps->entropy_coding_sync_enabled_flag && + s->sh.num_entry_point_offsets != 0 && -+ s->sh.slice_ctb_addr_rs != s->ps.pps->tile_pos_rs[s->ps.pps->tile_id[ctb_addr_ts]]) ++ ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id]) + { + av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n"); + return AVERROR_INVALIDDATA; + } + -+ // Setup any required decode vars -+ if (!s->sh.dependent_slice_segment_flag) -+ lc->qPy_pred = s->sh.slice_qp; ++ ff_hevc_rpi_cabac_init_decoder(lc); + ++ // Setup any required decode vars ++ lc->cabac_init_req = !s->sh.dependent_slice_segment_flag; ++ ++// printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot); + lc->qp_y = s->sh.slice_qp; + + // General setup -+ lc->wpp_init = 0; + lc->bt_line_no = 0; + lc->ts = ctb_addr_ts; + return 0; @@ -31722,6 +30487,7 @@ index 0000000000..00bd911a86 +static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal) +{ + const GetBitContext * const gb = &s->HEVClc->gb; ++ RpiSliceHeader * const sh = &s->sh; + int i, j; + + const unsigned int length = nal->size; @@ -31729,38 +30495,46 @@ index 0000000000..00bd911a86 + unsigned int cmpt; + unsigned int startheader; + -+ if (s->sh.num_entry_point_offsets == 0) { ++ if (sh->num_entry_point_offsets == 0) { ++ s->data = NULL; + return 0; + } + -+ for (j = 0, cmpt = 0, startheader = offset + s->sh.entry_point_offset[0]; j < nal->skipped_bytes; j++) { ++ // offset in slice header includes emulation prevention bytes. ++ // Unfortunately those have been removed by the time we get here so we ++ // have to compensate. The nal layer keeps a track of where they were. 
++ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) { + if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { + startheader--; + cmpt++; + } + } + -+ for (i = 1; i < s->sh.num_entry_point_offsets; i++) { -+ offset += (s->sh.entry_point_offset[i - 1] - cmpt); -+ for (j = 0, cmpt = 0, startheader = offset -+ + s->sh.entry_point_offset[i]; j < nal->skipped_bytes; j++) { ++ for (i = 1; i < sh->num_entry_point_offsets; i++) { ++ offset += (sh->entry_point_offset[i - 1] - cmpt); ++ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) { + if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { + startheader--; + cmpt++; + } + } -+ s->sh.size[i - 1] = s->sh.entry_point_offset[i] - cmpt; -+ s->sh.offset[i - 1] = offset; -+ } -+ if (s->sh.num_entry_point_offsets != 0) { -+ offset += s->sh.entry_point_offset[s->sh.num_entry_point_offsets - 1] - cmpt; -+ if (length < offset) { -+ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n"); ++ if (sh->entry_point_offset[i] <= cmpt) { ++ av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n"); + return AVERROR_INVALIDDATA; + } -+ s->sh.size[s->sh.num_entry_point_offsets - 1] = length - offset; -+ s->sh.offset[s->sh.num_entry_point_offsets - 1] = offset; ++ sh->size[i - 1] = sh->entry_point_offset[i] - cmpt; ++ sh->offset[i - 1] = offset; + } ++ ++ offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt; ++ if (length < offset) { ++ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ sh->size[sh->num_entry_point_offsets - 1] = length - offset; ++ sh->offset[sh->num_entry_point_offsets - 1] = offset; ++ ++ // Remember data start pointer as we won't have nal later + s->data = nal->data; + return 0; +} @@ -31787,10 +30561,11 @@ index 0000000000..00bd911a86 + const 
int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size; + const int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size; + int q_full; ++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; + + hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts); + -+ ff_hevc_rpi_cabac_init(s, lc, ctb_addr_ts); ++ ff_hevc_rpi_cabac_init(s, lc, ctb_flags); + + hls_sao_param(s, lc, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); + @@ -31800,28 +30575,40 @@ index 0000000000..00bd911a86 + + more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); + ++ if (ff_hevc_rpi_cabac_overflow(lc)) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n "); ++ more_data = AVERROR_INVALIDDATA; ++ } ++ + if (more_data < 0) { -+ s->tab_slice_address[ctb_addr_rs] = -1; ++ s->tab_slice_address[ctb_addr_rs] = -1; // Mark slice as broken + return more_data; + } + -+ // Inc TS to next. -+ // N.B. None of the other position vars have changed -+ ctb_addr_ts++; -+ ff_hevc_rpi_save_states(s, lc, ctb_addr_ts); ++ if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 || ++ (s->ps.pps->entropy_coding_sync_enabled_flag && (ctb_flags & CTB_TS_FLAGS_EOTL) != 0))) ++ { ++ if (get_cabac_terminate(&lc->cc) < 0 || ++ skip_bytes(&lc->cc, 0) == NULL) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n "); ++ return -1; ++ } ++ } ++ ++ if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0) ++ ff_hevc_rpi_save_states(s, lc); + + // Report progress so we can use our MVs in other frames -+ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) { ++ if (s->threads_type == FF_THREAD_FRAME && (ctb_flags & CTB_TS_FLAGS_EOL) != 0) + ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1); -+ } + + // End of line || End of tile line || End of tile + // (EoL covers end of frame for our purposes here) -+ q_full = x_ctb + ctb_size >= s->ps.sps->width || -+ 
s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts] != ctb_addr_rs + 1 || -+ s->ps.pps->tile_id[ctb_addr_ts - 1] != s->ps.pps->tile_id[ctb_addr_ts]; ++ q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0); + -+ // Allocate QPU chuncks on fixed size 64 pel boundries rather than ++ // Allocate QPU chunks on fixed size 64 pel boundaries rather than + // whatever ctb_size is today. + // * We might quite like to continue to 64 pel vertical too but that + // currently confuses WPP @@ -31837,11 +30624,14 @@ index 0000000000..00bd911a86 + // * This is very annoying (and slow) to cope with in WPP so + // we treat it as an error there (no known stream triggers this + // with the current buffer sizes). Non-wpp should cope fine. -+ av_log(s, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__); ++ av_log(s->avctx, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__); + q_full = 1; + } + } + ++ // Inc TS to next. ++ ctb_addr_ts++; ++ + if (q_full) + { + // Do job @@ -31897,6 +30687,8 @@ index 0000000000..00bd911a86 + // Always need to store where we are in the bitstream + dst_lc->ts = src_lc->ts; + dst_lc->gb = src_lc->gb; ++ // Cabac init request will be built at start of next slice ++ + // Need to store context if we might have a dependent seg + if (is_dep) + { @@ -31927,7 +30719,7 @@ index 0000000000..00bd911a86 + line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ? + INT_MAX : + is_tile ? -+ s->ps.pps->ctb_addr_rs_to_ts[s->ps.pps->tile_pos_rs[tile_id + line_inc]] : ++ s->ps.pps->tile_pos_ts[tile_id + line_inc] : + lc->ts + lc->bt_line_width * line_inc; + // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work) + const unsigned int partial_size = is_tile ?
line_ts_width(s, lc->ts) : 2; @@ -31951,14 +30743,11 @@ index 0000000000..00bd911a86 + return err; + + ff_init_cabac_decoder(&lc->cc, data, len); -+ -+ lc->wpp_init = 1; // Stop ff_hevc_rpi_cabac_init trying to read non-existant termination bits + } + + // We should never be processing a dependent slice here so reset is good + // ?? These probably shouldn't be needed (as they should be set by later + // logic) but do seem to be required -+ lc->qPy_pred = s->sh.slice_qp; + lc->qp_y = s->sh.slice_qp; + + do @@ -32007,7 +30796,7 @@ index 0000000000..00bd911a86 + (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done))) + { + if (err == 0) { -+ av_log(s, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n"); ++ av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n"); + err = AVERROR_INVALIDDATA; + } + worker_free(s, lc); @@ -32113,7 +30902,7 @@ index 0000000000..00bd911a86 + HEVCRpiLocalContext * const lc = s->HEVClcList[i]; + const unsigned int tile = tile0 + line; + -+ lc->ts = pps->ctb_addr_rs_to_ts[pps->tile_pos_rs[tile]]; ++ lc->ts = pps->tile_pos_ts[tile]; + lc->bt_line_no = line; + lc->bt_is_tile = 1; + lc->bt_line_width = line_ts_width(s, lc->ts); @@ -32134,10 +30923,10 @@ index 0000000000..00bd911a86 + + if ((err = rpi_run_one_line(s, lc, 0)) < 0) { // Never first tile/wpp + if (lc->bt_terminate) { -+ av_log(s, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__); ++ av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__); + break; + } -+ av_log(s, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err); ++ av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err); + } + } + @@ -32341,7 +31130,7 @@ index 0000000000..00bd911a86 + +fail: + // Cleanup -+ av_log(s, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err); ++ av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err); + // Free our job & wait for temination + worker_free(s, lc); + worker_wait(s, lc); @@ 
-32716,7 +31505,7 @@ index 0000000000..00bd911a86 + /* split the input packet into NAL units, so we know the upper bound on the + * number of slices in the frame */ + ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff, -+ s->nal_length_size, s->avctx->codec_id, 1); ++ s->nal_length_size, s->avctx->codec_id, 0); + if (ret < 0) { + av_log(s->avctx, AV_LOG_ERROR, + "Error splitting the input into NAL units.\n"); @@ -32965,7 +31754,7 @@ index 0000000000..00bd911a86 + + av_freep(&s->sei.picture_hash.md5_ctx); + -+ av_freep(&s->cabac_state); ++ av_freep(&s->cabac_save); + +#if RPI_EXTRA_BIT_THREADS + bit_threads_kill(s); @@ -33059,12 +31848,10 @@ index 0000000000..00bd911a86 + ff_hevc_rpi_progress_init_state(s->progress_states + i); + } + -+ s->cabac_state = av_malloc(HEVC_CONTEXTS); -+ if (!s->cabac_state) ++ if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL) + goto fail; + -+ s->output_frame = av_frame_alloc(); -+ if (!s->output_frame) ++ if ((s->output_frame = av_frame_alloc()) == NULL) + goto fail; + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { @@ -33091,7 +31878,7 @@ index 0000000000..00bd911a86 + return 0; + +fail: -+ av_log(s, AV_LOG_ERROR, "%s: Failed\n", __func__); ++ av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__); + hevc_decode_free(avctx); + return AVERROR(ENOMEM); +} @@ -33197,13 +31984,13 @@ index 0000000000..00bd911a86 + HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5)); + if (jbg == NULL) + { -+ av_log(s, AV_LOG_ERROR, "%s: Job global init failed\n", __func__); ++ av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__); + return -1; + } + + if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL) + { -+ av_log(s, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__); ++ av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__); + return -1; + } + } @@ -33322,10 +32109,10 @@ index 0000000000..00bd911a86 + diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h 
new file mode 100644 -index 0000000000..f61b29e669 +index 0000000000..2201017cb3 --- /dev/null +++ b/libavcodec/rpi_hevcdec.h -@@ -0,0 +1,1054 @@ +@@ -0,0 +1,1061 @@ +/* + * HEVC video decoder + * @@ -33445,6 +32232,12 @@ index 0000000000..f61b29e669 +#define HEVC_RPI_MAX_HEIGHT 1088 + + ++// Min CTB size is 16 ++#if ((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16) >= (1 << 16) ++#error Check CTB translation array el sizes (currently uint16_t) ++#endif ++ ++ +/** + * Value of the luma sample at position (x, y) in the 2D array tab. + */ @@ -33629,14 +32422,13 @@ index 0000000000..f61b29e669 + uint8_t cu_transquant_bypass_flag; +} CodingUnit; + -+typedef struct NeighbourAvailable { -+ int cand_bottom_left; -+ int cand_left; -+ int cand_up; -+ int cand_up_left; -+ int cand_up_right; -+ int cand_up_right_sap; -+} NeighbourAvailable; ++typedef struct RpiNeighbourAvailable { ++ char cand_bottom_left; ++ char cand_left; ++ char cand_up; ++ char cand_up_left; ++ char cand_up_right; ++} RpiNeighbourAvailable; + +typedef struct PredictionUnit { + int mpm_idx; @@ -33708,12 +32500,14 @@ index 0000000000..f61b29e669 + +typedef struct HEVCRpiLocalContextIntra { + TransformUnit tu; -+ NeighbourAvailable na; ++ RpiNeighbourAvailable na; +} HEVCRpiLocalContextIntra; + +typedef struct HEVCRpiLocalContext { + TransformUnit tu; // Moved to start to match HEVCRpiLocalContextIntra (yuk!) -+ NeighbourAvailable na; ++ RpiNeighbourAvailable na; ++ ++ CABACContext cc; + + // Vars that allow us to locate everything from just an lc + struct HEVCRpiContext * context; // ??? make const ??? 
@@ -33739,37 +32533,24 @@ index 0000000000..f61b29e669 + + struct HEVCRpiJob * jb0; + char unit_done; // Set once we have dealt with this slice -+// char max_done; + char bt_is_tile; + char last_progress_good; -+ -+ char wpp_init; // WPP/Tile bitstream init has happened ++ char cabac_init_req; + + uint8_t cabac_state[HEVC_CONTEXTS]; -+ + uint8_t stat_coeff[4]; -+ -+// uint8_t first_qp_group; -+ + GetBitContext gb; -+ CABACContext cc; + + int8_t qp_y; + int8_t curr_qp_y; -+ -+ int qPy_pred; ++ int8_t qPy_pred; + + uint8_t ctb_left_flag; + uint8_t ctb_up_flag; + uint8_t ctb_up_right_flag; + uint8_t ctb_up_left_flag; -+ int end_of_tiles_x; -+ int end_of_tiles_y; -+ /* +7 is for subpixel interpolation, *2 for high bit depths */ -+ DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; -+ /* The extended size between the new edge emu buffer is abused by SAO */ -+ DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; -+ DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); ++ int end_of_ctb_x; ++ int end_of_ctb_y; + + int ct_depth; + CodingUnit cu; @@ -33781,7 +32562,14 @@ index 0000000000..f61b29e669 +#define BOUNDARY_UPPER_TILE (1 << 3) + /* properties of the boundary of the current CTB for the purposes + * of the deblocking filter */ -+ int boundary_flags; ++ unsigned int boundary_flags; ++ ++ /* +7 is for subpixel interpolation, *2 for high bit depths */ ++ DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; ++ /* The extended size between the new edge emu buffer is abused by SAO */ ++ DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; ++ DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); ++ +} HEVCRpiLocalContext; + + @@ -33999,6 +32787,11 @@ index 0000000000..f61b29e669 +} HEVCRpiStats; +#endif + ++typedef struct HEVCRpiCabacState ++{ ++ uint8_t rice[4]; ++ uint8_t 
state[HEVC_CONTEXTS]; ++} HEVCRpiCabacState; + +typedef struct HEVCRpiContext { + const AVClass *c; // needed by private avoptions @@ -34069,7 +32862,7 @@ index 0000000000..f61b29e669 + unsigned int dvq_n; +#endif + -+ uint8_t *cabac_state; ++ HEVCRpiCabacState *cabac_save; + + /** 1 if the independent slice segment header was successfully parsed */ + uint8_t slice_initialized; @@ -34087,7 +32880,7 @@ index 0000000000..f61b29e669 + ///< candidate references for the current frame + RefPicList rps[5]; + -+ SliceHeader sh; ++ RpiSliceHeader sh; + SAOParams *sao; + DBParams *deblock; + enum HEVCNALUnitType nal_unit_type; @@ -34195,8 +32988,9 @@ index 0000000000..f61b29e669 + */ +int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s); + -+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc, int ctb_addr_ts); -+int ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, int ctb_addr_ts); ++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc); ++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags); +int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc); @@ -34269,7 +33063,7 @@ index 0000000000..f61b29e669 + const int c_idx); + +void ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc); -+ ++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc); + +extern const uint8_t ff_hevc_rpi_qpel_extra_before[4]; +extern const uint8_t ff_hevc_rpi_qpel_extra_after[4]; @@ -40543,22 +39337,6 @@ index 0000000000..26fb3be999 + +#endif + -diff --git a/libavcodec/snowdec.c b/libavcodec/snowdec.c -index 13668c2105..bebf9024ec 100644 ---- a/libavcodec/snowdec.c -+++ b/libavcodec/snowdec.c -@@ 
-405,6 +405,11 @@ static int decode_header(SnowContext *s){ - s->qbias = 0; - return AVERROR_INVALIDDATA; - } -+ if (FFABS(s->qbias) > 127) { -+ av_log(s->avctx, AV_LOG_ERROR, "qbias %d is too large\n", s->qbias); -+ s->qbias = 0; -+ return AVERROR_INVALIDDATA; -+ } - - return 0; - } diff --git a/libavcodec/utils.c b/libavcodec/utils.c index 9551f312e7..a1f68b8e30 100644 --- a/libavcodec/utils.c @@ -40604,18 +39382,169 @@ index 9551f312e7..a1f68b8e30 100644 AVCodec *avcodec_find_decoder_by_name(const char *name) { AVCodec *p; -diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c -index f0f849b326..cd97974748 100644 ---- a/libavfilter/avfilter.c -+++ b/libavfilter/avfilter.c -@@ -995,6 +995,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args) - "options, but options were provided: %s.\n", args); - return AVERROR(EINVAL); - } -+ printf("=== args='%s'\n", args); +diff --git a/libavfilter/Makefile b/libavfilter/Makefile +index d2f0495f37..56bb87f851 100644 +--- a/libavfilter/Makefile ++++ b/libavfilter/Makefile +@@ -323,6 +323,7 @@ OBJS-$(CONFIG_TONEMAP_FILTER) += vf_tonemap.o + OBJS-$(CONFIG_TRANSPOSE_FILTER) += vf_transpose.o + OBJS-$(CONFIG_TRIM_FILTER) += trim.o + OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o ++OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o + OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o + OBJS-$(CONFIG_USPP_FILTER) += vf_uspp.o + OBJS-$(CONFIG_VAGUEDENOISER_FILTER) += vf_vaguedenoiser.o +diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c +index 9b672a7a7e..d92e47e651 100644 +--- a/libavfilter/allfilters.c ++++ b/libavfilter/allfilters.c +@@ -334,6 +334,7 @@ static void register_all(void) + REGISTER_FILTER(TRANSPOSE, transpose, vf); + REGISTER_FILTER(TRIM, trim, vf); + REGISTER_FILTER(UNPREMULTIPLY, unpremultiply, vf); ++ REGISTER_FILTER(UNSAND, unsand, vf); + REGISTER_FILTER(UNSHARP, unsharp, vf); + REGISTER_FILTER(USPP, uspp, vf); + REGISTER_FILTER(VAGUEDENOISER, vaguedenoiser, vf); +diff 
--git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c +index 4304c06847..7bed282dff 100644 +--- a/libavfilter/avfiltergraph.c ++++ b/libavfilter/avfiltergraph.c +@@ -31,6 +31,9 @@ + #include "libavutil/internal.h" + #include "libavutil/opt.h" + #include "libavutil/pixdesc.h" ++#if CONFIG_UNSAND_FILTER ++#include "libavutil/rpi_sand_fns.h" ++#endif - #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR - if ( !strcmp(filter->filter->name, "format") || + #define FF_INTERNAL_FIELDS 1 + #include "framequeue.h" +@@ -420,6 +423,19 @@ static int can_merge_formats(AVFilterFormats *a_arg, + } + } + ++#if CONFIG_UNSAND_FILTER ++static int has_sand_format(const AVFilterFormats * const ff) ++{ ++ int i; ++ for (i = 0; i != ff->nb_formats; ++i) { ++ if (av_rpi_is_sand_format(ff->formats[i])) { ++ return 1; ++ } ++ } ++ return 0; ++} ++#endif ++ + /** + * Perform one round of query_formats() and merging formats lists on the + * filter graph. +@@ -460,6 +476,7 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + for (j = 0; j < filter->nb_inputs; j++) { + AVFilterLink *link = filter->inputs[j]; + int convert_needed = 0; ++ unsigned int extra_convert_tried = 0; + + if (!link) + continue; +@@ -507,12 +524,15 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + ) + #undef MERGE_DISPATCH + +- if (convert_needed) { ++ while (convert_needed) { + AVFilterContext *convert; + AVFilter *filter; + AVFilterLink *inlink, *outlink; + char scale_args[256]; + char inst_name[30]; ++ int can_retry = 0; ++ ++ convert_needed = 0; + + if (graph->disable_auto_convert) { + av_log(log_ctx, AV_LOG_ERROR, +@@ -525,19 +545,45 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + /* couldn't merge format lists. 
auto-insert conversion filter */ + switch (link->type) { + case AVMEDIA_TYPE_VIDEO: +- if (!(filter = avfilter_get_by_name("scale"))) { +- av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " +- "not present, cannot convert pixel formats.\n"); +- return AVERROR(EINVAL); +- } +- +- snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", +- scaler_count++); ++#if CONFIG_UNSAND_FILTER ++ // Only try each extra conversion once ++ // The unsand output pad should never trigger has_sand_format ++ // but it is better to be safe ++ if ((extra_convert_tried & 1) == 0 && has_sand_format(link->in_formats)) { ++ if (!(filter = avfilter_get_by_name("unsand"))) { ++ av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter " ++ "not present, cannot convert pixel formats.\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d", ++ scaler_count++); ++ ++ if ((ret = avfilter_graph_create_filter(&convert, filter, ++ inst_name, "", NULL, ++ graph)) < 0) ++ return ret; + +- if ((ret = avfilter_graph_create_filter(&convert, filter, +- inst_name, graph->scale_sws_opts, NULL, +- graph)) < 0) +- return ret; ++ extra_convert_tried |= 1; ++ can_retry = 1; ++ } ++ else ++#endif ++ { ++ if (!(filter = avfilter_get_by_name("scale"))) { ++ av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " ++ "not present, cannot convert pixel formats.\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", ++ scaler_count++); ++ ++ if ((ret = avfilter_graph_create_filter(&convert, filter, ++ inst_name, graph->scale_sws_opts, NULL, ++ graph)) < 0) ++ return ret; ++ } + break; + case AVMEDIA_TYPE_AUDIO: + if (!(filter = avfilter_get_by_name("aresample"))) { +@@ -583,9 +629,19 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + av_assert0(outlink-> in_channel_layouts->refcount > 0); + av_assert0(outlink->out_channel_layouts->refcount > 0); + } +- if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type) || +- 
!ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type)) ++ // If we have added an extra filter we must merge the input ++ // side but we can have another go at the output ++ if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type)) ++ ret = AVERROR(ENOSYS); ++ else if (!ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type)) ++ { ++ if (can_retry) { ++ link = outlink; ++ convert_needed = 1; ++ continue; ++ } + ret = AVERROR(ENOSYS); ++ } + if (inlink->type == AVMEDIA_TYPE_AUDIO && + (!ff_merge_samplerates(inlink->in_samplerates, + inlink->out_samplerates) || diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c index ad5aedd5f7..0d2df8b870 100644 --- a/libavfilter/buffersrc.c @@ -40629,19 +39558,248 @@ index ad5aedd5f7..0d2df8b870 100644 frame->format); break; case AVMEDIA_TYPE_AUDIO: +diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c +new file mode 100644 +index 0000000000..64578b7ac4 +--- /dev/null ++++ b/libavfilter/vf_unsand.c +@@ -0,0 +1,232 @@ ++/* ++ * Copyright (c) 2007 Bobby Bingham ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * format and noformat video filters ++ */ ++ ++#include ++ ++#include "libavutil/internal.h" ++#include "libavutil/mem.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/opt.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#include "avfilter.h" ++#include "formats.h" ++#include "internal.h" ++#include "video.h" ++ ++typedef struct UnsandContext { ++ const AVClass *class; ++} UnsandContext; ++ ++static av_cold void uninit(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++} ++ ++static av_cold int init(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++ ++ return 0; ++} ++ ++ ++static int filter_frame(AVFilterLink *link, AVFrame *in) ++{ ++ AVFilterLink * const outlink = link->dst->outputs[0]; ++ AVFrame *out = NULL; ++ int rv = 0; ++ ++ if (outlink->format == in->format) { ++ // If nothing to do then do nothing ++ out = in; ++ } ++ else ++ { ++ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL) ++ { ++ rv = AVERROR(ENOMEM); ++ goto fail; ++ } ++ if (av_rpi_sand_to_planar_frame(out, in) != 0) ++ { ++ rv = -1; ++ goto fail; ++ } ++ ++ av_frame_free(&in); ++ } ++ ++ return ff_filter_frame(outlink, out); ++ ++fail: ++ av_frame_free(&out); ++ av_frame_free(&in); ++ return rv; ++} ++ ++#if 0 ++static void dump_fmts(const AVFilterFormats * fmts) ++{ ++ int i; ++ if (fmts== NULL) { ++ printf("NULL\n"); ++ return; ++ } ++ for (i = 0; i < fmts->nb_formats; ++i) { ++ printf(" %d", fmts->formats[i]); ++ } ++ printf("\n"); ++} ++#endif ++ ++static int query_formats(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++ int ret; ++ ++ // If we aren't connected at both ends then just do nothing ++ if (ctx->inputs[0] == NULL || 
ctx->outputs[0] == NULL) ++ return 0; ++ ++// printf("Unsand: %s in: ", __func__); ++// dump_fmts(ctx->inputs[0]->in_formats); ++// printf("Unsand: %s out: ", __func__); ++// dump_fmts(ctx->outputs[0]->out_formats); ++ ++ // Our output formats depend on our input formats and we can't/don't ++ // want to convert between bit depths so we need to wait for the source ++ // to have an opinion before we do ++ if (ctx->inputs[0]->in_formats == NULL) ++ return AVERROR(EAGAIN); ++ ++ // Accept anything ++ if (ctx->inputs[0]->out_formats == NULL && ++ (ret = ff_formats_ref(ctx->inputs[0]->in_formats, &ctx->inputs[0]->out_formats)) < 0) ++ return ret; ++ ++ // Filter out sand formats ++ ++ // Generate a container if we don't already have one ++ if (ctx->outputs[0]->in_formats == NULL) ++ { ++ // Somewhat rubbish way of ensuring we have a good structure ++ const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE}; ++ AVFilterFormats *formats = ff_make_format_list(out_fmts); ++ ++ if (formats == NULL) ++ return AVERROR(ENOMEM); ++ if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->in_formats)) < 0) ++ return ret; ++ } ++ ++ // Replace old format list with new filtered list derived from what our ++ // input says it can do ++ { ++ const AVFilterFormats * const src_ff = ctx->inputs[0]->out_formats; ++ AVFilterFormats * const dst_ff = ctx->outputs[0]->in_formats; ++ enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats); ++ int i; ++ int n = 0; ++ int seen_420p = 0; ++ int seen_420p10 = 0; ++ ++ for (i = 0; i < src_ff->nb_formats; ++i) { ++ const enum AVPixelFormat f = src_ff->formats[i]; ++ ++ switch (f){ ++ case AV_PIX_FMT_YUV420P: ++ case AV_PIX_FMT_SAND128: ++ if (!seen_420p) { ++ seen_420p = 1; ++ dst_fmts[n++] = AV_PIX_FMT_YUV420P; ++ } ++ break; ++ case AV_PIX_FMT_SAND64_10: ++ case AV_PIX_FMT_YUV420P10: ++ if (!seen_420p10) { ++ seen_420p10 = 1; ++ dst_fmts[n++] = AV_PIX_FMT_YUV420P10; ++ } 
++ break; ++ default: ++ dst_fmts[n++] = f; ++ break; ++ } ++ } ++ ++ av_freep(&dst_ff->formats); ++ dst_ff->formats = dst_fmts; ++ dst_ff->nb_formats = n; ++ } ++ ++// printf("Unsand: %s calc: ", __func__); ++// dump_fmts(ctx->outputs[0]->in_formats); ++ ++ return 0; ++} ++ ++ ++#define OFFSET(x) offsetof(UnsandContext, x) ++static const AVOption unsand_options[] = { ++ { NULL } ++}; ++ ++ ++AVFILTER_DEFINE_CLASS(unsand); ++ ++static const AVFilterPad avfilter_vf_unsand_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .filter_frame = filter_frame, ++ }, ++ { NULL } ++}; ++ ++static const AVFilterPad avfilter_vf_unsand_outputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO ++ }, ++ { NULL } ++}; ++ ++AVFilter ff_vf_unsand = { ++ .name = "unsand", ++ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"), ++ ++ .init = init, ++ .uninit = uninit, ++ ++ .query_formats = query_formats, ++ ++ .priv_size = sizeof(UnsandContext), ++ .priv_class = &unsand_class, ++ ++ .inputs = avfilter_vf_unsand_inputs, ++ .outputs = avfilter_vf_unsand_outputs, ++}; ++ diff --git a/libavformat/utils.c b/libavformat/utils.c -index 1a7996c4fd..271e70ed84 100644 +index 1a7996c4fd..8119fc07f7 100644 --- a/libavformat/utils.c +++ b/libavformat/utils.c -@@ -750,7 +750,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in - int default_stream_index = av_find_default_stream_index(s); - if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) { - for (i = 0; i < s->nb_streams; i++) { -- if (av_find_program_from_stream(s, NULL, i)) -+ if (0 && av_find_program_from_stream(s, NULL, i)) - continue; - s->streams[i]->pts_wrap_reference = pts_wrap_reference; - s->streams[i]->pts_wrap_behavior = pts_wrap_behavior; @@ -2940,6 +2940,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr) return 1; } @@ -40736,14 +39894,14 @@ index 1a7996c4fd..271e70ed84 100644 if (!options) 
av_dict_free(&thread_opt); diff --git a/libavutil/Makefile b/libavutil/Makefile -index 65e285a701..2ca778c59f 100644 +index 65e285a701..4909d2682e 100644 --- a/libavutil/Makefile +++ b/libavutil/Makefile @@ -165,6 +165,7 @@ OBJS-$(CONFIG_QSV) += hwcontext_qsv.o OBJS-$(CONFIG_LIBDRM) += hwcontext_drm.o OBJS-$(CONFIG_LZO) += lzo.o OBJS-$(CONFIG_OPENCL) += opencl.o opencl_internal.o -+OBJS-$(CONFIG_RPI) += rpi_sand_fns.o ++OBJS-$(CONFIG_SAND) += rpi_sand_fns.o OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o @@ -40831,7 +39989,7 @@ index 73b6bd0b14..d907de3f1c 100644 * @} */ diff --git a/libavutil/frame.c b/libavutil/frame.c -index d5fd2932e3..151a33a24d 100644 +index d5fd2932e3..b127cd833b 100644 --- a/libavutil/frame.c +++ b/libavutil/frame.c @@ -16,6 +16,8 @@ @@ -40847,7 +40005,7 @@ index d5fd2932e3..151a33a24d 100644 #include "imgutils.h" #include "mem.h" #include "samplefmt.h" -+#if CONFIG_RPI ++#if CONFIG_SAND +#include "rpi_sand_fns.h" +#endif @@ -40857,7 +40015,7 @@ index d5fd2932e3..151a33a24d 100644 (frame->crop_top + frame->crop_bottom) >= frame->height) return AVERROR(ERANGE); -+#if CONFIG_RPI ++#if CONFIG_SAND + // Sand cannot be cropped - do not try + if (av_rpi_is_sand_format(frame->format)) + return 0; @@ -41128,15 +40286,16 @@ index 0000000000..52d52a2a83 + diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c new file mode 100644 -index 0000000000..b8bfad915e +index 0000000000..3e31ef77ec --- /dev/null +++ b/libavutil/rpi_sand_fns.c -@@ -0,0 +1,96 @@ +@@ -0,0 +1,151 @@ +#include "config.h" +#include +#include +#include "rpi_sand_fns.h" +#include "avassert.h" ++#include "frame.h" + +#define PW 1 +#include "rpi_sand_fn_pw.h" @@ -41228,12 +40387,66 @@ index 0000000000..b8bfad915e + } +} + ++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) ++{ ++ const int w = av_frame_cropped_width(src); ++ const int h = 
av_frame_cropped_height(src); ++ const int x = src->crop_left; ++ const int y = src->crop_top; ++ ++ // We will crop as part of the conversion ++ dst->crop_top = 0; ++ dst->crop_left = 0; ++ dst->crop_bottom = 0; ++ dst->crop_right = 0; ++ ++ switch (src->format){ ++ case AV_PIX_FMT_SAND128: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P: ++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w/2, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ case AV_PIX_FMT_SAND64_10: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P10: ++ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x*2, y, w*2, h); ++ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y/2, w, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ default: ++ return -1; ++ } ++ ++ return av_frame_copy_props(dst, src); ++} diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h new file mode 100644 -index 0000000000..ebaa2b6d08 +index 0000000000..1f50b68ea8 --- /dev/null +++ b/libavutil/rpi_sand_fns.h -@@ -0,0 +1,131 @@ +@@ -0,0 +1,136 @@ +#ifndef AVUTIL_RPI_SAND_FNS +#define AVUTIL_RPI_SAND_FNS + @@ -41286,6 +40499,11 @@ index 0000000000..ebaa2b6d08 + unsigned int w, unsigned int h, const unsigned int shr); + + ++// dst must contain required pixel format & allocated data buffers ++// Cropping on the src buffer will be honoured and dst crop will be set to zero ++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src); ++ ++ +static 
inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) +{ +#ifdef RPI_ZC_SAND128_ONLY @@ -41365,50 +40583,6 @@ index 0000000000..ebaa2b6d08 + +#endif + -diff --git a/libswscale/input.c b/libswscale/input.c -index bb2f4933ec..de5a17bc7f 100644 ---- a/libswscale/input.c -+++ b/libswscale/input.c -@@ -741,6 +741,13 @@ static void p016BEToUV_c(uint8_t *dstU, uint8_t *dstV, - } - } - -+static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV, -+ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2, -+ int width, uint32_t *unused) -+{ -+ // NIF -+} -+ - #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos)) - - static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, -@@ -1124,6 +1131,10 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) - case AV_PIX_FMT_P016BE: - c->chrToYV12 = p016BEToUV_c; - break; -+ case AV_PIX_FMT_SAND128: -+ case AV_PIX_FMT_SAND64_10: -+ c->chrToYV12 = sand128ToUV_c; // NIF -+ break; - } - if (c->chrSrcHSubSample) { - switch (srcFormat) { -diff --git a/libswscale/utils.c b/libswscale/utils.c -index dcab707de6..5b24de889a 100644 ---- a/libswscale/utils.c -+++ b/libswscale/utils.c -@@ -256,6 +256,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { - [AV_PIX_FMT_P010BE] = { 1, 1 }, - [AV_PIX_FMT_P016LE] = { 1, 0 }, - [AV_PIX_FMT_P016BE] = { 1, 0 }, -+#if CONFIG_RPI -+ [AV_PIX_FMT_SAND128] = { 1, 0 }, -+ [AV_PIX_FMT_SAND64_10] = { 1, 0 }, -+#endif - }; - - int sws_isSupportedInput(enum AVPixelFormat pix_fmt) diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt new file mode 100644 index 0000000000..b1e99a6a89 @@ -41442,7 +40616,7 @@ index 0000000000..b1e99a6a89 + diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv new file mode 100644 -index 0000000000..5e7ed4da9d +index 0000000000..e176c503f9 --- /dev/null +++ b/pi-util/conf_h265.2016.csv @@ -0,0 +1,193 @@ @@ -41556,7 +40730,7 @@ index 0000000000..5e7ed4da9d 
+1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 +1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 +1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 -+2,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt +2,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt +1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 +1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 @@ -41622,7 +40796,7 @@ index 0000000000..5e7ed4da9d +1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5 +1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5 +1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5 -+2,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5 ++1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5 +1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5 +1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt +1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch index 1d1fd1690e..5eac8a1bcd 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch @@ -1,4 +1,4 @@ -From 8f170986cda0695f28eb2cd4e863aaae0e14d19f Mon Sep 17 00:00:00 2001 +From e75d7807cc97b3ddd8d8f6fe2fcf3dc4de58863f Mon Sep 17 00:00:00 2001 From: Hendrik Leppkes Date: Sat, 9 Jan 2016 16:34:09 +0100 Subject: [PATCH 1/4] avcodec: add h264_mvc codec id and 
profiles @@ -32,12 +32,12 @@ index 6c4b011b5c..8f1f5a3e53 100644 #define FF_PROFILE_VC1_SIMPLE 0 #define FF_PROFILE_VC1_MAIN 1 diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c -index 478b7c0ffc..ff10f3b2bc 100644 +index 6a13bbbf0e..03ae4838d2 100644 --- a/libavcodec/codec_desc.c +++ b/libavcodec/codec_desc.c -@@ -1700,6 +1700,13 @@ static const AVCodecDescriptor codec_descriptors[] = { - .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"), - .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, +@@ -1665,6 +1665,13 @@ static const AVCodecDescriptor codec_descriptors[] = { + .props = AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/png"), }, + { + .id = AV_CODEC_ID_H264_MVC, @@ -78,7 +78,7 @@ index 53cbcfb543..f93f06fcfb 100644 2.14.1 -From 00de72f97e8f69f5d4c614bff956ec726f97fa2e Mon Sep 17 00:00:00 2001 +From 51f6cec0b87840c32482df5d2b09f50d503d2b2b Mon Sep 17 00:00:00 2001 From: Hendrik Leppkes Date: Sat, 9 Jan 2016 16:34:40 +0100 Subject: [PATCH 2/4] h264_parser: add support for parsing h264 mvc NALUs @@ -116,7 +116,7 @@ index 86df5eb9b3..22c4f1d82a 100644 #endif /* AVCODEC_H264_H */ diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c -index 053325c26b..855c74896e 100644 +index dd0a965af0..855c74896e 100644 --- a/libavcodec/h264_parser.c +++ b/libavcodec/h264_parser.c @@ -62,6 +62,7 @@ typedef struct H264ParseContext { @@ -139,7 +139,7 @@ index 053325c26b..855c74896e 100644 goto found; } } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA || -- nalu_type == H264_NAL_IDR_SLICE)) { +- nalu_type == H264_NAL_IDR_SLICE) { + nalu_type == H264_NAL_IDR_SLICE || (p->is_mvc && nalu_type == H264_NAL_SLICE_EXT)) { state += 8; + @@ -195,7 +195,7 @@ index 053325c26b..855c74896e 100644 2.14.1 -From bbf5daa149ccc2c462be1bd5f6f710eba0e82094 Mon Sep 17 00:00:00 2001 +From 6edab559331e83ad11e7940233dbbaae121e528c Mon Sep 17 00:00:00 2001 From: Hendrik Leppkes Date: Tue, 28 Nov 2017 16:12:12 +0000 Subject: [PATCH 3/4] 
h264_parser: force grabing a new timestamp until a frame @@ -223,10 +223,10 @@ index 855c74896e..90a99a19a8 100644 2.14.1 -From 3a0ebb0f7473a9a5ab93e01f7261862a3d324e50 Mon Sep 17 00:00:00 2001 +From 2263d8d3a16ccf886c3692597331779a726373b5 Mon Sep 17 00:00:00 2001 From: popcornmix -Date: Tue, 28 Nov 2017 18:32:08 +0000 -Subject: [PATCH 4/4] extract_extradata_bsf: Support H264_MVC +Date: Sun, 21 Jan 2018 20:31:31 +0000 +Subject: [PATCH 4/4] fixup --- libavcodec/extract_extradata_bsf.c | 8 +++++---